1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52 /*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84 #include <sys/cdefs.h>
85 /*
86 * Manages physical address maps.
87 *
88 * Since the information managed by this module is
89 * also stored by the logical address mapping module,
90 * this module may throw away valid virtual-to-physical
91 * mappings at almost any time. However, invalidations
92 * of virtual-to-physical mappings must be done as
93 * requested.
94 *
95 * In order to cope with hardware architectures which
96 * make virtual-to-physical map invalidates expensive,
97  * this module may delay invalidation or reduced-protection
98 * operations until such time as they are actually
99 * necessary. This module is given full information as
100 * to which processors are currently using which maps,
101  * and when physical maps must be made correct.
102 */
103
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152
153 #ifdef NUMA
154 #define PMAP_MEMDOM MAXMEMDOM
155 #else
156 #define PMAP_MEMDOM 1
157 #endif
158
159 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1)
160 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2)
161
162 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t)))
163 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t)))
164 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t)))
166
167 #define NUL0E L0_ENTRIES
168 #define NUL1E (NUL0E * NL1PG)
169 #define NUL2E (NUL1E * NL2PG)
170
171 #ifdef PV_STATS
172 #define PV_STAT(x) do { x ; } while (0)
173 #define __pvused
174 #else
175 #define PV_STAT(x) do { } while (0)
176 #define __pvused __unused
177 #endif
178
179 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT))
180 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
181 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
182
183 #ifdef __ARM_FEATURE_BTI_DEFAULT
184 #define ATTR_KERN_GP ATTR_S1_GP
185 #else
186 #define ATTR_KERN_GP 0
187 #endif
188 #define PMAP_SAN_PTE_BITS (ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \
189 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
190
191 struct pmap_large_md_page {
192 struct rwlock pv_lock;
193 struct md_page pv_page;
194 /* Pad to a power of 2, see pmap_init_pv_table(). */
195 int pv_pad[2];
196 };
197
198 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
199 #define pv_dummy pv_dummy_large.pv_page
200 __read_mostly static struct pmap_large_md_page *pv_table;
201
202 static struct pmap_large_md_page *
203 _pa_to_pmdp(vm_paddr_t pa)
204 {
205 struct vm_phys_seg *seg;
206
207 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
208 return ((struct pmap_large_md_page *)seg->md_first +
209 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
210 return (NULL);
211 }
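
/*
 * Added note (illustrative, not part of the original source): pv_table keeps
 * one pmap_large_md_page per L2 superpage frame (2MB with 4KB base pages), so
 * the entry for a physical address is found by superpage index arithmetic
 * rather than a per-page table.  For example, in a segment starting at
 * 0x80000000, the address 0x80200000 lies one L2 page into the segment, so
 * pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start) == 1 and the lookup lands
 * one entry past seg->md_first.
 */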
212
213 static struct pmap_large_md_page *
214 pa_to_pmdp(vm_paddr_t pa)
215 {
216 struct pmap_large_md_page *pvd;
217
218 pvd = _pa_to_pmdp(pa);
219 if (pvd == NULL)
220 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
221 return (pvd);
222 }
223
224 static struct pmap_large_md_page *
225 page_to_pmdp(vm_page_t m)
226 {
227 struct vm_phys_seg *seg;
228
229 seg = &vm_phys_segs[m->segind];
230 return ((struct pmap_large_md_page *)seg->md_first +
231 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
232 }
233
234 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
235 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page))
236
237 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \
238 struct pmap_large_md_page *_pvd; \
239 struct rwlock *_lock; \
240 _pvd = _pa_to_pmdp(pa); \
241 if (__predict_false(_pvd == NULL)) \
242 _lock = &pv_dummy_large.pv_lock; \
243 else \
244 _lock = &(_pvd->pv_lock); \
245 _lock; \
246 })
247
248 static struct rwlock *
249 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
250 {
251 if ((m->flags & PG_FICTITIOUS) == 0)
252 return (&page_to_pmdp(m)->pv_lock);
253 else
254 return (&pv_dummy_large.pv_lock);
255 }
256
257 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \
258 struct rwlock **_lockp = (lockp); \
259 struct rwlock *_new_lock = (new_lock); \
260 \
261 if (_new_lock != *_lockp) { \
262 if (*_lockp != NULL) \
263 rw_wunlock(*_lockp); \
264 *_lockp = _new_lock; \
265 rw_wlock(*_lockp); \
266 } \
267 } while (0)
268
269 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \
270 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
271
272 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
273 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
274
275 #define RELEASE_PV_LIST_LOCK(lockp) do { \
276 struct rwlock **_lockp = (lockp); \
277 \
278 if (*_lockp != NULL) { \
279 rw_wunlock(*_lockp); \
280 *_lockp = NULL; \
281 } \
282 } while (0)
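
/*
 * Added usage sketch (hypothetical, not copied from this file): code that
 * walks a range of managed mappings carries a single lock cursor and cycles
 * it with the macros above, e.g.
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... update m's PV list while the per-superpage lock is held ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 *
 * so that at most one PV list lock is held at a time and it is only swapped
 * when the walk crosses into a different superpage-sized region.
 */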
283
284 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
285 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
286
287 /*
288 * The presence of this flag indicates that the mapping is writeable.
289 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
290 * it is dirty. This flag may only be set on managed mappings.
291 *
292 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
293 * as a software managed bit.
294 */
295 #define ATTR_SW_DBM ATTR_DBM
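
/*
 * Added summary (restates the comment above for clarity): on a managed
 * stage 1 mapping the software dirty-bit emulation encodes
 *
 *	ATTR_S1_AP(ATTR_S1_AP_RW) + ATTR_SW_DBM		writable and dirty
 *	ATTR_S1_AP(ATTR_S1_AP_RO) + ATTR_SW_DBM		writable but clean
 *	ATTR_S1_AP(ATTR_S1_AP_RO), no ATTR_SW_DBM	truly read-only
 *
 * pmap_pte_dirty() below tests exactly this combination.
 */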
296
297 struct pmap kernel_pmap_store;
298
299 /* Used for mapping ACPI memory before VM is initialized */
300 #define PMAP_PREINIT_MAPPING_COUNT 32
301 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
302 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */
303 static int vm_initialized = 0; /* No need to use pre-init maps when set */
304
305 /*
306 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
307 * Always map entire L2 block for simplicity.
308 * VA of L2 block = preinit_map_va + i * L2_SIZE
309 */
310 static struct pmap_preinit_mapping {
311 vm_paddr_t pa;
312 vm_offset_t va;
313 vm_size_t size;
314 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
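
/*
 * Added example (assumes 4KB base pages): with PMAP_PREINIT_MAPPING_COUNT of
 * 32 and a 2MB L2_SIZE, slot i covers [preinit_map_va + i * L2_SIZE,
 * preinit_map_va + (i + 1) * L2_SIZE), for 64MB of pre-init mapping space in
 * total.
 */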
315
316 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
317 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
318 vm_offset_t kernel_vm_end = 0;
319
320 /*
321 * Data for the pv entry allocation mechanism.
322 */
323 #ifdef NUMA
324 static __inline int
325 pc_to_domain(struct pv_chunk *pc)
326 {
327 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
328 }
329 #else
330 static __inline int
331 pc_to_domain(struct pv_chunk *pc __unused)
332 {
333 return (0);
334 }
335 #endif
336
337 struct pv_chunks_list {
338 struct mtx pvc_lock;
339 TAILQ_HEAD(pch, pv_chunk) pvc_list;
340 int active_reclaims;
341 } __aligned(CACHE_LINE_SIZE);
342
343 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
344
345 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
346 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
347 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
348
349 extern pt_entry_t pagetable_l0_ttbr1[];
350
351 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
352 static vm_paddr_t physmap[PHYSMAP_SIZE];
353 static u_int physmap_idx;
354
355 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
356 "VM/pmap parameters");
357
358 #if PAGE_SIZE == PAGE_SIZE_4K
359 #define L1_BLOCKS_SUPPORTED 1
360 #else
361 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
362 #define L1_BLOCKS_SUPPORTED 0
363 #endif
364
365 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED)
366
367 /*
368 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
369 * that it has currently allocated to a pmap, a cursor ("asid_next") to
370 * optimize its search for a free ASID in the bit vector, and an epoch number
371 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
372 * ASIDs that are not currently active on a processor.
373 *
374 * The current epoch number is always in the range [0, INT_MAX). Negative
375 * numbers and INT_MAX are reserved for special cases that are described
376 * below.
377 */
378 struct asid_set {
379 int asid_bits;
380 bitstr_t *asid_set;
381 int asid_set_size;
382 int asid_next;
383 int asid_epoch;
384 struct mtx asid_set_mutex;
385 };
386
387 static struct asid_set asids;
388 static struct asid_set vmids;
389
390 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
391 "ASID allocator");
392 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
393 "The number of bits in an ASID");
394 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
395 "The last allocated ASID plus one");
396 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
397 "The current epoch number");
398
399 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
400 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
401 "The number of bits in a VMID");
402 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
403 "The last allocated VMID plus one");
404 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
405 "The current epoch number");
406
407 void (*pmap_clean_stage2_tlbi)(void);
408 void (*pmap_invalidate_vpipt_icache)(void);
409 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
410 void (*pmap_stage2_invalidate_all)(uint64_t);
411
412 /*
413 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved
414 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for
415 * dynamically allocated ASIDs have a non-negative epoch number.
416 *
417 * An invalid ASID is represented by -1.
418 *
419 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
420 * which indicates that an ASID should never be allocated to the pmap, and
421 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
422 * allocated when the pmap is next activated.
423 */
424 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \
425 ((u_long)(epoch) << 32)))
426 #define COOKIE_TO_ASID(cookie) ((int)(cookie))
427 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32))
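
/*
 * Added example (for illustration): COOKIE_FROM(5, 3) packs the ASID into the
 * low 32 bits and the epoch into the high 32 bits, producing
 * 0x0000000300000005; COOKIE_TO_ASID() and COOKIE_TO_EPOCH() simply undo that
 * packing.  The reserved value COOKIE_FROM(-1, INT_MIN) therefore carries
 * both an invalid ASID and a negative epoch, matching the description above.
 */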
428
429 #define TLBI_VA_SHIFT 12
430 #define TLBI_VA_MASK ((1ul << 44) - 1)
431 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
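
/*
 * Added example (for illustration): the TLBI operand carries the page-aligned
 * virtual address in bits [43:0] as VA >> 12, e.g. TLBI_VA(0x1234567000) ==
 * 0x1234567.  For user mappings, pmap_s1_invalidate_page() below ORs the
 * ASID operand on top of this value.
 */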
432
433 static int __read_frequently superpages_enabled = 1;
434 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
435 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
436 "Are large page mappings enabled?");
437
438 /*
439 * True when Branch Target Identification should be used by userspace. This
440 * allows pmap to mark pages as guarded with ATTR_S1_GP.
441 */
442 __read_mostly static bool pmap_bti_support = false;
443
444 /*
445 * Internal flags for pmap_enter()'s helper functions.
446 */
447 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
448 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
449
450 TAILQ_HEAD(pv_chunklist, pv_chunk);
451
452 static void free_pv_chunk(struct pv_chunk *pc);
453 static void free_pv_chunk_batch(struct pv_chunklist *batch);
454 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
455 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
456 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
457 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
458 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
459 vm_offset_t va);
460
461 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
462 static bool pmap_activate_int(pmap_t pmap);
463 static void pmap_alloc_asid(pmap_t pmap);
464 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
465 vm_prot_t prot, int mode, bool skip_unmapped);
466 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
467 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
468 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
469 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
470 vm_offset_t va, struct rwlock **lockp);
471 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
472 static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
473 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
474 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
475 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
476 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
477 u_int flags, vm_page_t m, struct rwlock **lockp);
478 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
479 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
480 static bool pmap_every_pte_zero(vm_paddr_t pa);
481 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
482 bool all_l3e_AF_set);
483 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
484 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
485 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
486 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
487 struct rwlock **lockp);
488 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
489 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
490 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
491 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
492 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
493 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
494 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
495 struct rwlock **lockp);
496 static void pmap_reset_asid_set(pmap_t pmap);
497 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
498 vm_page_t m, struct rwlock **lockp);
499
500 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
501 struct rwlock **lockp);
502
503 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
504 struct spglist *free);
505 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
506 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
507 vm_offset_t va, vm_size_t size);
508 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
509
510 static uma_zone_t pmap_bti_ranges_zone;
511 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
512 pt_entry_t *pte);
513 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
514 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
515 static void *bti_dup_range(void *ctx, void *data);
516 static void bti_free_range(void *ctx, void *node);
517 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
518 static void pmap_bti_deassign_all(pmap_t pmap);
519
520 /*
521 * These load the old table data and store the new value.
522 * They need to be atomic as the System MMU may write to the table at
523 * the same time as the CPU.
524 */
525 #define pmap_clear(table) atomic_store_64(table, 0)
526 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits)
527 #define pmap_load(table) (*table)
528 #define pmap_load_clear(table) atomic_swap_64(table, 0)
529 #define pmap_load_store(table, entry) atomic_swap_64(table, entry)
530 #define pmap_set_bits(table, bits) atomic_set_64(table, bits)
531 #define pmap_store(table, entry) atomic_store_64(table, entry)
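
/*
 * Added usage sketch (approximate, not the authoritative sequence): a
 * break-before-make update of a live entry looks roughly like
 *
 *	old = pmap_load_clear(pte);		(break: atomically invalidate)
 *	... TLB invalidation for the VA ...
 *	pmap_store(pte, newpte);		(make: publish the new entry)
 *
 * pmap_update_entry() implements the real sequence, including masking
 * interrupts while the entry is temporarily invalid.
 */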
532
533 /********************/
534 /* Inline functions */
535 /********************/
536
537 static __inline void
538 pagecopy(void *s, void *d)
539 {
540
541 memcpy(d, s, PAGE_SIZE);
542 }
543
544 static __inline pd_entry_t *
545 pmap_l0(pmap_t pmap, vm_offset_t va)
546 {
547
548 return (&pmap->pm_l0[pmap_l0_index(va)]);
549 }
550
551 static __inline pd_entry_t *
552 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
553 {
554 pd_entry_t *l1;
555
556 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
557 return (&l1[pmap_l1_index(va)]);
558 }
559
560 static __inline pd_entry_t *
561 pmap_l1(pmap_t pmap, vm_offset_t va)
562 {
563 pd_entry_t *l0;
564
565 l0 = pmap_l0(pmap, va);
566 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
567 return (NULL);
568
569 return (pmap_l0_to_l1(l0, va));
570 }
571
572 static __inline pd_entry_t *
573 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
574 {
575 pd_entry_t l1, *l2p;
576
577 l1 = pmap_load(l1p);
578
579 KASSERT(ADDR_IS_CANONICAL(va),
580 ("%s: Address not in canonical form: %lx", __func__, va));
581 /*
582 * The valid bit may be clear if pmap_update_entry() is concurrently
583 * modifying the entry, so for KVA only the entry type may be checked.
584 */
585 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
586 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
587 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
588 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
589 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
590 return (&l2p[pmap_l2_index(va)]);
591 }
592
593 static __inline pd_entry_t *
594 pmap_l2(pmap_t pmap, vm_offset_t va)
595 {
596 pd_entry_t *l1;
597
598 l1 = pmap_l1(pmap, va);
599 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
600 return (NULL);
601
602 return (pmap_l1_to_l2(l1, va));
603 }
604
605 static __inline pt_entry_t *
606 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
607 {
608 pd_entry_t l2;
609 pt_entry_t *l3p;
610
611 l2 = pmap_load(l2p);
612
613 KASSERT(ADDR_IS_CANONICAL(va),
614 ("%s: Address not in canonical form: %lx", __func__, va));
615 /*
616 * The valid bit may be clear if pmap_update_entry() is concurrently
617 * modifying the entry, so for KVA only the entry type may be checked.
618 */
619 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
620 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
621 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
622 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
623 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
624 return (&l3p[pmap_l3_index(va)]);
625 }
626
627 /*
628 * Returns the lowest valid pde for a given virtual address.
629 * The next level may or may not point to a valid page or block.
630 */
631 static __inline pd_entry_t *
632 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
633 {
634 pd_entry_t *l0, *l1, *l2, desc;
635
636 l0 = pmap_l0(pmap, va);
637 desc = pmap_load(l0) & ATTR_DESCR_MASK;
638 if (desc != L0_TABLE) {
639 *level = -1;
640 return (NULL);
641 }
642
643 l1 = pmap_l0_to_l1(l0, va);
644 desc = pmap_load(l1) & ATTR_DESCR_MASK;
645 if (desc != L1_TABLE) {
646 *level = 0;
647 return (l0);
648 }
649
650 l2 = pmap_l1_to_l2(l1, va);
651 desc = pmap_load(l2) & ATTR_DESCR_MASK;
652 if (desc != L2_TABLE) {
653 *level = 1;
654 return (l1);
655 }
656
657 *level = 2;
658 return (l2);
659 }
660
661 /*
662 * Returns the lowest valid pte block or table entry for a given virtual
663 * address. If there are no valid entries return NULL and set the level to
664 * the first invalid level.
665 */
666 static __inline pt_entry_t *
667 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
668 {
669 pd_entry_t *l1, *l2, desc;
670 pt_entry_t *l3;
671
672 l1 = pmap_l1(pmap, va);
673 if (l1 == NULL) {
674 *level = 0;
675 return (NULL);
676 }
677 desc = pmap_load(l1) & ATTR_DESCR_MASK;
678 if (desc == L1_BLOCK) {
679 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
680 *level = 1;
681 return (l1);
682 }
683
684 if (desc != L1_TABLE) {
685 *level = 1;
686 return (NULL);
687 }
688
689 l2 = pmap_l1_to_l2(l1, va);
690 desc = pmap_load(l2) & ATTR_DESCR_MASK;
691 if (desc == L2_BLOCK) {
692 *level = 2;
693 return (l2);
694 }
695
696 if (desc != L2_TABLE) {
697 *level = 2;
698 return (NULL);
699 }
700
701 *level = 3;
702 l3 = pmap_l2_to_l3(l2, va);
703 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
704 return (NULL);
705
706 return (l3);
707 }
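
/*
 * Added usage sketch (hypothetical; the canonical helpers are pmap_extract()
 * and pmap_kextract()): a caller can combine the returned entry with the
 * reported level to recover the physical address, e.g.
 *
 *	pte = pmap_pte(pmap, va, &lvl);
 *	if (pte != NULL) {
 *		pa = PTE_TO_PHYS(pmap_load(pte));
 *		pa |= va & ((lvl == 1) ? L1_OFFSET :
 *		    (lvl == 2) ? L2_OFFSET : L3_OFFSET);
 *	}
 *
 * since an L1_BLOCK covers 1GB, an L2_BLOCK 2MB, and an L3_PAGE 4KB with 4KB
 * base pages.
 */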
708
709 /*
710 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
711 * level that maps the specified virtual address, then a pointer to that entry
712 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
713 * and a diagnostic message is provided, in which case this function panics.
714 */
715 static __always_inline pt_entry_t *
716 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
717 {
718 pd_entry_t *l0p, *l1p, *l2p;
719 pt_entry_t desc, *l3p;
720 int walk_level __diagused;
721
722 KASSERT(level >= 0 && level < 4,
723 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
724 level));
725 l0p = pmap_l0(pmap, va);
726 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
727 if (desc == L0_TABLE && level > 0) {
728 l1p = pmap_l0_to_l1(l0p, va);
729 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
730 if (desc == L1_BLOCK && level == 1) {
731 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
732 return (l1p);
733 }
734 if (desc == L1_TABLE && level > 1) {
735 l2p = pmap_l1_to_l2(l1p, va);
736 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
737 if (desc == L2_BLOCK && level == 2)
738 return (l2p);
739 else if (desc == L2_TABLE && level > 2) {
740 l3p = pmap_l2_to_l3(l2p, va);
741 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
742 if (desc == L3_PAGE && level == 3)
743 return (l3p);
744 else
745 walk_level = 3;
746 } else
747 walk_level = 2;
748 } else
749 walk_level = 1;
750 } else
751 walk_level = 0;
752 KASSERT(diag == NULL,
753 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
754 diag, va, level, desc, walk_level));
755 return (NULL);
756 }
757
758 bool
759 pmap_ps_enabled(pmap_t pmap)
760 {
761 /*
762 * Promotion requires a hypervisor call when the kernel is running
763  * in EL1. To avoid this, disable superpage support on non-stage 1
764 * pmaps for now.
765 */
766 if (pmap->pm_stage != PM_STAGE1)
767 return (false);
768
769 #ifdef KMSAN
770 /*
771 * The break-before-make in pmap_update_entry() results in a situation
772 * where a CPU may call into the KMSAN runtime while the entry is
773 * invalid. If the entry is used to map the current thread structure,
774 * then the runtime will attempt to access unmapped memory. Avoid this
775 * by simply disabling superpage promotion for the kernel map.
776 */
777 if (pmap == kernel_pmap)
778 return (false);
779 #endif
780
781 return (superpages_enabled != 0);
782 }
783
784 bool
785 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
786 pd_entry_t **l2, pt_entry_t **l3)
787 {
788 pd_entry_t *l0p, *l1p, *l2p;
789
790 if (pmap->pm_l0 == NULL)
791 return (false);
792
793 l0p = pmap_l0(pmap, va);
794 *l0 = l0p;
795
796 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
797 return (false);
798
799 l1p = pmap_l0_to_l1(l0p, va);
800 *l1 = l1p;
801
802 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
803 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
804 *l2 = NULL;
805 *l3 = NULL;
806 return (true);
807 }
808
809 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
810 return (false);
811
812 l2p = pmap_l1_to_l2(l1p, va);
813 *l2 = l2p;
814
815 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
816 *l3 = NULL;
817 return (true);
818 }
819
820 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
821 return (false);
822
823 *l3 = pmap_l2_to_l3(l2p, va);
824
825 return (true);
826 }
827
828 static __inline int
829 pmap_l3_valid(pt_entry_t l3)
830 {
831
832 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
833 }
834
835 CTASSERT(L1_BLOCK == L2_BLOCK);
836
837 static pt_entry_t
838 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
839 {
840 pt_entry_t val;
841
842 if (pmap->pm_stage == PM_STAGE1) {
843 val = ATTR_S1_IDX(memattr);
844 if (memattr == VM_MEMATTR_DEVICE)
845 val |= ATTR_S1_XN;
846 return (val);
847 }
848
849 val = 0;
850
851 switch (memattr) {
852 case VM_MEMATTR_DEVICE:
853 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
854 ATTR_S2_XN(ATTR_S2_XN_ALL));
855 case VM_MEMATTR_UNCACHEABLE:
856 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
857 case VM_MEMATTR_WRITE_BACK:
858 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
859 case VM_MEMATTR_WRITE_THROUGH:
860 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
861 default:
862 panic("%s: invalid memory attribute %x", __func__, memattr);
863 }
864 }
865
866 static pt_entry_t
867 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
868 {
869 pt_entry_t val;
870
871 val = 0;
872 if (pmap->pm_stage == PM_STAGE1) {
873 if ((prot & VM_PROT_EXECUTE) == 0)
874 val |= ATTR_S1_XN;
875 if ((prot & VM_PROT_WRITE) == 0)
876 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
877 } else {
878 if ((prot & VM_PROT_WRITE) != 0)
879 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
880 if ((prot & VM_PROT_READ) != 0)
881 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
882 if ((prot & VM_PROT_EXECUTE) == 0)
883 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
884 }
885
886 return (val);
887 }
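
/*
 * Added example (for clarity): for a stage 1 pmap, VM_PROT_READ alone yields
 * ATTR_S1_XN | ATTR_S1_AP(ATTR_S1_AP_RO) (non-executable, read-only), while
 * VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE yields 0, because the
 * attribute bits computed here only ever remove permissions from the default
 * writable, executable encoding.
 */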
888
889 /*
890 * Checks if the PTE is dirty.
891 */
892 static inline int
893 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
894 {
895
896 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
897
898 if (pmap->pm_stage == PM_STAGE1) {
899 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
900 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
901
902 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
903 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
904 }
905
906 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
907 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
908 }
909
910 static __inline void
911 pmap_resident_count_inc(pmap_t pmap, int count)
912 {
913
914 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
915 pmap->pm_stats.resident_count += count;
916 }
917
918 static __inline void
919 pmap_resident_count_dec(pmap_t pmap, int count)
920 {
921
922 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
923 KASSERT(pmap->pm_stats.resident_count >= count,
924 ("pmap %p resident count underflow %ld %d", pmap,
925 pmap->pm_stats.resident_count, count));
926 pmap->pm_stats.resident_count -= count;
927 }
928
929 static vm_paddr_t
930 pmap_early_vtophys(vm_offset_t va)
931 {
932 vm_paddr_t pa_page;
933
934 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
935 return (pa_page | (va & PAR_LOW_MASK));
936 }
937
938 /* State of the bootstrapped DMAP page tables */
939 struct pmap_bootstrap_state {
940 pt_entry_t *l1;
941 pt_entry_t *l2;
942 pt_entry_t *l3;
943 vm_offset_t freemempos;
944 vm_offset_t va;
945 vm_paddr_t pa;
946 pt_entry_t table_attrs;
947 u_int l0_slot;
948 u_int l1_slot;
949 u_int l2_slot;
950 bool dmap_valid;
951 };
952
953 /* The bootstrap state */
954 static struct pmap_bootstrap_state bs_state = {
955 .l1 = NULL,
956 .l2 = NULL,
957 .l3 = NULL,
958 .table_attrs = TATTR_PXN_TABLE,
959 .l0_slot = L0_ENTRIES,
960 .l1_slot = Ln_ENTRIES,
961 .l2_slot = Ln_ENTRIES,
962 .dmap_valid = false,
963 };
964
965 static void
966 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
967 {
968 vm_paddr_t l1_pa;
969 pd_entry_t l0e;
970 u_int l0_slot;
971
972 /* Link the level 0 table to a level 1 table */
973 l0_slot = pmap_l0_index(state->va);
974 if (l0_slot != state->l0_slot) {
975 /*
976 * Make sure we move from a low address to high address
977 * before the DMAP region is ready. This ensures we never
978 * modify an existing mapping until we can map from a
979 * physical address to a virtual address.
980 */
981 MPASS(state->l0_slot < l0_slot ||
982 state->l0_slot == L0_ENTRIES ||
983 state->dmap_valid);
984
985 /* Reset lower levels */
986 state->l2 = NULL;
987 state->l3 = NULL;
988 state->l1_slot = Ln_ENTRIES;
989 state->l2_slot = Ln_ENTRIES;
990
991 /* Check the existing L0 entry */
992 state->l0_slot = l0_slot;
993 if (state->dmap_valid) {
994 l0e = pagetable_l0_ttbr1[l0_slot];
995 if ((l0e & ATTR_DESCR_VALID) != 0) {
996 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
997 l1_pa = PTE_TO_PHYS(l0e);
998 state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
999 return;
1000 }
1001 }
1002
1003 /* Create a new L0 table entry */
1004 state->l1 = (pt_entry_t *)state->freemempos;
1005 memset(state->l1, 0, PAGE_SIZE);
1006 state->freemempos += PAGE_SIZE;
1007
1008 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1009 MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1010 MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1011 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1012 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1013 }
1014 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1015 }
1016
1017 static void
1018 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1019 {
1020 vm_paddr_t l2_pa;
1021 pd_entry_t l1e;
1022 u_int l1_slot;
1023
1024 /* Make sure there is a valid L0 -> L1 table */
1025 pmap_bootstrap_l0_table(state);
1026
1027 /* Link the level 1 table to a level 2 table */
1028 l1_slot = pmap_l1_index(state->va);
1029 if (l1_slot != state->l1_slot) {
1030 /* See pmap_bootstrap_l0_table for a description */
1031 MPASS(state->l1_slot < l1_slot ||
1032 state->l1_slot == Ln_ENTRIES ||
1033 state->dmap_valid);
1034
1035 /* Reset lower levels */
1036 state->l3 = NULL;
1037 state->l2_slot = Ln_ENTRIES;
1038
1039 /* Check the existing L1 entry */
1040 state->l1_slot = l1_slot;
1041 if (state->dmap_valid) {
1042 l1e = state->l1[l1_slot];
1043 if ((l1e & ATTR_DESCR_VALID) != 0) {
1044 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1045 l2_pa = PTE_TO_PHYS(l1e);
1046 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1047 return;
1048 }
1049 }
1050
1051 /* Create a new L1 table entry */
1052 state->l2 = (pt_entry_t *)state->freemempos;
1053 memset(state->l2, 0, PAGE_SIZE);
1054 state->freemempos += PAGE_SIZE;
1055
1056 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1057 MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1058 MPASS(state->l1[l1_slot] == 0);
1059 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1060 state->table_attrs | L1_TABLE);
1061 }
1062 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1063 }
1064
1065 static void
1066 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1067 {
1068 vm_paddr_t l3_pa;
1069 pd_entry_t l2e;
1070 u_int l2_slot;
1071
1072 /* Make sure there is a valid L1 -> L2 table */
1073 pmap_bootstrap_l1_table(state);
1074
1075 /* Link the level 2 table to a level 3 table */
1076 l2_slot = pmap_l2_index(state->va);
1077 if (l2_slot != state->l2_slot) {
1078 /* See pmap_bootstrap_l0_table for a description */
1079 MPASS(state->l2_slot < l2_slot ||
1080 state->l2_slot == Ln_ENTRIES ||
1081 state->dmap_valid);
1082
1083 /* Check the existing L2 entry */
1084 state->l2_slot = l2_slot;
1085 if (state->dmap_valid) {
1086 l2e = state->l2[l2_slot];
1087 if ((l2e & ATTR_DESCR_VALID) != 0) {
1088 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1089 l3_pa = PTE_TO_PHYS(l2e);
1090 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1091 return;
1092 }
1093 }
1094
1095 /* Create a new L2 table entry */
1096 state->l3 = (pt_entry_t *)state->freemempos;
1097 memset(state->l3, 0, PAGE_SIZE);
1098 state->freemempos += PAGE_SIZE;
1099
1100 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1101 MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1102 MPASS(state->l2[l2_slot] == 0);
1103 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1104 state->table_attrs | L2_TABLE);
1105 }
1106 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1107 }
1108
1109 static void
1110 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1111 {
1112 pt_entry_t contig;
1113 u_int l2_slot;
1114 bool first;
1115
1116 if ((physmap[i + 1] - state->pa) < L2_SIZE)
1117 return;
1118
1119 /* Make sure there is a valid L1 table */
1120 pmap_bootstrap_l1_table(state);
1121
1122 MPASS((state->va & L2_OFFSET) == 0);
1123 for (first = true, contig = 0;
1124 state->va < DMAP_MAX_ADDRESS &&
1125 (physmap[i + 1] - state->pa) >= L2_SIZE;
1126 state->va += L2_SIZE, state->pa += L2_SIZE) {
1127 /*
1128 * Stop if we are about to walk off the end of what the
1129 * current L1 slot can address.
1130 */
1131 if (!first && (state->pa & L1_OFFSET) == 0)
1132 break;
1133
1134 /*
1135 * If we have an aligned, contiguous chunk of L2C_ENTRIES
1136 * L2 blocks, set the contiguous bit within each PTE so that
1137 * the chunk can be cached using only one TLB entry.
1138 */
1139 if ((state->pa & L2C_OFFSET) == 0) {
1140 if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
1141 physmap[i + 1] - state->pa >= L2C_SIZE) {
1142 contig = ATTR_CONTIGUOUS;
1143 } else {
1144 contig = 0;
1145 }
1146 }
1147
1148 first = false;
1149 l2_slot = pmap_l2_index(state->va);
1150 MPASS((state->pa & L2_OFFSET) == 0);
1151 MPASS(state->l2[l2_slot] == 0);
1152 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1153 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1154 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
1155 }
1156 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1157 }
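
/*
 * Added sizing note (figures assume 4KB base pages): a run of L2C_ENTRIES
 * (16) aligned L2 blocks tagged with ATTR_CONTIGUOUS forms a 32MB chunk that
 * hardware honouring the hint can cache in a single TLB entry.
 * pmap_bootstrap_l3_page() applies the same scheme one level down, where 16
 * contiguous 4KB pages form a 64KB chunk.
 */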
1158
1159 static void
1160 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1161 {
1162 pt_entry_t contig;
1163 u_int l3_slot;
1164 bool first;
1165
1166 if (physmap[i + 1] - state->pa < L3_SIZE)
1167 return;
1168
1169 /* Make sure there is a valid L2 table */
1170 pmap_bootstrap_l2_table(state);
1171
1172 MPASS((state->va & L3_OFFSET) == 0);
1173 for (first = true, contig = 0;
1174 state->va < DMAP_MAX_ADDRESS &&
1175 physmap[i + 1] - state->pa >= L3_SIZE;
1176 state->va += L3_SIZE, state->pa += L3_SIZE) {
1177 /*
1178 * Stop if we are about to walk off the end of what the
1179 * current L2 slot can address.
1180 */
1181 if (!first && (state->pa & L2_OFFSET) == 0)
1182 break;
1183
1184 /*
1185 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1186 * L3 pages, set the contiguous bit within each PTE so that
1187 * the chunk can be cached using only one TLB entry.
1188 */
1189 if ((state->pa & L3C_OFFSET) == 0) {
1190 if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1191 physmap[i + 1] - state->pa >= L3C_SIZE) {
1192 contig = ATTR_CONTIGUOUS;
1193 } else {
1194 contig = 0;
1195 }
1196 }
1197
1198 first = false;
1199 l3_slot = pmap_l3_index(state->va);
1200 MPASS((state->pa & L3_OFFSET) == 0);
1201 MPASS(state->l3[l3_slot] == 0);
1202 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1203 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1204 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1205 }
1206 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1207 }
1208
1209 static void
1210 pmap_bootstrap_dmap(void)
1211 {
1212 int i;
1213
1214 /* Fill in physmap array. */
1215 physmap_idx = physmem_avail(physmap, nitems(physmap));
1216
1217 dmap_phys_base = physmap[0] & ~L1_OFFSET;
1218 dmap_phys_max = 0;
1219 dmap_max_addr = 0;
1220
1221 for (i = 0; i < physmap_idx; i += 2) {
1222 bs_state.pa = physmap[i] & ~L3_OFFSET;
1223 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1224
1225 /* Create L3 mappings at the start of the region */
1226 if ((bs_state.pa & L2_OFFSET) != 0)
1227 pmap_bootstrap_l3_page(&bs_state, i);
1228 MPASS(bs_state.pa <= physmap[i + 1]);
1229
1230 if (L1_BLOCKS_SUPPORTED) {
1231 /* Create L2 mappings at the start of the region */
1232 if ((bs_state.pa & L1_OFFSET) != 0)
1233 pmap_bootstrap_l2_block(&bs_state, i);
1234 MPASS(bs_state.pa <= physmap[i + 1]);
1235
1236 /* Create the main L1 block mappings */
1237 for (; bs_state.va < DMAP_MAX_ADDRESS &&
1238 (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1239 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1240 /* Make sure there is a valid L1 table */
1241 pmap_bootstrap_l0_table(&bs_state);
1242 MPASS((bs_state.pa & L1_OFFSET) == 0);
1243 pmap_store(
1244 &bs_state.l1[pmap_l1_index(bs_state.va)],
1245 PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
1246 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1247 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1248 }
1249 MPASS(bs_state.pa <= physmap[i + 1]);
1250
1251 /* Create L2 mappings at the end of the region */
1252 pmap_bootstrap_l2_block(&bs_state, i);
1253 } else {
1254 while (bs_state.va < DMAP_MAX_ADDRESS &&
1255 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1256 pmap_bootstrap_l2_block(&bs_state, i);
1257 }
1258 }
1259 MPASS(bs_state.pa <= physmap[i + 1]);
1260
1261 /* Create L3 mappings at the end of the region */
1262 pmap_bootstrap_l3_page(&bs_state, i);
1263 MPASS(bs_state.pa == physmap[i + 1]);
1264
1265 if (bs_state.pa > dmap_phys_max) {
1266 dmap_phys_max = bs_state.pa;
1267 dmap_max_addr = bs_state.va;
1268 }
1269 }
1270
1271 cpu_tlb_flushID();
1272 }
1273
1274 static void
1275 pmap_bootstrap_l2(vm_offset_t va)
1276 {
1277 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1278
1279 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
1280 bs_state.va = va;
1281
1282 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1283 pmap_bootstrap_l1_table(&bs_state);
1284 }
1285
1286 static void
1287 pmap_bootstrap_l3(vm_offset_t va)
1288 {
1289 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1290
1291 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
1292 bs_state.va = va;
1293
1294 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1295 pmap_bootstrap_l2_table(&bs_state);
1296 }
1297
1298 /*
1299 * Bootstrap the system enough to run with virtual memory.
1300 */
1301 void
1302 pmap_bootstrap(vm_size_t kernlen)
1303 {
1304 vm_offset_t dpcpu, msgbufpv;
1305 vm_paddr_t start_pa, pa;
1306
1307 /* Verify that the ASID is set through TTBR0. */
1308 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1309 ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1310
1311 /* Set this early so we can use the pagetable walking functions */
1312 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1313 PMAP_LOCK_INIT(kernel_pmap);
1314 kernel_pmap->pm_l0_paddr =
1315 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1316 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1317 vm_radix_init(&kernel_pmap->pm_root);
1318 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1319 kernel_pmap->pm_stage = PM_STAGE1;
1320 kernel_pmap->pm_levels = 4;
1321 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1322 kernel_pmap->pm_asid_set = &asids;
1323
1324 bs_state.freemempos = KERNBASE + kernlen;
1325 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1326
1327 /* Create a direct map region early so we can use it for pa -> va */
1328 pmap_bootstrap_dmap();
1329 bs_state.dmap_valid = true;
1330
1331 /*
1332 * We only use PXN when we know nothing will be executed from it, e.g.
1333 * the DMAP region.
1334 */
1335 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1336
1337 start_pa = pa = pmap_early_vtophys(KERNBASE);
1338
1339 /*
1340 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1341 * loader allocated the first and only l2 page table page used to map
1342 * the kernel, preloaded files and module metadata.
1343 */
1344 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1345 /* And the l3 tables for the early devmap */
1346 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1347
1348 cpu_tlb_flushID();
1349
1350 #define alloc_pages(var, np) \
1351 (var) = bs_state.freemempos; \
1352 bs_state.freemempos += (np * PAGE_SIZE); \
1353 memset((char *)(var), 0, ((np) * PAGE_SIZE));
1354
1355 /* Allocate dynamic per-cpu area. */
1356 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1357 dpcpu_init((void *)dpcpu, 0);
1358
1359 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1360 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1361 msgbufp = (void *)msgbufpv;
1362
1363 /* Reserve some VA space for early BIOS/ACPI mapping */
1364 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1365
1366 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1367 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1368 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1369 kernel_vm_end = virtual_avail;
1370
1371 pa = pmap_early_vtophys(bs_state.freemempos);
1372
1373 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1374
1375 cpu_tlb_flushID();
1376 }
1377
1378 #if defined(KASAN) || defined(KMSAN)
1379 static void
1380 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1381 vm_offset_t *vap, vm_offset_t eva)
1382 {
1383 vm_paddr_t pa;
1384 vm_offset_t va;
1385 pd_entry_t *l2;
1386
1387 va = *vap;
1388 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1389 for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1390 l2 = pmap_l2(kernel_pmap, va);
1391
1392 /*
1393 * KASAN stack checking results in us having already allocated
1394 * part of our shadow map, so we can just skip those segments.
1395 */
1396 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1397 pa += L2_SIZE;
1398 continue;
1399 }
1400
1401 bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1402 physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1403 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1404 }
1405 *vap = va;
1406 }
1407
1408 /*
1409 * Finish constructing the initial shadow map:
1410 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1411 * shadow map)
1412 * - Map that entire range using L2 superpages.
1413 */
1414 static void
1415 pmap_bootstrap_san1(vm_offset_t va, int scale)
1416 {
1417 vm_offset_t eva;
1418 vm_paddr_t kernstart;
1419 int i;
1420
1421 kernstart = pmap_early_vtophys(KERNBASE);
1422
1423 /*
1424 * Rebuild physmap one more time, we may have excluded more regions from
1425 * allocation since pmap_bootstrap().
1426 */
1427 physmap_idx = physmem_avail(physmap, nitems(physmap));
1428
1429 eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1430
1431 /*
1432  * Find a slot in the physmap large enough for what we need. We try to put
1433 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1434 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1435 */
1436 for (i = physmap_idx - 2; i >= 0; i -= 2) {
1437 vm_paddr_t plow, phigh;
1438
1439 /* L2 mappings must be backed by memory that is L2-aligned */
1440 plow = roundup2(physmap[i], L2_SIZE);
1441 phigh = physmap[i + 1];
1442 if (plow >= phigh)
1443 continue;
1444 if (kernstart >= plow && kernstart < phigh)
1445 phigh = kernstart;
1446 if (phigh - plow >= L2_SIZE) {
1447 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1448 if (va >= eva)
1449 break;
1450 }
1451 }
1452 if (i < 0)
1453 panic("Could not find phys region for shadow map");
1454
1455 /*
1456 * Done. We should now have a valid shadow address mapped for all KVA
1457 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1458 * shadow accesses by the sanitizer runtime will succeed for this range.
1459 * When the kernel virtual address range is later expanded, as will
1460 * happen in vm_mem_init(), the shadow map will be grown as well. This
1461 * is handled by pmap_san_enter().
1462 */
1463 }
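
/*
 * Added example (for clarity): with the KASAN shadow scale of 8, shadowing
 * the kernel VA mapped so far needs only 1/8th as much shadow VA, which is
 * why the eva computation above divides the mapped size by "scale".  The
 * KMSAN path passes a scale of 1 because its shadow and origin maps are
 * full-size.
 */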
1464
1465 void
1466 pmap_bootstrap_san(void)
1467 {
1468 #ifdef KASAN
1469 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1470 #else
1471 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1472 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1473 pd_entry_t *l0, *l1;
1474
1475 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1476 panic("initial kernel map is too large");
1477
1478 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1479 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1480 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1481 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1482 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1483 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1484 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1485
1486 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1487 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1488 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1489 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1490 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1491 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1492 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1493 #endif
1494 }
1495 #endif
1496
1497 /*
1498 * Initialize a vm_page's machine-dependent fields.
1499 */
1500 void
1501 pmap_page_init(vm_page_t m)
1502 {
1503
1504 TAILQ_INIT(&m->md.pv_list);
1505 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1506 }
1507
1508 static void
1509 pmap_init_asids(struct asid_set *set, int bits)
1510 {
1511 int i;
1512
1513 set->asid_bits = bits;
1514
1515 /*
1516 * We may be too early in the overall initialization process to use
1517 * bit_alloc().
1518 */
1519 set->asid_set_size = 1 << set->asid_bits;
1520 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1521 M_WAITOK | M_ZERO);
1522 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1523 bit_set(set->asid_set, i);
1524 set->asid_next = ASID_FIRST_AVAILABLE;
1525 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1526 }
1527
1528 static void
1529 pmap_init_pv_table(void)
1530 {
1531 struct vm_phys_seg *seg, *next_seg;
1532 struct pmap_large_md_page *pvd;
1533 vm_size_t s;
1534 int domain, i, j, pages;
1535
1536 /*
1537 * We strongly depend on the size being a power of two, so the assert
1538 * is overzealous. However, should the struct be resized to a
1539 * different power of two, the code below needs to be revisited.
1540 */
1541 CTASSERT((sizeof(*pvd) == 64));
1542
1543 /*
1544 * Calculate the size of the array.
1545 */
1546 s = 0;
1547 for (i = 0; i < vm_phys_nsegs; i++) {
1548 seg = &vm_phys_segs[i];
1549 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1550 pmap_l2_pindex(seg->start);
1551 s += round_page(pages * sizeof(*pvd));
1552 }
1553 pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1554 if (pv_table == NULL)
1555 panic("%s: kva_alloc failed\n", __func__);
1556
1557 /*
1558 * Iterate physical segments to allocate domain-local memory for PV
1559 * list headers.
1560 */
1561 pvd = pv_table;
1562 for (i = 0; i < vm_phys_nsegs; i++) {
1563 seg = &vm_phys_segs[i];
1564 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1565 pmap_l2_pindex(seg->start);
1566 domain = seg->domain;
1567
1568 s = round_page(pages * sizeof(*pvd));
1569
1570 for (j = 0; j < s; j += PAGE_SIZE) {
1571 vm_page_t m = vm_page_alloc_noobj_domain(domain,
1572 VM_ALLOC_ZERO);
1573 if (m == NULL)
1574 panic("failed to allocate PV table page");
1575 pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1576 }
1577
1578 for (j = 0; j < s / sizeof(*pvd); j++) {
1579 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1580 TAILQ_INIT(&pvd->pv_page.pv_list);
1581 pvd++;
1582 }
1583 }
1584 pvd = &pv_dummy_large;
1585 memset(pvd, 0, sizeof(*pvd));
1586 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1587 TAILQ_INIT(&pvd->pv_page.pv_list);
1588
1589 /*
1590 * Set pointers from vm_phys_segs to pv_table.
1591 */
1592 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1593 seg = &vm_phys_segs[i];
1594 seg->md_first = pvd;
1595 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1596 pmap_l2_pindex(seg->start);
1597
1598 /*
1599 * If there is a following segment, and the final
1600 * superpage of this segment and the initial superpage
1601 * of the next segment are the same then adjust the
1602 * pv_table entry for that next segment down by one so
1603 * that the pv_table entries will be shared.
1604 */
1605 if (i + 1 < vm_phys_nsegs) {
1606 next_seg = &vm_phys_segs[i + 1];
1607 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1608 pmap_l2_pindex(next_seg->start)) {
1609 pvd--;
1610 }
1611 }
1612 }
1613 }
1614
1615 /*
1616 * Initialize the pmap module.
1617 *
1618 * Called by vm_mem_init(), to initialize any structures that the pmap
1619 * system needs to map virtual memory.
1620 */
1621 void
1622 pmap_init(void)
1623 {
1624 uint64_t mmfr1;
1625 int i, vmid_bits;
1626
1627 /*
1628 * Are large page mappings enabled?
1629 */
1630 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1631 if (superpages_enabled) {
1632 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1633 ("pmap_init: can't assign to pagesizes[1]"));
1634 pagesizes[1] = L3C_SIZE;
1635 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1636 ("pmap_init: can't assign to pagesizes[2]"));
1637 pagesizes[2] = L2_SIZE;
1638 if (L1_BLOCKS_SUPPORTED) {
1639 KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
1640 ("pmap_init: can't assign to pagesizes[3]"));
1641 pagesizes[3] = L1_SIZE;
1642 }
1643 }
1644
1645 /*
1646 * Initialize the ASID allocator.
1647 */
1648 pmap_init_asids(&asids,
1649 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1650
1651 if (has_hyp()) {
1652 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1653 vmid_bits = 8;
1654
1655 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1656 ID_AA64MMFR1_VMIDBits_16)
1657 vmid_bits = 16;
1658 pmap_init_asids(&vmids, vmid_bits);
1659 }
1660
1661 /*
1662 * Initialize pv chunk lists.
1663 */
1664 for (i = 0; i < PMAP_MEMDOM; i++) {
1665 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1666 MTX_DEF);
1667 TAILQ_INIT(&pv_chunks[i].pvc_list);
1668 }
1669 pmap_init_pv_table();
1670
1671 vm_initialized = 1;
1672 }
1673
1674 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1675 "L1 (1GB/64GB) page mapping counters");
1676
1677 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
1678 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
1679 &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");
1680
1681 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1682 "L2C (32MB/1GB) page mapping counters");
1683
1684 static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
1685 SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
1686 &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1687
1688 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1689 "2MB page mapping counters");
1690
1691 static u_long pmap_l2_demotions;
1692 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1693 &pmap_l2_demotions, 0, "2MB page demotions");
1694
1695 static u_long pmap_l2_mappings;
1696 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1697 &pmap_l2_mappings, 0, "2MB page mappings");
1698
1699 static u_long pmap_l2_p_failures;
1700 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1701 &pmap_l2_p_failures, 0, "2MB page promotion failures");
1702
1703 static u_long pmap_l2_promotions;
1704 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1705 &pmap_l2_promotions, 0, "2MB page promotions");
1706
1707 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1708 "L3C (64KB/2MB) page mapping counters");
1709
1710 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1711 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1712 &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1713
1714 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1715 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1716 &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1717
1718 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1719 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1720 &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1721
1722 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1723 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1724 &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1725
1726 /*
1727 * If the given value for "final_only" is false, then any cached intermediate-
1728 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1729 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1730 * Otherwise, just the cached final-level entry is invalidated.
1731 */
1732 static __inline void
1733 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1734 {
1735 if (final_only)
1736 __asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1737 else
1738 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1739 }
1740
1741 static __inline void
1742 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1743 {
1744 if (final_only)
1745 __asm __volatile("tlbi vale1is, %0" : : "r" (r));
1746 else
1747 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1748 }
1749
1750 /*
1751 * Invalidates any cached final- and optionally intermediate-level TLB entries
1752 * for the specified virtual address in the given virtual address space.
1753 */
1754 static __inline void
1755 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1756 {
1757 uint64_t r;
1758
1759 PMAP_ASSERT_STAGE1(pmap);
1760
1761 dsb(ishst);
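	/*
	 * The dsb(ishst) above ensures that prior page table updates are
	 * visible to the table walkers before the TLB invalidation is issued.
	 */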
1762 r = TLBI_VA(va);
1763 if (pmap == kernel_pmap) {
1764 pmap_s1_invalidate_kernel(r, final_only);
1765 } else {
1766 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1767 pmap_s1_invalidate_user(r, final_only);
1768 }
1769 dsb(ish);
1770 isb();
1771 }
1772
1773 static __inline void
1774 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1775 {
1776 PMAP_ASSERT_STAGE2(pmap);
1777 MPASS(pmap_stage2_invalidate_range != NULL);
1778 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1779 final_only);
1780 }
1781
1782 static __inline void
1783 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1784 {
1785 if (pmap->pm_stage == PM_STAGE1)
1786 pmap_s1_invalidate_page(pmap, va, final_only);
1787 else
1788 pmap_s2_invalidate_page(pmap, va, final_only);
1789 }
1790
1791 /*
1792 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1793 * mappings. Otherwise, use stride L3_SIZE.
1794 */
1795 static __inline void
1796 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1797 vm_offset_t stride, bool final_only)
1798 {
1799 uint64_t end, r, start;
1800
1801 PMAP_ASSERT_STAGE1(pmap);
1802
1803 dsb(ishst);
1804 if (pmap == kernel_pmap) {
1805 start = TLBI_VA(sva);
1806 end = TLBI_VA(eva);
1807 for (r = start; r < end; r += TLBI_VA(stride))
1808 pmap_s1_invalidate_kernel(r, final_only);
1809 } else {
1810 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1811 start |= TLBI_VA(sva);
1812 end |= TLBI_VA(eva);
1813 for (r = start; r < end; r += TLBI_VA(stride))
1814 pmap_s1_invalidate_user(r, final_only);
1815 }
1816 dsb(ish);
1817 isb();
1818 }
1819
1820 /*
1821 * Invalidates any cached final- and optionally intermediate-level TLB entries
1822 * for the specified virtual address range in the given virtual address space.
1823 */
1824 static __inline void
1825 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1826 bool final_only)
1827 {
1828 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
1829 }
1830
1831 static __inline void
1832 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1833 bool final_only)
1834 {
1835 PMAP_ASSERT_STAGE2(pmap);
1836 MPASS(pmap_stage2_invalidate_range != NULL);
1837 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1838 }
1839
1840 static __inline void
1841 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1842 bool final_only)
1843 {
1844 if (pmap->pm_stage == PM_STAGE1)
1845 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1846 else
1847 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1848 }
1849
1850 /*
1851 * Invalidates all cached intermediate- and final-level TLB entries for the
1852 * given virtual address space.
1853 */
1854 static __inline void
1855 pmap_s1_invalidate_all(pmap_t pmap)
1856 {
1857 uint64_t r;
1858
1859 PMAP_ASSERT_STAGE1(pmap);
1860
1861 dsb(ishst);
1862 if (pmap == kernel_pmap) {
1863 __asm __volatile("tlbi vmalle1is");
1864 } else {
1865 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1866 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1867 }
1868 dsb(ish);
1869 isb();
1870 }
1871
1872 static __inline void
1873 pmap_s2_invalidate_all(pmap_t pmap)
1874 {
1875 PMAP_ASSERT_STAGE2(pmap);
1876 MPASS(pmap_stage2_invalidate_all != NULL);
1877 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1878 }
1879
1880 static __inline void
1881 pmap_invalidate_all(pmap_t pmap)
1882 {
1883 if (pmap->pm_stage == PM_STAGE1)
1884 pmap_s1_invalidate_all(pmap);
1885 else
1886 pmap_s2_invalidate_all(pmap);
1887 }
1888
1889 /*
1890 * Routine: pmap_extract
1891 * Function:
1892 * Extract the physical page address associated
1893 * with the given map/virtual_address pair.
1894 */
1895 vm_paddr_t
1896 pmap_extract(pmap_t pmap, vm_offset_t va)
1897 {
1898 pt_entry_t *pte, tpte;
1899 vm_paddr_t pa;
1900 int lvl;
1901
1902 pa = 0;
1903 PMAP_LOCK(pmap);
1904 /*
1905 * Find the block or page map for this virtual address. pmap_pte
1906 * will return either a valid block/page entry, or NULL.
1907 */
1908 pte = pmap_pte(pmap, va, &lvl);
1909 if (pte != NULL) {
1910 tpte = pmap_load(pte);
1911 pa = PTE_TO_PHYS(tpte);
1912 switch(lvl) {
1913 case 1:
1914 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1915 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1916 ("pmap_extract: Invalid L1 pte found: %lx",
1917 tpte & ATTR_DESCR_MASK));
1918 pa |= (va & L1_OFFSET);
1919 break;
1920 case 2:
1921 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1922 ("pmap_extract: Invalid L2 pte found: %lx",
1923 tpte & ATTR_DESCR_MASK));
1924 pa |= (va & L2_OFFSET);
1925 break;
1926 case 3:
1927 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1928 ("pmap_extract: Invalid L3 pte found: %lx",
1929 tpte & ATTR_DESCR_MASK));
1930 pa |= (va & L3_OFFSET);
1931 break;
1932 }
1933 }
1934 PMAP_UNLOCK(pmap);
1935 return (pa);
1936 }
1937
1938 /*
1939 * Routine: pmap_extract_and_hold
1940 * Function:
1941 * Atomically extract and hold the physical page
1942 * with the given pmap and virtual address pair
1943 * if that mapping permits the given protection.
1944 */
1945 vm_page_t
1946 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1947 {
1948 pt_entry_t *pte, tpte;
1949 vm_offset_t off;
1950 vm_page_t m;
1951 int lvl;
1952 bool use;
1953
1954 m = NULL;
1955 PMAP_LOCK(pmap);
1956 pte = pmap_pte(pmap, va, &lvl);
1957 if (pte != NULL) {
1958 tpte = pmap_load(pte);
1959
1960 KASSERT(lvl > 0 && lvl <= 3,
1961 ("pmap_extract_and_hold: Invalid level %d", lvl));
1962 /*
1963 * Check that the pte is either an L3 page, or an L1 or L2 block
1964 * entry. We can assume L1_BLOCK == L2_BLOCK.
1965 */
1966 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1967 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1968 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1969 tpte & ATTR_DESCR_MASK));
1970
1971 use = false;
1972 if ((prot & VM_PROT_WRITE) == 0)
1973 use = true;
1974 else if (pmap->pm_stage == PM_STAGE1 &&
1975 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1976 use = true;
1977 else if (pmap->pm_stage == PM_STAGE2 &&
1978 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1979 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1980 use = true;
1981
1982 if (use) {
1983 switch (lvl) {
1984 case 1:
1985 off = va & L1_OFFSET;
1986 break;
1987 case 2:
1988 off = va & L2_OFFSET;
1989 break;
1990 case 3:
1991 default:
1992 off = 0;
1993 }
1994 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
1995 if (m != NULL && !vm_page_wire_mapped(m))
1996 m = NULL;
1997 }
1998 }
1999 PMAP_UNLOCK(pmap);
2000 return (m);
2001 }
2002
2003 /*
2004 * Walks the page tables to translate a kernel virtual address to a
2005 * physical address. Returns true if the kva is valid and stores the
2006 * physical address in pa if it is not NULL.
2007 *
2008 * See the comment above data_abort() for the rationale for specifying
2009 * NO_PERTHREAD_SSP here.
2010 */
2011 bool NO_PERTHREAD_SSP
2012 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2013 {
2014 pt_entry_t *pte, tpte;
2015 register_t intr;
2016 uint64_t par;
2017
2018 /*
2019 * Disable interrupts so we don't get interrupted between asking
2020 * for address translation, and getting the result back.
2021 */
2022 intr = intr_disable();
2023 par = arm64_address_translate_s1e1r(va);
2024 intr_restore(intr);
2025
2026 if (PAR_SUCCESS(par)) {
2027 if (pa != NULL)
2028 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2029 return (true);
2030 }
2031
2032 /*
2033 * Fall back to walking the page table. The address translation
2034 * instruction may fail when the page is in a break-before-make
2035 * sequence. As we only clear the valid bit in said sequence we
2036 * can walk the page table to find the physical address.
2037 */
2038
2039 pte = pmap_l1(kernel_pmap, va);
2040 if (pte == NULL)
2041 return (false);
2042
2043 /*
2044 * A concurrent pmap_update_entry() will clear the entry's valid bit
2045 * but leave the rest of the entry unchanged. Therefore, we treat a
2046 * non-zero entry as being valid, and we ignore the valid bit when
2047 * determining whether the entry maps a block, page, or table.
2048 */
2049 tpte = pmap_load(pte);
2050 if (tpte == 0)
2051 return (false);
2052 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2053 if (pa != NULL)
2054 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2055 return (true);
2056 }
2057 pte = pmap_l1_to_l2(&tpte, va);
2058 tpte = pmap_load(pte);
2059 if (tpte == 0)
2060 return (false);
2061 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2062 if (pa != NULL)
2063 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2064 return (true);
2065 }
2066 pte = pmap_l2_to_l3(&tpte, va);
2067 tpte = pmap_load(pte);
2068 if (tpte == 0)
2069 return (false);
2070 if (pa != NULL)
2071 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2072 return (true);
2073 }
2074
2075 /*
2076 * Routine: pmap_kextract
2077 * Function:
2078 * Extract the physical page address associated with the given kernel
2079 * virtual address.
2080 */
2081 vm_paddr_t
2082 pmap_kextract(vm_offset_t va)
2083 {
2084 vm_paddr_t pa;
2085
2086 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2087 return (DMAP_TO_PHYS(va));
2088
2089 if (pmap_klookup(va, &pa) == false)
2090 return (0);
2091 return (pa);
2092 }
2093
2094 /***************************************************
2095 * Low level mapping routines.....
2096 ***************************************************/
2097
2098 void
2099 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2100 {
2101 pd_entry_t *pde;
2102 pt_entry_t attr, old_l3e, *pte;
2103 vm_offset_t va;
2104 vm_page_t mpte;
2105 int error, lvl;
2106
2107 KASSERT((pa & L3_OFFSET) == 0,
2108 ("pmap_kenter: Invalid physical address"));
2109 KASSERT((sva & L3_OFFSET) == 0,
2110 ("pmap_kenter: Invalid virtual address"));
2111 KASSERT((size & PAGE_MASK) == 0,
2112 ("pmap_kenter: Mapping is not page-sized"));
2113
2114 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2115 ATTR_KERN_GP | ATTR_S1_IDX(mode);
2116 old_l3e = 0;
2117 va = sva;
2118 while (size != 0) {
2119 pde = pmap_pde(kernel_pmap, va, &lvl);
2120 KASSERT(pde != NULL,
2121 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2122 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2123
2124 /*
2125 * If we have an aligned, contiguous chunk of L2_SIZE, try
2126 * to create an L2_BLOCK mapping.
2127 */
2128 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2129 (pa & L2_OFFSET) == 0 && vm_initialized) {
2130 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2131 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2132 ("pmap_kenter: Unexpected mapping"));
2133 PMAP_LOCK(kernel_pmap);
2134 error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2135 false);
2136 if (error == 0) {
2137 attr &= ~ATTR_CONTIGUOUS;
2138
2139 /*
2140 * Although the page table page "mpte" should
2141 * be devoid of mappings, the TLB might hold
2142 * intermediate entries that reference it, so
2143 * we perform a single-page invalidation.
2144 */
2145 pmap_update_entry(kernel_pmap, pde,
2146 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2147 PAGE_SIZE);
2148 }
2149 PMAP_UNLOCK(kernel_pmap);
2150 if (error == 0) {
2151 va += L2_SIZE;
2152 pa += L2_SIZE;
2153 size -= L2_SIZE;
2154 continue;
2155 }
2156 }
2157
2158 /*
2159 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2160 * L3 pages, set the contiguous bit within each PTE so that
2161 * the chunk can be cached using only one TLB entry.
2162 */
2163 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2164 if (size >= L3C_SIZE)
2165 attr |= ATTR_CONTIGUOUS;
2166 else
2167 attr &= ~ATTR_CONTIGUOUS;
2168 }
2169
2170 pte = pmap_l2_to_l3(pde, va);
2171 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2172 L3_PAGE);
2173
2174 va += PAGE_SIZE;
2175 pa += PAGE_SIZE;
2176 size -= PAGE_SIZE;
2177 }
2178 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2179 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2180 else {
2181 /*
2182 * Because the old entries were invalid and the new mappings
2183 * are not executable, an isb is not required.
2184 */
2185 dsb(ishst);
2186 }
2187 }
2188
2189 void
2190 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2191 {
2192
2193 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2194 }
2195
2196 /*
2197 * Remove a page from the kernel pagetables.
2198 */
2199 void
2200 pmap_kremove(vm_offset_t va)
2201 {
2202 pt_entry_t *pte;
2203
2204 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2205 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2206 ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2207 pmap_clear(pte);
2208 pmap_s1_invalidate_page(kernel_pmap, va, true);
2209 }
2210
2211 /*
2212 * Remove the specified range of mappings from the kernel address space.
2213 *
2214 * Should only be applied to mappings that were created by pmap_kenter() or
2215 * pmap_kenter_device(). Nothing about this function is actually specific
2216 * to device mappings.
2217 */
2218 void
2219 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2220 {
2221 pt_entry_t *ptep, *ptep_end;
2222 vm_offset_t va;
2223 int lvl;
2224
2225 KASSERT((sva & L3_OFFSET) == 0,
2226 ("pmap_kremove_device: Invalid virtual address"));
2227 KASSERT((size & PAGE_MASK) == 0,
2228 ("pmap_kremove_device: Mapping is not page-sized"));
2229
2230 va = sva;
2231 while (size != 0) {
2232 ptep = pmap_pte(kernel_pmap, va, &lvl);
2233 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2234 switch (lvl) {
2235 case 2:
2236 KASSERT((va & L2_OFFSET) == 0,
2237 ("Unaligned virtual address"));
2238 KASSERT(size >= L2_SIZE, ("Insufficient size"));
2239
2240 if (va != sva) {
2241 pmap_s1_invalidate_range(kernel_pmap, sva, va,
2242 true);
2243 }
2244 pmap_clear(ptep);
2245 pmap_s1_invalidate_page(kernel_pmap, va, true);
2246 PMAP_LOCK(kernel_pmap);
2247 pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2248 PMAP_UNLOCK(kernel_pmap);
2249
2250 va += L2_SIZE;
2251 sva = va;
2252 size -= L2_SIZE;
2253 break;
2254 case 3:
2255 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2256 KASSERT((va & L3C_OFFSET) == 0,
2257 ("Unaligned L3C virtual address"));
2258 KASSERT(size >= L3C_SIZE,
2259 ("Insufficient L3C size"));
2260
2261 ptep_end = ptep + L3C_ENTRIES;
2262 for (; ptep < ptep_end; ptep++)
2263 pmap_clear(ptep);
2264
2265 va += L3C_SIZE;
2266 size -= L3C_SIZE;
2267 break;
2268 }
2269 pmap_clear(ptep);
2270
2271 va += PAGE_SIZE;
2272 size -= PAGE_SIZE;
2273 break;
2274 default:
2275 __assert_unreachable();
2276 break;
2277 }
2278 }
2279 if (va != sva)
2280 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2281 }
2282
2283 /*
2284 * Used to map a range of physical addresses into kernel
2285 * virtual address space.
2286 *
2287 * The value passed in '*virt' is a suggested virtual address for
2288 * the mapping. Architectures which can support a direct-mapped
2289 * physical to virtual region can return the appropriate address
2290 * within that region, leaving '*virt' unchanged. Other
2291 * architectures should map the pages starting at '*virt' and
2292 * update '*virt' with the first usable address after the mapped
2293 * region.
2294 */
2295 vm_offset_t
2296 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2297 {
2298 return PHYS_TO_DMAP(start);
2299 }
2300
2301 /*
2302 * Add a list of wired pages to the kva.
2303 * This routine is only used for temporary
2304 * kernel mappings that do not need to have
2305 * page modification or references recorded.
2306 * Note that old mappings are simply written
2307 * over. The page *must* be wired.
2308 * Note: SMP coherent. Uses a ranged shootdown IPI.
2309 */
2310 void
2311 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2312 {
2313 pd_entry_t *pde;
2314 pt_entry_t attr, old_l3e, *pte;
2315 vm_offset_t va;
2316 vm_page_t m;
2317 int i, lvl;
2318
2319 old_l3e = 0;
2320 va = sva;
2321 for (i = 0; i < count; i++) {
2322 pde = pmap_pde(kernel_pmap, va, &lvl);
2323 KASSERT(pde != NULL,
2324 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2325 KASSERT(lvl == 2,
2326 ("pmap_qenter: Invalid level %d", lvl));
2327
2328 m = ma[i];
2329 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2330 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2331 pte = pmap_l2_to_l3(pde, va);
2332 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2333
2334 va += L3_SIZE;
2335 }
2336 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2337 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2338 else {
2339 /*
2340 * Because the old entries were invalid and the new mappings
2341 * are not executable, an isb is not required.
2342 */
2343 dsb(ishst);
2344 }
2345 }
2346
2347 /*
2348 * This routine tears out page mappings from the
2349 * kernel -- it is meant only for temporary mappings.
2350 */
2351 void
2352 pmap_qremove(vm_offset_t sva, int count)
2353 {
2354 pt_entry_t *pte;
2355 vm_offset_t va;
2356
2357 KASSERT(ADDR_IS_CANONICAL(sva),
2358 ("%s: Address not in canonical form: %lx", __func__, sva));
2359 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2360
2361 va = sva;
2362 while (count-- > 0) {
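		/*
		 * Passing a NULL function name to pmap_pte_exists() tolerates
		 * a missing L3 mapping instead of panicking, so any unmapped
		 * page in the range is simply skipped.
		 */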
2363 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2364 if (pte != NULL) {
2365 pmap_clear(pte);
2366 }
2367
2368 va += PAGE_SIZE;
2369 }
2370 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2371 }
2372
2373 /***************************************************
2374 * Page table page management routines.....
2375 ***************************************************/
2376 /*
2377 * Schedule the specified unused page table page to be freed. Specifically,
2378 * add the page to the specified list of pages that will be released to the
2379 * physical memory manager after the TLB has been updated.
2380 */
2381 static __inline void
2382 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2383 {
2384
2385 if (set_PG_ZERO)
2386 m->flags |= PG_ZERO;
2387 else
2388 m->flags &= ~PG_ZERO;
2389 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2390 }
2391
2392 /*
2393 * Decrements a page table page's reference count, which is used to record the
2394 * number of valid page table entries within the page. If the reference count
2395 * drops to zero, then the page table page is unmapped. Returns true if the
2396 * page table page was unmapped and false otherwise.
2397 */
2398 static inline bool
2399 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2400 {
2401
2402 --m->ref_count;
2403 if (m->ref_count == 0) {
2404 _pmap_unwire_l3(pmap, va, m, free);
2405 return (true);
2406 } else
2407 return (false);
2408 }
2409
2410 static void
2411 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2412 {
2413
2414 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2415 /*
2416 * unmap the page table page
2417 */
2418 if (m->pindex >= (NUL2E + NUL1E)) {
2419 /* l1 page */
2420 pd_entry_t *l0;
2421
2422 l0 = pmap_l0(pmap, va);
2423 pmap_clear(l0);
2424 } else if (m->pindex >= NUL2E) {
2425 /* l2 page */
2426 pd_entry_t *l1;
2427
2428 l1 = pmap_l1(pmap, va);
2429 pmap_clear(l1);
2430 } else {
2431 /* l3 page */
2432 pd_entry_t *l2;
2433
2434 l2 = pmap_l2(pmap, va);
2435 pmap_clear(l2);
2436 }
2437 pmap_resident_count_dec(pmap, 1);
2438 if (m->pindex < NUL2E) {
2439 /* We just released an l3, unhold the matching l2 */
2440 pd_entry_t *l1, tl1;
2441 vm_page_t l2pg;
2442
2443 l1 = pmap_l1(pmap, va);
2444 tl1 = pmap_load(l1);
2445 l2pg = PTE_TO_VM_PAGE(tl1);
2446 pmap_unwire_l3(pmap, va, l2pg, free);
2447 } else if (m->pindex < (NUL2E + NUL1E)) {
2448 /* We just released an l2, unhold the matching l1 */
2449 pd_entry_t *l0, tl0;
2450 vm_page_t l1pg;
2451
2452 l0 = pmap_l0(pmap, va);
2453 tl0 = pmap_load(l0);
2454 l1pg = PTE_TO_VM_PAGE(tl0);
2455 pmap_unwire_l3(pmap, va, l1pg, free);
2456 }
2457 pmap_invalidate_page(pmap, va, false);
2458
2459 /*
2460 * Put page on a list so that it is released after
2461 * *ALL* TLB shootdown is done
2462 */
2463 pmap_add_delayed_free_list(m, free, true);
2464 }
2465
2466 /*
2467 * After removing a page table entry, this routine is used to
2468 * conditionally free the page, and manage the reference count.
2469 */
2470 static int
2471 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2472 struct spglist *free)
2473 {
2474 vm_page_t mpte;
2475
2476 KASSERT(ADDR_IS_CANONICAL(va),
2477 ("%s: Address not in canonical form: %lx", __func__, va));
2478 if (ADDR_IS_KERNEL(va))
2479 return (0);
2480 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2481 mpte = PTE_TO_VM_PAGE(ptepde);
2482 return (pmap_unwire_l3(pmap, va, mpte, free));
2483 }
2484
2485 /*
2486 * Release a page table page reference after a failed attempt to create a
2487 * mapping.
2488 */
2489 static void
2490 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2491 {
2492 struct spglist free;
2493
2494 SLIST_INIT(&free);
2495 if (pmap_unwire_l3(pmap, va, mpte, &free))
2496 vm_page_free_pages_toq(&free, true);
2497 }
2498
2499 void
2500 pmap_pinit0(pmap_t pmap)
2501 {
2502
2503 PMAP_LOCK_INIT(pmap);
2504 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2505 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2506 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2507 TAILQ_INIT(&pmap->pm_pvchunk);
2508 vm_radix_init(&pmap->pm_root);
2509 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2510 pmap->pm_stage = PM_STAGE1;
2511 pmap->pm_levels = 4;
2512 pmap->pm_ttbr = pmap->pm_l0_paddr;
2513 pmap->pm_asid_set = &asids;
2514 pmap->pm_bti = NULL;
2515
2516 PCPU_SET(curpmap, pmap);
2517 }
2518
2519 int
2520 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2521 {
2522 vm_page_t m;
2523
2524 /*
2525 * allocate the l0 page
2526 */
2527 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2528 VM_ALLOC_ZERO);
2529 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2530 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2531
2532 TAILQ_INIT(&pmap->pm_pvchunk);
2533 vm_radix_init(&pmap->pm_root);
2534 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2535 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2536
2537 MPASS(levels == 3 || levels == 4);
2538 pmap->pm_levels = levels;
2539 pmap->pm_stage = stage;
2540 pmap->pm_bti = NULL;
2541 switch (stage) {
2542 case PM_STAGE1:
2543 pmap->pm_asid_set = &asids;
2544 if (pmap_bti_support) {
2545 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2546 M_ZERO | M_WAITOK);
2547 rangeset_init(pmap->pm_bti, bti_dup_range,
2548 bti_free_range, pmap, M_NOWAIT);
2549 }
2550 break;
2551 case PM_STAGE2:
2552 pmap->pm_asid_set = &vmids;
2553 break;
2554 default:
2555 panic("%s: Invalid pmap type %d", __func__, stage);
2556 break;
2557 }
2558
2559 /* XXX Temporarily disable deferred ASID allocation. */
2560 pmap_alloc_asid(pmap);
2561
2562 /*
2563 * Allocate the level 1 entry to use as the root. This will increase
2564 * the refcount on the level 1 page so it won't be removed until
2565 * pmap_release() is called.
2566 */
2567 if (pmap->pm_levels == 3) {
2568 PMAP_LOCK(pmap);
2569 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2570 PMAP_UNLOCK(pmap);
2571 }
2572 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2573
2574 return (1);
2575 }
2576
2577 int
2578 pmap_pinit(pmap_t pmap)
2579 {
2580
2581 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2582 }
2583
2584 /*
2585 * This routine is called if the desired page table page does not exist.
2586 *
2587 * If page table page allocation fails, this routine may sleep before
2588 * returning NULL. It sleeps only if a lock pointer was given.
2589 *
2590 * Note: If a page allocation fails at page table level two or three,
2591 * one or two pages may be held during the wait, only to be released
2592 * afterwards. This conservative approach is easily argued to avoid
2593 * race conditions.
2594 */
2595 static vm_page_t
2596 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2597 {
2598 vm_page_t m, l1pg, l2pg;
2599
2600 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2601
2602 /*
2603 * Allocate a page table page.
2604 */
2605 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2606 if (lockp != NULL) {
2607 RELEASE_PV_LIST_LOCK(lockp);
2608 PMAP_UNLOCK(pmap);
2609 vm_wait(NULL);
2610 PMAP_LOCK(pmap);
2611 }
2612
2613 /*
2614 * Indicate the need to retry. While waiting, the page table
2615 * page may have been allocated.
2616 */
2617 return (NULL);
2618 }
2619 m->pindex = ptepindex;
2620
2621 /*
2622 * Because of AArch64's weak memory consistency model, we must have a
2623 * barrier here to ensure that the stores for zeroing "m", whether by
2624 * pmap_zero_page() or an earlier function, are visible before adding
2625 * "m" to the page table. Otherwise, a page table walk by another
2626 * processor's MMU could see the mapping to "m" and a stale, non-zero
2627 * PTE within "m".
2628 */
2629 dmb(ishst);
2630
2631 /*
2632 * Map the pagetable page into the process address space, if
2633 * it isn't already there.
2634 */
2635
2636 if (ptepindex >= (NUL2E + NUL1E)) {
2637 pd_entry_t *l0p, l0e;
2638 vm_pindex_t l0index;
2639
2640 l0index = ptepindex - (NUL2E + NUL1E);
2641 l0p = &pmap->pm_l0[l0index];
2642 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2643 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2644 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2645
2646 /*
2647 * Mark all kernel memory as not accessible from userspace
2648 * and userspace memory as not executable from the kernel.
2649 * This has been done for the bootstrap L0 entries in
2650 * locore.S.
2651 */
2652 if (pmap == kernel_pmap)
2653 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2654 else
2655 l0e |= TATTR_PXN_TABLE;
2656 pmap_store(l0p, l0e);
2657 } else if (ptepindex >= NUL2E) {
2658 vm_pindex_t l0index, l1index;
2659 pd_entry_t *l0, *l1;
2660 pd_entry_t tl0;
2661
2662 l1index = ptepindex - NUL2E;
2663 l0index = l1index >> Ln_ENTRIES_SHIFT;
2664
2665 l0 = &pmap->pm_l0[l0index];
2666 tl0 = pmap_load(l0);
2667 if (tl0 == 0) {
2668 /* recurse for allocating page dir */
2669 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2670 lockp) == NULL) {
2671 vm_page_unwire_noq(m);
2672 vm_page_free_zero(m);
2673 return (NULL);
2674 }
2675 } else {
2676 l1pg = PTE_TO_VM_PAGE(tl0);
2677 l1pg->ref_count++;
2678 }
2679
2680 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2681 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2682 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2683 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2684 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2685 } else {
2686 vm_pindex_t l0index, l1index;
2687 pd_entry_t *l0, *l1, *l2;
2688 pd_entry_t tl0, tl1;
2689
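		/*
		 * This is an L3 page table page: derive the indices of the
		 * L0 and L1 entries that lead to it.
		 */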
2690 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2691 l0index = l1index >> Ln_ENTRIES_SHIFT;
2692
2693 l0 = &pmap->pm_l0[l0index];
2694 tl0 = pmap_load(l0);
2695 if (tl0 == 0) {
2696 /* recurse for allocating page dir */
2697 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2698 lockp) == NULL) {
2699 vm_page_unwire_noq(m);
2700 vm_page_free_zero(m);
2701 return (NULL);
2702 }
2703 tl0 = pmap_load(l0);
2704 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2705 l1 = &l1[l1index & Ln_ADDR_MASK];
2706 } else {
2707 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2708 l1 = &l1[l1index & Ln_ADDR_MASK];
2709 tl1 = pmap_load(l1);
2710 if (tl1 == 0) {
2711 /* recurse for allocating page dir */
2712 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2713 lockp) == NULL) {
2714 vm_page_unwire_noq(m);
2715 vm_page_free_zero(m);
2716 return (NULL);
2717 }
2718 } else {
2719 l2pg = PTE_TO_VM_PAGE(tl1);
2720 l2pg->ref_count++;
2721 }
2722 }
2723
2724 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2725 l2 = &l2[ptepindex & Ln_ADDR_MASK];
2726 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2727 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2728 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2729 }
2730
2731 pmap_resident_count_inc(pmap, 1);
2732
2733 return (m);
2734 }
2735
2736 static pd_entry_t *
2737 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2738 struct rwlock **lockp)
2739 {
2740 pd_entry_t *l1, *l2;
2741 vm_page_t l2pg;
2742 vm_pindex_t l2pindex;
2743
2744 KASSERT(ADDR_IS_CANONICAL(va),
2745 ("%s: Address not in canonical form: %lx", __func__, va));
2746
2747 retry:
2748 l1 = pmap_l1(pmap, va);
2749 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2750 l2 = pmap_l1_to_l2(l1, va);
2751 if (!ADDR_IS_KERNEL(va)) {
2752 /* Add a reference to the L2 page. */
2753 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2754 l2pg->ref_count++;
2755 } else
2756 l2pg = NULL;
2757 } else if (!ADDR_IS_KERNEL(va)) {
2758 /* Allocate a L2 page. */
2759 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2760 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2761 if (l2pg == NULL) {
2762 if (lockp != NULL)
2763 goto retry;
2764 else
2765 return (NULL);
2766 }
2767 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2768 l2 = &l2[pmap_l2_index(va)];
2769 } else
2770 panic("pmap_alloc_l2: missing page table page for va %#lx",
2771 va);
2772 *l2pgp = l2pg;
2773 return (l2);
2774 }
2775
2776 static vm_page_t
2777 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2778 {
2779 vm_pindex_t ptepindex;
2780 pd_entry_t *pde, tpde;
2781 #ifdef INVARIANTS
2782 pt_entry_t *pte;
2783 #endif
2784 vm_page_t m;
2785 int lvl;
2786
2787 /*
2788 * Calculate pagetable page index
2789 */
2790 ptepindex = pmap_l2_pindex(va);
2791 retry:
2792 /*
2793 * Get the page directory entry
2794 */
2795 pde = pmap_pde(pmap, va, &lvl);
2796
2797 /*
2798 * If the page table page is mapped, we just increment the hold count,
2799 * and activate it. If we get a level 2 pde it will point to a level 3
2800 * table.
2801 */
2802 switch (lvl) {
2803 case -1:
2804 break;
2805 case 0:
2806 #ifdef INVARIANTS
2807 pte = pmap_l0_to_l1(pde, va);
2808 KASSERT(pmap_load(pte) == 0,
2809 ("pmap_alloc_l3: TODO: l0 superpages"));
2810 #endif
2811 break;
2812 case 1:
2813 #ifdef INVARIANTS
2814 pte = pmap_l1_to_l2(pde, va);
2815 KASSERT(pmap_load(pte) == 0,
2816 ("pmap_alloc_l3: TODO: l1 superpages"));
2817 #endif
2818 break;
2819 case 2:
2820 tpde = pmap_load(pde);
2821 if (tpde != 0) {
2822 m = PTE_TO_VM_PAGE(tpde);
2823 m->ref_count++;
2824 return (m);
2825 }
2826 break;
2827 default:
2828 panic("pmap_alloc_l3: Invalid level %d", lvl);
2829 }
2830
2831 /*
2832 * Here if the pte page isn't mapped, or if it has been deallocated.
2833 */
2834 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2835 if (m == NULL && lockp != NULL)
2836 goto retry;
2837
2838 return (m);
2839 }
2840
2841 /***************************************************
2842 * Pmap allocation/deallocation routines.
2843 ***************************************************/
2844
2845 /*
2846 * Release any resources held by the given physical map.
2847 * Called when a pmap initialized by pmap_pinit is being released.
2848 * Should only be called if the map contains no valid mappings.
2849 */
2850 void
2851 pmap_release(pmap_t pmap)
2852 {
2853 bool rv __diagused;
2854 struct spglist freelist;
2855 struct asid_set *set;
2856 vm_page_t m;
2857 int asid;
2858
2859 if (pmap->pm_levels != 4) {
2860 PMAP_ASSERT_STAGE2(pmap);
2861 KASSERT(pmap->pm_stats.resident_count == 1,
2862 ("pmap_release: pmap resident count %ld != 0",
2863 pmap->pm_stats.resident_count));
2864 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2865 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2866
2867 SLIST_INIT(&freelist);
2868 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2869 PMAP_LOCK(pmap);
2870 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2871 PMAP_UNLOCK(pmap);
2872 MPASS(rv == true);
2873 vm_page_free_pages_toq(&freelist, true);
2874 }
2875
2876 KASSERT(pmap->pm_stats.resident_count == 0,
2877 ("pmap_release: pmap resident count %ld != 0",
2878 pmap->pm_stats.resident_count));
2879 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2880 ("pmap_release: pmap has reserved page table page(s)"));
2881
2882 set = pmap->pm_asid_set;
2883 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2884
2885 /*
2886 * Allow the ASID to be reused. For stage 2 VMIDs we don't invalidate
2887 * the entries when removing them, so we rely on a later TLB
2888 * invalidation; this will happen when updating the VMID generation.
2889 * Because of this we don't reuse VMIDs within a generation.
2890 */
2891 if (pmap->pm_stage == PM_STAGE1) {
2892 mtx_lock_spin(&set->asid_set_mutex);
2893 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2894 asid = COOKIE_TO_ASID(pmap->pm_cookie);
2895 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2896 asid < set->asid_set_size,
2897 ("pmap_release: pmap cookie has out-of-range asid"));
2898 bit_clear(set->asid_set, asid);
2899 }
2900 mtx_unlock_spin(&set->asid_set_mutex);
2901
2902 if (pmap->pm_bti != NULL) {
2903 rangeset_fini(pmap->pm_bti);
2904 free(pmap->pm_bti, M_DEVBUF);
2905 }
2906 }
2907
2908 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2909 vm_page_unwire_noq(m);
2910 vm_page_free_zero(m);
2911 }
2912
2913 static int
2914 kvm_size(SYSCTL_HANDLER_ARGS)
2915 {
2916 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2917
2918 return sysctl_handle_long(oidp, &ksize, 0, req);
2919 }
2920 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2921 0, 0, kvm_size, "LU",
2922 "Size of KVM");
2923
2924 static int
2925 kvm_free(SYSCTL_HANDLER_ARGS)
2926 {
2927 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2928
2929 return sysctl_handle_long(oidp, &kfree, 0, req);
2930 }
2931 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2932 0, 0, kvm_free, "LU",
2933 "Amount of KVM free");
2934
2935 /*
2936 * grow the number of kernel page table entries, if needed
2937 */
2938 void
2939 pmap_growkernel(vm_offset_t addr)
2940 {
2941 vm_page_t nkpg;
2942 pd_entry_t *l0, *l1, *l2;
2943
2944 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2945
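	/* The kernel page table is grown in whole L2 (2MB) steps. */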
2946 addr = roundup2(addr, L2_SIZE);
2947 if (addr - 1 >= vm_map_max(kernel_map))
2948 addr = vm_map_max(kernel_map);
2949 if (kernel_vm_end < addr) {
2950 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2951 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2952 }
2953 while (kernel_vm_end < addr) {
2954 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2955 KASSERT(pmap_load(l0) != 0,
2956 ("pmap_growkernel: No level 0 kernel entry"));
2957
2958 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2959 if (pmap_load(l1) == 0) {
2960 /* We need a new PDP entry */
2961 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2962 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2963 if (nkpg == NULL)
2964 panic("pmap_growkernel: no memory to grow kernel");
2965 nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2966 /* See the dmb() in _pmap_alloc_l3(). */
2967 dmb(ishst);
2968 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
2969 continue; /* try again */
2970 }
2971 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2972 if (pmap_load(l2) != 0) {
2973 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2974 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2975 kernel_vm_end = vm_map_max(kernel_map);
2976 break;
2977 }
2978 continue;
2979 }
2980
2981 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2982 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2983 if (nkpg == NULL)
2984 panic("pmap_growkernel: no memory to grow kernel");
2985 nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2986 /* See the dmb() in _pmap_alloc_l3(). */
2987 dmb(ishst);
2988 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
2989
2990 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2991 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2992 kernel_vm_end = vm_map_max(kernel_map);
2993 break;
2994 }
2995 }
2996 }
2997
2998 /***************************************************
2999 * page management routines.
3000 ***************************************************/
3001
3002 static const uint64_t pc_freemask[_NPCM] = {
3003 [0 ... _NPCM - 2] = PC_FREEN,
3004 [_NPCM - 1] = PC_FREEL
3005 };
3006
3007 #ifdef PV_STATS
3008 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3009
3010 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3011 "Current number of pv entry chunks");
3012 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3013 "Current number of pv entry chunks allocated");
3014 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3015 "Current number of pv entry chunks frees");
3016 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3017 "Number of times tried to get a chunk page but failed.");
3018
3019 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3020 static int pv_entry_spare;
3021
3022 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3023 "Current number of pv entry frees");
3024 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3025 "Current number of pv entry allocs");
3026 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3027 "Current number of pv entries");
3028 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3029 "Current number of spare pv entries");
3030 #endif
3031
3032 /*
3033 * We are in a serious low memory condition. Resort to
3034 * drastic measures to free some pages so we can allocate
3035 * another pv entry chunk.
3036 *
3037 * Returns NULL if PV entries were reclaimed from the specified pmap.
3038 *
3039 * We do not, however, unmap 2mpages because subsequent accesses will
3040 * allocate per-page pv entries until repromotion occurs, thereby
3041 * exacerbating the shortage of free pv entries.
3042 */
3043 static vm_page_t
3044 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3045 {
3046 struct pv_chunks_list *pvc;
3047 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3048 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3049 struct md_page *pvh;
3050 pd_entry_t *pde;
3051 pmap_t next_pmap, pmap;
3052 pt_entry_t *pte, tpte;
3053 pv_entry_t pv;
3054 vm_offset_t va;
3055 vm_page_t m, m_pc;
3056 struct spglist free;
3057 uint64_t inuse;
3058 int bit, field, freed, lvl;
3059
3060 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3061 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3062
3063 pmap = NULL;
3064 m_pc = NULL;
3065 SLIST_INIT(&free);
3066 bzero(&pc_marker_b, sizeof(pc_marker_b));
3067 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3068 pc_marker = (struct pv_chunk *)&pc_marker_b;
3069 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3070
3071 pvc = &pv_chunks[domain];
3072 mtx_lock(&pvc->pvc_lock);
3073 pvc->active_reclaims++;
3074 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3075 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3076 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3077 SLIST_EMPTY(&free)) {
3078 next_pmap = pc->pc_pmap;
3079 if (next_pmap == NULL) {
3080 /*
3081 * The next chunk is a marker. However, it is
3082 * not our marker, so active_reclaims must be
3083 * > 1. Consequently, the next_chunk code
3084 * will not rotate the pv_chunks list.
3085 */
3086 goto next_chunk;
3087 }
3088 mtx_unlock(&pvc->pvc_lock);
3089
3090 /*
3091 * A pv_chunk can only be removed from the pc_lru list
3092 * when both pvc->pvc_lock is owned and the
3093 * corresponding pmap is locked.
3094 */
3095 if (pmap != next_pmap) {
3096 if (pmap != NULL && pmap != locked_pmap)
3097 PMAP_UNLOCK(pmap);
3098 pmap = next_pmap;
3099 /* Avoid deadlock and lock recursion. */
3100 if (pmap > locked_pmap) {
3101 RELEASE_PV_LIST_LOCK(lockp);
3102 PMAP_LOCK(pmap);
3103 mtx_lock(&pvc->pvc_lock);
3104 continue;
3105 } else if (pmap != locked_pmap) {
3106 if (PMAP_TRYLOCK(pmap)) {
3107 mtx_lock(&pvc->pvc_lock);
3108 continue;
3109 } else {
3110 pmap = NULL; /* pmap is not locked */
3111 mtx_lock(&pvc->pvc_lock);
3112 pc = TAILQ_NEXT(pc_marker, pc_lru);
3113 if (pc == NULL ||
3114 pc->pc_pmap != next_pmap)
3115 continue;
3116 goto next_chunk;
3117 }
3118 }
3119 }
3120
3121 /*
3122 * Destroy every non-wired, 4 KB page mapping in the chunk.
3123 */
3124 freed = 0;
3125 for (field = 0; field < _NPCM; field++) {
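			/*
			 * Iterate over the in-use entries in this field,
			 * lowest-numbered bit first.
			 */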
3126 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3127 inuse != 0; inuse &= ~(1UL << bit)) {
3128 bit = ffsl(inuse) - 1;
3129 pv = &pc->pc_pventry[field * 64 + bit];
3130 va = pv->pv_va;
3131 pde = pmap_pde(pmap, va, &lvl);
3132 if (lvl != 2)
3133 continue;
3134 pte = pmap_l2_to_l3(pde, va);
3135 tpte = pmap_load(pte);
3136 if ((tpte & ATTR_SW_WIRED) != 0)
3137 continue;
3138 if ((tpte & ATTR_CONTIGUOUS) != 0)
3139 (void)pmap_demote_l3c(pmap, pte, va);
3140 tpte = pmap_load_clear(pte);
3141 m = PTE_TO_VM_PAGE(tpte);
3142 if (pmap_pte_dirty(pmap, tpte))
3143 vm_page_dirty(m);
3144 if ((tpte & ATTR_AF) != 0) {
3145 pmap_s1_invalidate_page(pmap, va, true);
3146 vm_page_aflag_set(m, PGA_REFERENCED);
3147 }
3148 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3149 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3150 m->md.pv_gen++;
3151 if (TAILQ_EMPTY(&m->md.pv_list) &&
3152 (m->flags & PG_FICTITIOUS) == 0) {
3153 pvh = page_to_pvh(m);
3154 if (TAILQ_EMPTY(&pvh->pv_list)) {
3155 vm_page_aflag_clear(m,
3156 PGA_WRITEABLE);
3157 }
3158 }
3159 pc->pc_map[field] |= 1UL << bit;
3160 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3161 freed++;
3162 }
3163 }
3164 if (freed == 0) {
3165 mtx_lock(&pvc->pvc_lock);
3166 goto next_chunk;
3167 }
3168 /* Every freed mapping is for a 4 KB page. */
3169 pmap_resident_count_dec(pmap, freed);
3170 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3171 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3172 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3173 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3174 if (pc_is_free(pc)) {
3175 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3176 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3177 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3178 /* Entire chunk is free; return it. */
3179 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3180 dump_drop_page(m_pc->phys_addr);
3181 mtx_lock(&pvc->pvc_lock);
3182 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3183 break;
3184 }
3185 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3186 mtx_lock(&pvc->pvc_lock);
3187 /* One freed pv entry in locked_pmap is sufficient. */
3188 if (pmap == locked_pmap)
3189 break;
3190
3191 next_chunk:
3192 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3193 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3194 if (pvc->active_reclaims == 1 && pmap != NULL) {
3195 /*
3196 * Rotate the pv chunks list so that we do not
3197 * scan the same pv chunks that could not be
3198 * freed (because they contained a wired
3199 * and/or superpage mapping) on every
3200 * invocation of reclaim_pv_chunk().
3201 */
3202 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3203 MPASS(pc->pc_pmap != NULL);
3204 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3205 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3206 }
3207 }
3208 }
3209 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3210 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3211 pvc->active_reclaims--;
3212 mtx_unlock(&pvc->pvc_lock);
3213 if (pmap != NULL && pmap != locked_pmap)
3214 PMAP_UNLOCK(pmap);
3215 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3216 m_pc = SLIST_FIRST(&free);
3217 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3218 /* Recycle a freed page table page. */
3219 m_pc->ref_count = 1;
3220 }
3221 vm_page_free_pages_toq(&free, true);
3222 return (m_pc);
3223 }
3224
3225 static vm_page_t
3226 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3227 {
3228 vm_page_t m;
3229 int i, domain;
3230
3231 domain = PCPU_GET(domain);
3232 for (i = 0; i < vm_ndomains; i++) {
3233 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3234 if (m != NULL)
3235 break;
3236 domain = (domain + 1) % vm_ndomains;
3237 }
3238
3239 return (m);
3240 }
3241
3242 /*
3243 * free the pv_entry back to the free list
3244 */
3245 static void
3246 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3247 {
3248 struct pv_chunk *pc;
3249 int idx, field, bit;
3250
3251 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3252 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3253 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3254 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3255 pc = pv_to_chunk(pv);
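	/* Locate the entry's bit within the chunk's free map and set it. */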
3256 idx = pv - &pc->pc_pventry[0];
3257 field = idx / 64;
3258 bit = idx % 64;
3259 pc->pc_map[field] |= 1ul << bit;
3260 if (!pc_is_free(pc)) {
3261 /* 98% of the time, pc is already at the head of the list. */
3262 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3263 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3264 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3265 }
3266 return;
3267 }
3268 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3269 free_pv_chunk(pc);
3270 }
3271
3272 static void
3273 free_pv_chunk_dequeued(struct pv_chunk *pc)
3274 {
3275 vm_page_t m;
3276
3277 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3278 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3279 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3280 /* entire chunk is free, return it */
3281 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3282 dump_drop_page(m->phys_addr);
3283 vm_page_unwire_noq(m);
3284 vm_page_free(m);
3285 }
3286
3287 static void
3288 free_pv_chunk(struct pv_chunk *pc)
3289 {
3290 struct pv_chunks_list *pvc;
3291
3292 pvc = &pv_chunks[pc_to_domain(pc)];
3293 mtx_lock(&pvc->pvc_lock);
3294 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3295 mtx_unlock(&pvc->pvc_lock);
3296 free_pv_chunk_dequeued(pc);
3297 }
3298
3299 static void
3300 free_pv_chunk_batch(struct pv_chunklist *batch)
3301 {
3302 struct pv_chunks_list *pvc;
3303 struct pv_chunk *pc, *npc;
3304 int i;
3305
3306 for (i = 0; i < vm_ndomains; i++) {
3307 if (TAILQ_EMPTY(&batch[i]))
3308 continue;
3309 pvc = &pv_chunks[i];
3310 mtx_lock(&pvc->pvc_lock);
3311 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3312 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3313 }
3314 mtx_unlock(&pvc->pvc_lock);
3315 }
3316
3317 for (i = 0; i < vm_ndomains; i++) {
3318 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3319 free_pv_chunk_dequeued(pc);
3320 }
3321 }
3322 }
3323
3324 /*
3325 * Returns a new PV entry, allocating a new PV chunk from the system when
3326 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3327 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3328 * returned.
3329 *
3330 * The given PV list lock may be released.
3331 */
3332 static pv_entry_t
3333 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3334 {
3335 struct pv_chunks_list *pvc;
3336 int bit, field;
3337 pv_entry_t pv;
3338 struct pv_chunk *pc;
3339 vm_page_t m;
3340
3341 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3342 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3343 retry:
3344 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3345 if (pc != NULL) {
3346 for (field = 0; field < _NPCM; field++) {
3347 if (pc->pc_map[field]) {
3348 bit = ffsl(pc->pc_map[field]) - 1;
3349 break;
3350 }
3351 }
3352 if (field < _NPCM) {
3353 pv = &pc->pc_pventry[field * 64 + bit];
3354 pc->pc_map[field] &= ~(1ul << bit);
3355 /* If this was the last item, move it to tail */
3356 if (pc_is_full(pc)) {
3357 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3358 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3359 pc_list);
3360 }
3361 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3362 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3363 return (pv);
3364 }
3365 }
3366 /* No free items, allocate another chunk */
3367 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3368 if (m == NULL) {
3369 if (lockp == NULL) {
3370 PV_STAT(pc_chunk_tryfail++);
3371 return (NULL);
3372 }
3373 m = reclaim_pv_chunk(pmap, lockp);
3374 if (m == NULL)
3375 goto retry;
3376 }
3377 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3378 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3379 dump_add_page(m->phys_addr);
3380 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3381 pc->pc_pmap = pmap;
3382 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3383 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3384 pvc = &pv_chunks[vm_page_domain(m)];
3385 mtx_lock(&pvc->pvc_lock);
3386 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3387 mtx_unlock(&pvc->pvc_lock);
3388 pv = &pc->pc_pventry[0];
3389 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3390 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3391 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3392 return (pv);
3393 }
3394
3395 /*
3396 * Ensure that the number of spare PV entries in the specified pmap meets or
3397 * exceeds the given count, "needed".
3398 *
3399 * The given PV list lock may be released.
3400 */
3401 static void
3402 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3403 {
3404 struct pv_chunks_list *pvc;
3405 struct pch new_tail[PMAP_MEMDOM];
3406 struct pv_chunk *pc;
3407 vm_page_t m;
3408 int avail, free, i;
3409 bool reclaimed;
3410
3411 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3412 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3413
3414 /*
3415 * Newly allocated PV chunks must be stored in a private list until
3416 * the required number of PV chunks have been allocated. Otherwise,
3417 * reclaim_pv_chunk() could recycle one of these chunks. In
3418 * contrast, these chunks must be added to the pmap upon allocation.
3419 */
3420 for (i = 0; i < PMAP_MEMDOM; i++)
3421 TAILQ_INIT(&new_tail[i]);
3422 retry:
3423 avail = 0;
3424 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3425 bit_count((bitstr_t *)pc->pc_map, 0,
3426 sizeof(pc->pc_map) * NBBY, &free);
3427 if (free == 0)
3428 break;
3429 avail += free;
3430 if (avail >= needed)
3431 break;
3432 }
3433 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3434 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3435 if (m == NULL) {
3436 m = reclaim_pv_chunk(pmap, lockp);
3437 if (m == NULL)
3438 goto retry;
3439 reclaimed = true;
3440 }
3441 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3442 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3443 dump_add_page(m->phys_addr);
3444 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3445 pc->pc_pmap = pmap;
3446 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3447 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3448 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3449 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3450
3451 /*
3452 * The reclaim might have freed a chunk from the current pmap.
3453 * If that chunk contained available entries, we need to
3454 * re-count the number of available entries.
3455 */
3456 if (reclaimed)
3457 goto retry;
3458 }
3459 for (i = 0; i < vm_ndomains; i++) {
3460 if (TAILQ_EMPTY(&new_tail[i]))
3461 continue;
3462 pvc = &pv_chunks[i];
3463 mtx_lock(&pvc->pvc_lock);
3464 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3465 mtx_unlock(&pvc->pvc_lock);
3466 }
3467 }
3468
3469 /*
3470 * First find and then remove the pv entry for the specified pmap and virtual
3471 * address from the specified pv list. Returns the pv entry if found and NULL
3472 * otherwise. This operation can be performed on pv lists for either 4KB or
3473 * 2MB page mappings.
3474 */
3475 static __inline pv_entry_t
3476 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3477 {
3478 pv_entry_t pv;
3479
3480 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3481 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3482 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3483 pvh->pv_gen++;
3484 break;
3485 }
3486 }
3487 return (pv);
3488 }
3489
3490 /*
3491 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3492 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3493 * entries for each of the 4KB page mappings.
3494 */
3495 static void
3496 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3497 struct rwlock **lockp)
3498 {
3499 struct md_page *pvh;
3500 struct pv_chunk *pc;
3501 pv_entry_t pv;
3502 vm_offset_t va_last;
3503 vm_page_t m;
3504 int bit, field;
3505
3506 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3507 KASSERT((va & L2_OFFSET) == 0,
3508 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3509 KASSERT((pa & L2_OFFSET) == 0,
3510 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3511 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3512
3513 /*
3514 * Transfer the 2mpage's pv entry for this mapping to the first
3515 * page's pv list. Once this transfer begins, the pv list lock
3516 * must not be released until the last pv entry is reinstantiated.
3517 */
3518 pvh = pa_to_pvh(pa);
3519 pv = pmap_pvh_remove(pvh, pmap, va);
3520 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3521 m = PHYS_TO_VM_PAGE(pa);
3522 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3523 m->md.pv_gen++;
3524 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3525 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3526 va_last = va + L2_SIZE - PAGE_SIZE;
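/*
 * Consume spare PV entries from the pmap's chunk list, one for each of the
 * remaining 4KB pages.  The caller must already have reserved enough spare
 * entries (cf. reserve_pv_entries()), so the chunk at the head of the list
 * is never full.
 */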
3527 for (;;) {
3528 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3529 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3530 for (field = 0; field < _NPCM; field++) {
3531 while (pc->pc_map[field]) {
3532 bit = ffsl(pc->pc_map[field]) - 1;
3533 pc->pc_map[field] &= ~(1ul << bit);
3534 pv = &pc->pc_pventry[field * 64 + bit];
3535 va += PAGE_SIZE;
3536 pv->pv_va = va;
3537 m++;
3538 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3539 ("pmap_pv_demote_l2: page %p is not managed", m));
3540 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3541 m->md.pv_gen++;
3542 if (va == va_last)
3543 goto out;
3544 }
3545 }
3546 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3547 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3548 }
3549 out:
3550 if (pc_is_full(pc)) {
3551 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3552 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3553 }
3554 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3555 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3556 }
3557
3558 /*
3559 * First find and then destroy the pv entry for the specified pmap and virtual
3560 * address. This operation can be performed on pv lists for either 4KB or 2MB
3561 * page mappings.
3562 */
3563 static void
3564 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3565 {
3566 pv_entry_t pv;
3567
3568 pv = pmap_pvh_remove(pvh, pmap, va);
3569 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3570 free_pv_entry(pmap, pv);
3571 }
3572
3573 /*
3574 * Conditionally create the PV entry for a 4KB page mapping if the required
3575 * memory can be allocated without resorting to reclamation.
3576 */
3577 static bool
3578 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3579 struct rwlock **lockp)
3580 {
3581 pv_entry_t pv;
3582
3583 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3584 /* Pass NULL instead of the lock pointer to disable reclamation. */
3585 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3586 pv->pv_va = va;
3587 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3588 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3589 m->md.pv_gen++;
3590 return (true);
3591 } else
3592 return (false);
3593 }
3594
3595 /*
3596 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3597 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3598 * false if the PV entry cannot be allocated without resorting to reclamation.
3599 */
3600 static bool
3601 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3602 struct rwlock **lockp)
3603 {
3604 struct md_page *pvh;
3605 pv_entry_t pv;
3606 vm_paddr_t pa;
3607
3608 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3609 /* Pass NULL instead of the lock pointer to disable reclamation. */
3610 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3611 NULL : lockp)) == NULL)
3612 return (false);
3613 pv->pv_va = va;
3614 pa = PTE_TO_PHYS(l2e);
3615 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3616 pvh = pa_to_pvh(pa);
3617 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3618 pvh->pv_gen++;
3619 return (true);
3620 }
3621
3622 /*
3623 * Conditionally creates the PV entries for a L3C superpage mapping if
3624 * the required memory can be allocated without resorting to reclamation.
3625 */
3626 static bool
3627 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3628 struct rwlock **lockp)
3629 {
3630 pv_entry_t pv;
3631 vm_offset_t tva;
3632 vm_paddr_t pa __diagused;
3633 vm_page_t mt;
3634
3635 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3636 KASSERT((va & L3C_OFFSET) == 0,
3637 ("pmap_pv_insert_l3c: va is not aligned"));
3638 pa = VM_PAGE_TO_PHYS(m);
3639 KASSERT((pa & L3C_OFFSET) == 0,
3640 ("pmap_pv_insert_l3c: pa is not aligned"));
3641 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3642 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3643 /* Pass NULL instead of lockp to disable reclamation. */
3644 pv = get_pv_entry(pmap, NULL);
3645 if (__predict_false(pv == NULL)) {
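/*
 * Allocation failed without reclamation; unwind the PV entries
 * already created for the preceding pages of this L3C run.
 */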
3646 while (tva > va) {
3647 mt--;
3648 tva -= L3_SIZE;
3649 pmap_pvh_free(&mt->md, pmap, tva);
3650 }
3651 return (false);
3652 }
3653 pv->pv_va = tva;
3654 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3655 mt->md.pv_gen++;
3656 }
3657 return (true);
3658 }
3659
3660 static void
3661 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3662 {
3663 pt_entry_t newl2, oldl2 __diagused;
3664 vm_page_t ml3;
3665 vm_paddr_t ml3pa;
3666
3667 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3668 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3669 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3670
3671 ml3 = pmap_remove_pt_page(pmap, va);
3672 if (ml3 == NULL)
3673 panic("pmap_remove_kernel_l2: Missing pt page");
3674
3675 ml3pa = VM_PAGE_TO_PHYS(ml3);
3676 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3677
3678 /*
3679 * If this page table page was unmapped by a promotion, then it
3680 * contains valid mappings. Zero it to invalidate those mappings.
3681 */
3682 if (vm_page_any_valid(ml3))
3683 pagezero((void *)PHYS_TO_DMAP(ml3pa));
3684
3685 /*
3686 * Demote the mapping. The caller must have already invalidated the
3687 * mapping (i.e., the "break" in break-before-make).
3688 */
3689 oldl2 = pmap_load_store(l2, newl2);
3690 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3691 __func__, l2, oldl2));
3692 }
3693
3694 /*
3695 * pmap_remove_l2: Remove a level 2 superpage mapping.
3696 */
3697 static int
3698 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3699 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3700 {
3701 struct md_page *pvh;
3702 pt_entry_t old_l2;
3703 vm_page_t m, ml3, mt;
3704
3705 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3706 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3707 old_l2 = pmap_load_clear(l2);
3708 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3709 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3710
3711 /*
3712 * Since a promotion must break the 4KB page mappings before making
3713 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3714 */
3715 pmap_s1_invalidate_page(pmap, sva, true);
3716
3717 if (old_l2 & ATTR_SW_WIRED)
3718 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3719 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3720 if (old_l2 & ATTR_SW_MANAGED) {
3721 m = PTE_TO_VM_PAGE(old_l2);
3722 pvh = page_to_pvh(m);
3723 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3724 pmap_pvh_free(pvh, pmap, sva);
3725 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3726 if (pmap_pte_dirty(pmap, old_l2))
3727 vm_page_dirty(mt);
3728 if (old_l2 & ATTR_AF)
3729 vm_page_aflag_set(mt, PGA_REFERENCED);
3730 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3731 TAILQ_EMPTY(&pvh->pv_list))
3732 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3733 }
3734 }
3735 if (pmap == kernel_pmap) {
3736 pmap_remove_kernel_l2(pmap, l2, sva);
3737 } else {
3738 ml3 = pmap_remove_pt_page(pmap, sva);
3739 if (ml3 != NULL) {
3740 KASSERT(vm_page_any_valid(ml3),
3741 ("pmap_remove_l2: l3 page not promoted"));
3742 pmap_resident_count_dec(pmap, 1);
3743 KASSERT(ml3->ref_count == NL3PG,
3744 ("pmap_remove_l2: l3 page ref count error"));
3745 ml3->ref_count = 0;
3746 pmap_add_delayed_free_list(ml3, free, false);
3747 }
3748 }
3749 return (pmap_unuse_pt(pmap, sva, l1e, free));
3750 }
3751
3752 /*
3753 * pmap_remove_l3: Remove a single 4KB page mapping from a process's pmap.
3754 */
3755 static int
3756 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3757 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3758 {
3759 struct md_page *pvh;
3760 pt_entry_t old_l3;
3761 vm_page_t m;
3762
3763 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3764 old_l3 = pmap_load(l3);
3765 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3766 (void)pmap_demote_l3c(pmap, l3, va);
3767 old_l3 = pmap_load_clear(l3);
3768 pmap_s1_invalidate_page(pmap, va, true);
3769 if (old_l3 & ATTR_SW_WIRED)
3770 pmap->pm_stats.wired_count -= 1;
3771 pmap_resident_count_dec(pmap, 1);
3772 if (old_l3 & ATTR_SW_MANAGED) {
3773 m = PTE_TO_VM_PAGE(old_l3);
3774 if (pmap_pte_dirty(pmap, old_l3))
3775 vm_page_dirty(m);
3776 if (old_l3 & ATTR_AF)
3777 vm_page_aflag_set(m, PGA_REFERENCED);
3778 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3779 pmap_pvh_free(&m->md, pmap, va);
3780 if (TAILQ_EMPTY(&m->md.pv_list) &&
3781 (m->flags & PG_FICTITIOUS) == 0) {
3782 pvh = page_to_pvh(m);
3783 if (TAILQ_EMPTY(&pvh->pv_list))
3784 vm_page_aflag_clear(m, PGA_WRITEABLE);
3785 }
3786 }
3787 return (pmap_unuse_pt(pmap, va, l2e, free));
3788 }
3789
3790 /*
3791 * Removes the specified L3C superpage mapping. Requests TLB invalidations
3792 * to be performed by the caller through the returned "*vap". Returns true
3793 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3794 * Otherwise, returns false.
3795 */
3796 static bool
3797 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3798 vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3799 struct rwlock **lockp)
3800 {
3801 struct md_page *pvh;
3802 struct rwlock *new_lock;
3803 pt_entry_t first_l3e, l3e, *tl3p;
3804 vm_offset_t tva;
3805 vm_page_t m, mt;
3806
3807 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3808 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3809 0, ("pmap_remove_l3c: l3p is not aligned"));
3810 KASSERT((va & L3C_OFFSET) == 0,
3811 ("pmap_remove_l3c: va is not aligned"));
3812
3813 /*
3814 * Hardware accessed and dirty bit maintenance might only update a
3815 * single L3 entry, so we must combine the accessed and dirty bits
3816 * from this entire set of contiguous L3 entries.
3817 */
3818 first_l3e = pmap_load_clear(l3p);
3819 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3820 l3e = pmap_load_clear(tl3p);
3821 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3822 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3823 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3824 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3825 first_l3e &= ~ATTR_S1_AP_RW_BIT;
3826 first_l3e |= l3e & ATTR_AF;
3827 }
3828 if ((first_l3e & ATTR_SW_WIRED) != 0)
3829 pmap->pm_stats.wired_count -= L3C_ENTRIES;
3830 pmap_resident_count_dec(pmap, L3C_ENTRIES);
3831 if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3832 m = PTE_TO_VM_PAGE(first_l3e);
3833 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3834 if (new_lock != *lockp) {
3835 if (*lockp != NULL) {
3836 /*
3837 * Pending TLB invalidations must be
3838 * performed before the PV list lock is
3839 * released. Otherwise, a concurrent
3840 * pmap_remove_all() on a physical page
3841 * could return while a stale TLB entry
3842 * still provides access to that page.
3843 */
3844 if (*vap != va_next) {
3845 pmap_invalidate_range(pmap, *vap, va,
3846 true);
3847 *vap = va_next;
3848 }
3849 rw_wunlock(*lockp);
3850 }
3851 *lockp = new_lock;
3852 rw_wlock(*lockp);
3853 }
3854 pvh = page_to_pvh(m);
3855 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3856 L3_SIZE) {
3857 if (pmap_pte_dirty(pmap, first_l3e))
3858 vm_page_dirty(mt);
3859 if ((first_l3e & ATTR_AF) != 0)
3860 vm_page_aflag_set(mt, PGA_REFERENCED);
3861 pmap_pvh_free(&mt->md, pmap, tva);
3862 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3863 TAILQ_EMPTY(&pvh->pv_list))
3864 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3865 }
3866 }
3867 if (*vap == va_next)
3868 *vap = va;
3869 if (ml3 != NULL) {
3870 ml3->ref_count -= L3C_ENTRIES;
3871 if (ml3->ref_count == 0) {
3872 _pmap_unwire_l3(pmap, va, ml3, free);
3873 return (true);
3874 }
3875 }
3876 return (false);
3877 }
3878
3879 /*
3880 * Remove the specified range of addresses from the L3 page table that is
3881 * identified by the given L2 entry.
3882 */
3883 static void
3884 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3885 vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3886 {
3887 struct md_page *pvh;
3888 struct rwlock *new_lock;
3889 pt_entry_t *l3, old_l3;
3890 vm_offset_t va;
3891 vm_page_t l3pg, m;
3892
3893 KASSERT(ADDR_IS_CANONICAL(sva),
3894 ("%s: Start address not in canonical form: %lx", __func__, sva));
3895 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3896 ("%s: End address not in canonical form: %lx", __func__, eva));
3897
3898 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3899 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3900 ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3901 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
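/*
 * Throughout the loop below, "va" tracks the start of a pending TLB
 * invalidation range; "va == eva" means that no invalidation is
 * currently pending.
 */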
3902 va = eva;
3903 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3904 old_l3 = pmap_load(l3);
3905 if (!pmap_l3_valid(old_l3)) {
3906 if (va != eva) {
3907 pmap_invalidate_range(pmap, va, sva, true);
3908 va = eva;
3909 }
3910 continue;
3911 }
3912 if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3913 /*
3914 * Is this entire set of contiguous L3 entries being
3915 * removed? Handle the possibility that "eva" is zero
3916 * because of address wraparound.
3917 */
3918 if ((sva & L3C_OFFSET) == 0 &&
3919 sva + L3C_OFFSET <= eva - 1) {
3920 if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
3921 l3pg, free, lockp)) {
3922 /* The L3 table was unmapped. */
3923 sva += L3C_SIZE;
3924 break;
3925 }
3926 l3 += L3C_ENTRIES - 1;
3927 sva += L3C_SIZE - L3_SIZE;
3928 continue;
3929 }
3930
3931 (void)pmap_demote_l3c(pmap, l3, sva);
3932 }
3933 old_l3 = pmap_load_clear(l3);
3934 if ((old_l3 & ATTR_SW_WIRED) != 0)
3935 pmap->pm_stats.wired_count--;
3936 pmap_resident_count_dec(pmap, 1);
3937 if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3938 m = PTE_TO_VM_PAGE(old_l3);
3939 if (pmap_pte_dirty(pmap, old_l3))
3940 vm_page_dirty(m);
3941 if ((old_l3 & ATTR_AF) != 0)
3942 vm_page_aflag_set(m, PGA_REFERENCED);
3943 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3944 if (new_lock != *lockp) {
3945 if (*lockp != NULL) {
3946 /*
3947 * Pending TLB invalidations must be
3948 * performed before the PV list lock is
3949 * released. Otherwise, a concurrent
3950 * pmap_remove_all() on a physical page
3951 * could return while a stale TLB entry
3952 * still provides access to that page.
3953 */
3954 if (va != eva) {
3955 pmap_invalidate_range(pmap, va,
3956 sva, true);
3957 va = eva;
3958 }
3959 rw_wunlock(*lockp);
3960 }
3961 *lockp = new_lock;
3962 rw_wlock(*lockp);
3963 }
3964 pmap_pvh_free(&m->md, pmap, sva);
3965 if (TAILQ_EMPTY(&m->md.pv_list) &&
3966 (m->flags & PG_FICTITIOUS) == 0) {
3967 pvh = page_to_pvh(m);
3968 if (TAILQ_EMPTY(&pvh->pv_list))
3969 vm_page_aflag_clear(m, PGA_WRITEABLE);
3970 }
3971 }
3972 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3973 /*
3974 * _pmap_unwire_l3() has already invalidated the TLB
3975 * entries at all levels for "sva". So, we need not
3976 * perform "sva += L3_SIZE;" here. Moreover, we need
3977 * not perform "va = sva;" if "sva" is at the start
3978 * of a new valid range consisting of a single page.
3979 */
3980 break;
3981 }
3982 if (va == eva)
3983 va = sva;
3984 }
3985 if (va != eva)
3986 pmap_invalidate_range(pmap, va, sva, true);
3987 }
3988
3989 static void
3990 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
3991 {
3992 struct rwlock *lock;
3993 vm_offset_t va_next;
3994 pd_entry_t *l0, *l1, *l2;
3995 pt_entry_t l3_paddr;
3996 struct spglist free;
3997
3998 /*
3999 * Perform an unsynchronized read. This is, however, safe.
4000 */
4001 if (pmap->pm_stats.resident_count == 0)
4002 return;
4003
4004 SLIST_INIT(&free);
4005
4006 PMAP_LOCK(pmap);
4007 if (map_delete)
4008 pmap_bti_on_remove(pmap, sva, eva);
4009
4010 lock = NULL;
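/*
 * Walk the page tables one L2 (2MB) region at a time, skipping regions
 * whose upper-level entries are empty and handling L1 and L2 block
 * mappings before descending to the individual L3 entries.
 */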
4011 for (; sva < eva; sva = va_next) {
4012 if (pmap->pm_stats.resident_count == 0)
4013 break;
4014
4015 l0 = pmap_l0(pmap, sva);
4016 if (pmap_load(l0) == 0) {
4017 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4018 if (va_next < sva)
4019 va_next = eva;
4020 continue;
4021 }
4022
4023 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4024 if (va_next < sva)
4025 va_next = eva;
4026 l1 = pmap_l0_to_l1(l0, sva);
4027 if (pmap_load(l1) == 0)
4028 continue;
4029 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4030 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4031 KASSERT(va_next <= eva,
4032 ("partial update of non-transparent 1G page "
4033 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4034 pmap_load(l1), sva, eva, va_next));
4035 MPASS(pmap != kernel_pmap);
4036 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4037 pmap_clear(l1);
4038 pmap_s1_invalidate_page(pmap, sva, true);
4039 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4040 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4041 continue;
4042 }
4043
4044 /*
4045 * Calculate index for next page table.
4046 */
4047 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4048 if (va_next < sva)
4049 va_next = eva;
4050
4051 l2 = pmap_l1_to_l2(l1, sva);
4052 if (l2 == NULL)
4053 continue;
4054
4055 l3_paddr = pmap_load(l2);
4056
4057 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4058 if (sva + L2_SIZE == va_next && eva >= va_next) {
4059 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4060 &free, &lock);
4061 continue;
4062 } else if (pmap_demote_l2_locked(pmap, l2, sva,
4063 &lock) == NULL)
4064 continue;
4065 l3_paddr = pmap_load(l2);
4066 }
4067
4068 /*
4069 * Weed out invalid mappings.
4070 */
4071 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4072 continue;
4073
4074 /*
4075 * Limit our scan to either the end of the va represented
4076 * by the current page table page, or to the end of the
4077 * range being removed.
4078 */
4079 if (va_next > eva)
4080 va_next = eva;
4081
4082 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4083 &lock);
4084 }
4085 if (lock != NULL)
4086 rw_wunlock(lock);
4087 PMAP_UNLOCK(pmap);
4088 vm_page_free_pages_toq(&free, true);
4089 }
4090
4091 /*
4092 * Remove the given range of addresses from the specified map.
4093 *
4094 * It is assumed that the start and end are properly
4095 * rounded to the page size.
4096 */
4097 void
4098 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4099 {
4100 pmap_remove1(pmap, sva, eva, false);
4101 }
4102
4103 /*
4104 * Remove the given range of addresses as part of a logical unmap
4105 * operation. This has the effect of calling pmap_remove(), but
4106 * also clears any metadata that should persist for the lifetime
4107 * of a logical mapping.
4108 */
4109 void
4110 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4111 {
4112 pmap_remove1(pmap, sva, eva, true);
4113 }
4114
4115 /*
4116 * Routine: pmap_remove_all
4117 * Function:
4118 * Removes this physical page from
4119 * all physical maps in which it resides.
4120 * Reflects back modify bits to the pager.
4121 *
4122 * Notes:
4123 * Original versions of this routine were very
4124 * inefficient because they iteratively called
4125 * pmap_remove (slow...)
4126 */
4127
4128 void
4129 pmap_remove_all(vm_page_t m)
4130 {
4131 struct md_page *pvh;
4132 pv_entry_t pv;
4133 pmap_t pmap;
4134 struct rwlock *lock;
4135 pd_entry_t *pde, tpde;
4136 pt_entry_t *pte, tpte;
4137 vm_offset_t va;
4138 struct spglist free;
4139 int lvl, pvh_gen, md_gen;
4140
4141 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4142 ("pmap_remove_all: page %p is not managed", m));
4143 SLIST_INIT(&free);
4144 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4145 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4146 rw_wlock(lock);
4147 retry:
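/*
 * First, demote any 2MB mappings of the page, so that only 4KB
 * mappings remain to be destroyed by the loop that follows.
 */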
4148 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4149 pmap = PV_PMAP(pv);
4150 if (!PMAP_TRYLOCK(pmap)) {
4151 pvh_gen = pvh->pv_gen;
4152 rw_wunlock(lock);
4153 PMAP_LOCK(pmap);
4154 rw_wlock(lock);
4155 if (pvh_gen != pvh->pv_gen) {
4156 PMAP_UNLOCK(pmap);
4157 goto retry;
4158 }
4159 }
4160 va = pv->pv_va;
4161 pte = pmap_pte_exists(pmap, va, 2, __func__);
4162 pmap_demote_l2_locked(pmap, pte, va, &lock);
4163 PMAP_UNLOCK(pmap);
4164 }
4165 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4166 pmap = PV_PMAP(pv);
4167 if (!PMAP_TRYLOCK(pmap)) {
4168 pvh_gen = pvh->pv_gen;
4169 md_gen = m->md.pv_gen;
4170 rw_wunlock(lock);
4171 PMAP_LOCK(pmap);
4172 rw_wlock(lock);
4173 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4174 PMAP_UNLOCK(pmap);
4175 goto retry;
4176 }
4177 }
4178 pmap_resident_count_dec(pmap, 1);
4179
4180 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4181 KASSERT(pde != NULL,
4182 ("pmap_remove_all: no page directory entry found"));
4183 KASSERT(lvl == 2,
4184 ("pmap_remove_all: invalid pde level %d", lvl));
4185 tpde = pmap_load(pde);
4186
4187 pte = pmap_l2_to_l3(pde, pv->pv_va);
4188 tpte = pmap_load(pte);
4189 if ((tpte & ATTR_CONTIGUOUS) != 0)
4190 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4191 tpte = pmap_load_clear(pte);
4192 if (tpte & ATTR_SW_WIRED)
4193 pmap->pm_stats.wired_count--;
4194 if ((tpte & ATTR_AF) != 0) {
4195 pmap_invalidate_page(pmap, pv->pv_va, true);
4196 vm_page_aflag_set(m, PGA_REFERENCED);
4197 }
4198
4199 /*
4200 * Update the vm_page_t clean and reference bits.
4201 */
4202 if (pmap_pte_dirty(pmap, tpte))
4203 vm_page_dirty(m);
4204 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4205 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4206 m->md.pv_gen++;
4207 free_pv_entry(pmap, pv);
4208 PMAP_UNLOCK(pmap);
4209 }
4210 vm_page_aflag_clear(m, PGA_WRITEABLE);
4211 rw_wunlock(lock);
4212 vm_page_free_pages_toq(&free, true);
4213 }
4214
4215 /*
4216 * Masks and sets bits in a level 2 page table entry in the specified pmap
4217 */
4218 static void
4219 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4220 pt_entry_t nbits)
4221 {
4222 pd_entry_t old_l2;
4223 vm_page_t m, mt;
4224
4225 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4226 PMAP_ASSERT_STAGE1(pmap);
4227 KASSERT((sva & L2_OFFSET) == 0,
4228 ("pmap_protect_l2: sva is not 2mpage aligned"));
4229 old_l2 = pmap_load(l2);
4230 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4231 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4232
4233 /*
4234 * Return if the L2 entry already has the desired access restrictions
4235 * in place.
4236 */
4237 if ((old_l2 & mask) == nbits)
4238 return;
4239
4240 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4241 cpu_spinwait();
4242
4243 /*
4244 * When a dirty read/write superpage mapping is write protected,
4245 * update the dirty field of each of the superpage's constituent 4KB
4246 * pages.
4247 */
4248 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4249 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4250 pmap_pte_dirty(pmap, old_l2)) {
4251 m = PTE_TO_VM_PAGE(old_l2);
4252 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4253 vm_page_dirty(mt);
4254 }
4255
4256 /*
4257 * Since a promotion must break the 4KB page mappings before making
4258 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4259 */
4260 pmap_s1_invalidate_page(pmap, sva, true);
4261 }
4262
4263 /*
4264 * Masks and sets bits in the specified L3C superpage mapping.
4265 *
4266 * Requests TLB invalidations to be performed by the caller through the
4267 * returned "*vap".
4268 */
4269 static void
4270 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4271 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4272 {
4273 pt_entry_t l3e, *tl3p;
4274 vm_page_t m, mt;
4275 bool dirty;
4276
4277 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4278 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4279 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4280 KASSERT((va & L3C_OFFSET) == 0,
4281 ("pmap_mask_set_l3c: va is not aligned"));
4282 dirty = false;
4283 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4284 l3e = pmap_load(tl3p);
4285 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4286 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4287 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4288 cpu_spinwait();
4289 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4290 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4291 dirty = true;
4292 }
4293
4294 /*
4295 * When a dirty read/write superpage mapping is write protected,
4296 * update the dirty field of each of the superpage's constituent 4KB
4297 * pages.
4298 */
4299 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4300 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4301 dirty) {
4302 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4303 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4304 vm_page_dirty(mt);
4305 }
4306
4307 if (*vap == va_next)
4308 *vap = va;
4309 }
4310
4311 /*
4312 * Masks and sets bits in the last level page table entries in the specified
4313 * pmap and range
4314 */
4315 static void
4316 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4317 pt_entry_t nbits, bool invalidate)
4318 {
4319 vm_offset_t va, va_next;
4320 pd_entry_t *l0, *l1, *l2;
4321 pt_entry_t *l3p, l3;
4322
4323 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4324 for (; sva < eva; sva = va_next) {
4325 l0 = pmap_l0(pmap, sva);
4326 if (pmap_load(l0) == 0) {
4327 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4328 if (va_next < sva)
4329 va_next = eva;
4330 continue;
4331 }
4332
4333 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4334 if (va_next < sva)
4335 va_next = eva;
4336 l1 = pmap_l0_to_l1(l0, sva);
4337 if (pmap_load(l1) == 0)
4338 continue;
4339 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4340 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4341 KASSERT(va_next <= eva,
4342 ("partial update of non-transparent 1G page "
4343 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4344 pmap_load(l1), sva, eva, va_next));
4345 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4346 if ((pmap_load(l1) & mask) != nbits) {
4347 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4348 if (invalidate)
4349 pmap_s1_invalidate_page(pmap, sva, true);
4350 }
4351 continue;
4352 }
4353
4354 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4355 if (va_next < sva)
4356 va_next = eva;
4357
4358 l2 = pmap_l1_to_l2(l1, sva);
4359 if (pmap_load(l2) == 0)
4360 continue;
4361
4362 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4363 if (sva + L2_SIZE == va_next && eva >= va_next) {
4364 pmap_protect_l2(pmap, l2, sva, mask, nbits);
4365 continue;
4366 } else if ((pmap_load(l2) & mask) == nbits ||
4367 pmap_demote_l2(pmap, l2, sva) == NULL)
4368 continue;
4369 }
4370 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4371 ("pmap_protect: Invalid L2 entry after demotion"));
4372
4373 if (va_next > eva)
4374 va_next = eva;
4375
4376 va = va_next;
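/*
 * As in pmap_remove_l3_range(), "va" tracks the start of a pending
 * TLB invalidation range; "va == va_next" means that none is pending.
 */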
4377 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4378 sva += L3_SIZE) {
4379 l3 = pmap_load(l3p);
4380
4381 /*
4382 * Go to the next L3 entry if the current one is
4383 * invalid or already has the desired access
4384 * restrictions in place. (The latter case occurs
4385 * frequently. For example, in a "buildworld"
4386 * workload, almost 1 out of 4 L3 entries already
4387 * have the desired restrictions.)
4388 */
4389 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4390 if (va != va_next) {
4391 if (invalidate)
4392 pmap_s1_invalidate_range(pmap,
4393 va, sva, true);
4394 va = va_next;
4395 }
4396 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4397 /*
4398 * Does this L3C page extend beyond
4399 * the requested range? Handle the
4400 * possibility that "va_next" is zero.
4401 */
4402 if ((sva | L3C_OFFSET) > va_next - 1)
4403 break;
4404
4405 /*
4406 * Skip ahead to the last L3_PAGE
4407 * within this L3C page.
4408 */
4409 l3p = (pt_entry_t *)((uintptr_t)l3p |
4410 ((L3C_ENTRIES - 1) *
4411 sizeof(pt_entry_t)));
4412 sva |= L3C_SIZE - L3_SIZE;
4413 }
4414 continue;
4415 }
4416
4417 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4418 /*
4419 * Is this entire set of contiguous L3 entries
4420 * being protected? Handle the possibility
4421 * that "va_next" is zero because of address
4422 * wraparound.
4423 */
4424 if ((sva & L3C_OFFSET) == 0 &&
4425 sva + L3C_OFFSET <= va_next - 1) {
4426 pmap_mask_set_l3c(pmap, l3p, sva, &va,
4427 va_next, mask, nbits);
4428 l3p += L3C_ENTRIES - 1;
4429 sva += L3C_SIZE - L3_SIZE;
4430 continue;
4431 }
4432
4433 (void)pmap_demote_l3c(pmap, l3p, sva);
4434
4435 /*
4436 * The L3 entry's accessed bit may have changed.
4437 */
4438 l3 = pmap_load(l3p);
4439 }
4440 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4441 nbits))
4442 cpu_spinwait();
4443
4444 /*
4445 * When a dirty read/write mapping is write protected,
4446 * update the page's dirty field.
4447 */
4448 if ((l3 & ATTR_SW_MANAGED) != 0 &&
4449 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4450 pmap_pte_dirty(pmap, l3))
4451 vm_page_dirty(PTE_TO_VM_PAGE(l3));
4452
4453 if (va == va_next)
4454 va = sva;
4455 }
4456 if (va != va_next && invalidate)
4457 pmap_s1_invalidate_range(pmap, va, sva, true);
4458 }
4459 }
4460
4461 static void
4462 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4463 pt_entry_t nbits, bool invalidate)
4464 {
4465 PMAP_LOCK(pmap);
4466 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4467 PMAP_UNLOCK(pmap);
4468 }
4469
4470 /*
4471 * Set the physical protection on the
4472 * specified range of this map as requested.
4473 */
4474 void
4475 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4476 {
4477 pt_entry_t mask, nbits;
4478
4479 PMAP_ASSERT_STAGE1(pmap);
4480 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4481 if (prot == VM_PROT_NONE) {
4482 pmap_remove(pmap, sva, eva);
4483 return;
4484 }
4485
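/*
 * Build the set of attribute bits to change ("mask") and their new
 * values ("nbits"): revoking write access sets AP_RO and clears the
 * software DBM bit, and revoking execute access sets XN.
 */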
4486 mask = nbits = 0;
4487 if ((prot & VM_PROT_WRITE) == 0) {
4488 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4489 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4490 }
4491 if ((prot & VM_PROT_EXECUTE) == 0) {
4492 mask |= ATTR_S1_XN;
4493 nbits |= ATTR_S1_XN;
4494 }
4495 if (pmap == kernel_pmap) {
4496 mask |= ATTR_KERN_GP;
4497 nbits |= ATTR_KERN_GP;
4498 }
4499 if (mask == 0)
4500 return;
4501
4502 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4503 }
4504
4505 void
4506 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4507 {
4508
4509 MPASS((sva & L3_OFFSET) == 0);
4510 MPASS(((sva + size) & L3_OFFSET) == 0);
4511
4512 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4513 ATTR_SW_NO_PROMOTE, false);
4514 }
4515
4516 /*
4517 * Inserts the specified page table page into the specified pmap's collection
4518 * of idle page table pages. Each of a pmap's page table pages is responsible
4519 * for mapping a distinct range of virtual addresses. The pmap's collection is
4520 * ordered by this virtual address range.
4521 *
4522 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4523 * "mpte"'s valid field will be set to 0.
4524 *
4525 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4526 * contain valid mappings with identical attributes except for ATTR_AF;
4527 * "mpte"'s valid field will be set to 1.
4528 *
4529 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4530 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4531 * field will be set to VM_PAGE_BITS_ALL.
4532 */
4533 static __inline int
4534 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4535 bool all_l3e_AF_set)
4536 {
4537
4538 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4539 KASSERT(promoted || !all_l3e_AF_set,
4540 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4541 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4542 return (vm_radix_insert(&pmap->pm_root, mpte));
4543 }
4544
4545 /*
4546 * Removes the page table page mapping the specified virtual address from the
4547 * specified pmap's collection of idle page table pages, and returns it.
4548 * Otherwise, returns NULL if there is no page table page corresponding to the
4549 * specified virtual address.
4550 */
4551 static __inline vm_page_t
4552 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4553 {
4554
4555 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4556 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4557 }
4558
4559 /*
4560 * Performs a break-before-make update of a pmap entry. This is needed when
4561 * either promoting or demoting pages to ensure the TLB doesn't get into an
4562 * inconsistent state.
4563 */
4564 static void
4565 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4566 vm_offset_t va, vm_size_t size)
4567 {
4568 register_t intr;
4569
4570 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4571 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4572 ("%s: Updating non-promote pte", __func__));
4573
4574 /*
4575 * Ensure we don't get switched out with the page table in an
4576 * inconsistent state. We also need to ensure no interrupts fire
4577 * as they may make use of an address we are about to invalidate.
4578 */
4579 intr = intr_disable();
4580
4581 /*
4582 * Clear the old mapping's valid bit, but leave the rest of the entry
4583 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4584 * lookup the physical address.
4585 */
4586 pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4587
4588 /*
4589 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4590 * be cached, so we invalidate intermediate entries as well as final
4591 * entries.
4592 */
4593 pmap_s1_invalidate_range(pmap, va, va + size, false);
4594
4595 /* Create the new mapping */
4596 pmap_store(ptep, newpte);
4597 dsb(ishst);
4598
4599 intr_restore(intr);
4600 }
4601
4602 /*
4603 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4604 */
4605 static void
4606 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4607 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4608 {
4609 pd_entry_t *lip;
4610 register_t intr;
4611
4612 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4613 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4614 ("%s: Updating non-promote pte", __func__));
4615
4616 /*
4617 * Ensure we don't get switched out with the page table in an
4618 * inconsistent state. We also need to ensure no interrupts fire
4619 * as they may make use of an address we are about to invalidate.
4620 */
4621 intr = intr_disable();
4622
4623 /*
4624 * Clear the old mapping's valid bits, but leave the rest of each
4625 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4626 * still lookup the physical address.
4627 */
4628 for (lip = ptep; lip < ptep_end; lip++)
4629 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4630
4631 /* Only final entries are changing. */
4632 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4633
4634 /* Create the new mapping. */
4635 for (lip = ptep; lip < ptep_end; lip++) {
4636 pmap_store(lip, newpte);
4637 newpte += stride;
4638 }
4639 dsb(ishst);
4640
4641 intr_restore(intr);
4642 }
4643
4644 #if VM_NRESERVLEVEL > 0
4645 /*
4646 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4647 * replace the many pv entries for the 4KB page mappings by a single pv entry
4648 * for the 2MB page mapping.
4649 */
4650 static void
4651 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4652 struct rwlock **lockp)
4653 {
4654 struct md_page *pvh;
4655 pv_entry_t pv;
4656 vm_offset_t va_last;
4657 vm_page_t m;
4658
4659 KASSERT((pa & L2_OFFSET) == 0,
4660 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4661 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4662
4663 /*
4664 * Transfer the first page's pv entry for this mapping to the 2mpage's
4665 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4666 * a transfer avoids the possibility that get_pv_entry() calls
4667 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4668 * mappings that is being promoted.
4669 */
4670 m = PHYS_TO_VM_PAGE(pa);
4671 va = va & ~L2_OFFSET;
4672 pv = pmap_pvh_remove(&m->md, pmap, va);
4673 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4674 pvh = page_to_pvh(m);
4675 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4676 pvh->pv_gen++;
4677 /* Free the remaining NPTEPG - 1 pv entries. */
4678 va_last = va + L2_SIZE - PAGE_SIZE;
4679 do {
4680 m++;
4681 va += PAGE_SIZE;
4682 pmap_pvh_free(&m->md, pmap, va);
4683 } while (va < va_last);
4684 }
4685
4686 /*
4687 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4688 * single level 2 table entry to a single 2MB page mapping. For promotion
4689 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4690 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4691 * identical characteristics.
4692 */
4693 static bool
4694 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4695 struct rwlock **lockp)
4696 {
4697 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4698
4699 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4700
4701 /*
4702 * Currently, this function only supports promotion on stage 1 pmaps
4703 * because it tests stage 1 specific fields and performs a break-
4704 * before-make sequence that is incorrect for stage 2 pmaps.
4705 */
4706 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4707 return (false);
4708
4709 /*
4710 * Examine the first L3E in the specified PTP. Abort if this L3E is
4711 * ineligible for promotion...
4712 */
4713 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4714 newl2 = pmap_load(firstl3);
4715 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4716 return (false);
4717 /* ... is not the first physical page within an L2 block */
4718 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4719 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4720 atomic_add_long(&pmap_l2_p_failures, 1);
4721 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4722 " in pmap %p", va, pmap);
4723 return (false);
4724 }
4725
4726 /*
4727 * Both here and in the below "for" loop, to allow for repromotion
4728 * after MADV_FREE, conditionally write protect a clean L3E before
4729 * possibly aborting the promotion due to other L3E attributes. Why?
4730 * Suppose that MADV_FREE is applied to a part of a superpage, the
4731 * address range [S, E). pmap_advise() will demote the superpage
4732 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4733 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
4734 * imagine that the memory in [S, E) is recycled, but the last 4KB
4735 * page in [S, E) is not the last to be rewritten, or simply accessed.
4736 * In other words, there is still a 4KB page in [S, E), call it P,
4737 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4738 * Unless we write protect P before aborting the promotion, if and
4739 * when P is finally rewritten, there won't be a page fault to trigger
4740 * repromotion.
4741 */
4742 setl2:
4743 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4744 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4745 /*
4746 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4747 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4748 */
4749 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4750 goto setl2;
4751 newl2 &= ~ATTR_SW_DBM;
4752 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4753 " in pmap %p", va & ~L2_OFFSET, pmap);
4754 }
4755
4756 /*
4757 * Examine each of the other L3Es in the specified PTP. Abort if this
4758 * L3E maps an unexpected 4KB physical page or does not have identical
4759 * characteristics to the first L3E. If ATTR_AF is not set in every
4760 * PTE, then request that the PTP be refilled on demotion.
4761 */
4762 all_l3e_AF = newl2 & ATTR_AF;
4763 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4764 + L2_SIZE - PAGE_SIZE;
4765 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4766 oldl3 = pmap_load(l3);
4767 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4768 atomic_add_long(&pmap_l2_p_failures, 1);
4769 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4770 " in pmap %p", va, pmap);
4771 return (false);
4772 }
4773 setl3:
4774 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4775 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4776 /*
4777 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4778 * set, ATTR_SW_DBM can be cleared without a TLB
4779 * invalidation.
4780 */
4781 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4782 ~ATTR_SW_DBM))
4783 goto setl3;
4784 oldl3 &= ~ATTR_SW_DBM;
4785 }
4786 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4787 atomic_add_long(&pmap_l2_p_failures, 1);
4788 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4789 " in pmap %p", va, pmap);
4790 return (false);
4791 }
4792 all_l3e_AF &= oldl3;
4793 pa -= PAGE_SIZE;
4794 }
4795
4796 /*
4797 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4798 * mapping, so that promotions triggered by speculative mappings,
4799 * such as pmap_enter_quick(), don't automatically mark the
4800 * underlying pages as referenced.
4801 */
4802 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4803
4804 /*
4805 * Save the page table page in its current state until the L2
4806 * mapping the superpage is demoted by pmap_demote_l2() or
4807 * destroyed by pmap_remove_l3().
4808 */
4809 if (mpte == NULL)
4810 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4811 KASSERT(mpte >= vm_page_array &&
4812 mpte < &vm_page_array[vm_page_array_size],
4813 ("pmap_promote_l2: page table page is out of range"));
4814 KASSERT(mpte->pindex == pmap_l2_pindex(va),
4815 ("pmap_promote_l2: page table page's pindex is wrong"));
4816 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4817 atomic_add_long(&pmap_l2_p_failures, 1);
4818 CTR2(KTR_PMAP,
4819 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4820 pmap);
4821 return (false);
4822 }
4823
4824 if ((newl2 & ATTR_SW_MANAGED) != 0)
4825 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4826
4827 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4828
4829 atomic_add_long(&pmap_l2_promotions, 1);
4830 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4831 pmap);
4832 return (true);
4833 }
4834
4835 /*
4836 * Tries to promote an aligned, contiguous set of base page mappings to a
4837 * single L3C page mapping. For promotion to occur, two conditions must be
4838 * met: (1) the base page mappings must map aligned, contiguous physical
4839 * memory and (2) the base page mappings must have identical characteristics
4840 * except for the accessed flag.
4841 */
4842 static bool
4843 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4844 {
4845 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4846
4847 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4848
4849 /*
4850 * Currently, this function only supports promotion on stage 1 pmaps
4851 * because it tests stage 1 specific fields and performs a break-
4852 * before-make sequence that is incorrect for stage 2 pmaps.
4853 */
4854 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4855 return (false);
4856
4857 /*
4858 * Compute the address of the first L3 entry in the superpage
4859 * candidate.
4860 */
4861 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4862 sizeof(pt_entry_t)) - 1));
4863
4864 firstl3c = pmap_load(l3p);
4865
4866 /*
4867 * Examine the first L3 entry. Abort if this L3E is ineligible for
4868 * promotion...
4869 */
4870 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4871 return (false);
4872 /* ...is not properly aligned... */
4873 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4874 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4875 counter_u64_add(pmap_l3c_p_failures, 1);
4876 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4877 " in pmap %p", va, pmap);
4878 return (false);
4879 }
4880
4881 /*
4882 * If the first L3 entry is a clean read-write mapping, convert it
4883 * to a read-only mapping. See pmap_promote_l2() for the rationale.
4884 */
4885 set_first:
4886 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4887 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4888 /*
4889 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4890 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4891 */
4892 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4893 goto set_first;
4894 firstl3c &= ~ATTR_SW_DBM;
4895 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4896 " in pmap %p", va & ~L3C_OFFSET, pmap);
4897 }
4898
4899 /*
4900 * Check that the rest of the L3 entries are compatible with the first,
4901 * and convert clean read-write mappings to read-only mappings.
4902 */
4903 all_l3e_AF = firstl3c & ATTR_AF;
4904 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4905 L3C_SIZE - PAGE_SIZE;
4906 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4907 oldl3 = pmap_load(l3);
4908 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4909 counter_u64_add(pmap_l3c_p_failures, 1);
4910 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4911 " in pmap %p", va, pmap);
4912 return (false);
4913 }
4914 set_l3:
4915 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4916 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4917 /*
4918 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4919 * set, ATTR_SW_DBM can be cleared without a TLB
4920 * invalidation.
4921 */
4922 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4923 ~ATTR_SW_DBM))
4924 goto set_l3;
4925 oldl3 &= ~ATTR_SW_DBM;
4926 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4927 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
4928 (va & ~L3C_OFFSET), pmap);
4929 }
4930 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
4931 counter_u64_add(pmap_l3c_p_failures, 1);
4932 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4933 " in pmap %p", va, pmap);
4934 return (false);
4935 }
4936 all_l3e_AF &= oldl3;
4937 pa -= PAGE_SIZE;
4938 }
4939
4940 /*
4941 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4942 * mapping, so that promotions triggered by speculative mappings,
4943 * such as pmap_enter_quick(), don't automatically mark the
4944 * underlying pages as referenced.
4945 */
4946 firstl3c &= ~ATTR_AF | all_l3e_AF;
4947
4948 /*
4949 * Remake the mappings with the contiguous bit set.
4950 */
4951 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
4952 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
4953
4954 counter_u64_add(pmap_l3c_promotions, 1);
4955 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
4956 pmap);
4957 return (true);
4958 }
4959 #endif /* VM_NRESERVLEVEL > 0 */
4960
4961 static int
4962 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
4963 int psind)
4964 {
4965 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
4966 vm_page_t mp;
4967
4968 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4969 KASSERT(psind > 0 && psind < MAXPAGESIZES,
4970 ("psind %d unexpected", psind));
4971 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
4972 ("unaligned phys address %#lx pte %#lx psind %d",
4973 PTE_TO_PHYS(pte), pte, psind));
4974
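/*
 * "psind" selects the mapping size: 3 installs an L1 block, 2 installs
 * an L2 block, and 1 installs an ATTR_CONTIGUOUS run of L3C_ENTRIES
 * level 3 pages.
 */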
4975 restart:
4976 newpte = pte;
4977 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
4978 return (KERN_PROTECTION_FAILURE);
4979 if (psind == 3) {
4980 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4981
4982 KASSERT(pagesizes[psind] == L1_SIZE,
4983 ("pagesizes[%d] != L1_SIZE", psind));
4984 l0p = pmap_l0(pmap, va);
4985 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4986 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4987 if (mp == NULL) {
4988 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4989 return (KERN_RESOURCE_SHORTAGE);
4990 PMAP_UNLOCK(pmap);
4991 vm_wait(NULL);
4992 PMAP_LOCK(pmap);
4993 goto restart;
4994 }
4995 l1p = pmap_l0_to_l1(l0p, va);
4996 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4997 origpte = pmap_load(l1p);
4998 } else {
4999 l1p = pmap_l0_to_l1(l0p, va);
5000 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5001 origpte = pmap_load(l1p);
5002 if ((origpte & ATTR_DESCR_VALID) == 0) {
5003 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
5004 mp->ref_count++;
5005 }
5006 }
5007 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5008 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5009 (origpte & ATTR_DESCR_VALID) == 0,
5010 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5011 va, origpte, newpte));
5012 pmap_store(l1p, newpte);
5013 } else if (psind == 2) {
5014 KASSERT(pagesizes[psind] == L2_SIZE,
5015 ("pagesizes[%d] != L2_SIZE", psind));
5016 l2p = pmap_l2(pmap, va);
5017 if (l2p == NULL) {
5018 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5019 if (mp == NULL) {
5020 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5021 return (KERN_RESOURCE_SHORTAGE);
5022 PMAP_UNLOCK(pmap);
5023 vm_wait(NULL);
5024 PMAP_LOCK(pmap);
5025 goto restart;
5026 }
5027 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5028 l2p = &l2p[pmap_l2_index(va)];
5029 origpte = pmap_load(l2p);
5030 } else {
5031 l1p = pmap_l1(pmap, va);
5032 origpte = pmap_load(l2p);
5033 if ((origpte & ATTR_DESCR_VALID) == 0) {
5034 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5035 mp->ref_count++;
5036 }
5037 }
5038 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5039 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5040 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5041 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5042 va, origpte, newpte));
5043 pmap_store(l2p, newpte);
5044 } else /* (psind == 1) */ {
5045 KASSERT(pagesizes[psind] == L3C_SIZE,
5046 ("pagesizes[%d] != L3C_SIZE", psind));
5047 l2p = pmap_l2(pmap, va);
5048 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
5049 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
5050 if (mp == NULL) {
5051 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5052 return (KERN_RESOURCE_SHORTAGE);
5053 PMAP_UNLOCK(pmap);
5054 vm_wait(NULL);
5055 PMAP_LOCK(pmap);
5056 goto restart;
5057 }
5058 mp->ref_count += L3C_ENTRIES - 1;
5059 l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5060 l3p = &l3p[pmap_l3_index(va)];
5061 } else {
5062 l3p = pmap_l2_to_l3(l2p, va);
5063 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
5064 mp = PTE_TO_VM_PAGE(pmap_load(l2p));
5065 mp->ref_count += L3C_ENTRIES;
5066 }
5067 }
5068 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5069 origpte = pmap_load(tl3p);
5070 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5071 ((origpte & ATTR_CONTIGUOUS) != 0 &&
5072 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5073 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
5074 va, origpte, newpte));
5075 pmap_store(tl3p, newpte);
5076 newpte += L3_SIZE;
5077 }
5078 }
5079 dsb(ishst);
5080
5081 if ((origpte & ATTR_DESCR_VALID) == 0)
5082 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5083 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5084 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5085 else if ((newpte & ATTR_SW_WIRED) == 0 &&
5086 (origpte & ATTR_SW_WIRED) != 0)
5087 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5088
5089 return (KERN_SUCCESS);
5090 }
5091
5092 /*
5093 * Insert the given physical page (p) at
5094 * the specified virtual address (v) in the
5095 * target physical map with the protection requested.
5096 *
5097 * If specified, the page will be wired down, meaning
5098 * that the related pte can not be reclaimed.
5099 *
5100 * NB: This is the only routine which MAY NOT lazy-evaluate
5101 * or lose information. That is, this routine must actually
5102 * insert this page into the given map NOW.
5103 */
5104 int
5105 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5106 u_int flags, int8_t psind)
5107 {
5108 struct rwlock *lock;
5109 pd_entry_t *pde;
5110 pt_entry_t new_l3, orig_l3;
5111 pt_entry_t *l2, *l3;
5112 pv_entry_t pv;
5113 vm_paddr_t opa, pa;
5114 vm_page_t mpte, om;
5115 bool nosleep;
5116 int full_lvl, lvl, rv;
5117
5118 KASSERT(ADDR_IS_CANONICAL(va),
5119 ("%s: Address not in canonical form: %lx", __func__, va));
5120
5121 va = trunc_page(va);
5122 if ((m->oflags & VPO_UNMANAGED) == 0)
5123 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5124 pa = VM_PAGE_TO_PHYS(m);
5125 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
5126 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5127 new_l3 |= pmap_pte_prot(pmap, prot);
5128 if ((flags & PMAP_ENTER_WIRED) != 0)
5129 new_l3 |= ATTR_SW_WIRED;
5130 if (pmap->pm_stage == PM_STAGE1) {
5131 if (!ADDR_IS_KERNEL(va))
5132 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5133 else
5134 new_l3 |= ATTR_S1_UXN;
5135 if (pmap != kernel_pmap)
5136 new_l3 |= ATTR_S1_nG;
5137 } else {
5138 /*
5139 * Clear the access flag on executable mappings, this will be
5140 * set later when the page is accessed. The fault handler is
5141 * required to invalidate the I-cache.
5142 *
5143 * TODO: Switch to the valid flag to allow hardware management
5144 * of the access flag. Much of the pmap code assumes the
5145 * valid flag is set and fails to destroy the old page tables
5146 * correctly if it is clear.
5147 */
5148 if (prot & VM_PROT_EXECUTE)
5149 new_l3 &= ~ATTR_AF;
5150 }
5151 if ((m->oflags & VPO_UNMANAGED) == 0) {
5152 new_l3 |= ATTR_SW_MANAGED;
5153 if ((prot & VM_PROT_WRITE) != 0) {
5154 new_l3 |= ATTR_SW_DBM;
5155 if ((flags & VM_PROT_WRITE) == 0) {
5156 if (pmap->pm_stage == PM_STAGE1)
5157 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5158 else
5159 new_l3 &=
5160 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5161 }
5162 }
5163 }
5164
5165 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5166
5167 lock = NULL;
5168 PMAP_LOCK(pmap);
5169 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5170 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5171 ("managed largepage va %#lx flags %#x", va, flags));
5172 if (psind == 3) {
5173 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5174 new_l3 &= ~L3_PAGE;
5175 new_l3 |= L1_BLOCK;
5176 } else if (psind == 2) {
5177 new_l3 &= ~L3_PAGE;
5178 new_l3 |= L2_BLOCK;
5179 } else /* (psind == 1) */
5180 new_l3 |= ATTR_CONTIGUOUS;
5181 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5182 goto out;
5183 }
5184 if (psind == 2) {
5185 /* Assert the required virtual and physical alignment. */
5186 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5187 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
5188 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5189 flags, m, &lock);
5190 goto out;
5191 }
5192 mpte = NULL;
5193 if (psind == 1) {
5194 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
5195 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5196 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
5197 m, &mpte, &lock);
5198 #if VM_NRESERVLEVEL > 0
5199 /*
5200 * Attempt L2 promotion, if both the PTP and a level 1
5201 * reservation are fully populated.
5202 */
5203 if (rv == KERN_SUCCESS &&
5204 (mpte == NULL || mpte->ref_count == NL3PG) &&
5205 (m->flags & PG_FICTITIOUS) == 0 &&
5206 vm_reserv_level_iffullpop(m) == 1) {
5207 pde = pmap_l2(pmap, va);
5208 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5209 }
5210 #endif
5211 goto out;
5212 }
5213
5214 /*
5215 * In the case that a page table page is not
5216 * resident, we are creating it here.
5217 */
5218 retry:
5219 pde = pmap_pde(pmap, va, &lvl);
5220 if (pde != NULL && lvl == 2) {
5221 l3 = pmap_l2_to_l3(pde, va);
5222 if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5223 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5224 mpte->ref_count++;
5225 }
5226 goto havel3;
5227 } else if (pde != NULL && lvl == 1) {
5228 l2 = pmap_l1_to_l2(pde, va);
5229 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5230 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5231 l3 = &l3[pmap_l3_index(va)];
5232 if (!ADDR_IS_KERNEL(va)) {
5233 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5234 mpte->ref_count++;
5235 }
5236 goto havel3;
5237 }
5238 /* We need to allocate an L3 table. */
5239 }
5240 if (!ADDR_IS_KERNEL(va)) {
5241 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5242
5243 /*
5244 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5245 * to handle the possibility that a superpage mapping for "va"
5246 * was created while we slept.
5247 */
5248 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5249 nosleep ? NULL : &lock);
5250 if (mpte == NULL && nosleep) {
5251 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5252 rv = KERN_RESOURCE_SHORTAGE;
5253 goto out;
5254 }
5255 goto retry;
5256 } else
5257 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5258
5259 havel3:
5260 orig_l3 = pmap_load(l3);
5261 opa = PTE_TO_PHYS(orig_l3);
5262 pv = NULL;
5263 new_l3 |= pmap_pte_bti(pmap, va);
5264
5265 /*
5266 * Is the specified virtual address already mapped?
5267 */
5268 if (pmap_l3_valid(orig_l3)) {
5269 /*
5270 * Wiring change, just update stats. We don't worry about
5271 * wiring PT pages as they remain resident as long as there
5272 * are valid mappings in them. Hence, if a user page is wired,
5273 * the PT page will be also.
5274 */
5275 if ((flags & PMAP_ENTER_WIRED) != 0 &&
5276 (orig_l3 & ATTR_SW_WIRED) == 0)
5277 pmap->pm_stats.wired_count++;
5278 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5279 (orig_l3 & ATTR_SW_WIRED) != 0)
5280 pmap->pm_stats.wired_count--;
5281
5282 /*
5283 * Remove the extra PT page reference.
5284 */
5285 if (mpte != NULL) {
5286 mpte->ref_count--;
5287 KASSERT(mpte->ref_count > 0,
5288 ("pmap_enter: missing reference to page table page,"
5289 " va: 0x%lx", va));
5290 }
5291
5292 /*
5293 * Has the physical page changed?
5294 */
5295 if (opa == pa) {
5296 /*
5297 * No, might be a protection or wiring change.
5298 */
5299 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5300 (new_l3 & ATTR_SW_DBM) != 0)
5301 vm_page_aflag_set(m, PGA_WRITEABLE);
5302 goto validate;
5303 }
5304
5305 /*
5306 * The physical page has changed. Temporarily invalidate
5307 * the mapping.
5308 */
5309 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5310 (void)pmap_demote_l3c(pmap, l3, va);
5311 orig_l3 = pmap_load_clear(l3);
5312 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5313 ("pmap_enter: unexpected pa update for %#lx", va));
5314 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5315 om = PHYS_TO_VM_PAGE(opa);
5316
5317 /*
5318 * The pmap lock is sufficient to synchronize with
5319 * concurrent calls to pmap_page_test_mappings() and
5320 * pmap_ts_referenced().
5321 */
5322 if (pmap_pte_dirty(pmap, orig_l3))
5323 vm_page_dirty(om);
5324 if ((orig_l3 & ATTR_AF) != 0) {
5325 pmap_invalidate_page(pmap, va, true);
5326 vm_page_aflag_set(om, PGA_REFERENCED);
5327 }
5328 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5329 pv = pmap_pvh_remove(&om->md, pmap, va);
5330 if ((m->oflags & VPO_UNMANAGED) != 0)
5331 free_pv_entry(pmap, pv);
5332 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5333 TAILQ_EMPTY(&om->md.pv_list) &&
5334 ((om->flags & PG_FICTITIOUS) != 0 ||
5335 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5336 vm_page_aflag_clear(om, PGA_WRITEABLE);
5337 } else {
5338 KASSERT((orig_l3 & ATTR_AF) != 0,
5339 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5340 pmap_invalidate_page(pmap, va, true);
5341 }
5342 orig_l3 = 0;
5343 } else {
5344 /*
5345 * Increment the counters.
5346 */
5347 if ((new_l3 & ATTR_SW_WIRED) != 0)
5348 pmap->pm_stats.wired_count++;
5349 pmap_resident_count_inc(pmap, 1);
5350 }
5351 /*
5352 * Enter on the PV list if part of our managed memory.
5353 */
5354 if ((m->oflags & VPO_UNMANAGED) == 0) {
5355 if (pv == NULL) {
5356 pv = get_pv_entry(pmap, &lock);
5357 pv->pv_va = va;
5358 }
5359 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5360 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5361 m->md.pv_gen++;
5362 if ((new_l3 & ATTR_SW_DBM) != 0)
5363 vm_page_aflag_set(m, PGA_WRITEABLE);
5364 }
5365
5366 validate:
5367 if (pmap->pm_stage == PM_STAGE1) {
5368 /*
5369 		 * Sync the icache if the mapping has exec permission and the
5370 		 * VM_MEMATTR_WRITE_BACK attribute is set.  Do it now, before the
5371 		 * mapping is stored and made valid for the hardware table walk.
5372 		 * If it were done later, another CPU could access this page
5373 		 * before the caches are properly synced.  Don't do it for kernel
5374 		 * memory that is mapped with exec permission even if the memory
5375 		 * isn't going to hold executable code.  The only time an icache
5376 		 * sync is needed there is after a kernel module is loaded and the
5377 		 * relocation info is processed, which is done in elf_cpu_load_file().
5378 */
5379 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5380 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5381 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5382 PMAP_ASSERT_STAGE1(pmap);
5383 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5384 PAGE_SIZE);
5385 }
5386 } else {
5387 cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5388 }
5389
5390 /*
5391 * Update the L3 entry
5392 */
5393 if (pmap_l3_valid(orig_l3)) {
5394 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5395 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5396 /* same PA, different attributes */
5397 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5398 (void)pmap_demote_l3c(pmap, l3, va);
5399 orig_l3 = pmap_load_store(l3, new_l3);
5400 pmap_invalidate_page(pmap, va, true);
5401 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5402 pmap_pte_dirty(pmap, orig_l3))
5403 vm_page_dirty(m);
5404 } else {
5405 /*
5406 * orig_l3 == new_l3
5407 			 * This can happen if multiple threads simultaneously
5408 			 * access a not yet mapped page.  This is bad for
5409 			 * performance since it can cause a full
5410 			 * demotion-NOP-promotion cycle.
5411 			 * Other possible reasons are:
5412 			 * - the VM and pmap memory layouts have diverged
5413 			 * - a TLB flush is missing somewhere and the CPU doesn't
5414 			 *   see the actual mapping.
5415 */
5416 CTR4(KTR_PMAP, "%s: already mapped page - "
5417 			    "pmap %p va %#lx pte 0x%lx",
5418 __func__, pmap, va, new_l3);
5419 }
5420 } else {
5421 /* New mapping */
5422 pmap_store(l3, new_l3);
5423 dsb(ishst);
5424 }
5425
5426 #if VM_NRESERVLEVEL > 0
5427 /*
5428 * First, attempt L3C promotion, if the virtual and physical addresses
5429 * are aligned with each other and an underlying reservation has the
5430 * neighboring L3 pages allocated. The first condition is simply an
5431 * optimization that recognizes some eventual promotion failures early
5432 * at a lower run-time cost. Then, if both a level 1 reservation and
5433 * the PTP are fully populated, attempt L2 promotion.
5434 */
5435 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5436 (m->flags & PG_FICTITIOUS) == 0 &&
5437 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
5438 pmap_promote_l3c(pmap, l3, va) &&
5439 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
5440 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5441 #endif
5442
5443 rv = KERN_SUCCESS;
5444 out:
5445 if (lock != NULL)
5446 rw_wunlock(lock);
5447 PMAP_UNLOCK(pmap);
5448 return (rv);
5449 }
5450
5451 /*
5452 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5453 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5454 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5455 * "no replace", and "no reclaim" are specified.
5456 */
5457 static int
5458 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5459 struct rwlock **lockp)
5460 {
5461 pd_entry_t new_l2;
5462
5463 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5464 PMAP_ASSERT_STAGE1(pmap);
5465 KASSERT(ADDR_IS_CANONICAL(va),
5466 ("%s: Address not in canonical form: %lx", __func__, va));
5467
5468 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5469 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5470 L2_BLOCK);
5471 if ((m->oflags & VPO_UNMANAGED) == 0) {
5472 new_l2 |= ATTR_SW_MANAGED;
5473 new_l2 &= ~ATTR_AF;
5474 }
5475 if ((prot & VM_PROT_EXECUTE) == 0 ||
5476 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5477 new_l2 |= ATTR_S1_XN;
5478 if (!ADDR_IS_KERNEL(va))
5479 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5480 else
5481 new_l2 |= ATTR_S1_UXN;
5482 if (pmap != kernel_pmap)
5483 new_l2 |= ATTR_S1_nG;
5484 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5485 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5486 }
5487
5488 /*
5489 * Returns true if every page table entry in the specified page table is
5490 * zero.
5491 */
5492 static bool
5493 pmap_every_pte_zero(vm_paddr_t pa)
5494 {
5495 pt_entry_t *pt_end, *pte;
5496
5497 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5498 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5499 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5500 if (*pte != 0)
5501 return (false);
5502 }
5503 return (true);
5504 }
5505
5506 /*
5507 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if
5508 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5509 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
5510 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5511 * within the L2 virtual address range starting at the specified virtual
5512  * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and
5513  * an L2 page mapping already exists at the specified virtual address. Returns
5514 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5515 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5516 * and a PV entry allocation failed.
5517 */
5518 static int
5519 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5520 vm_page_t m, struct rwlock **lockp)
5521 {
5522 struct spglist free;
5523 pd_entry_t *l2, old_l2;
5524 vm_page_t l2pg, mt;
5525 vm_page_t uwptpg;
5526
5527 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5528 KASSERT(ADDR_IS_CANONICAL(va),
5529 ("%s: Address not in canonical form: %lx", __func__, va));
5530
5531 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5532 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5533 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5534 va, pmap);
5535 return (KERN_RESOURCE_SHORTAGE);
5536 }
5537
5538 /*
5539 * If bti is not the same for the whole l2 range, return failure
5540 * and let vm_fault() cope. Check after l2 allocation, since
5541 * it could sleep.
5542 */
5543 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
5544 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5545 pmap_abort_ptp(pmap, va, l2pg);
5546 return (KERN_PROTECTION_FAILURE);
5547 }
5548
5549 /*
5550 * If there are existing mappings, either abort or remove them.
5551 */
5552 if ((old_l2 = pmap_load(l2)) != 0) {
5553 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5554 ("pmap_enter_l2: l2pg's ref count is too low"));
5555 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5556 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5557 if (l2pg != NULL)
5558 l2pg->ref_count--;
5559 CTR2(KTR_PMAP,
5560 "pmap_enter_l2: no space for va %#lx"
5561 " in pmap %p", va, pmap);
5562 return (KERN_NO_SPACE);
5563 } else if (!ADDR_IS_KERNEL(va) ||
5564 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5565 if (l2pg != NULL)
5566 l2pg->ref_count--;
5567 CTR2(KTR_PMAP,
5568 "pmap_enter_l2: failure for va %#lx"
5569 " in pmap %p", va, pmap);
5570 return (KERN_FAILURE);
5571 }
5572 }
5573 SLIST_INIT(&free);
5574 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
5575 (void)pmap_remove_l2(pmap, l2, va,
5576 pmap_load(pmap_l1(pmap, va)), &free, lockp);
5577 else
5578 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5579 &free, lockp);
5580 if (!ADDR_IS_KERNEL(va)) {
5581 vm_page_free_pages_toq(&free, true);
5582 KASSERT(pmap_load(l2) == 0,
5583 ("pmap_enter_l2: non-zero L2 entry %p", l2));
5584 } else {
5585 KASSERT(SLIST_EMPTY(&free),
5586 ("pmap_enter_l2: freed kernel page table page"));
5587
5588 /*
5589 * Both pmap_remove_l2() and pmap_remove_l3_range()
5590 * will leave the kernel page table page zero filled.
5591 * Nonetheless, the TLB could have an intermediate
5592 * entry for the kernel page table page, so request
5593 * an invalidation at all levels after clearing
5594 * the L2_TABLE entry.
5595 */
5596 mt = PTE_TO_VM_PAGE(pmap_load(l2));
5597 if (pmap_insert_pt_page(pmap, mt, false, false))
5598 panic("pmap_enter_l2: trie insert failed");
5599 pmap_clear(l2);
5600 pmap_s1_invalidate_page(pmap, va, false);
5601 }
5602 }
5603
5604 /*
5605 	 * Allocate a leaf page table page for wired userspace pages.
5606 */
5607 uwptpg = NULL;
5608 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5609 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5610 if (uwptpg == NULL) {
5611 pmap_abort_ptp(pmap, va, l2pg);
5612 return (KERN_RESOURCE_SHORTAGE);
5613 }
5614 uwptpg->pindex = pmap_l2_pindex(va);
5615 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5616 vm_page_unwire_noq(uwptpg);
5617 vm_page_free(uwptpg);
5618 pmap_abort_ptp(pmap, va, l2pg);
5619 return (KERN_RESOURCE_SHORTAGE);
5620 }
5621 pmap_resident_count_inc(pmap, 1);
5622 uwptpg->ref_count = NL3PG;
5623 }
5624 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5625 /*
5626 * Abort this mapping if its PV entry could not be created.
5627 */
5628 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5629 if (l2pg != NULL)
5630 pmap_abort_ptp(pmap, va, l2pg);
5631 if (uwptpg != NULL) {
5632 mt = pmap_remove_pt_page(pmap, va);
5633 KASSERT(mt == uwptpg,
5634 ("removed pt page %p, expected %p", mt,
5635 uwptpg));
5636 pmap_resident_count_dec(pmap, 1);
5637 uwptpg->ref_count = 1;
5638 vm_page_unwire_noq(uwptpg);
5639 vm_page_free(uwptpg);
5640 }
5641 CTR2(KTR_PMAP,
5642 "pmap_enter_l2: failure for va %#lx in pmap %p",
5643 va, pmap);
5644 return (KERN_RESOURCE_SHORTAGE);
5645 }
5646 if ((new_l2 & ATTR_SW_DBM) != 0)
5647 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5648 vm_page_aflag_set(mt, PGA_WRITEABLE);
5649 }
5650
5651 /*
5652 * Increment counters.
5653 */
5654 if ((new_l2 & ATTR_SW_WIRED) != 0)
5655 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5656 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5657
5658 /*
5659 * Conditionally sync the icache. See pmap_enter() for details.
5660 */
5661 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5662 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5663 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5664 cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5665 L2_SIZE);
5666 }
5667
5668 /*
5669 * Map the superpage.
5670 */
5671 pmap_store(l2, new_l2);
5672 dsb(ishst);
5673
5674 atomic_add_long(&pmap_l2_mappings, 1);
5675 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5676 va, pmap);
5677
5678 return (KERN_SUCCESS);
5679 }
5680
5681 /*
5682 * Tries to create a read- and/or execute-only L3C page mapping. Returns
5683 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5684 * value.
5685 */
5686 static int
5687 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5688 vm_prot_t prot, struct rwlock **lockp)
5689 {
5690 pt_entry_t l3e;
5691
5692 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5693 PMAP_ASSERT_STAGE1(pmap);
5694 KASSERT(ADDR_IS_CANONICAL(va),
5695 ("%s: Address not in canonical form: %lx", __func__, va));
5696
5697 l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5698 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5699 ATTR_CONTIGUOUS | L3_PAGE;
5700 if ((m->oflags & VPO_UNMANAGED) == 0) {
5701 l3e |= ATTR_SW_MANAGED;
5702 l3e &= ~ATTR_AF;
5703 }
5704 if ((prot & VM_PROT_EXECUTE) == 0 ||
5705 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5706 l3e |= ATTR_S1_XN;
5707 if (!ADDR_IS_KERNEL(va))
5708 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5709 else
5710 l3e |= ATTR_S1_UXN;
5711 if (pmap != kernel_pmap)
5712 l3e |= ATTR_S1_nG;
5713 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5714 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5715 }
5716
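/*
 * Tries to create an L3C (ATTR_CONTIGUOUS) mapping of L3C_ENTRIES base pages
 * starting at the specified virtual address.  Returns KERN_SUCCESS if the
 * mapping was created.  Otherwise, returns an error value, for example,
 * KERN_FAILURE when PMAP_ENTER_NOREPLACE was specified and a mapping already
 * exists within the L3C virtual address range.
 */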
5717 static int
5718 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5719 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5720 {
5721 pd_entry_t *l2p, *pde;
5722 pt_entry_t *l3p, *tl3p;
5723 vm_page_t mt;
5724 vm_paddr_t pa;
5725 vm_pindex_t l2pindex;
5726 int lvl;
5727
5728 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5729 KASSERT((va & L3C_OFFSET) == 0,
5730 ("pmap_enter_l3c: va is not aligned"));
5731 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5732 ("pmap_enter_l3c: managed mapping within the clean submap"));
5733 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
5734 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));
5735
5736 /*
5737 * If the L3 PTP is not resident, we attempt to create it here.
5738 */
5739 if (!ADDR_IS_KERNEL(va)) {
5740 /*
5741 * Were we given the correct L3 PTP? If so, we can simply
5742 * increment its ref count.
5743 */
5744 l2pindex = pmap_l2_pindex(va);
5745 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5746 (*ml3p)->ref_count += L3C_ENTRIES;
5747 } else {
5748 retry:
5749 /*
5750 * Get the L2 entry.
5751 */
5752 pde = pmap_pde(pmap, va, &lvl);
5753
5754 /*
5755 * If the L2 entry is a superpage, we either abort or
5756 * demote depending on the given flags.
5757 */
5758 if (lvl == 1) {
5759 l2p = pmap_l1_to_l2(pde, va);
5760 if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5761 L2_BLOCK) {
5762 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5763 return (KERN_FAILURE);
5764 l3p = pmap_demote_l2_locked(pmap, l2p,
5765 va, lockp);
5766 if (l3p != NULL) {
5767 *ml3p = PTE_TO_VM_PAGE(
5768 pmap_load(l2p));
5769 (*ml3p)->ref_count +=
5770 L3C_ENTRIES;
5771 goto have_l3p;
5772 }
5773 }
5774 /* We need to allocate an L3 PTP. */
5775 }
5776
5777 /*
5778 * If the L3 PTP is mapped, we just increment its ref
5779 * count. Otherwise, we attempt to allocate it.
5780 */
5781 if (lvl == 2 && pmap_load(pde) != 0) {
5782 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5783 (*ml3p)->ref_count += L3C_ENTRIES;
5784 } else {
5785 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5786 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5787 if (*ml3p == NULL) {
5788 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5789 return (KERN_FAILURE);
5790
5791 /*
5792 * The page table may have changed
5793 * while we slept.
5794 */
5795 goto retry;
5796 }
5797 (*ml3p)->ref_count += L3C_ENTRIES - 1;
5798 }
5799 }
5800 l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
5801 } else {
5802 *ml3p = NULL;
5803
5804 /*
5805 * If the L2 entry is a superpage, we either abort or demote
5806 * depending on the given flags.
5807 */
5808 pde = pmap_pde(kernel_pmap, va, &lvl);
5809 if (lvl == 1) {
5810 l2p = pmap_l1_to_l2(pde, va);
5811 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
5812 ("pmap_enter_l3c: missing L2 block"));
5813 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5814 return (KERN_FAILURE);
5815 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
5816 } else {
5817 KASSERT(lvl == 2,
5818 ("pmap_enter_l3c: Invalid level %d", lvl));
5819 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
5820 pmap_load(pde)));
5821 }
5822 }
5823 have_l3p:
5824 l3p = &l3p[pmap_l3_index(va)];
5825
5826 /*
5827 * If bti is not the same for the whole L3C range, return failure
5828 * and let vm_fault() cope. Check after L3 allocation, since
5829 * it could sleep.
5830 */
5831 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
5832 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
5833 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
5834 pmap_abort_ptp(pmap, va, *ml3p);
5835 *ml3p = NULL;
5836 return (KERN_PROTECTION_FAILURE);
5837 }
5838
5839 /*
5840 * If there are existing mappings, either abort or remove them.
5841 */
5842 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5843 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5844 if (pmap_load(tl3p) != 0) {
5845 if (*ml3p != NULL)
5846 (*ml3p)->ref_count -= L3C_ENTRIES;
5847 return (KERN_FAILURE);
5848 }
5849 }
5850 } else {
5851 /*
5852 * Because we increment the L3 page's reference count above,
5853 * it is guaranteed not to be freed here and we can pass NULL
5854 * instead of a valid free list.
5855 */
5856 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
5857 va + L3C_SIZE, NULL, lockp);
5858 }
5859
5860 /*
5861 * Enter on the PV list if part of our managed memory.
5862 */
5863 if ((l3e & ATTR_SW_MANAGED) != 0) {
5864 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
5865 if (*ml3p != NULL) {
5866 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
5867 pmap_abort_ptp(pmap, va, *ml3p);
5868 *ml3p = NULL;
5869 }
5870 return (KERN_RESOURCE_SHORTAGE);
5871 }
5872 if ((l3e & ATTR_SW_DBM) != 0)
5873 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
5874 vm_page_aflag_set(mt, PGA_WRITEABLE);
5875 }
5876
5877 /*
5878 * Increment counters.
5879 */
5880 if ((l3e & ATTR_SW_WIRED) != 0)
5881 pmap->pm_stats.wired_count += L3C_ENTRIES;
5882 pmap_resident_count_inc(pmap, L3C_ENTRIES);
5883
5884 pa = VM_PAGE_TO_PHYS(m);
5885 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
5886
5887 /*
5888 * Sync the icache before the mapping is stored.
5889 */
5890 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
5891 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5892 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
5893
5894 /*
5895 * Map the superpage.
5896 */
5897 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5898 pmap_store(tl3p, l3e);
5899 l3e += L3_SIZE;
5900 }
5901 dsb(ishst);
5902
5903 counter_u64_add(pmap_l3c_mappings, 1);
5904 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
5905 va, pmap);
5906 return (KERN_SUCCESS);
5907 }
5908
5909 /*
5910 * Maps a sequence of resident pages belonging to the same object.
5911 * The sequence begins with the given page m_start. This page is
5912 * mapped at the given virtual address start. Each subsequent page is
5913 * mapped at a virtual address that is offset from start by the same
5914 * amount as the page is offset from m_start within the object. The
5915 * last page in the sequence is the page with the largest offset from
5916 * m_start that can be mapped at a virtual address less than the given
5917 * virtual address end. Not every virtual page between start and end
5918 * is mapped; only those for which a resident page exists with the
5919 * corresponding offset from m_start are mapped.
5920 */
5921 void
5922 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5923 vm_page_t m_start, vm_prot_t prot)
5924 {
5925 struct rwlock *lock;
5926 vm_offset_t va;
5927 vm_page_t m, mpte;
5928 vm_pindex_t diff, psize;
5929 int rv;
5930
5931 VM_OBJECT_ASSERT_LOCKED(m_start->object);
5932
5933 psize = atop(end - start);
5934 mpte = NULL;
5935 m = m_start;
5936 lock = NULL;
5937 PMAP_LOCK(pmap);
5938 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5939 va = start + ptoa(diff);
5940 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
5941 m->psind == 2 && pmap_ps_enabled(pmap) &&
5942 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
5943 KERN_SUCCESS || rv == KERN_NO_SPACE))
5944 m = &m[L2_SIZE / PAGE_SIZE - 1];
5945 else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
5946 m->psind >= 1 && pmap_ps_enabled(pmap) &&
5947 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
5948 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
5949 m = &m[L3C_ENTRIES - 1];
5950 else {
5951 /*
5952 * In general, if a superpage mapping were possible,
5953 * it would have been created above. That said, if
5954 * start and end are not superpage aligned, then
5955 * promotion might be possible at the ends of [start,
5956 * end). However, in practice, those promotion
5957 * attempts are so unlikely to succeed that they are
5958 * not worth trying.
5959 */
5960 mpte = pmap_enter_quick_locked(pmap, va, m, prot |
5961 VM_PROT_NO_PROMOTE, mpte, &lock);
5962 }
5963 m = TAILQ_NEXT(m, listq);
5964 }
5965 if (lock != NULL)
5966 rw_wunlock(lock);
5967 PMAP_UNLOCK(pmap);
5968 }
5969
5970 /*
5971  * This code makes some *MAJOR* assumptions:
5972  * 1. The current pmap and the target pmap exist.
5973  * 2. Not wired.
5974  * 3. Read access.
5975  * 4. No page table pages.
5976  * but it is *MUCH* faster than pmap_enter...
5977 */
5978
5979 void
5980 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5981 {
5982 struct rwlock *lock;
5983
5984 lock = NULL;
5985 PMAP_LOCK(pmap);
5986 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5987 if (lock != NULL)
5988 rw_wunlock(lock);
5989 PMAP_UNLOCK(pmap);
5990 }
5991
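/*
 * Create a read-only base page mapping for "m" at "va" without ever sleeping.
 * "mpte" is a hint naming the page table page used by the previous call.
 * Returns the page table page that was used, or NULL on failure or for
 * kernel addresses.
 */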
5992 static vm_page_t
5993 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5994 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5995 {
5996 pt_entry_t *l1, *l2, *l3, l3_val;
5997 vm_paddr_t pa;
5998 int full_lvl, lvl;
5999
6000 KASSERT(!VA_IS_CLEANMAP(va) ||
6001 (m->oflags & VPO_UNMANAGED) != 0,
6002 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
6003 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6004 PMAP_ASSERT_STAGE1(pmap);
6005 KASSERT(ADDR_IS_CANONICAL(va),
6006 ("%s: Address not in canonical form: %lx", __func__, va));
6007 l2 = NULL;
6008
6009 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
6010 /*
6011 * In the case that a page table page is not
6012 * resident, we are creating it here.
6013 */
6014 if (!ADDR_IS_KERNEL(va)) {
6015 vm_pindex_t l2pindex;
6016
6017 /*
6018 		 * Calculate the page table page index.
6019 */
6020 l2pindex = pmap_l2_pindex(va);
6021 if (mpte && (mpte->pindex == l2pindex)) {
6022 mpte->ref_count++;
6023 } else {
6024 /*
6025 * If the page table page is mapped, we just increment
6026 * the hold count, and activate it. Otherwise, we
6027 * attempt to allocate a page table page, passing NULL
6028 * instead of the PV list lock pointer because we don't
6029 * intend to sleep. If this attempt fails, we don't
6030 * retry. Instead, we give up.
6031 */
6032 l1 = pmap_l1(pmap, va);
6033 if (l1 != NULL && pmap_load(l1) != 0) {
6034 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
6035 L1_BLOCK)
6036 return (NULL);
6037 l2 = pmap_l1_to_l2(l1, va);
6038 if (pmap_load(l2) != 0) {
6039 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
6040 L2_BLOCK)
6041 return (NULL);
6042 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
6043 mpte->ref_count++;
6044 } else {
6045 mpte = _pmap_alloc_l3(pmap, l2pindex,
6046 NULL);
6047 if (mpte == NULL)
6048 return (mpte);
6049 }
6050 } else {
6051 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
6052 if (mpte == NULL)
6053 return (mpte);
6054 }
6055 }
6056 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
6057 l3 = &l3[pmap_l3_index(va)];
6058 } else {
6059 mpte = NULL;
6060 l2 = pmap_pde(kernel_pmap, va, &lvl);
6061 KASSERT(l2 != NULL,
6062 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
6063 va));
6064 KASSERT(lvl == 2,
6065 ("pmap_enter_quick_locked: Invalid level %d", lvl));
6066 l3 = pmap_l2_to_l3(l2, va);
6067 }
6068
6069 /*
6070 * Abort if a mapping already exists.
6071 */
6072 if (pmap_load(l3) != 0) {
6073 if (mpte != NULL)
6074 mpte->ref_count--;
6075 return (NULL);
6076 }
6077
6078 /*
6079 * Enter on the PV list if part of our managed memory.
6080 */
6081 if ((m->oflags & VPO_UNMANAGED) == 0 &&
6082 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6083 if (mpte != NULL)
6084 pmap_abort_ptp(pmap, va, mpte);
6085 return (NULL);
6086 }
6087
6088 /*
6089 * Increment counters
6090 */
6091 pmap_resident_count_inc(pmap, 1);
6092
6093 pa = VM_PAGE_TO_PHYS(m);
6094 l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
6095 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
6096 l3_val |= pmap_pte_bti(pmap, va);
6097 if ((prot & VM_PROT_EXECUTE) == 0 ||
6098 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6099 l3_val |= ATTR_S1_XN;
6100 if (!ADDR_IS_KERNEL(va))
6101 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6102 else
6103 l3_val |= ATTR_S1_UXN;
6104 if (pmap != kernel_pmap)
6105 l3_val |= ATTR_S1_nG;
6106
6107 /*
6108 	 * Now validate the mapping with RO protection.
6109 */
6110 if ((m->oflags & VPO_UNMANAGED) == 0) {
6111 l3_val |= ATTR_SW_MANAGED;
6112 l3_val &= ~ATTR_AF;
6113 }
6114
6115 /* Sync icache before the mapping is stored to PTE */
6116 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
6117 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6118 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
6119
6120 pmap_store(l3, l3_val);
6121 dsb(ishst);
6122
6123 #if VM_NRESERVLEVEL > 0
6124 /*
6125 * First, attempt L3C promotion, if the virtual and physical addresses
6126 * are aligned with each other and an underlying reservation has the
6127 * neighboring L3 pages allocated. The first condition is simply an
6128 * optimization that recognizes some eventual promotion failures early
6129 * at a lower run-time cost. Then, attempt L2 promotion, if both a
6130 * level 1 reservation and the PTP are fully populated.
6131 */
6132 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
6133 (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
6134 (m->flags & PG_FICTITIOUS) == 0 &&
6135 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
6136 pmap_promote_l3c(pmap, l3, va) &&
6137 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
6138 if (l2 == NULL)
6139 l2 = pmap_l2(pmap, va);
6140
6141 /*
6142 * If promotion succeeds, then the next call to this function
6143 * should not be given the unmapped PTP as a hint.
6144 */
6145 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
6146 mpte = NULL;
6147 }
6148 #endif
6149
6150 return (mpte);
6151 }
6152
6153 /*
6154 * This code maps large physical mmap regions into the
6155 * processor address space. Note that some shortcuts
6156 * are taken, but the code works.
6157 */
6158 void
6159 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6160 vm_pindex_t pindex, vm_size_t size)
6161 {
6162
6163 VM_OBJECT_ASSERT_WLOCKED(object);
6164 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6165 ("pmap_object_init_pt: non-device object"));
6166 }
6167
6168 /*
6169 * Clear the wired attribute from the mappings for the specified range of
6170 * addresses in the given pmap. Every valid mapping within that range
6171 * must have the wired attribute set. In contrast, invalid mappings
6172 * cannot have the wired attribute set, so they are ignored.
6173 *
6174 * The wired attribute of the page table entry is not a hardware feature,
6175 * so there is no need to invalidate any TLB entries.
6176 */
6177 void
6178 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6179 {
6180 vm_offset_t va_next;
6181 pd_entry_t *l0, *l1, *l2;
6182 pt_entry_t *l3;
6183 bool partial_l3c;
6184
6185 PMAP_LOCK(pmap);
6186 for (; sva < eva; sva = va_next) {
6187 l0 = pmap_l0(pmap, sva);
6188 if (pmap_load(l0) == 0) {
6189 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6190 if (va_next < sva)
6191 va_next = eva;
6192 continue;
6193 }
6194
6195 l1 = pmap_l0_to_l1(l0, sva);
6196 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6197 if (va_next < sva)
6198 va_next = eva;
6199 if (pmap_load(l1) == 0)
6200 continue;
6201
6202 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6203 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6204 KASSERT(va_next <= eva,
6205 ("partial update of non-transparent 1G page "
6206 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6207 pmap_load(l1), sva, eva, va_next));
6208 MPASS(pmap != kernel_pmap);
6209 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6210 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6211 pmap_clear_bits(l1, ATTR_SW_WIRED);
6212 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6213 continue;
6214 }
6215
6216 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6217 if (va_next < sva)
6218 va_next = eva;
6219
6220 l2 = pmap_l1_to_l2(l1, sva);
6221 if (pmap_load(l2) == 0)
6222 continue;
6223
6224 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6225 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6226 panic("pmap_unwire: l2 %#jx is missing "
6227 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6228
6229 /*
6230 * Are we unwiring the entire large page? If not,
6231 * demote the mapping and fall through.
6232 */
6233 if (sva + L2_SIZE == va_next && eva >= va_next) {
6234 pmap_clear_bits(l2, ATTR_SW_WIRED);
6235 pmap->pm_stats.wired_count -= L2_SIZE /
6236 PAGE_SIZE;
6237 continue;
6238 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6239 panic("pmap_unwire: demotion failed");
6240 }
6241 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6242 ("pmap_unwire: Invalid l2 entry after demotion"));
6243
6244 if (va_next > eva)
6245 va_next = eva;
6246 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6247 sva != va_next; l3++, sva += L3_SIZE) {
6248 if (pmap_load(l3) == 0)
6249 continue;
6250 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6251 /*
6252 * Avoid demotion for whole-page unwiring.
6253 */
6254 if ((sva & L3C_OFFSET) == 0) {
6255 /*
6256 * Handle the possibility that
6257 * "va_next" is zero because of
6258 * address wraparound.
6259 */
6260 partial_l3c = sva + L3C_OFFSET >
6261 va_next - 1;
6262 }
6263 if (partial_l3c)
6264 (void)pmap_demote_l3c(pmap, l3, sva);
6265 }
6266 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6267 panic("pmap_unwire: l3 %#jx is missing "
6268 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6269
6270 /*
6271 * ATTR_SW_WIRED must be cleared atomically. Although
6272 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6273 * the System MMU may write to the entry concurrently.
6274 */
6275 pmap_clear_bits(l3, ATTR_SW_WIRED);
6276 pmap->pm_stats.wired_count--;
6277 }
6278 }
6279 PMAP_UNLOCK(pmap);
6280 }
6281
6282 /*
6283 * This function requires that the caller has already added one to ml3's
6284 * ref_count in anticipation of creating a 4KB page mapping.
6285 */
6286 static bool
6287 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6288 vm_page_t ml3, struct rwlock **lockp)
6289 {
6290 pt_entry_t *tl3p;
6291
6292 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6293 KASSERT((va & L3C_OFFSET) == 0,
6294 ("pmap_copy_l3c: va is not aligned"));
6295 KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6296 ("pmap_copy_l3c: l3e is not managed"));
6297
6298 /*
6299 * Abort if a mapping already exists.
6300 */
6301 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6302 if (pmap_load(tl3p) != 0) {
6303 if (ml3 != NULL)
6304 ml3->ref_count--;
6305 return (false);
6306 }
6307
6308 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6309 if (ml3 != NULL)
6310 pmap_abort_ptp(pmap, va, ml3);
6311 return (false);
6312 }
6313 ml3->ref_count += L3C_ENTRIES - 1;
6314
6315 /*
6316 * Clear the wired and accessed bits. However, leave the dirty bit
6317 * unchanged because read/write superpage mappings are required to be
6318 * dirty.
6319 */
6320 l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6321
6322 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6323 pmap_store(tl3p, l3e);
6324 l3e += L3_SIZE;
6325 }
6326 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6327 counter_u64_add(pmap_l3c_mappings, 1);
6328 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6329 va, pmap);
6330 return (true);
6331 }
6332
6333 /*
6334 * Copy the range specified by src_addr/len
6335 * from the source map to the range dst_addr/len
6336 * in the destination map.
6337 *
6338 * This routine is only advisory and need not do anything.
6339 *
6340 * Because the executable mappings created by this routine are copied,
6341 * it should not have to flush the instruction cache.
6342 */
6343 void
6344 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6345 vm_offset_t src_addr)
6346 {
6347 struct rwlock *lock;
6348 pd_entry_t *l0, *l1, *l2, srcptepaddr;
6349 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6350 vm_offset_t addr, end_addr, va_next;
6351 vm_page_t dst_m, dstmpte, srcmpte;
6352
6353 PMAP_ASSERT_STAGE1(dst_pmap);
6354 PMAP_ASSERT_STAGE1(src_pmap);
6355
6356 if (dst_addr != src_addr)
6357 return;
6358 end_addr = src_addr + len;
6359 lock = NULL;
6360 if (dst_pmap < src_pmap) {
6361 PMAP_LOCK(dst_pmap);
6362 PMAP_LOCK(src_pmap);
6363 } else {
6364 PMAP_LOCK(src_pmap);
6365 PMAP_LOCK(dst_pmap);
6366 }
6367 for (addr = src_addr; addr < end_addr; addr = va_next) {
6368 l0 = pmap_l0(src_pmap, addr);
6369 if (pmap_load(l0) == 0) {
6370 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6371 if (va_next < addr)
6372 va_next = end_addr;
6373 continue;
6374 }
6375
6376 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6377 if (va_next < addr)
6378 va_next = end_addr;
6379 l1 = pmap_l0_to_l1(l0, addr);
6380 if (pmap_load(l1) == 0)
6381 continue;
6382 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6383 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6384 KASSERT(va_next <= end_addr,
6385 ("partial update of non-transparent 1G page "
6386 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6387 pmap_load(l1), addr, end_addr, va_next));
6388 srcptepaddr = pmap_load(l1);
6389 l1 = pmap_l1(dst_pmap, addr);
6390 if (l1 == NULL) {
6391 if (_pmap_alloc_l3(dst_pmap,
6392 pmap_l0_pindex(addr), NULL) == NULL)
6393 break;
6394 l1 = pmap_l1(dst_pmap, addr);
6395 } else {
6396 l0 = pmap_l0(dst_pmap, addr);
6397 dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6398 dst_m->ref_count++;
6399 }
6400 KASSERT(pmap_load(l1) == 0,
6401 ("1G mapping present in dst pmap "
6402 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6403 pmap_load(l1), addr, end_addr, va_next));
6404 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6405 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6406 continue;
6407 }
6408
6409 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6410 if (va_next < addr)
6411 va_next = end_addr;
6412 l2 = pmap_l1_to_l2(l1, addr);
6413 srcptepaddr = pmap_load(l2);
6414 if (srcptepaddr == 0)
6415 continue;
6416 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6417 /*
6418 * We can only virtual copy whole superpages.
6419 */
6420 if ((addr & L2_OFFSET) != 0 ||
6421 addr + L2_SIZE > end_addr)
6422 continue;
6423 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6424 if (l2 == NULL)
6425 break;
6426 if (pmap_load(l2) == 0 &&
6427 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6428 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6429 PMAP_ENTER_NORECLAIM, &lock))) {
6430 /*
6431 * We leave the dirty bit unchanged because
6432 * managed read/write superpage mappings are
6433 * required to be dirty. However, managed
6434 * superpage mappings are not required to
6435 * have their accessed bit set, so we clear
6436 * it because we don't know if this mapping
6437 * will be used.
6438 */
6439 srcptepaddr &= ~ATTR_SW_WIRED;
6440 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6441 srcptepaddr &= ~ATTR_AF;
6442 pmap_store(l2, srcptepaddr);
6443 pmap_resident_count_inc(dst_pmap, L2_SIZE /
6444 PAGE_SIZE);
6445 atomic_add_long(&pmap_l2_mappings, 1);
6446 } else
6447 pmap_abort_ptp(dst_pmap, addr, dst_m);
6448 continue;
6449 }
6450 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6451 ("pmap_copy: invalid L2 entry"));
6452 srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6453 KASSERT(srcmpte->ref_count > 0,
6454 ("pmap_copy: source page table page is unused"));
6455 if (va_next > end_addr)
6456 va_next = end_addr;
6457 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6458 src_pte = &src_pte[pmap_l3_index(addr)];
6459 dstmpte = NULL;
6460 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6461 ptetemp = pmap_load(src_pte);
6462
6463 /*
6464 * We only virtual copy managed pages.
6465 */
6466 if ((ptetemp & ATTR_SW_MANAGED) == 0)
6467 continue;
6468
6469 if (dstmpte != NULL) {
6470 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6471 ("dstmpte pindex/addr mismatch"));
6472 dstmpte->ref_count++;
6473 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6474 NULL)) == NULL)
6475 goto out;
6476 dst_pte = (pt_entry_t *)
6477 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6478 dst_pte = &dst_pte[pmap_l3_index(addr)];
6479 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6480 L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6481 va_next - 1) {
6482 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6483 ptetemp, dstmpte, &lock))
6484 goto out;
6485 addr += L3C_SIZE - PAGE_SIZE;
6486 src_pte += L3C_ENTRIES - 1;
6487 } else if (pmap_load(dst_pte) == 0 &&
6488 pmap_try_insert_pv_entry(dst_pmap, addr,
6489 PTE_TO_VM_PAGE(ptetemp), &lock)) {
6490 /*
6491 * Clear the wired, contiguous, modified, and
6492 * accessed bits from the destination PTE.
6493 * The contiguous bit is cleared because we
6494 * are not copying the entire L3C superpage.
6495 */
6496 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6497 ATTR_AF;
6498 nbits = 0;
6499 if ((ptetemp & ATTR_SW_DBM) != 0)
6500 nbits |= ATTR_S1_AP_RW_BIT;
6501 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6502 pmap_resident_count_inc(dst_pmap, 1);
6503 } else {
6504 pmap_abort_ptp(dst_pmap, addr, dstmpte);
6505 goto out;
6506 }
6507 /* Have we copied all of the valid mappings? */
6508 if (dstmpte->ref_count >= srcmpte->ref_count)
6509 break;
6510 }
6511 }
6512 out:
6513 /*
6514 * XXX This barrier may not be needed because the destination pmap is
6515 * not active.
6516 */
6517 dsb(ishst);
6518
6519 if (lock != NULL)
6520 rw_wunlock(lock);
6521 PMAP_UNLOCK(src_pmap);
6522 PMAP_UNLOCK(dst_pmap);
6523 }
6524
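/*
 * Copy the per-VA BTI settings from the source pmap to the destination pmap,
 * retrying on transient memory shortage.  Both pmaps must be at the same
 * translation stage; stage 2 pmaps and pmaps without BTI state are a no-op.
 */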
6525 int
6526 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6527 {
6528 int error;
6529
6530 if (dst_pmap->pm_stage != src_pmap->pm_stage)
6531 return (EINVAL);
6532
6533 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6534 return (0);
6535
6536 for (;;) {
6537 if (dst_pmap < src_pmap) {
6538 PMAP_LOCK(dst_pmap);
6539 PMAP_LOCK(src_pmap);
6540 } else {
6541 PMAP_LOCK(src_pmap);
6542 PMAP_LOCK(dst_pmap);
6543 }
6544 error = pmap_bti_copy(dst_pmap, src_pmap);
6545 /* Clean up partial copy on failure due to no memory. */
6546 if (error == ENOMEM)
6547 pmap_bti_deassign_all(dst_pmap);
6548 PMAP_UNLOCK(src_pmap);
6549 PMAP_UNLOCK(dst_pmap);
6550 if (error != ENOMEM)
6551 break;
6552 vm_wait(NULL);
6553 }
6554 return (error);
6555 }
6556
6557 /*
6558  * pmap_zero_page zeros the specified hardware page by addressing it
6559  * through the direct map and calling pagezero() to clear its contents.
6560 */
6561 void
6562 pmap_zero_page(vm_page_t m)
6563 {
6564 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6565
6566 pagezero((void *)va);
6567 }
6568
6569 /*
6570  * pmap_zero_page_area zeros the specified portion of a hardware page
6571  * by addressing it through the direct map and using bzero to clear it.
6572 *
6573 * off and size may not cover an area beyond a single hardware page.
6574 */
6575 void
6576 pmap_zero_page_area(vm_page_t m, int off, int size)
6577 {
6578 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6579
6580 if (off == 0 && size == PAGE_SIZE)
6581 pagezero((void *)va);
6582 else
6583 bzero((char *)va + off, size);
6584 }
6585
6586 /*
6587  * pmap_copy_page copies the specified (machine independent)
6588  * page by addressing both pages through the direct map and
6589  * calling pagecopy() to copy the contents, one machine
6590  * dependent page at a time.
6591 */
6592 void
6593 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6594 {
6595 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6596 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6597
6598 pagecopy((void *)src, (void *)dst);
6599 }
6600
6601 int unmapped_buf_allowed = 1;
6602
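/*
 * Copy "xfersize" bytes from the pages "ma", starting at byte offset
 * "a_offset", to the pages "mb", starting at byte offset "b_offset",
 * using the direct map and splitting the copy at page boundaries.
 */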
6603 void
6604 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6605 vm_offset_t b_offset, int xfersize)
6606 {
6607 void *a_cp, *b_cp;
6608 vm_page_t m_a, m_b;
6609 vm_paddr_t p_a, p_b;
6610 vm_offset_t a_pg_offset, b_pg_offset;
6611 int cnt;
6612
6613 while (xfersize > 0) {
6614 a_pg_offset = a_offset & PAGE_MASK;
6615 m_a = ma[a_offset >> PAGE_SHIFT];
6616 p_a = m_a->phys_addr;
6617 b_pg_offset = b_offset & PAGE_MASK;
6618 m_b = mb[b_offset >> PAGE_SHIFT];
6619 p_b = m_b->phys_addr;
6620 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6621 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6622 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6623 panic("!DMAP a %lx", p_a);
6624 } else {
6625 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6626 }
6627 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6628 panic("!DMAP b %lx", p_b);
6629 } else {
6630 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6631 }
6632 bcopy(a_cp, b_cp, cnt);
6633 a_offset += cnt;
6634 b_offset += cnt;
6635 xfersize -= cnt;
6636 }
6637 }
6638
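/*
 * Pages are addressed through the direct map, so no temporary kernel mapping
 * is needed: pmap_quick_enter_page() simply returns the page's direct map
 * address and pmap_quick_remove_page() is therefore a no-op.
 */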
6639 vm_offset_t
6640 pmap_quick_enter_page(vm_page_t m)
6641 {
6642
6643 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6644 }
6645
6646 void
6647 pmap_quick_remove_page(vm_offset_t addr)
6648 {
6649 }
6650
6651 /*
6652 * Returns true if the pmap's pv is one of the first
6653 * 16 pvs linked to from this page. This count may
6654 * be changed upwards or downwards in the future; it
6655 * is only necessary that true be returned for a small
6656 * subset of pmaps for proper page aging.
6657 */
6658 bool
6659 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6660 {
6661 struct md_page *pvh;
6662 struct rwlock *lock;
6663 pv_entry_t pv;
6664 int loops = 0;
6665 bool rv;
6666
6667 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6668 ("pmap_page_exists_quick: page %p is not managed", m));
6669 rv = false;
6670 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6671 rw_rlock(lock);
6672 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6673 if (PV_PMAP(pv) == pmap) {
6674 rv = true;
6675 break;
6676 }
6677 loops++;
6678 if (loops >= 16)
6679 break;
6680 }
6681 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6682 pvh = page_to_pvh(m);
6683 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6684 if (PV_PMAP(pv) == pmap) {
6685 rv = true;
6686 break;
6687 }
6688 loops++;
6689 if (loops >= 16)
6690 break;
6691 }
6692 }
6693 rw_runlock(lock);
6694 return (rv);
6695 }
6696
6697 /*
6698 * pmap_page_wired_mappings:
6699 *
6700 * Return the number of managed mappings to the given physical page
6701 * that are wired.
6702 */
6703 int
6704 pmap_page_wired_mappings(vm_page_t m)
6705 {
6706 struct rwlock *lock;
6707 struct md_page *pvh;
6708 pmap_t pmap;
6709 pt_entry_t *pte;
6710 pv_entry_t pv;
6711 int count, md_gen, pvh_gen;
6712
6713 if ((m->oflags & VPO_UNMANAGED) != 0)
6714 return (0);
6715 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6716 rw_rlock(lock);
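	/*
	 * If a pmap lock cannot be acquired without blocking while the PV
	 * list lock is held, drop the PV list lock, block on the pmap lock,
	 * and then use the generation counts to detect concurrent changes,
	 * restarting the scan if any occurred.
	 */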
6717 restart:
6718 count = 0;
6719 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6720 pmap = PV_PMAP(pv);
6721 if (!PMAP_TRYLOCK(pmap)) {
6722 md_gen = m->md.pv_gen;
6723 rw_runlock(lock);
6724 PMAP_LOCK(pmap);
6725 rw_rlock(lock);
6726 if (md_gen != m->md.pv_gen) {
6727 PMAP_UNLOCK(pmap);
6728 goto restart;
6729 }
6730 }
6731 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6732 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6733 count++;
6734 PMAP_UNLOCK(pmap);
6735 }
6736 if ((m->flags & PG_FICTITIOUS) == 0) {
6737 pvh = page_to_pvh(m);
6738 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6739 pmap = PV_PMAP(pv);
6740 if (!PMAP_TRYLOCK(pmap)) {
6741 md_gen = m->md.pv_gen;
6742 pvh_gen = pvh->pv_gen;
6743 rw_runlock(lock);
6744 PMAP_LOCK(pmap);
6745 rw_rlock(lock);
6746 if (md_gen != m->md.pv_gen ||
6747 pvh_gen != pvh->pv_gen) {
6748 PMAP_UNLOCK(pmap);
6749 goto restart;
6750 }
6751 }
6752 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6753 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6754 count++;
6755 PMAP_UNLOCK(pmap);
6756 }
6757 }
6758 rw_runlock(lock);
6759 return (count);
6760 }
6761
6762 /*
6763 * Returns true if the given page is mapped individually or as part of
6764 * a 2mpage. Otherwise, returns false.
6765 */
6766 bool
6767 pmap_page_is_mapped(vm_page_t m)
6768 {
6769 struct rwlock *lock;
6770 bool rv;
6771
6772 if ((m->oflags & VPO_UNMANAGED) != 0)
6773 return (false);
6774 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6775 rw_rlock(lock);
6776 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6777 ((m->flags & PG_FICTITIOUS) == 0 &&
6778 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6779 rw_runlock(lock);
6780 return (rv);
6781 }
6782
6783 /*
6784 * Destroy all managed, non-wired mappings in the given user-space
6785 * pmap. This pmap cannot be active on any processor besides the
6786 * caller.
6787 *
6788 * This function cannot be applied to the kernel pmap. Moreover, it
6789 * is not intended for general use. It is only to be used during
6790 * process termination. Consequently, it can be implemented in ways
6791 * that make it faster than pmap_remove(). First, it can more quickly
6792 * destroy mappings by iterating over the pmap's collection of PV
6793 * entries, rather than searching the page table. Second, it doesn't
6794 * have to test and clear the page table entries atomically, because
6795 * no processor is currently accessing the user address space. In
6796 * particular, a page table entry's dirty bit won't change state once
6797 * this function starts.
6798 */
6799 void
6800 pmap_remove_pages(pmap_t pmap)
6801 {
6802 pd_entry_t *pde;
6803 pt_entry_t *pte, tpte;
6804 struct spglist free;
6805 struct pv_chunklist free_chunks[PMAP_MEMDOM];
6806 vm_page_t m, ml3, mt;
6807 pv_entry_t pv;
6808 struct md_page *pvh;
6809 struct pv_chunk *pc, *npc;
6810 struct rwlock *lock;
6811 int64_t bit;
6812 uint64_t inuse, bitmask;
6813 int allfree, field, i, idx, lvl;
6814 int freed __pvused;
6815 vm_paddr_t pa;
6816
6817 lock = NULL;
6818
6819 for (i = 0; i < PMAP_MEMDOM; i++)
6820 TAILQ_INIT(&free_chunks[i]);
6821 SLIST_INIT(&free);
6822 PMAP_LOCK(pmap);
6823 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6824 allfree = 1;
6825 freed = 0;
6826 for (field = 0; field < _NPCM; field++) {
6827 inuse = ~pc->pc_map[field] & pc_freemask[field];
6828 while (inuse != 0) {
6829 bit = ffsl(inuse) - 1;
6830 bitmask = 1UL << bit;
6831 idx = field * 64 + bit;
6832 pv = &pc->pc_pventry[idx];
6833 inuse &= ~bitmask;
6834
6835 pde = pmap_pde(pmap, pv->pv_va, &lvl);
6836 KASSERT(pde != NULL,
6837 ("Attempting to remove an unmapped page"));
6838
6839 switch(lvl) {
6840 case 1:
6841 pte = pmap_l1_to_l2(pde, pv->pv_va);
6842 tpte = pmap_load(pte);
6843 KASSERT((tpte & ATTR_DESCR_MASK) ==
6844 L2_BLOCK,
6845 ("Attempting to remove an invalid "
6846 "block: %lx", tpte));
6847 break;
6848 case 2:
6849 pte = pmap_l2_to_l3(pde, pv->pv_va);
6850 tpte = pmap_load(pte);
6851 KASSERT((tpte & ATTR_DESCR_MASK) ==
6852 L3_PAGE,
6853 ("Attempting to remove an invalid "
6854 "page: %lx", tpte));
6855 break;
6856 default:
6857 panic(
6858 "Invalid page directory level: %d",
6859 lvl);
6860 }
6861
6862 /*
6863 * We cannot remove wired mappings at this time.
6864 *
6865 * For L3C superpages, all of the constituent PTEs
6866 * should have the wired bit set, so we don't
6867 * check for ATTR_CONTIGUOUS here.
6868 */
6869 if (tpte & ATTR_SW_WIRED) {
6870 allfree = 0;
6871 continue;
6872 }
6873
6874 /* Mark free */
6875 pc->pc_map[field] |= bitmask;
6876
6877 /*
6878 * Because this pmap is not active on other
6879 * processors, the dirty bit cannot have
6880 * changed state since we last loaded pte.
6881 */
6882 pmap_clear(pte);
6883
6884 pa = PTE_TO_PHYS(tpte);
6885
6886 m = PHYS_TO_VM_PAGE(pa);
6887 KASSERT(m->phys_addr == pa,
6888 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6889 m, (uintmax_t)m->phys_addr,
6890 (uintmax_t)tpte));
6891
6892 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6893 m < &vm_page_array[vm_page_array_size],
6894 ("pmap_remove_pages: bad pte %#jx",
6895 (uintmax_t)tpte));
6896
6897 /*
6898 * Update the vm_page_t clean/reference bits.
6899 *
6900 * We don't check for ATTR_CONTIGUOUS here
6901 * because writeable L3C superpages are expected
6902 * to be dirty, i.e., every constituent PTE
6903 * should be dirty.
6904 */
6905 if (pmap_pte_dirty(pmap, tpte)) {
6906 switch (lvl) {
6907 case 1:
6908 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6909 vm_page_dirty(mt);
6910 break;
6911 case 2:
6912 vm_page_dirty(m);
6913 break;
6914 }
6915 }
6916
6917 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6918
6919 switch (lvl) {
6920 case 1:
6921 pmap_resident_count_dec(pmap,
6922 L2_SIZE / PAGE_SIZE);
6923 pvh = page_to_pvh(m);
6924 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
6925 pvh->pv_gen++;
6926 if (TAILQ_EMPTY(&pvh->pv_list)) {
6927 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6928 if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
6929 TAILQ_EMPTY(&mt->md.pv_list))
6930 vm_page_aflag_clear(mt, PGA_WRITEABLE);
6931 }
6932 ml3 = pmap_remove_pt_page(pmap,
6933 pv->pv_va);
6934 if (ml3 != NULL) {
6935 KASSERT(vm_page_any_valid(ml3),
6936 ("pmap_remove_pages: l3 page not promoted"));
6937 pmap_resident_count_dec(pmap,1);
6938 KASSERT(ml3->ref_count == NL3PG,
6939 ("pmap_remove_pages: l3 page ref count error"));
6940 ml3->ref_count = 0;
6941 pmap_add_delayed_free_list(ml3,
6942 &free, false);
6943 }
6944 break;
6945 case 2:
6946 pmap_resident_count_dec(pmap, 1);
6947 TAILQ_REMOVE(&m->md.pv_list, pv,
6948 pv_next);
6949 m->md.pv_gen++;
6950 if ((m->a.flags & PGA_WRITEABLE) != 0 &&
6951 TAILQ_EMPTY(&m->md.pv_list) &&
6952 (m->flags & PG_FICTITIOUS) == 0) {
6953 pvh = page_to_pvh(m);
6954 if (TAILQ_EMPTY(&pvh->pv_list))
6955 vm_page_aflag_clear(m,
6956 PGA_WRITEABLE);
6957 }
6958 break;
6959 }
6960 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
6961 &free);
6962 freed++;
6963 }
6964 }
6965 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6966 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6967 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6968 if (allfree) {
6969 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6970 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
6971 pc_list);
6972 }
6973 }
6974 if (lock != NULL)
6975 rw_wunlock(lock);
6976 pmap_invalidate_all(pmap);
6977 pmap_bti_deassign_all(pmap);
6978 free_pv_chunk_batch(free_chunks);
6979 PMAP_UNLOCK(pmap);
6980 vm_page_free_pages_toq(&free, true);
6981 }
6982
6983 /*
6984 * This is used to check if a page has been accessed or modified.
6985 */
6986 static bool
6987 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
6988 {
6989 struct rwlock *lock;
6990 pv_entry_t pv;
6991 struct md_page *pvh;
6992 pt_entry_t l3e, mask, *pte, value;
6993 pmap_t pmap;
6994 int md_gen, pvh_gen;
6995 bool rv;
6996
6997 rv = false;
6998 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6999 rw_rlock(lock);
7000 restart:
7001 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7002 pmap = PV_PMAP(pv);
7003 PMAP_ASSERT_STAGE1(pmap);
7004 if (!PMAP_TRYLOCK(pmap)) {
7005 md_gen = m->md.pv_gen;
7006 rw_runlock(lock);
7007 PMAP_LOCK(pmap);
7008 rw_rlock(lock);
7009 if (md_gen != m->md.pv_gen) {
7010 PMAP_UNLOCK(pmap);
7011 goto restart;
7012 }
7013 }
7014 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
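		/*
		 * Build the attribute test: the mapping is considered
		 * "modified" when its stage 1 access permissions allow
		 * writes, and "accessed" when it is a valid L3 page with
		 * the access flag (ATTR_AF) set.
		 */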
7015 mask = 0;
7016 value = 0;
7017 if (modified) {
7018 mask |= ATTR_S1_AP_RW_BIT;
7019 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7020 }
7021 if (accessed) {
7022 mask |= ATTR_AF | ATTR_DESCR_MASK;
7023 value |= ATTR_AF | L3_PAGE;
7024 }
7025 l3e = pmap_load(pte);
7026 if ((l3e & ATTR_CONTIGUOUS) != 0)
7027 l3e = pmap_load_l3c(pte);
7028 PMAP_UNLOCK(pmap);
7029 rv = (l3e & mask) == value;
7030 if (rv)
7031 goto out;
7032 }
7033 if ((m->flags & PG_FICTITIOUS) == 0) {
7034 pvh = page_to_pvh(m);
7035 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7036 pmap = PV_PMAP(pv);
7037 PMAP_ASSERT_STAGE1(pmap);
7038 if (!PMAP_TRYLOCK(pmap)) {
7039 md_gen = m->md.pv_gen;
7040 pvh_gen = pvh->pv_gen;
7041 rw_runlock(lock);
7042 PMAP_LOCK(pmap);
7043 rw_rlock(lock);
7044 if (md_gen != m->md.pv_gen ||
7045 pvh_gen != pvh->pv_gen) {
7046 PMAP_UNLOCK(pmap);
7047 goto restart;
7048 }
7049 }
7050 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
7051 mask = 0;
7052 value = 0;
7053 if (modified) {
7054 mask |= ATTR_S1_AP_RW_BIT;
7055 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7056 }
7057 if (accessed) {
7058 mask |= ATTR_AF | ATTR_DESCR_MASK;
7059 value |= ATTR_AF | L2_BLOCK;
7060 }
7061 rv = (pmap_load(pte) & mask) == value;
7062 PMAP_UNLOCK(pmap);
7063 if (rv)
7064 goto out;
7065 }
7066 }
7067 out:
7068 rw_runlock(lock);
7069 return (rv);
7070 }
7071
7072 /*
7073 * pmap_is_modified:
7074 *
7075 * Return whether or not the specified physical page was modified
7076 * in any physical maps.
7077 */
7078 bool
7079 pmap_is_modified(vm_page_t m)
7080 {
7081
7082 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7083 ("pmap_is_modified: page %p is not managed", m));
7084
7085 /*
7086 * If the page is not busied then this check is racy.
7087 */
7088 if (!pmap_page_is_write_mapped(m))
7089 return (false);
7090 return (pmap_page_test_mappings(m, false, true));
7091 }
7092
7093 /*
7094 * pmap_is_prefaultable:
7095 *
7096 * Return whether or not the specified virtual address is eligible
7097 * for prefault.
7098 */
7099 bool
7100 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7101 {
7102 pd_entry_t *pde;
7103 pt_entry_t *pte;
7104 bool rv;
7105 int lvl;
7106
7107 /*
7108 * Return true if and only if the L3 entry for the specified virtual
7109 * address is allocated but invalid.
7110 */
7111 rv = false;
7112 PMAP_LOCK(pmap);
7113 pde = pmap_pde(pmap, addr, &lvl);
7114 if (pde != NULL && lvl == 2) {
7115 pte = pmap_l2_to_l3(pde, addr);
7116 rv = pmap_load(pte) == 0;
7117 }
7118 PMAP_UNLOCK(pmap);
7119 return (rv);
7120 }
7121
7122 /*
7123 * pmap_is_referenced:
7124 *
7125 * Return whether or not the specified physical page was referenced
7126 * in any physical maps.
7127 */
7128 bool
7129 pmap_is_referenced(vm_page_t m)
7130 {
7131
7132 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7133 ("pmap_is_referenced: page %p is not managed", m));
7134 return (pmap_page_test_mappings(m, true, false));
7135 }
7136
7137 /*
7138 * Clear the write and modified bits in each of the given page's mappings.
7139 */
7140 void
7141 pmap_remove_write(vm_page_t m)
7142 {
7143 struct md_page *pvh;
7144 pmap_t pmap;
7145 struct rwlock *lock;
7146 pv_entry_t next_pv, pv;
7147 pt_entry_t oldpte, *pte, set, clear, mask, val;
7148 vm_offset_t va;
7149 int md_gen, pvh_gen;
7150
7151 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7152 ("pmap_remove_write: page %p is not managed", m));
7153 vm_page_assert_busied(m);
7154
7155 if (!pmap_page_is_write_mapped(m))
7156 return;
7157 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7158 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7159 rw_wlock(lock);
7160 retry:
7161 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7162 pmap = PV_PMAP(pv);
7163 PMAP_ASSERT_STAGE1(pmap);
7164 if (!PMAP_TRYLOCK(pmap)) {
7165 pvh_gen = pvh->pv_gen;
7166 rw_wunlock(lock);
7167 PMAP_LOCK(pmap);
7168 rw_wlock(lock);
7169 if (pvh_gen != pvh->pv_gen) {
7170 PMAP_UNLOCK(pmap);
7171 goto retry;
7172 }
7173 }
7174 va = pv->pv_va;
7175 pte = pmap_pte_exists(pmap, va, 2, __func__);
7176 if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7177 (void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7178 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7179 ("inconsistent pv lock %p %p for page %p",
7180 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7181 PMAP_UNLOCK(pmap);
7182 }
7183 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7184 pmap = PV_PMAP(pv);
7185 if (!PMAP_TRYLOCK(pmap)) {
7186 pvh_gen = pvh->pv_gen;
7187 md_gen = m->md.pv_gen;
7188 rw_wunlock(lock);
7189 PMAP_LOCK(pmap);
7190 rw_wlock(lock);
7191 if (pvh_gen != pvh->pv_gen ||
7192 md_gen != m->md.pv_gen) {
7193 PMAP_UNLOCK(pmap);
7194 goto retry;
7195 }
7196 }
7197 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7198 oldpte = pmap_load(pte);
7199 if ((oldpte & ATTR_SW_DBM) != 0) {
7200 if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7201 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7202
7203 /*
7204 * The L3 entry's accessed bit may have
7205 * changed.
7206 */
7207 oldpte = pmap_load(pte);
7208 }
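			/*
			 * Write protect the mapping: for stage 1, set the
			 * access permissions to read-only; for stage 2, clear
			 * the S2AP write bit.  ATTR_SW_DBM is cleared as well,
			 * and "mask"/"val" record how to recognize that the
			 * old entry was writable, i.e., dirty.
			 */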
7209 if (pmap->pm_stage == PM_STAGE1) {
7210 set = ATTR_S1_AP_RW_BIT;
7211 clear = 0;
7212 mask = ATTR_S1_AP_RW_BIT;
7213 val = ATTR_S1_AP(ATTR_S1_AP_RW);
7214 } else {
7215 set = 0;
7216 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7217 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7218 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7219 }
7220 clear |= ATTR_SW_DBM;
7221 while (!atomic_fcmpset_64(pte, &oldpte,
7222 (oldpte | set) & ~clear))
7223 cpu_spinwait();
7224
7225 if ((oldpte & mask) == val)
7226 vm_page_dirty(m);
7227 pmap_invalidate_page(pmap, pv->pv_va, true);
7228 }
7229 PMAP_UNLOCK(pmap);
7230 }
7231 rw_wunlock(lock);
7232 vm_page_aflag_clear(m, PGA_WRITEABLE);
7233 }
7234
7235 /*
7236 * pmap_ts_referenced:
7237 *
7238 * Return a count of reference bits for a page, clearing those bits.
7239 * It is not necessary for every reference bit to be cleared, but it
7240 * is necessary that 0 only be returned when there are truly no
7241 * reference bits set.
7242 *
7243 * As an optimization, update the page's dirty field if a modified bit is
7244 * found while counting reference bits. This opportunistic update can be
7245 * performed at low cost and can eliminate the need for some future calls
7246 * to pmap_is_modified(). However, since this function stops after
7247 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7248 * dirty pages. Those dirty pages will only be detected by a future call
7249 * to pmap_is_modified().
7250 */
7251 int
7252 pmap_ts_referenced(vm_page_t m)
7253 {
7254 struct md_page *pvh;
7255 pv_entry_t pv, pvf;
7256 pmap_t pmap;
7257 struct rwlock *lock;
7258 pt_entry_t *pte, tpte;
7259 vm_offset_t va;
7260 vm_paddr_t pa;
7261 int cleared, md_gen, not_cleared, pvh_gen;
7262 struct spglist free;
7263
7264 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7265 ("pmap_ts_referenced: page %p is not managed", m));
7266 SLIST_INIT(&free);
7267 cleared = 0;
7268 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7269 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7270 rw_wlock(lock);
7271 retry:
7272 not_cleared = 0;
7273 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7274 goto small_mappings;
7275 pv = pvf;
7276 do {
7277 if (pvf == NULL)
7278 pvf = pv;
7279 pmap = PV_PMAP(pv);
7280 if (!PMAP_TRYLOCK(pmap)) {
7281 pvh_gen = pvh->pv_gen;
7282 rw_wunlock(lock);
7283 PMAP_LOCK(pmap);
7284 rw_wlock(lock);
7285 if (pvh_gen != pvh->pv_gen) {
7286 PMAP_UNLOCK(pmap);
7287 goto retry;
7288 }
7289 }
7290 va = pv->pv_va;
7291 pte = pmap_pte_exists(pmap, va, 2, __func__);
7292 tpte = pmap_load(pte);
7293 if (pmap_pte_dirty(pmap, tpte)) {
7294 /*
7295 * Although "tpte" is mapping a 2MB page, because
7296 * this function is called at a 4KB page granularity,
7297 * we only update the 4KB page under test.
7298 */
7299 vm_page_dirty(m);
7300 }
7301 if ((tpte & ATTR_AF) != 0) {
7302 pa = VM_PAGE_TO_PHYS(m);
7303
7304 /*
7305 * Since this reference bit is shared by 512 4KB pages,
7306 * it should not be cleared every time it is tested.
7307 * Apply a simple "hash" function on the physical page
7308 * number, the virtual superpage number, and the pmap
7309 * address to select one 4KB page out of the 512 on
7310 * which testing the reference bit will result in
7311 * clearing that reference bit. This function is
7312 * designed to avoid the selection of the same 4KB page
7313 * for every 2MB page mapping.
7314 *
7315 * On demotion, a mapping that hasn't been referenced
7316 * is simply destroyed. To avoid the possibility of a
7317 * subsequent page fault on a demoted wired mapping,
7318 * always leave its reference bit set. Moreover,
7319 * since the superpage is wired, the current state of
7320 * its reference bit won't affect page replacement.
7321 */
7322 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7323 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7324 (tpte & ATTR_SW_WIRED) == 0) {
7325 pmap_clear_bits(pte, ATTR_AF);
7326 pmap_invalidate_page(pmap, va, true);
7327 cleared++;
7328 } else
7329 not_cleared++;
7330 }
7331 PMAP_UNLOCK(pmap);
7332 /* Rotate the PV list if it has more than one entry. */
7333 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7334 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7335 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7336 pvh->pv_gen++;
7337 }
7338 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7339 goto out;
7340 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7341 small_mappings:
7342 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7343 goto out;
7344 pv = pvf;
7345 do {
7346 if (pvf == NULL)
7347 pvf = pv;
7348 pmap = PV_PMAP(pv);
7349 if (!PMAP_TRYLOCK(pmap)) {
7350 pvh_gen = pvh->pv_gen;
7351 md_gen = m->md.pv_gen;
7352 rw_wunlock(lock);
7353 PMAP_LOCK(pmap);
7354 rw_wlock(lock);
7355 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7356 PMAP_UNLOCK(pmap);
7357 goto retry;
7358 }
7359 }
7360 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7361 tpte = pmap_load(pte);
7362 if (pmap_pte_dirty(pmap, tpte))
7363 vm_page_dirty(m);
7364 if ((tpte & ATTR_AF) != 0) {
7365 if ((tpte & ATTR_SW_WIRED) == 0) {
7366 /*
7367 * Clear the accessed bit in this L3 entry
7368 * regardless of the contiguous bit.
7369 */
7370 pmap_clear_bits(pte, ATTR_AF);
7371 pmap_invalidate_page(pmap, pv->pv_va, true);
7372 cleared++;
7373 } else
7374 not_cleared++;
7375 } else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7376 (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7377 /*
7378 * An L3C superpage mapping is regarded as accessed
7379 * until the accessed bit has been cleared in all
7380 * of its constituent entries.
7381 */
7382 not_cleared++;
7383 }
7384 PMAP_UNLOCK(pmap);
7385 /* Rotate the PV list if it has more than one entry. */
7386 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7387 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7388 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7389 m->md.pv_gen++;
7390 }
7391 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7392 not_cleared < PMAP_TS_REFERENCED_MAX);
7393 out:
7394 rw_wunlock(lock);
7395 vm_page_free_pages_toq(&free, true);
7396 return (cleared + not_cleared);
7397 }
7398
7399 /*
7400 * Apply the given advice to the specified range of addresses within the
7401 * given pmap. Depending on the advice, clear the referenced and/or
7402 * modified flags in each mapping and set the mapped page's dirty field.
7403 */
7404 void
7405 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7406 {
7407 struct rwlock *lock;
7408 vm_offset_t va, va_next, dva;
7409 vm_page_t m;
7410 pd_entry_t *l0, *l1, *l2, oldl2;
7411 pt_entry_t *l3, *dl3, oldl3;
7412
7413 PMAP_ASSERT_STAGE1(pmap);
7414
7415 if (advice != MADV_DONTNEED && advice != MADV_FREE)
7416 return;
7417
7418 PMAP_LOCK(pmap);
7419 for (; sva < eva; sva = va_next) {
7420 l0 = pmap_l0(pmap, sva);
7421 if (pmap_load(l0) == 0) {
7422 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7423 if (va_next < sva)
7424 va_next = eva;
7425 continue;
7426 }
7427
7428 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7429 if (va_next < sva)
7430 va_next = eva;
7431 l1 = pmap_l0_to_l1(l0, sva);
7432 if (pmap_load(l1) == 0)
7433 continue;
7434 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7435 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7436 continue;
7437 }
7438
7439 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7440 if (va_next < sva)
7441 va_next = eva;
7442 l2 = pmap_l1_to_l2(l1, sva);
7443 oldl2 = pmap_load(l2);
7444 if (oldl2 == 0)
7445 continue;
7446 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7447 if ((oldl2 & ATTR_SW_MANAGED) == 0)
7448 continue;
7449 lock = NULL;
7450 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7451 if (lock != NULL)
7452 rw_wunlock(lock);
7453
7454 /*
7455 * The 2MB page mapping was destroyed.
7456 */
7457 continue;
7458 }
7459
7460 /*
7461 * Unless the page mappings are wired, remove the
7462 * mapping to a single page so that a subsequent
7463 * access may repromote. Choosing the last page
7464 * within the address range [sva, min(va_next, eva))
7465 * generally results in more repromotions. Since the
7466 * underlying page table page is fully populated, this
7467 * removal never frees a page table page.
7468 */
7469 if ((oldl2 & ATTR_SW_WIRED) == 0) {
7470 va = eva;
7471 if (va > va_next)
7472 va = va_next;
7473 va -= PAGE_SIZE;
7474 KASSERT(va >= sva,
7475 ("pmap_advise: no address gap"));
7476 l3 = pmap_l2_to_l3(l2, va);
7477 KASSERT(pmap_load(l3) != 0,
7478 ("pmap_advise: invalid PTE"));
7479 pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7480 NULL, &lock);
7481 }
7482 if (lock != NULL)
7483 rw_wunlock(lock);
7484 }
7485 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7486 ("pmap_advise: invalid L2 entry after demotion"));
7487 if (va_next > eva)
7488 va_next = eva;
7489 va = va_next;
7490 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7491 sva += L3_SIZE) {
7492 oldl3 = pmap_load(l3);
7493 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7494 (ATTR_SW_MANAGED | L3_PAGE))
7495 goto maybe_invlrng;
7496 else if (pmap_pte_dirty(pmap, oldl3)) {
7497 if (advice == MADV_DONTNEED) {
7498 /*
7499 * Future calls to pmap_is_modified()
7500 * can be avoided by making the page
7501 * dirty now.
7502 */
7503 m = PTE_TO_VM_PAGE(oldl3);
7504 vm_page_dirty(m);
7505 }
7506 if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7507 /*
7508 * Unconditionally demote the L3C
7509 * superpage because we do not allow
7510 * writeable, clean superpages.
7511 */
7512 (void)pmap_demote_l3c(pmap, l3, sva);
7513
7514 /*
7515 * Destroy the final mapping before the
7516 * next L3C boundary or va_next,
7517 * whichever comes first, so that a
7518 * subsequent access may act as a
7519 * repromotion trigger.
7520 */
7521 if ((oldl3 & ATTR_SW_WIRED) == 0) {
7522 dva = MIN((sva & ~L3C_OFFSET) +
7523 L3C_SIZE - PAGE_SIZE,
7524 va_next - PAGE_SIZE);
7525 dl3 = pmap_l2_to_l3(l2, dva);
7526 KASSERT(pmap_load(dl3) != 0,
7527 ("pmap_advise: invalid PTE"));
7528 lock = NULL;
7529 pmap_remove_l3(pmap, dl3, dva,
7530 pmap_load(l2), NULL, &lock);
7531 if (lock != NULL)
7532 rw_wunlock(lock);
7533 }
7534
7535 /*
7536 * The L3 entry's accessed bit may have
7537 * changed.
7538 */
7539 oldl3 = pmap_load(l3);
7540 }
7541
7542 /*
7543 * Check that we did not just destroy this entry so
7544 				 * we avoid corrupting the page table.
7545 */
7546 if (oldl3 != 0) {
7547 while (!atomic_fcmpset_long(l3, &oldl3,
7548 (oldl3 & ~ATTR_AF) |
7549 ATTR_S1_AP(ATTR_S1_AP_RO)))
7550 cpu_spinwait();
7551 }
7552 } else if ((oldl3 & ATTR_AF) != 0) {
7553 /*
7554 * Clear the accessed bit in this L3 entry
7555 * regardless of the contiguous bit.
7556 */
7557 pmap_clear_bits(l3, ATTR_AF);
7558 } else
7559 goto maybe_invlrng;
7560 if (va == va_next)
7561 va = sva;
7562 continue;
7563 maybe_invlrng:
7564 if (va != va_next) {
7565 pmap_s1_invalidate_range(pmap, va, sva, true);
7566 va = va_next;
7567 }
7568 }
7569 if (va != va_next)
7570 pmap_s1_invalidate_range(pmap, va, sva, true);
7571 }
7572 PMAP_UNLOCK(pmap);
7573 }
7574
7575 /*
7576 * Clear the modify bits on the specified physical page.
7577 */
7578 void
7579 pmap_clear_modify(vm_page_t m)
7580 {
7581 struct md_page *pvh;
7582 struct rwlock *lock;
7583 pmap_t pmap;
7584 pv_entry_t next_pv, pv;
7585 pd_entry_t *l2, oldl2;
7586 pt_entry_t *l3, oldl3;
7587 vm_offset_t va;
7588 int md_gen, pvh_gen;
7589
7590 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7591 ("pmap_clear_modify: page %p is not managed", m));
7592 vm_page_assert_busied(m);
7593
7594 if (!pmap_page_is_write_mapped(m))
7595 return;
7596 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7597 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7598 rw_wlock(lock);
7599 restart:
7600 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7601 pmap = PV_PMAP(pv);
7602 PMAP_ASSERT_STAGE1(pmap);
7603 if (!PMAP_TRYLOCK(pmap)) {
7604 pvh_gen = pvh->pv_gen;
7605 rw_wunlock(lock);
7606 PMAP_LOCK(pmap);
7607 rw_wlock(lock);
7608 if (pvh_gen != pvh->pv_gen) {
7609 PMAP_UNLOCK(pmap);
7610 goto restart;
7611 }
7612 }
7613 va = pv->pv_va;
7614 l2 = pmap_l2(pmap, va);
7615 oldl2 = pmap_load(l2);
7616 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7617 if ((oldl2 & ATTR_SW_DBM) != 0 &&
7618 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7619 (oldl2 & ATTR_SW_WIRED) == 0) {
7620 /*
7621 * Write protect the mapping to a single page so that
7622 * a subsequent write access may repromote.
7623 */
7624 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7625 l3 = pmap_l2_to_l3(l2, va);
7626 oldl3 = pmap_load(l3);
7627 while (!atomic_fcmpset_long(l3, &oldl3,
7628 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7629 cpu_spinwait();
7630 vm_page_dirty(m);
7631 pmap_s1_invalidate_page(pmap, va, true);
7632 }
7633 PMAP_UNLOCK(pmap);
7634 }
7635 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7636 pmap = PV_PMAP(pv);
7637 PMAP_ASSERT_STAGE1(pmap);
7638 if (!PMAP_TRYLOCK(pmap)) {
7639 md_gen = m->md.pv_gen;
7640 pvh_gen = pvh->pv_gen;
7641 rw_wunlock(lock);
7642 PMAP_LOCK(pmap);
7643 rw_wlock(lock);
7644 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7645 PMAP_UNLOCK(pmap);
7646 goto restart;
7647 }
7648 }
7649 l2 = pmap_l2(pmap, pv->pv_va);
7650 l3 = pmap_l2_to_l3(l2, pv->pv_va);
7651 oldl3 = pmap_load(l3);
7652 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7653 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7654 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7655 ("writeable L3C superpage not dirty"));
7656 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7657 if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7658 (void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7659 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7660 pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7661 }
7662 PMAP_UNLOCK(pmap);
7663 }
7664 rw_wunlock(lock);
7665 }
7666
7667 void *
7668 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7669 {
7670 struct pmap_preinit_mapping *ppim;
7671 vm_offset_t va, offset;
7672 pd_entry_t old_l2e, *pde;
7673 pt_entry_t *l2;
7674 int i, lvl, l2_blocks, free_l2_count, start_idx;
7675
7676 if (!vm_initialized) {
7677 /*
7678 * No L3 ptables so map entire L2 blocks where start VA is:
7679 * preinit_map_va + start_idx * L2_SIZE
7680 * There may be duplicate mappings (multiple VA -> same PA) but
7681 * ARM64 dcache is always PIPT so that's acceptable.
7682 */
7683 if (size == 0)
7684 return (NULL);
7685
7686 /* Calculate how many L2 blocks are needed for the mapping */
7687 l2_blocks = (roundup2(pa + size, L2_SIZE) -
7688 rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
7689
7690 offset = pa & L2_OFFSET;
7691
7692 if (preinit_map_va == 0)
7693 return (NULL);
7694
7695 /* Map 2MiB L2 blocks from reserved VA space */
7696
7697 free_l2_count = 0;
7698 start_idx = -1;
7699 /* Find enough free contiguous VA space */
7700 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7701 ppim = pmap_preinit_mapping + i;
7702 if (free_l2_count > 0 && ppim->pa != 0) {
7703 /* Not enough space here */
7704 free_l2_count = 0;
7705 start_idx = -1;
7706 continue;
7707 }
7708
7709 if (ppim->pa == 0) {
7710 /* Free L2 block */
7711 if (start_idx == -1)
7712 start_idx = i;
7713 free_l2_count++;
7714 if (free_l2_count == l2_blocks)
7715 break;
7716 }
7717 }
7718 if (free_l2_count != l2_blocks)
7719 panic("%s: too many preinit mappings", __func__);
7720
7721 va = preinit_map_va + (start_idx * L2_SIZE);
7722 for (i = start_idx; i < start_idx + l2_blocks; i++) {
7723 /* Mark entries as allocated */
7724 ppim = pmap_preinit_mapping + i;
7725 ppim->pa = pa;
7726 ppim->va = va + offset;
7727 ppim->size = size;
7728 }
7729
7730 /* Map L2 blocks */
7731 pa = rounddown2(pa, L2_SIZE);
7732 old_l2e = 0;
7733 for (i = 0; i < l2_blocks; i++) {
7734 pde = pmap_pde(kernel_pmap, va, &lvl);
7735 KASSERT(pde != NULL,
7736 ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7737 va));
7738 KASSERT(lvl == 1,
7739 ("pmap_mapbios: Invalid level %d", lvl));
7740
7741 /* Insert L2_BLOCK */
7742 l2 = pmap_l1_to_l2(pde, va);
7743 old_l2e |= pmap_load_store(l2,
7744 PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
7745 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
7746 L2_BLOCK);
7747
7748 va += L2_SIZE;
7749 pa += L2_SIZE;
7750 }
7751 if ((old_l2e & ATTR_DESCR_VALID) != 0)
7752 pmap_s1_invalidate_all(kernel_pmap);
7753 else {
7754 /*
7755 * Because the old entries were invalid and the new
7756 * mappings are not executable, an isb is not required.
7757 */
7758 dsb(ishst);
7759 }
7760
7761 va = preinit_map_va + (start_idx * L2_SIZE);
7762
7763 } else {
7764 /* kva_alloc may be used to map the pages */
7765 offset = pa & PAGE_MASK;
7766 size = round_page(offset + size);
7767
7768 va = kva_alloc(size);
7769 if (va == 0)
7770 panic("%s: Couldn't allocate KVA", __func__);
7771
7772 pde = pmap_pde(kernel_pmap, va, &lvl);
7773 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7774
7775 /* L3 table is linked */
7776 va = trunc_page(va);
7777 pa = trunc_page(pa);
7778 pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7779 }
7780
7781 return ((void *)(va + offset));
7782 }
7783
7784 void
7785 pmap_unmapbios(void *p, vm_size_t size)
7786 {
7787 struct pmap_preinit_mapping *ppim;
7788 vm_offset_t offset, va, va_trunc;
7789 pd_entry_t *pde;
7790 pt_entry_t *l2;
7791 int i, lvl, l2_blocks, block;
7792 bool preinit_map;
7793
7794 va = (vm_offset_t)p;
7795 l2_blocks =
7796 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7797 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7798
7799 /* Remove preinit mapping */
7800 preinit_map = false;
7801 block = 0;
7802 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7803 ppim = pmap_preinit_mapping + i;
7804 if (ppim->va == va) {
7805 KASSERT(ppim->size == size,
7806 ("pmap_unmapbios: size mismatch"));
7807 ppim->va = 0;
7808 ppim->pa = 0;
7809 ppim->size = 0;
7810 preinit_map = true;
7811 offset = block * L2_SIZE;
7812 va_trunc = rounddown2(va, L2_SIZE) + offset;
7813
7814 /* Remove L2_BLOCK */
7815 pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7816 KASSERT(pde != NULL,
7817 ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7818 va_trunc));
7819 l2 = pmap_l1_to_l2(pde, va_trunc);
7820 pmap_clear(l2);
7821
7822 if (block == (l2_blocks - 1))
7823 break;
7824 block++;
7825 }
7826 }
7827 if (preinit_map) {
7828 pmap_s1_invalidate_all(kernel_pmap);
7829 return;
7830 }
7831
7832 /* Unmap the pages reserved with kva_alloc. */
7833 if (vm_initialized) {
7834 offset = va & PAGE_MASK;
7835 size = round_page(offset + size);
7836 va = trunc_page(va);
7837
7838 /* Unmap and invalidate the pages */
7839 pmap_kremove_device(va, size);
7840
7841 kva_free(va, size);
7842 }
7843 }
7844
7845 /*
7846 * Sets the memory attribute for the specified page.
7847 */
7848 void
7849 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7850 {
7851
7852 m->md.pv_memattr = ma;
7853
7854 /*
7855 * If "m" is a normal page, update its direct mapping. This update
7856 * can be relied upon to perform any cache operations that are
7857 * required for data coherence.
7858 */
7859 if ((m->flags & PG_FICTITIOUS) == 0 &&
7860 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7861 m->md.pv_memattr) != 0)
7862 panic("memory attribute change on the direct map failed");
7863 }
7864
7865 /*
7866 * Changes the specified virtual address range's memory type to that given by
7867 * the parameter "mode". The specified virtual address range must be
7868 * completely contained within either the direct map or the kernel map. If
7869 * the virtual address range is contained within the kernel map, then the
7870 * memory type for each of the corresponding ranges of the direct map is also
7871 * changed. (The corresponding ranges of the direct map are those ranges that
7872 * map the same physical pages as the specified virtual address range.) These
7873 * changes to the direct map are necessary because Intel describes the
7874 * behavior of their processors as "undefined" if two or more mappings to the
7875 * same physical page have different memory types.
7876 *
7877 * Returns zero if the change completed successfully, and either EINVAL or
7878 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
7879 * of the virtual address range was not mapped, and ENOMEM is returned if
7880 * there was insufficient memory available to complete the change. In the
7881 * latter case, the memory type may have been changed on some part of the
7882 * virtual address range or the direct map.
7883 */
7884 int
7885 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7886 {
7887 int error;
7888
7889 PMAP_LOCK(kernel_pmap);
7890 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7891 PMAP_UNLOCK(kernel_pmap);
7892 return (error);
7893 }
7894
7895 /*
7896 * Changes the specified virtual address range's protections to those
7897 * specified by "prot". Like pmap_change_attr(), protections for aliases
7898 * in the direct map are updated as well. Protections on aliasing mappings may
7899 * be a subset of the requested protections; for example, mappings in the direct
7900 * map are never executable.
7901 */
7902 int
7903 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7904 {
7905 int error;
7906
7907 /* Only supported within the kernel map. */
7908 if (va < VM_MIN_KERNEL_ADDRESS)
7909 return (EINVAL);
7910
7911 PMAP_LOCK(kernel_pmap);
7912 error = pmap_change_props_locked(va, size, prot, -1, false);
7913 PMAP_UNLOCK(kernel_pmap);
7914 return (error);
7915 }
7916
7917 static int
7918 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
7919 int mode, bool skip_unmapped)
7920 {
7921 vm_offset_t base, offset, tmpva;
7922 vm_size_t pte_size;
7923 vm_paddr_t pa;
7924 pt_entry_t pte, *ptep, *newpte;
7925 pt_entry_t bits, mask;
7926 int lvl, rv;
7927
7928 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7929 base = trunc_page(va);
7930 offset = va & PAGE_MASK;
7931 size = round_page(offset + size);
7932
7933 if (!VIRT_IN_DMAP(base) &&
7934 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
7935 return (EINVAL);
7936
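	/*
	 * "bits" collects the new attribute values and "mask" selects which
	 * attribute fields in each PTE will be replaced below.
	 */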
7937 bits = 0;
7938 mask = 0;
7939 if (mode != -1) {
7940 bits = ATTR_S1_IDX(mode);
7941 mask = ATTR_S1_IDX_MASK;
7942 if (mode == VM_MEMATTR_DEVICE) {
7943 mask |= ATTR_S1_XN;
7944 bits |= ATTR_S1_XN;
7945 }
7946 }
7947 if (prot != VM_PROT_NONE) {
7948 /* Don't mark the DMAP as executable. It never is on arm64. */
7949 if (VIRT_IN_DMAP(base)) {
7950 prot &= ~VM_PROT_EXECUTE;
7951 /*
7952 * XXX Mark the DMAP as writable for now. We rely
7953 * on this in ddb & dtrace to insert breakpoint
7954 * instructions.
7955 */
7956 prot |= VM_PROT_WRITE;
7957 }
7958
7959 if ((prot & VM_PROT_WRITE) == 0) {
7960 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
7961 }
7962 if ((prot & VM_PROT_EXECUTE) == 0) {
7963 bits |= ATTR_S1_PXN;
7964 }
7965 bits |= ATTR_S1_UXN;
7966 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
7967 }
7968
7969 for (tmpva = base; tmpva < base + size; ) {
7970 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
7971 if (ptep == NULL && !skip_unmapped) {
7972 return (EINVAL);
7973 } else if ((ptep == NULL && skip_unmapped) ||
7974 (pmap_load(ptep) & mask) == bits) {
7975 /*
7976 * We already have the correct attribute or there
7977 * is no memory mapped at this address and we are
7978 * skipping unmapped memory.
7979 */
7980 switch (lvl) {
7981 default:
7982 panic("Invalid DMAP table level: %d\n", lvl);
7983 case 1:
7984 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
7985 break;
7986 case 2:
7987 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
7988 break;
7989 case 3:
7990 tmpva += PAGE_SIZE;
7991 break;
7992 }
7993 } else {
7994 /* We can't demote/promote this entry */
7995 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
7996
7997 /*
7998 * Find the entry and demote it if the requested change
7999 * only applies to part of the address range mapped by
8000 * the entry.
8001 */
8002 switch (lvl) {
8003 default:
8004 panic("Invalid DMAP table level: %d\n", lvl);
8005 case 1:
8006 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8007 if ((tmpva & L1_OFFSET) == 0 &&
8008 (base + size - tmpva) >= L1_SIZE) {
8009 pte_size = L1_SIZE;
8010 break;
8011 }
8012 newpte = pmap_demote_l1(kernel_pmap, ptep,
8013 tmpva & ~L1_OFFSET);
8014 if (newpte == NULL)
8015 return (EINVAL);
8016 ptep = pmap_l1_to_l2(ptep, tmpva);
8017 /* FALLTHROUGH */
8018 case 2:
8019 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8020 if ((tmpva & L2C_OFFSET) == 0 &&
8021 (base + size - tmpva) >= L2C_SIZE) {
8022 pte_size = L2C_SIZE;
8023 break;
8024 }
8025 if (!pmap_demote_l2c(kernel_pmap, ptep,
8026 tmpva))
8027 return (EINVAL);
8028 }
8029 if ((tmpva & L2_OFFSET) == 0 &&
8030 (base + size - tmpva) >= L2_SIZE) {
8031 pte_size = L2_SIZE;
8032 break;
8033 }
8034 newpte = pmap_demote_l2(kernel_pmap, ptep,
8035 tmpva);
8036 if (newpte == NULL)
8037 return (EINVAL);
8038 ptep = pmap_l2_to_l3(ptep, tmpva);
8039 /* FALLTHROUGH */
8040 case 3:
8041 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8042 if ((tmpva & L3C_OFFSET) == 0 &&
8043 (base + size - tmpva) >= L3C_SIZE) {
8044 pte_size = L3C_SIZE;
8045 break;
8046 }
8047 if (!pmap_demote_l3c(kernel_pmap, ptep,
8048 tmpva))
8049 return (EINVAL);
8050 }
8051 pte_size = PAGE_SIZE;
8052 break;
8053 }
8054
8055 /* Update the entry */
8056 pte = pmap_load(ptep);
8057 pte &= ~mask;
8058 pte |= bits;
8059
8060 switch (pte_size) {
8061 case L2C_SIZE:
8062 pmap_update_strided(kernel_pmap, ptep, ptep +
8063 L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
8064 break;
8065 case L3C_SIZE:
8066 pmap_update_strided(kernel_pmap, ptep, ptep +
8067 L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
8068 break;
8069 default:
8070 /*
8071 * We are updating a single block or page entry,
8072 * so regardless of pte_size pass PAGE_SIZE in
8073 * order that a single TLB invalidation is
8074 * performed.
8075 */
8076 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
8077 PAGE_SIZE);
8078 break;
8079 }
8080
8081 pa = PTE_TO_PHYS(pte);
8082 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
8083 /*
8084 * Keep the DMAP memory in sync.
8085 */
8086 rv = pmap_change_props_locked(
8087 PHYS_TO_DMAP(pa), pte_size,
8088 prot, mode, true);
8089 if (rv != 0)
8090 return (rv);
8091 }
8092
8093 /*
8094 * If moving to a non-cacheable entry flush
8095 * the cache.
8096 */
8097 if (mode == VM_MEMATTR_UNCACHEABLE)
8098 cpu_dcache_wbinv_range((void *)tmpva, pte_size);
8099 tmpva += pte_size;
8100 }
8101 }
8102
8103 return (0);
8104 }
8105
8106 /*
8107 * Create an L2 table to map all addresses within an L1 mapping.
8108 */
8109 static pt_entry_t *
8110 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8111 {
8112 pt_entry_t *l2, newl2, oldl1;
8113 vm_offset_t tmpl1;
8114 vm_paddr_t l2phys, phys;
8115 vm_page_t ml2;
8116 int i;
8117
8118 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8119 oldl1 = pmap_load(l1);
8120 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8121 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8122 ("pmap_demote_l1: Demoting a non-block entry"));
8123 KASSERT((va & L1_OFFSET) == 0,
8124 ("pmap_demote_l1: Invalid virtual address %#lx", va));
8125 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8126 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8127 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8128 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8129
8130 tmpl1 = 0;
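	/*
	 * If the L1 block being demoted maps the page that contains "l1"
	 * itself, set up a temporary kernel mapping below so that the entry
	 * remains accessible while it is being replaced.
	 */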
8131 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8132 tmpl1 = kva_alloc(PAGE_SIZE);
8133 if (tmpl1 == 0)
8134 return (NULL);
8135 }
8136
8137 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8138 NULL) {
8139 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8140 " in pmap %p", va, pmap);
8141 l2 = NULL;
8142 goto fail;
8143 }
8144
8145 l2phys = VM_PAGE_TO_PHYS(ml2);
8146 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
8147
8148 /* Address the range points at */
8149 phys = PTE_TO_PHYS(oldl1);
8150 	/* The attributes from the old L1 entry to be copied */
8151 newl2 = oldl1 & ATTR_MASK;
8152
8153 /* Create the new entries */
8154 newl2 |= ATTR_CONTIGUOUS;
8155 for (i = 0; i < Ln_ENTRIES; i++) {
8156 l2[i] = newl2 | phys;
8157 phys += L2_SIZE;
8158 }
8159 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8160 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8161 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8162
8163 if (tmpl1 != 0) {
8164 pmap_kenter(tmpl1, PAGE_SIZE,
8165 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
8166 VM_MEMATTR_WRITE_BACK);
8167 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8168 }
8169
8170 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8171
8172 counter_u64_add(pmap_l1_demotions, 1);
8173 fail:
8174 if (tmpl1 != 0) {
8175 pmap_kremove(tmpl1);
8176 kva_free(tmpl1, PAGE_SIZE);
8177 }
8178
8179 return (l2);
8180 }
8181
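/*
 * Fill an L3 page table with consecutive page mappings, starting from "newl3"
 * and advancing the mapped physical address by L3_SIZE for each entry.
 */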
8182 static void
8183 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8184 {
8185 pt_entry_t *l3;
8186
8187 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8188 *l3 = newl3;
8189 newl3 += L3_SIZE;
8190 }
8191 }
8192
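/*
 * Sanity check a newly filled L3 page table: under INVARIANTS, verify that
 * its entries map the physical addresses implied by "newl3e" (every entry
 * when DIAGNOSTIC is defined, only the first otherwise).
 */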
8193 static void
8194 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8195 {
8196 #ifdef INVARIANTS
8197 #ifdef DIAGNOSTIC
8198 pt_entry_t *xl3p, *yl3p;
8199
8200 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8201 xl3p++, newl3e += PAGE_SIZE) {
8202 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8203 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8204 "different pages: found %#lx, expected %#lx\n",
8205 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8206 printf("page table dump\n");
8207 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8208 yl3p++) {
8209 printf("%zd %#lx\n", yl3p - firstl3p,
8210 pmap_load(yl3p));
8211 }
8212 panic("firstpte");
8213 }
8214 }
8215 #else
8216 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8217 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8218 " addresses"));
8219 #endif
8220 #endif
8221 }
8222
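/*
 * Abort an L2 demotion by removing the 2MB mapping entirely and freeing any
 * pages that are released as a result.
 */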
8223 static void
8224 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8225 struct rwlock **lockp)
8226 {
8227 struct spglist free;
8228
8229 SLIST_INIT(&free);
8230 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8231 lockp);
8232 vm_page_free_pages_toq(&free, true);
8233 }
8234
8235 /*
8236 * Create an L3 table to map all addresses within an L2 mapping.
8237 */
8238 static pt_entry_t *
8239 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8240 struct rwlock **lockp)
8241 {
8242 pt_entry_t *l3, newl3, oldl2;
8243 vm_offset_t tmpl2;
8244 vm_paddr_t l3phys;
8245 vm_page_t ml3;
8246
8247 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8248 PMAP_ASSERT_STAGE1(pmap);
8249 KASSERT(ADDR_IS_CANONICAL(va),
8250 ("%s: Address not in canonical form: %lx", __func__, va));
8251
8252 l3 = NULL;
8253 oldl2 = pmap_load(l2);
8254 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8255 ("pmap_demote_l2: Demoting a non-block entry"));
8256 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8257 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8258 va &= ~L2_OFFSET;
8259
8260 tmpl2 = 0;
8261 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8262 tmpl2 = kva_alloc(PAGE_SIZE);
8263 if (tmpl2 == 0)
8264 return (NULL);
8265 }
8266
8267 /*
8268 * Invalidate the 2MB page mapping and return "failure" if the
8269 * mapping was never accessed.
8270 */
8271 if ((oldl2 & ATTR_AF) == 0) {
8272 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8273 ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8274 pmap_demote_l2_abort(pmap, va, l2, lockp);
8275 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8276 va, pmap);
8277 goto fail;
8278 }
8279
8280 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8281 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8282 ("pmap_demote_l2: page table page for a wired mapping"
8283 " is missing"));
8284
8285 /*
8286 * If the page table page is missing and the mapping
8287 * is for a kernel address, the mapping must belong to
8288 * either the direct map or the early kernel memory.
8289 * Page table pages are preallocated for every other
8290 * part of the kernel address space, so the direct map
8291 * region and early kernel memory are the only parts of the
8292 * kernel address space that must be handled here.
8293 */
8294 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8295 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8296 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8297
8298 /*
8299 * If the 2MB page mapping belongs to the direct map
8300 * region of the kernel's address space, then the page
8301 * allocation request specifies the highest possible
8302 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8303 * priority is normal.
8304 */
8305 ml3 = vm_page_alloc_noobj(
8306 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8307 VM_ALLOC_WIRED);
8308
8309 /*
8310 * If the allocation of the new page table page fails,
8311 * invalidate the 2MB page mapping and return "failure".
8312 */
8313 if (ml3 == NULL) {
8314 pmap_demote_l2_abort(pmap, va, l2, lockp);
8315 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8316 " in pmap %p", va, pmap);
8317 goto fail;
8318 }
8319 ml3->pindex = pmap_l2_pindex(va);
8320
8321 if (!ADDR_IS_KERNEL(va)) {
8322 ml3->ref_count = NL3PG;
8323 pmap_resident_count_inc(pmap, 1);
8324 }
8325 }
8326 l3phys = VM_PAGE_TO_PHYS(ml3);
8327 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8328 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8329 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8330 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8331 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8332
8333 /*
8334 * If the PTP is not leftover from an earlier promotion or it does not
8335 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8336 * have ATTR_AF set.
8337 *
8338 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8339 * performs a dsb(). That dsb() ensures that the stores for filling
8340 * "l3" are visible before "l3" is added to the page table.
8341 */
8342 if (!vm_page_all_valid(ml3))
8343 pmap_fill_l3(l3, newl3);
8344
8345 pmap_demote_l2_check(l3, newl3);
8346
8347 /*
8348 * If the mapping has changed attributes, update the L3Es.
8349 */
8350 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8351 pmap_fill_l3(l3, newl3);
8352
8353 /*
8354 * Map the temporary page so we don't lose access to the l2 table.
8355 */
8356 if (tmpl2 != 0) {
8357 pmap_kenter(tmpl2, PAGE_SIZE,
8358 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8359 VM_MEMATTR_WRITE_BACK);
8360 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8361 }
8362
8363 /*
8364 * The spare PV entries must be reserved prior to demoting the
8365 * mapping, that is, prior to changing the PDE. Otherwise, the state
8366 * of the L2 and the PV lists will be inconsistent, which can result
8367 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8368 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8369 * PV entry for the 2MB page mapping that is being demoted.
8370 */
8371 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8372 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8373
8374 /*
8375 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8376 * the 2MB page mapping.
8377 */
8378 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8379
8380 /*
8381 * Demote the PV entry.
8382 */
8383 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8384 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8385
8386 atomic_add_long(&pmap_l2_demotions, 1);
8387 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8388 " in pmap %p %lx", va, pmap, l3[0]);
8389
8390 fail:
8391 if (tmpl2 != 0) {
8392 pmap_kremove(tmpl2);
8393 kva_free(tmpl2, PAGE_SIZE);
8394 }
8395
8396 return (l3);
8397
8398 }
8399
8400 static pt_entry_t *
8401 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8402 {
8403 struct rwlock *lock;
8404 pt_entry_t *l3;
8405
8406 lock = NULL;
8407 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8408 if (lock != NULL)
8409 rw_wunlock(lock);
8410 return (l3);
8411 }
8412
8413 /*
8414 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8415 */
8416 static bool
8417 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8418 {
8419 pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8420 vm_offset_t tmpl3;
8421 register_t intr;
8422
8423 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8424 PMAP_ASSERT_STAGE1(pmap);
8425 l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8426 sizeof(pd_entry_t)) - 1));
8427 l2c_end = l2c_start + L2C_ENTRIES;
8428 tmpl3 = 0;
8429 if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8430 (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8431 tmpl3 = kva_alloc(PAGE_SIZE);
8432 if (tmpl3 == 0)
8433 return (false);
8434 pmap_kenter(tmpl3, PAGE_SIZE,
8435 DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
8436 VM_MEMATTR_WRITE_BACK);
8437 l2c_start = (pd_entry_t *)(tmpl3 +
8438 ((vm_offset_t)l2c_start & PAGE_MASK));
8439 l2c_end = (pd_entry_t *)(tmpl3 +
8440 ((vm_offset_t)l2c_end & PAGE_MASK));
8441 }
8442 mask = 0;
8443 nbits = ATTR_DESCR_VALID;
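	/*
	 * "mask" and "nbits" accumulate state across the whole set of
	 * contiguous entries: if any entry is dirty, the read-only bit is
	 * cleared from all of them when they are remade, and the accessed
	 * bit is propagated along with ATTR_DESCR_VALID.
	 */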
8444 intr = intr_disable();
8445
8446 /*
8447 * Break the mappings.
8448 */
8449 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8450 /*
8451 * Clear the mapping's contiguous and valid bits, but leave
8452 * the rest of the entry unchanged, so that a lockless,
8453 * concurrent pmap_kextract() can still lookup the physical
8454 * address.
8455 */
8456 l2e = pmap_load(tl2p);
8457 KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8458 ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8459 KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8460 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8461 ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8462 while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8463 ATTR_DESCR_VALID)))
8464 cpu_spinwait();
8465
8466 /*
8467 * Hardware accessed and dirty bit maintenance might only
8468 * update a single L2 entry, so we must combine the accessed
8469 * and dirty bits from this entire set of contiguous L2
8470 * entries.
8471 */
8472 if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8473 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8474 mask = ATTR_S1_AP_RW_BIT;
8475 nbits |= l2e & ATTR_AF;
8476 }
8477 if ((nbits & ATTR_AF) != 0) {
8478 pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8479 L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8480 }
8481
8482 /*
8483 * Remake the mappings, updating the accessed and dirty bits.
8484 */
8485 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8486 l2e = pmap_load(tl2p);
8487 while (!atomic_fcmpset_64(tl2p, &l2e, (l2e & ~mask) | nbits))
8488 cpu_spinwait();
8489 }
8490 dsb(ishst);
8491
8492 intr_restore(intr);
8493 if (tmpl3 != 0) {
8494 pmap_kremove(tmpl3);
8495 kva_free(tmpl3, PAGE_SIZE);
8496 }
8497 counter_u64_add(pmap_l2c_demotions, 1);
8498 CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8499 va, pmap);
8500 return (true);
8501 }
8502
8503 /*
8504 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8505 */
8506 static bool
8507 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8508 {
8509 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8510 vm_offset_t tmpl3;
8511 register_t intr;
8512
8513 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8514 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8515 sizeof(pt_entry_t)) - 1));
8516 l3c_end = l3c_start + L3C_ENTRIES;
8517 tmpl3 = 0;
8518 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8519 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8520 tmpl3 = kva_alloc(PAGE_SIZE);
8521 if (tmpl3 == 0)
8522 return (false);
8523 pmap_kenter(tmpl3, PAGE_SIZE,
8524 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8525 VM_MEMATTR_WRITE_BACK);
8526 l3c_start = (pt_entry_t *)(tmpl3 +
8527 ((vm_offset_t)l3c_start & PAGE_MASK));
8528 l3c_end = (pt_entry_t *)(tmpl3 +
8529 ((vm_offset_t)l3c_end & PAGE_MASK));
8530 }
8531 mask = 0;
8532 nbits = ATTR_DESCR_VALID;
8533 intr = intr_disable();
8534
8535 /*
8536 * Break the mappings.
8537 */
8538 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8539 /*
8540 * Clear the mapping's contiguous and valid bits, but leave
8541 * the rest of the entry unchanged, so that a lockless,
8542 * concurrent pmap_kextract() can still lookup the physical
8543 * address.
8544 */
8545 l3e = pmap_load(tl3p);
8546 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8547 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8548 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8549 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8550 ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8551 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8552 ATTR_DESCR_VALID)))
8553 cpu_spinwait();
8554
8555 /*
8556 * Hardware accessed and dirty bit maintenance might only
8557 * update a single L3 entry, so we must combine the accessed
8558 * and dirty bits from this entire set of contiguous L3
8559 * entries.
8560 */
8561 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8562 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8563 mask = ATTR_S1_AP_RW_BIT;
8564 nbits |= l3e & ATTR_AF;
8565 }
8566 if ((nbits & ATTR_AF) != 0) {
8567 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8568 ~L3C_OFFSET, true);
8569 }
8570
8571 /*
8572 * Remake the mappings, updating the accessed and dirty bits.
8573 */
8574 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8575 l3e = pmap_load(tl3p);
8576 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8577 cpu_spinwait();
8578 }
8579 dsb(ishst);
8580
8581 intr_restore(intr);
8582 if (tmpl3 != 0) {
8583 pmap_kremove(tmpl3);
8584 kva_free(tmpl3, PAGE_SIZE);
8585 }
8586 counter_u64_add(pmap_l3c_demotions, 1);
8587 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8588 va, pmap);
8589 return (true);
8590 }
8591
8592 /*
8593 * Accumulate the accessed and dirty bits within a L3C superpage and
8594 * return the specified PTE with them applied correctly.
8595 */
8596 static pt_entry_t
8597 pmap_load_l3c(pt_entry_t *l3p)
8598 {
8599 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8600
8601 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8602 sizeof(pt_entry_t)) - 1));
8603 l3c_end = l3c_start + L3C_ENTRIES;
8604 mask = 0;
8605 nbits = 0;
8606 /* Iterate over each mapping in the superpage. */
8607 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8608 l3e = pmap_load(tl3p);
8609 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8610 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8611 /* Update mask if the current page has its dirty bit set. */
8612 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8613 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8614 mask = ATTR_S1_AP_RW_BIT;
8615 /* Update nbits if the accessed bit is set. */
8616 nbits |= l3e & ATTR_AF;
8617 }
8618 return ((pmap_load(l3p) & ~mask) | nbits);
8619 }
8620
8621 /*
8622 * Perform the pmap work for mincore(2). If the page is not both referenced and
8623 * modified by this pmap, returns its physical address so that the caller can
8624 * find other mappings.
8625 */
8626 int
8627 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8628 {
8629 pt_entry_t *pte, tpte;
8630 vm_paddr_t mask, pa;
8631 int lvl, psind, val;
8632 bool managed;
8633
8634 PMAP_ASSERT_STAGE1(pmap);
8635 PMAP_LOCK(pmap);
8636 pte = pmap_pte(pmap, addr, &lvl);
8637 if (pte != NULL) {
8638 tpte = pmap_load(pte);
8639
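		/*
		 * Derive the superpage index from the descriptor level:
		 * psind 0 for an ordinary L3 page, 1 for an L3C contiguous
		 * mapping, 2 for an L2 block, and 3 for an L1 block.
		 */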
8640 switch (lvl) {
8641 case 3:
8642 mask = L3_OFFSET;
8643 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
8644 break;
8645 case 2:
8646 mask = L2_OFFSET;
8647 psind = 2;
8648 break;
8649 case 1:
8650 mask = L1_OFFSET;
8651 psind = 3;
8652 break;
8653 default:
8654 panic("pmap_mincore: invalid level %d", lvl);
8655 }
8656
8657 managed = (tpte & ATTR_SW_MANAGED) != 0;
8658 val = MINCORE_INCORE | MINCORE_PSIND(psind);
8659 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8660 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8661 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8662 if ((tpte & ATTR_AF) == ATTR_AF)
8663 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8664
8665 pa = PTE_TO_PHYS(tpte) | (addr & mask);
8666 } else {
8667 managed = false;
8668 val = 0;
8669 }
8670
8671 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8672 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8673 *pap = pa;
8674 }
8675 PMAP_UNLOCK(pmap);
8676 return (val);
8677 }
8678
8679 /*
8680 * Garbage collect every ASID that is neither active on a processor nor
8681 * reserved.
8682 */
8683 static void
8684 pmap_reset_asid_set(pmap_t pmap)
8685 {
8686 pmap_t curpmap;
8687 int asid, cpuid, epoch;
8688 struct asid_set *set;
8689 enum pmap_stage stage;
8690
8691 set = pmap->pm_asid_set;
8692 stage = pmap->pm_stage;
8693
8695 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8696 mtx_assert(&set->asid_set_mutex, MA_OWNED);
8697
8698 /*
8699 * Ensure that the store to asid_epoch is globally visible before the
8700 * loads from pc_curpmap are performed.
8701 */
8702 epoch = set->asid_epoch + 1;
8703 if (epoch == INT_MAX)
8704 epoch = 0;
8705 set->asid_epoch = epoch;
8706 dsb(ishst);
8707 if (stage == PM_STAGE1) {
8708 __asm __volatile("tlbi vmalle1is");
8709 } else {
8710 KASSERT(pmap_clean_stage2_tlbi != NULL,
8711 ("%s: Unset stage 2 tlb invalidation callback\n",
8712 __func__));
8713 pmap_clean_stage2_tlbi();
8714 }
8715 dsb(ish);
8716 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8717 set->asid_set_size - 1);
8718 CPU_FOREACH(cpuid) {
8719 if (cpuid == curcpu)
8720 continue;
8721 if (stage == PM_STAGE1) {
8722 curpmap = pcpu_find(cpuid)->pc_curpmap;
8723 PMAP_ASSERT_STAGE1(pmap);
8724 } else {
8725 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8726 if (curpmap == NULL)
8727 continue;
8728 PMAP_ASSERT_STAGE2(pmap);
8729 }
8730 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8731 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8732 if (asid == -1)
8733 continue;
8734 bit_set(set->asid_set, asid);
8735 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8736 }
8737 }
8738
8739 /*
8740 * Allocate a new ASID for the specified pmap.
8741 */
8742 static void
8743 pmap_alloc_asid(pmap_t pmap)
8744 {
8745 struct asid_set *set;
8746 int new_asid;
8747
8748 set = pmap->pm_asid_set;
8749 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8750
8751 mtx_lock_spin(&set->asid_set_mutex);
8752
8753 /*
8754 * While this processor was waiting to acquire the asid set mutex,
8755 * pmap_reset_asid_set() running on another processor might have
8756 * updated this pmap's cookie to the current epoch. In which case, we
8757 * don't need to allocate a new ASID.
8758 */
8759 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8760 goto out;
8761
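	/*
	 * Search for a free ASID starting at "asid_next"; if that fails, wrap
	 * around and search from the first available ASID.  If the set is
	 * exhausted, reset it and retry.
	 */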
8762 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8763 &new_asid);
8764 if (new_asid == -1) {
8765 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8766 set->asid_next, &new_asid);
8767 if (new_asid == -1) {
8768 pmap_reset_asid_set(pmap);
8769 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8770 set->asid_set_size, &new_asid);
8771 KASSERT(new_asid != -1, ("ASID allocation failure"));
8772 }
8773 }
8774 bit_set(set->asid_set, new_asid);
8775 set->asid_next = new_asid + 1;
8776 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8777 out:
8778 mtx_unlock_spin(&set->asid_set_mutex);
8779 }
8780
8781 static uint64_t __read_mostly ttbr_flags;
8782
8783 /*
8784 * Compute the value that should be stored in ttbr0 to activate the specified
8785 * pmap. This value may change from time to time.
8786 */
8787 uint64_t
8788 pmap_to_ttbr0(pmap_t pmap)
8789 {
8790 uint64_t ttbr;
8791
8792 ttbr = pmap->pm_ttbr;
8793 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8794 ttbr |= ttbr_flags;
8795
8796 return (ttbr);
8797 }
8798
8799 static void
8800 pmap_set_cnp(void *arg)
8801 {
8802 uint64_t ttbr0, ttbr1;
8803 u_int cpuid;
8804
8805 cpuid = *(u_int *)arg;
8806 if (cpuid == curcpu) {
8807 /*
8808 * Set the flags while all CPUs are handling the
8809 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
8810 * to pmap_to_ttbr0 after this will have the CnP flag set.
8811 * The dsb after invalidating the TLB will act as a barrier
8812 * to ensure all CPUs can observe this change.
8813 */
8814 ttbr_flags |= TTBR_CnP;
8815 }
8816
8817 ttbr0 = READ_SPECIALREG(ttbr0_el1);
8818 ttbr0 |= TTBR_CnP;
8819
8820 ttbr1 = READ_SPECIALREG(ttbr1_el1);
8821 ttbr1 |= TTBR_CnP;
8822
8823 /* Update ttbr{0,1}_el1 with the CnP flag */
8824 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8825 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8826 isb();
8827 __asm __volatile("tlbi vmalle1is");
8828 dsb(ish);
8829 isb();
8830 }
8831
8832 /*
8833 * Defer enabling some features until we have read the ID registers to know
8834 * if they are supported on all CPUs.
8835 */
8836 static void
8837 pmap_init_mp(void *dummy __unused)
8838 {
8839 uint64_t reg;
8840
8841 	if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8842 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8843 if (bootverbose)
8844 printf("Enabling BTI\n");
8845 pmap_bti_support = true;
8846
8847 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8848 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8849 UMA_ALIGN_PTR, 0);
8850 }
8851 }
8852 }
8853 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8854
8855 /*
8856 * Defer enabling CnP until we have read the ID registers to know if it's
8857 * supported on all CPUs.
8858 */
8859 static void
8860 pmap_init_cnp(void *dummy __unused)
8861 {
8862 uint64_t reg;
8863 u_int cpuid;
8864
8865 	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8866 return;
8867
8868 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8869 if (bootverbose)
8870 printf("Enabling CnP\n");
8871 cpuid = curcpu;
8872 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8873 }
8874
8875 }
8876 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8877
8878 static bool
8879 pmap_activate_int(pmap_t pmap)
8880 {
8881 struct asid_set *set;
8882 int epoch;
8883
8884 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8885 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8886
8887 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8888 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8889 /*
8890 * Handle the possibility that the old thread was preempted
8891 * after an "ic" or "tlbi" instruction but before it performed
8892 * a "dsb" instruction. If the old thread migrates to a new
8893 * processor, its completion of a "dsb" instruction on that
8894 * new processor does not guarantee that the "ic" or "tlbi"
8895 * instructions performed on the old processor have completed.
8896 */
8897 dsb(ish);
8898 return (false);
8899 }
8900
8901 set = pmap->pm_asid_set;
8902 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8903
8904 /*
8905 * Ensure that the store to curpmap is globally visible before the
8906 * load from asid_epoch is performed.
8907 */
8908 if (pmap->pm_stage == PM_STAGE1)
8909 PCPU_SET(curpmap, pmap);
8910 else
8911 PCPU_SET(curvmpmap, pmap);
8912 dsb(ish);
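/*
* A negative epoch marks a reserved ASID that is never reallocated.
* Otherwise, allocate a new ASID whenever the cookie's epoch is stale.
*/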
8913 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
8914 if (epoch >= 0 && epoch != set->asid_epoch)
8915 pmap_alloc_asid(pmap);
8916
8917 if (pmap->pm_stage == PM_STAGE1) {
8918 set_ttbr0(pmap_to_ttbr0(pmap));
8919 if (PCPU_GET(bcast_tlbi_workaround) != 0)
8920 invalidate_local_icache();
8921 }
8922 return (true);
8923 }
8924
8925 void
8926 pmap_activate_vm(pmap_t pmap)
8927 {
8928
8929 PMAP_ASSERT_STAGE2(pmap);
8930
8931 (void)pmap_activate_int(pmap);
8932 }
8933
8934 void
8935 pmap_activate(struct thread *td)
8936 {
8937 pmap_t pmap;
8938
8939 pmap = vmspace_pmap(td->td_proc->p_vmspace);
8940 PMAP_ASSERT_STAGE1(pmap);
8941 critical_enter();
8942 (void)pmap_activate_int(pmap);
8943 critical_exit();
8944 }
8945
8946 /*
8947 * Activate the thread we are switching to.
8948 * To simplify the assembly in cpu_throw, return the new thread's pcb.
8949 */
8950 struct pcb *
8951 pmap_switch(struct thread *new)
8952 {
8953 pcpu_bp_harden bp_harden;
8954 struct pcb *pcb;
8955
8956 /* Store the new curthread */
8957 PCPU_SET(curthread, new);
8958
8959 /* And the new pcb */
8960 pcb = new->td_pcb;
8961 PCPU_SET(curpcb, pcb);
8962
8963 /*
8964 * TODO: We may need to flush the cache here if switching
8965 * to a user process.
8966 */
8967
8968 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
8969 /*
8970 * Stop userspace from training the branch predictor against
8971 * other processes. This will call into a CPU specific
8972 * function that clears the branch predictor state.
8973 */
8974 bp_harden = PCPU_GET(bp_harden);
8975 if (bp_harden != NULL)
8976 bp_harden();
8977 }
8978
8979 return (pcb);
8980 }
8981
8982 void
8983 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
8984 {
8985
8986 PMAP_ASSERT_STAGE1(pmap);
8987 KASSERT(ADDR_IS_CANONICAL(va),
8988 ("%s: Address not in canonical form: %lx", __func__, va));
8989
8990 if (ADDR_IS_KERNEL(va)) {
8991 cpu_icache_sync_range((void *)va, sz);
8992 } else {
8993 u_int len, offset;
8994 vm_paddr_t pa;
8995
8996 /* Find the length of data in this page to flush */
8997 offset = va & PAGE_MASK;
8998 len = imin(PAGE_SIZE - offset, sz);
8999
9000 while (sz != 0) {
9001 /* Extract the physical address & find it in the DMAP */
9002 pa = pmap_extract(pmap, va);
9003 if (pa != 0)
9004 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
9005 len);
9006
9007 /* Move to the next page */
9008 sz -= len;
9009 va += len;
9010 /* Set the length for the next iteration */
9011 len = imin(PAGE_SIZE, sz);
9012 }
9013 }
9014 }
9015
9016 static int
9017 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9018 {
9019 pd_entry_t *pdep;
9020 pt_entry_t *ptep, pte;
9021 int rv, lvl, dfsc;
9022
9023 PMAP_ASSERT_STAGE2(pmap);
9024 rv = KERN_FAILURE;
9025
9026 /* Data and insn aborts use same encoding for FSC field. */
9027 dfsc = esr & ISS_DATA_DFSC_MASK;
9028 switch (dfsc) {
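/*
* A translation fault can be handled here when the leaf entry already has
* contents but has not yet been marked valid; setting ATTR_DESCR_VALID
* (and ATTR_AF) below makes it usable by the guest.
*/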
9029 case ISS_DATA_DFSC_TF_L0:
9030 case ISS_DATA_DFSC_TF_L1:
9031 case ISS_DATA_DFSC_TF_L2:
9032 case ISS_DATA_DFSC_TF_L3:
9033 PMAP_LOCK(pmap);
9034 pdep = pmap_pde(pmap, far, &lvl);
9035 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
9036 PMAP_UNLOCK(pmap);
9037 break;
9038 }
9039
9040 switch (lvl) {
9041 case 0:
9042 ptep = pmap_l0_to_l1(pdep, far);
9043 break;
9044 case 1:
9045 ptep = pmap_l1_to_l2(pdep, far);
9046 break;
9047 case 2:
9048 ptep = pmap_l2_to_l3(pdep, far);
9049 break;
9050 default:
9051 panic("%s: Invalid pde level %d", __func__, lvl);
9052 }
9053 goto fault_exec;
9054
9055 case ISS_DATA_DFSC_AFF_L1:
9056 case ISS_DATA_DFSC_AFF_L2:
9057 case ISS_DATA_DFSC_AFF_L3:
9058 PMAP_LOCK(pmap);
9059 ptep = pmap_pte(pmap, far, &lvl);
9060 fault_exec:
9061 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
9062 if (icache_vmid) {
9063 pmap_invalidate_vpipt_icache();
9064 } else {
9065 /*
9066 * If accessing an executable page invalidate
9067 * the I-cache so it will be valid when we
9068 * continue execution in the guest. The D-cache
9069 * is assumed to already be clean to the Point
9070 * of Coherency.
9071 */
9072 if ((pte & ATTR_S2_XN_MASK) !=
9073 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
9074 invalidate_icache();
9075 }
9076 }
9077 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
9078 rv = KERN_SUCCESS;
9079 }
9080 PMAP_UNLOCK(pmap);
9081 break;
9082 }
9083
9084 return (rv);
9085 }
9086
9087 int
9088 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9089 {
9090 pt_entry_t pte, *ptep;
9091 register_t intr;
9092 uint64_t ec, par;
9093 int lvl, rv;
9094
9095 rv = KERN_FAILURE;
9096
9097 ec = ESR_ELx_EXCEPTION(esr);
9098 switch (ec) {
9099 case EXCP_INSN_ABORT_L:
9100 case EXCP_INSN_ABORT:
9101 case EXCP_DATA_ABORT_L:
9102 case EXCP_DATA_ABORT:
9103 break;
9104 default:
9105 return (rv);
9106 }
9107
9108 if (pmap->pm_stage == PM_STAGE2)
9109 return (pmap_stage2_fault(pmap, esr, far));
9110
9111 /* Data and insn aborts use same encoding for FSC field. */
9112 switch (esr & ISS_DATA_DFSC_MASK) {
9113 case ISS_DATA_DFSC_AFF_L1:
9114 case ISS_DATA_DFSC_AFF_L2:
9115 case ISS_DATA_DFSC_AFF_L3:
9116 PMAP_LOCK(pmap);
9117 ptep = pmap_pte(pmap, far, &lvl);
9118 if (ptep != NULL) {
9119 pmap_set_bits(ptep, ATTR_AF);
9120 rv = KERN_SUCCESS;
9121 /*
9122 * XXXMJ as an optimization we could mark the entry
9123 * dirty if this is a write fault.
9124 */
9125 }
9126 PMAP_UNLOCK(pmap);
9127 break;
9128 case ISS_DATA_DFSC_PF_L1:
9129 case ISS_DATA_DFSC_PF_L2:
9130 case ISS_DATA_DFSC_PF_L3:
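/*
* Emulate the dirty bit: a clean, writable mapping is kept read-only
* with ATTR_SW_DBM set and is upgraded to read/write here on the
* first write fault.
*/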
9131 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
9132 (esr & ISS_DATA_WnR) == 0)
9133 return (rv);
9134 PMAP_LOCK(pmap);
9135 ptep = pmap_pte(pmap, far, &lvl);
9136 if (ptep != NULL &&
9137 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
9138 if ((pte & ATTR_S1_AP_RW_BIT) ==
9139 ATTR_S1_AP(ATTR_S1_AP_RO)) {
9140 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
9141 pmap_s1_invalidate_page(pmap, far, true);
9142 }
9143 rv = KERN_SUCCESS;
9144 }
9145 PMAP_UNLOCK(pmap);
9146 break;
9147 case ISS_DATA_DFSC_TF_L0:
9148 case ISS_DATA_DFSC_TF_L1:
9149 case ISS_DATA_DFSC_TF_L2:
9150 case ISS_DATA_DFSC_TF_L3:
9151 /*
9152 * Retry the translation. A break-before-make sequence can
9153 * produce a transient fault.
9154 */
9155 if (pmap == kernel_pmap) {
9156 /*
9157 * The translation fault may have occurred within a
9158 * critical section. Therefore, we must check the
9159 * address without acquiring the kernel pmap's lock.
9160 */
9161 if (pmap_klookup(far, NULL))
9162 rv = KERN_SUCCESS;
9163 } else {
9164 PMAP_LOCK(pmap);
9165 /* Ask the MMU to check the address. */
9166 intr = intr_disable();
9167 par = arm64_address_translate_s1e0r(far);
9168 intr_restore(intr);
9169 PMAP_UNLOCK(pmap);
9170
9171 /*
9172 * If the translation was successful, then we can
9173 * return success to the trap handler.
9174 */
9175 if (PAR_SUCCESS(par))
9176 rv = KERN_SUCCESS;
9177 }
9178 break;
9179 }
9180
9181 return (rv);
9182 }
9183
9184 /*
9185 * Increase the starting virtual address of the given mapping if a
9186 * different alignment might result in more superpage mappings.
9187 */
9188 void
9189 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9190 vm_offset_t *addr, vm_size_t size)
9191 {
9192 vm_offset_t superpage_offset;
9193
9194 if (size < L3C_SIZE)
9195 return;
9196 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9197 offset += ptoa(object->pg_color);
9198
9199 /*
9200 * Considering the object's physical alignment, is the mapping large
9201 * enough to encompass an L2 (2MB/32MB) superpage ...
9202 */
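/*
* For example, if the offset within an L2 superpage is 0x1000 and *addr is
* L2-aligned, *addr is advanced by 0x1000 so that the virtual and physical
* addresses share the same offset within the superpage, making later
* promotion possible.
*/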
9203 superpage_offset = offset & L2_OFFSET;
9204 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
9205 /*
9206 * If the virtual and physical alignments differ, then
9207 * increase the virtual address so that the alignments match.
9208 */
9209 if ((*addr & L2_OFFSET) < superpage_offset)
9210 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
9211 else if ((*addr & L2_OFFSET) > superpage_offset)
9212 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
9213 superpage_offset;
9214 return;
9215 }
9216 /* ... or an L3C (64KB/2MB) superpage? */
9217 superpage_offset = offset & L3C_OFFSET;
9218 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
9219 if ((*addr & L3C_OFFSET) < superpage_offset)
9220 *addr = (*addr & ~L3C_OFFSET) + superpage_offset;
9221 else if ((*addr & L3C_OFFSET) > superpage_offset)
9222 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
9223 superpage_offset;
9224 }
9225 }
9226
9227 /**
9228 * Get the kernel virtual address of a set of physical pages. If there are
9229 * physical addresses not covered by the DMAP perform a transient mapping
9230 * that will be removed when calling pmap_unmap_io_transient.
9231 *
9232 * \param page The pages the caller wishes to obtain the virtual
9233 * address on the kernel memory map.
9234 * \param vaddr On return contains the kernel virtual memory address
9235 * of the pages passed in the page parameter.
9236 * \param count Number of pages passed in.
9237 * \param can_fault true if the thread using the mapped pages can take
9238 * page faults, false otherwise.
9239 *
9240 * \returns true if the caller must call pmap_unmap_io_transient when
9241 * finished or false otherwise.
9242 *
9243 */
9244 bool
9245 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9246 bool can_fault)
9247 {
9248 vm_paddr_t paddr;
9249 bool needs_mapping;
9250 int error __diagused, i;
9251
9252 /*
9253 * Allocate any KVA space that we need, this is done in a separate
9254 * loop to prevent calling vmem_alloc while pinned.
9255 */
9256 needs_mapping = false;
9257 for (i = 0; i < count; i++) {
9258 paddr = VM_PAGE_TO_PHYS(page[i]);
9259 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9260 error = vmem_alloc(kernel_arena, PAGE_SIZE,
9261 M_BESTFIT | M_WAITOK, &vaddr[i]);
9262 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9263 needs_mapping = true;
9264 } else {
9265 vaddr[i] = PHYS_TO_DMAP(paddr);
9266 }
9267 }
9268
9269 /* Exit early if everything is covered by the DMAP */
9270 if (!needs_mapping)
9271 return (false);
9272
9273 if (!can_fault)
9274 sched_pin();
9275 for (i = 0; i < count; i++) {
9276 paddr = VM_PAGE_TO_PHYS(page[i]);
9277 if (!PHYS_IN_DMAP(paddr)) {
9278 panic(
9279 "pmap_map_io_transient: TODO: Map out of DMAP data");
9280 }
9281 }
9282
9283 return (needs_mapping);
9284 }
9285
9286 void
9287 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9288 bool can_fault)
9289 {
9290 vm_paddr_t paddr;
9291 int i;
9292
9293 if (!can_fault)
9294 sched_unpin();
9295 for (i = 0; i < count; i++) {
9296 paddr = VM_PAGE_TO_PHYS(page[i]);
9297 if (!PHYS_IN_DMAP(paddr)) {
9298 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9299 }
9300 }
9301 }
9302
9303 bool
9304 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9305 {
9306
9307 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
9308 }
9309
9310 static void *
9311 bti_dup_range(void *ctx __unused, void *data)
9312 {
9313 struct rs_el *node, *new_node;
9314
9315 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9316 if (new_node == NULL)
9317 return (NULL);
9318 node = data;
9319 memcpy(new_node, node, sizeof(*node));
9320 return (new_node);
9321 }
9322
9323 static void
9324 bti_free_range(void *ctx __unused, void *node)
9325 {
9326
9327 uma_zfree(pmap_bti_ranges_zone, node);
9328 }
9329
9330 static int
9331 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9332 {
9333 struct rs_el *rs;
9334 int error;
9335
9336 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9337 PMAP_ASSERT_STAGE1(pmap);
9338 MPASS(pmap->pm_bti != NULL);
9339 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9340 if (rs == NULL)
9341 return (ENOMEM);
9342 error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9343 if (error != 0)
9344 uma_zfree(pmap_bti_ranges_zone, rs);
9345 return (error);
9346 }
9347
9348 static void
9349 pmap_bti_deassign_all(pmap_t pmap)
9350 {
9351
9352 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9353 if (pmap->pm_bti != NULL)
9354 rangeset_remove_all(pmap->pm_bti);
9355 }
9356
9357 /*
9358 * Returns true if the BTI setting is the same across the specified address
9359 * range, and false otherwise. When returning true, updates the referenced PTE
9360 * to reflect the BTI setting.
9361 *
9362 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap
9363 * that has the same BTI setting implicitly across its entire address range.
9364 */
9365 static bool
9366 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9367 {
9368 struct rs_el *next_rs, *rs;
9369 vm_offset_t va;
9370
9371 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9372 KASSERT(ADDR_IS_CANONICAL(sva),
9373 ("%s: Start address not in canonical form: %lx", __func__, sva));
9374 KASSERT(ADDR_IS_CANONICAL(eva),
9375 ("%s: End address not in canonical form: %lx", __func__, eva));
9376 KASSERT((*pte & ATTR_S1_GP) == 0,
9377 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9378
9379 if (pmap == kernel_pmap) {
9380 *pte |= ATTR_KERN_GP;
9381 return (true);
9382 }
9383 if (pmap->pm_bti == NULL)
9384 return (true);
9385 PMAP_ASSERT_STAGE1(pmap);
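/*
* The range must either be disjoint from the BTI rangeset, in which case
* ATTR_S1_GP is left clear, or be completely covered by it, in which case
* ATTR_S1_GP is set.  Partial coverage returns false.
*/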
9386 rs = rangeset_lookup(pmap->pm_bti, sva);
9387 if (rs == NULL) {
9388 rs = rangeset_next(pmap->pm_bti, sva);
9389 return (rs == NULL ||
9390 rs->re_start >= eva);
9391 }
9392 while ((va = rs->re_end) < eva) {
9393 next_rs = rangeset_next(pmap->pm_bti, va);
9394 if (next_rs == NULL ||
9395 va != next_rs->re_start)
9396 return (false);
9397 rs = next_rs;
9398 }
9399 *pte |= ATTR_S1_GP;
9400 return (true);
9401 }
9402
9403 static pt_entry_t
9404 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9405 {
9406 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9407 MPASS(ADDR_IS_CANONICAL(va));
9408
9409 if (pmap->pm_stage != PM_STAGE1)
9410 return (0);
9411 if (pmap == kernel_pmap)
9412 return (ATTR_KERN_GP);
9413 if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL)
9414 return (ATTR_S1_GP);
9415 return (0);
9416 }
9417
9418 static void
9419 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9420 {
9421
9422 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9423 if (pmap->pm_bti != NULL)
9424 rangeset_remove(pmap->pm_bti, sva, eva);
9425 }
9426
9427 static int
9428 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9429 {
9430
9431 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9432 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9433 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9434 MPASS(src_pmap->pm_bti != NULL);
9435 MPASS(dst_pmap->pm_bti != NULL);
9436 if (src_pmap->pm_bti->rs_data_ctx == NULL)
9437 return (0);
9438 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9439 }
9440
9441 static void
9442 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9443 {
9444 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9445 PMAP_ASSERT_STAGE1(pmap);
9446
9447 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9448 true);
9449 }
9450
9451 int
9452 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9453 {
9454 int error;
9455
9456 if (pmap->pm_bti == NULL)
9457 return (0);
9458 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9459 return (EINVAL);
9460 if (pmap->pm_stage != PM_STAGE1)
9461 return (EINVAL);
9462 if (eva <= sva || ADDR_IS_KERNEL(eva))
9463 return (EFAULT);
9464
9465 sva = trunc_page(sva);
9466 eva = round_page(eva);
9467 for (;;) {
9468 PMAP_LOCK(pmap);
9469 error = pmap_bti_assign(pmap, sva, eva);
9470 if (error == 0)
9471 pmap_bti_update_range(pmap, sva, eva, true);
9472 PMAP_UNLOCK(pmap);
9473 if (error != ENOMEM)
9474 break;
9475 vm_wait(NULL);
9476 }
9477 return (error);
9478 }
9479
9480 #if defined(KASAN) || defined(KMSAN)
9481 static pd_entry_t *pmap_san_early_l2;
9482
9483 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE)
9484 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE)
9485 static vm_offset_t __nosanitizeaddress
9486 pmap_san_enter_bootstrap_alloc_l2(void)
9487 {
9488 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9489 static size_t offset = 0;
9490 vm_offset_t addr;
9491
9492 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9493 panic("%s: out of memory for the bootstrap shadow map L2 entries",
9494 __func__);
9495 }
9496
9497 addr = (uintptr_t)&bootstrap_data[offset];
9498 offset += L2_SIZE;
9499 return (addr);
9500 }
9501
9502 /*
9503 * SAN L1 + L2 pages, maybe L3 entries later?
9504 */
9505 static vm_offset_t __nosanitizeaddress
9506 pmap_san_enter_bootstrap_alloc_pages(int npages)
9507 {
9508 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9509 static size_t offset = 0;
9510 vm_offset_t addr;
9511
9512 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9513 panic("%s: out of memory for the bootstrap shadow map",
9514 __func__);
9515 }
9516
9517 addr = (uintptr_t)&bootstrap_data[offset];
9518 offset += (npages * PAGE_SIZE);
9519 return (addr);
9520 }
9521
9522 static void __nosanitizeaddress
9523 pmap_san_enter_bootstrap(void)
9524 {
9525 vm_offset_t freemempos;
9526
9527 /* L1, L2 */
9528 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9529 bs_state.freemempos = freemempos;
9530 bs_state.va = KASAN_MIN_ADDRESS;
9531 pmap_bootstrap_l1_table(&bs_state);
9532 pmap_san_early_l2 = bs_state.l2;
9533 }
9534
9535 static vm_page_t
9536 pmap_san_enter_alloc_l3(void)
9537 {
9538 vm_page_t m;
9539
9540 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9541 VM_ALLOC_ZERO);
9542 if (m == NULL)
9543 panic("%s: no memory to grow shadow map", __func__);
9544 return (m);
9545 }
9546
9547 static vm_page_t
9548 pmap_san_enter_alloc_l2(void)
9549 {
9550 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9551 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9552 }
9553
9554 void __nosanitizeaddress __nosanitizememory
9555 pmap_san_enter(vm_offset_t va)
9556 {
9557 pd_entry_t *l1, *l2;
9558 pt_entry_t *l3;
9559 vm_page_t m;
9560
9561 if (virtual_avail == 0) {
9562 vm_offset_t block;
9563 int slot;
9564 bool first;
9565
9566 /* Temporary shadow map prior to pmap_bootstrap(). */
9567 first = pmap_san_early_l2 == NULL;
9568 if (first)
9569 pmap_san_enter_bootstrap();
9570
9571 l2 = pmap_san_early_l2;
9572 slot = pmap_l2_index(va);
9573
9574 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
9575 MPASS(first);
9576 block = pmap_san_enter_bootstrap_alloc_l2();
9577 pmap_store(&l2[slot],
9578 PHYS_TO_PTE(pmap_early_vtophys(block)) |
9579 PMAP_SAN_PTE_BITS | L2_BLOCK);
9580 dmb(ishst);
9581 }
9582
9583 return;
9584 }
9585
9586 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
9587 l1 = pmap_l1(kernel_pmap, va);
9588 MPASS(l1 != NULL);
9589 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
9590 m = pmap_san_enter_alloc_l3();
9591 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
9592 }
9593 l2 = pmap_l1_to_l2(l1, va);
9594 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
9595 m = pmap_san_enter_alloc_l2();
9596 if (m != NULL) {
9597 pmap_store(l2, VM_PAGE_TO_PTE(m) |
9598 PMAP_SAN_PTE_BITS | L2_BLOCK);
9599 } else {
9600 m = pmap_san_enter_alloc_l3();
9601 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
9602 }
9603 dmb(ishst);
9604 }
9605 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
9606 return;
9607 l3 = pmap_l2_to_l3(l2, va);
9608 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
9609 return;
9610 m = pmap_san_enter_alloc_l3();
9611 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
9612 dmb(ishst);
9613 }
9614 #endif /* KASAN || KMSAN */
9615
9616 /*
9617 * Track a range of the kernel's virtual address space that is contiguous
9618 * in various mapping attributes.
9619 */
9620 struct pmap_kernel_map_range {
9621 vm_offset_t sva;
9622 pt_entry_t attrs;
9623 int l3pages;
9624 int l3contig;
9625 int l2blocks;
9626 int l2contig;
9627 int l1blocks;
9628 };
9629
9630 static void
9631 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
9632 vm_offset_t eva)
9633 {
9634 const char *mode;
9635 int index;
9636
9637 if (eva <= range->sva)
9638 return;
9639
9640 index = range->attrs & ATTR_S1_IDX_MASK;
9641 switch (index) {
9642 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
9643 mode = "DEV-NP";
9644 break;
9645 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
9646 mode = "DEV";
9647 break;
9648 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
9649 mode = "UC";
9650 break;
9651 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
9652 mode = "WB";
9653 break;
9654 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
9655 mode = "WT";
9656 break;
9657 default:
9658 printf(
9659 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
9660 __func__, index, range->sva, eva);
9661 mode = "??";
9662 break;
9663 }
9664
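/*
* Columns: address range, access bits (write, kernel/user execute,
* user/supervisor, guarded), memory type, then the number of L1 blocks,
* contiguous L2 runs, L2 blocks, contiguous L3 runs, and L3 pages in the
* range.
*/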
9665 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
9666 range->sva, eva,
9667 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
9668 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
9669 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
9670 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
9671 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
9672 mode, range->l1blocks, range->l2contig, range->l2blocks,
9673 range->l3contig, range->l3pages);
9674
9675 /* Reset to sentinel value. */
9676 range->sva = 0xfffffffffffffffful;
9677 }
9678
9679 /*
9680 * Determine whether the attributes specified by a page table entry match those
9681 * being tracked by the current range.
9682 */
9683 static bool
9684 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
9685 {
9686
9687 return (range->attrs == attrs);
9688 }
9689
9690 static void
9691 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
9692 pt_entry_t attrs)
9693 {
9694
9695 memset(range, 0, sizeof(*range));
9696 range->sva = va;
9697 range->attrs = attrs;
9698 }
9699
9700 /* Get the block/page attributes that correspond to the table attributes */
9701 static pt_entry_t
9702 sysctl_kmaps_table_attrs(pd_entry_t table)
9703 {
9704 pt_entry_t attrs;
9705
9706 attrs = 0;
9707 if ((table & TATTR_UXN_TABLE) != 0)
9708 attrs |= ATTR_S1_UXN;
9709 if ((table & TATTR_PXN_TABLE) != 0)
9710 attrs |= ATTR_S1_PXN;
9711 if ((table & TATTR_AP_TABLE_RO) != 0)
9712 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
9713
9714 return (attrs);
9715 }
9716
9717 /* Read the block/page attributes we care about */
9718 static pt_entry_t
9719 sysctl_kmaps_block_attrs(pt_entry_t block)
9720 {
9721 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
9722 ATTR_S1_GP));
9723 }
9724
9725 /*
9726 * Given a leaf PTE, derive the mapping's attributes. If they do not match
9727 * those of the current run, dump the address range and its attributes, and
9728 * begin a new run.
9729 */
9730 static void
9731 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
9732 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
9733 pt_entry_t l3e)
9734 {
9735 pt_entry_t attrs;
9736
9737 attrs = sysctl_kmaps_table_attrs(l0e);
9738
9739 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9740 attrs |= sysctl_kmaps_block_attrs(l1e);
9741 goto done;
9742 }
9743 attrs |= sysctl_kmaps_table_attrs(l1e);
9744
9745 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9746 attrs |= sysctl_kmaps_block_attrs(l2e);
9747 goto done;
9748 }
9749 attrs |= sysctl_kmaps_table_attrs(l2e);
9750 attrs |= sysctl_kmaps_block_attrs(l3e);
9751
9752 done:
9753 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
9754 sysctl_kmaps_dump(sb, range, va);
9755 sysctl_kmaps_reinit(range, va, attrs);
9756 }
9757 }
9758
9759 static int
9760 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
9761 {
9762 struct pmap_kernel_map_range range;
9763 struct sbuf sbuf, *sb;
9764 pd_entry_t l0e, *l1, l1e, *l2, l2e;
9765 pt_entry_t *l3, l3e;
9766 vm_offset_t sva;
9767 vm_paddr_t pa;
9768 int error, i, j, k, l;
9769
9770 error = sysctl_wire_old_buffer(req, 0);
9771 if (error != 0)
9772 return (error);
9773 sb = &sbuf;
9774 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
9775
9776 /* Sentinel value. */
9777 range.sva = 0xfffffffffffffffful;
9778
9779 /*
9780 * Iterate over the kernel page tables without holding the kernel pmap
9781 * lock. Kernel page table pages are never freed, so at worst we will
9782 * observe inconsistencies in the output.
9783 */
9784 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
9785 i++) {
9786 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
9787 sbuf_printf(sb, "\nDirect map:\n");
9788 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
9789 sbuf_printf(sb, "\nKernel map:\n");
9790 #ifdef KASAN
9791 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
9792 sbuf_printf(sb, "\nKASAN shadow map:\n");
9793 #endif
9794 #ifdef KMSAN
9795 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
9796 sbuf_printf(sb, "\nKMSAN shadow map:\n");
9797 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
9798 sbuf_printf(sb, "\nKMSAN origin map:\n");
9799 #endif
9800
9801 l0e = kernel_pmap->pm_l0[i];
9802 if ((l0e & ATTR_DESCR_VALID) == 0) {
9803 sysctl_kmaps_dump(sb, &range, sva);
9804 sva += L0_SIZE;
9805 continue;
9806 }
9807 pa = PTE_TO_PHYS(l0e);
9808 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9809
9810 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
9811 l1e = l1[j];
9812 if ((l1e & ATTR_DESCR_VALID) == 0) {
9813 sysctl_kmaps_dump(sb, &range, sva);
9814 sva += L1_SIZE;
9815 continue;
9816 }
9817 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
9818 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
9819 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
9820 0, 0);
9821 range.l1blocks++;
9822 sva += L1_SIZE;
9823 continue;
9824 }
9825 pa = PTE_TO_PHYS(l1e);
9826 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9827
9828 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
9829 l2e = l2[k];
9830 if ((l2e & ATTR_DESCR_VALID) == 0) {
9831 sysctl_kmaps_dump(sb, &range, sva);
9832 sva += L2_SIZE;
9833 continue;
9834 }
9835 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
9836 sysctl_kmaps_check(sb, &range, sva,
9837 l0e, l1e, l2e, 0);
9838 if ((l2e & ATTR_CONTIGUOUS) != 0)
9839 range.l2contig +=
9840 k % L2C_ENTRIES == 0 ?
9841 1 : 0;
9842 else
9843 range.l2blocks++;
9844 sva += L2_SIZE;
9845 continue;
9846 }
9847 pa = PTE_TO_PHYS(l2e);
9848 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
9849
9850 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
9851 l++, sva += L3_SIZE) {
9852 l3e = l3[l];
9853 if ((l3e & ATTR_DESCR_VALID) == 0) {
9854 sysctl_kmaps_dump(sb, &range,
9855 sva);
9856 continue;
9857 }
9858 sysctl_kmaps_check(sb, &range, sva,
9859 l0e, l1e, l2e, l3e);
9860 if ((l3e & ATTR_CONTIGUOUS) != 0)
9861 range.l3contig +=
9862 l % L3C_ENTRIES == 0 ?
9863 1 : 0;
9864 else
9865 range.l3pages++;
9866 }
9867 }
9868 }
9869 }
9870
9871 error = sbuf_finish(sb);
9872 sbuf_delete(sb);
9873 return (error);
9874 }
9875 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
9876 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
9877 NULL, 0, sysctl_kmaps, "A",
9878 "Dump kernel address layout");
9879