1 /*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * Copyright (c) 2014 Andrew Turner 13 * All rights reserved. 14 * Copyright (c) 2014-2016 The FreeBSD Foundation 15 * All rights reserved. 16 * 17 * This code is derived from software contributed to Berkeley by 18 * the Systems Programming Group of the University of Utah Computer 19 * Science Department and William Jolitz of UUNET Technologies Inc. 20 * 21 * This software was developed by Andrew Turner under sponsorship from 22 * the FreeBSD Foundation. 23 * 24 * Redistribution and use in source and binary forms, with or without 25 * modification, are permitted provided that the following conditions 26 * are met: 27 * 1. Redistributions of source code must retain the above copyright 28 * notice, this list of conditions and the following disclaimer. 29 * 2. Redistributions in binary form must reproduce the above copyright 30 * notice, this list of conditions and the following disclaimer in the 31 * documentation and/or other materials provided with the distribution. 32 * 3. All advertising materials mentioning features or use of this software 33 * must display the following acknowledgement: 34 * This product includes software developed by the University of 35 * California, Berkeley and its contributors. 36 * 4. Neither the name of the University nor the names of its contributors 37 * may be used to endorse or promote products derived from this software 38 * without specific prior written permission. 39 * 40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 */ 52 /*- 53 * Copyright (c) 2003 Networks Associates Technology, Inc. 54 * All rights reserved. 55 * 56 * This software was developed for the FreeBSD Project by Jake Burkholder, 57 * Safeport Network Services, and Network Associates Laboratories, the 58 * Security Research Division of Network Associates, Inc. under 59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 60 * CHATS research program. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84 #include <sys/cdefs.h> 85 /* 86 * Manages physical address maps. 87 * 88 * Since the information managed by this module is 89 * also stored by the logical address mapping module, 90 * this module may throw away valid virtual-to-physical 91 * mappings at almost any time. However, invalidations 92 * of virtual-to-physical mappings must be done as 93 * requested. 94 * 95 * In order to cope with hardware architectures which 96 * make virtual-to-physical map invalidates expensive, 97 * this module may delay invalidate or reduced protection 98 * operations until such time as they are actually 99 * necessary. This module is given full information as 100 * to which processors are currently using which maps, 101 * and to when physical maps must be made correct. 102 */ 103 104 #include "opt_vm.h" 105 106 #include <sys/param.h> 107 #include <sys/asan.h> 108 #include <sys/bitstring.h> 109 #include <sys/bus.h> 110 #include <sys/systm.h> 111 #include <sys/kernel.h> 112 #include <sys/ktr.h> 113 #include <sys/limits.h> 114 #include <sys/lock.h> 115 #include <sys/malloc.h> 116 #include <sys/mman.h> 117 #include <sys/msgbuf.h> 118 #include <sys/mutex.h> 119 #include <sys/physmem.h> 120 #include <sys/proc.h> 121 #include <sys/rwlock.h> 122 #include <sys/sbuf.h> 123 #include <sys/sx.h> 124 #include <sys/vmem.h> 125 #include <sys/vmmeter.h> 126 #include <sys/sched.h> 127 #include <sys/sysctl.h> 128 #include <sys/_unrhdr.h> 129 #include <sys/smp.h> 130 131 #include <vm/vm.h> 132 #include <vm/vm_param.h> 133 #include <vm/vm_kern.h> 134 #include <vm/vm_page.h> 135 #include <vm/vm_map.h> 136 #include <vm/vm_object.h> 137 #include <vm/vm_extern.h> 138 #include <vm/vm_pageout.h> 139 #include <vm/vm_pager.h> 140 #include <vm/vm_phys.h> 141 #include <vm/vm_radix.h> 142 #include <vm/vm_reserv.h> 143 #include <vm/vm_dumpset.h> 144 #include <vm/uma.h> 145 146 #include <machine/asan.h> 147 #include <machine/machdep.h> 148 #include <machine/md_var.h> 149 #include <machine/pcb.h> 150 151 #ifdef NUMA 152 #define PMAP_MEMDOM MAXMEMDOM 153 #else 154 #define PMAP_MEMDOM 1 155 #endif 156 157 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) 158 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2) 159 160 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) 161 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) 162 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) 163 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) 164 165 #define NUL0E L0_ENTRIES 166 #define NUL1E (NUL0E * NL1PG) 167 #define NUL2E (NUL1E * NL2PG) 168 169 #ifdef PV_STATS 170 #define PV_STAT(x) do { x ; } while (0) 171 #define __pvused 172 #else 173 #define PV_STAT(x) do { } while (0) 174 #define __pvused __unused 175 #endif 176 177 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT)) 178 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 179 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 180 181 #ifdef __ARM_FEATURE_BTI_DEFAULT 182 #define ATTR_KERN_GP ATTR_S1_GP 183 #else 184 #define ATTR_KERN_GP 0 185 #endif 186 #define PMAP_SAN_PTE_BITS (ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \ 187 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW)) 188 189 struct pmap_large_md_page { 190 struct rwlock pv_lock; 191 struct md_page pv_page; 192 /* Pad to a power of 2, see pmap_init_pv_table(). */ 193 int pv_pad[2]; 194 }; 195 196 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 197 #define pv_dummy pv_dummy_large.pv_page 198 __read_mostly static struct pmap_large_md_page *pv_table; 199 200 static struct pmap_large_md_page * 201 _pa_to_pmdp(vm_paddr_t pa) 202 { 203 struct vm_phys_seg *seg; 204 205 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 206 return ((struct pmap_large_md_page *)seg->md_first + 207 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)); 208 return (NULL); 209 } 210 211 static struct pmap_large_md_page * 212 pa_to_pmdp(vm_paddr_t pa) 213 { 214 struct pmap_large_md_page *pvd; 215 216 pvd = _pa_to_pmdp(pa); 217 if (pvd == NULL) 218 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa); 219 return (pvd); 220 } 221 222 static struct pmap_large_md_page * 223 page_to_pmdp(vm_page_t m) 224 { 225 struct vm_phys_seg *seg; 226 227 seg = &vm_phys_segs[m->segind]; 228 return ((struct pmap_large_md_page *)seg->md_first + 229 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start)); 230 } 231 232 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 233 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page)) 234 235 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 236 struct pmap_large_md_page *_pvd; \ 237 struct rwlock *_lock; \ 238 _pvd = _pa_to_pmdp(pa); \ 239 if (__predict_false(_pvd == NULL)) \ 240 _lock = &pv_dummy_large.pv_lock; \ 241 else \ 242 _lock = &(_pvd->pv_lock); \ 243 _lock; \ 244 }) 245 246 static struct rwlock * 247 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m) 248 { 249 if ((m->flags & PG_FICTITIOUS) == 0) 250 return (&page_to_pmdp(m)->pv_lock); 251 else 252 return (&pv_dummy_large.pv_lock); 253 } 254 255 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \ 256 struct rwlock **_lockp = (lockp); \ 257 struct rwlock *_new_lock = (new_lock); \ 258 \ 259 if (_new_lock != *_lockp) { \ 260 if (*_lockp != NULL) \ 261 rw_wunlock(*_lockp); \ 262 *_lockp = _new_lock; \ 263 rw_wlock(*_lockp); \ 264 } \ 265 } while (0) 266 267 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \ 268 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa)) 269 270 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 271 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m)) 272 273 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 274 struct rwlock **_lockp = (lockp); \ 275 \ 276 if (*_lockp != NULL) { \ 277 rw_wunlock(*_lockp); \ 278 *_lockp = NULL; \ 279 } \ 280 } while (0) 281 282 /* 283 * The presence of this flag indicates that the mapping is writeable. 284 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise 285 * it is dirty. This flag may only be set on managed mappings. 286 * 287 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it 288 * as a software managed bit. 289 */ 290 #define ATTR_SW_DBM ATTR_DBM 291 292 struct pmap kernel_pmap_store; 293 294 /* Used for mapping ACPI memory before VM is initialized */ 295 #define PMAP_PREINIT_MAPPING_COUNT 32 296 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) 297 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ 298 static int vm_initialized = 0; /* No need to use pre-init maps when set */ 299 300 /* 301 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. 302 * Always map entire L2 block for simplicity. 303 * VA of L2 block = preinit_map_va + i * L2_SIZE 304 */ 305 static struct pmap_preinit_mapping { 306 vm_paddr_t pa; 307 vm_offset_t va; 308 vm_size_t size; 309 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 310 311 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 312 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 313 vm_offset_t kernel_vm_end = 0; 314 315 /* 316 * Data for the pv entry allocation mechanism. 317 */ 318 #ifdef NUMA 319 static __inline int 320 pc_to_domain(struct pv_chunk *pc) 321 { 322 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 323 } 324 #else 325 static __inline int 326 pc_to_domain(struct pv_chunk *pc __unused) 327 { 328 return (0); 329 } 330 #endif 331 332 struct pv_chunks_list { 333 struct mtx pvc_lock; 334 TAILQ_HEAD(pch, pv_chunk) pvc_list; 335 int active_reclaims; 336 } __aligned(CACHE_LINE_SIZE); 337 338 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 339 340 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 341 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 342 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 343 344 extern pt_entry_t pagetable_l0_ttbr1[]; 345 346 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 347 static vm_paddr_t physmap[PHYSMAP_SIZE]; 348 static u_int physmap_idx; 349 350 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 351 "VM/pmap parameters"); 352 353 #if PAGE_SIZE == PAGE_SIZE_4K 354 #define L1_BLOCKS_SUPPORTED 1 355 #else 356 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */ 357 #define L1_BLOCKS_SUPPORTED 0 358 #endif 359 360 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED) 361 362 /* 363 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs 364 * that it has currently allocated to a pmap, a cursor ("asid_next") to 365 * optimize its search for a free ASID in the bit vector, and an epoch number 366 * ("asid_epoch") to indicate when it has reclaimed all previously allocated 367 * ASIDs that are not currently active on a processor. 368 * 369 * The current epoch number is always in the range [0, INT_MAX). Negative 370 * numbers and INT_MAX are reserved for special cases that are described 371 * below. 372 */ 373 struct asid_set { 374 int asid_bits; 375 bitstr_t *asid_set; 376 int asid_set_size; 377 int asid_next; 378 int asid_epoch; 379 struct mtx asid_set_mutex; 380 }; 381 382 static struct asid_set asids; 383 static struct asid_set vmids; 384 385 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 386 "ASID allocator"); 387 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0, 388 "The number of bits in an ASID"); 389 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0, 390 "The last allocated ASID plus one"); 391 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0, 392 "The current epoch number"); 393 394 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator"); 395 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0, 396 "The number of bits in an VMID"); 397 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0, 398 "The last allocated VMID plus one"); 399 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0, 400 "The current epoch number"); 401 402 void (*pmap_clean_stage2_tlbi)(void); 403 void (*pmap_invalidate_vpipt_icache)(void); 404 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool); 405 void (*pmap_stage2_invalidate_all)(uint64_t); 406 407 /* 408 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved 409 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for 410 * dynamically allocated ASIDs have a non-negative epoch number. 411 * 412 * An invalid ASID is represented by -1. 413 * 414 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN), 415 * which indicates that an ASID should never be allocated to the pmap, and 416 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be 417 * allocated when the pmap is next activated. 418 */ 419 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \ 420 ((u_long)(epoch) << 32))) 421 #define COOKIE_TO_ASID(cookie) ((int)(cookie)) 422 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32)) 423 424 #define TLBI_VA_SHIFT 12 425 #define TLBI_VA_MASK ((1ul << 44) - 1) 426 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) 427 #define TLBI_VA_L3_INCR (L3_SIZE >> TLBI_VA_SHIFT) 428 429 static int __read_frequently superpages_enabled = 1; 430 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 431 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, 432 "Are large page mappings enabled?"); 433 434 /* 435 * Internal flags for pmap_enter()'s helper functions. 436 */ 437 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 438 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 439 440 TAILQ_HEAD(pv_chunklist, pv_chunk); 441 442 static void free_pv_chunk(struct pv_chunk *pc); 443 static void free_pv_chunk_batch(struct pv_chunklist *batch); 444 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 445 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 446 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 447 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 448 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 449 vm_offset_t va); 450 451 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 452 static bool pmap_activate_int(pmap_t pmap); 453 static void pmap_alloc_asid(pmap_t pmap); 454 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 455 vm_prot_t prot, int mode, bool skip_unmapped); 456 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); 457 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, 458 vm_offset_t va, struct rwlock **lockp); 459 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 460 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 461 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 462 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 463 u_int flags, vm_page_t m, struct rwlock **lockp); 464 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 465 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); 466 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 467 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); 468 static void pmap_reset_asid_set(pmap_t pmap); 469 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 470 vm_page_t m, struct rwlock **lockp); 471 472 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 473 struct rwlock **lockp); 474 475 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, 476 struct spglist *free); 477 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 478 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 479 480 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va); 481 482 /* 483 * These load the old table data and store the new value. 484 * They need to be atomic as the System MMU may write to the table at 485 * the same time as the CPU. 486 */ 487 #define pmap_clear(table) atomic_store_64(table, 0) 488 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) 489 #define pmap_load(table) (*table) 490 #define pmap_load_clear(table) atomic_swap_64(table, 0) 491 #define pmap_load_store(table, entry) atomic_swap_64(table, entry) 492 #define pmap_set_bits(table, bits) atomic_set_64(table, bits) 493 #define pmap_store(table, entry) atomic_store_64(table, entry) 494 495 /********************/ 496 /* Inline functions */ 497 /********************/ 498 499 static __inline void 500 pagecopy(void *s, void *d) 501 { 502 503 memcpy(d, s, PAGE_SIZE); 504 } 505 506 static __inline pd_entry_t * 507 pmap_l0(pmap_t pmap, vm_offset_t va) 508 { 509 510 return (&pmap->pm_l0[pmap_l0_index(va)]); 511 } 512 513 static __inline pd_entry_t * 514 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 515 { 516 pd_entry_t *l1; 517 518 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 519 return (&l1[pmap_l1_index(va)]); 520 } 521 522 static __inline pd_entry_t * 523 pmap_l1(pmap_t pmap, vm_offset_t va) 524 { 525 pd_entry_t *l0; 526 527 l0 = pmap_l0(pmap, va); 528 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) 529 return (NULL); 530 531 return (pmap_l0_to_l1(l0, va)); 532 } 533 534 static __inline pd_entry_t * 535 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) 536 { 537 pd_entry_t l1, *l2p; 538 539 l1 = pmap_load(l1p); 540 541 KASSERT(ADDR_IS_CANONICAL(va), 542 ("%s: Address not in canonical form: %lx", __func__, va)); 543 /* 544 * The valid bit may be clear if pmap_update_entry() is concurrently 545 * modifying the entry, so for KVA only the entry type may be checked. 546 */ 547 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0, 548 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); 549 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 550 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); 551 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1)); 552 return (&l2p[pmap_l2_index(va)]); 553 } 554 555 static __inline pd_entry_t * 556 pmap_l2(pmap_t pmap, vm_offset_t va) 557 { 558 pd_entry_t *l1; 559 560 l1 = pmap_l1(pmap, va); 561 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) 562 return (NULL); 563 564 return (pmap_l1_to_l2(l1, va)); 565 } 566 567 static __inline pt_entry_t * 568 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) 569 { 570 pd_entry_t l2; 571 pt_entry_t *l3p; 572 573 l2 = pmap_load(l2p); 574 575 KASSERT(ADDR_IS_CANONICAL(va), 576 ("%s: Address not in canonical form: %lx", __func__, va)); 577 /* 578 * The valid bit may be clear if pmap_update_entry() is concurrently 579 * modifying the entry, so for KVA only the entry type may be checked. 580 */ 581 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0, 582 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); 583 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 584 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); 585 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2)); 586 return (&l3p[pmap_l3_index(va)]); 587 } 588 589 /* 590 * Returns the lowest valid pde for a given virtual address. 591 * The next level may or may not point to a valid page or block. 592 */ 593 static __inline pd_entry_t * 594 pmap_pde(pmap_t pmap, vm_offset_t va, int *level) 595 { 596 pd_entry_t *l0, *l1, *l2, desc; 597 598 l0 = pmap_l0(pmap, va); 599 desc = pmap_load(l0) & ATTR_DESCR_MASK; 600 if (desc != L0_TABLE) { 601 *level = -1; 602 return (NULL); 603 } 604 605 l1 = pmap_l0_to_l1(l0, va); 606 desc = pmap_load(l1) & ATTR_DESCR_MASK; 607 if (desc != L1_TABLE) { 608 *level = 0; 609 return (l0); 610 } 611 612 l2 = pmap_l1_to_l2(l1, va); 613 desc = pmap_load(l2) & ATTR_DESCR_MASK; 614 if (desc != L2_TABLE) { 615 *level = 1; 616 return (l1); 617 } 618 619 *level = 2; 620 return (l2); 621 } 622 623 /* 624 * Returns the lowest valid pte block or table entry for a given virtual 625 * address. If there are no valid entries return NULL and set the level to 626 * the first invalid level. 627 */ 628 static __inline pt_entry_t * 629 pmap_pte(pmap_t pmap, vm_offset_t va, int *level) 630 { 631 pd_entry_t *l1, *l2, desc; 632 pt_entry_t *l3; 633 634 l1 = pmap_l1(pmap, va); 635 if (l1 == NULL) { 636 *level = 0; 637 return (NULL); 638 } 639 desc = pmap_load(l1) & ATTR_DESCR_MASK; 640 if (desc == L1_BLOCK) { 641 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 642 *level = 1; 643 return (l1); 644 } 645 646 if (desc != L1_TABLE) { 647 *level = 1; 648 return (NULL); 649 } 650 651 l2 = pmap_l1_to_l2(l1, va); 652 desc = pmap_load(l2) & ATTR_DESCR_MASK; 653 if (desc == L2_BLOCK) { 654 *level = 2; 655 return (l2); 656 } 657 658 if (desc != L2_TABLE) { 659 *level = 2; 660 return (NULL); 661 } 662 663 *level = 3; 664 l3 = pmap_l2_to_l3(l2, va); 665 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) 666 return (NULL); 667 668 return (l3); 669 } 670 671 /* 672 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified 673 * level that maps the specified virtual address, then a pointer to that entry 674 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled 675 * and a diagnostic message is provided, in which case this function panics. 676 */ 677 static __always_inline pt_entry_t * 678 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag) 679 { 680 pd_entry_t *l0p, *l1p, *l2p; 681 pt_entry_t desc, *l3p; 682 int walk_level __diagused; 683 684 KASSERT(level >= 0 && level < 4, 685 ("%s: %s passed an out-of-range level (%d)", __func__, diag, 686 level)); 687 l0p = pmap_l0(pmap, va); 688 desc = pmap_load(l0p) & ATTR_DESCR_MASK; 689 if (desc == L0_TABLE && level > 0) { 690 l1p = pmap_l0_to_l1(l0p, va); 691 desc = pmap_load(l1p) & ATTR_DESCR_MASK; 692 if (desc == L1_BLOCK && level == 1) { 693 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 694 return (l1p); 695 } 696 if (desc == L1_TABLE && level > 1) { 697 l2p = pmap_l1_to_l2(l1p, va); 698 desc = pmap_load(l2p) & ATTR_DESCR_MASK; 699 if (desc == L2_BLOCK && level == 2) 700 return (l2p); 701 else if (desc == L2_TABLE && level > 2) { 702 l3p = pmap_l2_to_l3(l2p, va); 703 desc = pmap_load(l3p) & ATTR_DESCR_MASK; 704 if (desc == L3_PAGE && level == 3) 705 return (l3p); 706 else 707 walk_level = 3; 708 } else 709 walk_level = 2; 710 } else 711 walk_level = 1; 712 } else 713 walk_level = 0; 714 KASSERT(diag == NULL, 715 ("%s: va %#lx not mapped at level %d, desc %ld at level %d", 716 diag, va, level, desc, walk_level)); 717 return (NULL); 718 } 719 720 bool 721 pmap_ps_enabled(pmap_t pmap) 722 { 723 /* 724 * Promotion requires a hypervisor call when the kernel is running 725 * in EL1. To stop this disable superpage support on non-stage 1 726 * pmaps for now. 727 */ 728 if (pmap->pm_stage != PM_STAGE1) 729 return (false); 730 731 return (superpages_enabled != 0); 732 } 733 734 bool 735 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, 736 pd_entry_t **l2, pt_entry_t **l3) 737 { 738 pd_entry_t *l0p, *l1p, *l2p; 739 740 if (pmap->pm_l0 == NULL) 741 return (false); 742 743 l0p = pmap_l0(pmap, va); 744 *l0 = l0p; 745 746 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) 747 return (false); 748 749 l1p = pmap_l0_to_l1(l0p, va); 750 *l1 = l1p; 751 752 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { 753 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 754 *l2 = NULL; 755 *l3 = NULL; 756 return (true); 757 } 758 759 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) 760 return (false); 761 762 l2p = pmap_l1_to_l2(l1p, va); 763 *l2 = l2p; 764 765 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { 766 *l3 = NULL; 767 return (true); 768 } 769 770 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) 771 return (false); 772 773 *l3 = pmap_l2_to_l3(l2p, va); 774 775 return (true); 776 } 777 778 static __inline int 779 pmap_l3_valid(pt_entry_t l3) 780 { 781 782 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); 783 } 784 785 CTASSERT(L1_BLOCK == L2_BLOCK); 786 787 static pt_entry_t 788 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) 789 { 790 pt_entry_t val; 791 792 if (pmap->pm_stage == PM_STAGE1) { 793 val = ATTR_S1_IDX(memattr); 794 if (memattr == VM_MEMATTR_DEVICE) 795 val |= ATTR_S1_XN; 796 return (val); 797 } 798 799 val = 0; 800 801 switch (memattr) { 802 case VM_MEMATTR_DEVICE: 803 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | 804 ATTR_S2_XN(ATTR_S2_XN_ALL)); 805 case VM_MEMATTR_UNCACHEABLE: 806 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); 807 case VM_MEMATTR_WRITE_BACK: 808 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); 809 case VM_MEMATTR_WRITE_THROUGH: 810 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); 811 default: 812 panic("%s: invalid memory attribute %x", __func__, memattr); 813 } 814 } 815 816 static pt_entry_t 817 pmap_pte_prot(pmap_t pmap, vm_prot_t prot) 818 { 819 pt_entry_t val; 820 821 val = 0; 822 if (pmap->pm_stage == PM_STAGE1) { 823 if ((prot & VM_PROT_EXECUTE) == 0) 824 val |= ATTR_S1_XN; 825 if ((prot & VM_PROT_WRITE) == 0) 826 val |= ATTR_S1_AP(ATTR_S1_AP_RO); 827 } else { 828 if ((prot & VM_PROT_WRITE) != 0) 829 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 830 if ((prot & VM_PROT_READ) != 0) 831 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); 832 if ((prot & VM_PROT_EXECUTE) == 0) 833 val |= ATTR_S2_XN(ATTR_S2_XN_ALL); 834 } 835 836 return (val); 837 } 838 839 /* 840 * Checks if the PTE is dirty. 841 */ 842 static inline int 843 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) 844 { 845 846 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); 847 848 if (pmap->pm_stage == PM_STAGE1) { 849 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, 850 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); 851 852 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 853 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); 854 } 855 856 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 857 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); 858 } 859 860 static __inline void 861 pmap_resident_count_inc(pmap_t pmap, int count) 862 { 863 864 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 865 pmap->pm_stats.resident_count += count; 866 } 867 868 static __inline void 869 pmap_resident_count_dec(pmap_t pmap, int count) 870 { 871 872 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 873 KASSERT(pmap->pm_stats.resident_count >= count, 874 ("pmap %p resident count underflow %ld %d", pmap, 875 pmap->pm_stats.resident_count, count)); 876 pmap->pm_stats.resident_count -= count; 877 } 878 879 static vm_paddr_t 880 pmap_early_vtophys(vm_offset_t va) 881 { 882 vm_paddr_t pa_page; 883 884 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK; 885 return (pa_page | (va & PAR_LOW_MASK)); 886 } 887 888 /* State of the bootstrapped DMAP page tables */ 889 struct pmap_bootstrap_state { 890 pt_entry_t *l1; 891 pt_entry_t *l2; 892 pt_entry_t *l3; 893 vm_offset_t freemempos; 894 vm_offset_t va; 895 vm_paddr_t pa; 896 pt_entry_t table_attrs; 897 u_int l0_slot; 898 u_int l1_slot; 899 u_int l2_slot; 900 bool dmap_valid; 901 }; 902 903 /* The bootstrap state */ 904 static struct pmap_bootstrap_state bs_state = { 905 .l1 = NULL, 906 .l2 = NULL, 907 .l3 = NULL, 908 .table_attrs = TATTR_PXN_TABLE, 909 .l0_slot = L0_ENTRIES, 910 .l1_slot = Ln_ENTRIES, 911 .l2_slot = Ln_ENTRIES, 912 .dmap_valid = false, 913 }; 914 915 static void 916 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state) 917 { 918 vm_paddr_t l1_pa; 919 pd_entry_t l0e; 920 u_int l0_slot; 921 922 /* Link the level 0 table to a level 1 table */ 923 l0_slot = pmap_l0_index(state->va); 924 if (l0_slot != state->l0_slot) { 925 /* 926 * Make sure we move from a low address to high address 927 * before the DMAP region is ready. This ensures we never 928 * modify an existing mapping until we can map from a 929 * physical address to a virtual address. 930 */ 931 MPASS(state->l0_slot < l0_slot || 932 state->l0_slot == L0_ENTRIES || 933 state->dmap_valid); 934 935 /* Reset lower levels */ 936 state->l2 = NULL; 937 state->l3 = NULL; 938 state->l1_slot = Ln_ENTRIES; 939 state->l2_slot = Ln_ENTRIES; 940 941 /* Check the existing L0 entry */ 942 state->l0_slot = l0_slot; 943 if (state->dmap_valid) { 944 l0e = pagetable_l0_ttbr1[l0_slot]; 945 if ((l0e & ATTR_DESCR_VALID) != 0) { 946 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE); 947 l1_pa = PTE_TO_PHYS(l0e); 948 state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa); 949 return; 950 } 951 } 952 953 /* Create a new L0 table entry */ 954 state->l1 = (pt_entry_t *)state->freemempos; 955 memset(state->l1, 0, PAGE_SIZE); 956 state->freemempos += PAGE_SIZE; 957 958 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1); 959 MPASS((l1_pa & Ln_TABLE_MASK) == 0); 960 MPASS(pagetable_l0_ttbr1[l0_slot] == 0); 961 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) | 962 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE); 963 } 964 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__)); 965 } 966 967 static void 968 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state) 969 { 970 vm_paddr_t l2_pa; 971 pd_entry_t l1e; 972 u_int l1_slot; 973 974 /* Make sure there is a valid L0 -> L1 table */ 975 pmap_bootstrap_l0_table(state); 976 977 /* Link the level 1 table to a level 2 table */ 978 l1_slot = pmap_l1_index(state->va); 979 if (l1_slot != state->l1_slot) { 980 /* See pmap_bootstrap_l0_table for a description */ 981 MPASS(state->l1_slot < l1_slot || 982 state->l1_slot == Ln_ENTRIES || 983 state->dmap_valid); 984 985 /* Reset lower levels */ 986 state->l3 = NULL; 987 state->l2_slot = Ln_ENTRIES; 988 989 /* Check the existing L1 entry */ 990 state->l1_slot = l1_slot; 991 if (state->dmap_valid) { 992 l1e = state->l1[l1_slot]; 993 if ((l1e & ATTR_DESCR_VALID) != 0) { 994 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE); 995 l2_pa = PTE_TO_PHYS(l1e); 996 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa); 997 return; 998 } 999 } 1000 1001 /* Create a new L1 table entry */ 1002 state->l2 = (pt_entry_t *)state->freemempos; 1003 memset(state->l2, 0, PAGE_SIZE); 1004 state->freemempos += PAGE_SIZE; 1005 1006 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2); 1007 MPASS((l2_pa & Ln_TABLE_MASK) == 0); 1008 MPASS(state->l1[l1_slot] == 0); 1009 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) | 1010 state->table_attrs | L1_TABLE); 1011 } 1012 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__)); 1013 } 1014 1015 static void 1016 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state) 1017 { 1018 vm_paddr_t l3_pa; 1019 pd_entry_t l2e; 1020 u_int l2_slot; 1021 1022 /* Make sure there is a valid L1 -> L2 table */ 1023 pmap_bootstrap_l1_table(state); 1024 1025 /* Link the level 2 table to a level 3 table */ 1026 l2_slot = pmap_l2_index(state->va); 1027 if (l2_slot != state->l2_slot) { 1028 /* See pmap_bootstrap_l0_table for a description */ 1029 MPASS(state->l2_slot < l2_slot || 1030 state->l2_slot == Ln_ENTRIES || 1031 state->dmap_valid); 1032 1033 /* Check the existing L2 entry */ 1034 state->l2_slot = l2_slot; 1035 if (state->dmap_valid) { 1036 l2e = state->l2[l2_slot]; 1037 if ((l2e & ATTR_DESCR_VALID) != 0) { 1038 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE); 1039 l3_pa = PTE_TO_PHYS(l2e); 1040 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa); 1041 return; 1042 } 1043 } 1044 1045 /* Create a new L2 table entry */ 1046 state->l3 = (pt_entry_t *)state->freemempos; 1047 memset(state->l3, 0, PAGE_SIZE); 1048 state->freemempos += PAGE_SIZE; 1049 1050 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3); 1051 MPASS((l3_pa & Ln_TABLE_MASK) == 0); 1052 MPASS(state->l2[l2_slot] == 0); 1053 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) | 1054 state->table_attrs | L2_TABLE); 1055 } 1056 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__)); 1057 } 1058 1059 static void 1060 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i) 1061 { 1062 u_int l2_slot; 1063 bool first; 1064 1065 if ((physmap[i + 1] - state->pa) < L2_SIZE) 1066 return; 1067 1068 /* Make sure there is a valid L1 table */ 1069 pmap_bootstrap_l1_table(state); 1070 1071 MPASS((state->va & L2_OFFSET) == 0); 1072 for (first = true; 1073 state->va < DMAP_MAX_ADDRESS && 1074 (physmap[i + 1] - state->pa) >= L2_SIZE; 1075 state->va += L2_SIZE, state->pa += L2_SIZE) { 1076 /* 1077 * Stop if we are about to walk off the end of what the 1078 * current L1 slot can address. 1079 */ 1080 if (!first && (state->pa & L1_OFFSET) == 0) 1081 break; 1082 1083 first = false; 1084 l2_slot = pmap_l2_index(state->va); 1085 MPASS((state->pa & L2_OFFSET) == 0); 1086 MPASS(state->l2[l2_slot] == 0); 1087 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) | 1088 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | 1089 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 1090 } 1091 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS)); 1092 } 1093 1094 static void 1095 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i) 1096 { 1097 u_int l3_slot; 1098 bool first; 1099 1100 if ((physmap[i + 1] - state->pa) < L3_SIZE) 1101 return; 1102 1103 /* Make sure there is a valid L2 table */ 1104 pmap_bootstrap_l2_table(state); 1105 1106 MPASS((state->va & L3_OFFSET) == 0); 1107 for (first = true; 1108 state->va < DMAP_MAX_ADDRESS && 1109 (physmap[i + 1] - state->pa) >= L3_SIZE; 1110 state->va += L3_SIZE, state->pa += L3_SIZE) { 1111 /* 1112 * Stop if we are about to walk off the end of what the 1113 * current L2 slot can address. 1114 */ 1115 if (!first && (state->pa & L2_OFFSET) == 0) 1116 break; 1117 1118 first = false; 1119 l3_slot = pmap_l3_index(state->va); 1120 MPASS((state->pa & L3_OFFSET) == 0); 1121 MPASS(state->l3[l3_slot] == 0); 1122 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) | 1123 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | 1124 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L3_PAGE); 1125 } 1126 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS)); 1127 } 1128 1129 static void 1130 pmap_bootstrap_dmap(vm_paddr_t min_pa) 1131 { 1132 int i; 1133 1134 dmap_phys_base = min_pa & ~L1_OFFSET; 1135 dmap_phys_max = 0; 1136 dmap_max_addr = 0; 1137 1138 for (i = 0; i < (physmap_idx * 2); i += 2) { 1139 bs_state.pa = physmap[i] & ~L3_OFFSET; 1140 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS; 1141 1142 /* Create L3 mappings at the start of the region */ 1143 if ((bs_state.pa & L2_OFFSET) != 0) 1144 pmap_bootstrap_l3_page(&bs_state, i); 1145 MPASS(bs_state.pa <= physmap[i + 1]); 1146 1147 if (L1_BLOCKS_SUPPORTED) { 1148 /* Create L2 mappings at the start of the region */ 1149 if ((bs_state.pa & L1_OFFSET) != 0) 1150 pmap_bootstrap_l2_block(&bs_state, i); 1151 MPASS(bs_state.pa <= physmap[i + 1]); 1152 1153 /* Create the main L1 block mappings */ 1154 for (; bs_state.va < DMAP_MAX_ADDRESS && 1155 (physmap[i + 1] - bs_state.pa) >= L1_SIZE; 1156 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) { 1157 /* Make sure there is a valid L1 table */ 1158 pmap_bootstrap_l0_table(&bs_state); 1159 MPASS((bs_state.pa & L1_OFFSET) == 0); 1160 pmap_store( 1161 &bs_state.l1[pmap_l1_index(bs_state.va)], 1162 PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT | 1163 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 1164 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK); 1165 } 1166 MPASS(bs_state.pa <= physmap[i + 1]); 1167 1168 /* Create L2 mappings at the end of the region */ 1169 pmap_bootstrap_l2_block(&bs_state, i); 1170 } else { 1171 while (bs_state.va < DMAP_MAX_ADDRESS && 1172 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) { 1173 pmap_bootstrap_l2_block(&bs_state, i); 1174 } 1175 } 1176 MPASS(bs_state.pa <= physmap[i + 1]); 1177 1178 /* Create L3 mappings at the end of the region */ 1179 pmap_bootstrap_l3_page(&bs_state, i); 1180 MPASS(bs_state.pa == physmap[i + 1]); 1181 1182 if (bs_state.pa > dmap_phys_max) { 1183 dmap_phys_max = bs_state.pa; 1184 dmap_max_addr = bs_state.va; 1185 } 1186 } 1187 1188 cpu_tlb_flushID(); 1189 } 1190 1191 static void 1192 pmap_bootstrap_l2(vm_offset_t va) 1193 { 1194 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); 1195 1196 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/ 1197 bs_state.va = va; 1198 1199 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE) 1200 pmap_bootstrap_l1_table(&bs_state); 1201 } 1202 1203 static void 1204 pmap_bootstrap_l3(vm_offset_t va) 1205 { 1206 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 1207 1208 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/ 1209 bs_state.va = va; 1210 1211 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE) 1212 pmap_bootstrap_l2_table(&bs_state); 1213 } 1214 1215 #ifdef KASAN 1216 static void 1217 pmap_bootstrap_allocate_kasan_l2(vm_paddr_t start_pa, vm_paddr_t end_pa, 1218 vm_offset_t *start_va, int *nkasan_l2) 1219 { 1220 int i; 1221 vm_paddr_t pa; 1222 vm_offset_t va; 1223 pd_entry_t *l2; 1224 1225 va = *start_va; 1226 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE); 1227 l2 = pmap_l2(kernel_pmap, va); 1228 1229 for (i = 0; pa >= start_pa && i < *nkasan_l2; 1230 i++, va += L2_SIZE, pa -= L2_SIZE, l2++) { 1231 /* 1232 * KASAN stack checking results in us having already allocated 1233 * part of our shadow map, so we can just skip those segments. 1234 */ 1235 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) { 1236 pa += L2_SIZE; 1237 continue; 1238 } 1239 1240 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK); 1241 } 1242 1243 /* 1244 * Ended the allocation due to start_pa constraint, rather than because 1245 * we allocated everything. Adjust back up to the start_pa and remove 1246 * the invalid L2 block from our accounting. 1247 */ 1248 if (pa < start_pa) { 1249 va += L2_SIZE; 1250 i--; 1251 pa = start_pa; 1252 } 1253 1254 bzero((void *)PHYS_TO_DMAP(pa), i * L2_SIZE); 1255 physmem_exclude_region(pa, i * L2_SIZE, EXFLAG_NOALLOC); 1256 1257 *nkasan_l2 -= i; 1258 *start_va = va; 1259 } 1260 #endif 1261 1262 /* 1263 * Bootstrap the system enough to run with virtual memory. 1264 */ 1265 void 1266 pmap_bootstrap(vm_size_t kernlen) 1267 { 1268 vm_offset_t dpcpu, msgbufpv; 1269 vm_paddr_t start_pa, pa, min_pa; 1270 int i; 1271 1272 /* Verify that the ASID is set through TTBR0. */ 1273 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0, 1274 ("pmap_bootstrap: TCR_EL1.A1 != 0")); 1275 1276 /* Set this early so we can use the pagetable walking functions */ 1277 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1; 1278 PMAP_LOCK_INIT(kernel_pmap); 1279 kernel_pmap->pm_l0_paddr = 1280 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0); 1281 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1282 vm_radix_init(&kernel_pmap->pm_root); 1283 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN); 1284 kernel_pmap->pm_stage = PM_STAGE1; 1285 kernel_pmap->pm_levels = 4; 1286 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr; 1287 kernel_pmap->pm_asid_set = &asids; 1288 1289 /* Assume the address we were loaded to is a valid physical address */ 1290 min_pa = pmap_early_vtophys(KERNBASE); 1291 1292 physmap_idx = physmem_avail(physmap, nitems(physmap)); 1293 physmap_idx /= 2; 1294 1295 /* 1296 * Find the minimum physical address. physmap is sorted, 1297 * but may contain empty ranges. 1298 */ 1299 for (i = 0; i < physmap_idx * 2; i += 2) { 1300 if (physmap[i] == physmap[i + 1]) 1301 continue; 1302 if (physmap[i] <= min_pa) 1303 min_pa = physmap[i]; 1304 } 1305 1306 bs_state.freemempos = KERNBASE + kernlen; 1307 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE); 1308 1309 /* Create a direct map region early so we can use it for pa -> va */ 1310 pmap_bootstrap_dmap(min_pa); 1311 bs_state.dmap_valid = true; 1312 /* 1313 * We only use PXN when we know nothing will be executed from it, e.g. 1314 * the DMAP region. 1315 */ 1316 bs_state.table_attrs &= ~TATTR_PXN_TABLE; 1317 1318 start_pa = pa = pmap_early_vtophys(KERNBASE); 1319 1320 /* 1321 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the 1322 * loader allocated the first and only l2 page table page used to map 1323 * the kernel, preloaded files and module metadata. 1324 */ 1325 pmap_bootstrap_l2(KERNBASE + L1_SIZE); 1326 /* And the l3 tables for the early devmap */ 1327 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE)); 1328 1329 cpu_tlb_flushID(); 1330 1331 #define alloc_pages(var, np) \ 1332 (var) = bs_state.freemempos; \ 1333 bs_state.freemempos += (np * PAGE_SIZE); \ 1334 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 1335 1336 /* Allocate dynamic per-cpu area. */ 1337 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 1338 dpcpu_init((void *)dpcpu, 0); 1339 1340 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 1341 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 1342 msgbufp = (void *)msgbufpv; 1343 1344 /* Reserve some VA space for early BIOS/ACPI mapping */ 1345 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE); 1346 1347 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; 1348 virtual_avail = roundup2(virtual_avail, L1_SIZE); 1349 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); 1350 kernel_vm_end = virtual_avail; 1351 1352 pa = pmap_early_vtophys(bs_state.freemempos); 1353 1354 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); 1355 1356 cpu_tlb_flushID(); 1357 } 1358 1359 #if defined(KASAN) 1360 /* 1361 * Finish constructing the initial shadow map: 1362 * - Count how many pages from KERNBASE to virtual_avail (scaled for 1363 * shadow map) 1364 * - Map that entire range using L2 superpages. 1365 */ 1366 void 1367 pmap_bootstrap_san(void) 1368 { 1369 vm_offset_t va; 1370 vm_paddr_t kernstart; 1371 int i, shadow_npages, nkasan_l2; 1372 1373 kernstart = pmap_early_vtophys(KERNBASE); 1374 1375 /* 1376 * Rebuild physmap one more time, we may have excluded more regions from 1377 * allocation since pmap_bootstrap(). 1378 */ 1379 bzero(physmap, sizeof(physmap)); 1380 physmap_idx = physmem_avail(physmap, nitems(physmap)); 1381 physmap_idx /= 2; 1382 1383 shadow_npages = (virtual_avail - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE; 1384 shadow_npages = howmany(shadow_npages, KASAN_SHADOW_SCALE); 1385 nkasan_l2 = howmany(shadow_npages, Ln_ENTRIES); 1386 1387 /* Map the valid KVA up to this point. */ 1388 va = KASAN_MIN_ADDRESS; 1389 1390 /* 1391 * Find a slot in the physmap large enough for what we needed. We try to put 1392 * the shadow map as high up as we can to avoid depleting the lower 4GB in case 1393 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA. 1394 */ 1395 for (i = (physmap_idx * 2) - 2; i >= 0 && nkasan_l2 > 0; i -= 2) { 1396 vm_paddr_t plow, phigh; 1397 1398 /* L2 mappings must be backed by memory that is L2-aligned */ 1399 plow = roundup2(physmap[i], L2_SIZE); 1400 phigh = physmap[i + 1]; 1401 if (plow >= phigh) 1402 continue; 1403 if (kernstart >= plow && kernstart < phigh) 1404 phigh = kernstart; 1405 if (phigh - plow >= L2_SIZE) 1406 pmap_bootstrap_allocate_kasan_l2(plow, phigh, &va, 1407 &nkasan_l2); 1408 } 1409 1410 if (nkasan_l2 != 0) 1411 panic("Could not find phys region for shadow map"); 1412 1413 /* 1414 * Done. We should now have a valid shadow address mapped for all KVA 1415 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus, 1416 * shadow accesses by the kasan(9) runtime will succeed for this range. 1417 * When the kernel virtual address range is later expanded, as will 1418 * happen in vm_mem_init(), the shadow map will be grown as well. This 1419 * is handled by pmap_san_enter(). 1420 */ 1421 } 1422 #endif 1423 1424 /* 1425 * Initialize a vm_page's machine-dependent fields. 1426 */ 1427 void 1428 pmap_page_init(vm_page_t m) 1429 { 1430 1431 TAILQ_INIT(&m->md.pv_list); 1432 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 1433 } 1434 1435 static void 1436 pmap_init_asids(struct asid_set *set, int bits) 1437 { 1438 int i; 1439 1440 set->asid_bits = bits; 1441 1442 /* 1443 * We may be too early in the overall initialization process to use 1444 * bit_alloc(). 1445 */ 1446 set->asid_set_size = 1 << set->asid_bits; 1447 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size), 1448 M_WAITOK | M_ZERO); 1449 for (i = 0; i < ASID_FIRST_AVAILABLE; i++) 1450 bit_set(set->asid_set, i); 1451 set->asid_next = ASID_FIRST_AVAILABLE; 1452 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN); 1453 } 1454 1455 static void 1456 pmap_init_pv_table(void) 1457 { 1458 struct vm_phys_seg *seg, *next_seg; 1459 struct pmap_large_md_page *pvd; 1460 vm_size_t s; 1461 int domain, i, j, pages; 1462 1463 /* 1464 * We strongly depend on the size being a power of two, so the assert 1465 * is overzealous. However, should the struct be resized to a 1466 * different power of two, the code below needs to be revisited. 1467 */ 1468 CTASSERT((sizeof(*pvd) == 64)); 1469 1470 /* 1471 * Calculate the size of the array. 1472 */ 1473 s = 0; 1474 for (i = 0; i < vm_phys_nsegs; i++) { 1475 seg = &vm_phys_segs[i]; 1476 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1477 pmap_l2_pindex(seg->start); 1478 s += round_page(pages * sizeof(*pvd)); 1479 } 1480 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 1481 if (pv_table == NULL) 1482 panic("%s: kva_alloc failed\n", __func__); 1483 1484 /* 1485 * Iterate physical segments to allocate domain-local memory for PV 1486 * list headers. 1487 */ 1488 pvd = pv_table; 1489 for (i = 0; i < vm_phys_nsegs; i++) { 1490 seg = &vm_phys_segs[i]; 1491 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1492 pmap_l2_pindex(seg->start); 1493 domain = seg->domain; 1494 1495 s = round_page(pages * sizeof(*pvd)); 1496 1497 for (j = 0; j < s; j += PAGE_SIZE) { 1498 vm_page_t m = vm_page_alloc_noobj_domain(domain, 1499 VM_ALLOC_ZERO); 1500 if (m == NULL) 1501 panic("failed to allocate PV table page"); 1502 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 1503 } 1504 1505 for (j = 0; j < s / sizeof(*pvd); j++) { 1506 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 1507 TAILQ_INIT(&pvd->pv_page.pv_list); 1508 pvd++; 1509 } 1510 } 1511 pvd = &pv_dummy_large; 1512 memset(pvd, 0, sizeof(*pvd)); 1513 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 1514 TAILQ_INIT(&pvd->pv_page.pv_list); 1515 1516 /* 1517 * Set pointers from vm_phys_segs to pv_table. 1518 */ 1519 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) { 1520 seg = &vm_phys_segs[i]; 1521 seg->md_first = pvd; 1522 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1523 pmap_l2_pindex(seg->start); 1524 1525 /* 1526 * If there is a following segment, and the final 1527 * superpage of this segment and the initial superpage 1528 * of the next segment are the same then adjust the 1529 * pv_table entry for that next segment down by one so 1530 * that the pv_table entries will be shared. 1531 */ 1532 if (i + 1 < vm_phys_nsegs) { 1533 next_seg = &vm_phys_segs[i + 1]; 1534 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 == 1535 pmap_l2_pindex(next_seg->start)) { 1536 pvd--; 1537 } 1538 } 1539 } 1540 } 1541 1542 /* 1543 * Initialize the pmap module. 1544 * Called by vm_init, to initialize any structures that the pmap 1545 * system needs to map virtual memory. 1546 */ 1547 void 1548 pmap_init(void) 1549 { 1550 uint64_t mmfr1; 1551 int i, vmid_bits; 1552 1553 /* 1554 * Are large page mappings enabled? 1555 */ 1556 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 1557 if (superpages_enabled) { 1558 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1559 ("pmap_init: can't assign to pagesizes[1]")); 1560 pagesizes[1] = L2_SIZE; 1561 if (L1_BLOCKS_SUPPORTED) { 1562 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 1563 ("pmap_init: can't assign to pagesizes[2]")); 1564 pagesizes[2] = L1_SIZE; 1565 } 1566 } 1567 1568 /* 1569 * Initialize the ASID allocator. 1570 */ 1571 pmap_init_asids(&asids, 1572 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8); 1573 1574 if (has_hyp()) { 1575 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); 1576 vmid_bits = 8; 1577 1578 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) == 1579 ID_AA64MMFR1_VMIDBits_16) 1580 vmid_bits = 16; 1581 pmap_init_asids(&vmids, vmid_bits); 1582 } 1583 1584 /* 1585 * Initialize pv chunk lists. 1586 */ 1587 for (i = 0; i < PMAP_MEMDOM; i++) { 1588 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, 1589 MTX_DEF); 1590 TAILQ_INIT(&pv_chunks[i].pvc_list); 1591 } 1592 pmap_init_pv_table(); 1593 1594 vm_initialized = 1; 1595 } 1596 1597 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1598 "2MB page mapping counters"); 1599 1600 static u_long pmap_l2_demotions; 1601 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 1602 &pmap_l2_demotions, 0, "2MB page demotions"); 1603 1604 static u_long pmap_l2_mappings; 1605 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 1606 &pmap_l2_mappings, 0, "2MB page mappings"); 1607 1608 static u_long pmap_l2_p_failures; 1609 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 1610 &pmap_l2_p_failures, 0, "2MB page promotion failures"); 1611 1612 static u_long pmap_l2_promotions; 1613 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 1614 &pmap_l2_promotions, 0, "2MB page promotions"); 1615 1616 /* 1617 * If the given value for "final_only" is false, then any cached intermediate- 1618 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to 1619 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry. 1620 * Otherwise, just the cached final-level entry is invalidated. 1621 */ 1622 static __inline void 1623 pmap_s1_invalidate_kernel(uint64_t r, bool final_only) 1624 { 1625 if (final_only) 1626 __asm __volatile("tlbi vaale1is, %0" : : "r" (r)); 1627 else 1628 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1629 } 1630 1631 static __inline void 1632 pmap_s1_invalidate_user(uint64_t r, bool final_only) 1633 { 1634 if (final_only) 1635 __asm __volatile("tlbi vale1is, %0" : : "r" (r)); 1636 else 1637 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1638 } 1639 1640 /* 1641 * Invalidates any cached final- and optionally intermediate-level TLB entries 1642 * for the specified virtual address in the given virtual address space. 1643 */ 1644 static __inline void 1645 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1646 { 1647 uint64_t r; 1648 1649 PMAP_ASSERT_STAGE1(pmap); 1650 1651 dsb(ishst); 1652 r = TLBI_VA(va); 1653 if (pmap == kernel_pmap) { 1654 pmap_s1_invalidate_kernel(r, final_only); 1655 } else { 1656 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1657 pmap_s1_invalidate_user(r, final_only); 1658 } 1659 dsb(ish); 1660 isb(); 1661 } 1662 1663 static __inline void 1664 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1665 { 1666 PMAP_ASSERT_STAGE2(pmap); 1667 MPASS(pmap_stage2_invalidate_range != NULL); 1668 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE, 1669 final_only); 1670 } 1671 1672 static __inline void 1673 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1674 { 1675 if (pmap->pm_stage == PM_STAGE1) 1676 pmap_s1_invalidate_page(pmap, va, final_only); 1677 else 1678 pmap_s2_invalidate_page(pmap, va, final_only); 1679 } 1680 1681 /* 1682 * Invalidates any cached final- and optionally intermediate-level TLB entries 1683 * for the specified virtual address range in the given virtual address space. 1684 */ 1685 static __inline void 1686 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1687 bool final_only) 1688 { 1689 uint64_t end, r, start; 1690 1691 PMAP_ASSERT_STAGE1(pmap); 1692 1693 dsb(ishst); 1694 if (pmap == kernel_pmap) { 1695 start = TLBI_VA(sva); 1696 end = TLBI_VA(eva); 1697 for (r = start; r < end; r += TLBI_VA_L3_INCR) 1698 pmap_s1_invalidate_kernel(r, final_only); 1699 } else { 1700 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1701 start |= TLBI_VA(sva); 1702 end |= TLBI_VA(eva); 1703 for (r = start; r < end; r += TLBI_VA_L3_INCR) 1704 pmap_s1_invalidate_user(r, final_only); 1705 } 1706 dsb(ish); 1707 isb(); 1708 } 1709 1710 static __inline void 1711 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1712 bool final_only) 1713 { 1714 PMAP_ASSERT_STAGE2(pmap); 1715 MPASS(pmap_stage2_invalidate_range != NULL); 1716 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only); 1717 } 1718 1719 static __inline void 1720 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1721 bool final_only) 1722 { 1723 if (pmap->pm_stage == PM_STAGE1) 1724 pmap_s1_invalidate_range(pmap, sva, eva, final_only); 1725 else 1726 pmap_s2_invalidate_range(pmap, sva, eva, final_only); 1727 } 1728 1729 /* 1730 * Invalidates all cached intermediate- and final-level TLB entries for the 1731 * given virtual address space. 1732 */ 1733 static __inline void 1734 pmap_s1_invalidate_all(pmap_t pmap) 1735 { 1736 uint64_t r; 1737 1738 PMAP_ASSERT_STAGE1(pmap); 1739 1740 dsb(ishst); 1741 if (pmap == kernel_pmap) { 1742 __asm __volatile("tlbi vmalle1is"); 1743 } else { 1744 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1745 __asm __volatile("tlbi aside1is, %0" : : "r" (r)); 1746 } 1747 dsb(ish); 1748 isb(); 1749 } 1750 1751 static __inline void 1752 pmap_s2_invalidate_all(pmap_t pmap) 1753 { 1754 PMAP_ASSERT_STAGE2(pmap); 1755 MPASS(pmap_stage2_invalidate_all != NULL); 1756 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap)); 1757 } 1758 1759 static __inline void 1760 pmap_invalidate_all(pmap_t pmap) 1761 { 1762 if (pmap->pm_stage == PM_STAGE1) 1763 pmap_s1_invalidate_all(pmap); 1764 else 1765 pmap_s2_invalidate_all(pmap); 1766 } 1767 1768 /* 1769 * Routine: pmap_extract 1770 * Function: 1771 * Extract the physical page address associated 1772 * with the given map/virtual_address pair. 1773 */ 1774 vm_paddr_t 1775 pmap_extract(pmap_t pmap, vm_offset_t va) 1776 { 1777 pt_entry_t *pte, tpte; 1778 vm_paddr_t pa; 1779 int lvl; 1780 1781 pa = 0; 1782 PMAP_LOCK(pmap); 1783 /* 1784 * Find the block or page map for this virtual address. pmap_pte 1785 * will return either a valid block/page entry, or NULL. 1786 */ 1787 pte = pmap_pte(pmap, va, &lvl); 1788 if (pte != NULL) { 1789 tpte = pmap_load(pte); 1790 pa = PTE_TO_PHYS(tpte); 1791 switch(lvl) { 1792 case 1: 1793 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 1794 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, 1795 ("pmap_extract: Invalid L1 pte found: %lx", 1796 tpte & ATTR_DESCR_MASK)); 1797 pa |= (va & L1_OFFSET); 1798 break; 1799 case 2: 1800 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, 1801 ("pmap_extract: Invalid L2 pte found: %lx", 1802 tpte & ATTR_DESCR_MASK)); 1803 pa |= (va & L2_OFFSET); 1804 break; 1805 case 3: 1806 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, 1807 ("pmap_extract: Invalid L3 pte found: %lx", 1808 tpte & ATTR_DESCR_MASK)); 1809 pa |= (va & L3_OFFSET); 1810 break; 1811 } 1812 } 1813 PMAP_UNLOCK(pmap); 1814 return (pa); 1815 } 1816 1817 /* 1818 * Routine: pmap_extract_and_hold 1819 * Function: 1820 * Atomically extract and hold the physical page 1821 * with the given pmap and virtual address pair 1822 * if that mapping permits the given protection. 1823 */ 1824 vm_page_t 1825 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1826 { 1827 pt_entry_t *pte, tpte; 1828 vm_offset_t off; 1829 vm_page_t m; 1830 int lvl; 1831 bool use; 1832 1833 m = NULL; 1834 PMAP_LOCK(pmap); 1835 pte = pmap_pte(pmap, va, &lvl); 1836 if (pte != NULL) { 1837 tpte = pmap_load(pte); 1838 1839 KASSERT(lvl > 0 && lvl <= 3, 1840 ("pmap_extract_and_hold: Invalid level %d", lvl)); 1841 /* 1842 * Check that the pte is either a L3 page, or a L1 or L2 block 1843 * entry. We can assume L1_BLOCK == L2_BLOCK. 1844 */ 1845 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || 1846 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), 1847 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, 1848 tpte & ATTR_DESCR_MASK)); 1849 1850 use = false; 1851 if ((prot & VM_PROT_WRITE) == 0) 1852 use = true; 1853 else if (pmap->pm_stage == PM_STAGE1 && 1854 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) 1855 use = true; 1856 else if (pmap->pm_stage == PM_STAGE2 && 1857 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 1858 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) 1859 use = true; 1860 1861 if (use) { 1862 switch (lvl) { 1863 case 1: 1864 off = va & L1_OFFSET; 1865 break; 1866 case 2: 1867 off = va & L2_OFFSET; 1868 break; 1869 case 3: 1870 default: 1871 off = 0; 1872 } 1873 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off); 1874 if (m != NULL && !vm_page_wire_mapped(m)) 1875 m = NULL; 1876 } 1877 } 1878 PMAP_UNLOCK(pmap); 1879 return (m); 1880 } 1881 1882 /* 1883 * Walks the page tables to translate a kernel virtual address to a 1884 * physical address. Returns true if the kva is valid and stores the 1885 * physical address in pa if it is not NULL. 1886 * 1887 * See the comment above data_abort() for the rationale for specifying 1888 * NO_PERTHREAD_SSP here. 1889 */ 1890 bool NO_PERTHREAD_SSP 1891 pmap_klookup(vm_offset_t va, vm_paddr_t *pa) 1892 { 1893 pt_entry_t *pte, tpte; 1894 register_t intr; 1895 uint64_t par; 1896 1897 /* 1898 * Disable interrupts so we don't get interrupted between asking 1899 * for address translation, and getting the result back. 1900 */ 1901 intr = intr_disable(); 1902 par = arm64_address_translate_s1e1r(va); 1903 intr_restore(intr); 1904 1905 if (PAR_SUCCESS(par)) { 1906 if (pa != NULL) 1907 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK); 1908 return (true); 1909 } 1910 1911 /* 1912 * Fall back to walking the page table. The address translation 1913 * instruction may fail when the page is in a break-before-make 1914 * sequence. As we only clear the valid bit in said sequence we 1915 * can walk the page table to find the physical address. 1916 */ 1917 1918 pte = pmap_l1(kernel_pmap, va); 1919 if (pte == NULL) 1920 return (false); 1921 1922 /* 1923 * A concurrent pmap_update_entry() will clear the entry's valid bit 1924 * but leave the rest of the entry unchanged. Therefore, we treat a 1925 * non-zero entry as being valid, and we ignore the valid bit when 1926 * determining whether the entry maps a block, page, or table. 1927 */ 1928 tpte = pmap_load(pte); 1929 if (tpte == 0) 1930 return (false); 1931 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 1932 if (pa != NULL) 1933 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET); 1934 return (true); 1935 } 1936 pte = pmap_l1_to_l2(&tpte, va); 1937 tpte = pmap_load(pte); 1938 if (tpte == 0) 1939 return (false); 1940 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 1941 if (pa != NULL) 1942 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET); 1943 return (true); 1944 } 1945 pte = pmap_l2_to_l3(&tpte, va); 1946 tpte = pmap_load(pte); 1947 if (tpte == 0) 1948 return (false); 1949 if (pa != NULL) 1950 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET); 1951 return (true); 1952 } 1953 1954 /* 1955 * Routine: pmap_kextract 1956 * Function: 1957 * Extract the physical page address associated with the given kernel 1958 * virtual address. 1959 */ 1960 vm_paddr_t 1961 pmap_kextract(vm_offset_t va) 1962 { 1963 vm_paddr_t pa; 1964 1965 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 1966 return (DMAP_TO_PHYS(va)); 1967 1968 if (pmap_klookup(va, &pa) == false) 1969 return (0); 1970 return (pa); 1971 } 1972 1973 /*************************************************** 1974 * Low level mapping routines..... 1975 ***************************************************/ 1976 1977 void 1978 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 1979 { 1980 pd_entry_t *pde; 1981 pt_entry_t attr, old_l3e, *pte; 1982 vm_offset_t va; 1983 int lvl; 1984 1985 KASSERT((pa & L3_OFFSET) == 0, 1986 ("pmap_kenter: Invalid physical address")); 1987 KASSERT((sva & L3_OFFSET) == 0, 1988 ("pmap_kenter: Invalid virtual address")); 1989 KASSERT((size & PAGE_MASK) == 0, 1990 ("pmap_kenter: Mapping is not page-sized")); 1991 1992 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 1993 ATTR_KERN_GP | ATTR_S1_IDX(mode) | L3_PAGE; 1994 old_l3e = 0; 1995 va = sva; 1996 while (size != 0) { 1997 pde = pmap_pde(kernel_pmap, va, &lvl); 1998 KASSERT(pde != NULL, 1999 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 2000 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 2001 2002 pte = pmap_l2_to_l3(pde, va); 2003 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr); 2004 2005 va += PAGE_SIZE; 2006 pa += PAGE_SIZE; 2007 size -= PAGE_SIZE; 2008 } 2009 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2010 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2011 else { 2012 /* 2013 * Because the old entries were invalid and the new mappings 2014 * are not executable, an isb is not required. 2015 */ 2016 dsb(ishst); 2017 } 2018 } 2019 2020 void 2021 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 2022 { 2023 2024 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 2025 } 2026 2027 /* 2028 * Remove a page from the kernel pagetables. 2029 */ 2030 void 2031 pmap_kremove(vm_offset_t va) 2032 { 2033 pt_entry_t *pte; 2034 2035 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 2036 pmap_clear(pte); 2037 pmap_s1_invalidate_page(kernel_pmap, va, true); 2038 } 2039 2040 /* 2041 * Remove the specified range of mappings from the kernel address space. 2042 * 2043 * Should only be applied to mappings that were created by pmap_kenter() or 2044 * pmap_kenter_device(). Nothing about this function is actually specific 2045 * to device mappings. 2046 */ 2047 void 2048 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 2049 { 2050 pt_entry_t *pte; 2051 vm_offset_t va; 2052 2053 KASSERT((sva & L3_OFFSET) == 0, 2054 ("pmap_kremove_device: Invalid virtual address")); 2055 KASSERT((size & PAGE_MASK) == 0, 2056 ("pmap_kremove_device: Mapping is not page-sized")); 2057 2058 va = sva; 2059 while (size != 0) { 2060 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 2061 pmap_clear(pte); 2062 2063 va += PAGE_SIZE; 2064 size -= PAGE_SIZE; 2065 } 2066 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2067 } 2068 2069 /* 2070 * Used to map a range of physical addresses into kernel 2071 * virtual address space. 2072 * 2073 * The value passed in '*virt' is a suggested virtual address for 2074 * the mapping. Architectures which can support a direct-mapped 2075 * physical to virtual region can return the appropriate address 2076 * within that region, leaving '*virt' unchanged. Other 2077 * architectures should map the pages starting at '*virt' and 2078 * update '*virt' with the first usable address after the mapped 2079 * region. 2080 */ 2081 vm_offset_t 2082 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2083 { 2084 return PHYS_TO_DMAP(start); 2085 } 2086 2087 /* 2088 * Add a list of wired pages to the kva 2089 * this routine is only used for temporary 2090 * kernel mappings that do not need to have 2091 * page modification or references recorded. 2092 * Note that old mappings are simply written 2093 * over. The page *must* be wired. 2094 * Note: SMP coherent. Uses a ranged shootdown IPI. 2095 */ 2096 void 2097 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2098 { 2099 pd_entry_t *pde; 2100 pt_entry_t attr, old_l3e, pa, *pte; 2101 vm_offset_t va; 2102 vm_page_t m; 2103 int i, lvl; 2104 2105 old_l3e = 0; 2106 va = sva; 2107 for (i = 0; i < count; i++) { 2108 pde = pmap_pde(kernel_pmap, va, &lvl); 2109 KASSERT(pde != NULL, 2110 ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); 2111 KASSERT(lvl == 2, 2112 ("pmap_qenter: Invalid level %d", lvl)); 2113 2114 m = ma[i]; 2115 pa = VM_PAGE_TO_PHYS(m); 2116 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 2117 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; 2118 pte = pmap_l2_to_l3(pde, va); 2119 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr); 2120 2121 va += L3_SIZE; 2122 } 2123 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2124 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2125 else { 2126 /* 2127 * Because the old entries were invalid and the new mappings 2128 * are not executable, an isb is not required. 2129 */ 2130 dsb(ishst); 2131 } 2132 } 2133 2134 /* 2135 * This routine tears out page mappings from the 2136 * kernel -- it is meant only for temporary mappings. 2137 */ 2138 void 2139 pmap_qremove(vm_offset_t sva, int count) 2140 { 2141 pt_entry_t *pte; 2142 vm_offset_t va; 2143 2144 KASSERT(ADDR_IS_CANONICAL(sva), 2145 ("%s: Address not in canonical form: %lx", __func__, sva)); 2146 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva)); 2147 2148 va = sva; 2149 while (count-- > 0) { 2150 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL); 2151 if (pte != NULL) { 2152 pmap_clear(pte); 2153 } 2154 2155 va += PAGE_SIZE; 2156 } 2157 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2158 } 2159 2160 /*************************************************** 2161 * Page table page management routines..... 2162 ***************************************************/ 2163 /* 2164 * Schedule the specified unused page table page to be freed. Specifically, 2165 * add the page to the specified list of pages that will be released to the 2166 * physical memory manager after the TLB has been updated. 2167 */ 2168 static __inline void 2169 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2170 boolean_t set_PG_ZERO) 2171 { 2172 2173 if (set_PG_ZERO) 2174 m->flags |= PG_ZERO; 2175 else 2176 m->flags &= ~PG_ZERO; 2177 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2178 } 2179 2180 /* 2181 * Decrements a page table page's reference count, which is used to record the 2182 * number of valid page table entries within the page. If the reference count 2183 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2184 * page table page was unmapped and FALSE otherwise. 2185 */ 2186 static inline boolean_t 2187 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2188 { 2189 2190 --m->ref_count; 2191 if (m->ref_count == 0) { 2192 _pmap_unwire_l3(pmap, va, m, free); 2193 return (TRUE); 2194 } else 2195 return (FALSE); 2196 } 2197 2198 static void 2199 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2200 { 2201 2202 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2203 /* 2204 * unmap the page table page 2205 */ 2206 if (m->pindex >= (NUL2E + NUL1E)) { 2207 /* l1 page */ 2208 pd_entry_t *l0; 2209 2210 l0 = pmap_l0(pmap, va); 2211 pmap_clear(l0); 2212 } else if (m->pindex >= NUL2E) { 2213 /* l2 page */ 2214 pd_entry_t *l1; 2215 2216 l1 = pmap_l1(pmap, va); 2217 pmap_clear(l1); 2218 } else { 2219 /* l3 page */ 2220 pd_entry_t *l2; 2221 2222 l2 = pmap_l2(pmap, va); 2223 pmap_clear(l2); 2224 } 2225 pmap_resident_count_dec(pmap, 1); 2226 if (m->pindex < NUL2E) { 2227 /* We just released an l3, unhold the matching l2 */ 2228 pd_entry_t *l1, tl1; 2229 vm_page_t l2pg; 2230 2231 l1 = pmap_l1(pmap, va); 2232 tl1 = pmap_load(l1); 2233 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl1)); 2234 pmap_unwire_l3(pmap, va, l2pg, free); 2235 } else if (m->pindex < (NUL2E + NUL1E)) { 2236 /* We just released an l2, unhold the matching l1 */ 2237 pd_entry_t *l0, tl0; 2238 vm_page_t l1pg; 2239 2240 l0 = pmap_l0(pmap, va); 2241 tl0 = pmap_load(l0); 2242 l1pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl0)); 2243 pmap_unwire_l3(pmap, va, l1pg, free); 2244 } 2245 pmap_invalidate_page(pmap, va, false); 2246 2247 /* 2248 * Put page on a list so that it is released after 2249 * *ALL* TLB shootdown is done 2250 */ 2251 pmap_add_delayed_free_list(m, free, TRUE); 2252 } 2253 2254 /* 2255 * After removing a page table entry, this routine is used to 2256 * conditionally free the page, and manage the reference count. 2257 */ 2258 static int 2259 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2260 struct spglist *free) 2261 { 2262 vm_page_t mpte; 2263 2264 KASSERT(ADDR_IS_CANONICAL(va), 2265 ("%s: Address not in canonical form: %lx", __func__, va)); 2266 if (ADDR_IS_KERNEL(va)) 2267 return (0); 2268 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2269 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); 2270 return (pmap_unwire_l3(pmap, va, mpte, free)); 2271 } 2272 2273 /* 2274 * Release a page table page reference after a failed attempt to create a 2275 * mapping. 2276 */ 2277 static void 2278 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 2279 { 2280 struct spglist free; 2281 2282 SLIST_INIT(&free); 2283 if (pmap_unwire_l3(pmap, va, mpte, &free)) 2284 vm_page_free_pages_toq(&free, true); 2285 } 2286 2287 void 2288 pmap_pinit0(pmap_t pmap) 2289 { 2290 2291 PMAP_LOCK_INIT(pmap); 2292 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2293 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 2294 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2295 TAILQ_INIT(&pmap->pm_pvchunk); 2296 vm_radix_init(&pmap->pm_root); 2297 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 2298 pmap->pm_stage = PM_STAGE1; 2299 pmap->pm_levels = 4; 2300 pmap->pm_ttbr = pmap->pm_l0_paddr; 2301 pmap->pm_asid_set = &asids; 2302 2303 PCPU_SET(curpmap, pmap); 2304 } 2305 2306 int 2307 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 2308 { 2309 vm_page_t m; 2310 2311 /* 2312 * allocate the l0 page 2313 */ 2314 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | 2315 VM_ALLOC_ZERO); 2316 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 2317 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2318 2319 TAILQ_INIT(&pmap->pm_pvchunk); 2320 vm_radix_init(&pmap->pm_root); 2321 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2322 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 2323 2324 MPASS(levels == 3 || levels == 4); 2325 pmap->pm_levels = levels; 2326 pmap->pm_stage = stage; 2327 switch (stage) { 2328 case PM_STAGE1: 2329 pmap->pm_asid_set = &asids; 2330 break; 2331 case PM_STAGE2: 2332 pmap->pm_asid_set = &vmids; 2333 break; 2334 default: 2335 panic("%s: Invalid pmap type %d", __func__, stage); 2336 break; 2337 } 2338 2339 /* XXX Temporarily disable deferred ASID allocation. */ 2340 pmap_alloc_asid(pmap); 2341 2342 /* 2343 * Allocate the level 1 entry to use as the root. This will increase 2344 * the refcount on the level 1 page so it won't be removed until 2345 * pmap_release() is called. 2346 */ 2347 if (pmap->pm_levels == 3) { 2348 PMAP_LOCK(pmap); 2349 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 2350 PMAP_UNLOCK(pmap); 2351 } 2352 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 2353 2354 return (1); 2355 } 2356 2357 int 2358 pmap_pinit(pmap_t pmap) 2359 { 2360 2361 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 2362 } 2363 2364 /* 2365 * This routine is called if the desired page table page does not exist. 2366 * 2367 * If page table page allocation fails, this routine may sleep before 2368 * returning NULL. It sleeps only if a lock pointer was given. 2369 * 2370 * Note: If a page allocation fails at page table level two or three, 2371 * one or two pages may be held during the wait, only to be released 2372 * afterwards. This conservative approach is easily argued to avoid 2373 * race conditions. 2374 */ 2375 static vm_page_t 2376 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2377 { 2378 vm_page_t m, l1pg, l2pg; 2379 2380 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2381 2382 /* 2383 * Allocate a page table page. 2384 */ 2385 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2386 if (lockp != NULL) { 2387 RELEASE_PV_LIST_LOCK(lockp); 2388 PMAP_UNLOCK(pmap); 2389 vm_wait(NULL); 2390 PMAP_LOCK(pmap); 2391 } 2392 2393 /* 2394 * Indicate the need to retry. While waiting, the page table 2395 * page may have been allocated. 2396 */ 2397 return (NULL); 2398 } 2399 m->pindex = ptepindex; 2400 2401 /* 2402 * Because of AArch64's weak memory consistency model, we must have a 2403 * barrier here to ensure that the stores for zeroing "m", whether by 2404 * pmap_zero_page() or an earlier function, are visible before adding 2405 * "m" to the page table. Otherwise, a page table walk by another 2406 * processor's MMU could see the mapping to "m" and a stale, non-zero 2407 * PTE within "m". 2408 */ 2409 dmb(ishst); 2410 2411 /* 2412 * Map the pagetable page into the process address space, if 2413 * it isn't already there. 2414 */ 2415 2416 if (ptepindex >= (NUL2E + NUL1E)) { 2417 pd_entry_t *l0p, l0e; 2418 vm_pindex_t l0index; 2419 2420 l0index = ptepindex - (NUL2E + NUL1E); 2421 l0p = &pmap->pm_l0[l0index]; 2422 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 2423 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 2424 l0e = PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L0_TABLE; 2425 2426 /* 2427 * Mark all kernel memory as not accessible from userspace 2428 * and userspace memory as not executable from the kernel. 2429 * This has been done for the bootstrap L0 entries in 2430 * locore.S. 2431 */ 2432 if (pmap == kernel_pmap) 2433 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 2434 else 2435 l0e |= TATTR_PXN_TABLE; 2436 pmap_store(l0p, l0e); 2437 } else if (ptepindex >= NUL2E) { 2438 vm_pindex_t l0index, l1index; 2439 pd_entry_t *l0, *l1; 2440 pd_entry_t tl0; 2441 2442 l1index = ptepindex - NUL2E; 2443 l0index = l1index >> Ln_ENTRIES_SHIFT; 2444 2445 l0 = &pmap->pm_l0[l0index]; 2446 tl0 = pmap_load(l0); 2447 if (tl0 == 0) { 2448 /* recurse for allocating page dir */ 2449 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 2450 lockp) == NULL) { 2451 vm_page_unwire_noq(m); 2452 vm_page_free_zero(m); 2453 return (NULL); 2454 } 2455 } else { 2456 l1pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl0)); 2457 l1pg->ref_count++; 2458 } 2459 2460 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 2461 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 2462 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 2463 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 2464 pmap_store(l1, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L1_TABLE); 2465 } else { 2466 vm_pindex_t l0index, l1index; 2467 pd_entry_t *l0, *l1, *l2; 2468 pd_entry_t tl0, tl1; 2469 2470 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 2471 l0index = l1index >> Ln_ENTRIES_SHIFT; 2472 2473 l0 = &pmap->pm_l0[l0index]; 2474 tl0 = pmap_load(l0); 2475 if (tl0 == 0) { 2476 /* recurse for allocating page dir */ 2477 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2478 lockp) == NULL) { 2479 vm_page_unwire_noq(m); 2480 vm_page_free_zero(m); 2481 return (NULL); 2482 } 2483 tl0 = pmap_load(l0); 2484 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 2485 l1 = &l1[l1index & Ln_ADDR_MASK]; 2486 } else { 2487 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 2488 l1 = &l1[l1index & Ln_ADDR_MASK]; 2489 tl1 = pmap_load(l1); 2490 if (tl1 == 0) { 2491 /* recurse for allocating page dir */ 2492 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2493 lockp) == NULL) { 2494 vm_page_unwire_noq(m); 2495 vm_page_free_zero(m); 2496 return (NULL); 2497 } 2498 } else { 2499 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl1)); 2500 l2pg->ref_count++; 2501 } 2502 } 2503 2504 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1))); 2505 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 2506 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 2507 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 2508 pmap_store(l2, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L2_TABLE); 2509 } 2510 2511 pmap_resident_count_inc(pmap, 1); 2512 2513 return (m); 2514 } 2515 2516 static pd_entry_t * 2517 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 2518 struct rwlock **lockp) 2519 { 2520 pd_entry_t *l1, *l2; 2521 vm_page_t l2pg; 2522 vm_pindex_t l2pindex; 2523 2524 KASSERT(ADDR_IS_CANONICAL(va), 2525 ("%s: Address not in canonical form: %lx", __func__, va)); 2526 2527 retry: 2528 l1 = pmap_l1(pmap, va); 2529 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 2530 l2 = pmap_l1_to_l2(l1, va); 2531 if (!ADDR_IS_KERNEL(va)) { 2532 /* Add a reference to the L2 page. */ 2533 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); 2534 l2pg->ref_count++; 2535 } else 2536 l2pg = NULL; 2537 } else if (!ADDR_IS_KERNEL(va)) { 2538 /* Allocate a L2 page. */ 2539 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 2540 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 2541 if (l2pg == NULL) { 2542 if (lockp != NULL) 2543 goto retry; 2544 else 2545 return (NULL); 2546 } 2547 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2548 l2 = &l2[pmap_l2_index(va)]; 2549 } else 2550 panic("pmap_alloc_l2: missing page table page for va %#lx", 2551 va); 2552 *l2pgp = l2pg; 2553 return (l2); 2554 } 2555 2556 static vm_page_t 2557 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2558 { 2559 vm_pindex_t ptepindex; 2560 pd_entry_t *pde, tpde; 2561 #ifdef INVARIANTS 2562 pt_entry_t *pte; 2563 #endif 2564 vm_page_t m; 2565 int lvl; 2566 2567 /* 2568 * Calculate pagetable page index 2569 */ 2570 ptepindex = pmap_l2_pindex(va); 2571 retry: 2572 /* 2573 * Get the page directory entry 2574 */ 2575 pde = pmap_pde(pmap, va, &lvl); 2576 2577 /* 2578 * If the page table page is mapped, we just increment the hold count, 2579 * and activate it. If we get a level 2 pde it will point to a level 3 2580 * table. 2581 */ 2582 switch (lvl) { 2583 case -1: 2584 break; 2585 case 0: 2586 #ifdef INVARIANTS 2587 pte = pmap_l0_to_l1(pde, va); 2588 KASSERT(pmap_load(pte) == 0, 2589 ("pmap_alloc_l3: TODO: l0 superpages")); 2590 #endif 2591 break; 2592 case 1: 2593 #ifdef INVARIANTS 2594 pte = pmap_l1_to_l2(pde, va); 2595 KASSERT(pmap_load(pte) == 0, 2596 ("pmap_alloc_l3: TODO: l1 superpages")); 2597 #endif 2598 break; 2599 case 2: 2600 tpde = pmap_load(pde); 2601 if (tpde != 0) { 2602 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpde)); 2603 m->ref_count++; 2604 return (m); 2605 } 2606 break; 2607 default: 2608 panic("pmap_alloc_l3: Invalid level %d", lvl); 2609 } 2610 2611 /* 2612 * Here if the pte page isn't mapped, or if it has been deallocated. 2613 */ 2614 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2615 if (m == NULL && lockp != NULL) 2616 goto retry; 2617 2618 return (m); 2619 } 2620 2621 /*************************************************** 2622 * Pmap allocation/deallocation routines. 2623 ***************************************************/ 2624 2625 /* 2626 * Release any resources held by the given physical map. 2627 * Called when a pmap initialized by pmap_pinit is being released. 2628 * Should only be called if the map contains no valid mappings. 2629 */ 2630 void 2631 pmap_release(pmap_t pmap) 2632 { 2633 boolean_t rv __diagused; 2634 struct spglist free; 2635 struct asid_set *set; 2636 vm_page_t m; 2637 int asid; 2638 2639 if (pmap->pm_levels != 4) { 2640 PMAP_ASSERT_STAGE2(pmap); 2641 KASSERT(pmap->pm_stats.resident_count == 1, 2642 ("pmap_release: pmap resident count %ld != 0", 2643 pmap->pm_stats.resident_count)); 2644 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, 2645 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); 2646 2647 SLIST_INIT(&free); 2648 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); 2649 PMAP_LOCK(pmap); 2650 rv = pmap_unwire_l3(pmap, 0, m, &free); 2651 PMAP_UNLOCK(pmap); 2652 MPASS(rv == TRUE); 2653 vm_page_free_pages_toq(&free, true); 2654 } 2655 2656 KASSERT(pmap->pm_stats.resident_count == 0, 2657 ("pmap_release: pmap resident count %ld != 0", 2658 pmap->pm_stats.resident_count)); 2659 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2660 ("pmap_release: pmap has reserved page table page(s)")); 2661 2662 set = pmap->pm_asid_set; 2663 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 2664 2665 /* 2666 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate 2667 * the entries when removing them so rely on a later tlb invalidation. 2668 * this will happen when updating the VMID generation. Because of this 2669 * we don't reuse VMIDs within a generation. 2670 */ 2671 if (pmap->pm_stage == PM_STAGE1) { 2672 mtx_lock_spin(&set->asid_set_mutex); 2673 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 2674 asid = COOKIE_TO_ASID(pmap->pm_cookie); 2675 KASSERT(asid >= ASID_FIRST_AVAILABLE && 2676 asid < set->asid_set_size, 2677 ("pmap_release: pmap cookie has out-of-range asid")); 2678 bit_clear(set->asid_set, asid); 2679 } 2680 mtx_unlock_spin(&set->asid_set_mutex); 2681 } 2682 2683 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 2684 vm_page_unwire_noq(m); 2685 vm_page_free_zero(m); 2686 } 2687 2688 static int 2689 kvm_size(SYSCTL_HANDLER_ARGS) 2690 { 2691 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2692 2693 return sysctl_handle_long(oidp, &ksize, 0, req); 2694 } 2695 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 2696 0, 0, kvm_size, "LU", 2697 "Size of KVM"); 2698 2699 static int 2700 kvm_free(SYSCTL_HANDLER_ARGS) 2701 { 2702 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2703 2704 return sysctl_handle_long(oidp, &kfree, 0, req); 2705 } 2706 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 2707 0, 0, kvm_free, "LU", 2708 "Amount of KVM free"); 2709 2710 /* 2711 * grow the number of kernel page table entries, if needed 2712 */ 2713 void 2714 pmap_growkernel(vm_offset_t addr) 2715 { 2716 vm_paddr_t paddr; 2717 vm_page_t nkpg; 2718 pd_entry_t *l0, *l1, *l2; 2719 2720 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2721 2722 addr = roundup2(addr, L2_SIZE); 2723 if (addr - 1 >= vm_map_max(kernel_map)) 2724 addr = vm_map_max(kernel_map); 2725 if (kernel_vm_end < addr) 2726 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 2727 while (kernel_vm_end < addr) { 2728 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 2729 KASSERT(pmap_load(l0) != 0, 2730 ("pmap_growkernel: No level 0 kernel entry")); 2731 2732 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 2733 if (pmap_load(l1) == 0) { 2734 /* We need a new PDP entry */ 2735 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 2736 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2737 if (nkpg == NULL) 2738 panic("pmap_growkernel: no memory to grow kernel"); 2739 nkpg->pindex = kernel_vm_end >> L1_SHIFT; 2740 /* See the dmb() in _pmap_alloc_l3(). */ 2741 dmb(ishst); 2742 paddr = VM_PAGE_TO_PHYS(nkpg); 2743 pmap_store(l1, PHYS_TO_PTE(paddr) | L1_TABLE); 2744 continue; /* try again */ 2745 } 2746 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 2747 if (pmap_load(l2) != 0) { 2748 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2749 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2750 kernel_vm_end = vm_map_max(kernel_map); 2751 break; 2752 } 2753 continue; 2754 } 2755 2756 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 2757 VM_ALLOC_ZERO); 2758 if (nkpg == NULL) 2759 panic("pmap_growkernel: no memory to grow kernel"); 2760 nkpg->pindex = kernel_vm_end >> L2_SHIFT; 2761 /* See the dmb() in _pmap_alloc_l3(). */ 2762 dmb(ishst); 2763 paddr = VM_PAGE_TO_PHYS(nkpg); 2764 pmap_store(l2, PHYS_TO_PTE(paddr) | L2_TABLE); 2765 2766 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2767 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2768 kernel_vm_end = vm_map_max(kernel_map); 2769 break; 2770 } 2771 } 2772 } 2773 2774 /*************************************************** 2775 * page management routines. 2776 ***************************************************/ 2777 2778 static const uint64_t pc_freemask[_NPCM] = { 2779 [0 ... _NPCM - 2] = PC_FREEN, 2780 [_NPCM - 1] = PC_FREEL 2781 }; 2782 2783 #ifdef PV_STATS 2784 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2785 2786 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2787 "Current number of pv entry chunks"); 2788 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2789 "Current number of pv entry chunks allocated"); 2790 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2791 "Current number of pv entry chunks frees"); 2792 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2793 "Number of times tried to get a chunk page but failed."); 2794 2795 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2796 static int pv_entry_spare; 2797 2798 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2799 "Current number of pv entry frees"); 2800 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2801 "Current number of pv entry allocs"); 2802 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2803 "Current number of pv entries"); 2804 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2805 "Current number of spare pv entries"); 2806 #endif 2807 2808 /* 2809 * We are in a serious low memory condition. Resort to 2810 * drastic measures to free some pages so we can allocate 2811 * another pv entry chunk. 2812 * 2813 * Returns NULL if PV entries were reclaimed from the specified pmap. 2814 * 2815 * We do not, however, unmap 2mpages because subsequent accesses will 2816 * allocate per-page pv entries until repromotion occurs, thereby 2817 * exacerbating the shortage of free pv entries. 2818 */ 2819 static vm_page_t 2820 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 2821 { 2822 struct pv_chunks_list *pvc; 2823 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2824 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2825 struct md_page *pvh; 2826 pd_entry_t *pde; 2827 pmap_t next_pmap, pmap; 2828 pt_entry_t *pte, tpte; 2829 pv_entry_t pv; 2830 vm_offset_t va; 2831 vm_page_t m, m_pc; 2832 struct spglist free; 2833 uint64_t inuse; 2834 int bit, field, freed, lvl; 2835 2836 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2837 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2838 2839 pmap = NULL; 2840 m_pc = NULL; 2841 SLIST_INIT(&free); 2842 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2843 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2844 pc_marker = (struct pv_chunk *)&pc_marker_b; 2845 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2846 2847 pvc = &pv_chunks[domain]; 2848 mtx_lock(&pvc->pvc_lock); 2849 pvc->active_reclaims++; 2850 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 2851 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 2852 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2853 SLIST_EMPTY(&free)) { 2854 next_pmap = pc->pc_pmap; 2855 if (next_pmap == NULL) { 2856 /* 2857 * The next chunk is a marker. However, it is 2858 * not our marker, so active_reclaims must be 2859 * > 1. Consequently, the next_chunk code 2860 * will not rotate the pv_chunks list. 2861 */ 2862 goto next_chunk; 2863 } 2864 mtx_unlock(&pvc->pvc_lock); 2865 2866 /* 2867 * A pv_chunk can only be removed from the pc_lru list 2868 * when both pvc->pvc_lock is owned and the 2869 * corresponding pmap is locked. 2870 */ 2871 if (pmap != next_pmap) { 2872 if (pmap != NULL && pmap != locked_pmap) 2873 PMAP_UNLOCK(pmap); 2874 pmap = next_pmap; 2875 /* Avoid deadlock and lock recursion. */ 2876 if (pmap > locked_pmap) { 2877 RELEASE_PV_LIST_LOCK(lockp); 2878 PMAP_LOCK(pmap); 2879 mtx_lock(&pvc->pvc_lock); 2880 continue; 2881 } else if (pmap != locked_pmap) { 2882 if (PMAP_TRYLOCK(pmap)) { 2883 mtx_lock(&pvc->pvc_lock); 2884 continue; 2885 } else { 2886 pmap = NULL; /* pmap is not locked */ 2887 mtx_lock(&pvc->pvc_lock); 2888 pc = TAILQ_NEXT(pc_marker, pc_lru); 2889 if (pc == NULL || 2890 pc->pc_pmap != next_pmap) 2891 continue; 2892 goto next_chunk; 2893 } 2894 } 2895 } 2896 2897 /* 2898 * Destroy every non-wired, 4 KB page mapping in the chunk. 2899 */ 2900 freed = 0; 2901 for (field = 0; field < _NPCM; field++) { 2902 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2903 inuse != 0; inuse &= ~(1UL << bit)) { 2904 bit = ffsl(inuse) - 1; 2905 pv = &pc->pc_pventry[field * 64 + bit]; 2906 va = pv->pv_va; 2907 pde = pmap_pde(pmap, va, &lvl); 2908 if (lvl != 2) 2909 continue; 2910 pte = pmap_l2_to_l3(pde, va); 2911 tpte = pmap_load(pte); 2912 if ((tpte & ATTR_SW_WIRED) != 0) 2913 continue; 2914 tpte = pmap_load_clear(pte); 2915 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 2916 if (pmap_pte_dirty(pmap, tpte)) 2917 vm_page_dirty(m); 2918 if ((tpte & ATTR_AF) != 0) { 2919 pmap_s1_invalidate_page(pmap, va, true); 2920 vm_page_aflag_set(m, PGA_REFERENCED); 2921 } 2922 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2923 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2924 m->md.pv_gen++; 2925 if (TAILQ_EMPTY(&m->md.pv_list) && 2926 (m->flags & PG_FICTITIOUS) == 0) { 2927 pvh = page_to_pvh(m); 2928 if (TAILQ_EMPTY(&pvh->pv_list)) { 2929 vm_page_aflag_clear(m, 2930 PGA_WRITEABLE); 2931 } 2932 } 2933 pc->pc_map[field] |= 1UL << bit; 2934 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 2935 freed++; 2936 } 2937 } 2938 if (freed == 0) { 2939 mtx_lock(&pvc->pvc_lock); 2940 goto next_chunk; 2941 } 2942 /* Every freed mapping is for a 4 KB page. */ 2943 pmap_resident_count_dec(pmap, freed); 2944 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2945 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2946 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2947 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2948 if (pc_is_free(pc)) { 2949 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2950 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2951 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2952 /* Entire chunk is free; return it. */ 2953 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2954 dump_drop_page(m_pc->phys_addr); 2955 mtx_lock(&pvc->pvc_lock); 2956 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 2957 break; 2958 } 2959 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2960 mtx_lock(&pvc->pvc_lock); 2961 /* One freed pv entry in locked_pmap is sufficient. */ 2962 if (pmap == locked_pmap) 2963 break; 2964 2965 next_chunk: 2966 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 2967 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 2968 if (pvc->active_reclaims == 1 && pmap != NULL) { 2969 /* 2970 * Rotate the pv chunks list so that we do not 2971 * scan the same pv chunks that could not be 2972 * freed (because they contained a wired 2973 * and/or superpage mapping) on every 2974 * invocation of reclaim_pv_chunk(). 2975 */ 2976 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){ 2977 MPASS(pc->pc_pmap != NULL); 2978 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 2979 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 2980 } 2981 } 2982 } 2983 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 2984 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 2985 pvc->active_reclaims--; 2986 mtx_unlock(&pvc->pvc_lock); 2987 if (pmap != NULL && pmap != locked_pmap) 2988 PMAP_UNLOCK(pmap); 2989 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2990 m_pc = SLIST_FIRST(&free); 2991 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2992 /* Recycle a freed page table page. */ 2993 m_pc->ref_count = 1; 2994 } 2995 vm_page_free_pages_toq(&free, true); 2996 return (m_pc); 2997 } 2998 2999 static vm_page_t 3000 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3001 { 3002 vm_page_t m; 3003 int i, domain; 3004 3005 domain = PCPU_GET(domain); 3006 for (i = 0; i < vm_ndomains; i++) { 3007 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 3008 if (m != NULL) 3009 break; 3010 domain = (domain + 1) % vm_ndomains; 3011 } 3012 3013 return (m); 3014 } 3015 3016 /* 3017 * free the pv_entry back to the free list 3018 */ 3019 static void 3020 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3021 { 3022 struct pv_chunk *pc; 3023 int idx, field, bit; 3024 3025 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3026 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3027 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3028 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3029 pc = pv_to_chunk(pv); 3030 idx = pv - &pc->pc_pventry[0]; 3031 field = idx / 64; 3032 bit = idx % 64; 3033 pc->pc_map[field] |= 1ul << bit; 3034 if (!pc_is_free(pc)) { 3035 /* 98% of the time, pc is already at the head of the list. */ 3036 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3037 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3038 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3039 } 3040 return; 3041 } 3042 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3043 free_pv_chunk(pc); 3044 } 3045 3046 static void 3047 free_pv_chunk_dequeued(struct pv_chunk *pc) 3048 { 3049 vm_page_t m; 3050 3051 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3052 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3053 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3054 /* entire chunk is free, return it */ 3055 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3056 dump_drop_page(m->phys_addr); 3057 vm_page_unwire_noq(m); 3058 vm_page_free(m); 3059 } 3060 3061 static void 3062 free_pv_chunk(struct pv_chunk *pc) 3063 { 3064 struct pv_chunks_list *pvc; 3065 3066 pvc = &pv_chunks[pc_to_domain(pc)]; 3067 mtx_lock(&pvc->pvc_lock); 3068 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3069 mtx_unlock(&pvc->pvc_lock); 3070 free_pv_chunk_dequeued(pc); 3071 } 3072 3073 static void 3074 free_pv_chunk_batch(struct pv_chunklist *batch) 3075 { 3076 struct pv_chunks_list *pvc; 3077 struct pv_chunk *pc, *npc; 3078 int i; 3079 3080 for (i = 0; i < vm_ndomains; i++) { 3081 if (TAILQ_EMPTY(&batch[i])) 3082 continue; 3083 pvc = &pv_chunks[i]; 3084 mtx_lock(&pvc->pvc_lock); 3085 TAILQ_FOREACH(pc, &batch[i], pc_list) { 3086 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3087 } 3088 mtx_unlock(&pvc->pvc_lock); 3089 } 3090 3091 for (i = 0; i < vm_ndomains; i++) { 3092 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 3093 free_pv_chunk_dequeued(pc); 3094 } 3095 } 3096 } 3097 3098 /* 3099 * Returns a new PV entry, allocating a new PV chunk from the system when 3100 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3101 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3102 * returned. 3103 * 3104 * The given PV list lock may be released. 3105 */ 3106 static pv_entry_t 3107 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3108 { 3109 struct pv_chunks_list *pvc; 3110 int bit, field; 3111 pv_entry_t pv; 3112 struct pv_chunk *pc; 3113 vm_page_t m; 3114 3115 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3116 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3117 retry: 3118 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3119 if (pc != NULL) { 3120 for (field = 0; field < _NPCM; field++) { 3121 if (pc->pc_map[field]) { 3122 bit = ffsl(pc->pc_map[field]) - 1; 3123 break; 3124 } 3125 } 3126 if (field < _NPCM) { 3127 pv = &pc->pc_pventry[field * 64 + bit]; 3128 pc->pc_map[field] &= ~(1ul << bit); 3129 /* If this was the last item, move it to tail */ 3130 if (pc_is_full(pc)) { 3131 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3132 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3133 pc_list); 3134 } 3135 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3136 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3137 return (pv); 3138 } 3139 } 3140 /* No free items, allocate another chunk */ 3141 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3142 if (m == NULL) { 3143 if (lockp == NULL) { 3144 PV_STAT(pc_chunk_tryfail++); 3145 return (NULL); 3146 } 3147 m = reclaim_pv_chunk(pmap, lockp); 3148 if (m == NULL) 3149 goto retry; 3150 } 3151 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3152 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3153 dump_add_page(m->phys_addr); 3154 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3155 pc->pc_pmap = pmap; 3156 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3157 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */ 3158 pvc = &pv_chunks[vm_page_domain(m)]; 3159 mtx_lock(&pvc->pvc_lock); 3160 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3161 mtx_unlock(&pvc->pvc_lock); 3162 pv = &pc->pc_pventry[0]; 3163 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3164 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3165 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3166 return (pv); 3167 } 3168 3169 /* 3170 * Ensure that the number of spare PV entries in the specified pmap meets or 3171 * exceeds the given count, "needed". 3172 * 3173 * The given PV list lock may be released. 3174 */ 3175 static void 3176 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3177 { 3178 struct pv_chunks_list *pvc; 3179 struct pch new_tail[PMAP_MEMDOM]; 3180 struct pv_chunk *pc; 3181 vm_page_t m; 3182 int avail, free, i; 3183 bool reclaimed; 3184 3185 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3186 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3187 3188 /* 3189 * Newly allocated PV chunks must be stored in a private list until 3190 * the required number of PV chunks have been allocated. Otherwise, 3191 * reclaim_pv_chunk() could recycle one of these chunks. In 3192 * contrast, these chunks must be added to the pmap upon allocation. 3193 */ 3194 for (i = 0; i < PMAP_MEMDOM; i++) 3195 TAILQ_INIT(&new_tail[i]); 3196 retry: 3197 avail = 0; 3198 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3199 bit_count((bitstr_t *)pc->pc_map, 0, 3200 sizeof(pc->pc_map) * NBBY, &free); 3201 if (free == 0) 3202 break; 3203 avail += free; 3204 if (avail >= needed) 3205 break; 3206 } 3207 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3208 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3209 if (m == NULL) { 3210 m = reclaim_pv_chunk(pmap, lockp); 3211 if (m == NULL) 3212 goto retry; 3213 reclaimed = true; 3214 } 3215 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3216 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3217 dump_add_page(m->phys_addr); 3218 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3219 pc->pc_pmap = pmap; 3220 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3221 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3222 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 3223 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3224 3225 /* 3226 * The reclaim might have freed a chunk from the current pmap. 3227 * If that chunk contained available entries, we need to 3228 * re-count the number of available entries. 3229 */ 3230 if (reclaimed) 3231 goto retry; 3232 } 3233 for (i = 0; i < vm_ndomains; i++) { 3234 if (TAILQ_EMPTY(&new_tail[i])) 3235 continue; 3236 pvc = &pv_chunks[i]; 3237 mtx_lock(&pvc->pvc_lock); 3238 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 3239 mtx_unlock(&pvc->pvc_lock); 3240 } 3241 } 3242 3243 /* 3244 * First find and then remove the pv entry for the specified pmap and virtual 3245 * address from the specified pv list. Returns the pv entry if found and NULL 3246 * otherwise. This operation can be performed on pv lists for either 4KB or 3247 * 2MB page mappings. 3248 */ 3249 static __inline pv_entry_t 3250 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3251 { 3252 pv_entry_t pv; 3253 3254 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3255 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3256 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3257 pvh->pv_gen++; 3258 break; 3259 } 3260 } 3261 return (pv); 3262 } 3263 3264 /* 3265 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3266 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3267 * entries for each of the 4KB page mappings. 3268 */ 3269 static void 3270 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3271 struct rwlock **lockp) 3272 { 3273 struct md_page *pvh; 3274 struct pv_chunk *pc; 3275 pv_entry_t pv; 3276 vm_offset_t va_last; 3277 vm_page_t m; 3278 int bit, field; 3279 3280 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3281 KASSERT((va & L2_OFFSET) == 0, 3282 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 3283 KASSERT((pa & L2_OFFSET) == 0, 3284 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 3285 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3286 3287 /* 3288 * Transfer the 2mpage's pv entry for this mapping to the first 3289 * page's pv list. Once this transfer begins, the pv list lock 3290 * must not be released until the last pv entry is reinstantiated. 3291 */ 3292 pvh = pa_to_pvh(pa); 3293 pv = pmap_pvh_remove(pvh, pmap, va); 3294 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 3295 m = PHYS_TO_VM_PAGE(pa); 3296 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3297 m->md.pv_gen++; 3298 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ 3299 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 3300 va_last = va + L2_SIZE - PAGE_SIZE; 3301 for (;;) { 3302 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3303 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 3304 for (field = 0; field < _NPCM; field++) { 3305 while (pc->pc_map[field]) { 3306 bit = ffsl(pc->pc_map[field]) - 1; 3307 pc->pc_map[field] &= ~(1ul << bit); 3308 pv = &pc->pc_pventry[field * 64 + bit]; 3309 va += PAGE_SIZE; 3310 pv->pv_va = va; 3311 m++; 3312 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3313 ("pmap_pv_demote_l2: page %p is not managed", m)); 3314 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3315 m->md.pv_gen++; 3316 if (va == va_last) 3317 goto out; 3318 } 3319 } 3320 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3321 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3322 } 3323 out: 3324 if (pc_is_full(pc)) { 3325 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3326 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3327 } 3328 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 3329 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 3330 } 3331 3332 /* 3333 * First find and then destroy the pv entry for the specified pmap and virtual 3334 * address. This operation can be performed on pv lists for either 4KB or 2MB 3335 * page mappings. 3336 */ 3337 static void 3338 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3339 { 3340 pv_entry_t pv; 3341 3342 pv = pmap_pvh_remove(pvh, pmap, va); 3343 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3344 free_pv_entry(pmap, pv); 3345 } 3346 3347 /* 3348 * Conditionally create the PV entry for a 4KB page mapping if the required 3349 * memory can be allocated without resorting to reclamation. 3350 */ 3351 static boolean_t 3352 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3353 struct rwlock **lockp) 3354 { 3355 pv_entry_t pv; 3356 3357 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3358 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3359 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3360 pv->pv_va = va; 3361 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3362 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3363 m->md.pv_gen++; 3364 return (TRUE); 3365 } else 3366 return (FALSE); 3367 } 3368 3369 /* 3370 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3371 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3372 * false if the PV entry cannot be allocated without resorting to reclamation. 3373 */ 3374 static bool 3375 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 3376 struct rwlock **lockp) 3377 { 3378 struct md_page *pvh; 3379 pv_entry_t pv; 3380 vm_paddr_t pa; 3381 3382 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3383 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3384 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 3385 NULL : lockp)) == NULL) 3386 return (false); 3387 pv->pv_va = va; 3388 pa = PTE_TO_PHYS(l2e); 3389 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3390 pvh = pa_to_pvh(pa); 3391 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3392 pvh->pv_gen++; 3393 return (true); 3394 } 3395 3396 static void 3397 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 3398 { 3399 pt_entry_t newl2, oldl2 __diagused; 3400 vm_page_t ml3; 3401 vm_paddr_t ml3pa; 3402 3403 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 3404 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3405 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3406 3407 ml3 = pmap_remove_pt_page(pmap, va); 3408 if (ml3 == NULL) 3409 panic("pmap_remove_kernel_l2: Missing pt page"); 3410 3411 ml3pa = VM_PAGE_TO_PHYS(ml3); 3412 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; 3413 3414 /* 3415 * If this page table page was unmapped by a promotion, then it 3416 * contains valid mappings. Zero it to invalidate those mappings. 3417 */ 3418 if (vm_page_any_valid(ml3)) 3419 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 3420 3421 /* 3422 * Demote the mapping. The caller must have already invalidated the 3423 * mapping (i.e., the "break" in break-before-make). 3424 */ 3425 oldl2 = pmap_load_store(l2, newl2); 3426 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 3427 __func__, l2, oldl2)); 3428 } 3429 3430 /* 3431 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 3432 */ 3433 static int 3434 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 3435 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 3436 { 3437 struct md_page *pvh; 3438 pt_entry_t old_l2; 3439 vm_page_t m, ml3, mt; 3440 3441 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3442 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 3443 old_l2 = pmap_load_clear(l2); 3444 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3445 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 3446 3447 /* 3448 * Since a promotion must break the 4KB page mappings before making 3449 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 3450 */ 3451 pmap_s1_invalidate_page(pmap, sva, true); 3452 3453 if (old_l2 & ATTR_SW_WIRED) 3454 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 3455 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 3456 if (old_l2 & ATTR_SW_MANAGED) { 3457 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l2)); 3458 pvh = page_to_pvh(m); 3459 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3460 pmap_pvh_free(pvh, pmap, sva); 3461 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 3462 if (pmap_pte_dirty(pmap, old_l2)) 3463 vm_page_dirty(mt); 3464 if (old_l2 & ATTR_AF) 3465 vm_page_aflag_set(mt, PGA_REFERENCED); 3466 if (TAILQ_EMPTY(&mt->md.pv_list) && 3467 TAILQ_EMPTY(&pvh->pv_list)) 3468 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3469 } 3470 } 3471 if (pmap == kernel_pmap) { 3472 pmap_remove_kernel_l2(pmap, l2, sva); 3473 } else { 3474 ml3 = pmap_remove_pt_page(pmap, sva); 3475 if (ml3 != NULL) { 3476 KASSERT(vm_page_any_valid(ml3), 3477 ("pmap_remove_l2: l3 page not promoted")); 3478 pmap_resident_count_dec(pmap, 1); 3479 KASSERT(ml3->ref_count == NL3PG, 3480 ("pmap_remove_l2: l3 page ref count error")); 3481 ml3->ref_count = 0; 3482 pmap_add_delayed_free_list(ml3, free, FALSE); 3483 } 3484 } 3485 return (pmap_unuse_pt(pmap, sva, l1e, free)); 3486 } 3487 3488 /* 3489 * pmap_remove_l3: do the things to unmap a page in a process 3490 */ 3491 static int 3492 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 3493 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 3494 { 3495 struct md_page *pvh; 3496 pt_entry_t old_l3; 3497 vm_page_t m; 3498 3499 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3500 old_l3 = pmap_load_clear(l3); 3501 pmap_s1_invalidate_page(pmap, va, true); 3502 if (old_l3 & ATTR_SW_WIRED) 3503 pmap->pm_stats.wired_count -= 1; 3504 pmap_resident_count_dec(pmap, 1); 3505 if (old_l3 & ATTR_SW_MANAGED) { 3506 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l3)); 3507 if (pmap_pte_dirty(pmap, old_l3)) 3508 vm_page_dirty(m); 3509 if (old_l3 & ATTR_AF) 3510 vm_page_aflag_set(m, PGA_REFERENCED); 3511 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3512 pmap_pvh_free(&m->md, pmap, va); 3513 if (TAILQ_EMPTY(&m->md.pv_list) && 3514 (m->flags & PG_FICTITIOUS) == 0) { 3515 pvh = page_to_pvh(m); 3516 if (TAILQ_EMPTY(&pvh->pv_list)) 3517 vm_page_aflag_clear(m, PGA_WRITEABLE); 3518 } 3519 } 3520 return (pmap_unuse_pt(pmap, va, l2e, free)); 3521 } 3522 3523 /* 3524 * Remove the specified range of addresses from the L3 page table that is 3525 * identified by the given L2 entry. 3526 */ 3527 static void 3528 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 3529 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 3530 { 3531 struct md_page *pvh; 3532 struct rwlock *new_lock; 3533 pt_entry_t *l3, old_l3; 3534 vm_offset_t va; 3535 vm_page_t l3pg, m; 3536 3537 KASSERT(ADDR_IS_CANONICAL(sva), 3538 ("%s: Start address not in canonical form: %lx", __func__, sva)); 3539 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 3540 ("%s: End address not in canonical form: %lx", __func__, eva)); 3541 3542 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3543 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 3544 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 3545 l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(PTE_TO_PHYS(l2e)) : NULL; 3546 va = eva; 3547 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 3548 if (!pmap_l3_valid(pmap_load(l3))) { 3549 if (va != eva) { 3550 pmap_invalidate_range(pmap, va, sva, true); 3551 va = eva; 3552 } 3553 continue; 3554 } 3555 old_l3 = pmap_load_clear(l3); 3556 if ((old_l3 & ATTR_SW_WIRED) != 0) 3557 pmap->pm_stats.wired_count--; 3558 pmap_resident_count_dec(pmap, 1); 3559 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 3560 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l3)); 3561 if (pmap_pte_dirty(pmap, old_l3)) 3562 vm_page_dirty(m); 3563 if ((old_l3 & ATTR_AF) != 0) 3564 vm_page_aflag_set(m, PGA_REFERENCED); 3565 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3566 if (new_lock != *lockp) { 3567 if (*lockp != NULL) { 3568 /* 3569 * Pending TLB invalidations must be 3570 * performed before the PV list lock is 3571 * released. Otherwise, a concurrent 3572 * pmap_remove_all() on a physical page 3573 * could return while a stale TLB entry 3574 * still provides access to that page. 3575 */ 3576 if (va != eva) { 3577 pmap_invalidate_range(pmap, va, 3578 sva, true); 3579 va = eva; 3580 } 3581 rw_wunlock(*lockp); 3582 } 3583 *lockp = new_lock; 3584 rw_wlock(*lockp); 3585 } 3586 pmap_pvh_free(&m->md, pmap, sva); 3587 if (TAILQ_EMPTY(&m->md.pv_list) && 3588 (m->flags & PG_FICTITIOUS) == 0) { 3589 pvh = page_to_pvh(m); 3590 if (TAILQ_EMPTY(&pvh->pv_list)) 3591 vm_page_aflag_clear(m, PGA_WRITEABLE); 3592 } 3593 } 3594 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 3595 /* 3596 * _pmap_unwire_l3() has already invalidated the TLB 3597 * entries at all levels for "sva". So, we need not 3598 * perform "sva += L3_SIZE;" here. Moreover, we need 3599 * not perform "va = sva;" if "sva" is at the start 3600 * of a new valid range consisting of a single page. 3601 */ 3602 break; 3603 } 3604 if (va == eva) 3605 va = sva; 3606 } 3607 if (va != eva) 3608 pmap_invalidate_range(pmap, va, sva, true); 3609 } 3610 3611 /* 3612 * Remove the given range of addresses from the specified map. 3613 * 3614 * It is assumed that the start and end are properly 3615 * rounded to the page size. 3616 */ 3617 void 3618 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3619 { 3620 struct rwlock *lock; 3621 vm_offset_t va_next; 3622 pd_entry_t *l0, *l1, *l2; 3623 pt_entry_t l3_paddr; 3624 struct spglist free; 3625 3626 /* 3627 * Perform an unsynchronized read. This is, however, safe. 3628 */ 3629 if (pmap->pm_stats.resident_count == 0) 3630 return; 3631 3632 SLIST_INIT(&free); 3633 3634 PMAP_LOCK(pmap); 3635 3636 lock = NULL; 3637 for (; sva < eva; sva = va_next) { 3638 if (pmap->pm_stats.resident_count == 0) 3639 break; 3640 3641 l0 = pmap_l0(pmap, sva); 3642 if (pmap_load(l0) == 0) { 3643 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3644 if (va_next < sva) 3645 va_next = eva; 3646 continue; 3647 } 3648 3649 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3650 if (va_next < sva) 3651 va_next = eva; 3652 l1 = pmap_l0_to_l1(l0, sva); 3653 if (pmap_load(l1) == 0) 3654 continue; 3655 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3656 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 3657 KASSERT(va_next <= eva, 3658 ("partial update of non-transparent 1G page " 3659 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3660 pmap_load(l1), sva, eva, va_next)); 3661 MPASS(pmap != kernel_pmap); 3662 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3663 pmap_clear(l1); 3664 pmap_s1_invalidate_page(pmap, sva, true); 3665 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 3666 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 3667 continue; 3668 } 3669 3670 /* 3671 * Calculate index for next page table. 3672 */ 3673 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3674 if (va_next < sva) 3675 va_next = eva; 3676 3677 l2 = pmap_l1_to_l2(l1, sva); 3678 if (l2 == NULL) 3679 continue; 3680 3681 l3_paddr = pmap_load(l2); 3682 3683 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 3684 if (sva + L2_SIZE == va_next && eva >= va_next) { 3685 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 3686 &free, &lock); 3687 continue; 3688 } else if (pmap_demote_l2_locked(pmap, l2, sva, 3689 &lock) == NULL) 3690 continue; 3691 l3_paddr = pmap_load(l2); 3692 } 3693 3694 /* 3695 * Weed out invalid mappings. 3696 */ 3697 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 3698 continue; 3699 3700 /* 3701 * Limit our scan to either the end of the va represented 3702 * by the current page table page, or to the end of the 3703 * range being removed. 3704 */ 3705 if (va_next > eva) 3706 va_next = eva; 3707 3708 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 3709 &lock); 3710 } 3711 if (lock != NULL) 3712 rw_wunlock(lock); 3713 PMAP_UNLOCK(pmap); 3714 vm_page_free_pages_toq(&free, true); 3715 } 3716 3717 /* 3718 * Remove the given range of addresses as part of a logical unmap 3719 * operation. This has the effect of calling pmap_remove(), but 3720 * also clears any metadata that should persist for the lifetime 3721 * of a logical mapping. 3722 */ 3723 void 3724 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3725 { 3726 pmap_remove(pmap, sva, eva); 3727 } 3728 3729 /* 3730 * Routine: pmap_remove_all 3731 * Function: 3732 * Removes this physical page from 3733 * all physical maps in which it resides. 3734 * Reflects back modify bits to the pager. 3735 * 3736 * Notes: 3737 * Original versions of this routine were very 3738 * inefficient because they iteratively called 3739 * pmap_remove (slow...) 3740 */ 3741 3742 void 3743 pmap_remove_all(vm_page_t m) 3744 { 3745 struct md_page *pvh; 3746 pv_entry_t pv; 3747 pmap_t pmap; 3748 struct rwlock *lock; 3749 pd_entry_t *pde, tpde; 3750 pt_entry_t *pte, tpte; 3751 vm_offset_t va; 3752 struct spglist free; 3753 int lvl, pvh_gen, md_gen; 3754 3755 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3756 ("pmap_remove_all: page %p is not managed", m)); 3757 SLIST_INIT(&free); 3758 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3759 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 3760 rw_wlock(lock); 3761 retry: 3762 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3763 pmap = PV_PMAP(pv); 3764 if (!PMAP_TRYLOCK(pmap)) { 3765 pvh_gen = pvh->pv_gen; 3766 rw_wunlock(lock); 3767 PMAP_LOCK(pmap); 3768 rw_wlock(lock); 3769 if (pvh_gen != pvh->pv_gen) { 3770 PMAP_UNLOCK(pmap); 3771 goto retry; 3772 } 3773 } 3774 va = pv->pv_va; 3775 pte = pmap_pte_exists(pmap, va, 2, __func__); 3776 pmap_demote_l2_locked(pmap, pte, va, &lock); 3777 PMAP_UNLOCK(pmap); 3778 } 3779 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3780 pmap = PV_PMAP(pv); 3781 if (!PMAP_TRYLOCK(pmap)) { 3782 pvh_gen = pvh->pv_gen; 3783 md_gen = m->md.pv_gen; 3784 rw_wunlock(lock); 3785 PMAP_LOCK(pmap); 3786 rw_wlock(lock); 3787 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3788 PMAP_UNLOCK(pmap); 3789 goto retry; 3790 } 3791 } 3792 pmap_resident_count_dec(pmap, 1); 3793 3794 pde = pmap_pde(pmap, pv->pv_va, &lvl); 3795 KASSERT(pde != NULL, 3796 ("pmap_remove_all: no page directory entry found")); 3797 KASSERT(lvl == 2, 3798 ("pmap_remove_all: invalid pde level %d", lvl)); 3799 tpde = pmap_load(pde); 3800 3801 pte = pmap_l2_to_l3(pde, pv->pv_va); 3802 tpte = pmap_load_clear(pte); 3803 if (tpte & ATTR_SW_WIRED) 3804 pmap->pm_stats.wired_count--; 3805 if ((tpte & ATTR_AF) != 0) { 3806 pmap_invalidate_page(pmap, pv->pv_va, true); 3807 vm_page_aflag_set(m, PGA_REFERENCED); 3808 } 3809 3810 /* 3811 * Update the vm_page_t clean and reference bits. 3812 */ 3813 if (pmap_pte_dirty(pmap, tpte)) 3814 vm_page_dirty(m); 3815 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 3816 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3817 m->md.pv_gen++; 3818 free_pv_entry(pmap, pv); 3819 PMAP_UNLOCK(pmap); 3820 } 3821 vm_page_aflag_clear(m, PGA_WRITEABLE); 3822 rw_wunlock(lock); 3823 vm_page_free_pages_toq(&free, true); 3824 } 3825 3826 /* 3827 * Masks and sets bits in a level 2 page table entries in the specified pmap 3828 */ 3829 static void 3830 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 3831 pt_entry_t nbits) 3832 { 3833 pd_entry_t old_l2; 3834 vm_page_t m, mt; 3835 3836 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3837 PMAP_ASSERT_STAGE1(pmap); 3838 KASSERT((sva & L2_OFFSET) == 0, 3839 ("pmap_protect_l2: sva is not 2mpage aligned")); 3840 old_l2 = pmap_load(l2); 3841 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3842 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 3843 3844 /* 3845 * Return if the L2 entry already has the desired access restrictions 3846 * in place. 3847 */ 3848 if ((old_l2 & mask) == nbits) 3849 return; 3850 3851 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 3852 cpu_spinwait(); 3853 3854 /* 3855 * When a dirty read/write superpage mapping is write protected, 3856 * update the dirty field of each of the superpage's constituent 4KB 3857 * pages. 3858 */ 3859 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 3860 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3861 pmap_pte_dirty(pmap, old_l2)) { 3862 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l2)); 3863 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3864 vm_page_dirty(mt); 3865 } 3866 3867 /* 3868 * Since a promotion must break the 4KB page mappings before making 3869 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 3870 */ 3871 pmap_s1_invalidate_page(pmap, sva, true); 3872 } 3873 3874 /* 3875 * Masks and sets bits in last level page table entries in the specified 3876 * pmap and range 3877 */ 3878 static void 3879 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 3880 pt_entry_t nbits, bool invalidate) 3881 { 3882 vm_offset_t va, va_next; 3883 pd_entry_t *l0, *l1, *l2; 3884 pt_entry_t *l3p, l3; 3885 3886 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3887 for (; sva < eva; sva = va_next) { 3888 l0 = pmap_l0(pmap, sva); 3889 if (pmap_load(l0) == 0) { 3890 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3891 if (va_next < sva) 3892 va_next = eva; 3893 continue; 3894 } 3895 3896 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3897 if (va_next < sva) 3898 va_next = eva; 3899 l1 = pmap_l0_to_l1(l0, sva); 3900 if (pmap_load(l1) == 0) 3901 continue; 3902 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3903 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 3904 KASSERT(va_next <= eva, 3905 ("partial update of non-transparent 1G page " 3906 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3907 pmap_load(l1), sva, eva, va_next)); 3908 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3909 if ((pmap_load(l1) & mask) != nbits) { 3910 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 3911 if (invalidate) 3912 pmap_s1_invalidate_page(pmap, sva, true); 3913 } 3914 continue; 3915 } 3916 3917 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3918 if (va_next < sva) 3919 va_next = eva; 3920 3921 l2 = pmap_l1_to_l2(l1, sva); 3922 if (pmap_load(l2) == 0) 3923 continue; 3924 3925 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 3926 if (sva + L2_SIZE == va_next && eva >= va_next) { 3927 pmap_protect_l2(pmap, l2, sva, mask, nbits); 3928 continue; 3929 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 3930 continue; 3931 } 3932 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 3933 ("pmap_protect: Invalid L2 entry after demotion")); 3934 3935 if (va_next > eva) 3936 va_next = eva; 3937 3938 va = va_next; 3939 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 3940 sva += L3_SIZE) { 3941 l3 = pmap_load(l3p); 3942 3943 /* 3944 * Go to the next L3 entry if the current one is 3945 * invalid or already has the desired access 3946 * restrictions in place. (The latter case occurs 3947 * frequently. For example, in a "buildworld" 3948 * workload, almost 1 out of 4 L3 entries already 3949 * have the desired restrictions.) 3950 */ 3951 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 3952 if (va != va_next) { 3953 if (invalidate) 3954 pmap_s1_invalidate_range(pmap, 3955 va, sva, true); 3956 va = va_next; 3957 } 3958 continue; 3959 } 3960 3961 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 3962 nbits)) 3963 cpu_spinwait(); 3964 3965 /* 3966 * When a dirty read/write mapping is write protected, 3967 * update the page's dirty field. 3968 */ 3969 if ((l3 & ATTR_SW_MANAGED) != 0 && 3970 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3971 pmap_pte_dirty(pmap, l3)) 3972 vm_page_dirty(PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3))); 3973 3974 if (va == va_next) 3975 va = sva; 3976 } 3977 if (va != va_next && invalidate) 3978 pmap_s1_invalidate_range(pmap, va, sva, true); 3979 } 3980 } 3981 3982 static void 3983 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 3984 pt_entry_t nbits, bool invalidate) 3985 { 3986 PMAP_LOCK(pmap); 3987 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate); 3988 PMAP_UNLOCK(pmap); 3989 } 3990 3991 /* 3992 * Set the physical protection on the 3993 * specified range of this map as requested. 3994 */ 3995 void 3996 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3997 { 3998 pt_entry_t mask, nbits; 3999 4000 PMAP_ASSERT_STAGE1(pmap); 4001 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4002 if (prot == VM_PROT_NONE) { 4003 pmap_remove(pmap, sva, eva); 4004 return; 4005 } 4006 4007 mask = nbits = 0; 4008 if ((prot & VM_PROT_WRITE) == 0) { 4009 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 4010 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 4011 } 4012 if ((prot & VM_PROT_EXECUTE) == 0) { 4013 mask |= ATTR_S1_XN; 4014 nbits |= ATTR_S1_XN; 4015 } 4016 if (pmap == kernel_pmap) { 4017 mask |= ATTR_KERN_GP; 4018 nbits |= ATTR_KERN_GP; 4019 } 4020 if (mask == 0) 4021 return; 4022 4023 pmap_mask_set(pmap, sva, eva, mask, nbits, true); 4024 } 4025 4026 void 4027 pmap_disable_promotion(vm_offset_t sva, vm_size_t size) 4028 { 4029 4030 MPASS((sva & L3_OFFSET) == 0); 4031 MPASS(((sva + size) & L3_OFFSET) == 0); 4032 4033 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE, 4034 ATTR_SW_NO_PROMOTE, false); 4035 } 4036 4037 /* 4038 * Inserts the specified page table page into the specified pmap's collection 4039 * of idle page table pages. Each of a pmap's page table pages is responsible 4040 * for mapping a distinct range of virtual addresses. The pmap's collection is 4041 * ordered by this virtual address range. 4042 * 4043 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4044 * "mpte"'s valid field will be set to 0. 4045 * 4046 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must 4047 * contain valid mappings with identical attributes except for ATTR_AF; 4048 * "mpte"'s valid field will be set to 1. 4049 * 4050 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain 4051 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid 4052 * field will be set to VM_PAGE_BITS_ALL. 4053 */ 4054 static __inline int 4055 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4056 bool all_l3e_AF_set) 4057 { 4058 4059 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4060 KASSERT(promoted || !all_l3e_AF_set, 4061 ("a zero-filled PTP can't have ATTR_AF set in every PTE")); 4062 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0; 4063 return (vm_radix_insert(&pmap->pm_root, mpte)); 4064 } 4065 4066 /* 4067 * Removes the page table page mapping the specified virtual address from the 4068 * specified pmap's collection of idle page table pages, and returns it. 4069 * Otherwise, returns NULL if there is no page table page corresponding to the 4070 * specified virtual address. 4071 */ 4072 static __inline vm_page_t 4073 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4074 { 4075 4076 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4077 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 4078 } 4079 4080 /* 4081 * Performs a break-before-make update of a pmap entry. This is needed when 4082 * either promoting or demoting pages to ensure the TLB doesn't get into an 4083 * inconsistent state. 4084 */ 4085 static void 4086 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 4087 vm_offset_t va, vm_size_t size) 4088 { 4089 register_t intr; 4090 4091 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4092 4093 if ((newpte & ATTR_SW_NO_PROMOTE) != 0) 4094 panic("%s: Updating non-promote pte", __func__); 4095 4096 /* 4097 * Ensure we don't get switched out with the page table in an 4098 * inconsistent state. We also need to ensure no interrupts fire 4099 * as they may make use of an address we are about to invalidate. 4100 */ 4101 intr = intr_disable(); 4102 4103 /* 4104 * Clear the old mapping's valid bit, but leave the rest of the entry 4105 * unchanged, so that a lockless, concurrent pmap_kextract() can still 4106 * lookup the physical address. 4107 */ 4108 pmap_clear_bits(pte, ATTR_DESCR_VALID); 4109 4110 /* 4111 * When promoting, the L{1,2}_TABLE entry that is being replaced might 4112 * be cached, so we invalidate intermediate entries as well as final 4113 * entries. 4114 */ 4115 pmap_s1_invalidate_range(pmap, va, va + size, false); 4116 4117 /* Create the new mapping */ 4118 pmap_store(pte, newpte); 4119 dsb(ishst); 4120 4121 intr_restore(intr); 4122 } 4123 4124 #if VM_NRESERVLEVEL > 0 4125 /* 4126 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 4127 * replace the many pv entries for the 4KB page mappings by a single pv entry 4128 * for the 2MB page mapping. 4129 */ 4130 static void 4131 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4132 struct rwlock **lockp) 4133 { 4134 struct md_page *pvh; 4135 pv_entry_t pv; 4136 vm_offset_t va_last; 4137 vm_page_t m; 4138 4139 KASSERT((pa & L2_OFFSET) == 0, 4140 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 4141 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4142 4143 /* 4144 * Transfer the first page's pv entry for this mapping to the 2mpage's 4145 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 4146 * a transfer avoids the possibility that get_pv_entry() calls 4147 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 4148 * mappings that is being promoted. 4149 */ 4150 m = PHYS_TO_VM_PAGE(pa); 4151 va = va & ~L2_OFFSET; 4152 pv = pmap_pvh_remove(&m->md, pmap, va); 4153 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 4154 pvh = page_to_pvh(m); 4155 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4156 pvh->pv_gen++; 4157 /* Free the remaining NPTEPG - 1 pv entries. */ 4158 va_last = va + L2_SIZE - PAGE_SIZE; 4159 do { 4160 m++; 4161 va += PAGE_SIZE; 4162 pmap_pvh_free(&m->md, pmap, va); 4163 } while (va < va_last); 4164 } 4165 4166 /* 4167 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4168 * single level 2 table entry to a single 2MB page mapping. For promotion 4169 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4170 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4171 * identical characteristics. 4172 */ 4173 static bool 4174 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte, 4175 struct rwlock **lockp) 4176 { 4177 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa; 4178 4179 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4180 4181 /* 4182 * Currently, this function only supports promotion on stage 1 pmaps 4183 * because it tests stage 1 specific fields and performs a break- 4184 * before-make sequence that is incorrect for stage 2 pmaps. 4185 */ 4186 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 4187 return (false); 4188 4189 /* 4190 * Examine the first L3E in the specified PTP. Abort if this L3E is 4191 * ineligible for promotion... 4192 */ 4193 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 4194 newl2 = pmap_load(firstl3); 4195 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0) 4196 return (false); 4197 /* ... is not the first physical page within an L2 block */ 4198 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 || 4199 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */ 4200 atomic_add_long(&pmap_l2_p_failures, 1); 4201 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4202 " in pmap %p", va, pmap); 4203 return (false); 4204 } 4205 4206 /* 4207 * Both here and in the below "for" loop, to allow for repromotion 4208 * after MADV_FREE, conditionally write protect a clean L3E before 4209 * possibly aborting the promotion due to other L3E attributes. Why? 4210 * Suppose that MADV_FREE is applied to a part of a superpage, the 4211 * address range [S, E). pmap_advise() will demote the superpage 4212 * mapping, destroy the 4KB page mapping at the end of [S, E), and 4213 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later, 4214 * imagine that the memory in [S, E) is recycled, but the last 4KB 4215 * page in [S, E) is not the last to be rewritten, or simply accessed. 4216 * In other words, there is still a 4KB page in [S, E), call it P, 4217 * that is writeable but AP_RO is set and AF is clear in P's L3E. 4218 * Unless we write protect P before aborting the promotion, if and 4219 * when P is finally rewritten, there won't be a page fault to trigger 4220 * repromotion. 4221 */ 4222 setl2: 4223 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4224 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4225 /* 4226 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 4227 * ATTR_SW_DBM can be cleared without a TLB invalidation. 4228 */ 4229 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 4230 goto setl2; 4231 newl2 &= ~ATTR_SW_DBM; 4232 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx" 4233 " in pmap %p", va & ~L2_OFFSET, pmap); 4234 } 4235 4236 /* 4237 * Examine each of the other L3Es in the specified PTP. Abort if this 4238 * L3E maps an unexpected 4KB physical page or does not have identical 4239 * characteristics to the first L3E. If ATTR_AF is not set in every 4240 * PTE, then request that the PTP be refilled on demotion. 4241 */ 4242 all_l3e_AF = newl2 & ATTR_AF; 4243 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK)) 4244 + L2_SIZE - PAGE_SIZE; 4245 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 4246 oldl3 = pmap_load(l3); 4247 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 4248 atomic_add_long(&pmap_l2_p_failures, 1); 4249 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4250 " in pmap %p", va, pmap); 4251 return (false); 4252 } 4253 setl3: 4254 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4255 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4256 /* 4257 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 4258 * set, ATTR_SW_DBM can be cleared without a TLB 4259 * invalidation. 4260 */ 4261 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 4262 ~ATTR_SW_DBM)) 4263 goto setl3; 4264 oldl3 &= ~ATTR_SW_DBM; 4265 } 4266 if ((oldl3 & (ATTR_MASK & ~ATTR_AF)) != (newl2 & (ATTR_MASK & 4267 ~ATTR_AF))) { 4268 atomic_add_long(&pmap_l2_p_failures, 1); 4269 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4270 " in pmap %p", va, pmap); 4271 return (false); 4272 } 4273 all_l3e_AF &= oldl3; 4274 pa -= PAGE_SIZE; 4275 } 4276 4277 /* 4278 * Unless all PTEs have ATTR_AF set, clear it from the superpage 4279 * mapping, so that promotions triggered by speculative mappings, 4280 * such as pmap_enter_quick(), don't automatically mark the 4281 * underlying pages as referenced. 4282 */ 4283 newl2 &= ~ATTR_AF | all_l3e_AF; 4284 4285 /* 4286 * Save the page table page in its current state until the L2 4287 * mapping the superpage is demoted by pmap_demote_l2() or 4288 * destroyed by pmap_remove_l3(). 4289 */ 4290 if (mpte == NULL) 4291 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 4292 KASSERT(mpte >= vm_page_array && 4293 mpte < &vm_page_array[vm_page_array_size], 4294 ("pmap_promote_l2: page table page is out of range")); 4295 KASSERT(mpte->pindex == pmap_l2_pindex(va), 4296 ("pmap_promote_l2: page table page's pindex is wrong")); 4297 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) { 4298 atomic_add_long(&pmap_l2_p_failures, 1); 4299 CTR2(KTR_PMAP, 4300 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 4301 pmap); 4302 return (false); 4303 } 4304 4305 if ((newl2 & ATTR_SW_MANAGED) != 0) 4306 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp); 4307 4308 newl2 &= ~ATTR_DESCR_MASK; 4309 newl2 |= L2_BLOCK; 4310 4311 pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE); 4312 4313 atomic_add_long(&pmap_l2_promotions, 1); 4314 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 4315 pmap); 4316 return (true); 4317 } 4318 #endif /* VM_NRESERVLEVEL > 0 */ 4319 4320 static int 4321 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 4322 int psind) 4323 { 4324 pd_entry_t *l0p, *l1p, *l2p, origpte; 4325 vm_page_t mp; 4326 4327 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4328 KASSERT(psind > 0 && psind < MAXPAGESIZES, 4329 ("psind %d unexpected", psind)); 4330 KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0, 4331 ("unaligned phys address %#lx newpte %#lx psind %d", 4332 PTE_TO_PHYS(newpte), newpte, psind)); 4333 4334 restart: 4335 if (psind == 2) { 4336 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4337 4338 l0p = pmap_l0(pmap, va); 4339 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 4340 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 4341 if (mp == NULL) { 4342 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 4343 return (KERN_RESOURCE_SHORTAGE); 4344 PMAP_UNLOCK(pmap); 4345 vm_wait(NULL); 4346 PMAP_LOCK(pmap); 4347 goto restart; 4348 } 4349 l1p = pmap_l0_to_l1(l0p, va); 4350 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 4351 origpte = pmap_load(l1p); 4352 } else { 4353 l1p = pmap_l0_to_l1(l0p, va); 4354 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 4355 origpte = pmap_load(l1p); 4356 if ((origpte & ATTR_DESCR_VALID) == 0) { 4357 mp = PHYS_TO_VM_PAGE( 4358 PTE_TO_PHYS(pmap_load(l0p))); 4359 mp->ref_count++; 4360 } 4361 } 4362 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) && 4363 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) || 4364 (origpte & ATTR_DESCR_VALID) == 0, 4365 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 4366 va, origpte, newpte)); 4367 pmap_store(l1p, newpte); 4368 } else /* (psind == 1) */ { 4369 l2p = pmap_l2(pmap, va); 4370 if (l2p == NULL) { 4371 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 4372 if (mp == NULL) { 4373 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 4374 return (KERN_RESOURCE_SHORTAGE); 4375 PMAP_UNLOCK(pmap); 4376 vm_wait(NULL); 4377 PMAP_LOCK(pmap); 4378 goto restart; 4379 } 4380 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 4381 l2p = &l2p[pmap_l2_index(va)]; 4382 origpte = pmap_load(l2p); 4383 } else { 4384 l1p = pmap_l1(pmap, va); 4385 origpte = pmap_load(l2p); 4386 if ((origpte & ATTR_DESCR_VALID) == 0) { 4387 mp = PHYS_TO_VM_PAGE( 4388 PTE_TO_PHYS(pmap_load(l1p))); 4389 mp->ref_count++; 4390 } 4391 } 4392 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 4393 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 4394 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), 4395 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 4396 va, origpte, newpte)); 4397 pmap_store(l2p, newpte); 4398 } 4399 dsb(ishst); 4400 4401 if ((origpte & ATTR_DESCR_VALID) == 0) 4402 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 4403 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 4404 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 4405 else if ((newpte & ATTR_SW_WIRED) == 0 && 4406 (origpte & ATTR_SW_WIRED) != 0) 4407 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 4408 4409 return (KERN_SUCCESS); 4410 } 4411 4412 /* 4413 * Insert the given physical page (p) at 4414 * the specified virtual address (v) in the 4415 * target physical map with the protection requested. 4416 * 4417 * If specified, the page will be wired down, meaning 4418 * that the related pte can not be reclaimed. 4419 * 4420 * NB: This is the only routine which MAY NOT lazy-evaluate 4421 * or lose information. That is, this routine must actually 4422 * insert this page into the given map NOW. 4423 */ 4424 int 4425 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4426 u_int flags, int8_t psind) 4427 { 4428 struct rwlock *lock; 4429 pd_entry_t *pde; 4430 pt_entry_t new_l3, orig_l3; 4431 pt_entry_t *l2, *l3; 4432 pv_entry_t pv; 4433 vm_paddr_t opa, pa; 4434 vm_page_t mpte, om; 4435 boolean_t nosleep; 4436 int lvl, rv; 4437 4438 KASSERT(ADDR_IS_CANONICAL(va), 4439 ("%s: Address not in canonical form: %lx", __func__, va)); 4440 4441 va = trunc_page(va); 4442 if ((m->oflags & VPO_UNMANAGED) == 0) 4443 VM_PAGE_OBJECT_BUSY_ASSERT(m); 4444 pa = VM_PAGE_TO_PHYS(m); 4445 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE); 4446 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 4447 new_l3 |= pmap_pte_prot(pmap, prot); 4448 if ((flags & PMAP_ENTER_WIRED) != 0) 4449 new_l3 |= ATTR_SW_WIRED; 4450 if (pmap->pm_stage == PM_STAGE1) { 4451 if (!ADDR_IS_KERNEL(va)) 4452 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4453 else 4454 new_l3 |= ATTR_S1_UXN; 4455 if (pmap != kernel_pmap) 4456 new_l3 |= ATTR_S1_nG; 4457 } else { 4458 /* 4459 * Clear the access flag on executable mappings, this will be 4460 * set later when the page is accessed. The fault handler is 4461 * required to invalidate the I-cache. 4462 * 4463 * TODO: Switch to the valid flag to allow hardware management 4464 * of the access flag. Much of the pmap code assumes the 4465 * valid flag is set and fails to destroy the old page tables 4466 * correctly if it is clear. 4467 */ 4468 if (prot & VM_PROT_EXECUTE) 4469 new_l3 &= ~ATTR_AF; 4470 } 4471 if ((m->oflags & VPO_UNMANAGED) == 0) { 4472 new_l3 |= ATTR_SW_MANAGED; 4473 if ((prot & VM_PROT_WRITE) != 0) { 4474 new_l3 |= ATTR_SW_DBM; 4475 if ((flags & VM_PROT_WRITE) == 0) { 4476 if (pmap->pm_stage == PM_STAGE1) 4477 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 4478 else 4479 new_l3 &= 4480 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 4481 } 4482 } 4483 } 4484 4485 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 4486 4487 lock = NULL; 4488 PMAP_LOCK(pmap); 4489 /* Wait until we lock the pmap to protect the bti rangeset */ 4490 new_l3 |= pmap_pte_bti(pmap, va); 4491 4492 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 4493 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 4494 ("managed largepage va %#lx flags %#x", va, flags)); 4495 new_l3 &= ~L3_PAGE; 4496 if (psind == 2) { 4497 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4498 new_l3 |= L1_BLOCK; 4499 } else /* (psind == 1) */ 4500 new_l3 |= L2_BLOCK; 4501 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 4502 goto out; 4503 } 4504 if (psind == 1) { 4505 /* Assert the required virtual and physical alignment. */ 4506 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 4507 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 4508 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 4509 flags, m, &lock); 4510 goto out; 4511 } 4512 mpte = NULL; 4513 4514 /* 4515 * In the case that a page table page is not 4516 * resident, we are creating it here. 4517 */ 4518 retry: 4519 pde = pmap_pde(pmap, va, &lvl); 4520 if (pde != NULL && lvl == 2) { 4521 l3 = pmap_l2_to_l3(pde, va); 4522 if (!ADDR_IS_KERNEL(va) && mpte == NULL) { 4523 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(pde))); 4524 mpte->ref_count++; 4525 } 4526 goto havel3; 4527 } else if (pde != NULL && lvl == 1) { 4528 l2 = pmap_l1_to_l2(pde, va); 4529 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 4530 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 4531 l3 = &l3[pmap_l3_index(va)]; 4532 if (!ADDR_IS_KERNEL(va)) { 4533 mpte = PHYS_TO_VM_PAGE( 4534 PTE_TO_PHYS(pmap_load(l2))); 4535 mpte->ref_count++; 4536 } 4537 goto havel3; 4538 } 4539 /* We need to allocate an L3 table. */ 4540 } 4541 if (!ADDR_IS_KERNEL(va)) { 4542 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4543 4544 /* 4545 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 4546 * to handle the possibility that a superpage mapping for "va" 4547 * was created while we slept. 4548 */ 4549 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 4550 nosleep ? NULL : &lock); 4551 if (mpte == NULL && nosleep) { 4552 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 4553 rv = KERN_RESOURCE_SHORTAGE; 4554 goto out; 4555 } 4556 goto retry; 4557 } else 4558 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 4559 4560 havel3: 4561 orig_l3 = pmap_load(l3); 4562 opa = PTE_TO_PHYS(orig_l3); 4563 pv = NULL; 4564 4565 /* 4566 * Is the specified virtual address already mapped? 4567 */ 4568 if (pmap_l3_valid(orig_l3)) { 4569 /* 4570 * Wiring change, just update stats. We don't worry about 4571 * wiring PT pages as they remain resident as long as there 4572 * are valid mappings in them. Hence, if a user page is wired, 4573 * the PT page will be also. 4574 */ 4575 if ((flags & PMAP_ENTER_WIRED) != 0 && 4576 (orig_l3 & ATTR_SW_WIRED) == 0) 4577 pmap->pm_stats.wired_count++; 4578 else if ((flags & PMAP_ENTER_WIRED) == 0 && 4579 (orig_l3 & ATTR_SW_WIRED) != 0) 4580 pmap->pm_stats.wired_count--; 4581 4582 /* 4583 * Remove the extra PT page reference. 4584 */ 4585 if (mpte != NULL) { 4586 mpte->ref_count--; 4587 KASSERT(mpte->ref_count > 0, 4588 ("pmap_enter: missing reference to page table page," 4589 " va: 0x%lx", va)); 4590 } 4591 4592 /* 4593 * Has the physical page changed? 4594 */ 4595 if (opa == pa) { 4596 /* 4597 * No, might be a protection or wiring change. 4598 */ 4599 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 4600 (new_l3 & ATTR_SW_DBM) != 0) 4601 vm_page_aflag_set(m, PGA_WRITEABLE); 4602 goto validate; 4603 } 4604 4605 /* 4606 * The physical page has changed. Temporarily invalidate 4607 * the mapping. 4608 */ 4609 orig_l3 = pmap_load_clear(l3); 4610 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 4611 ("pmap_enter: unexpected pa update for %#lx", va)); 4612 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 4613 om = PHYS_TO_VM_PAGE(opa); 4614 4615 /* 4616 * The pmap lock is sufficient to synchronize with 4617 * concurrent calls to pmap_page_test_mappings() and 4618 * pmap_ts_referenced(). 4619 */ 4620 if (pmap_pte_dirty(pmap, orig_l3)) 4621 vm_page_dirty(om); 4622 if ((orig_l3 & ATTR_AF) != 0) { 4623 pmap_invalidate_page(pmap, va, true); 4624 vm_page_aflag_set(om, PGA_REFERENCED); 4625 } 4626 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om); 4627 pv = pmap_pvh_remove(&om->md, pmap, va); 4628 if ((m->oflags & VPO_UNMANAGED) != 0) 4629 free_pv_entry(pmap, pv); 4630 if ((om->a.flags & PGA_WRITEABLE) != 0 && 4631 TAILQ_EMPTY(&om->md.pv_list) && 4632 ((om->flags & PG_FICTITIOUS) != 0 || 4633 TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) 4634 vm_page_aflag_clear(om, PGA_WRITEABLE); 4635 } else { 4636 KASSERT((orig_l3 & ATTR_AF) != 0, 4637 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 4638 pmap_invalidate_page(pmap, va, true); 4639 } 4640 orig_l3 = 0; 4641 } else { 4642 /* 4643 * Increment the counters. 4644 */ 4645 if ((new_l3 & ATTR_SW_WIRED) != 0) 4646 pmap->pm_stats.wired_count++; 4647 pmap_resident_count_inc(pmap, 1); 4648 } 4649 /* 4650 * Enter on the PV list if part of our managed memory. 4651 */ 4652 if ((m->oflags & VPO_UNMANAGED) == 0) { 4653 if (pv == NULL) { 4654 pv = get_pv_entry(pmap, &lock); 4655 pv->pv_va = va; 4656 } 4657 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4658 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4659 m->md.pv_gen++; 4660 if ((new_l3 & ATTR_SW_DBM) != 0) 4661 vm_page_aflag_set(m, PGA_WRITEABLE); 4662 } 4663 4664 validate: 4665 if (pmap->pm_stage == PM_STAGE1) { 4666 /* 4667 * Sync icache if exec permission and attribute 4668 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping 4669 * is stored and made valid for hardware table walk. If done 4670 * later, then other can access this page before caches are 4671 * properly synced. Don't do it for kernel memory which is 4672 * mapped with exec permission even if the memory isn't going 4673 * to hold executable code. The only time when icache sync is 4674 * needed is after kernel module is loaded and the relocation 4675 * info is processed. And it's done in elf_cpu_load_file(). 4676 */ 4677 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4678 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && 4679 (opa != pa || (orig_l3 & ATTR_S1_XN))) { 4680 PMAP_ASSERT_STAGE1(pmap); 4681 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4682 } 4683 } else { 4684 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4685 } 4686 4687 /* 4688 * Update the L3 entry 4689 */ 4690 if (pmap_l3_valid(orig_l3)) { 4691 KASSERT(opa == pa, ("pmap_enter: invalid update")); 4692 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { 4693 /* same PA, different attributes */ 4694 orig_l3 = pmap_load_store(l3, new_l3); 4695 pmap_invalidate_page(pmap, va, true); 4696 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 4697 pmap_pte_dirty(pmap, orig_l3)) 4698 vm_page_dirty(m); 4699 } else { 4700 /* 4701 * orig_l3 == new_l3 4702 * This can happens if multiple threads simultaneously 4703 * access not yet mapped page. This bad for performance 4704 * since this can cause full demotion-NOP-promotion 4705 * cycle. 4706 * Another possible reasons are: 4707 * - VM and pmap memory layout are diverged 4708 * - tlb flush is missing somewhere and CPU doesn't see 4709 * actual mapping. 4710 */ 4711 CTR4(KTR_PMAP, "%s: already mapped page - " 4712 "pmap %p va 0x%#lx pte 0x%lx", 4713 __func__, pmap, va, new_l3); 4714 } 4715 } else { 4716 /* New mapping */ 4717 pmap_store(l3, new_l3); 4718 dsb(ishst); 4719 } 4720 4721 #if VM_NRESERVLEVEL > 0 4722 /* 4723 * If both the page table page and the reservation are fully 4724 * populated, then attempt promotion. 4725 */ 4726 if ((mpte == NULL || mpte->ref_count == NL3PG) && 4727 (m->flags & PG_FICTITIOUS) == 0 && 4728 vm_reserv_level_iffullpop(m) == 0) 4729 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); 4730 #endif 4731 4732 rv = KERN_SUCCESS; 4733 out: 4734 if (lock != NULL) 4735 rw_wunlock(lock); 4736 PMAP_UNLOCK(pmap); 4737 return (rv); 4738 } 4739 4740 /* 4741 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 4742 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 4743 * value. See pmap_enter_l2() for the possible error values when "no sleep", 4744 * "no replace", and "no reclaim" are specified. 4745 */ 4746 static int 4747 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4748 struct rwlock **lockp) 4749 { 4750 pd_entry_t new_l2; 4751 4752 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4753 PMAP_ASSERT_STAGE1(pmap); 4754 KASSERT(ADDR_IS_CANONICAL(va), 4755 ("%s: Address not in canonical form: %lx", __func__, va)); 4756 4757 new_l2 = (pd_entry_t)(PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | ATTR_DEFAULT | 4758 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 4759 L2_BLOCK); 4760 new_l2 |= pmap_pte_bti(pmap, va); 4761 if ((m->oflags & VPO_UNMANAGED) == 0) { 4762 new_l2 |= ATTR_SW_MANAGED; 4763 new_l2 &= ~ATTR_AF; 4764 } 4765 if ((prot & VM_PROT_EXECUTE) == 0 || 4766 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4767 new_l2 |= ATTR_S1_XN; 4768 if (!ADDR_IS_KERNEL(va)) 4769 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4770 else 4771 new_l2 |= ATTR_S1_UXN; 4772 if (pmap != kernel_pmap) 4773 new_l2 |= ATTR_S1_nG; 4774 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 4775 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp)); 4776 } 4777 4778 /* 4779 * Returns true if every page table entry in the specified page table is 4780 * zero. 4781 */ 4782 static bool 4783 pmap_every_pte_zero(vm_paddr_t pa) 4784 { 4785 pt_entry_t *pt_end, *pte; 4786 4787 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 4788 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 4789 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 4790 if (*pte != 0) 4791 return (false); 4792 } 4793 return (true); 4794 } 4795 4796 /* 4797 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 4798 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 4799 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 4800 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 4801 * within the 2MB virtual address range starting at the specified virtual 4802 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 4803 * 2MB page mapping already exists at the specified virtual address. Returns 4804 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 4805 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 4806 * and a PV entry allocation failed. 4807 */ 4808 static int 4809 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 4810 vm_page_t m, struct rwlock **lockp) 4811 { 4812 struct spglist free; 4813 pd_entry_t *l2, old_l2; 4814 vm_page_t l2pg, mt; 4815 vm_page_t uwptpg; 4816 4817 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4818 KASSERT(ADDR_IS_CANONICAL(va), 4819 ("%s: Address not in canonical form: %lx", __func__, va)); 4820 4821 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 4822 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 4823 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 4824 va, pmap); 4825 return (KERN_RESOURCE_SHORTAGE); 4826 } 4827 4828 /* 4829 * If there are existing mappings, either abort or remove them. 4830 */ 4831 if ((old_l2 = pmap_load(l2)) != 0) { 4832 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 4833 ("pmap_enter_l2: l2pg's ref count is too low")); 4834 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4835 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { 4836 if (l2pg != NULL) 4837 l2pg->ref_count--; 4838 CTR2(KTR_PMAP, 4839 "pmap_enter_l2: no space for va %#lx" 4840 " in pmap %p", va, pmap); 4841 return (KERN_NO_SPACE); 4842 } else if (!ADDR_IS_KERNEL(va) || 4843 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) { 4844 if (l2pg != NULL) 4845 l2pg->ref_count--; 4846 CTR2(KTR_PMAP, 4847 "pmap_enter_l2: failure for va %#lx" 4848 " in pmap %p", va, pmap); 4849 return (KERN_FAILURE); 4850 } 4851 } 4852 SLIST_INIT(&free); 4853 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 4854 (void)pmap_remove_l2(pmap, l2, va, 4855 pmap_load(pmap_l1(pmap, va)), &free, lockp); 4856 else 4857 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 4858 &free, lockp); 4859 if (!ADDR_IS_KERNEL(va)) { 4860 vm_page_free_pages_toq(&free, true); 4861 KASSERT(pmap_load(l2) == 0, 4862 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 4863 } else { 4864 KASSERT(SLIST_EMPTY(&free), 4865 ("pmap_enter_l2: freed kernel page table page")); 4866 4867 /* 4868 * Both pmap_remove_l2() and pmap_remove_l3_range() 4869 * will leave the kernel page table page zero filled. 4870 * Nonetheless, the TLB could have an intermediate 4871 * entry for the kernel page table page, so request 4872 * an invalidation at all levels after clearing 4873 * the L2_TABLE entry. 4874 */ 4875 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 4876 if (pmap_insert_pt_page(pmap, mt, false, false)) 4877 panic("pmap_enter_l2: trie insert failed"); 4878 pmap_clear(l2); 4879 pmap_s1_invalidate_page(pmap, va, false); 4880 } 4881 } 4882 4883 /* 4884 * Allocate leaf ptpage for wired userspace pages. 4885 */ 4886 uwptpg = NULL; 4887 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) { 4888 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 4889 if (uwptpg == NULL) { 4890 return (KERN_RESOURCE_SHORTAGE); 4891 } 4892 uwptpg->pindex = pmap_l2_pindex(va); 4893 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 4894 vm_page_unwire_noq(uwptpg); 4895 vm_page_free(uwptpg); 4896 return (KERN_RESOURCE_SHORTAGE); 4897 } 4898 pmap_resident_count_inc(pmap, 1); 4899 uwptpg->ref_count = NL3PG; 4900 } 4901 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 4902 /* 4903 * Abort this mapping if its PV entry could not be created. 4904 */ 4905 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 4906 if (l2pg != NULL) 4907 pmap_abort_ptp(pmap, va, l2pg); 4908 if (uwptpg != NULL) { 4909 mt = pmap_remove_pt_page(pmap, va); 4910 KASSERT(mt == uwptpg, 4911 ("removed pt page %p, expected %p", mt, 4912 uwptpg)); 4913 pmap_resident_count_dec(pmap, 1); 4914 uwptpg->ref_count = 1; 4915 vm_page_unwire_noq(uwptpg); 4916 vm_page_free(uwptpg); 4917 } 4918 CTR2(KTR_PMAP, 4919 "pmap_enter_l2: failure for va %#lx in pmap %p", 4920 va, pmap); 4921 return (KERN_RESOURCE_SHORTAGE); 4922 } 4923 if ((new_l2 & ATTR_SW_DBM) != 0) 4924 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4925 vm_page_aflag_set(mt, PGA_WRITEABLE); 4926 } 4927 4928 /* 4929 * Increment counters. 4930 */ 4931 if ((new_l2 & ATTR_SW_WIRED) != 0) 4932 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 4933 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 4934 4935 /* 4936 * Conditionally sync the icache. See pmap_enter() for details. 4937 */ 4938 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) != 4939 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) && 4940 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 4941 cpu_icache_sync_range(PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)), 4942 L2_SIZE); 4943 } 4944 4945 /* 4946 * Map the superpage. 4947 */ 4948 pmap_store(l2, new_l2); 4949 dsb(ishst); 4950 4951 atomic_add_long(&pmap_l2_mappings, 1); 4952 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 4953 va, pmap); 4954 4955 return (KERN_SUCCESS); 4956 } 4957 4958 /* 4959 * Maps a sequence of resident pages belonging to the same object. 4960 * The sequence begins with the given page m_start. This page is 4961 * mapped at the given virtual address start. Each subsequent page is 4962 * mapped at a virtual address that is offset from start by the same 4963 * amount as the page is offset from m_start within the object. The 4964 * last page in the sequence is the page with the largest offset from 4965 * m_start that can be mapped at a virtual address less than the given 4966 * virtual address end. Not every virtual page between start and end 4967 * is mapped; only those for which a resident page exists with the 4968 * corresponding offset from m_start are mapped. 4969 */ 4970 void 4971 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4972 vm_page_t m_start, vm_prot_t prot) 4973 { 4974 struct rwlock *lock; 4975 vm_offset_t va; 4976 vm_page_t m, mpte; 4977 vm_pindex_t diff, psize; 4978 int rv; 4979 4980 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4981 4982 psize = atop(end - start); 4983 mpte = NULL; 4984 m = m_start; 4985 lock = NULL; 4986 PMAP_LOCK(pmap); 4987 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4988 va = start + ptoa(diff); 4989 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 4990 m->psind == 1 && pmap_ps_enabled(pmap) && 4991 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 4992 KERN_SUCCESS || rv == KERN_NO_SPACE)) 4993 m = &m[L2_SIZE / PAGE_SIZE - 1]; 4994 else 4995 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 4996 &lock); 4997 m = TAILQ_NEXT(m, listq); 4998 } 4999 if (lock != NULL) 5000 rw_wunlock(lock); 5001 PMAP_UNLOCK(pmap); 5002 } 5003 5004 /* 5005 * this code makes some *MAJOR* assumptions: 5006 * 1. Current pmap & pmap exists. 5007 * 2. Not wired. 5008 * 3. Read access. 5009 * 4. No page table pages. 5010 * but is *MUCH* faster than pmap_enter... 5011 */ 5012 5013 void 5014 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 5015 { 5016 struct rwlock *lock; 5017 5018 lock = NULL; 5019 PMAP_LOCK(pmap); 5020 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 5021 if (lock != NULL) 5022 rw_wunlock(lock); 5023 PMAP_UNLOCK(pmap); 5024 } 5025 5026 static vm_page_t 5027 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 5028 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 5029 { 5030 pd_entry_t *pde; 5031 pt_entry_t *l1, *l2, *l3, l3_val; 5032 vm_paddr_t pa; 5033 int lvl; 5034 5035 KASSERT(!VA_IS_CLEANMAP(va) || 5036 (m->oflags & VPO_UNMANAGED) != 0, 5037 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 5038 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5039 PMAP_ASSERT_STAGE1(pmap); 5040 KASSERT(ADDR_IS_CANONICAL(va), 5041 ("%s: Address not in canonical form: %lx", __func__, va)); 5042 l2 = NULL; 5043 5044 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 5045 /* 5046 * In the case that a page table page is not 5047 * resident, we are creating it here. 5048 */ 5049 if (!ADDR_IS_KERNEL(va)) { 5050 vm_pindex_t l2pindex; 5051 5052 /* 5053 * Calculate pagetable page index 5054 */ 5055 l2pindex = pmap_l2_pindex(va); 5056 if (mpte && (mpte->pindex == l2pindex)) { 5057 mpte->ref_count++; 5058 } else { 5059 /* 5060 * If the page table page is mapped, we just increment 5061 * the hold count, and activate it. Otherwise, we 5062 * attempt to allocate a page table page, passing NULL 5063 * instead of the PV list lock pointer because we don't 5064 * intend to sleep. If this attempt fails, we don't 5065 * retry. Instead, we give up. 5066 */ 5067 l1 = pmap_l1(pmap, va); 5068 if (l1 != NULL && pmap_load(l1) != 0) { 5069 if ((pmap_load(l1) & ATTR_DESCR_MASK) == 5070 L1_BLOCK) 5071 return (NULL); 5072 l2 = pmap_l1_to_l2(l1, va); 5073 if (pmap_load(l2) != 0) { 5074 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 5075 L2_BLOCK) 5076 return (NULL); 5077 mpte = PHYS_TO_VM_PAGE( 5078 PTE_TO_PHYS(pmap_load(l2))); 5079 mpte->ref_count++; 5080 } else { 5081 mpte = _pmap_alloc_l3(pmap, l2pindex, 5082 NULL); 5083 if (mpte == NULL) 5084 return (mpte); 5085 } 5086 } else { 5087 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 5088 if (mpte == NULL) 5089 return (mpte); 5090 } 5091 } 5092 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 5093 l3 = &l3[pmap_l3_index(va)]; 5094 } else { 5095 mpte = NULL; 5096 pde = pmap_pde(kernel_pmap, va, &lvl); 5097 KASSERT(pde != NULL, 5098 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 5099 va)); 5100 KASSERT(lvl == 2, 5101 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 5102 l3 = pmap_l2_to_l3(pde, va); 5103 } 5104 5105 /* 5106 * Abort if a mapping already exists. 5107 */ 5108 if (pmap_load(l3) != 0) { 5109 if (mpte != NULL) 5110 mpte->ref_count--; 5111 return (NULL); 5112 } 5113 5114 /* 5115 * Enter on the PV list if part of our managed memory. 5116 */ 5117 if ((m->oflags & VPO_UNMANAGED) == 0 && 5118 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 5119 if (mpte != NULL) 5120 pmap_abort_ptp(pmap, va, mpte); 5121 return (NULL); 5122 } 5123 5124 /* 5125 * Increment counters 5126 */ 5127 pmap_resident_count_inc(pmap, 1); 5128 5129 pa = VM_PAGE_TO_PHYS(m); 5130 l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 5131 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 5132 l3_val |= pmap_pte_bti(pmap, va); 5133 if ((prot & VM_PROT_EXECUTE) == 0 || 5134 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5135 l3_val |= ATTR_S1_XN; 5136 if (!ADDR_IS_KERNEL(va)) 5137 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5138 else 5139 l3_val |= ATTR_S1_UXN; 5140 if (pmap != kernel_pmap) 5141 l3_val |= ATTR_S1_nG; 5142 5143 /* 5144 * Now validate mapping with RO protection 5145 */ 5146 if ((m->oflags & VPO_UNMANAGED) == 0) { 5147 l3_val |= ATTR_SW_MANAGED; 5148 l3_val &= ~ATTR_AF; 5149 } 5150 5151 /* Sync icache before the mapping is stored to PTE */ 5152 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 5153 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 5154 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 5155 5156 pmap_store(l3, l3_val); 5157 dsb(ishst); 5158 5159 #if VM_NRESERVLEVEL > 0 5160 /* 5161 * If both the PTP and the reservation are fully populated, then 5162 * attempt promotion. 5163 */ 5164 if ((mpte == NULL || mpte->ref_count == NL3PG) && 5165 (m->flags & PG_FICTITIOUS) == 0 && 5166 vm_reserv_level_iffullpop(m) == 0) { 5167 if (l2 == NULL) 5168 l2 = pmap_pde(pmap, va, &lvl); 5169 5170 /* 5171 * If promotion succeeds, then the next call to this function 5172 * should not be given the unmapped PTP as a hint. 5173 */ 5174 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 5175 mpte = NULL; 5176 } 5177 #endif 5178 5179 return (mpte); 5180 } 5181 5182 /* 5183 * This code maps large physical mmap regions into the 5184 * processor address space. Note that some shortcuts 5185 * are taken, but the code works. 5186 */ 5187 void 5188 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 5189 vm_pindex_t pindex, vm_size_t size) 5190 { 5191 5192 VM_OBJECT_ASSERT_WLOCKED(object); 5193 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 5194 ("pmap_object_init_pt: non-device object")); 5195 } 5196 5197 /* 5198 * Clear the wired attribute from the mappings for the specified range of 5199 * addresses in the given pmap. Every valid mapping within that range 5200 * must have the wired attribute set. In contrast, invalid mappings 5201 * cannot have the wired attribute set, so they are ignored. 5202 * 5203 * The wired attribute of the page table entry is not a hardware feature, 5204 * so there is no need to invalidate any TLB entries. 5205 */ 5206 void 5207 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5208 { 5209 vm_offset_t va_next; 5210 pd_entry_t *l0, *l1, *l2; 5211 pt_entry_t *l3; 5212 5213 PMAP_LOCK(pmap); 5214 for (; sva < eva; sva = va_next) { 5215 l0 = pmap_l0(pmap, sva); 5216 if (pmap_load(l0) == 0) { 5217 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 5218 if (va_next < sva) 5219 va_next = eva; 5220 continue; 5221 } 5222 5223 l1 = pmap_l0_to_l1(l0, sva); 5224 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 5225 if (va_next < sva) 5226 va_next = eva; 5227 if (pmap_load(l1) == 0) 5228 continue; 5229 5230 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 5231 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5232 KASSERT(va_next <= eva, 5233 ("partial update of non-transparent 1G page " 5234 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 5235 pmap_load(l1), sva, eva, va_next)); 5236 MPASS(pmap != kernel_pmap); 5237 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 5238 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 5239 pmap_clear_bits(l1, ATTR_SW_WIRED); 5240 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 5241 continue; 5242 } 5243 5244 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 5245 if (va_next < sva) 5246 va_next = eva; 5247 5248 l2 = pmap_l1_to_l2(l1, sva); 5249 if (pmap_load(l2) == 0) 5250 continue; 5251 5252 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 5253 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 5254 panic("pmap_unwire: l2 %#jx is missing " 5255 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 5256 5257 /* 5258 * Are we unwiring the entire large page? If not, 5259 * demote the mapping and fall through. 5260 */ 5261 if (sva + L2_SIZE == va_next && eva >= va_next) { 5262 pmap_clear_bits(l2, ATTR_SW_WIRED); 5263 pmap->pm_stats.wired_count -= L2_SIZE / 5264 PAGE_SIZE; 5265 continue; 5266 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 5267 panic("pmap_unwire: demotion failed"); 5268 } 5269 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 5270 ("pmap_unwire: Invalid l2 entry after demotion")); 5271 5272 if (va_next > eva) 5273 va_next = eva; 5274 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 5275 sva += L3_SIZE) { 5276 if (pmap_load(l3) == 0) 5277 continue; 5278 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 5279 panic("pmap_unwire: l3 %#jx is missing " 5280 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 5281 5282 /* 5283 * ATTR_SW_WIRED must be cleared atomically. Although 5284 * the pmap lock synchronizes access to ATTR_SW_WIRED, 5285 * the System MMU may write to the entry concurrently. 5286 */ 5287 pmap_clear_bits(l3, ATTR_SW_WIRED); 5288 pmap->pm_stats.wired_count--; 5289 } 5290 } 5291 PMAP_UNLOCK(pmap); 5292 } 5293 5294 /* 5295 * Copy the range specified by src_addr/len 5296 * from the source map to the range dst_addr/len 5297 * in the destination map. 5298 * 5299 * This routine is only advisory and need not do anything. 5300 * 5301 * Because the executable mappings created by this routine are copied, 5302 * it should not have to flush the instruction cache. 5303 */ 5304 void 5305 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5306 vm_offset_t src_addr) 5307 { 5308 struct rwlock *lock; 5309 pd_entry_t *l0, *l1, *l2, srcptepaddr; 5310 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 5311 vm_offset_t addr, end_addr, va_next; 5312 vm_page_t dst_m, dstmpte, srcmpte; 5313 5314 PMAP_ASSERT_STAGE1(dst_pmap); 5315 PMAP_ASSERT_STAGE1(src_pmap); 5316 5317 if (dst_addr != src_addr) 5318 return; 5319 end_addr = src_addr + len; 5320 lock = NULL; 5321 if (dst_pmap < src_pmap) { 5322 PMAP_LOCK(dst_pmap); 5323 PMAP_LOCK(src_pmap); 5324 } else { 5325 PMAP_LOCK(src_pmap); 5326 PMAP_LOCK(dst_pmap); 5327 } 5328 for (addr = src_addr; addr < end_addr; addr = va_next) { 5329 l0 = pmap_l0(src_pmap, addr); 5330 if (pmap_load(l0) == 0) { 5331 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 5332 if (va_next < addr) 5333 va_next = end_addr; 5334 continue; 5335 } 5336 5337 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 5338 if (va_next < addr) 5339 va_next = end_addr; 5340 l1 = pmap_l0_to_l1(l0, addr); 5341 if (pmap_load(l1) == 0) 5342 continue; 5343 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 5344 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5345 KASSERT(va_next <= end_addr, 5346 ("partial update of non-transparent 1G page " 5347 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 5348 pmap_load(l1), addr, end_addr, va_next)); 5349 srcptepaddr = pmap_load(l1); 5350 l1 = pmap_l1(dst_pmap, addr); 5351 if (l1 == NULL) { 5352 if (_pmap_alloc_l3(dst_pmap, 5353 pmap_l0_pindex(addr), NULL) == NULL) 5354 break; 5355 l1 = pmap_l1(dst_pmap, addr); 5356 } else { 5357 l0 = pmap_l0(dst_pmap, addr); 5358 dst_m = PHYS_TO_VM_PAGE( 5359 PTE_TO_PHYS(pmap_load(l0))); 5360 dst_m->ref_count++; 5361 } 5362 KASSERT(pmap_load(l1) == 0, 5363 ("1G mapping present in dst pmap " 5364 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 5365 pmap_load(l1), addr, end_addr, va_next)); 5366 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 5367 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 5368 continue; 5369 } 5370 5371 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 5372 if (va_next < addr) 5373 va_next = end_addr; 5374 l2 = pmap_l1_to_l2(l1, addr); 5375 srcptepaddr = pmap_load(l2); 5376 if (srcptepaddr == 0) 5377 continue; 5378 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 5379 /* 5380 * We can only virtual copy whole superpages. 5381 */ 5382 if ((addr & L2_OFFSET) != 0 || 5383 addr + L2_SIZE > end_addr) 5384 continue; 5385 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 5386 if (l2 == NULL) 5387 break; 5388 if (pmap_load(l2) == 0 && 5389 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 5390 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 5391 PMAP_ENTER_NORECLAIM, &lock))) { 5392 /* 5393 * We leave the dirty bit unchanged because 5394 * managed read/write superpage mappings are 5395 * required to be dirty. However, managed 5396 * superpage mappings are not required to 5397 * have their accessed bit set, so we clear 5398 * it because we don't know if this mapping 5399 * will be used. 5400 */ 5401 srcptepaddr &= ~ATTR_SW_WIRED; 5402 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 5403 srcptepaddr &= ~ATTR_AF; 5404 pmap_store(l2, srcptepaddr); 5405 pmap_resident_count_inc(dst_pmap, L2_SIZE / 5406 PAGE_SIZE); 5407 atomic_add_long(&pmap_l2_mappings, 1); 5408 } else 5409 pmap_abort_ptp(dst_pmap, addr, dst_m); 5410 continue; 5411 } 5412 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 5413 ("pmap_copy: invalid L2 entry")); 5414 srcmpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(srcptepaddr)); 5415 KASSERT(srcmpte->ref_count > 0, 5416 ("pmap_copy: source page table page is unused")); 5417 if (va_next > end_addr) 5418 va_next = end_addr; 5419 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr)); 5420 src_pte = &src_pte[pmap_l3_index(addr)]; 5421 dstmpte = NULL; 5422 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 5423 ptetemp = pmap_load(src_pte); 5424 5425 /* 5426 * We only virtual copy managed pages. 5427 */ 5428 if ((ptetemp & ATTR_SW_MANAGED) == 0) 5429 continue; 5430 5431 if (dstmpte != NULL) { 5432 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 5433 ("dstmpte pindex/addr mismatch")); 5434 dstmpte->ref_count++; 5435 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 5436 NULL)) == NULL) 5437 goto out; 5438 dst_pte = (pt_entry_t *) 5439 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5440 dst_pte = &dst_pte[pmap_l3_index(addr)]; 5441 if (pmap_load(dst_pte) == 0 && 5442 pmap_try_insert_pv_entry(dst_pmap, addr, 5443 PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptetemp)), &lock)) { 5444 /* 5445 * Clear the wired, modified, and accessed 5446 * (referenced) bits during the copy. 5447 */ 5448 mask = ATTR_AF | ATTR_SW_WIRED; 5449 nbits = 0; 5450 if ((ptetemp & ATTR_SW_DBM) != 0) 5451 nbits |= ATTR_S1_AP_RW_BIT; 5452 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 5453 pmap_resident_count_inc(dst_pmap, 1); 5454 } else { 5455 pmap_abort_ptp(dst_pmap, addr, dstmpte); 5456 goto out; 5457 } 5458 /* Have we copied all of the valid mappings? */ 5459 if (dstmpte->ref_count >= srcmpte->ref_count) 5460 break; 5461 } 5462 } 5463 out: 5464 /* 5465 * XXX This barrier may not be needed because the destination pmap is 5466 * not active. 5467 */ 5468 dsb(ishst); 5469 5470 if (lock != NULL) 5471 rw_wunlock(lock); 5472 PMAP_UNLOCK(src_pmap); 5473 PMAP_UNLOCK(dst_pmap); 5474 } 5475 5476 /* 5477 * pmap_zero_page zeros the specified hardware page by mapping 5478 * the page into KVM and using bzero to clear its contents. 5479 */ 5480 void 5481 pmap_zero_page(vm_page_t m) 5482 { 5483 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5484 5485 pagezero((void *)va); 5486 } 5487 5488 /* 5489 * pmap_zero_page_area zeros the specified hardware page by mapping 5490 * the page into KVM and using bzero to clear its contents. 5491 * 5492 * off and size may not cover an area beyond a single hardware page. 5493 */ 5494 void 5495 pmap_zero_page_area(vm_page_t m, int off, int size) 5496 { 5497 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5498 5499 if (off == 0 && size == PAGE_SIZE) 5500 pagezero((void *)va); 5501 else 5502 bzero((char *)va + off, size); 5503 } 5504 5505 /* 5506 * pmap_copy_page copies the specified (machine independent) 5507 * page by mapping the page into virtual memory and using 5508 * bcopy to copy the page, one machine dependent page at a 5509 * time. 5510 */ 5511 void 5512 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 5513 { 5514 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 5515 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 5516 5517 pagecopy((void *)src, (void *)dst); 5518 } 5519 5520 int unmapped_buf_allowed = 1; 5521 5522 void 5523 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5524 vm_offset_t b_offset, int xfersize) 5525 { 5526 void *a_cp, *b_cp; 5527 vm_page_t m_a, m_b; 5528 vm_paddr_t p_a, p_b; 5529 vm_offset_t a_pg_offset, b_pg_offset; 5530 int cnt; 5531 5532 while (xfersize > 0) { 5533 a_pg_offset = a_offset & PAGE_MASK; 5534 m_a = ma[a_offset >> PAGE_SHIFT]; 5535 p_a = m_a->phys_addr; 5536 b_pg_offset = b_offset & PAGE_MASK; 5537 m_b = mb[b_offset >> PAGE_SHIFT]; 5538 p_b = m_b->phys_addr; 5539 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5540 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5541 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 5542 panic("!DMAP a %lx", p_a); 5543 } else { 5544 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 5545 } 5546 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 5547 panic("!DMAP b %lx", p_b); 5548 } else { 5549 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 5550 } 5551 bcopy(a_cp, b_cp, cnt); 5552 a_offset += cnt; 5553 b_offset += cnt; 5554 xfersize -= cnt; 5555 } 5556 } 5557 5558 vm_offset_t 5559 pmap_quick_enter_page(vm_page_t m) 5560 { 5561 5562 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 5563 } 5564 5565 void 5566 pmap_quick_remove_page(vm_offset_t addr) 5567 { 5568 } 5569 5570 /* 5571 * Returns true if the pmap's pv is one of the first 5572 * 16 pvs linked to from this page. This count may 5573 * be changed upwards or downwards in the future; it 5574 * is only necessary that true be returned for a small 5575 * subset of pmaps for proper page aging. 5576 */ 5577 boolean_t 5578 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5579 { 5580 struct md_page *pvh; 5581 struct rwlock *lock; 5582 pv_entry_t pv; 5583 int loops = 0; 5584 boolean_t rv; 5585 5586 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5587 ("pmap_page_exists_quick: page %p is not managed", m)); 5588 rv = FALSE; 5589 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5590 rw_rlock(lock); 5591 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5592 if (PV_PMAP(pv) == pmap) { 5593 rv = TRUE; 5594 break; 5595 } 5596 loops++; 5597 if (loops >= 16) 5598 break; 5599 } 5600 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5601 pvh = page_to_pvh(m); 5602 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5603 if (PV_PMAP(pv) == pmap) { 5604 rv = TRUE; 5605 break; 5606 } 5607 loops++; 5608 if (loops >= 16) 5609 break; 5610 } 5611 } 5612 rw_runlock(lock); 5613 return (rv); 5614 } 5615 5616 /* 5617 * pmap_page_wired_mappings: 5618 * 5619 * Return the number of managed mappings to the given physical page 5620 * that are wired. 5621 */ 5622 int 5623 pmap_page_wired_mappings(vm_page_t m) 5624 { 5625 struct rwlock *lock; 5626 struct md_page *pvh; 5627 pmap_t pmap; 5628 pt_entry_t *pte; 5629 pv_entry_t pv; 5630 int count, md_gen, pvh_gen; 5631 5632 if ((m->oflags & VPO_UNMANAGED) != 0) 5633 return (0); 5634 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5635 rw_rlock(lock); 5636 restart: 5637 count = 0; 5638 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5639 pmap = PV_PMAP(pv); 5640 if (!PMAP_TRYLOCK(pmap)) { 5641 md_gen = m->md.pv_gen; 5642 rw_runlock(lock); 5643 PMAP_LOCK(pmap); 5644 rw_rlock(lock); 5645 if (md_gen != m->md.pv_gen) { 5646 PMAP_UNLOCK(pmap); 5647 goto restart; 5648 } 5649 } 5650 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5651 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 5652 count++; 5653 PMAP_UNLOCK(pmap); 5654 } 5655 if ((m->flags & PG_FICTITIOUS) == 0) { 5656 pvh = page_to_pvh(m); 5657 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5658 pmap = PV_PMAP(pv); 5659 if (!PMAP_TRYLOCK(pmap)) { 5660 md_gen = m->md.pv_gen; 5661 pvh_gen = pvh->pv_gen; 5662 rw_runlock(lock); 5663 PMAP_LOCK(pmap); 5664 rw_rlock(lock); 5665 if (md_gen != m->md.pv_gen || 5666 pvh_gen != pvh->pv_gen) { 5667 PMAP_UNLOCK(pmap); 5668 goto restart; 5669 } 5670 } 5671 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5672 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 5673 count++; 5674 PMAP_UNLOCK(pmap); 5675 } 5676 } 5677 rw_runlock(lock); 5678 return (count); 5679 } 5680 5681 /* 5682 * Returns true if the given page is mapped individually or as part of 5683 * a 2mpage. Otherwise, returns false. 5684 */ 5685 bool 5686 pmap_page_is_mapped(vm_page_t m) 5687 { 5688 struct rwlock *lock; 5689 bool rv; 5690 5691 if ((m->oflags & VPO_UNMANAGED) != 0) 5692 return (false); 5693 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5694 rw_rlock(lock); 5695 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5696 ((m->flags & PG_FICTITIOUS) == 0 && 5697 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 5698 rw_runlock(lock); 5699 return (rv); 5700 } 5701 5702 /* 5703 * Destroy all managed, non-wired mappings in the given user-space 5704 * pmap. This pmap cannot be active on any processor besides the 5705 * caller. 5706 * 5707 * This function cannot be applied to the kernel pmap. Moreover, it 5708 * is not intended for general use. It is only to be used during 5709 * process termination. Consequently, it can be implemented in ways 5710 * that make it faster than pmap_remove(). First, it can more quickly 5711 * destroy mappings by iterating over the pmap's collection of PV 5712 * entries, rather than searching the page table. Second, it doesn't 5713 * have to test and clear the page table entries atomically, because 5714 * no processor is currently accessing the user address space. In 5715 * particular, a page table entry's dirty bit won't change state once 5716 * this function starts. 5717 */ 5718 void 5719 pmap_remove_pages(pmap_t pmap) 5720 { 5721 pd_entry_t *pde; 5722 pt_entry_t *pte, tpte; 5723 struct spglist free; 5724 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 5725 vm_page_t m, ml3, mt; 5726 pv_entry_t pv; 5727 struct md_page *pvh; 5728 struct pv_chunk *pc, *npc; 5729 struct rwlock *lock; 5730 int64_t bit; 5731 uint64_t inuse, bitmask; 5732 int allfree, field, i, idx, lvl; 5733 int freed __pvused; 5734 vm_paddr_t pa; 5735 5736 lock = NULL; 5737 5738 for (i = 0; i < PMAP_MEMDOM; i++) 5739 TAILQ_INIT(&free_chunks[i]); 5740 SLIST_INIT(&free); 5741 PMAP_LOCK(pmap); 5742 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5743 allfree = 1; 5744 freed = 0; 5745 for (field = 0; field < _NPCM; field++) { 5746 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5747 while (inuse != 0) { 5748 bit = ffsl(inuse) - 1; 5749 bitmask = 1UL << bit; 5750 idx = field * 64 + bit; 5751 pv = &pc->pc_pventry[idx]; 5752 inuse &= ~bitmask; 5753 5754 pde = pmap_pde(pmap, pv->pv_va, &lvl); 5755 KASSERT(pde != NULL, 5756 ("Attempting to remove an unmapped page")); 5757 5758 switch(lvl) { 5759 case 1: 5760 pte = pmap_l1_to_l2(pde, pv->pv_va); 5761 tpte = pmap_load(pte); 5762 KASSERT((tpte & ATTR_DESCR_MASK) == 5763 L2_BLOCK, 5764 ("Attempting to remove an invalid " 5765 "block: %lx", tpte)); 5766 break; 5767 case 2: 5768 pte = pmap_l2_to_l3(pde, pv->pv_va); 5769 tpte = pmap_load(pte); 5770 KASSERT((tpte & ATTR_DESCR_MASK) == 5771 L3_PAGE, 5772 ("Attempting to remove an invalid " 5773 "page: %lx", tpte)); 5774 break; 5775 default: 5776 panic( 5777 "Invalid page directory level: %d", 5778 lvl); 5779 } 5780 5781 /* 5782 * We cannot remove wired pages from a process' mapping at this time 5783 */ 5784 if (tpte & ATTR_SW_WIRED) { 5785 allfree = 0; 5786 continue; 5787 } 5788 5789 /* Mark free */ 5790 pc->pc_map[field] |= bitmask; 5791 5792 /* 5793 * Because this pmap is not active on other 5794 * processors, the dirty bit cannot have 5795 * changed state since we last loaded pte. 5796 */ 5797 pmap_clear(pte); 5798 5799 pa = PTE_TO_PHYS(tpte); 5800 5801 m = PHYS_TO_VM_PAGE(pa); 5802 KASSERT(m->phys_addr == pa, 5803 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5804 m, (uintmax_t)m->phys_addr, 5805 (uintmax_t)tpte)); 5806 5807 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5808 m < &vm_page_array[vm_page_array_size], 5809 ("pmap_remove_pages: bad pte %#jx", 5810 (uintmax_t)tpte)); 5811 5812 /* 5813 * Update the vm_page_t clean/reference bits. 5814 */ 5815 if (pmap_pte_dirty(pmap, tpte)) { 5816 switch (lvl) { 5817 case 1: 5818 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5819 vm_page_dirty(mt); 5820 break; 5821 case 2: 5822 vm_page_dirty(m); 5823 break; 5824 } 5825 } 5826 5827 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5828 5829 switch (lvl) { 5830 case 1: 5831 pmap_resident_count_dec(pmap, 5832 L2_SIZE / PAGE_SIZE); 5833 pvh = page_to_pvh(m); 5834 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 5835 pvh->pv_gen++; 5836 if (TAILQ_EMPTY(&pvh->pv_list)) { 5837 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5838 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5839 TAILQ_EMPTY(&mt->md.pv_list)) 5840 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5841 } 5842 ml3 = pmap_remove_pt_page(pmap, 5843 pv->pv_va); 5844 if (ml3 != NULL) { 5845 KASSERT(vm_page_any_valid(ml3), 5846 ("pmap_remove_pages: l3 page not promoted")); 5847 pmap_resident_count_dec(pmap,1); 5848 KASSERT(ml3->ref_count == NL3PG, 5849 ("pmap_remove_pages: l3 page ref count error")); 5850 ml3->ref_count = 0; 5851 pmap_add_delayed_free_list(ml3, 5852 &free, FALSE); 5853 } 5854 break; 5855 case 2: 5856 pmap_resident_count_dec(pmap, 1); 5857 TAILQ_REMOVE(&m->md.pv_list, pv, 5858 pv_next); 5859 m->md.pv_gen++; 5860 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5861 TAILQ_EMPTY(&m->md.pv_list) && 5862 (m->flags & PG_FICTITIOUS) == 0) { 5863 pvh = page_to_pvh(m); 5864 if (TAILQ_EMPTY(&pvh->pv_list)) 5865 vm_page_aflag_clear(m, 5866 PGA_WRITEABLE); 5867 } 5868 break; 5869 } 5870 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 5871 &free); 5872 freed++; 5873 } 5874 } 5875 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5876 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5877 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5878 if (allfree) { 5879 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5880 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, 5881 pc_list); 5882 } 5883 } 5884 if (lock != NULL) 5885 rw_wunlock(lock); 5886 pmap_invalidate_all(pmap); 5887 free_pv_chunk_batch(free_chunks); 5888 PMAP_UNLOCK(pmap); 5889 vm_page_free_pages_toq(&free, true); 5890 } 5891 5892 /* 5893 * This is used to check if a page has been accessed or modified. 5894 */ 5895 static boolean_t 5896 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5897 { 5898 struct rwlock *lock; 5899 pv_entry_t pv; 5900 struct md_page *pvh; 5901 pt_entry_t *pte, mask, value; 5902 pmap_t pmap; 5903 int md_gen, pvh_gen; 5904 boolean_t rv; 5905 5906 rv = FALSE; 5907 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5908 rw_rlock(lock); 5909 restart: 5910 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5911 pmap = PV_PMAP(pv); 5912 PMAP_ASSERT_STAGE1(pmap); 5913 if (!PMAP_TRYLOCK(pmap)) { 5914 md_gen = m->md.pv_gen; 5915 rw_runlock(lock); 5916 PMAP_LOCK(pmap); 5917 rw_rlock(lock); 5918 if (md_gen != m->md.pv_gen) { 5919 PMAP_UNLOCK(pmap); 5920 goto restart; 5921 } 5922 } 5923 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5924 mask = 0; 5925 value = 0; 5926 if (modified) { 5927 mask |= ATTR_S1_AP_RW_BIT; 5928 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5929 } 5930 if (accessed) { 5931 mask |= ATTR_AF | ATTR_DESCR_MASK; 5932 value |= ATTR_AF | L3_PAGE; 5933 } 5934 rv = (pmap_load(pte) & mask) == value; 5935 PMAP_UNLOCK(pmap); 5936 if (rv) 5937 goto out; 5938 } 5939 if ((m->flags & PG_FICTITIOUS) == 0) { 5940 pvh = page_to_pvh(m); 5941 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5942 pmap = PV_PMAP(pv); 5943 PMAP_ASSERT_STAGE1(pmap); 5944 if (!PMAP_TRYLOCK(pmap)) { 5945 md_gen = m->md.pv_gen; 5946 pvh_gen = pvh->pv_gen; 5947 rw_runlock(lock); 5948 PMAP_LOCK(pmap); 5949 rw_rlock(lock); 5950 if (md_gen != m->md.pv_gen || 5951 pvh_gen != pvh->pv_gen) { 5952 PMAP_UNLOCK(pmap); 5953 goto restart; 5954 } 5955 } 5956 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5957 mask = 0; 5958 value = 0; 5959 if (modified) { 5960 mask |= ATTR_S1_AP_RW_BIT; 5961 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5962 } 5963 if (accessed) { 5964 mask |= ATTR_AF | ATTR_DESCR_MASK; 5965 value |= ATTR_AF | L2_BLOCK; 5966 } 5967 rv = (pmap_load(pte) & mask) == value; 5968 PMAP_UNLOCK(pmap); 5969 if (rv) 5970 goto out; 5971 } 5972 } 5973 out: 5974 rw_runlock(lock); 5975 return (rv); 5976 } 5977 5978 /* 5979 * pmap_is_modified: 5980 * 5981 * Return whether or not the specified physical page was modified 5982 * in any physical maps. 5983 */ 5984 boolean_t 5985 pmap_is_modified(vm_page_t m) 5986 { 5987 5988 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5989 ("pmap_is_modified: page %p is not managed", m)); 5990 5991 /* 5992 * If the page is not busied then this check is racy. 5993 */ 5994 if (!pmap_page_is_write_mapped(m)) 5995 return (FALSE); 5996 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5997 } 5998 5999 /* 6000 * pmap_is_prefaultable: 6001 * 6002 * Return whether or not the specified virtual address is eligible 6003 * for prefault. 6004 */ 6005 boolean_t 6006 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 6007 { 6008 pd_entry_t *pde; 6009 pt_entry_t *pte; 6010 boolean_t rv; 6011 int lvl; 6012 6013 /* 6014 * Return TRUE if and only if the L3 entry for the specified virtual 6015 * address is allocated but invalid. 6016 */ 6017 rv = FALSE; 6018 PMAP_LOCK(pmap); 6019 pde = pmap_pde(pmap, addr, &lvl); 6020 if (pde != NULL && lvl == 2) { 6021 pte = pmap_l2_to_l3(pde, addr); 6022 rv = pmap_load(pte) == 0; 6023 } 6024 PMAP_UNLOCK(pmap); 6025 return (rv); 6026 } 6027 6028 /* 6029 * pmap_is_referenced: 6030 * 6031 * Return whether or not the specified physical page was referenced 6032 * in any physical maps. 6033 */ 6034 boolean_t 6035 pmap_is_referenced(vm_page_t m) 6036 { 6037 6038 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6039 ("pmap_is_referenced: page %p is not managed", m)); 6040 return (pmap_page_test_mappings(m, TRUE, FALSE)); 6041 } 6042 6043 /* 6044 * Clear the write and modified bits in each of the given page's mappings. 6045 */ 6046 void 6047 pmap_remove_write(vm_page_t m) 6048 { 6049 struct md_page *pvh; 6050 pmap_t pmap; 6051 struct rwlock *lock; 6052 pv_entry_t next_pv, pv; 6053 pt_entry_t oldpte, *pte, set, clear, mask, val; 6054 vm_offset_t va; 6055 int md_gen, pvh_gen; 6056 6057 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6058 ("pmap_remove_write: page %p is not managed", m)); 6059 vm_page_assert_busied(m); 6060 6061 if (!pmap_page_is_write_mapped(m)) 6062 return; 6063 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6064 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 6065 rw_wlock(lock); 6066 retry: 6067 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6068 pmap = PV_PMAP(pv); 6069 PMAP_ASSERT_STAGE1(pmap); 6070 if (!PMAP_TRYLOCK(pmap)) { 6071 pvh_gen = pvh->pv_gen; 6072 rw_wunlock(lock); 6073 PMAP_LOCK(pmap); 6074 rw_wlock(lock); 6075 if (pvh_gen != pvh->pv_gen) { 6076 PMAP_UNLOCK(pmap); 6077 goto retry; 6078 } 6079 } 6080 va = pv->pv_va; 6081 pte = pmap_pte_exists(pmap, va, 2, __func__); 6082 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 6083 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 6084 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6085 ("inconsistent pv lock %p %p for page %p", 6086 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6087 PMAP_UNLOCK(pmap); 6088 } 6089 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6090 pmap = PV_PMAP(pv); 6091 if (!PMAP_TRYLOCK(pmap)) { 6092 pvh_gen = pvh->pv_gen; 6093 md_gen = m->md.pv_gen; 6094 rw_wunlock(lock); 6095 PMAP_LOCK(pmap); 6096 rw_wlock(lock); 6097 if (pvh_gen != pvh->pv_gen || 6098 md_gen != m->md.pv_gen) { 6099 PMAP_UNLOCK(pmap); 6100 goto retry; 6101 } 6102 } 6103 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 6104 oldpte = pmap_load(pte); 6105 if ((oldpte & ATTR_SW_DBM) != 0) { 6106 if (pmap->pm_stage == PM_STAGE1) { 6107 set = ATTR_S1_AP_RW_BIT; 6108 clear = 0; 6109 mask = ATTR_S1_AP_RW_BIT; 6110 val = ATTR_S1_AP(ATTR_S1_AP_RW); 6111 } else { 6112 set = 0; 6113 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 6114 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 6115 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 6116 } 6117 clear |= ATTR_SW_DBM; 6118 while (!atomic_fcmpset_64(pte, &oldpte, 6119 (oldpte | set) & ~clear)) 6120 cpu_spinwait(); 6121 6122 if ((oldpte & mask) == val) 6123 vm_page_dirty(m); 6124 pmap_invalidate_page(pmap, pv->pv_va, true); 6125 } 6126 PMAP_UNLOCK(pmap); 6127 } 6128 rw_wunlock(lock); 6129 vm_page_aflag_clear(m, PGA_WRITEABLE); 6130 } 6131 6132 /* 6133 * pmap_ts_referenced: 6134 * 6135 * Return a count of reference bits for a page, clearing those bits. 6136 * It is not necessary for every reference bit to be cleared, but it 6137 * is necessary that 0 only be returned when there are truly no 6138 * reference bits set. 6139 * 6140 * As an optimization, update the page's dirty field if a modified bit is 6141 * found while counting reference bits. This opportunistic update can be 6142 * performed at low cost and can eliminate the need for some future calls 6143 * to pmap_is_modified(). However, since this function stops after 6144 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 6145 * dirty pages. Those dirty pages will only be detected by a future call 6146 * to pmap_is_modified(). 6147 */ 6148 int 6149 pmap_ts_referenced(vm_page_t m) 6150 { 6151 struct md_page *pvh; 6152 pv_entry_t pv, pvf; 6153 pmap_t pmap; 6154 struct rwlock *lock; 6155 pt_entry_t *pte, tpte; 6156 vm_offset_t va; 6157 vm_paddr_t pa; 6158 int cleared, md_gen, not_cleared, pvh_gen; 6159 struct spglist free; 6160 6161 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6162 ("pmap_ts_referenced: page %p is not managed", m)); 6163 SLIST_INIT(&free); 6164 cleared = 0; 6165 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 6166 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6167 rw_wlock(lock); 6168 retry: 6169 not_cleared = 0; 6170 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 6171 goto small_mappings; 6172 pv = pvf; 6173 do { 6174 if (pvf == NULL) 6175 pvf = pv; 6176 pmap = PV_PMAP(pv); 6177 if (!PMAP_TRYLOCK(pmap)) { 6178 pvh_gen = pvh->pv_gen; 6179 rw_wunlock(lock); 6180 PMAP_LOCK(pmap); 6181 rw_wlock(lock); 6182 if (pvh_gen != pvh->pv_gen) { 6183 PMAP_UNLOCK(pmap); 6184 goto retry; 6185 } 6186 } 6187 va = pv->pv_va; 6188 pte = pmap_pte_exists(pmap, va, 2, __func__); 6189 tpte = pmap_load(pte); 6190 if (pmap_pte_dirty(pmap, tpte)) { 6191 /* 6192 * Although "tpte" is mapping a 2MB page, because 6193 * this function is called at a 4KB page granularity, 6194 * we only update the 4KB page under test. 6195 */ 6196 vm_page_dirty(m); 6197 } 6198 if ((tpte & ATTR_AF) != 0) { 6199 pa = VM_PAGE_TO_PHYS(m); 6200 6201 /* 6202 * Since this reference bit is shared by 512 4KB pages, 6203 * it should not be cleared every time it is tested. 6204 * Apply a simple "hash" function on the physical page 6205 * number, the virtual superpage number, and the pmap 6206 * address to select one 4KB page out of the 512 on 6207 * which testing the reference bit will result in 6208 * clearing that reference bit. This function is 6209 * designed to avoid the selection of the same 4KB page 6210 * for every 2MB page mapping. 6211 * 6212 * On demotion, a mapping that hasn't been referenced 6213 * is simply destroyed. To avoid the possibility of a 6214 * subsequent page fault on a demoted wired mapping, 6215 * always leave its reference bit set. Moreover, 6216 * since the superpage is wired, the current state of 6217 * its reference bit won't affect page replacement. 6218 */ 6219 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ 6220 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 6221 (tpte & ATTR_SW_WIRED) == 0) { 6222 pmap_clear_bits(pte, ATTR_AF); 6223 pmap_invalidate_page(pmap, va, true); 6224 cleared++; 6225 } else 6226 not_cleared++; 6227 } 6228 PMAP_UNLOCK(pmap); 6229 /* Rotate the PV list if it has more than one entry. */ 6230 if (TAILQ_NEXT(pv, pv_next) != NULL) { 6231 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6232 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 6233 pvh->pv_gen++; 6234 } 6235 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 6236 goto out; 6237 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 6238 small_mappings: 6239 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 6240 goto out; 6241 pv = pvf; 6242 do { 6243 if (pvf == NULL) 6244 pvf = pv; 6245 pmap = PV_PMAP(pv); 6246 if (!PMAP_TRYLOCK(pmap)) { 6247 pvh_gen = pvh->pv_gen; 6248 md_gen = m->md.pv_gen; 6249 rw_wunlock(lock); 6250 PMAP_LOCK(pmap); 6251 rw_wlock(lock); 6252 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6253 PMAP_UNLOCK(pmap); 6254 goto retry; 6255 } 6256 } 6257 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 6258 tpte = pmap_load(pte); 6259 if (pmap_pte_dirty(pmap, tpte)) 6260 vm_page_dirty(m); 6261 if ((tpte & ATTR_AF) != 0) { 6262 if ((tpte & ATTR_SW_WIRED) == 0) { 6263 pmap_clear_bits(pte, ATTR_AF); 6264 pmap_invalidate_page(pmap, pv->pv_va, true); 6265 cleared++; 6266 } else 6267 not_cleared++; 6268 } 6269 PMAP_UNLOCK(pmap); 6270 /* Rotate the PV list if it has more than one entry. */ 6271 if (TAILQ_NEXT(pv, pv_next) != NULL) { 6272 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6273 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 6274 m->md.pv_gen++; 6275 } 6276 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 6277 not_cleared < PMAP_TS_REFERENCED_MAX); 6278 out: 6279 rw_wunlock(lock); 6280 vm_page_free_pages_toq(&free, true); 6281 return (cleared + not_cleared); 6282 } 6283 6284 /* 6285 * Apply the given advice to the specified range of addresses within the 6286 * given pmap. Depending on the advice, clear the referenced and/or 6287 * modified flags in each mapping and set the mapped page's dirty field. 6288 */ 6289 void 6290 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 6291 { 6292 struct rwlock *lock; 6293 vm_offset_t va, va_next; 6294 vm_page_t m; 6295 pd_entry_t *l0, *l1, *l2, oldl2; 6296 pt_entry_t *l3, oldl3; 6297 6298 PMAP_ASSERT_STAGE1(pmap); 6299 6300 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6301 return; 6302 6303 PMAP_LOCK(pmap); 6304 for (; sva < eva; sva = va_next) { 6305 l0 = pmap_l0(pmap, sva); 6306 if (pmap_load(l0) == 0) { 6307 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 6308 if (va_next < sva) 6309 va_next = eva; 6310 continue; 6311 } 6312 6313 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 6314 if (va_next < sva) 6315 va_next = eva; 6316 l1 = pmap_l0_to_l1(l0, sva); 6317 if (pmap_load(l1) == 0) 6318 continue; 6319 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6320 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6321 continue; 6322 } 6323 6324 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 6325 if (va_next < sva) 6326 va_next = eva; 6327 l2 = pmap_l1_to_l2(l1, sva); 6328 oldl2 = pmap_load(l2); 6329 if (oldl2 == 0) 6330 continue; 6331 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 6332 if ((oldl2 & ATTR_SW_MANAGED) == 0) 6333 continue; 6334 lock = NULL; 6335 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 6336 if (lock != NULL) 6337 rw_wunlock(lock); 6338 6339 /* 6340 * The 2MB page mapping was destroyed. 6341 */ 6342 continue; 6343 } 6344 6345 /* 6346 * Unless the page mappings are wired, remove the 6347 * mapping to a single page so that a subsequent 6348 * access may repromote. Choosing the last page 6349 * within the address range [sva, min(va_next, eva)) 6350 * generally results in more repromotions. Since the 6351 * underlying page table page is fully populated, this 6352 * removal never frees a page table page. 6353 */ 6354 if ((oldl2 & ATTR_SW_WIRED) == 0) { 6355 va = eva; 6356 if (va > va_next) 6357 va = va_next; 6358 va -= PAGE_SIZE; 6359 KASSERT(va >= sva, 6360 ("pmap_advise: no address gap")); 6361 l3 = pmap_l2_to_l3(l2, va); 6362 KASSERT(pmap_load(l3) != 0, 6363 ("pmap_advise: invalid PTE")); 6364 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 6365 NULL, &lock); 6366 } 6367 if (lock != NULL) 6368 rw_wunlock(lock); 6369 } 6370 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 6371 ("pmap_advise: invalid L2 entry after demotion")); 6372 if (va_next > eva) 6373 va_next = eva; 6374 va = va_next; 6375 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 6376 sva += L3_SIZE) { 6377 oldl3 = pmap_load(l3); 6378 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 6379 (ATTR_SW_MANAGED | L3_PAGE)) 6380 goto maybe_invlrng; 6381 else if (pmap_pte_dirty(pmap, oldl3)) { 6382 if (advice == MADV_DONTNEED) { 6383 /* 6384 * Future calls to pmap_is_modified() 6385 * can be avoided by making the page 6386 * dirty now. 6387 */ 6388 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl3)); 6389 vm_page_dirty(m); 6390 } 6391 while (!atomic_fcmpset_long(l3, &oldl3, 6392 (oldl3 & ~ATTR_AF) | 6393 ATTR_S1_AP(ATTR_S1_AP_RO))) 6394 cpu_spinwait(); 6395 } else if ((oldl3 & ATTR_AF) != 0) 6396 pmap_clear_bits(l3, ATTR_AF); 6397 else 6398 goto maybe_invlrng; 6399 if (va == va_next) 6400 va = sva; 6401 continue; 6402 maybe_invlrng: 6403 if (va != va_next) { 6404 pmap_s1_invalidate_range(pmap, va, sva, true); 6405 va = va_next; 6406 } 6407 } 6408 if (va != va_next) 6409 pmap_s1_invalidate_range(pmap, va, sva, true); 6410 } 6411 PMAP_UNLOCK(pmap); 6412 } 6413 6414 /* 6415 * Clear the modify bits on the specified physical page. 6416 */ 6417 void 6418 pmap_clear_modify(vm_page_t m) 6419 { 6420 struct md_page *pvh; 6421 struct rwlock *lock; 6422 pmap_t pmap; 6423 pv_entry_t next_pv, pv; 6424 pd_entry_t *l2, oldl2; 6425 pt_entry_t *l3, oldl3; 6426 vm_offset_t va; 6427 int md_gen, pvh_gen; 6428 6429 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6430 ("pmap_clear_modify: page %p is not managed", m)); 6431 vm_page_assert_busied(m); 6432 6433 if (!pmap_page_is_write_mapped(m)) 6434 return; 6435 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 6436 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6437 rw_wlock(lock); 6438 restart: 6439 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6440 pmap = PV_PMAP(pv); 6441 PMAP_ASSERT_STAGE1(pmap); 6442 if (!PMAP_TRYLOCK(pmap)) { 6443 pvh_gen = pvh->pv_gen; 6444 rw_wunlock(lock); 6445 PMAP_LOCK(pmap); 6446 rw_wlock(lock); 6447 if (pvh_gen != pvh->pv_gen) { 6448 PMAP_UNLOCK(pmap); 6449 goto restart; 6450 } 6451 } 6452 va = pv->pv_va; 6453 l2 = pmap_l2(pmap, va); 6454 oldl2 = pmap_load(l2); 6455 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 6456 if ((oldl2 & ATTR_SW_DBM) != 0 && 6457 pmap_demote_l2_locked(pmap, l2, va, &lock) && 6458 (oldl2 & ATTR_SW_WIRED) == 0) { 6459 /* 6460 * Write protect the mapping to a single page so that 6461 * a subsequent write access may repromote. 6462 */ 6463 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 6464 l3 = pmap_l2_to_l3(l2, va); 6465 oldl3 = pmap_load(l3); 6466 while (!atomic_fcmpset_long(l3, &oldl3, 6467 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 6468 cpu_spinwait(); 6469 vm_page_dirty(m); 6470 pmap_s1_invalidate_page(pmap, va, true); 6471 } 6472 PMAP_UNLOCK(pmap); 6473 } 6474 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6475 pmap = PV_PMAP(pv); 6476 PMAP_ASSERT_STAGE1(pmap); 6477 if (!PMAP_TRYLOCK(pmap)) { 6478 md_gen = m->md.pv_gen; 6479 pvh_gen = pvh->pv_gen; 6480 rw_wunlock(lock); 6481 PMAP_LOCK(pmap); 6482 rw_wlock(lock); 6483 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6484 PMAP_UNLOCK(pmap); 6485 goto restart; 6486 } 6487 } 6488 l2 = pmap_l2(pmap, pv->pv_va); 6489 l3 = pmap_l2_to_l3(l2, pv->pv_va); 6490 oldl3 = pmap_load(l3); 6491 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ 6492 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 6493 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 6494 } 6495 PMAP_UNLOCK(pmap); 6496 } 6497 rw_wunlock(lock); 6498 } 6499 6500 void * 6501 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6502 { 6503 struct pmap_preinit_mapping *ppim; 6504 vm_offset_t va, offset; 6505 pd_entry_t old_l2e, *pde; 6506 pt_entry_t *l2; 6507 int i, lvl, l2_blocks, free_l2_count, start_idx; 6508 6509 if (!vm_initialized) { 6510 /* 6511 * No L3 ptables so map entire L2 blocks where start VA is: 6512 * preinit_map_va + start_idx * L2_SIZE 6513 * There may be duplicate mappings (multiple VA -> same PA) but 6514 * ARM64 dcache is always PIPT so that's acceptable. 6515 */ 6516 if (size == 0) 6517 return (NULL); 6518 6519 /* Calculate how many L2 blocks are needed for the mapping */ 6520 l2_blocks = (roundup2(pa + size, L2_SIZE) - 6521 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 6522 6523 offset = pa & L2_OFFSET; 6524 6525 if (preinit_map_va == 0) 6526 return (NULL); 6527 6528 /* Map 2MiB L2 blocks from reserved VA space */ 6529 6530 free_l2_count = 0; 6531 start_idx = -1; 6532 /* Find enough free contiguous VA space */ 6533 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6534 ppim = pmap_preinit_mapping + i; 6535 if (free_l2_count > 0 && ppim->pa != 0) { 6536 /* Not enough space here */ 6537 free_l2_count = 0; 6538 start_idx = -1; 6539 continue; 6540 } 6541 6542 if (ppim->pa == 0) { 6543 /* Free L2 block */ 6544 if (start_idx == -1) 6545 start_idx = i; 6546 free_l2_count++; 6547 if (free_l2_count == l2_blocks) 6548 break; 6549 } 6550 } 6551 if (free_l2_count != l2_blocks) 6552 panic("%s: too many preinit mappings", __func__); 6553 6554 va = preinit_map_va + (start_idx * L2_SIZE); 6555 for (i = start_idx; i < start_idx + l2_blocks; i++) { 6556 /* Mark entries as allocated */ 6557 ppim = pmap_preinit_mapping + i; 6558 ppim->pa = pa; 6559 ppim->va = va + offset; 6560 ppim->size = size; 6561 } 6562 6563 /* Map L2 blocks */ 6564 pa = rounddown2(pa, L2_SIZE); 6565 old_l2e = 0; 6566 for (i = 0; i < l2_blocks; i++) { 6567 pde = pmap_pde(kernel_pmap, va, &lvl); 6568 KASSERT(pde != NULL, 6569 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 6570 va)); 6571 KASSERT(lvl == 1, 6572 ("pmap_mapbios: Invalid level %d", lvl)); 6573 6574 /* Insert L2_BLOCK */ 6575 l2 = pmap_l1_to_l2(pde, va); 6576 old_l2e |= pmap_load_store(l2, 6577 PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN | 6578 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 6579 L2_BLOCK); 6580 6581 va += L2_SIZE; 6582 pa += L2_SIZE; 6583 } 6584 if ((old_l2e & ATTR_DESCR_VALID) != 0) 6585 pmap_s1_invalidate_all(kernel_pmap); 6586 else { 6587 /* 6588 * Because the old entries were invalid and the new 6589 * mappings are not executable, an isb is not required. 6590 */ 6591 dsb(ishst); 6592 } 6593 6594 va = preinit_map_va + (start_idx * L2_SIZE); 6595 6596 } else { 6597 /* kva_alloc may be used to map the pages */ 6598 offset = pa & PAGE_MASK; 6599 size = round_page(offset + size); 6600 6601 va = kva_alloc(size); 6602 if (va == 0) 6603 panic("%s: Couldn't allocate KVA", __func__); 6604 6605 pde = pmap_pde(kernel_pmap, va, &lvl); 6606 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 6607 6608 /* L3 table is linked */ 6609 va = trunc_page(va); 6610 pa = trunc_page(pa); 6611 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 6612 } 6613 6614 return ((void *)(va + offset)); 6615 } 6616 6617 void 6618 pmap_unmapbios(void *p, vm_size_t size) 6619 { 6620 struct pmap_preinit_mapping *ppim; 6621 vm_offset_t offset, va, va_trunc; 6622 pd_entry_t *pde; 6623 pt_entry_t *l2; 6624 int i, lvl, l2_blocks, block; 6625 bool preinit_map; 6626 6627 va = (vm_offset_t)p; 6628 l2_blocks = 6629 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 6630 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 6631 6632 /* Remove preinit mapping */ 6633 preinit_map = false; 6634 block = 0; 6635 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6636 ppim = pmap_preinit_mapping + i; 6637 if (ppim->va == va) { 6638 KASSERT(ppim->size == size, 6639 ("pmap_unmapbios: size mismatch")); 6640 ppim->va = 0; 6641 ppim->pa = 0; 6642 ppim->size = 0; 6643 preinit_map = true; 6644 offset = block * L2_SIZE; 6645 va_trunc = rounddown2(va, L2_SIZE) + offset; 6646 6647 /* Remove L2_BLOCK */ 6648 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 6649 KASSERT(pde != NULL, 6650 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 6651 va_trunc)); 6652 l2 = pmap_l1_to_l2(pde, va_trunc); 6653 pmap_clear(l2); 6654 6655 if (block == (l2_blocks - 1)) 6656 break; 6657 block++; 6658 } 6659 } 6660 if (preinit_map) { 6661 pmap_s1_invalidate_all(kernel_pmap); 6662 return; 6663 } 6664 6665 /* Unmap the pages reserved with kva_alloc. */ 6666 if (vm_initialized) { 6667 offset = va & PAGE_MASK; 6668 size = round_page(offset + size); 6669 va = trunc_page(va); 6670 6671 /* Unmap and invalidate the pages */ 6672 pmap_kremove_device(va, size); 6673 6674 kva_free(va, size); 6675 } 6676 } 6677 6678 /* 6679 * Sets the memory attribute for the specified page. 6680 */ 6681 void 6682 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6683 { 6684 6685 m->md.pv_memattr = ma; 6686 6687 /* 6688 * If "m" is a normal page, update its direct mapping. This update 6689 * can be relied upon to perform any cache operations that are 6690 * required for data coherence. 6691 */ 6692 if ((m->flags & PG_FICTITIOUS) == 0 && 6693 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6694 m->md.pv_memattr) != 0) 6695 panic("memory attribute change on the direct map failed"); 6696 } 6697 6698 /* 6699 * Changes the specified virtual address range's memory type to that given by 6700 * the parameter "mode". The specified virtual address range must be 6701 * completely contained within either the direct map or the kernel map. If 6702 * the virtual address range is contained within the kernel map, then the 6703 * memory type for each of the corresponding ranges of the direct map is also 6704 * changed. (The corresponding ranges of the direct map are those ranges that 6705 * map the same physical pages as the specified virtual address range.) These 6706 * changes to the direct map are necessary because Intel describes the 6707 * behavior of their processors as "undefined" if two or more mappings to the 6708 * same physical page have different memory types. 6709 * 6710 * Returns zero if the change completed successfully, and either EINVAL or 6711 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6712 * of the virtual address range was not mapped, and ENOMEM is returned if 6713 * there was insufficient memory available to complete the change. In the 6714 * latter case, the memory type may have been changed on some part of the 6715 * virtual address range or the direct map. 6716 */ 6717 int 6718 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6719 { 6720 int error; 6721 6722 PMAP_LOCK(kernel_pmap); 6723 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false); 6724 PMAP_UNLOCK(kernel_pmap); 6725 return (error); 6726 } 6727 6728 /* 6729 * Changes the specified virtual address range's protections to those 6730 * specified by "prot". Like pmap_change_attr(), protections for aliases 6731 * in the direct map are updated as well. Protections on aliasing mappings may 6732 * be a subset of the requested protections; for example, mappings in the direct 6733 * map are never executable. 6734 */ 6735 int 6736 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 6737 { 6738 int error; 6739 6740 /* Only supported within the kernel map. */ 6741 if (va < VM_MIN_KERNEL_ADDRESS) 6742 return (EINVAL); 6743 6744 PMAP_LOCK(kernel_pmap); 6745 error = pmap_change_props_locked(va, size, prot, -1, false); 6746 PMAP_UNLOCK(kernel_pmap); 6747 return (error); 6748 } 6749 6750 static int 6751 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 6752 int mode, bool skip_unmapped) 6753 { 6754 vm_offset_t base, offset, tmpva; 6755 vm_size_t pte_size; 6756 vm_paddr_t pa; 6757 pt_entry_t pte, *ptep, *newpte; 6758 pt_entry_t bits, mask; 6759 int lvl, rv; 6760 6761 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6762 base = trunc_page(va); 6763 offset = va & PAGE_MASK; 6764 size = round_page(offset + size); 6765 6766 if (!VIRT_IN_DMAP(base) && 6767 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 6768 return (EINVAL); 6769 6770 bits = 0; 6771 mask = 0; 6772 if (mode != -1) { 6773 bits = ATTR_S1_IDX(mode); 6774 mask = ATTR_S1_IDX_MASK; 6775 if (mode == VM_MEMATTR_DEVICE) { 6776 mask |= ATTR_S1_XN; 6777 bits |= ATTR_S1_XN; 6778 } 6779 } 6780 if (prot != VM_PROT_NONE) { 6781 /* Don't mark the DMAP as executable. It never is on arm64. */ 6782 if (VIRT_IN_DMAP(base)) { 6783 prot &= ~VM_PROT_EXECUTE; 6784 /* 6785 * XXX Mark the DMAP as writable for now. We rely 6786 * on this in ddb & dtrace to insert breakpoint 6787 * instructions. 6788 */ 6789 prot |= VM_PROT_WRITE; 6790 } 6791 6792 if ((prot & VM_PROT_WRITE) == 0) { 6793 bits |= ATTR_S1_AP(ATTR_S1_AP_RO); 6794 } 6795 if ((prot & VM_PROT_EXECUTE) == 0) { 6796 bits |= ATTR_S1_PXN; 6797 } 6798 bits |= ATTR_S1_UXN; 6799 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN; 6800 } 6801 6802 for (tmpva = base; tmpva < base + size; ) { 6803 ptep = pmap_pte(kernel_pmap, tmpva, &lvl); 6804 if (ptep == NULL && !skip_unmapped) { 6805 return (EINVAL); 6806 } else if ((ptep == NULL && skip_unmapped) || 6807 (pmap_load(ptep) & mask) == bits) { 6808 /* 6809 * We already have the correct attribute or there 6810 * is no memory mapped at this address and we are 6811 * skipping unmapped memory. 6812 */ 6813 switch (lvl) { 6814 default: 6815 panic("Invalid DMAP table level: %d\n", lvl); 6816 case 1: 6817 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 6818 break; 6819 case 2: 6820 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 6821 break; 6822 case 3: 6823 tmpva += PAGE_SIZE; 6824 break; 6825 } 6826 } else { 6827 /* We can't demote/promote this entry */ 6828 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0); 6829 6830 /* 6831 * Split the entry to an level 3 table, then 6832 * set the new attribute. 6833 */ 6834 switch (lvl) { 6835 default: 6836 panic("Invalid DMAP table level: %d\n", lvl); 6837 case 1: 6838 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6839 if ((tmpva & L1_OFFSET) == 0 && 6840 (base + size - tmpva) >= L1_SIZE) { 6841 pte_size = L1_SIZE; 6842 break; 6843 } 6844 newpte = pmap_demote_l1(kernel_pmap, ptep, 6845 tmpva & ~L1_OFFSET); 6846 if (newpte == NULL) 6847 return (EINVAL); 6848 ptep = pmap_l1_to_l2(ptep, tmpva); 6849 /* FALLTHROUGH */ 6850 case 2: 6851 if ((tmpva & L2_OFFSET) == 0 && 6852 (base + size - tmpva) >= L2_SIZE) { 6853 pte_size = L2_SIZE; 6854 break; 6855 } 6856 newpte = pmap_demote_l2(kernel_pmap, ptep, 6857 tmpva); 6858 if (newpte == NULL) 6859 return (EINVAL); 6860 ptep = pmap_l2_to_l3(ptep, tmpva); 6861 /* FALLTHROUGH */ 6862 case 3: 6863 pte_size = PAGE_SIZE; 6864 break; 6865 } 6866 6867 /* Update the entry */ 6868 pte = pmap_load(ptep); 6869 pte &= ~mask; 6870 pte |= bits; 6871 6872 pmap_update_entry(kernel_pmap, ptep, pte, tmpva, 6873 pte_size); 6874 6875 pa = PTE_TO_PHYS(pte); 6876 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) { 6877 /* 6878 * Keep the DMAP memory in sync. 6879 */ 6880 rv = pmap_change_props_locked( 6881 PHYS_TO_DMAP(pa), pte_size, 6882 prot, mode, true); 6883 if (rv != 0) 6884 return (rv); 6885 } 6886 6887 /* 6888 * If moving to a non-cacheable entry flush 6889 * the cache. 6890 */ 6891 if (mode == VM_MEMATTR_UNCACHEABLE) 6892 cpu_dcache_wbinv_range(tmpva, pte_size); 6893 tmpva += pte_size; 6894 } 6895 } 6896 6897 return (0); 6898 } 6899 6900 /* 6901 * Create an L2 table to map all addresses within an L1 mapping. 6902 */ 6903 static pt_entry_t * 6904 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 6905 { 6906 pt_entry_t *l2, newl2, oldl1; 6907 vm_offset_t tmpl1; 6908 vm_paddr_t l2phys, phys; 6909 vm_page_t ml2; 6910 int i; 6911 6912 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6913 oldl1 = pmap_load(l1); 6914 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6915 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 6916 ("pmap_demote_l1: Demoting a non-block entry")); 6917 KASSERT((va & L1_OFFSET) == 0, 6918 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 6919 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 6920 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 6921 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0, 6922 ("pmap_demote_l1: Demoting entry with no-demote flag set")); 6923 6924 tmpl1 = 0; 6925 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 6926 tmpl1 = kva_alloc(PAGE_SIZE); 6927 if (tmpl1 == 0) 6928 return (NULL); 6929 } 6930 6931 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) == 6932 NULL) { 6933 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 6934 " in pmap %p", va, pmap); 6935 l2 = NULL; 6936 goto fail; 6937 } 6938 6939 l2phys = VM_PAGE_TO_PHYS(ml2); 6940 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 6941 6942 /* Address the range points at */ 6943 phys = PTE_TO_PHYS(oldl1); 6944 /* The attributed from the old l1 table to be copied */ 6945 newl2 = oldl1 & ATTR_MASK; 6946 6947 /* Create the new entries */ 6948 for (i = 0; i < Ln_ENTRIES; i++) { 6949 l2[i] = newl2 | phys; 6950 phys += L2_SIZE; 6951 } 6952 KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), 6953 ("Invalid l2 page (%lx != %lx)", l2[0], 6954 (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 6955 6956 if (tmpl1 != 0) { 6957 pmap_kenter(tmpl1, PAGE_SIZE, 6958 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, 6959 VM_MEMATTR_WRITE_BACK); 6960 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 6961 } 6962 6963 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 6964 6965 fail: 6966 if (tmpl1 != 0) { 6967 pmap_kremove(tmpl1); 6968 kva_free(tmpl1, PAGE_SIZE); 6969 } 6970 6971 return (l2); 6972 } 6973 6974 static void 6975 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 6976 { 6977 pt_entry_t *l3; 6978 6979 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 6980 *l3 = newl3; 6981 newl3 += L3_SIZE; 6982 } 6983 } 6984 6985 static void 6986 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused) 6987 { 6988 #ifdef INVARIANTS 6989 #ifdef DIAGNOSTIC 6990 pt_entry_t *xl3p, *yl3p; 6991 6992 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES; 6993 xl3p++, newl3e += PAGE_SIZE) { 6994 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) { 6995 printf("pmap_demote_l2: xl3e %zd and newl3e map " 6996 "different pages: found %#lx, expected %#lx\n", 6997 xl3p - firstl3p, pmap_load(xl3p), newl3e); 6998 printf("page table dump\n"); 6999 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES; 7000 yl3p++) { 7001 printf("%zd %#lx\n", yl3p - firstl3p, 7002 pmap_load(yl3p)); 7003 } 7004 panic("firstpte"); 7005 } 7006 } 7007 #else 7008 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e), 7009 ("pmap_demote_l2: firstl3 and newl3e map different physical" 7010 " addresses")); 7011 #endif 7012 #endif 7013 } 7014 7015 static void 7016 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 7017 struct rwlock **lockp) 7018 { 7019 struct spglist free; 7020 7021 SLIST_INIT(&free); 7022 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, 7023 lockp); 7024 vm_page_free_pages_toq(&free, true); 7025 } 7026 7027 /* 7028 * Create an L3 table to map all addresses within an L2 mapping. 7029 */ 7030 static pt_entry_t * 7031 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 7032 struct rwlock **lockp) 7033 { 7034 pt_entry_t *l3, newl3, oldl2; 7035 vm_offset_t tmpl2; 7036 vm_paddr_t l3phys; 7037 vm_page_t ml3; 7038 7039 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7040 PMAP_ASSERT_STAGE1(pmap); 7041 KASSERT(ADDR_IS_CANONICAL(va), 7042 ("%s: Address not in canonical form: %lx", __func__, va)); 7043 7044 l3 = NULL; 7045 oldl2 = pmap_load(l2); 7046 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 7047 ("pmap_demote_l2: Demoting a non-block entry")); 7048 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0, 7049 ("pmap_demote_l2: Demoting entry with no-demote flag set")); 7050 va &= ~L2_OFFSET; 7051 7052 tmpl2 = 0; 7053 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 7054 tmpl2 = kva_alloc(PAGE_SIZE); 7055 if (tmpl2 == 0) 7056 return (NULL); 7057 } 7058 7059 /* 7060 * Invalidate the 2MB page mapping and return "failure" if the 7061 * mapping was never accessed. 7062 */ 7063 if ((oldl2 & ATTR_AF) == 0) { 7064 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 7065 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 7066 pmap_demote_l2_abort(pmap, va, l2, lockp); 7067 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 7068 va, pmap); 7069 goto fail; 7070 } 7071 7072 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 7073 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 7074 ("pmap_demote_l2: page table page for a wired mapping" 7075 " is missing")); 7076 7077 /* 7078 * If the page table page is missing and the mapping 7079 * is for a kernel address, the mapping must belong to 7080 * either the direct map or the early kernel memory. 7081 * Page table pages are preallocated for every other 7082 * part of the kernel address space, so the direct map 7083 * region and early kernel memory are the only parts of the 7084 * kernel address space that must be handled here. 7085 */ 7086 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || 7087 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), 7088 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 7089 7090 /* 7091 * If the 2MB page mapping belongs to the direct map 7092 * region of the kernel's address space, then the page 7093 * allocation request specifies the highest possible 7094 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 7095 * priority is normal. 7096 */ 7097 ml3 = vm_page_alloc_noobj( 7098 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 7099 VM_ALLOC_WIRED); 7100 7101 /* 7102 * If the allocation of the new page table page fails, 7103 * invalidate the 2MB page mapping and return "failure". 7104 */ 7105 if (ml3 == NULL) { 7106 pmap_demote_l2_abort(pmap, va, l2, lockp); 7107 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 7108 " in pmap %p", va, pmap); 7109 goto fail; 7110 } 7111 ml3->pindex = pmap_l2_pindex(va); 7112 7113 if (!ADDR_IS_KERNEL(va)) { 7114 ml3->ref_count = NL3PG; 7115 pmap_resident_count_inc(pmap, 1); 7116 } 7117 } 7118 l3phys = VM_PAGE_TO_PHYS(ml3); 7119 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 7120 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 7121 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 7122 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 7123 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 7124 7125 /* 7126 * If the PTP is not leftover from an earlier promotion or it does not 7127 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all 7128 * have ATTR_AF set. 7129 * 7130 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 7131 * performs a dsb(). That dsb() ensures that the stores for filling 7132 * "l3" are visible before "l3" is added to the page table. 7133 */ 7134 if (!vm_page_all_valid(ml3)) 7135 pmap_fill_l3(l3, newl3); 7136 7137 pmap_demote_l2_check(l3, newl3); 7138 7139 /* 7140 * If the mapping has changed attributes, update the L3Es. 7141 */ 7142 if ((pmap_load(l3) & (ATTR_MASK & ~ATTR_AF)) != (newl3 & (ATTR_MASK & 7143 ~ATTR_AF))) 7144 pmap_fill_l3(l3, newl3); 7145 7146 /* 7147 * Map the temporary page so we don't lose access to the l2 table. 7148 */ 7149 if (tmpl2 != 0) { 7150 pmap_kenter(tmpl2, PAGE_SIZE, 7151 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 7152 VM_MEMATTR_WRITE_BACK); 7153 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 7154 } 7155 7156 /* 7157 * The spare PV entries must be reserved prior to demoting the 7158 * mapping, that is, prior to changing the PDE. Otherwise, the state 7159 * of the L2 and the PV lists will be inconsistent, which can result 7160 * in reclaim_pv_chunk() attempting to remove a PV entry from the 7161 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 7162 * PV entry for the 2MB page mapping that is being demoted. 7163 */ 7164 if ((oldl2 & ATTR_SW_MANAGED) != 0) 7165 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 7166 7167 /* 7168 * Pass PAGE_SIZE so that a single TLB invalidation is performed on 7169 * the 2MB page mapping. 7170 */ 7171 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); 7172 7173 /* 7174 * Demote the PV entry. 7175 */ 7176 if ((oldl2 & ATTR_SW_MANAGED) != 0) 7177 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 7178 7179 atomic_add_long(&pmap_l2_demotions, 1); 7180 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" 7181 " in pmap %p %lx", va, pmap, l3[0]); 7182 7183 fail: 7184 if (tmpl2 != 0) { 7185 pmap_kremove(tmpl2); 7186 kva_free(tmpl2, PAGE_SIZE); 7187 } 7188 7189 return (l3); 7190 7191 } 7192 7193 static pt_entry_t * 7194 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 7195 { 7196 struct rwlock *lock; 7197 pt_entry_t *l3; 7198 7199 lock = NULL; 7200 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); 7201 if (lock != NULL) 7202 rw_wunlock(lock); 7203 return (l3); 7204 } 7205 7206 /* 7207 * Perform the pmap work for mincore(2). If the page is not both referenced and 7208 * modified by this pmap, returns its physical address so that the caller can 7209 * find other mappings. 7210 */ 7211 int 7212 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 7213 { 7214 pt_entry_t *pte, tpte; 7215 vm_paddr_t mask, pa; 7216 int lvl, val; 7217 bool managed; 7218 7219 PMAP_ASSERT_STAGE1(pmap); 7220 PMAP_LOCK(pmap); 7221 pte = pmap_pte(pmap, addr, &lvl); 7222 if (pte != NULL) { 7223 tpte = pmap_load(pte); 7224 7225 switch (lvl) { 7226 case 3: 7227 mask = L3_OFFSET; 7228 break; 7229 case 2: 7230 mask = L2_OFFSET; 7231 break; 7232 case 1: 7233 mask = L1_OFFSET; 7234 break; 7235 default: 7236 panic("pmap_mincore: invalid level %d", lvl); 7237 } 7238 7239 managed = (tpte & ATTR_SW_MANAGED) != 0; 7240 val = MINCORE_INCORE; 7241 if (lvl != 3) 7242 val |= MINCORE_PSIND(3 - lvl); 7243 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 7244 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 7245 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 7246 if ((tpte & ATTR_AF) == ATTR_AF) 7247 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 7248 7249 pa = PTE_TO_PHYS(tpte) | (addr & mask); 7250 } else { 7251 managed = false; 7252 val = 0; 7253 } 7254 7255 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 7256 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 7257 *pap = pa; 7258 } 7259 PMAP_UNLOCK(pmap); 7260 return (val); 7261 } 7262 7263 /* 7264 * Garbage collect every ASID that is neither active on a processor nor 7265 * reserved. 7266 */ 7267 static void 7268 pmap_reset_asid_set(pmap_t pmap) 7269 { 7270 pmap_t curpmap; 7271 int asid, cpuid, epoch; 7272 struct asid_set *set; 7273 enum pmap_stage stage; 7274 7275 set = pmap->pm_asid_set; 7276 stage = pmap->pm_stage; 7277 7278 set = pmap->pm_asid_set; 7279 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 7280 mtx_assert(&set->asid_set_mutex, MA_OWNED); 7281 7282 /* 7283 * Ensure that the store to asid_epoch is globally visible before the 7284 * loads from pc_curpmap are performed. 7285 */ 7286 epoch = set->asid_epoch + 1; 7287 if (epoch == INT_MAX) 7288 epoch = 0; 7289 set->asid_epoch = epoch; 7290 dsb(ishst); 7291 if (stage == PM_STAGE1) { 7292 __asm __volatile("tlbi vmalle1is"); 7293 } else { 7294 KASSERT(pmap_clean_stage2_tlbi != NULL, 7295 ("%s: Unset stage 2 tlb invalidation callback\n", 7296 __func__)); 7297 pmap_clean_stage2_tlbi(); 7298 } 7299 dsb(ish); 7300 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 7301 set->asid_set_size - 1); 7302 CPU_FOREACH(cpuid) { 7303 if (cpuid == curcpu) 7304 continue; 7305 if (stage == PM_STAGE1) { 7306 curpmap = pcpu_find(cpuid)->pc_curpmap; 7307 PMAP_ASSERT_STAGE1(pmap); 7308 } else { 7309 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 7310 if (curpmap == NULL) 7311 continue; 7312 PMAP_ASSERT_STAGE2(pmap); 7313 } 7314 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 7315 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 7316 if (asid == -1) 7317 continue; 7318 bit_set(set->asid_set, asid); 7319 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 7320 } 7321 } 7322 7323 /* 7324 * Allocate a new ASID for the specified pmap. 7325 */ 7326 static void 7327 pmap_alloc_asid(pmap_t pmap) 7328 { 7329 struct asid_set *set; 7330 int new_asid; 7331 7332 set = pmap->pm_asid_set; 7333 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 7334 7335 mtx_lock_spin(&set->asid_set_mutex); 7336 7337 /* 7338 * While this processor was waiting to acquire the asid set mutex, 7339 * pmap_reset_asid_set() running on another processor might have 7340 * updated this pmap's cookie to the current epoch. In which case, we 7341 * don't need to allocate a new ASID. 7342 */ 7343 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 7344 goto out; 7345 7346 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 7347 &new_asid); 7348 if (new_asid == -1) { 7349 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 7350 set->asid_next, &new_asid); 7351 if (new_asid == -1) { 7352 pmap_reset_asid_set(pmap); 7353 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 7354 set->asid_set_size, &new_asid); 7355 KASSERT(new_asid != -1, ("ASID allocation failure")); 7356 } 7357 } 7358 bit_set(set->asid_set, new_asid); 7359 set->asid_next = new_asid + 1; 7360 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 7361 out: 7362 mtx_unlock_spin(&set->asid_set_mutex); 7363 } 7364 7365 static uint64_t __read_mostly ttbr_flags; 7366 7367 /* 7368 * Compute the value that should be stored in ttbr0 to activate the specified 7369 * pmap. This value may change from time to time. 7370 */ 7371 uint64_t 7372 pmap_to_ttbr0(pmap_t pmap) 7373 { 7374 uint64_t ttbr; 7375 7376 ttbr = pmap->pm_ttbr; 7377 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 7378 ttbr |= ttbr_flags; 7379 7380 return (ttbr); 7381 } 7382 7383 static void 7384 pmap_set_cnp(void *arg) 7385 { 7386 uint64_t ttbr0, ttbr1; 7387 u_int cpuid; 7388 7389 cpuid = *(u_int *)arg; 7390 if (cpuid == curcpu) { 7391 /* 7392 * Set the flags while all CPUs are handling the 7393 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls 7394 * to pmap_to_ttbr0 after this will have the CnP flag set. 7395 * The dsb after invalidating the TLB will act as a barrier 7396 * to ensure all CPUs can observe this change. 7397 */ 7398 ttbr_flags |= TTBR_CnP; 7399 } 7400 7401 ttbr0 = READ_SPECIALREG(ttbr0_el1); 7402 ttbr0 |= TTBR_CnP; 7403 7404 ttbr1 = READ_SPECIALREG(ttbr1_el1); 7405 ttbr1 |= TTBR_CnP; 7406 7407 /* Update ttbr{0,1}_el1 with the CnP flag */ 7408 WRITE_SPECIALREG(ttbr0_el1, ttbr0); 7409 WRITE_SPECIALREG(ttbr1_el1, ttbr1); 7410 isb(); 7411 __asm __volatile("tlbi vmalle1is"); 7412 dsb(ish); 7413 isb(); 7414 } 7415 7416 /* 7417 * Defer enabling CnP until we have read the ID registers to know if it's 7418 * supported on all CPUs. 7419 */ 7420 static void 7421 pmap_init_cnp(void *dummy __unused) 7422 { 7423 uint64_t reg; 7424 u_int cpuid; 7425 7426 if (!get_kernel_reg(ID_AA64MMFR2_EL1, ®)) 7427 return; 7428 7429 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) { 7430 if (bootverbose) 7431 printf("Enabling CnP\n"); 7432 cpuid = curcpu; 7433 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid); 7434 } 7435 7436 } 7437 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL); 7438 7439 static bool 7440 pmap_activate_int(pmap_t pmap) 7441 { 7442 struct asid_set *set; 7443 int epoch; 7444 7445 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 7446 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 7447 7448 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 7449 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 7450 /* 7451 * Handle the possibility that the old thread was preempted 7452 * after an "ic" or "tlbi" instruction but before it performed 7453 * a "dsb" instruction. If the old thread migrates to a new 7454 * processor, its completion of a "dsb" instruction on that 7455 * new processor does not guarantee that the "ic" or "tlbi" 7456 * instructions performed on the old processor have completed. 7457 */ 7458 dsb(ish); 7459 return (false); 7460 } 7461 7462 set = pmap->pm_asid_set; 7463 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 7464 7465 /* 7466 * Ensure that the store to curpmap is globally visible before the 7467 * load from asid_epoch is performed. 7468 */ 7469 if (pmap->pm_stage == PM_STAGE1) 7470 PCPU_SET(curpmap, pmap); 7471 else 7472 PCPU_SET(curvmpmap, pmap); 7473 dsb(ish); 7474 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 7475 if (epoch >= 0 && epoch != set->asid_epoch) 7476 pmap_alloc_asid(pmap); 7477 7478 if (pmap->pm_stage == PM_STAGE1) { 7479 set_ttbr0(pmap_to_ttbr0(pmap)); 7480 if (PCPU_GET(bcast_tlbi_workaround) != 0) 7481 invalidate_local_icache(); 7482 } 7483 return (true); 7484 } 7485 7486 void 7487 pmap_activate_vm(pmap_t pmap) 7488 { 7489 7490 PMAP_ASSERT_STAGE2(pmap); 7491 7492 (void)pmap_activate_int(pmap); 7493 } 7494 7495 void 7496 pmap_activate(struct thread *td) 7497 { 7498 pmap_t pmap; 7499 7500 pmap = vmspace_pmap(td->td_proc->p_vmspace); 7501 PMAP_ASSERT_STAGE1(pmap); 7502 critical_enter(); 7503 (void)pmap_activate_int(pmap); 7504 critical_exit(); 7505 } 7506 7507 /* 7508 * Activate the thread we are switching to. 7509 * To simplify the assembly in cpu_throw return the new threads pcb. 7510 */ 7511 struct pcb * 7512 pmap_switch(struct thread *new) 7513 { 7514 pcpu_bp_harden bp_harden; 7515 struct pcb *pcb; 7516 7517 /* Store the new curthread */ 7518 PCPU_SET(curthread, new); 7519 7520 /* And the new pcb */ 7521 pcb = new->td_pcb; 7522 PCPU_SET(curpcb, pcb); 7523 7524 /* 7525 * TODO: We may need to flush the cache here if switching 7526 * to a user process. 7527 */ 7528 7529 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { 7530 /* 7531 * Stop userspace from training the branch predictor against 7532 * other processes. This will call into a CPU specific 7533 * function that clears the branch predictor state. 7534 */ 7535 bp_harden = PCPU_GET(bp_harden); 7536 if (bp_harden != NULL) 7537 bp_harden(); 7538 } 7539 7540 return (pcb); 7541 } 7542 7543 void 7544 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 7545 { 7546 7547 PMAP_ASSERT_STAGE1(pmap); 7548 KASSERT(ADDR_IS_CANONICAL(va), 7549 ("%s: Address not in canonical form: %lx", __func__, va)); 7550 7551 if (ADDR_IS_KERNEL(va)) { 7552 cpu_icache_sync_range(va, sz); 7553 } else { 7554 u_int len, offset; 7555 vm_paddr_t pa; 7556 7557 /* Find the length of data in this page to flush */ 7558 offset = va & PAGE_MASK; 7559 len = imin(PAGE_SIZE - offset, sz); 7560 7561 while (sz != 0) { 7562 /* Extract the physical address & find it in the DMAP */ 7563 pa = pmap_extract(pmap, va); 7564 if (pa != 0) 7565 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); 7566 7567 /* Move to the next page */ 7568 sz -= len; 7569 va += len; 7570 /* Set the length for the next iteration */ 7571 len = imin(PAGE_SIZE, sz); 7572 } 7573 } 7574 } 7575 7576 static int 7577 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) 7578 { 7579 pd_entry_t *pdep; 7580 pt_entry_t *ptep, pte; 7581 int rv, lvl, dfsc; 7582 7583 PMAP_ASSERT_STAGE2(pmap); 7584 rv = KERN_FAILURE; 7585 7586 /* Data and insn aborts use same encoding for FSC field. */ 7587 dfsc = esr & ISS_DATA_DFSC_MASK; 7588 switch (dfsc) { 7589 case ISS_DATA_DFSC_TF_L0: 7590 case ISS_DATA_DFSC_TF_L1: 7591 case ISS_DATA_DFSC_TF_L2: 7592 case ISS_DATA_DFSC_TF_L3: 7593 PMAP_LOCK(pmap); 7594 pdep = pmap_pde(pmap, far, &lvl); 7595 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) { 7596 PMAP_UNLOCK(pmap); 7597 break; 7598 } 7599 7600 switch (lvl) { 7601 case 0: 7602 ptep = pmap_l0_to_l1(pdep, far); 7603 break; 7604 case 1: 7605 ptep = pmap_l1_to_l2(pdep, far); 7606 break; 7607 case 2: 7608 ptep = pmap_l2_to_l3(pdep, far); 7609 break; 7610 default: 7611 panic("%s: Invalid pde level %d", __func__,lvl); 7612 } 7613 goto fault_exec; 7614 7615 case ISS_DATA_DFSC_AFF_L1: 7616 case ISS_DATA_DFSC_AFF_L2: 7617 case ISS_DATA_DFSC_AFF_L3: 7618 PMAP_LOCK(pmap); 7619 ptep = pmap_pte(pmap, far, &lvl); 7620 fault_exec: 7621 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) { 7622 if (icache_vmid) { 7623 pmap_invalidate_vpipt_icache(); 7624 } else { 7625 /* 7626 * If accessing an executable page invalidate 7627 * the I-cache so it will be valid when we 7628 * continue execution in the guest. The D-cache 7629 * is assumed to already be clean to the Point 7630 * of Coherency. 7631 */ 7632 if ((pte & ATTR_S2_XN_MASK) != 7633 ATTR_S2_XN(ATTR_S2_XN_NONE)) { 7634 invalidate_icache(); 7635 } 7636 } 7637 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID); 7638 rv = KERN_SUCCESS; 7639 } 7640 PMAP_UNLOCK(pmap); 7641 break; 7642 } 7643 7644 return (rv); 7645 } 7646 7647 int 7648 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) 7649 { 7650 pt_entry_t pte, *ptep; 7651 register_t intr; 7652 uint64_t ec, par; 7653 int lvl, rv; 7654 7655 rv = KERN_FAILURE; 7656 7657 ec = ESR_ELx_EXCEPTION(esr); 7658 switch (ec) { 7659 case EXCP_INSN_ABORT_L: 7660 case EXCP_INSN_ABORT: 7661 case EXCP_DATA_ABORT_L: 7662 case EXCP_DATA_ABORT: 7663 break; 7664 default: 7665 return (rv); 7666 } 7667 7668 if (pmap->pm_stage == PM_STAGE2) 7669 return (pmap_stage2_fault(pmap, esr, far)); 7670 7671 /* Data and insn aborts use same encoding for FSC field. */ 7672 switch (esr & ISS_DATA_DFSC_MASK) { 7673 case ISS_DATA_DFSC_AFF_L1: 7674 case ISS_DATA_DFSC_AFF_L2: 7675 case ISS_DATA_DFSC_AFF_L3: 7676 PMAP_LOCK(pmap); 7677 ptep = pmap_pte(pmap, far, &lvl); 7678 if (ptep != NULL) { 7679 pmap_set_bits(ptep, ATTR_AF); 7680 rv = KERN_SUCCESS; 7681 /* 7682 * XXXMJ as an optimization we could mark the entry 7683 * dirty if this is a write fault. 7684 */ 7685 } 7686 PMAP_UNLOCK(pmap); 7687 break; 7688 case ISS_DATA_DFSC_PF_L1: 7689 case ISS_DATA_DFSC_PF_L2: 7690 case ISS_DATA_DFSC_PF_L3: 7691 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || 7692 (esr & ISS_DATA_WnR) == 0) 7693 return (rv); 7694 PMAP_LOCK(pmap); 7695 ptep = pmap_pte(pmap, far, &lvl); 7696 if (ptep != NULL && 7697 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { 7698 if ((pte & ATTR_S1_AP_RW_BIT) == 7699 ATTR_S1_AP(ATTR_S1_AP_RO)) { 7700 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); 7701 pmap_s1_invalidate_page(pmap, far, true); 7702 } 7703 rv = KERN_SUCCESS; 7704 } 7705 PMAP_UNLOCK(pmap); 7706 break; 7707 case ISS_DATA_DFSC_TF_L0: 7708 case ISS_DATA_DFSC_TF_L1: 7709 case ISS_DATA_DFSC_TF_L2: 7710 case ISS_DATA_DFSC_TF_L3: 7711 /* 7712 * Retry the translation. A break-before-make sequence can 7713 * produce a transient fault. 7714 */ 7715 if (pmap == kernel_pmap) { 7716 /* 7717 * The translation fault may have occurred within a 7718 * critical section. Therefore, we must check the 7719 * address without acquiring the kernel pmap's lock. 7720 */ 7721 if (pmap_klookup(far, NULL)) 7722 rv = KERN_SUCCESS; 7723 } else { 7724 PMAP_LOCK(pmap); 7725 /* Ask the MMU to check the address. */ 7726 intr = intr_disable(); 7727 par = arm64_address_translate_s1e0r(far); 7728 intr_restore(intr); 7729 PMAP_UNLOCK(pmap); 7730 7731 /* 7732 * If the translation was successful, then we can 7733 * return success to the trap handler. 7734 */ 7735 if (PAR_SUCCESS(par)) 7736 rv = KERN_SUCCESS; 7737 } 7738 break; 7739 } 7740 7741 return (rv); 7742 } 7743 7744 /* 7745 * Increase the starting virtual address of the given mapping if a 7746 * different alignment might result in more superpage mappings. 7747 */ 7748 void 7749 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 7750 vm_offset_t *addr, vm_size_t size) 7751 { 7752 vm_offset_t superpage_offset; 7753 7754 if (size < L2_SIZE) 7755 return; 7756 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 7757 offset += ptoa(object->pg_color); 7758 superpage_offset = offset & L2_OFFSET; 7759 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 7760 (*addr & L2_OFFSET) == superpage_offset) 7761 return; 7762 if ((*addr & L2_OFFSET) < superpage_offset) 7763 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 7764 else 7765 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 7766 } 7767 7768 /** 7769 * Get the kernel virtual address of a set of physical pages. If there are 7770 * physical addresses not covered by the DMAP perform a transient mapping 7771 * that will be removed when calling pmap_unmap_io_transient. 7772 * 7773 * \param page The pages the caller wishes to obtain the virtual 7774 * address on the kernel memory map. 7775 * \param vaddr On return contains the kernel virtual memory address 7776 * of the pages passed in the page parameter. 7777 * \param count Number of pages passed in. 7778 * \param can_fault true if the thread using the mapped pages can take 7779 * page faults, false otherwise. 7780 * 7781 * \returns true if the caller must call pmap_unmap_io_transient when 7782 * finished or false otherwise. 7783 * 7784 */ 7785 bool 7786 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7787 bool can_fault) 7788 { 7789 vm_paddr_t paddr; 7790 bool needs_mapping; 7791 int error __diagused, i; 7792 7793 /* 7794 * Allocate any KVA space that we need, this is done in a separate 7795 * loop to prevent calling vmem_alloc while pinned. 7796 */ 7797 needs_mapping = false; 7798 for (i = 0; i < count; i++) { 7799 paddr = VM_PAGE_TO_PHYS(page[i]); 7800 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 7801 error = vmem_alloc(kernel_arena, PAGE_SIZE, 7802 M_BESTFIT | M_WAITOK, &vaddr[i]); 7803 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 7804 needs_mapping = true; 7805 } else { 7806 vaddr[i] = PHYS_TO_DMAP(paddr); 7807 } 7808 } 7809 7810 /* Exit early if everything is covered by the DMAP */ 7811 if (!needs_mapping) 7812 return (false); 7813 7814 if (!can_fault) 7815 sched_pin(); 7816 for (i = 0; i < count; i++) { 7817 paddr = VM_PAGE_TO_PHYS(page[i]); 7818 if (!PHYS_IN_DMAP(paddr)) { 7819 panic( 7820 "pmap_map_io_transient: TODO: Map out of DMAP data"); 7821 } 7822 } 7823 7824 return (needs_mapping); 7825 } 7826 7827 void 7828 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7829 bool can_fault) 7830 { 7831 vm_paddr_t paddr; 7832 int i; 7833 7834 if (!can_fault) 7835 sched_unpin(); 7836 for (i = 0; i < count; i++) { 7837 paddr = VM_PAGE_TO_PHYS(page[i]); 7838 if (!PHYS_IN_DMAP(paddr)) { 7839 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 7840 } 7841 } 7842 } 7843 7844 boolean_t 7845 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 7846 { 7847 7848 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 7849 } 7850 7851 static pt_entry_t 7852 pmap_pte_bti(pmap_t pmap, vm_offset_t va __diagused) 7853 { 7854 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7855 MPASS(ADDR_IS_CANONICAL(va)); 7856 7857 if (pmap->pm_stage != PM_STAGE1) 7858 return (0); 7859 if (pmap == kernel_pmap) 7860 return (ATTR_KERN_GP); 7861 return (0); 7862 } 7863 7864 #if defined(KASAN) 7865 static vm_paddr_t pmap_san_early_kernstart; 7866 static pd_entry_t *pmap_san_early_l2; 7867 7868 void __nosanitizeaddress 7869 pmap_san_bootstrap(struct arm64_bootparams *abp) 7870 { 7871 7872 pmap_san_early_kernstart = pmap_early_vtophys(KERNBASE); 7873 kasan_init_early(abp->kern_stack, KSTACK_PAGES * PAGE_SIZE); 7874 } 7875 7876 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE) 7877 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE) 7878 static vm_offset_t __nosanitizeaddress 7879 pmap_san_enter_bootstrap_alloc_l2(void) 7880 { 7881 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE); 7882 static size_t offset = 0; 7883 vm_offset_t addr; 7884 7885 if (offset + L2_SIZE > sizeof(bootstrap_data)) { 7886 panic("%s: out of memory for the bootstrap shadow map L2 entries", 7887 __func__); 7888 } 7889 7890 addr = (uintptr_t)&bootstrap_data[offset]; 7891 offset += L2_SIZE; 7892 return (addr); 7893 } 7894 7895 /* 7896 * SAN L1 + L2 pages, maybe L3 entries later? 7897 */ 7898 static vm_offset_t __nosanitizeaddress 7899 pmap_san_enter_bootstrap_alloc_pages(int npages) 7900 { 7901 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE); 7902 static size_t offset = 0; 7903 vm_offset_t addr; 7904 7905 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) { 7906 panic("%s: out of memory for the bootstrap shadow map", 7907 __func__); 7908 } 7909 7910 addr = (uintptr_t)&bootstrap_data[offset]; 7911 offset += (npages * PAGE_SIZE); 7912 return (addr); 7913 } 7914 7915 static void __nosanitizeaddress 7916 pmap_san_enter_bootstrap(void) 7917 { 7918 vm_offset_t freemempos; 7919 7920 /* L1, L2 */ 7921 freemempos = pmap_san_enter_bootstrap_alloc_pages(2); 7922 bs_state.freemempos = freemempos; 7923 bs_state.va = KASAN_MIN_ADDRESS; 7924 pmap_bootstrap_l1_table(&bs_state); 7925 pmap_san_early_l2 = bs_state.l2; 7926 } 7927 7928 static vm_page_t 7929 pmap_san_enter_alloc_l3(void) 7930 { 7931 vm_page_t m; 7932 7933 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 7934 VM_ALLOC_ZERO); 7935 if (m == NULL) 7936 panic("%s: no memory to grow shadow map", __func__); 7937 return (m); 7938 } 7939 7940 static vm_page_t 7941 pmap_san_enter_alloc_l2(void) 7942 { 7943 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 7944 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT)); 7945 } 7946 7947 void __nosanitizeaddress 7948 pmap_san_enter(vm_offset_t va) 7949 { 7950 pd_entry_t *l1, *l2; 7951 pt_entry_t *l3; 7952 vm_page_t m; 7953 7954 if (virtual_avail == 0) { 7955 vm_offset_t block; 7956 int slot; 7957 bool first; 7958 7959 /* Temporary shadow map prior to pmap_bootstrap(). */ 7960 first = pmap_san_early_l2 == NULL; 7961 if (first) 7962 pmap_san_enter_bootstrap(); 7963 7964 l2 = pmap_san_early_l2; 7965 slot = pmap_l2_index(va); 7966 7967 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) { 7968 MPASS(first); 7969 block = pmap_san_enter_bootstrap_alloc_l2(); 7970 pmap_store(&l2[slot], 7971 PHYS_TO_PTE(pmap_early_vtophys(block)) | 7972 PMAP_SAN_PTE_BITS | L2_BLOCK); 7973 dmb(ishst); 7974 } 7975 7976 return; 7977 } 7978 7979 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 7980 l1 = pmap_l1(kernel_pmap, va); 7981 MPASS(l1 != NULL); 7982 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) { 7983 m = pmap_san_enter_alloc_l3(); 7984 pmap_store(l1, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L1_TABLE); 7985 } 7986 l2 = pmap_l1_to_l2(l1, va); 7987 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) { 7988 m = pmap_san_enter_alloc_l2(); 7989 if (m != NULL) { 7990 pmap_store(l2, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | 7991 PMAP_SAN_PTE_BITS | L2_BLOCK); 7992 } else { 7993 m = pmap_san_enter_alloc_l3(); 7994 pmap_store(l2, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | 7995 L2_TABLE); 7996 } 7997 dmb(ishst); 7998 } 7999 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) 8000 return; 8001 l3 = pmap_l2_to_l3(l2, va); 8002 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0) 8003 return; 8004 m = pmap_san_enter_alloc_l3(); 8005 pmap_store(l3, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | 8006 PMAP_SAN_PTE_BITS | L3_PAGE); 8007 dmb(ishst); 8008 } 8009 #endif /* KASAN */ 8010 8011 /* 8012 * Track a range of the kernel's virtual address space that is contiguous 8013 * in various mapping attributes. 8014 */ 8015 struct pmap_kernel_map_range { 8016 vm_offset_t sva; 8017 pt_entry_t attrs; 8018 int l3pages; 8019 int l3contig; 8020 int l2blocks; 8021 int l1blocks; 8022 }; 8023 8024 static void 8025 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 8026 vm_offset_t eva) 8027 { 8028 const char *mode; 8029 int index; 8030 8031 if (eva <= range->sva) 8032 return; 8033 8034 index = range->attrs & ATTR_S1_IDX_MASK; 8035 switch (index) { 8036 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP): 8037 mode = "DEV-NP"; 8038 break; 8039 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 8040 mode = "DEV"; 8041 break; 8042 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 8043 mode = "UC"; 8044 break; 8045 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 8046 mode = "WB"; 8047 break; 8048 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 8049 mode = "WT"; 8050 break; 8051 default: 8052 printf( 8053 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 8054 __func__, index, range->sva, eva); 8055 mode = "??"; 8056 break; 8057 } 8058 8059 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d\n", 8060 range->sva, eva, 8061 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 8062 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 8063 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 8064 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 8065 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-', 8066 mode, range->l1blocks, range->l2blocks, range->l3contig, 8067 range->l3pages); 8068 8069 /* Reset to sentinel value. */ 8070 range->sva = 0xfffffffffffffffful; 8071 } 8072 8073 /* 8074 * Determine whether the attributes specified by a page table entry match those 8075 * being tracked by the current range. 8076 */ 8077 static bool 8078 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 8079 { 8080 8081 return (range->attrs == attrs); 8082 } 8083 8084 static void 8085 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 8086 pt_entry_t attrs) 8087 { 8088 8089 memset(range, 0, sizeof(*range)); 8090 range->sva = va; 8091 range->attrs = attrs; 8092 } 8093 8094 /* Get the block/page attributes that correspond to the table attributes */ 8095 static pt_entry_t 8096 sysctl_kmaps_table_attrs(pd_entry_t table) 8097 { 8098 pt_entry_t attrs; 8099 8100 attrs = 0; 8101 if ((table & TATTR_UXN_TABLE) != 0) 8102 attrs |= ATTR_S1_UXN; 8103 if ((table & TATTR_PXN_TABLE) != 0) 8104 attrs |= ATTR_S1_PXN; 8105 if ((table & TATTR_AP_TABLE_RO) != 0) 8106 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 8107 8108 return (attrs); 8109 } 8110 8111 /* Read the block/page attributes we care about */ 8112 static pt_entry_t 8113 sysctl_kmaps_block_attrs(pt_entry_t block) 8114 { 8115 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK | 8116 ATTR_S1_GP)); 8117 } 8118 8119 /* 8120 * Given a leaf PTE, derive the mapping's attributes. If they do not match 8121 * those of the current run, dump the address range and its attributes, and 8122 * begin a new run. 8123 */ 8124 static void 8125 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 8126 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 8127 pt_entry_t l3e) 8128 { 8129 pt_entry_t attrs; 8130 8131 attrs = sysctl_kmaps_table_attrs(l0e); 8132 8133 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 8134 attrs |= sysctl_kmaps_block_attrs(l1e); 8135 goto done; 8136 } 8137 attrs |= sysctl_kmaps_table_attrs(l1e); 8138 8139 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 8140 attrs |= sysctl_kmaps_block_attrs(l2e); 8141 goto done; 8142 } 8143 attrs |= sysctl_kmaps_table_attrs(l2e); 8144 attrs |= sysctl_kmaps_block_attrs(l3e); 8145 8146 done: 8147 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 8148 sysctl_kmaps_dump(sb, range, va); 8149 sysctl_kmaps_reinit(range, va, attrs); 8150 } 8151 } 8152 8153 static int 8154 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 8155 { 8156 struct pmap_kernel_map_range range; 8157 struct sbuf sbuf, *sb; 8158 pd_entry_t l0e, *l1, l1e, *l2, l2e; 8159 pt_entry_t *l3, l3e; 8160 vm_offset_t sva; 8161 vm_paddr_t pa; 8162 int error, i, j, k, l; 8163 8164 error = sysctl_wire_old_buffer(req, 0); 8165 if (error != 0) 8166 return (error); 8167 sb = &sbuf; 8168 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 8169 8170 /* Sentinel value. */ 8171 range.sva = 0xfffffffffffffffful; 8172 8173 /* 8174 * Iterate over the kernel page tables without holding the kernel pmap 8175 * lock. Kernel page table pages are never freed, so at worst we will 8176 * observe inconsistencies in the output. 8177 */ 8178 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 8179 i++) { 8180 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 8181 sbuf_printf(sb, "\nDirect map:\n"); 8182 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 8183 sbuf_printf(sb, "\nKernel map:\n"); 8184 #ifdef KASAN 8185 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS)) 8186 sbuf_printf(sb, "\nKASAN shadow map:\n"); 8187 #endif 8188 8189 l0e = kernel_pmap->pm_l0[i]; 8190 if ((l0e & ATTR_DESCR_VALID) == 0) { 8191 sysctl_kmaps_dump(sb, &range, sva); 8192 sva += L0_SIZE; 8193 continue; 8194 } 8195 pa = PTE_TO_PHYS(l0e); 8196 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 8197 8198 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 8199 l1e = l1[j]; 8200 if ((l1e & ATTR_DESCR_VALID) == 0) { 8201 sysctl_kmaps_dump(sb, &range, sva); 8202 sva += L1_SIZE; 8203 continue; 8204 } 8205 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 8206 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 8207 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 8208 0, 0); 8209 range.l1blocks++; 8210 sva += L1_SIZE; 8211 continue; 8212 } 8213 pa = PTE_TO_PHYS(l1e); 8214 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 8215 8216 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 8217 l2e = l2[k]; 8218 if ((l2e & ATTR_DESCR_VALID) == 0) { 8219 sysctl_kmaps_dump(sb, &range, sva); 8220 sva += L2_SIZE; 8221 continue; 8222 } 8223 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 8224 sysctl_kmaps_check(sb, &range, sva, 8225 l0e, l1e, l2e, 0); 8226 range.l2blocks++; 8227 sva += L2_SIZE; 8228 continue; 8229 } 8230 pa = PTE_TO_PHYS(l2e); 8231 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 8232 8233 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 8234 l++, sva += L3_SIZE) { 8235 l3e = l3[l]; 8236 if ((l3e & ATTR_DESCR_VALID) == 0) { 8237 sysctl_kmaps_dump(sb, &range, 8238 sva); 8239 continue; 8240 } 8241 sysctl_kmaps_check(sb, &range, sva, 8242 l0e, l1e, l2e, l3e); 8243 if ((l3e & ATTR_CONTIGUOUS) != 0) 8244 range.l3contig += l % 16 == 0 ? 8245 1 : 0; 8246 else 8247 range.l3pages++; 8248 } 8249 } 8250 } 8251 } 8252 8253 error = sbuf_finish(sb); 8254 sbuf_delete(sb); 8255 return (error); 8256 } 8257 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 8258 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 8259 NULL, 0, sysctl_kmaps, "A", 8260 "Dump kernel address layout"); 8261