/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014-2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * This software was developed by Andrew Turner under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>

#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#define	__pvused
#else
#define	PV_STAT(x)	do { } while (0)
#define	__pvused	__unused
#endif

#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

struct pmap_large_md_page {
	struct rwlock	pv_lock;
	struct md_page	pv_page;
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int		pv_pad[2];
};
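
/*
 * Note: each pmap_large_md_page covers one L2 block-sized (2MB with 4KB
 * pages) region of physical memory; _pa_to_pmdp() below indexes the
 * per-segment array by pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start),
 * and keeping the structure a power of two in size keeps that indexing
 * cheap.
 */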

static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return ((struct pmap_large_md_page *)seg->md_first +
			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
	}
	return (NULL);
}

static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)
{
	struct pmap_large_md_page *pvd;

	pvd = _pa_to_pmdp(pa);
	if (pvd == NULL)
		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
	return (pvd);
}

static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)
{
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return ((struct pmap_large_md_page *)seg->md_first +
	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
}

#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))

#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct pmap_large_md_page *_pvd;			\
	struct rwlock *_lock;					\
	_pvd = _pa_to_pmdp(pa);					\
	if (__predict_false(_pvd == NULL))			\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(_pvd->pv_lock);			\
	_lock;							\
})

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;	/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Data for the pv entry allocation mechanism.
 */
#ifdef NUMA
static __inline int
pc_to_domain(struct pv_chunk *pc)
{
	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
}
#else
static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
{
	return (0);
}
#endif

struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;
__read_mostly vm_paddr_t pmap_last_pa;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

extern pt_entry_t pagetable_l0_ttbr1[];

#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

#if PAGE_SIZE == PAGE_SIZE_4K
#define	L1_BLOCKS_SUPPORTED	1
#else
/* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
#define	L1_BLOCKS_SUPPORTED	0
#endif

#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
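
/*
 * Example: COOKIE_FROM(42, 7) packs ASID 42 into the low 32 bits and epoch 7
 * into the high 32 bits, so COOKIE_TO_ASID() returns 42 and COOKIE_TO_EPOCH()
 * returns 7.  A reserved cookie such as COOKIE_FROM(-1, INT_MIN) decodes to
 * ASID -1 and a negative epoch.
 */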

#define	TLBI_VA_SHIFT		12
#define	TLBI_VA_MASK		((1ul << 44) - 1)
#define	TLBI_VA(addr)		(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
#define	TLBI_VA_L3_INCR		(L3_SIZE >> TLBI_VA_SHIFT)

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

TAILQ_HEAD(pv_chunklist, pv_chunk);

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
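
/*
 * For example, an update typically reads the old entry and installs the new
 * one in a single atomic operation, e.g. old = pmap_load_clear(pte), followed
 * by a TLB invalidation, so that a concurrent hardware update of the entry is
 * not lost between a separate read and write.
 */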

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
		return (NULL);

	return (pmap_l0_to_l1(l0, va));
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
	return (&l2p[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
	return (&l3p[pmap_l3_index(va)]);
}

/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
 */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l0, *l1, *l2, desc;

	l0 = pmap_l0(pmap, va);
	desc = pmap_load(l0) & ATTR_DESCR_MASK;
	if (desc != L0_TABLE) {
		*level = -1;
		return (NULL);
	}

	l1 = pmap_l0_to_l1(l0, va);
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc != L1_TABLE) {
		*level = 0;
		return (l0);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc != L2_TABLE) {
		*level = 1;
		return (l1);
	}

	*level = 2;
	return (l2);
}

/*
 * Returns the lowest valid pte block or table entry for a given virtual
 * address. If there are no valid entries return NULL and set the level to
 * the first invalid level.
 */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l1, *l2, desc;
	pt_entry_t *l3;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL) {
		*level = 0;
		return (NULL);
	}
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc == L1_BLOCK) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*level = 1;
		return (l1);
	}

	if (desc != L1_TABLE) {
		*level = 1;
		return (NULL);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc == L2_BLOCK) {
		*level = 2;
		return (l2);
	}

	if (desc != L2_TABLE) {
		*level = 2;
		return (NULL);
	}

	*level = 3;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
		return (NULL);

	return (l3);
}

/*
 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
 * level that maps the specified virtual address, then a pointer to that entry
 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
 * and a diagnostic message is provided, in which case this function panics.
 */
static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
{
	pd_entry_t *l0p, *l1p, *l2p;
	pt_entry_t desc, *l3p;
	int walk_level __diagused;

	KASSERT(level >= 0 && level < 4,
	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
	    level));
	l0p = pmap_l0(pmap, va);
	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
	if (desc == L0_TABLE && level > 0) {
		l1p = pmap_l0_to_l1(l0p, va);
		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
		if (desc == L1_BLOCK && level == 1) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			return (l1p);
		}
		if (desc == L1_TABLE && level > 1) {
			l2p = pmap_l1_to_l2(l1p, va);
			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
			if (desc == L2_BLOCK && level == 2)
				return (l2p);
			else if (desc == L2_TABLE && level > 2) {
				l3p = pmap_l2_to_l3(l2p, va);
				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
				if (desc == L3_PAGE && level == 3)
					return (l3p);
				else
					walk_level = 3;
			} else
				walk_level = 2;
		} else
			walk_level = 1;
	} else
		walk_level = 0;
	KASSERT(diag == NULL,
	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
	    diag, va, level, desc, walk_level));
	return (NULL);
}

bool
pmap_ps_enabled(pmap_t pmap)
{
	/*
	 * Promotion requires a hypervisor call when the kernel is running
	 * in EL1. To stop this disable superpage support on non-stage 1
	 * pmaps for now.
	 */
	if (pmap->pm_stage != PM_STAGE1)
		return (false);

	return (superpages_enabled != 0);
}

bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
    pd_entry_t **l2, pt_entry_t **l3)
{
	pd_entry_t *l0p, *l1p, *l2p;

	if (pmap->pm_l0 == NULL)
		return (false);

	l0p = pmap_l0(pmap, va);
	*l0 = l0p;

	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
		return (false);

	l1p = pmap_l0_to_l1(l0p, va);
	*l1 = l1p;

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
		return (false);

	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
		return (false);

	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}

static __inline int
pmap_l3_valid(pt_entry_t l3)
{

	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}

CTASSERT(L1_BLOCK == L2_BLOCK);

static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
{
	pt_entry_t val;

	if (pmap->pm_stage == PM_STAGE1) {
		val = ATTR_S1_IDX(memattr);
		if (memattr == VM_MEMATTR_DEVICE)
			val |= ATTR_S1_XN;
		return (val);
	}

	val = 0;

	switch (memattr) {
	case VM_MEMATTR_DEVICE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
		    ATTR_S2_XN(ATTR_S2_XN_ALL));
	case VM_MEMATTR_UNCACHEABLE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
	case VM_MEMATTR_WRITE_BACK:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
	case VM_MEMATTR_WRITE_THROUGH:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
	default:
		panic("%s: invalid memory attribute %x", __func__, memattr);
	}
}

static pt_entry_t
pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
{
	pt_entry_t val;

	val = 0;
	if (pmap->pm_stage == PM_STAGE1) {
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S1_XN;
		if ((prot & VM_PROT_WRITE) == 0)
			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
	} else {
		if ((prot & VM_PROT_WRITE) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
		if ((prot & VM_PROT_READ) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
	}

	return (val);
}
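
/*
 * In short: for stage 1 entries the absence of VM_PROT_EXECUTE or
 * VM_PROT_WRITE is encoded by setting ATTR_S1_XN or ATTR_S1_AP(ATTR_S1_AP_RO)
 * respectively, whereas stage 2 entries grant read/write through the
 * ATTR_S2_S2AP() bits and deny execute with ATTR_S2_XN(ATTR_S2_XN_ALL).
 */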

/*
 * Checks if the PTE is dirty.
 */
static inline int
pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
{

	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));

	if (pmap->pm_stage == PM_STAGE1) {
		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));

		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
	}

	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)
{
	vm_paddr_t pa_page;

	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
	return (pa_page | (va & PAR_LOW_MASK));
}

/* State of the bootstrapped DMAP page tables */
struct pmap_bootstrap_state {
	pt_entry_t	*l1;
	pt_entry_t	*l2;
	pt_entry_t	*l3;
	vm_offset_t	freemempos;
	vm_offset_t	va;
	vm_paddr_t	pa;
	pt_entry_t	table_attrs;
	u_int		l0_slot;
	u_int		l1_slot;
	u_int		l2_slot;
	bool		dmap_valid;
};

/* The bootstrap state */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};

static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready. This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = l0e & ~ATTR_MASK;
				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], l1_pa |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}

static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
				l2_pa = l1e & ~ATTR_MASK;
				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], l2_pa | state->table_attrs |
		    L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}

static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = l2e & ~ATTR_MASK;
				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], l3_pa | state->table_attrs |
		    L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}

static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
{
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], state->pa | ATTR_DEFAULT |
		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
		    L2_BLOCK);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	u_int l3_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], state->pa | ATTR_DEFAULT |
		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
		    L3_PAGE);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_dmap(vm_paddr_t min_pa)
{
	int i;

	dmap_phys_base = min_pa & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	for (i = 0; i < (physmap_idx * 2); i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    bs_state.pa | ATTR_DEFAULT | ATTR_S1_XN |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	cpu_tlb_flushID();
}

static void
pmap_bootstrap_l2(vm_offset_t va)
{
	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
		pmap_bootstrap_l1_table(&bs_state);
}

static void
pmap_bootstrap_l3(vm_offset_t va)
{
	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
		pmap_bootstrap_l2_table(&bs_state);
}
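
/*
 * Note: pmap_bootstrap_l2() and pmap_bootstrap_l3() above walk the given
 * range up to VM_MAX_KERNEL_ADDRESS and ensure that every L1 (respectively
 * L2) slot covering it points at an allocated table page, so later kernel
 * mappings in that range need no intermediate table allocations.
 */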

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
{
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa, min_pa;
	uint64_t kern_delta;
	int i;

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	kern_delta = KERNBASE - kernstart;

	printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_l0_paddr =
	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	/* Assume the address we were loaded to is a valid physical address */
	min_pa = KERNBASE - kern_delta;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
	}

	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(min_pa);
	bs_state.dmap_valid = true;
	/*
	 * We only use PXN when we know nothing will be executed from it, e.g.
	 * the DMAP region.
	 */
	bs_state.table_attrs &= ~TATTR_PXN_TABLE;

	start_pa = pa = KERNBASE - kern_delta;

	/*
	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
	 * loader allocated the first and only l2 page table page used to map
	 * the kernel, preloaded files and module metadata.
	 */
	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
	/* And the l3 tables for the early devmap */
	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));

	cpu_tlb_flushID();

#define alloc_pages(var, np)						\
	(var) = bs_state.freemempos;					\
	bs_state.freemempos += (np * PAGE_SIZE);			\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(bs_state.freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);

	cpu_tlb_flushID();
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

static void
pmap_init_asids(struct asid_set *set, int bits)
{
	int i;

	set->asid_bits = bits;

	/*
	 * We may be too early in the overall initialization process to use
	 * bit_alloc().
	 */
	set->asid_set_size = 1 << set->asid_bits;
	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
	    M_WAITOK | M_ZERO);
	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
		bit_set(set->asid_set, i);
	set->asid_next = ASID_FIRST_AVAILABLE;
	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
}

static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	long start, end, highest, pv_npg;
	int domain, i, j, pages;

	/*
	 * We strongly depend on the size being a power of two, so the assert
	 * is overzealous. However, should the struct be resized to a
	 * different power of two, the code below needs to be revisited.
	 */
	CTASSERT((sizeof(*pvd) == 64));

	/*
	 * Calculate the size of the array.
	 */
	pv_npg = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
	}
	s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
	s = round_page(s);
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
	 */
	highest = -1;
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		start = highest + 1;
		end = start + pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		if (highest >= end)
			continue;

		pvd = &pv_table[start];

		pages = end - start + 1;
		s = round_page(pages * sizeof(*pvd));
		highest = start + (s / sizeof(*pvd)) - 1;

		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}

		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	uint64_t mmfr1;
	int i, vmid_bits;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
	if (superpages_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = L2_SIZE;
		if (L1_BLOCKS_SUPPORTED) {
			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
			    ("pmap_init: can't assign to pagesizes[2]"));
			pagesizes[2] = L1_SIZE;
		}
	}

	/*
	 * Initialize the ASID allocator.
	 */
	pmap_init_asids(&asids,
	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);

	if (has_hyp()) {
		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
		vmid_bits = 8;

		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
		    ID_AA64MMFR1_VMIDBits_16)
			vmid_bits = 16;
		pmap_init_asids(&vmids, vmid_bits);
	}

	/*
	 * Initialize pv chunk lists.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++) {
		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
		    MTX_DEF);
		TAILQ_INIT(&pv_chunks[i].pvc_list);
	}
	pmap_init_pv_table();

	vm_initialized = 1;
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0, "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0, "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0, "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0, "2MB page promotions");

/*
 * If the given value for "final_only" is false, then any cached intermediate-
 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
 * Otherwise, just the cached final-level entry is invalidated.
 */
static __inline void
pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
{
	if (final_only)
		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
	else
		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
}

static __inline void
pmap_s1_invalidate_user(uint64_t r, bool final_only)
{
	if (final_only)
		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
	else
		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
}
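
/*
 * Note: the "last level" forms (vaale1is/vale1is) drop only the cached
 * block/page entry for the address, while vaae1is/vae1is also drop cached
 * table entries; the user variants additionally match the ASID encoded in
 * the operand, whereas the kernel variants match entries for any ASID.
 */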

/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address in the given virtual address space.
 */
static __inline void
pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	r = TLBI_VA(va);
	if (pmap == kernel_pmap) {
		pmap_s1_invalidate_kernel(r, final_only);
	} else {
		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		pmap_s1_invalidate_user(r, final_only);
	}
	dsb(ish);
	isb();
}

/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address range in the given virtual address space.
 */
static __inline void
pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    bool final_only)
{
	uint64_t end, r, start;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		start = TLBI_VA(sva);
		end = TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA_L3_INCR)
			pmap_s1_invalidate_kernel(r, final_only);
	} else {
		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		start |= TLBI_VA(sva);
		end |= TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA_L3_INCR)
			pmap_s1_invalidate_user(r, final_only);
	}
	dsb(ish);
	isb();
}

/*
 * Invalidates all cached intermediate- and final-level TLB entries for the
 * given virtual address space.
 */
static __inline void
pmap_s1_invalidate_all(pmap_t pmap)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		__asm __volatile("tlbi vmalle1is");
	} else {
		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
	}
	dsb(ish);
	isb();
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte, tpte;
	vm_paddr_t pa;
	int lvl;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Find the block or page map for this virtual address. pmap_pte
	 * will return either a valid block/page entry, or NULL.
	 */
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);
		pa = tpte & ~ATTR_MASK;
		switch (lvl) {
		case 1:
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
			    ("pmap_extract: Invalid L1 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L1_OFFSET);
			break;
		case 2:
			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
			    ("pmap_extract: Invalid L2 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L2_OFFSET);
			break;
		case 3:
			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
			    ("pmap_extract: Invalid L3 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L3_OFFSET);
			break;
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *pte, tpte;
	vm_offset_t off;
	vm_page_t m;
	int lvl;
	bool use;

	m = NULL;
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);

		KASSERT(lvl > 0 && lvl <= 3,
		    ("pmap_extract_and_hold: Invalid level %d", lvl));
		/*
		 * Check that the pte is either a L3 page, or a L1 or L2 block
		 * entry. We can assume L1_BLOCK == L2_BLOCK.
		 */
		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
		    tpte & ATTR_DESCR_MASK));

		use = false;
		if ((prot & VM_PROT_WRITE) == 0)
			use = true;
		else if (pmap->pm_stage == PM_STAGE1 &&
		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
			use = true;
		else if (pmap->pm_stage == PM_STAGE2 &&
		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
		    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
			use = true;

		if (use) {
			switch (lvl) {
			case 1:
				off = va & L1_OFFSET;
				break;
			case 2:
				off = va & L2_OFFSET;
				break;
			case 3:
			default:
				off = 0;
			}
			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
			if (m != NULL && !vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

/*
 * Walks the page tables to translate a kernel virtual address to a
 * physical address. Returns true if the kva is valid and stores the
 * physical address in pa if it is not NULL.
 *
 * See the comment above data_abort() for the rationale for specifying
 * NO_PERTHREAD_SSP here.
 */
bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
{
	pt_entry_t *pte, tpte;
	register_t intr;
	uint64_t par;

	/*
	 * Disable interrupts so we don't get interrupted between asking
	 * for address translation, and getting the result back.
	 */
	intr = intr_disable();
	par = arm64_address_translate_s1e1r(va);
	intr_restore(intr);

	if (PAR_SUCCESS(par)) {
		if (pa != NULL)
			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
		return (true);
	}

	/*
	 * Fall back to walking the page table. The address translation
	 * instruction may fail when the page is in a break-before-make
	 * sequence. As we only clear the valid bit in said sequence we
	 * can walk the page table to find the physical address.
	 */

	pte = pmap_l1(kernel_pmap, va);
	if (pte == NULL)
		return (false);

	/*
	 * A concurrent pmap_update_entry() will clear the entry's valid bit
	 * but leave the rest of the entry unchanged.  Therefore, we treat a
	 * non-zero entry as being valid, and we ignore the valid bit when
	 * determining whether the entry maps a block, page, or table.
	 */
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
		return (true);
	}
	pte = pmap_l1_to_l2(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
		return (true);
	}
	pte = pmap_l2_to_l3(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if (pa != NULL)
		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
	return (true);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return (DMAP_TO_PHYS(va));

	if (pmap_klookup(va, &pa) == false)
		return (0);
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
	pd_entry_t *pde;
	pt_entry_t *pte, attr;
	vm_offset_t va;
	int lvl;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter: Mapping is not page-sized"));

	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
	    ATTR_S1_IDX(mode) | L3_PAGE;
	va = sva;
	while (size != 0) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));

		pte = pmap_l2_to_l3(pde, va);
		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{

	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}

/*
 * Remove a page from the kernel pagetables.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
	pmap_clear(pte);
	pmap_s1_invalidate_page(kernel_pmap, va, true);
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *pte;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
		pmap_clear(pte);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.
Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged TLB invalidation.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pd_entry_t *pde;
	pt_entry_t *pte, pa;
	vm_offset_t va;
	vm_page_t m;
	int i, lvl;

	va = sva;
	for (i = 0; i < count; i++) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2,
		    ("pmap_qenter: Invalid level %d", lvl));

		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
		pte = pmap_l2_to_l3(pde, va);
		pmap_load_store(pte, pa);

		va += L3_SIZE;
	}
	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *pte;
	vm_offset_t va;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));

	va = sva;
	while (count-- > 0) {
		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
		if (pte != NULL) {
			pmap_clear(pte);
		}

		va += PAGE_SIZE;
	}
	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
2000 */ 2001 static inline boolean_t 2002 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2003 { 2004 2005 --m->ref_count; 2006 if (m->ref_count == 0) { 2007 _pmap_unwire_l3(pmap, va, m, free); 2008 return (TRUE); 2009 } else 2010 return (FALSE); 2011 } 2012 2013 static void 2014 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2015 { 2016 2017 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2018 /* 2019 * unmap the page table page 2020 */ 2021 if (m->pindex >= (NUL2E + NUL1E)) { 2022 /* l1 page */ 2023 pd_entry_t *l0; 2024 2025 l0 = pmap_l0(pmap, va); 2026 pmap_clear(l0); 2027 } else if (m->pindex >= NUL2E) { 2028 /* l2 page */ 2029 pd_entry_t *l1; 2030 2031 l1 = pmap_l1(pmap, va); 2032 pmap_clear(l1); 2033 } else { 2034 /* l3 page */ 2035 pd_entry_t *l2; 2036 2037 l2 = pmap_l2(pmap, va); 2038 pmap_clear(l2); 2039 } 2040 pmap_resident_count_dec(pmap, 1); 2041 if (m->pindex < NUL2E) { 2042 /* We just released an l3, unhold the matching l2 */ 2043 pd_entry_t *l1, tl1; 2044 vm_page_t l2pg; 2045 2046 l1 = pmap_l1(pmap, va); 2047 tl1 = pmap_load(l1); 2048 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 2049 pmap_unwire_l3(pmap, va, l2pg, free); 2050 } else if (m->pindex < (NUL2E + NUL1E)) { 2051 /* We just released an l2, unhold the matching l1 */ 2052 pd_entry_t *l0, tl0; 2053 vm_page_t l1pg; 2054 2055 l0 = pmap_l0(pmap, va); 2056 tl0 = pmap_load(l0); 2057 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 2058 pmap_unwire_l3(pmap, va, l1pg, free); 2059 } 2060 pmap_s1_invalidate_page(pmap, va, false); 2061 2062 /* 2063 * Put page on a list so that it is released after 2064 * *ALL* TLB shootdown is done 2065 */ 2066 pmap_add_delayed_free_list(m, free, TRUE); 2067 } 2068 2069 /* 2070 * After removing a page table entry, this routine is used to 2071 * conditionally free the page, and manage the reference count. 2072 */ 2073 static int 2074 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2075 struct spglist *free) 2076 { 2077 vm_page_t mpte; 2078 2079 KASSERT(ADDR_IS_CANONICAL(va), 2080 ("%s: Address not in canonical form: %lx", __func__, va)); 2081 if (ADDR_IS_KERNEL(va)) 2082 return (0); 2083 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2084 mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); 2085 return (pmap_unwire_l3(pmap, va, mpte, free)); 2086 } 2087 2088 /* 2089 * Release a page table page reference after a failed attempt to create a 2090 * mapping. 
2091 */ 2092 static void 2093 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 2094 { 2095 struct spglist free; 2096 2097 SLIST_INIT(&free); 2098 if (pmap_unwire_l3(pmap, va, mpte, &free)) 2099 vm_page_free_pages_toq(&free, true); 2100 } 2101 2102 void 2103 pmap_pinit0(pmap_t pmap) 2104 { 2105 2106 PMAP_LOCK_INIT(pmap); 2107 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2108 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 2109 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2110 vm_radix_init(&pmap->pm_root); 2111 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 2112 pmap->pm_stage = PM_STAGE1; 2113 pmap->pm_levels = 4; 2114 pmap->pm_ttbr = pmap->pm_l0_paddr; 2115 pmap->pm_asid_set = &asids; 2116 2117 PCPU_SET(curpmap, pmap); 2118 } 2119 2120 int 2121 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 2122 { 2123 vm_page_t m; 2124 2125 /* 2126 * allocate the l0 page 2127 */ 2128 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | 2129 VM_ALLOC_ZERO); 2130 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 2131 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2132 2133 vm_radix_init(&pmap->pm_root); 2134 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2135 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 2136 2137 MPASS(levels == 3 || levels == 4); 2138 pmap->pm_levels = levels; 2139 pmap->pm_stage = stage; 2140 switch (stage) { 2141 case PM_STAGE1: 2142 pmap->pm_asid_set = &asids; 2143 break; 2144 case PM_STAGE2: 2145 pmap->pm_asid_set = &vmids; 2146 break; 2147 default: 2148 panic("%s: Invalid pmap type %d", __func__, stage); 2149 break; 2150 } 2151 2152 /* XXX Temporarily disable deferred ASID allocation. */ 2153 pmap_alloc_asid(pmap); 2154 2155 /* 2156 * Allocate the level 1 entry to use as the root. This will increase 2157 * the refcount on the level 1 page so it won't be removed until 2158 * pmap_release() is called. 2159 */ 2160 if (pmap->pm_levels == 3) { 2161 PMAP_LOCK(pmap); 2162 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 2163 PMAP_UNLOCK(pmap); 2164 } 2165 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 2166 2167 return (1); 2168 } 2169 2170 int 2171 pmap_pinit(pmap_t pmap) 2172 { 2173 2174 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 2175 } 2176 2177 /* 2178 * This routine is called if the desired page table page does not exist. 2179 * 2180 * If page table page allocation fails, this routine may sleep before 2181 * returning NULL. It sleeps only if a lock pointer was given. 2182 * 2183 * Note: If a page allocation fails at page table level two or three, 2184 * one or two pages may be held during the wait, only to be released 2185 * afterwards. This conservative approach is easily argued to avoid 2186 * race conditions. 2187 */ 2188 static vm_page_t 2189 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2190 { 2191 vm_page_t m, l1pg, l2pg; 2192 2193 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2194 2195 /* 2196 * Allocate a page table page. 2197 */ 2198 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2199 if (lockp != NULL) { 2200 RELEASE_PV_LIST_LOCK(lockp); 2201 PMAP_UNLOCK(pmap); 2202 vm_wait(NULL); 2203 PMAP_LOCK(pmap); 2204 } 2205 2206 /* 2207 * Indicate the need to retry. While waiting, the page table 2208 * page may have been allocated. 
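	 * The caller is expected to look the page table page up again before
	 * retrying the allocation.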
2209 */ 2210 return (NULL); 2211 } 2212 m->pindex = ptepindex; 2213 2214 /* 2215 * Because of AArch64's weak memory consistency model, we must have a 2216 * barrier here to ensure that the stores for zeroing "m", whether by 2217 * pmap_zero_page() or an earlier function, are visible before adding 2218 * "m" to the page table. Otherwise, a page table walk by another 2219 * processor's MMU could see the mapping to "m" and a stale, non-zero 2220 * PTE within "m". 2221 */ 2222 dmb(ishst); 2223 2224 /* 2225 * Map the pagetable page into the process address space, if 2226 * it isn't already there. 2227 */ 2228 2229 if (ptepindex >= (NUL2E + NUL1E)) { 2230 pd_entry_t *l0p, l0e; 2231 vm_pindex_t l0index; 2232 2233 l0index = ptepindex - (NUL2E + NUL1E); 2234 l0p = &pmap->pm_l0[l0index]; 2235 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 2236 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 2237 l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; 2238 2239 /* 2240 * Mark all kernel memory as not accessible from userspace 2241 * and userspace memory as not executable from the kernel. 2242 * This has been done for the bootstrap L0 entries in 2243 * locore.S. 2244 */ 2245 if (pmap == kernel_pmap) 2246 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 2247 else 2248 l0e |= TATTR_PXN_TABLE; 2249 pmap_store(l0p, l0e); 2250 } else if (ptepindex >= NUL2E) { 2251 vm_pindex_t l0index, l1index; 2252 pd_entry_t *l0, *l1; 2253 pd_entry_t tl0; 2254 2255 l1index = ptepindex - NUL2E; 2256 l0index = l1index >> Ln_ENTRIES_SHIFT; 2257 2258 l0 = &pmap->pm_l0[l0index]; 2259 tl0 = pmap_load(l0); 2260 if (tl0 == 0) { 2261 /* recurse for allocating page dir */ 2262 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 2263 lockp) == NULL) { 2264 vm_page_unwire_noq(m); 2265 vm_page_free_zero(m); 2266 return (NULL); 2267 } 2268 } else { 2269 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 2270 l1pg->ref_count++; 2271 } 2272 2273 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); 2274 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 2275 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 2276 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 2277 pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); 2278 } else { 2279 vm_pindex_t l0index, l1index; 2280 pd_entry_t *l0, *l1, *l2; 2281 pd_entry_t tl0, tl1; 2282 2283 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 2284 l0index = l1index >> Ln_ENTRIES_SHIFT; 2285 2286 l0 = &pmap->pm_l0[l0index]; 2287 tl0 = pmap_load(l0); 2288 if (tl0 == 0) { 2289 /* recurse for allocating page dir */ 2290 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2291 lockp) == NULL) { 2292 vm_page_unwire_noq(m); 2293 vm_page_free_zero(m); 2294 return (NULL); 2295 } 2296 tl0 = pmap_load(l0); 2297 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 2298 l1 = &l1[l1index & Ln_ADDR_MASK]; 2299 } else { 2300 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 2301 l1 = &l1[l1index & Ln_ADDR_MASK]; 2302 tl1 = pmap_load(l1); 2303 if (tl1 == 0) { 2304 /* recurse for allocating page dir */ 2305 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2306 lockp) == NULL) { 2307 vm_page_unwire_noq(m); 2308 vm_page_free_zero(m); 2309 return (NULL); 2310 } 2311 } else { 2312 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 2313 l2pg->ref_count++; 2314 } 2315 } 2316 2317 l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); 2318 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 2319 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 2320 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 2321 pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); 2322 } 2323 2324 
pmap_resident_count_inc(pmap, 1); 2325 2326 return (m); 2327 } 2328 2329 static pd_entry_t * 2330 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 2331 struct rwlock **lockp) 2332 { 2333 pd_entry_t *l1, *l2; 2334 vm_page_t l2pg; 2335 vm_pindex_t l2pindex; 2336 2337 KASSERT(ADDR_IS_CANONICAL(va), 2338 ("%s: Address not in canonical form: %lx", __func__, va)); 2339 2340 retry: 2341 l1 = pmap_l1(pmap, va); 2342 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 2343 l2 = pmap_l1_to_l2(l1, va); 2344 if (!ADDR_IS_KERNEL(va)) { 2345 /* Add a reference to the L2 page. */ 2346 l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); 2347 l2pg->ref_count++; 2348 } else 2349 l2pg = NULL; 2350 } else if (!ADDR_IS_KERNEL(va)) { 2351 /* Allocate a L2 page. */ 2352 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 2353 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 2354 if (l2pg == NULL) { 2355 if (lockp != NULL) 2356 goto retry; 2357 else 2358 return (NULL); 2359 } 2360 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2361 l2 = &l2[pmap_l2_index(va)]; 2362 } else 2363 panic("pmap_alloc_l2: missing page table page for va %#lx", 2364 va); 2365 *l2pgp = l2pg; 2366 return (l2); 2367 } 2368 2369 static vm_page_t 2370 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2371 { 2372 vm_pindex_t ptepindex; 2373 pd_entry_t *pde, tpde; 2374 #ifdef INVARIANTS 2375 pt_entry_t *pte; 2376 #endif 2377 vm_page_t m; 2378 int lvl; 2379 2380 /* 2381 * Calculate pagetable page index 2382 */ 2383 ptepindex = pmap_l2_pindex(va); 2384 retry: 2385 /* 2386 * Get the page directory entry 2387 */ 2388 pde = pmap_pde(pmap, va, &lvl); 2389 2390 /* 2391 * If the page table page is mapped, we just increment the hold count, 2392 * and activate it. If we get a level 2 pde it will point to a level 3 2393 * table. 2394 */ 2395 switch (lvl) { 2396 case -1: 2397 break; 2398 case 0: 2399 #ifdef INVARIANTS 2400 pte = pmap_l0_to_l1(pde, va); 2401 KASSERT(pmap_load(pte) == 0, 2402 ("pmap_alloc_l3: TODO: l0 superpages")); 2403 #endif 2404 break; 2405 case 1: 2406 #ifdef INVARIANTS 2407 pte = pmap_l1_to_l2(pde, va); 2408 KASSERT(pmap_load(pte) == 0, 2409 ("pmap_alloc_l3: TODO: l1 superpages")); 2410 #endif 2411 break; 2412 case 2: 2413 tpde = pmap_load(pde); 2414 if (tpde != 0) { 2415 m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); 2416 m->ref_count++; 2417 return (m); 2418 } 2419 break; 2420 default: 2421 panic("pmap_alloc_l3: Invalid level %d", lvl); 2422 } 2423 2424 /* 2425 * Here if the pte page isn't mapped, or if it has been deallocated. 2426 */ 2427 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2428 if (m == NULL && lockp != NULL) 2429 goto retry; 2430 2431 return (m); 2432 } 2433 2434 /*************************************************** 2435 * Pmap allocation/deallocation routines. 2436 ***************************************************/ 2437 2438 /* 2439 * Release any resources held by the given physical map. 2440 * Called when a pmap initialized by pmap_pinit is being released. 2441 * Should only be called if the map contains no valid mappings. 
 */
void
pmap_release(pmap_t pmap)
{
	boolean_t rv __diagused;
	struct spglist free;
	struct asid_set *set;
	vm_page_t m;
	int asid;

	if (pmap->pm_levels != 4) {
		PMAP_ASSERT_STAGE2(pmap);
		KASSERT(pmap->pm_stats.resident_count == 1,
		    ("pmap_release: pmap resident count %ld != 1",
		    pmap->pm_stats.resident_count));
		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));

		SLIST_INIT(&free);
		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
		PMAP_LOCK(pmap);
		rv = pmap_unwire_l3(pmap, 0, m, &free);
		PMAP_UNLOCK(pmap);
		MPASS(rv == TRUE);
		vm_page_free_pages_toq(&free, true);
	}

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap has reserved page table page(s)"));

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Allow the ASID to be reused.  For stage 2 pmaps (VMIDs) we do not
	 * invalidate the TLB entries when removing them, and instead rely on
	 * the later TLB invalidation that happens when the VMID generation is
	 * updated.  Because of this we don't reuse VMIDs within a generation.
	 */
	if (pmap->pm_stage == PM_STAGE1) {
		mtx_lock_spin(&set->asid_set_mutex);
		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
			asid = COOKIE_TO_ASID(pmap->pm_cookie);
			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
			    asid < set->asid_set_size,
			    ("pmap_release: pmap cookie has out-of-range asid"));
			bit_clear(set->asid_set, asid);
		}
		mtx_unlock_spin(&set->asid_set_mutex);
	}

	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
	vm_page_unwire_noq(m);
	vm_page_free_zero(m);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

/*
 * Grow the number of kernel page table entries, if needed.
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l0, *l1, *l2;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
		KASSERT(pmap_load(l0) != 0,
		    ("pmap_growkernel: No level 0 kernel entry"));

		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
			/* See the dmb() in _pmap_alloc_l3().
*/ 2552 dmb(ishst); 2553 paddr = VM_PAGE_TO_PHYS(nkpg); 2554 pmap_store(l1, paddr | L1_TABLE); 2555 continue; /* try again */ 2556 } 2557 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 2558 if (pmap_load(l2) != 0) { 2559 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2560 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2561 kernel_vm_end = vm_map_max(kernel_map); 2562 break; 2563 } 2564 continue; 2565 } 2566 2567 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 2568 VM_ALLOC_ZERO); 2569 if (nkpg == NULL) 2570 panic("pmap_growkernel: no memory to grow kernel"); 2571 nkpg->pindex = kernel_vm_end >> L2_SHIFT; 2572 /* See the dmb() in _pmap_alloc_l3(). */ 2573 dmb(ishst); 2574 paddr = VM_PAGE_TO_PHYS(nkpg); 2575 pmap_store(l2, paddr | L2_TABLE); 2576 2577 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2578 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2579 kernel_vm_end = vm_map_max(kernel_map); 2580 break; 2581 } 2582 } 2583 } 2584 2585 /*************************************************** 2586 * page management routines. 2587 ***************************************************/ 2588 2589 static const uint64_t pc_freemask[_NPCM] = { 2590 [0 ... _NPCM - 2] = PC_FREEN, 2591 [_NPCM - 1] = PC_FREEL 2592 }; 2593 2594 #ifdef PV_STATS 2595 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2596 2597 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2598 "Current number of pv entry chunks"); 2599 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2600 "Current number of pv entry chunks allocated"); 2601 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2602 "Current number of pv entry chunks frees"); 2603 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2604 "Number of times tried to get a chunk page but failed."); 2605 2606 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2607 static int pv_entry_spare; 2608 2609 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2610 "Current number of pv entry frees"); 2611 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2612 "Current number of pv entry allocs"); 2613 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2614 "Current number of pv entries"); 2615 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2616 "Current number of spare pv entries"); 2617 #endif 2618 2619 /* 2620 * We are in a serious low memory condition. Resort to 2621 * drastic measures to free some pages so we can allocate 2622 * another pv entry chunk. 2623 * 2624 * Returns NULL if PV entries were reclaimed from the specified pmap. 2625 * 2626 * We do not, however, unmap 2mpages because subsequent accesses will 2627 * allocate per-page pv entries until repromotion occurs, thereby 2628 * exacerbating the shortage of free pv entries. 
2629 */ 2630 static vm_page_t 2631 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 2632 { 2633 struct pv_chunks_list *pvc; 2634 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2635 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2636 struct md_page *pvh; 2637 pd_entry_t *pde; 2638 pmap_t next_pmap, pmap; 2639 pt_entry_t *pte, tpte; 2640 pv_entry_t pv; 2641 vm_offset_t va; 2642 vm_page_t m, m_pc; 2643 struct spglist free; 2644 uint64_t inuse; 2645 int bit, field, freed, lvl; 2646 2647 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2648 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2649 2650 pmap = NULL; 2651 m_pc = NULL; 2652 SLIST_INIT(&free); 2653 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2654 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2655 pc_marker = (struct pv_chunk *)&pc_marker_b; 2656 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2657 2658 pvc = &pv_chunks[domain]; 2659 mtx_lock(&pvc->pvc_lock); 2660 pvc->active_reclaims++; 2661 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 2662 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 2663 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2664 SLIST_EMPTY(&free)) { 2665 next_pmap = pc->pc_pmap; 2666 if (next_pmap == NULL) { 2667 /* 2668 * The next chunk is a marker. However, it is 2669 * not our marker, so active_reclaims must be 2670 * > 1. Consequently, the next_chunk code 2671 * will not rotate the pv_chunks list. 2672 */ 2673 goto next_chunk; 2674 } 2675 mtx_unlock(&pvc->pvc_lock); 2676 2677 /* 2678 * A pv_chunk can only be removed from the pc_lru list 2679 * when both pvc->pvc_lock is owned and the 2680 * corresponding pmap is locked. 2681 */ 2682 if (pmap != next_pmap) { 2683 if (pmap != NULL && pmap != locked_pmap) 2684 PMAP_UNLOCK(pmap); 2685 pmap = next_pmap; 2686 /* Avoid deadlock and lock recursion. */ 2687 if (pmap > locked_pmap) { 2688 RELEASE_PV_LIST_LOCK(lockp); 2689 PMAP_LOCK(pmap); 2690 mtx_lock(&pvc->pvc_lock); 2691 continue; 2692 } else if (pmap != locked_pmap) { 2693 if (PMAP_TRYLOCK(pmap)) { 2694 mtx_lock(&pvc->pvc_lock); 2695 continue; 2696 } else { 2697 pmap = NULL; /* pmap is not locked */ 2698 mtx_lock(&pvc->pvc_lock); 2699 pc = TAILQ_NEXT(pc_marker, pc_lru); 2700 if (pc == NULL || 2701 pc->pc_pmap != next_pmap) 2702 continue; 2703 goto next_chunk; 2704 } 2705 } 2706 } 2707 2708 /* 2709 * Destroy every non-wired, 4 KB page mapping in the chunk. 
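	 * Wired and superpage mappings are skipped, so chunks that describe
	 * only such mappings cannot be freed here.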
2710 */ 2711 freed = 0; 2712 for (field = 0; field < _NPCM; field++) { 2713 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2714 inuse != 0; inuse &= ~(1UL << bit)) { 2715 bit = ffsl(inuse) - 1; 2716 pv = &pc->pc_pventry[field * 64 + bit]; 2717 va = pv->pv_va; 2718 pde = pmap_pde(pmap, va, &lvl); 2719 if (lvl != 2) 2720 continue; 2721 pte = pmap_l2_to_l3(pde, va); 2722 tpte = pmap_load(pte); 2723 if ((tpte & ATTR_SW_WIRED) != 0) 2724 continue; 2725 tpte = pmap_load_clear(pte); 2726 m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); 2727 if (pmap_pte_dirty(pmap, tpte)) 2728 vm_page_dirty(m); 2729 if ((tpte & ATTR_AF) != 0) { 2730 pmap_s1_invalidate_page(pmap, va, true); 2731 vm_page_aflag_set(m, PGA_REFERENCED); 2732 } 2733 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2734 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2735 m->md.pv_gen++; 2736 if (TAILQ_EMPTY(&m->md.pv_list) && 2737 (m->flags & PG_FICTITIOUS) == 0) { 2738 pvh = page_to_pvh(m); 2739 if (TAILQ_EMPTY(&pvh->pv_list)) { 2740 vm_page_aflag_clear(m, 2741 PGA_WRITEABLE); 2742 } 2743 } 2744 pc->pc_map[field] |= 1UL << bit; 2745 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 2746 freed++; 2747 } 2748 } 2749 if (freed == 0) { 2750 mtx_lock(&pvc->pvc_lock); 2751 goto next_chunk; 2752 } 2753 /* Every freed mapping is for a 4 KB page. */ 2754 pmap_resident_count_dec(pmap, freed); 2755 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2756 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2757 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2758 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2759 if (pc_is_free(pc)) { 2760 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2761 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2762 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2763 /* Entire chunk is free; return it. */ 2764 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2765 dump_drop_page(m_pc->phys_addr); 2766 mtx_lock(&pvc->pvc_lock); 2767 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 2768 break; 2769 } 2770 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2771 mtx_lock(&pvc->pvc_lock); 2772 /* One freed pv entry in locked_pmap is sufficient. */ 2773 if (pmap == locked_pmap) 2774 break; 2775 2776 next_chunk: 2777 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 2778 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 2779 if (pvc->active_reclaims == 1 && pmap != NULL) { 2780 /* 2781 * Rotate the pv chunks list so that we do not 2782 * scan the same pv chunks that could not be 2783 * freed (because they contained a wired 2784 * and/or superpage mapping) on every 2785 * invocation of reclaim_pv_chunk(). 2786 */ 2787 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){ 2788 MPASS(pc->pc_pmap != NULL); 2789 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 2790 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 2791 } 2792 } 2793 } 2794 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 2795 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 2796 pvc->active_reclaims--; 2797 mtx_unlock(&pvc->pvc_lock); 2798 if (pmap != NULL && pmap != locked_pmap) 2799 PMAP_UNLOCK(pmap); 2800 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2801 m_pc = SLIST_FIRST(&free); 2802 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2803 /* Recycle a freed page table page. 
*/ 2804 m_pc->ref_count = 1; 2805 } 2806 vm_page_free_pages_toq(&free, true); 2807 return (m_pc); 2808 } 2809 2810 static vm_page_t 2811 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2812 { 2813 vm_page_t m; 2814 int i, domain; 2815 2816 domain = PCPU_GET(domain); 2817 for (i = 0; i < vm_ndomains; i++) { 2818 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 2819 if (m != NULL) 2820 break; 2821 domain = (domain + 1) % vm_ndomains; 2822 } 2823 2824 return (m); 2825 } 2826 2827 /* 2828 * free the pv_entry back to the free list 2829 */ 2830 static void 2831 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2832 { 2833 struct pv_chunk *pc; 2834 int idx, field, bit; 2835 2836 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2837 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2838 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2839 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2840 pc = pv_to_chunk(pv); 2841 idx = pv - &pc->pc_pventry[0]; 2842 field = idx / 64; 2843 bit = idx % 64; 2844 pc->pc_map[field] |= 1ul << bit; 2845 if (!pc_is_free(pc)) { 2846 /* 98% of the time, pc is already at the head of the list. */ 2847 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2848 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2849 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2850 } 2851 return; 2852 } 2853 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2854 free_pv_chunk(pc); 2855 } 2856 2857 static void 2858 free_pv_chunk_dequeued(struct pv_chunk *pc) 2859 { 2860 vm_page_t m; 2861 2862 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2863 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2864 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2865 /* entire chunk is free, return it */ 2866 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2867 dump_drop_page(m->phys_addr); 2868 vm_page_unwire_noq(m); 2869 vm_page_free(m); 2870 } 2871 2872 static void 2873 free_pv_chunk(struct pv_chunk *pc) 2874 { 2875 struct pv_chunks_list *pvc; 2876 2877 pvc = &pv_chunks[pc_to_domain(pc)]; 2878 mtx_lock(&pvc->pvc_lock); 2879 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 2880 mtx_unlock(&pvc->pvc_lock); 2881 free_pv_chunk_dequeued(pc); 2882 } 2883 2884 static void 2885 free_pv_chunk_batch(struct pv_chunklist *batch) 2886 { 2887 struct pv_chunks_list *pvc; 2888 struct pv_chunk *pc, *npc; 2889 int i; 2890 2891 for (i = 0; i < vm_ndomains; i++) { 2892 if (TAILQ_EMPTY(&batch[i])) 2893 continue; 2894 pvc = &pv_chunks[i]; 2895 mtx_lock(&pvc->pvc_lock); 2896 TAILQ_FOREACH(pc, &batch[i], pc_list) { 2897 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 2898 } 2899 mtx_unlock(&pvc->pvc_lock); 2900 } 2901 2902 for (i = 0; i < vm_ndomains; i++) { 2903 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 2904 free_pv_chunk_dequeued(pc); 2905 } 2906 } 2907 } 2908 2909 /* 2910 * Returns a new PV entry, allocating a new PV chunk from the system when 2911 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2912 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2913 * returned. 2914 * 2915 * The given PV list lock may be released. 
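 * In particular, reclaim_pv_chunk() may drop and reacquire it.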
2916 */ 2917 static pv_entry_t 2918 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2919 { 2920 struct pv_chunks_list *pvc; 2921 int bit, field; 2922 pv_entry_t pv; 2923 struct pv_chunk *pc; 2924 vm_page_t m; 2925 2926 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2927 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2928 retry: 2929 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2930 if (pc != NULL) { 2931 for (field = 0; field < _NPCM; field++) { 2932 if (pc->pc_map[field]) { 2933 bit = ffsl(pc->pc_map[field]) - 1; 2934 break; 2935 } 2936 } 2937 if (field < _NPCM) { 2938 pv = &pc->pc_pventry[field * 64 + bit]; 2939 pc->pc_map[field] &= ~(1ul << bit); 2940 /* If this was the last item, move it to tail */ 2941 if (pc_is_full(pc)) { 2942 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2943 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2944 pc_list); 2945 } 2946 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2947 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2948 return (pv); 2949 } 2950 } 2951 /* No free items, allocate another chunk */ 2952 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2953 if (m == NULL) { 2954 if (lockp == NULL) { 2955 PV_STAT(pc_chunk_tryfail++); 2956 return (NULL); 2957 } 2958 m = reclaim_pv_chunk(pmap, lockp); 2959 if (m == NULL) 2960 goto retry; 2961 } 2962 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2963 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2964 dump_add_page(m->phys_addr); 2965 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2966 pc->pc_pmap = pmap; 2967 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 2968 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */ 2969 pvc = &pv_chunks[vm_page_domain(m)]; 2970 mtx_lock(&pvc->pvc_lock); 2971 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 2972 mtx_unlock(&pvc->pvc_lock); 2973 pv = &pc->pc_pventry[0]; 2974 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2975 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2976 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2977 return (pv); 2978 } 2979 2980 /* 2981 * Ensure that the number of spare PV entries in the specified pmap meets or 2982 * exceeds the given count, "needed". 2983 * 2984 * The given PV list lock may be released. 2985 */ 2986 static void 2987 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2988 { 2989 struct pv_chunks_list *pvc; 2990 struct pch new_tail[PMAP_MEMDOM]; 2991 struct pv_chunk *pc; 2992 vm_page_t m; 2993 int avail, free, i; 2994 bool reclaimed; 2995 2996 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2997 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2998 2999 /* 3000 * Newly allocated PV chunks must be stored in a private list until 3001 * the required number of PV chunks have been allocated. Otherwise, 3002 * reclaim_pv_chunk() could recycle one of these chunks. In 3003 * contrast, these chunks must be added to the pmap upon allocation. 
3004 */ 3005 for (i = 0; i < PMAP_MEMDOM; i++) 3006 TAILQ_INIT(&new_tail[i]); 3007 retry: 3008 avail = 0; 3009 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3010 bit_count((bitstr_t *)pc->pc_map, 0, 3011 sizeof(pc->pc_map) * NBBY, &free); 3012 if (free == 0) 3013 break; 3014 avail += free; 3015 if (avail >= needed) 3016 break; 3017 } 3018 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3019 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3020 if (m == NULL) { 3021 m = reclaim_pv_chunk(pmap, lockp); 3022 if (m == NULL) 3023 goto retry; 3024 reclaimed = true; 3025 } 3026 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3027 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3028 dump_add_page(m->phys_addr); 3029 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3030 pc->pc_pmap = pmap; 3031 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3032 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3033 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 3034 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3035 3036 /* 3037 * The reclaim might have freed a chunk from the current pmap. 3038 * If that chunk contained available entries, we need to 3039 * re-count the number of available entries. 3040 */ 3041 if (reclaimed) 3042 goto retry; 3043 } 3044 for (i = 0; i < vm_ndomains; i++) { 3045 if (TAILQ_EMPTY(&new_tail[i])) 3046 continue; 3047 pvc = &pv_chunks[i]; 3048 mtx_lock(&pvc->pvc_lock); 3049 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 3050 mtx_unlock(&pvc->pvc_lock); 3051 } 3052 } 3053 3054 /* 3055 * First find and then remove the pv entry for the specified pmap and virtual 3056 * address from the specified pv list. Returns the pv entry if found and NULL 3057 * otherwise. This operation can be performed on pv lists for either 4KB or 3058 * 2MB page mappings. 3059 */ 3060 static __inline pv_entry_t 3061 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3062 { 3063 pv_entry_t pv; 3064 3065 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3066 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3067 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3068 pvh->pv_gen++; 3069 break; 3070 } 3071 } 3072 return (pv); 3073 } 3074 3075 /* 3076 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3077 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3078 * entries for each of the 4KB page mappings. 3079 */ 3080 static void 3081 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3082 struct rwlock **lockp) 3083 { 3084 struct md_page *pvh; 3085 struct pv_chunk *pc; 3086 pv_entry_t pv; 3087 vm_offset_t va_last; 3088 vm_page_t m; 3089 int bit, field; 3090 3091 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3092 KASSERT((va & L2_OFFSET) == 0, 3093 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 3094 KASSERT((pa & L2_OFFSET) == 0, 3095 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 3096 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3097 3098 /* 3099 * Transfer the 2mpage's pv entry for this mapping to the first 3100 * page's pv list. Once this transfer begins, the pv list lock 3101 * must not be released until the last pv entry is reinstantiated. 3102 */ 3103 pvh = pa_to_pvh(pa); 3104 pv = pmap_pvh_remove(pvh, pmap, va); 3105 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 3106 m = PHYS_TO_VM_PAGE(pa); 3107 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3108 m->md.pv_gen++; 3109 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
*/ 3110 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 3111 va_last = va + L2_SIZE - PAGE_SIZE; 3112 for (;;) { 3113 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3114 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 3115 for (field = 0; field < _NPCM; field++) { 3116 while (pc->pc_map[field]) { 3117 bit = ffsl(pc->pc_map[field]) - 1; 3118 pc->pc_map[field] &= ~(1ul << bit); 3119 pv = &pc->pc_pventry[field * 64 + bit]; 3120 va += PAGE_SIZE; 3121 pv->pv_va = va; 3122 m++; 3123 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3124 ("pmap_pv_demote_l2: page %p is not managed", m)); 3125 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3126 m->md.pv_gen++; 3127 if (va == va_last) 3128 goto out; 3129 } 3130 } 3131 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3132 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3133 } 3134 out: 3135 if (pc_is_full(pc)) { 3136 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3137 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3138 } 3139 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 3140 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 3141 } 3142 3143 /* 3144 * First find and then destroy the pv entry for the specified pmap and virtual 3145 * address. This operation can be performed on pv lists for either 4KB or 2MB 3146 * page mappings. 3147 */ 3148 static void 3149 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3150 { 3151 pv_entry_t pv; 3152 3153 pv = pmap_pvh_remove(pvh, pmap, va); 3154 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3155 free_pv_entry(pmap, pv); 3156 } 3157 3158 /* 3159 * Conditionally create the PV entry for a 4KB page mapping if the required 3160 * memory can be allocated without resorting to reclamation. 3161 */ 3162 static boolean_t 3163 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3164 struct rwlock **lockp) 3165 { 3166 pv_entry_t pv; 3167 3168 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3169 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3170 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3171 pv->pv_va = va; 3172 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3173 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3174 m->md.pv_gen++; 3175 return (TRUE); 3176 } else 3177 return (FALSE); 3178 } 3179 3180 /* 3181 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3182 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3183 * false if the PV entry cannot be allocated without resorting to reclamation. 3184 */ 3185 static bool 3186 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 3187 struct rwlock **lockp) 3188 { 3189 struct md_page *pvh; 3190 pv_entry_t pv; 3191 vm_paddr_t pa; 3192 3193 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3194 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3195 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
3196 NULL : lockp)) == NULL) 3197 return (false); 3198 pv->pv_va = va; 3199 pa = l2e & ~ATTR_MASK; 3200 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3201 pvh = pa_to_pvh(pa); 3202 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3203 pvh->pv_gen++; 3204 return (true); 3205 } 3206 3207 static void 3208 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 3209 { 3210 pt_entry_t newl2, oldl2 __diagused; 3211 vm_page_t ml3; 3212 vm_paddr_t ml3pa; 3213 3214 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 3215 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3216 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3217 3218 ml3 = pmap_remove_pt_page(pmap, va); 3219 if (ml3 == NULL) 3220 panic("pmap_remove_kernel_l2: Missing pt page"); 3221 3222 ml3pa = VM_PAGE_TO_PHYS(ml3); 3223 newl2 = ml3pa | L2_TABLE; 3224 3225 /* 3226 * If this page table page was unmapped by a promotion, then it 3227 * contains valid mappings. Zero it to invalidate those mappings. 3228 */ 3229 if (ml3->valid != 0) 3230 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 3231 3232 /* 3233 * Demote the mapping. The caller must have already invalidated the 3234 * mapping (i.e., the "break" in break-before-make). 3235 */ 3236 oldl2 = pmap_load_store(l2, newl2); 3237 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 3238 __func__, l2, oldl2)); 3239 } 3240 3241 /* 3242 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 3243 */ 3244 static int 3245 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 3246 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 3247 { 3248 struct md_page *pvh; 3249 pt_entry_t old_l2; 3250 vm_page_t m, ml3, mt; 3251 3252 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3253 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 3254 old_l2 = pmap_load_clear(l2); 3255 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3256 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 3257 3258 /* 3259 * Since a promotion must break the 4KB page mappings before making 3260 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 
3261 */ 3262 pmap_s1_invalidate_page(pmap, sva, true); 3263 3264 if (old_l2 & ATTR_SW_WIRED) 3265 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 3266 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 3267 if (old_l2 & ATTR_SW_MANAGED) { 3268 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 3269 pvh = page_to_pvh(m); 3270 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); 3271 pmap_pvh_free(pvh, pmap, sva); 3272 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 3273 if (pmap_pte_dirty(pmap, old_l2)) 3274 vm_page_dirty(mt); 3275 if (old_l2 & ATTR_AF) 3276 vm_page_aflag_set(mt, PGA_REFERENCED); 3277 if (TAILQ_EMPTY(&mt->md.pv_list) && 3278 TAILQ_EMPTY(&pvh->pv_list)) 3279 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3280 } 3281 } 3282 if (pmap == kernel_pmap) { 3283 pmap_remove_kernel_l2(pmap, l2, sva); 3284 } else { 3285 ml3 = pmap_remove_pt_page(pmap, sva); 3286 if (ml3 != NULL) { 3287 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 3288 ("pmap_remove_l2: l3 page not promoted")); 3289 pmap_resident_count_dec(pmap, 1); 3290 KASSERT(ml3->ref_count == NL3PG, 3291 ("pmap_remove_l2: l3 page ref count error")); 3292 ml3->ref_count = 0; 3293 pmap_add_delayed_free_list(ml3, free, FALSE); 3294 } 3295 } 3296 return (pmap_unuse_pt(pmap, sva, l1e, free)); 3297 } 3298 3299 /* 3300 * pmap_remove_l3: do the things to unmap a page in a process 3301 */ 3302 static int 3303 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 3304 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 3305 { 3306 struct md_page *pvh; 3307 pt_entry_t old_l3; 3308 vm_page_t m; 3309 3310 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3311 old_l3 = pmap_load_clear(l3); 3312 pmap_s1_invalidate_page(pmap, va, true); 3313 if (old_l3 & ATTR_SW_WIRED) 3314 pmap->pm_stats.wired_count -= 1; 3315 pmap_resident_count_dec(pmap, 1); 3316 if (old_l3 & ATTR_SW_MANAGED) { 3317 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 3318 if (pmap_pte_dirty(pmap, old_l3)) 3319 vm_page_dirty(m); 3320 if (old_l3 & ATTR_AF) 3321 vm_page_aflag_set(m, PGA_REFERENCED); 3322 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3323 pmap_pvh_free(&m->md, pmap, va); 3324 if (TAILQ_EMPTY(&m->md.pv_list) && 3325 (m->flags & PG_FICTITIOUS) == 0) { 3326 pvh = page_to_pvh(m); 3327 if (TAILQ_EMPTY(&pvh->pv_list)) 3328 vm_page_aflag_clear(m, PGA_WRITEABLE); 3329 } 3330 } 3331 return (pmap_unuse_pt(pmap, va, l2e, free)); 3332 } 3333 3334 /* 3335 * Remove the specified range of addresses from the L3 page table that is 3336 * identified by the given L2 entry. 3337 */ 3338 static void 3339 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 3340 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 3341 { 3342 struct md_page *pvh; 3343 struct rwlock *new_lock; 3344 pt_entry_t *l3, old_l3; 3345 vm_offset_t va; 3346 vm_page_t l3pg, m; 3347 3348 KASSERT(ADDR_IS_CANONICAL(sva), 3349 ("%s: Start address not in canonical form: %lx", __func__, sva)); 3350 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 3351 ("%s: End address not in canonical form: %lx", __func__, eva)); 3352 3353 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3354 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 3355 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 3356 l3pg = !ADDR_IS_KERNEL(sva) ? 
PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL; 3357 va = eva; 3358 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 3359 if (!pmap_l3_valid(pmap_load(l3))) { 3360 if (va != eva) { 3361 pmap_s1_invalidate_range(pmap, va, sva, true); 3362 va = eva; 3363 } 3364 continue; 3365 } 3366 old_l3 = pmap_load_clear(l3); 3367 if ((old_l3 & ATTR_SW_WIRED) != 0) 3368 pmap->pm_stats.wired_count--; 3369 pmap_resident_count_dec(pmap, 1); 3370 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 3371 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 3372 if (pmap_pte_dirty(pmap, old_l3)) 3373 vm_page_dirty(m); 3374 if ((old_l3 & ATTR_AF) != 0) 3375 vm_page_aflag_set(m, PGA_REFERENCED); 3376 new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); 3377 if (new_lock != *lockp) { 3378 if (*lockp != NULL) { 3379 /* 3380 * Pending TLB invalidations must be 3381 * performed before the PV list lock is 3382 * released. Otherwise, a concurrent 3383 * pmap_remove_all() on a physical page 3384 * could return while a stale TLB entry 3385 * still provides access to that page. 3386 */ 3387 if (va != eva) { 3388 pmap_s1_invalidate_range(pmap, va, 3389 sva, true); 3390 va = eva; 3391 } 3392 rw_wunlock(*lockp); 3393 } 3394 *lockp = new_lock; 3395 rw_wlock(*lockp); 3396 } 3397 pmap_pvh_free(&m->md, pmap, sva); 3398 if (TAILQ_EMPTY(&m->md.pv_list) && 3399 (m->flags & PG_FICTITIOUS) == 0) { 3400 pvh = page_to_pvh(m); 3401 if (TAILQ_EMPTY(&pvh->pv_list)) 3402 vm_page_aflag_clear(m, PGA_WRITEABLE); 3403 } 3404 } 3405 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 3406 /* 3407 * _pmap_unwire_l3() has already invalidated the TLB 3408 * entries at all levels for "sva". So, we need not 3409 * perform "sva += L3_SIZE;" here. Moreover, we need 3410 * not perform "va = sva;" if "sva" is at the start 3411 * of a new valid range consisting of a single page. 3412 */ 3413 break; 3414 } 3415 if (va == eva) 3416 va = sva; 3417 } 3418 if (va != eva) 3419 pmap_s1_invalidate_range(pmap, va, sva, true); 3420 } 3421 3422 /* 3423 * Remove the given range of addresses from the specified map. 3424 * 3425 * It is assumed that the start and end are properly 3426 * rounded to the page size. 3427 */ 3428 void 3429 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3430 { 3431 struct rwlock *lock; 3432 vm_offset_t va_next; 3433 pd_entry_t *l0, *l1, *l2; 3434 pt_entry_t l3_paddr; 3435 struct spglist free; 3436 3437 /* 3438 * Perform an unsynchronized read. This is, however, safe. 
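	 * A resident count of zero means that there are no mappings to remove.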
3439 */ 3440 if (pmap->pm_stats.resident_count == 0) 3441 return; 3442 3443 SLIST_INIT(&free); 3444 3445 PMAP_LOCK(pmap); 3446 3447 lock = NULL; 3448 for (; sva < eva; sva = va_next) { 3449 if (pmap->pm_stats.resident_count == 0) 3450 break; 3451 3452 l0 = pmap_l0(pmap, sva); 3453 if (pmap_load(l0) == 0) { 3454 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3455 if (va_next < sva) 3456 va_next = eva; 3457 continue; 3458 } 3459 3460 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3461 if (va_next < sva) 3462 va_next = eva; 3463 l1 = pmap_l0_to_l1(l0, sva); 3464 if (pmap_load(l1) == 0) 3465 continue; 3466 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3467 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 3468 KASSERT(va_next <= eva, 3469 ("partial update of non-transparent 1G page " 3470 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3471 pmap_load(l1), sva, eva, va_next)); 3472 MPASS(pmap != kernel_pmap); 3473 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3474 pmap_clear(l1); 3475 pmap_s1_invalidate_page(pmap, sva, true); 3476 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 3477 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 3478 continue; 3479 } 3480 3481 /* 3482 * Calculate index for next page table. 3483 */ 3484 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3485 if (va_next < sva) 3486 va_next = eva; 3487 3488 l2 = pmap_l1_to_l2(l1, sva); 3489 if (l2 == NULL) 3490 continue; 3491 3492 l3_paddr = pmap_load(l2); 3493 3494 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 3495 if (sva + L2_SIZE == va_next && eva >= va_next) { 3496 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 3497 &free, &lock); 3498 continue; 3499 } else if (pmap_demote_l2_locked(pmap, l2, sva, 3500 &lock) == NULL) 3501 continue; 3502 l3_paddr = pmap_load(l2); 3503 } 3504 3505 /* 3506 * Weed out invalid mappings. 3507 */ 3508 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 3509 continue; 3510 3511 /* 3512 * Limit our scan to either the end of the va represented 3513 * by the current page table page, or to the end of the 3514 * range being removed. 3515 */ 3516 if (va_next > eva) 3517 va_next = eva; 3518 3519 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 3520 &lock); 3521 } 3522 if (lock != NULL) 3523 rw_wunlock(lock); 3524 PMAP_UNLOCK(pmap); 3525 vm_page_free_pages_toq(&free, true); 3526 } 3527 3528 /* 3529 * Routine: pmap_remove_all 3530 * Function: 3531 * Removes this physical page from 3532 * all physical maps in which it resides. 3533 * Reflects back modify bits to the pager. 3534 * 3535 * Notes: 3536 * Original versions of this routine were very 3537 * inefficient because they iteratively called 3538 * pmap_remove (slow...) 3539 */ 3540 3541 void 3542 pmap_remove_all(vm_page_t m) 3543 { 3544 struct md_page *pvh; 3545 pv_entry_t pv; 3546 pmap_t pmap; 3547 struct rwlock *lock; 3548 pd_entry_t *pde, tpde; 3549 pt_entry_t *pte, tpte; 3550 vm_offset_t va; 3551 struct spglist free; 3552 int lvl, pvh_gen, md_gen; 3553 3554 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3555 ("pmap_remove_all: page %p is not managed", m)); 3556 SLIST_INIT(&free); 3557 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3558 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : page_to_pvh(m); 3559 rw_wlock(lock); 3560 retry: 3561 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3562 pmap = PV_PMAP(pv); 3563 if (!PMAP_TRYLOCK(pmap)) { 3564 pvh_gen = pvh->pv_gen; 3565 rw_wunlock(lock); 3566 PMAP_LOCK(pmap); 3567 rw_wlock(lock); 3568 if (pvh_gen != pvh->pv_gen) { 3569 PMAP_UNLOCK(pmap); 3570 goto retry; 3571 } 3572 } 3573 va = pv->pv_va; 3574 pte = pmap_pte_exists(pmap, va, 2, __func__); 3575 pmap_demote_l2_locked(pmap, pte, va, &lock); 3576 PMAP_UNLOCK(pmap); 3577 } 3578 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3579 pmap = PV_PMAP(pv); 3580 PMAP_ASSERT_STAGE1(pmap); 3581 if (!PMAP_TRYLOCK(pmap)) { 3582 pvh_gen = pvh->pv_gen; 3583 md_gen = m->md.pv_gen; 3584 rw_wunlock(lock); 3585 PMAP_LOCK(pmap); 3586 rw_wlock(lock); 3587 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3588 PMAP_UNLOCK(pmap); 3589 goto retry; 3590 } 3591 } 3592 pmap_resident_count_dec(pmap, 1); 3593 3594 pde = pmap_pde(pmap, pv->pv_va, &lvl); 3595 KASSERT(pde != NULL, 3596 ("pmap_remove_all: no page directory entry found")); 3597 KASSERT(lvl == 2, 3598 ("pmap_remove_all: invalid pde level %d", lvl)); 3599 tpde = pmap_load(pde); 3600 3601 pte = pmap_l2_to_l3(pde, pv->pv_va); 3602 tpte = pmap_load_clear(pte); 3603 if (tpte & ATTR_SW_WIRED) 3604 pmap->pm_stats.wired_count--; 3605 if ((tpte & ATTR_AF) != 0) { 3606 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 3607 vm_page_aflag_set(m, PGA_REFERENCED); 3608 } 3609 3610 /* 3611 * Update the vm_page_t clean and reference bits. 3612 */ 3613 if (pmap_pte_dirty(pmap, tpte)) 3614 vm_page_dirty(m); 3615 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 3616 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3617 m->md.pv_gen++; 3618 free_pv_entry(pmap, pv); 3619 PMAP_UNLOCK(pmap); 3620 } 3621 vm_page_aflag_clear(m, PGA_WRITEABLE); 3622 rw_wunlock(lock); 3623 vm_page_free_pages_toq(&free, true); 3624 } 3625 3626 /* 3627 * Masks and sets bits in a level 2 page table entries in the specified pmap 3628 */ 3629 static void 3630 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 3631 pt_entry_t nbits) 3632 { 3633 pd_entry_t old_l2; 3634 vm_page_t m, mt; 3635 3636 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3637 PMAP_ASSERT_STAGE1(pmap); 3638 KASSERT((sva & L2_OFFSET) == 0, 3639 ("pmap_protect_l2: sva is not 2mpage aligned")); 3640 old_l2 = pmap_load(l2); 3641 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3642 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 3643 3644 /* 3645 * Return if the L2 entry already has the desired access restrictions 3646 * in place. 3647 */ 3648 if ((old_l2 & mask) == nbits) 3649 return; 3650 3651 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 3652 cpu_spinwait(); 3653 3654 /* 3655 * When a dirty read/write superpage mapping is write protected, 3656 * update the dirty field of each of the superpage's constituent 4KB 3657 * pages. 3658 */ 3659 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 3660 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3661 pmap_pte_dirty(pmap, old_l2)) { 3662 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 3663 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3664 vm_page_dirty(mt); 3665 } 3666 3667 /* 3668 * Since a promotion must break the 4KB page mappings before making 3669 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 
3670 */ 3671 pmap_s1_invalidate_page(pmap, sva, true); 3672 } 3673 3674 /* 3675 * Masks and sets bits in last level page table entries in the specified 3676 * pmap and range 3677 */ 3678 static void 3679 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 3680 pt_entry_t nbits, bool invalidate) 3681 { 3682 vm_offset_t va, va_next; 3683 pd_entry_t *l0, *l1, *l2; 3684 pt_entry_t *l3p, l3; 3685 3686 PMAP_LOCK(pmap); 3687 for (; sva < eva; sva = va_next) { 3688 l0 = pmap_l0(pmap, sva); 3689 if (pmap_load(l0) == 0) { 3690 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3691 if (va_next < sva) 3692 va_next = eva; 3693 continue; 3694 } 3695 3696 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3697 if (va_next < sva) 3698 va_next = eva; 3699 l1 = pmap_l0_to_l1(l0, sva); 3700 if (pmap_load(l1) == 0) 3701 continue; 3702 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3703 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 3704 KASSERT(va_next <= eva, 3705 ("partial update of non-transparent 1G page " 3706 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3707 pmap_load(l1), sva, eva, va_next)); 3708 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3709 if ((pmap_load(l1) & mask) != nbits) { 3710 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 3711 if (invalidate) 3712 pmap_s1_invalidate_page(pmap, sva, true); 3713 } 3714 continue; 3715 } 3716 3717 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3718 if (va_next < sva) 3719 va_next = eva; 3720 3721 l2 = pmap_l1_to_l2(l1, sva); 3722 if (pmap_load(l2) == 0) 3723 continue; 3724 3725 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 3726 if (sva + L2_SIZE == va_next && eva >= va_next) { 3727 pmap_protect_l2(pmap, l2, sva, mask, nbits); 3728 continue; 3729 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 3730 continue; 3731 } 3732 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 3733 ("pmap_protect: Invalid L2 entry after demotion")); 3734 3735 if (va_next > eva) 3736 va_next = eva; 3737 3738 va = va_next; 3739 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 3740 sva += L3_SIZE) { 3741 l3 = pmap_load(l3p); 3742 3743 /* 3744 * Go to the next L3 entry if the current one is 3745 * invalid or already has the desired access 3746 * restrictions in place. (The latter case occurs 3747 * frequently. For example, in a "buildworld" 3748 * workload, almost 1 out of 4 L3 entries already 3749 * have the desired restrictions.) 3750 */ 3751 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 3752 if (va != va_next) { 3753 if (invalidate) 3754 pmap_s1_invalidate_range(pmap, 3755 va, sva, true); 3756 va = va_next; 3757 } 3758 continue; 3759 } 3760 3761 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 3762 nbits)) 3763 cpu_spinwait(); 3764 3765 /* 3766 * When a dirty read/write mapping is write protected, 3767 * update the page's dirty field. 3768 */ 3769 if ((l3 & ATTR_SW_MANAGED) != 0 && 3770 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3771 pmap_pte_dirty(pmap, l3)) 3772 vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); 3773 3774 if (va == va_next) 3775 va = sva; 3776 } 3777 if (va != va_next && invalidate) 3778 pmap_s1_invalidate_range(pmap, va, sva, true); 3779 } 3780 PMAP_UNLOCK(pmap); 3781 } 3782 3783 /* 3784 * Set the physical protection on the 3785 * specified range of this map as requested. 
3786 */ 3787 void 3788 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3789 { 3790 pt_entry_t mask, nbits; 3791 3792 PMAP_ASSERT_STAGE1(pmap); 3793 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3794 if (prot == VM_PROT_NONE) { 3795 pmap_remove(pmap, sva, eva); 3796 return; 3797 } 3798 3799 mask = nbits = 0; 3800 if ((prot & VM_PROT_WRITE) == 0) { 3801 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 3802 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 3803 } 3804 if ((prot & VM_PROT_EXECUTE) == 0) { 3805 mask |= ATTR_S1_XN; 3806 nbits |= ATTR_S1_XN; 3807 } 3808 if (mask == 0) 3809 return; 3810 3811 pmap_mask_set(pmap, sva, eva, mask, nbits, true); 3812 } 3813 3814 void 3815 pmap_disable_promotion(vm_offset_t sva, vm_size_t size) 3816 { 3817 3818 MPASS((sva & L3_OFFSET) == 0); 3819 MPASS(((sva + size) & L3_OFFSET) == 0); 3820 3821 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE, 3822 ATTR_SW_NO_PROMOTE, false); 3823 } 3824 3825 /* 3826 * Inserts the specified page table page into the specified pmap's collection 3827 * of idle page table pages. Each of a pmap's page table pages is responsible 3828 * for mapping a distinct range of virtual addresses. The pmap's collection is 3829 * ordered by this virtual address range. 3830 * 3831 * If "promoted" is false, then the page table page "mpte" must be zero filled. 3832 */ 3833 static __inline int 3834 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 3835 { 3836 3837 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3838 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 3839 return (vm_radix_insert(&pmap->pm_root, mpte)); 3840 } 3841 3842 /* 3843 * Removes the page table page mapping the specified virtual address from the 3844 * specified pmap's collection of idle page table pages, and returns it. 3845 * Otherwise, returns NULL if there is no page table page corresponding to the 3846 * specified virtual address. 3847 */ 3848 static __inline vm_page_t 3849 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 3850 { 3851 3852 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3853 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 3854 } 3855 3856 /* 3857 * Performs a break-before-make update of a pmap entry. This is needed when 3858 * either promoting or demoting pages to ensure the TLB doesn't get into an 3859 * inconsistent state. 3860 */ 3861 static void 3862 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 3863 vm_offset_t va, vm_size_t size) 3864 { 3865 register_t intr; 3866 3867 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3868 3869 if ((newpte & ATTR_SW_NO_PROMOTE) != 0) 3870 panic("%s: Updating non-promote pte", __func__); 3871 3872 /* 3873 * Ensure we don't get switched out with the page table in an 3874 * inconsistent state. We also need to ensure no interrupts fire 3875 * as they may make use of an address we are about to invalidate. 3876 */ 3877 intr = intr_disable(); 3878 3879 /* 3880 * Clear the old mapping's valid bit, but leave the rest of the entry 3881 * unchanged, so that a lockless, concurrent pmap_kextract() can still 3882 * lookup the physical address. 3883 */ 3884 pmap_clear_bits(pte, ATTR_DESCR_VALID); 3885 3886 /* 3887 * When promoting, the L{1,2}_TABLE entry that is being replaced might 3888 * be cached, so we invalidate intermediate entries as well as final 3889 * entries. 
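 *
 * At this point the old entry has already been made invalid in the page
 * table (ATTR_DESCR_VALID was cleared above) with interrupts disabled;
 * what remains is to flush the TLB for the whole range, publish the new
 * entry with pmap_store(), and issue a dsb(ishst) before interrupts are
 * re-enabled.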
3890 */ 3891 pmap_s1_invalidate_range(pmap, va, va + size, false); 3892 3893 /* Create the new mapping */ 3894 pmap_store(pte, newpte); 3895 dsb(ishst); 3896 3897 intr_restore(intr); 3898 } 3899 3900 #if VM_NRESERVLEVEL > 0 3901 /* 3902 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3903 * replace the many pv entries for the 4KB page mappings by a single pv entry 3904 * for the 2MB page mapping. 3905 */ 3906 static void 3907 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3908 struct rwlock **lockp) 3909 { 3910 struct md_page *pvh; 3911 pv_entry_t pv; 3912 vm_offset_t va_last; 3913 vm_page_t m; 3914 3915 KASSERT((pa & L2_OFFSET) == 0, 3916 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 3917 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3918 3919 /* 3920 * Transfer the first page's pv entry for this mapping to the 2mpage's 3921 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3922 * a transfer avoids the possibility that get_pv_entry() calls 3923 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3924 * mappings that is being promoted. 3925 */ 3926 m = PHYS_TO_VM_PAGE(pa); 3927 va = va & ~L2_OFFSET; 3928 pv = pmap_pvh_remove(&m->md, pmap, va); 3929 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 3930 pvh = page_to_pvh(m); 3931 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3932 pvh->pv_gen++; 3933 /* Free the remaining NPTEPG - 1 pv entries. */ 3934 va_last = va + L2_SIZE - PAGE_SIZE; 3935 do { 3936 m++; 3937 va += PAGE_SIZE; 3938 pmap_pvh_free(&m->md, pmap, va); 3939 } while (va < va_last); 3940 } 3941 3942 /* 3943 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3944 * single level 2 table entry to a single 2MB page mapping. For promotion 3945 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3946 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3947 * identical characteristics. 3948 */ 3949 static void 3950 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte, 3951 struct rwlock **lockp) 3952 { 3953 pt_entry_t *firstl3, *l3, newl2, oldl3, pa; 3954 3955 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3956 PMAP_ASSERT_STAGE1(pmap); 3957 3958 /* 3959 * Examine the first L3E in the specified PTP. Abort if this L3E is 3960 * ineligible for promotion, invalid, or does not map the first 4KB 3961 * physical page within a 2MB page. 3962 */ 3963 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); 3964 newl2 = pmap_load(firstl3); 3965 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0) 3966 return; 3967 if ((newl2 & ((~ATTR_MASK & L2_OFFSET) | ATTR_DESCR_MASK)) != L3_PAGE) { 3968 atomic_add_long(&pmap_l2_p_failures, 1); 3969 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3970 " in pmap %p", va, pmap); 3971 return; 3972 } 3973 3974 /* 3975 * Both here and in the below "for" loop, to allow for repromotion 3976 * after MADV_FREE, conditionally write protect a clean L3E before 3977 * possibly aborting the promotion due to other L3E attributes. Why? 3978 * Suppose that MADV_FREE is applied to a part of a superpage, the 3979 * address range [S, E). pmap_advise() will demote the superpage 3980 * mapping, destroy the 4KB page mapping at the end of [S, E), and 3981 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later, 3982 * imagine that the memory in [S, E) is recycled, but the last 4KB 3983 * page in [S, E) is not the last to be rewritten, or simply accessed. 
3984 * In other words, there is still a 4KB page in [S, E), call it P, 3985 * that is writeable but AP_RO is set and AF is clear in P's L3E. 3986 * Unless we write protect P before aborting the promotion, if and 3987 * when P is finally rewritten, there won't be a page fault to trigger 3988 * repromotion. 3989 */ 3990 setl2: 3991 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3992 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3993 /* 3994 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 3995 * ATTR_SW_DBM can be cleared without a TLB invalidation. 3996 */ 3997 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 3998 goto setl2; 3999 newl2 &= ~ATTR_SW_DBM; 4000 } 4001 if ((newl2 & ATTR_AF) == 0) { 4002 atomic_add_long(&pmap_l2_p_failures, 1); 4003 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4004 " in pmap %p", va, pmap); 4005 return; 4006 } 4007 4008 /* 4009 * Examine each of the other L3Es in the specified PTP. Abort if this 4010 * L3E maps an unexpected 4KB physical page or does not have identical 4011 * characteristics to the first L3E. 4012 */ 4013 pa = (newl2 & (~ATTR_MASK | ATTR_DESCR_MASK)) + L2_SIZE - PAGE_SIZE; 4014 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 4015 oldl3 = pmap_load(l3); 4016 if ((oldl3 & (~ATTR_MASK | ATTR_DESCR_MASK)) != pa) { 4017 atomic_add_long(&pmap_l2_p_failures, 1); 4018 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4019 " in pmap %p", va, pmap); 4020 return; 4021 } 4022 setl3: 4023 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4024 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4025 /* 4026 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 4027 * set, ATTR_SW_DBM can be cleared without a TLB 4028 * invalidation. 4029 */ 4030 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 4031 ~ATTR_SW_DBM)) 4032 goto setl3; 4033 oldl3 &= ~ATTR_SW_DBM; 4034 } 4035 if ((oldl3 & ATTR_MASK) != (newl2 & ATTR_MASK)) { 4036 atomic_add_long(&pmap_l2_p_failures, 1); 4037 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4038 " in pmap %p", va, pmap); 4039 return; 4040 } 4041 pa -= PAGE_SIZE; 4042 } 4043 4044 /* 4045 * Save the page table page in its current state until the L2 4046 * mapping the superpage is demoted by pmap_demote_l2() or 4047 * destroyed by pmap_remove_l3(). 
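 *
 * Concretely, the page table page is inserted into the pmap's radix
 * trie of idle page table pages via pmap_insert_pt_page() with
 * "promoted" set to true, so that a later demotion can reuse its still
 * valid L3 entries instead of recreating them.  If the trie insertion
 * fails, the promotion is aborted below.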
4048 */ 4049 if (mpte == NULL) 4050 mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 4051 KASSERT(mpte >= vm_page_array && 4052 mpte < &vm_page_array[vm_page_array_size], 4053 ("pmap_promote_l2: page table page is out of range")); 4054 KASSERT(mpte->pindex == pmap_l2_pindex(va), 4055 ("pmap_promote_l2: page table page's pindex is wrong")); 4056 if (pmap_insert_pt_page(pmap, mpte, true)) { 4057 atomic_add_long(&pmap_l2_p_failures, 1); 4058 CTR2(KTR_PMAP, 4059 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 4060 pmap); 4061 return; 4062 } 4063 4064 if ((newl2 & ATTR_SW_MANAGED) != 0) 4065 pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); 4066 4067 newl2 &= ~ATTR_DESCR_MASK; 4068 newl2 |= L2_BLOCK; 4069 4070 pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE); 4071 4072 atomic_add_long(&pmap_l2_promotions, 1); 4073 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 4074 pmap); 4075 } 4076 #endif /* VM_NRESERVLEVEL > 0 */ 4077 4078 static int 4079 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 4080 int psind) 4081 { 4082 pd_entry_t *l0p, *l1p, *l2p, origpte; 4083 vm_page_t mp; 4084 4085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4086 KASSERT(psind > 0 && psind < MAXPAGESIZES, 4087 ("psind %d unexpected", psind)); 4088 KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0, 4089 ("unaligned phys address %#lx newpte %#lx psind %d", 4090 (newpte & ~ATTR_MASK), newpte, psind)); 4091 4092 restart: 4093 if (psind == 2) { 4094 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4095 4096 l0p = pmap_l0(pmap, va); 4097 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 4098 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 4099 if (mp == NULL) { 4100 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 4101 return (KERN_RESOURCE_SHORTAGE); 4102 PMAP_UNLOCK(pmap); 4103 vm_wait(NULL); 4104 PMAP_LOCK(pmap); 4105 goto restart; 4106 } 4107 l1p = pmap_l0_to_l1(l0p, va); 4108 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 4109 origpte = pmap_load(l1p); 4110 } else { 4111 l1p = pmap_l0_to_l1(l0p, va); 4112 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 4113 origpte = pmap_load(l1p); 4114 if ((origpte & ATTR_DESCR_VALID) == 0) { 4115 mp = PHYS_TO_VM_PAGE(pmap_load(l0p) & 4116 ~ATTR_MASK); 4117 mp->ref_count++; 4118 } 4119 } 4120 KASSERT(((origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK) && 4121 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) || 4122 (origpte & ATTR_DESCR_VALID) == 0, 4123 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 4124 va, origpte, newpte)); 4125 pmap_store(l1p, newpte); 4126 } else /* (psind == 1) */ { 4127 l2p = pmap_l2(pmap, va); 4128 if (l2p == NULL) { 4129 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 4130 if (mp == NULL) { 4131 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 4132 return (KERN_RESOURCE_SHORTAGE); 4133 PMAP_UNLOCK(pmap); 4134 vm_wait(NULL); 4135 PMAP_LOCK(pmap); 4136 goto restart; 4137 } 4138 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 4139 l2p = &l2p[pmap_l2_index(va)]; 4140 origpte = pmap_load(l2p); 4141 } else { 4142 l1p = pmap_l1(pmap, va); 4143 origpte = pmap_load(l2p); 4144 if ((origpte & ATTR_DESCR_VALID) == 0) { 4145 mp = PHYS_TO_VM_PAGE(pmap_load(l1p) & 4146 ~ATTR_MASK); 4147 mp->ref_count++; 4148 } 4149 } 4150 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 4151 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 4152 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 4153 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 4154 va, origpte, newpte)); 4155 pmap_store(l2p, newpte); 4156 } 4157 dsb(ishst); 4158 
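	/*
	 * Account for the new mapping: the resident count grows when no
	 * valid entry was replaced, and the wired count is adjusted on any
	 * wiring transition, both in units of pagesizes[psind] / PAGE_SIZE
	 * small pages.
	 */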
4159 if ((origpte & ATTR_DESCR_VALID) == 0) 4160 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 4161 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 4162 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 4163 else if ((newpte & ATTR_SW_WIRED) == 0 && 4164 (origpte & ATTR_SW_WIRED) != 0) 4165 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 4166 4167 return (KERN_SUCCESS); 4168 } 4169 4170 /* 4171 * Insert the given physical page (p) at 4172 * the specified virtual address (v) in the 4173 * target physical map with the protection requested. 4174 * 4175 * If specified, the page will be wired down, meaning 4176 * that the related pte can not be reclaimed. 4177 * 4178 * NB: This is the only routine which MAY NOT lazy-evaluate 4179 * or lose information. That is, this routine must actually 4180 * insert this page into the given map NOW. 4181 */ 4182 int 4183 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4184 u_int flags, int8_t psind) 4185 { 4186 struct rwlock *lock; 4187 pd_entry_t *pde; 4188 pt_entry_t new_l3, orig_l3; 4189 pt_entry_t *l2, *l3; 4190 pv_entry_t pv; 4191 vm_paddr_t opa, pa; 4192 vm_page_t mpte, om; 4193 boolean_t nosleep; 4194 int lvl, rv; 4195 4196 KASSERT(ADDR_IS_CANONICAL(va), 4197 ("%s: Address not in canonical form: %lx", __func__, va)); 4198 4199 va = trunc_page(va); 4200 if ((m->oflags & VPO_UNMANAGED) == 0) 4201 VM_PAGE_OBJECT_BUSY_ASSERT(m); 4202 pa = VM_PAGE_TO_PHYS(m); 4203 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE); 4204 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 4205 new_l3 |= pmap_pte_prot(pmap, prot); 4206 4207 if ((flags & PMAP_ENTER_WIRED) != 0) 4208 new_l3 |= ATTR_SW_WIRED; 4209 if (pmap->pm_stage == PM_STAGE1) { 4210 if (!ADDR_IS_KERNEL(va)) 4211 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4212 else 4213 new_l3 |= ATTR_S1_UXN; 4214 if (pmap != kernel_pmap) 4215 new_l3 |= ATTR_S1_nG; 4216 } else { 4217 /* 4218 * Clear the access flag on executable mappings, this will be 4219 * set later when the page is accessed. The fault handler is 4220 * required to invalidate the I-cache. 4221 * 4222 * TODO: Switch to the valid flag to allow hardware management 4223 * of the access flag. Much of the pmap code assumes the 4224 * valid flag is set and fails to destroy the old page tables 4225 * correctly if it is clear. 4226 */ 4227 if (prot & VM_PROT_EXECUTE) 4228 new_l3 &= ~ATTR_AF; 4229 } 4230 if ((m->oflags & VPO_UNMANAGED) == 0) { 4231 new_l3 |= ATTR_SW_MANAGED; 4232 if ((prot & VM_PROT_WRITE) != 0) { 4233 new_l3 |= ATTR_SW_DBM; 4234 if ((flags & VM_PROT_WRITE) == 0) { 4235 if (pmap->pm_stage == PM_STAGE1) 4236 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 4237 else 4238 new_l3 &= 4239 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 4240 } 4241 } 4242 } 4243 4244 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 4245 4246 lock = NULL; 4247 PMAP_LOCK(pmap); 4248 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 4249 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 4250 ("managed largepage va %#lx flags %#x", va, flags)); 4251 new_l3 &= ~L3_PAGE; 4252 if (psind == 2) { 4253 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4254 new_l3 |= L1_BLOCK; 4255 } else /* (psind == 1) */ 4256 new_l3 |= L2_BLOCK; 4257 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 4258 goto out; 4259 } 4260 if (psind == 1) { 4261 /* Assert the required virtual and physical alignment. 
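		 * A psind of 1 requests a 2MB mapping, so the virtual
		 * address must be L2-aligned and the physical page must
		 * itself belong to a superpage-sized run (m->psind > 0).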
*/ 4262 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 4263 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 4264 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 4265 flags, m, &lock); 4266 goto out; 4267 } 4268 mpte = NULL; 4269 4270 /* 4271 * In the case that a page table page is not 4272 * resident, we are creating it here. 4273 */ 4274 retry: 4275 pde = pmap_pde(pmap, va, &lvl); 4276 if (pde != NULL && lvl == 2) { 4277 l3 = pmap_l2_to_l3(pde, va); 4278 if (!ADDR_IS_KERNEL(va) && mpte == NULL) { 4279 mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 4280 mpte->ref_count++; 4281 } 4282 goto havel3; 4283 } else if (pde != NULL && lvl == 1) { 4284 l2 = pmap_l1_to_l2(pde, va); 4285 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 4286 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 4287 l3 = &l3[pmap_l3_index(va)]; 4288 if (!ADDR_IS_KERNEL(va)) { 4289 mpte = PHYS_TO_VM_PAGE( 4290 pmap_load(l2) & ~ATTR_MASK); 4291 mpte->ref_count++; 4292 } 4293 goto havel3; 4294 } 4295 /* We need to allocate an L3 table. */ 4296 } 4297 if (!ADDR_IS_KERNEL(va)) { 4298 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4299 4300 /* 4301 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 4302 * to handle the possibility that a superpage mapping for "va" 4303 * was created while we slept. 4304 */ 4305 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 4306 nosleep ? NULL : &lock); 4307 if (mpte == NULL && nosleep) { 4308 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 4309 rv = KERN_RESOURCE_SHORTAGE; 4310 goto out; 4311 } 4312 goto retry; 4313 } else 4314 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 4315 4316 havel3: 4317 orig_l3 = pmap_load(l3); 4318 opa = orig_l3 & ~ATTR_MASK; 4319 pv = NULL; 4320 4321 /* 4322 * Is the specified virtual address already mapped? 4323 */ 4324 if (pmap_l3_valid(orig_l3)) { 4325 /* 4326 * Only allow adding new entries on stage 2 tables for now. 4327 * This simplifies cache invalidation as we may need to call 4328 * into EL2 to perform such actions. 4329 */ 4330 PMAP_ASSERT_STAGE1(pmap); 4331 /* 4332 * Wiring change, just update stats. We don't worry about 4333 * wiring PT pages as they remain resident as long as there 4334 * are valid mappings in them. Hence, if a user page is wired, 4335 * the PT page will be also. 4336 */ 4337 if ((flags & PMAP_ENTER_WIRED) != 0 && 4338 (orig_l3 & ATTR_SW_WIRED) == 0) 4339 pmap->pm_stats.wired_count++; 4340 else if ((flags & PMAP_ENTER_WIRED) == 0 && 4341 (orig_l3 & ATTR_SW_WIRED) != 0) 4342 pmap->pm_stats.wired_count--; 4343 4344 /* 4345 * Remove the extra PT page reference. 4346 */ 4347 if (mpte != NULL) { 4348 mpte->ref_count--; 4349 KASSERT(mpte->ref_count > 0, 4350 ("pmap_enter: missing reference to page table page," 4351 " va: 0x%lx", va)); 4352 } 4353 4354 /* 4355 * Has the physical page changed? 4356 */ 4357 if (opa == pa) { 4358 /* 4359 * No, might be a protection or wiring change. 4360 */ 4361 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 4362 (new_l3 & ATTR_SW_DBM) != 0) 4363 vm_page_aflag_set(m, PGA_WRITEABLE); 4364 goto validate; 4365 } 4366 4367 /* 4368 * The physical page has changed. Temporarily invalidate 4369 * the mapping. 
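 *
 * The old entry is cleared atomically below; when the old mapping was
 * managed, its dirty and referenced state is transferred to the old
 * page and its pv entry is removed (and reused for the new page when
 * the new page is managed as well).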
4370 */ 4371 orig_l3 = pmap_load_clear(l3); 4372 KASSERT((orig_l3 & ~ATTR_MASK) == opa, 4373 ("pmap_enter: unexpected pa update for %#lx", va)); 4374 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 4375 om = PHYS_TO_VM_PAGE(opa); 4376 4377 /* 4378 * The pmap lock is sufficient to synchronize with 4379 * concurrent calls to pmap_page_test_mappings() and 4380 * pmap_ts_referenced(). 4381 */ 4382 if (pmap_pte_dirty(pmap, orig_l3)) 4383 vm_page_dirty(om); 4384 if ((orig_l3 & ATTR_AF) != 0) { 4385 pmap_s1_invalidate_page(pmap, va, true); 4386 vm_page_aflag_set(om, PGA_REFERENCED); 4387 } 4388 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4389 pv = pmap_pvh_remove(&om->md, pmap, va); 4390 if ((m->oflags & VPO_UNMANAGED) != 0) 4391 free_pv_entry(pmap, pv); 4392 if ((om->a.flags & PGA_WRITEABLE) != 0 && 4393 TAILQ_EMPTY(&om->md.pv_list) && 4394 ((om->flags & PG_FICTITIOUS) != 0 || 4395 TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) 4396 vm_page_aflag_clear(om, PGA_WRITEABLE); 4397 } else { 4398 KASSERT((orig_l3 & ATTR_AF) != 0, 4399 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 4400 pmap_s1_invalidate_page(pmap, va, true); 4401 } 4402 orig_l3 = 0; 4403 } else { 4404 /* 4405 * Increment the counters. 4406 */ 4407 if ((new_l3 & ATTR_SW_WIRED) != 0) 4408 pmap->pm_stats.wired_count++; 4409 pmap_resident_count_inc(pmap, 1); 4410 } 4411 /* 4412 * Enter on the PV list if part of our managed memory. 4413 */ 4414 if ((m->oflags & VPO_UNMANAGED) == 0) { 4415 if (pv == NULL) { 4416 pv = get_pv_entry(pmap, &lock); 4417 pv->pv_va = va; 4418 } 4419 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4420 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4421 m->md.pv_gen++; 4422 if ((new_l3 & ATTR_SW_DBM) != 0) 4423 vm_page_aflag_set(m, PGA_WRITEABLE); 4424 } 4425 4426 validate: 4427 if (pmap->pm_stage == PM_STAGE1) { 4428 /* 4429 * Sync icache if exec permission and attribute 4430 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping 4431 * is stored and made valid for hardware table walk. If done 4432 * later, then other can access this page before caches are 4433 * properly synced. Don't do it for kernel memory which is 4434 * mapped with exec permission even if the memory isn't going 4435 * to hold executable code. The only time when icache sync is 4436 * needed is after kernel module is loaded and the relocation 4437 * info is processed. And it's done in elf_cpu_load_file(). 4438 */ 4439 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4440 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && 4441 (opa != pa || (orig_l3 & ATTR_S1_XN))) { 4442 PMAP_ASSERT_STAGE1(pmap); 4443 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4444 } 4445 } else { 4446 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4447 } 4448 4449 /* 4450 * Update the L3 entry 4451 */ 4452 if (pmap_l3_valid(orig_l3)) { 4453 PMAP_ASSERT_STAGE1(pmap); 4454 KASSERT(opa == pa, ("pmap_enter: invalid update")); 4455 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { 4456 /* same PA, different attributes */ 4457 orig_l3 = pmap_load_store(l3, new_l3); 4458 pmap_s1_invalidate_page(pmap, va, true); 4459 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 4460 pmap_pte_dirty(pmap, orig_l3)) 4461 vm_page_dirty(m); 4462 } else { 4463 /* 4464 * orig_l3 == new_l3 4465 * This can happens if multiple threads simultaneously 4466 * access not yet mapped page. This bad for performance 4467 * since this can cause full demotion-NOP-promotion 4468 * cycle. 
4469 * Another possible reasons are: 4470 * - VM and pmap memory layout are diverged 4471 * - tlb flush is missing somewhere and CPU doesn't see 4472 * actual mapping. 4473 */ 4474 CTR4(KTR_PMAP, "%s: already mapped page - " 4475 "pmap %p va 0x%#lx pte 0x%lx", 4476 __func__, pmap, va, new_l3); 4477 } 4478 } else { 4479 /* New mapping */ 4480 pmap_store(l3, new_l3); 4481 dsb(ishst); 4482 } 4483 4484 #if VM_NRESERVLEVEL > 0 4485 /* 4486 * Try to promote from level 3 pages to a level 2 superpage. This 4487 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at 4488 * stage 1 specific fields and performs a break-before-make sequence 4489 * that is incorrect a stage 2 pmap. 4490 */ 4491 if ((mpte == NULL || mpte->ref_count == NL3PG) && 4492 pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 && 4493 (m->flags & PG_FICTITIOUS) == 0 && 4494 vm_reserv_level_iffullpop(m) == 0) { 4495 pmap_promote_l2(pmap, pde, va, mpte, &lock); 4496 } 4497 #endif 4498 4499 rv = KERN_SUCCESS; 4500 out: 4501 if (lock != NULL) 4502 rw_wunlock(lock); 4503 PMAP_UNLOCK(pmap); 4504 return (rv); 4505 } 4506 4507 /* 4508 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 4509 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 4510 * value. See pmap_enter_l2() for the possible error values when "no sleep", 4511 * "no replace", and "no reclaim" are specified. 4512 */ 4513 static int 4514 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4515 struct rwlock **lockp) 4516 { 4517 pd_entry_t new_l2; 4518 4519 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4520 PMAP_ASSERT_STAGE1(pmap); 4521 KASSERT(ADDR_IS_CANONICAL(va), 4522 ("%s: Address not in canonical form: %lx", __func__, va)); 4523 4524 new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | 4525 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 4526 L2_BLOCK); 4527 if ((m->oflags & VPO_UNMANAGED) == 0) { 4528 new_l2 |= ATTR_SW_MANAGED; 4529 new_l2 &= ~ATTR_AF; 4530 } 4531 if ((prot & VM_PROT_EXECUTE) == 0 || 4532 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4533 new_l2 |= ATTR_S1_XN; 4534 if (!ADDR_IS_KERNEL(va)) 4535 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4536 else 4537 new_l2 |= ATTR_S1_UXN; 4538 if (pmap != kernel_pmap) 4539 new_l2 |= ATTR_S1_nG; 4540 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 4541 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp)); 4542 } 4543 4544 /* 4545 * Returns true if every page table entry in the specified page table is 4546 * zero. 4547 */ 4548 static bool 4549 pmap_every_pte_zero(vm_paddr_t pa) 4550 { 4551 pt_entry_t *pt_end, *pte; 4552 4553 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 4554 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 4555 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 4556 if (*pte != 0) 4557 return (false); 4558 } 4559 return (true); 4560 } 4561 4562 /* 4563 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 4564 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 4565 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 4566 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 4567 * within the 2MB virtual address range starting at the specified virtual 4568 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 4569 * 2MB page mapping already exists at the specified virtual address. 
Returns 4570 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 4571 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 4572 * and a PV entry allocation failed. 4573 */ 4574 static int 4575 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 4576 vm_page_t m, struct rwlock **lockp) 4577 { 4578 struct spglist free; 4579 pd_entry_t *l2, old_l2; 4580 vm_page_t l2pg, mt; 4581 4582 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4583 KASSERT(ADDR_IS_CANONICAL(va), 4584 ("%s: Address not in canonical form: %lx", __func__, va)); 4585 4586 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 4587 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 4588 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 4589 va, pmap); 4590 return (KERN_RESOURCE_SHORTAGE); 4591 } 4592 4593 /* 4594 * If there are existing mappings, either abort or remove them. 4595 */ 4596 if ((old_l2 = pmap_load(l2)) != 0) { 4597 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 4598 ("pmap_enter_l2: l2pg's ref count is too low")); 4599 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4600 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { 4601 if (l2pg != NULL) 4602 l2pg->ref_count--; 4603 CTR2(KTR_PMAP, 4604 "pmap_enter_l2: no space for va %#lx" 4605 " in pmap %p", va, pmap); 4606 return (KERN_NO_SPACE); 4607 } else if (!ADDR_IS_KERNEL(va) || 4608 !pmap_every_pte_zero(old_l2 & ~ATTR_MASK)) { 4609 if (l2pg != NULL) 4610 l2pg->ref_count--; 4611 CTR2(KTR_PMAP, 4612 "pmap_enter_l2: failure for va %#lx" 4613 " in pmap %p", va, pmap); 4614 return (KERN_FAILURE); 4615 } 4616 } 4617 SLIST_INIT(&free); 4618 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 4619 (void)pmap_remove_l2(pmap, l2, va, 4620 pmap_load(pmap_l1(pmap, va)), &free, lockp); 4621 else 4622 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 4623 &free, lockp); 4624 if (!ADDR_IS_KERNEL(va)) { 4625 vm_page_free_pages_toq(&free, true); 4626 KASSERT(pmap_load(l2) == 0, 4627 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 4628 } else { 4629 KASSERT(SLIST_EMPTY(&free), 4630 ("pmap_enter_l2: freed kernel page table page")); 4631 4632 /* 4633 * Both pmap_remove_l2() and pmap_remove_l3_range() 4634 * will leave the kernel page table page zero filled. 4635 * Nonetheless, the TLB could have an intermediate 4636 * entry for the kernel page table page, so request 4637 * an invalidation at all levels after clearing 4638 * the L2_TABLE entry. 4639 */ 4640 mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 4641 if (pmap_insert_pt_page(pmap, mt, false)) 4642 panic("pmap_enter_l2: trie insert failed"); 4643 pmap_clear(l2); 4644 pmap_s1_invalidate_page(pmap, va, false); 4645 } 4646 } 4647 4648 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 4649 /* 4650 * Abort this mapping if its PV entry could not be created. 4651 */ 4652 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 4653 if (l2pg != NULL) 4654 pmap_abort_ptp(pmap, va, l2pg); 4655 CTR2(KTR_PMAP, 4656 "pmap_enter_l2: failure for va %#lx in pmap %p", 4657 va, pmap); 4658 return (KERN_RESOURCE_SHORTAGE); 4659 } 4660 if ((new_l2 & ATTR_SW_DBM) != 0) 4661 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4662 vm_page_aflag_set(mt, PGA_WRITEABLE); 4663 } 4664 4665 /* 4666 * Increment counters. 4667 */ 4668 if ((new_l2 & ATTR_SW_WIRED) != 0) 4669 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 4670 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 4671 4672 /* 4673 * Conditionally sync the icache. See pmap_enter() for details. 
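 *
 * The sync is performed only when the new 2MB mapping is executable
 * (ATTR_S1_XN clear), uses write-back memory, belongs to a user pmap,
 * and either maps a different physical address than the old entry or
 * the old entry was non-executable.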
4674 */ 4675 if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) != 4676 (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) && 4677 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 4678 cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK), 4679 L2_SIZE); 4680 } 4681 4682 /* 4683 * Map the superpage. 4684 */ 4685 pmap_store(l2, new_l2); 4686 dsb(ishst); 4687 4688 atomic_add_long(&pmap_l2_mappings, 1); 4689 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 4690 va, pmap); 4691 4692 return (KERN_SUCCESS); 4693 } 4694 4695 /* 4696 * Maps a sequence of resident pages belonging to the same object. 4697 * The sequence begins with the given page m_start. This page is 4698 * mapped at the given virtual address start. Each subsequent page is 4699 * mapped at a virtual address that is offset from start by the same 4700 * amount as the page is offset from m_start within the object. The 4701 * last page in the sequence is the page with the largest offset from 4702 * m_start that can be mapped at a virtual address less than the given 4703 * virtual address end. Not every virtual page between start and end 4704 * is mapped; only those for which a resident page exists with the 4705 * corresponding offset from m_start are mapped. 4706 */ 4707 void 4708 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4709 vm_page_t m_start, vm_prot_t prot) 4710 { 4711 struct rwlock *lock; 4712 vm_offset_t va; 4713 vm_page_t m, mpte; 4714 vm_pindex_t diff, psize; 4715 int rv; 4716 4717 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4718 4719 psize = atop(end - start); 4720 mpte = NULL; 4721 m = m_start; 4722 lock = NULL; 4723 PMAP_LOCK(pmap); 4724 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4725 va = start + ptoa(diff); 4726 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 4727 m->psind == 1 && pmap_ps_enabled(pmap) && 4728 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 4729 KERN_SUCCESS || rv == KERN_NO_SPACE)) 4730 m = &m[L2_SIZE / PAGE_SIZE - 1]; 4731 else 4732 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 4733 &lock); 4734 m = TAILQ_NEXT(m, listq); 4735 } 4736 if (lock != NULL) 4737 rw_wunlock(lock); 4738 PMAP_UNLOCK(pmap); 4739 } 4740 4741 /* 4742 * this code makes some *MAJOR* assumptions: 4743 * 1. Current pmap & pmap exists. 4744 * 2. Not wired. 4745 * 3. Read access. 4746 * 4. No page table pages. 4747 * but is *MUCH* faster than pmap_enter... 4748 */ 4749 4750 void 4751 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4752 { 4753 struct rwlock *lock; 4754 4755 lock = NULL; 4756 PMAP_LOCK(pmap); 4757 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4758 if (lock != NULL) 4759 rw_wunlock(lock); 4760 PMAP_UNLOCK(pmap); 4761 } 4762 4763 static vm_page_t 4764 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4765 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4766 { 4767 pd_entry_t *pde; 4768 pt_entry_t *l1, *l2, *l3, l3_val; 4769 vm_paddr_t pa; 4770 int lvl; 4771 4772 KASSERT(!VA_IS_CLEANMAP(va) || 4773 (m->oflags & VPO_UNMANAGED) != 0, 4774 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4775 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4776 PMAP_ASSERT_STAGE1(pmap); 4777 KASSERT(ADDR_IS_CANONICAL(va), 4778 ("%s: Address not in canonical form: %lx", __func__, va)); 4779 4780 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 4781 /* 4782 * In the case that a page table page is not 4783 * resident, we are creating it here. 
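 *
 * For user addresses the L3 page table page is looked up, or allocated
 * without sleeping and without the PV list lock; if the allocation
 * fails, or a block mapping already covers the address, the function
 * gives up and returns NULL rather than retrying.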
4784 */ 4785 if (!ADDR_IS_KERNEL(va)) { 4786 vm_pindex_t l2pindex; 4787 4788 /* 4789 * Calculate pagetable page index 4790 */ 4791 l2pindex = pmap_l2_pindex(va); 4792 if (mpte && (mpte->pindex == l2pindex)) { 4793 mpte->ref_count++; 4794 } else { 4795 /* 4796 * If the page table page is mapped, we just increment 4797 * the hold count, and activate it. Otherwise, we 4798 * attempt to allocate a page table page, passing NULL 4799 * instead of the PV list lock pointer because we don't 4800 * intend to sleep. If this attempt fails, we don't 4801 * retry. Instead, we give up. 4802 */ 4803 l1 = pmap_l1(pmap, va); 4804 if (l1 != NULL && pmap_load(l1) != 0) { 4805 if ((pmap_load(l1) & ATTR_DESCR_MASK) == 4806 L1_BLOCK) 4807 return (NULL); 4808 l2 = pmap_l1_to_l2(l1, va); 4809 if (pmap_load(l2) != 0) { 4810 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 4811 L2_BLOCK) 4812 return (NULL); 4813 mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & 4814 ~ATTR_MASK); 4815 mpte->ref_count++; 4816 } else { 4817 mpte = _pmap_alloc_l3(pmap, l2pindex, 4818 NULL); 4819 if (mpte == NULL) 4820 return (mpte); 4821 } 4822 } else { 4823 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 4824 if (mpte == NULL) 4825 return (mpte); 4826 } 4827 } 4828 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4829 l3 = &l3[pmap_l3_index(va)]; 4830 } else { 4831 mpte = NULL; 4832 pde = pmap_pde(kernel_pmap, va, &lvl); 4833 KASSERT(pde != NULL, 4834 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 4835 va)); 4836 KASSERT(lvl == 2, 4837 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 4838 l3 = pmap_l2_to_l3(pde, va); 4839 } 4840 4841 /* 4842 * Abort if a mapping already exists. 4843 */ 4844 if (pmap_load(l3) != 0) { 4845 if (mpte != NULL) 4846 mpte->ref_count--; 4847 return (NULL); 4848 } 4849 4850 /* 4851 * Enter on the PV list if part of our managed memory. 4852 */ 4853 if ((m->oflags & VPO_UNMANAGED) == 0 && 4854 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4855 if (mpte != NULL) 4856 pmap_abort_ptp(pmap, va, mpte); 4857 return (NULL); 4858 } 4859 4860 /* 4861 * Increment counters 4862 */ 4863 pmap_resident_count_inc(pmap, 1); 4864 4865 pa = VM_PAGE_TO_PHYS(m); 4866 l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 4867 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 4868 if ((prot & VM_PROT_EXECUTE) == 0 || 4869 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4870 l3_val |= ATTR_S1_XN; 4871 if (!ADDR_IS_KERNEL(va)) 4872 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4873 else 4874 l3_val |= ATTR_S1_UXN; 4875 if (pmap != kernel_pmap) 4876 l3_val |= ATTR_S1_nG; 4877 4878 /* 4879 * Now validate mapping with RO protection 4880 */ 4881 if ((m->oflags & VPO_UNMANAGED) == 0) { 4882 l3_val |= ATTR_SW_MANAGED; 4883 l3_val &= ~ATTR_AF; 4884 } 4885 4886 /* Sync icache before the mapping is stored to PTE */ 4887 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4888 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 4889 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4890 4891 pmap_store(l3, l3_val); 4892 dsb(ishst); 4893 4894 return (mpte); 4895 } 4896 4897 /* 4898 * This code maps large physical mmap regions into the 4899 * processor address space. Note that some shortcuts 4900 * are taken, but the code works. 
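 *
 * In this pmap the function is effectively a stub: beyond asserting
 * that the object is a device or SG object, it pre-creates no mappings.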
4901 */ 4902 void 4903 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4904 vm_pindex_t pindex, vm_size_t size) 4905 { 4906 4907 VM_OBJECT_ASSERT_WLOCKED(object); 4908 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4909 ("pmap_object_init_pt: non-device object")); 4910 } 4911 4912 /* 4913 * Clear the wired attribute from the mappings for the specified range of 4914 * addresses in the given pmap. Every valid mapping within that range 4915 * must have the wired attribute set. In contrast, invalid mappings 4916 * cannot have the wired attribute set, so they are ignored. 4917 * 4918 * The wired attribute of the page table entry is not a hardware feature, 4919 * so there is no need to invalidate any TLB entries. 4920 */ 4921 void 4922 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4923 { 4924 vm_offset_t va_next; 4925 pd_entry_t *l0, *l1, *l2; 4926 pt_entry_t *l3; 4927 4928 PMAP_LOCK(pmap); 4929 for (; sva < eva; sva = va_next) { 4930 l0 = pmap_l0(pmap, sva); 4931 if (pmap_load(l0) == 0) { 4932 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4933 if (va_next < sva) 4934 va_next = eva; 4935 continue; 4936 } 4937 4938 l1 = pmap_l0_to_l1(l0, sva); 4939 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4940 if (va_next < sva) 4941 va_next = eva; 4942 if (pmap_load(l1) == 0) 4943 continue; 4944 4945 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4946 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4947 KASSERT(va_next <= eva, 4948 ("partial update of non-transparent 1G page " 4949 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4950 pmap_load(l1), sva, eva, va_next)); 4951 MPASS(pmap != kernel_pmap); 4952 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 4953 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 4954 pmap_clear_bits(l1, ATTR_SW_WIRED); 4955 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 4956 continue; 4957 } 4958 4959 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4960 if (va_next < sva) 4961 va_next = eva; 4962 4963 l2 = pmap_l1_to_l2(l1, sva); 4964 if (pmap_load(l2) == 0) 4965 continue; 4966 4967 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4968 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 4969 panic("pmap_unwire: l2 %#jx is missing " 4970 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 4971 4972 /* 4973 * Are we unwiring the entire large page? If not, 4974 * demote the mapping and fall through. 4975 */ 4976 if (sva + L2_SIZE == va_next && eva >= va_next) { 4977 pmap_clear_bits(l2, ATTR_SW_WIRED); 4978 pmap->pm_stats.wired_count -= L2_SIZE / 4979 PAGE_SIZE; 4980 continue; 4981 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 4982 panic("pmap_unwire: demotion failed"); 4983 } 4984 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4985 ("pmap_unwire: Invalid l2 entry after demotion")); 4986 4987 if (va_next > eva) 4988 va_next = eva; 4989 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 4990 sva += L3_SIZE) { 4991 if (pmap_load(l3) == 0) 4992 continue; 4993 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 4994 panic("pmap_unwire: l3 %#jx is missing " 4995 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 4996 4997 /* 4998 * ATTR_SW_WIRED must be cleared atomically. Although 4999 * the pmap lock synchronizes access to ATTR_SW_WIRED, 5000 * the System MMU may write to the entry concurrently. 5001 */ 5002 pmap_clear_bits(l3, ATTR_SW_WIRED); 5003 pmap->pm_stats.wired_count--; 5004 } 5005 } 5006 PMAP_UNLOCK(pmap); 5007 } 5008 5009 /* 5010 * Copy the range specified by src_addr/len 5011 * from the source map to the range dst_addr/len 5012 * in the destination map. 
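 *
 * Both pmaps are locked in pointer order, so two concurrent copies with
 * the source and destination roles swapped cannot deadlock.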
5013 * 5014 * This routine is only advisory and need not do anything. 5015 * 5016 * Because the executable mappings created by this routine are copied, 5017 * it should not have to flush the instruction cache. 5018 */ 5019 void 5020 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5021 vm_offset_t src_addr) 5022 { 5023 struct rwlock *lock; 5024 pd_entry_t *l0, *l1, *l2, srcptepaddr; 5025 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 5026 vm_offset_t addr, end_addr, va_next; 5027 vm_page_t dst_m, dstmpte, srcmpte; 5028 5029 PMAP_ASSERT_STAGE1(dst_pmap); 5030 PMAP_ASSERT_STAGE1(src_pmap); 5031 5032 if (dst_addr != src_addr) 5033 return; 5034 end_addr = src_addr + len; 5035 lock = NULL; 5036 if (dst_pmap < src_pmap) { 5037 PMAP_LOCK(dst_pmap); 5038 PMAP_LOCK(src_pmap); 5039 } else { 5040 PMAP_LOCK(src_pmap); 5041 PMAP_LOCK(dst_pmap); 5042 } 5043 for (addr = src_addr; addr < end_addr; addr = va_next) { 5044 l0 = pmap_l0(src_pmap, addr); 5045 if (pmap_load(l0) == 0) { 5046 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 5047 if (va_next < addr) 5048 va_next = end_addr; 5049 continue; 5050 } 5051 5052 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 5053 if (va_next < addr) 5054 va_next = end_addr; 5055 l1 = pmap_l0_to_l1(l0, addr); 5056 if (pmap_load(l1) == 0) 5057 continue; 5058 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 5059 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5060 KASSERT(va_next <= end_addr, 5061 ("partial update of non-transparent 1G page " 5062 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 5063 pmap_load(l1), addr, end_addr, va_next)); 5064 srcptepaddr = pmap_load(l1); 5065 l1 = pmap_l1(dst_pmap, addr); 5066 if (l1 == NULL) { 5067 if (_pmap_alloc_l3(dst_pmap, 5068 pmap_l0_pindex(addr), NULL) == NULL) 5069 break; 5070 l1 = pmap_l1(dst_pmap, addr); 5071 } else { 5072 l0 = pmap_l0(dst_pmap, addr); 5073 dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) & 5074 ~ATTR_MASK); 5075 dst_m->ref_count++; 5076 } 5077 KASSERT(pmap_load(l1) == 0, 5078 ("1G mapping present in dst pmap " 5079 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 5080 pmap_load(l1), addr, end_addr, va_next)); 5081 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 5082 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 5083 continue; 5084 } 5085 5086 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 5087 if (va_next < addr) 5088 va_next = end_addr; 5089 l2 = pmap_l1_to_l2(l1, addr); 5090 srcptepaddr = pmap_load(l2); 5091 if (srcptepaddr == 0) 5092 continue; 5093 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 5094 /* 5095 * We can only virtual copy whole superpages. 5096 */ 5097 if ((addr & L2_OFFSET) != 0 || 5098 addr + L2_SIZE > end_addr) 5099 continue; 5100 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 5101 if (l2 == NULL) 5102 break; 5103 if (pmap_load(l2) == 0 && 5104 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 5105 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 5106 PMAP_ENTER_NORECLAIM, &lock))) { 5107 /* 5108 * We leave the dirty bit unchanged because 5109 * managed read/write superpage mappings are 5110 * required to be dirty. However, managed 5111 * superpage mappings are not required to 5112 * have their accessed bit set, so we clear 5113 * it because we don't know if this mapping 5114 * will be used. 
5115 */ 5116 srcptepaddr &= ~ATTR_SW_WIRED; 5117 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 5118 srcptepaddr &= ~ATTR_AF; 5119 pmap_store(l2, srcptepaddr); 5120 pmap_resident_count_inc(dst_pmap, L2_SIZE / 5121 PAGE_SIZE); 5122 atomic_add_long(&pmap_l2_mappings, 1); 5123 } else 5124 pmap_abort_ptp(dst_pmap, addr, dst_m); 5125 continue; 5126 } 5127 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 5128 ("pmap_copy: invalid L2 entry")); 5129 srcptepaddr &= ~ATTR_MASK; 5130 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 5131 KASSERT(srcmpte->ref_count > 0, 5132 ("pmap_copy: source page table page is unused")); 5133 if (va_next > end_addr) 5134 va_next = end_addr; 5135 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 5136 src_pte = &src_pte[pmap_l3_index(addr)]; 5137 dstmpte = NULL; 5138 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 5139 ptetemp = pmap_load(src_pte); 5140 5141 /* 5142 * We only virtual copy managed pages. 5143 */ 5144 if ((ptetemp & ATTR_SW_MANAGED) == 0) 5145 continue; 5146 5147 if (dstmpte != NULL) { 5148 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 5149 ("dstmpte pindex/addr mismatch")); 5150 dstmpte->ref_count++; 5151 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 5152 NULL)) == NULL) 5153 goto out; 5154 dst_pte = (pt_entry_t *) 5155 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5156 dst_pte = &dst_pte[pmap_l3_index(addr)]; 5157 if (pmap_load(dst_pte) == 0 && 5158 pmap_try_insert_pv_entry(dst_pmap, addr, 5159 PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { 5160 /* 5161 * Clear the wired, modified, and accessed 5162 * (referenced) bits during the copy. 5163 */ 5164 mask = ATTR_AF | ATTR_SW_WIRED; 5165 nbits = 0; 5166 if ((ptetemp & ATTR_SW_DBM) != 0) 5167 nbits |= ATTR_S1_AP_RW_BIT; 5168 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 5169 pmap_resident_count_inc(dst_pmap, 1); 5170 } else { 5171 pmap_abort_ptp(dst_pmap, addr, dstmpte); 5172 goto out; 5173 } 5174 /* Have we copied all of the valid mappings? */ 5175 if (dstmpte->ref_count >= srcmpte->ref_count) 5176 break; 5177 } 5178 } 5179 out: 5180 /* 5181 * XXX This barrier may not be needed because the destination pmap is 5182 * not active. 5183 */ 5184 dsb(ishst); 5185 5186 if (lock != NULL) 5187 rw_wunlock(lock); 5188 PMAP_UNLOCK(src_pmap); 5189 PMAP_UNLOCK(dst_pmap); 5190 } 5191 5192 /* 5193 * pmap_zero_page zeros the specified hardware page by mapping 5194 * the page into KVM and using bzero to clear its contents. 5195 */ 5196 void 5197 pmap_zero_page(vm_page_t m) 5198 { 5199 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5200 5201 pagezero((void *)va); 5202 } 5203 5204 /* 5205 * pmap_zero_page_area zeros the specified hardware page by mapping 5206 * the page into KVM and using bzero to clear its contents. 5207 * 5208 * off and size may not cover an area beyond a single hardware page. 5209 */ 5210 void 5211 pmap_zero_page_area(vm_page_t m, int off, int size) 5212 { 5213 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5214 5215 if (off == 0 && size == PAGE_SIZE) 5216 pagezero((void *)va); 5217 else 5218 bzero((char *)va + off, size); 5219 } 5220 5221 /* 5222 * pmap_copy_page copies the specified (machine independent) 5223 * page by mapping the page into virtual memory and using 5224 * bcopy to copy the page, one machine dependent page at a 5225 * time. 
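 *
 * Here the direct map is used instead, so no temporary mapping is
 * created; the copy is performed by pagecopy() on the DMAP addresses of
 * the source and destination pages.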
5226 */ 5227 void 5228 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 5229 { 5230 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 5231 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 5232 5233 pagecopy((void *)src, (void *)dst); 5234 } 5235 5236 int unmapped_buf_allowed = 1; 5237 5238 void 5239 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5240 vm_offset_t b_offset, int xfersize) 5241 { 5242 void *a_cp, *b_cp; 5243 vm_page_t m_a, m_b; 5244 vm_paddr_t p_a, p_b; 5245 vm_offset_t a_pg_offset, b_pg_offset; 5246 int cnt; 5247 5248 while (xfersize > 0) { 5249 a_pg_offset = a_offset & PAGE_MASK; 5250 m_a = ma[a_offset >> PAGE_SHIFT]; 5251 p_a = m_a->phys_addr; 5252 b_pg_offset = b_offset & PAGE_MASK; 5253 m_b = mb[b_offset >> PAGE_SHIFT]; 5254 p_b = m_b->phys_addr; 5255 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5256 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5257 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 5258 panic("!DMAP a %lx", p_a); 5259 } else { 5260 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 5261 } 5262 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 5263 panic("!DMAP b %lx", p_b); 5264 } else { 5265 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 5266 } 5267 bcopy(a_cp, b_cp, cnt); 5268 a_offset += cnt; 5269 b_offset += cnt; 5270 xfersize -= cnt; 5271 } 5272 } 5273 5274 vm_offset_t 5275 pmap_quick_enter_page(vm_page_t m) 5276 { 5277 5278 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 5279 } 5280 5281 void 5282 pmap_quick_remove_page(vm_offset_t addr) 5283 { 5284 } 5285 5286 /* 5287 * Returns true if the pmap's pv is one of the first 5288 * 16 pvs linked to from this page. This count may 5289 * be changed upwards or downwards in the future; it 5290 * is only necessary that true be returned for a small 5291 * subset of pmaps for proper page aging. 5292 */ 5293 boolean_t 5294 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5295 { 5296 struct md_page *pvh; 5297 struct rwlock *lock; 5298 pv_entry_t pv; 5299 int loops = 0; 5300 boolean_t rv; 5301 5302 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5303 ("pmap_page_exists_quick: page %p is not managed", m)); 5304 rv = FALSE; 5305 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5306 rw_rlock(lock); 5307 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5308 if (PV_PMAP(pv) == pmap) { 5309 rv = TRUE; 5310 break; 5311 } 5312 loops++; 5313 if (loops >= 16) 5314 break; 5315 } 5316 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5317 pvh = page_to_pvh(m); 5318 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5319 if (PV_PMAP(pv) == pmap) { 5320 rv = TRUE; 5321 break; 5322 } 5323 loops++; 5324 if (loops >= 16) 5325 break; 5326 } 5327 } 5328 rw_runlock(lock); 5329 return (rv); 5330 } 5331 5332 /* 5333 * pmap_page_wired_mappings: 5334 * 5335 * Return the number of managed mappings to the given physical page 5336 * that are wired. 
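 *
 * Like the other pv list walkers in this file, the function uses a
 * try-lock protocol: if a pmap lock cannot be taken while the pv list
 * lock is held, the pv list lock is dropped, the pmap lock is acquired,
 * and the generation count recorded beforehand is used to detect whether
 * the list changed in the meantime.  In outline:
 *
 *	if (!PMAP_TRYLOCK(pmap)) {
 *		md_gen = m->md.pv_gen;
 *		rw_runlock(lock);
 *		PMAP_LOCK(pmap);
 *		rw_rlock(lock);
 *		if (md_gen != m->md.pv_gen) {
 *			PMAP_UNLOCK(pmap);
 *			goto restart;
 *		}
 *	}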
5337 */ 5338 int 5339 pmap_page_wired_mappings(vm_page_t m) 5340 { 5341 struct rwlock *lock; 5342 struct md_page *pvh; 5343 pmap_t pmap; 5344 pt_entry_t *pte; 5345 pv_entry_t pv; 5346 int count, md_gen, pvh_gen; 5347 5348 if ((m->oflags & VPO_UNMANAGED) != 0) 5349 return (0); 5350 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5351 rw_rlock(lock); 5352 restart: 5353 count = 0; 5354 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5355 pmap = PV_PMAP(pv); 5356 if (!PMAP_TRYLOCK(pmap)) { 5357 md_gen = m->md.pv_gen; 5358 rw_runlock(lock); 5359 PMAP_LOCK(pmap); 5360 rw_rlock(lock); 5361 if (md_gen != m->md.pv_gen) { 5362 PMAP_UNLOCK(pmap); 5363 goto restart; 5364 } 5365 } 5366 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5367 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 5368 count++; 5369 PMAP_UNLOCK(pmap); 5370 } 5371 if ((m->flags & PG_FICTITIOUS) == 0) { 5372 pvh = page_to_pvh(m); 5373 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5374 pmap = PV_PMAP(pv); 5375 if (!PMAP_TRYLOCK(pmap)) { 5376 md_gen = m->md.pv_gen; 5377 pvh_gen = pvh->pv_gen; 5378 rw_runlock(lock); 5379 PMAP_LOCK(pmap); 5380 rw_rlock(lock); 5381 if (md_gen != m->md.pv_gen || 5382 pvh_gen != pvh->pv_gen) { 5383 PMAP_UNLOCK(pmap); 5384 goto restart; 5385 } 5386 } 5387 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5388 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 5389 count++; 5390 PMAP_UNLOCK(pmap); 5391 } 5392 } 5393 rw_runlock(lock); 5394 return (count); 5395 } 5396 5397 /* 5398 * Returns true if the given page is mapped individually or as part of 5399 * a 2mpage. Otherwise, returns false. 5400 */ 5401 bool 5402 pmap_page_is_mapped(vm_page_t m) 5403 { 5404 struct rwlock *lock; 5405 bool rv; 5406 5407 if ((m->oflags & VPO_UNMANAGED) != 0) 5408 return (false); 5409 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5410 rw_rlock(lock); 5411 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5412 ((m->flags & PG_FICTITIOUS) == 0 && 5413 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 5414 rw_runlock(lock); 5415 return (rv); 5416 } 5417 5418 /* 5419 * Destroy all managed, non-wired mappings in the given user-space 5420 * pmap. This pmap cannot be active on any processor besides the 5421 * caller. 5422 * 5423 * This function cannot be applied to the kernel pmap. Moreover, it 5424 * is not intended for general use. It is only to be used during 5425 * process termination. Consequently, it can be implemented in ways 5426 * that make it faster than pmap_remove(). First, it can more quickly 5427 * destroy mappings by iterating over the pmap's collection of PV 5428 * entries, rather than searching the page table. Second, it doesn't 5429 * have to test and clear the page table entries atomically, because 5430 * no processor is currently accessing the user address space. In 5431 * particular, a page table entry's dirty bit won't change state once 5432 * this function starts. 
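 *
 * Mappings are found by walking the pmap's pv chunk list and scanning
 * each chunk's pc_map allocation bitmaps with ffsl(), rather than by
 * walking the page tables; wired mappings are skipped and keep their
 * chunk from being freed.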
5433 */ 5434 void 5435 pmap_remove_pages(pmap_t pmap) 5436 { 5437 pd_entry_t *pde; 5438 pt_entry_t *pte, tpte; 5439 struct spglist free; 5440 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 5441 vm_page_t m, ml3, mt; 5442 pv_entry_t pv; 5443 struct md_page *pvh; 5444 struct pv_chunk *pc, *npc; 5445 struct rwlock *lock; 5446 int64_t bit; 5447 uint64_t inuse, bitmask; 5448 int allfree, field, i, idx, lvl; 5449 int freed __pvused; 5450 vm_paddr_t pa; 5451 5452 lock = NULL; 5453 5454 for (i = 0; i < PMAP_MEMDOM; i++) 5455 TAILQ_INIT(&free_chunks[i]); 5456 SLIST_INIT(&free); 5457 PMAP_LOCK(pmap); 5458 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5459 allfree = 1; 5460 freed = 0; 5461 for (field = 0; field < _NPCM; field++) { 5462 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5463 while (inuse != 0) { 5464 bit = ffsl(inuse) - 1; 5465 bitmask = 1UL << bit; 5466 idx = field * 64 + bit; 5467 pv = &pc->pc_pventry[idx]; 5468 inuse &= ~bitmask; 5469 5470 pde = pmap_pde(pmap, pv->pv_va, &lvl); 5471 KASSERT(pde != NULL, 5472 ("Attempting to remove an unmapped page")); 5473 5474 switch(lvl) { 5475 case 1: 5476 pte = pmap_l1_to_l2(pde, pv->pv_va); 5477 tpte = pmap_load(pte); 5478 KASSERT((tpte & ATTR_DESCR_MASK) == 5479 L2_BLOCK, 5480 ("Attempting to remove an invalid " 5481 "block: %lx", tpte)); 5482 break; 5483 case 2: 5484 pte = pmap_l2_to_l3(pde, pv->pv_va); 5485 tpte = pmap_load(pte); 5486 KASSERT((tpte & ATTR_DESCR_MASK) == 5487 L3_PAGE, 5488 ("Attempting to remove an invalid " 5489 "page: %lx", tpte)); 5490 break; 5491 default: 5492 panic( 5493 "Invalid page directory level: %d", 5494 lvl); 5495 } 5496 5497 /* 5498 * We cannot remove wired pages from a process' mapping at this time 5499 */ 5500 if (tpte & ATTR_SW_WIRED) { 5501 allfree = 0; 5502 continue; 5503 } 5504 5505 /* Mark free */ 5506 pc->pc_map[field] |= bitmask; 5507 5508 /* 5509 * Because this pmap is not active on other 5510 * processors, the dirty bit cannot have 5511 * changed state since we last loaded pte. 5512 */ 5513 pmap_clear(pte); 5514 5515 pa = tpte & ~ATTR_MASK; 5516 5517 m = PHYS_TO_VM_PAGE(pa); 5518 KASSERT(m->phys_addr == pa, 5519 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5520 m, (uintmax_t)m->phys_addr, 5521 (uintmax_t)tpte)); 5522 5523 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5524 m < &vm_page_array[vm_page_array_size], 5525 ("pmap_remove_pages: bad pte %#jx", 5526 (uintmax_t)tpte)); 5527 5528 /* 5529 * Update the vm_page_t clean/reference bits. 
5530 */ 5531 if (pmap_pte_dirty(pmap, tpte)) { 5532 switch (lvl) { 5533 case 1: 5534 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5535 vm_page_dirty(mt); 5536 break; 5537 case 2: 5538 vm_page_dirty(m); 5539 break; 5540 } 5541 } 5542 5543 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5544 5545 switch (lvl) { 5546 case 1: 5547 pmap_resident_count_dec(pmap, 5548 L2_SIZE / PAGE_SIZE); 5549 pvh = page_to_pvh(m); 5550 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 5551 pvh->pv_gen++; 5552 if (TAILQ_EMPTY(&pvh->pv_list)) { 5553 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5554 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5555 TAILQ_EMPTY(&mt->md.pv_list)) 5556 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5557 } 5558 ml3 = pmap_remove_pt_page(pmap, 5559 pv->pv_va); 5560 if (ml3 != NULL) { 5561 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 5562 ("pmap_remove_pages: l3 page not promoted")); 5563 pmap_resident_count_dec(pmap,1); 5564 KASSERT(ml3->ref_count == NL3PG, 5565 ("pmap_remove_pages: l3 page ref count error")); 5566 ml3->ref_count = 0; 5567 pmap_add_delayed_free_list(ml3, 5568 &free, FALSE); 5569 } 5570 break; 5571 case 2: 5572 pmap_resident_count_dec(pmap, 1); 5573 TAILQ_REMOVE(&m->md.pv_list, pv, 5574 pv_next); 5575 m->md.pv_gen++; 5576 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5577 TAILQ_EMPTY(&m->md.pv_list) && 5578 (m->flags & PG_FICTITIOUS) == 0) { 5579 pvh = page_to_pvh(m); 5580 if (TAILQ_EMPTY(&pvh->pv_list)) 5581 vm_page_aflag_clear(m, 5582 PGA_WRITEABLE); 5583 } 5584 break; 5585 } 5586 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 5587 &free); 5588 freed++; 5589 } 5590 } 5591 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5592 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5593 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5594 if (allfree) { 5595 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5596 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, 5597 pc_list); 5598 } 5599 } 5600 if (lock != NULL) 5601 rw_wunlock(lock); 5602 pmap_s1_invalidate_all(pmap); 5603 free_pv_chunk_batch(free_chunks); 5604 PMAP_UNLOCK(pmap); 5605 vm_page_free_pages_toq(&free, true); 5606 } 5607 5608 /* 5609 * This is used to check if a page has been accessed or modified. 
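 *
 * A mapping counts as modified when its stage 1 AP bits allow write
 * access (clean managed mappings are kept read-only until the first
 * write), and as accessed when ATTR_AF is set on a valid L3 page, or on
 * a valid L2 block for superpage mappings.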
5610 */ 5611 static boolean_t 5612 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5613 { 5614 struct rwlock *lock; 5615 pv_entry_t pv; 5616 struct md_page *pvh; 5617 pt_entry_t *pte, mask, value; 5618 pmap_t pmap; 5619 int md_gen, pvh_gen; 5620 boolean_t rv; 5621 5622 rv = FALSE; 5623 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5624 rw_rlock(lock); 5625 restart: 5626 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5627 pmap = PV_PMAP(pv); 5628 PMAP_ASSERT_STAGE1(pmap); 5629 if (!PMAP_TRYLOCK(pmap)) { 5630 md_gen = m->md.pv_gen; 5631 rw_runlock(lock); 5632 PMAP_LOCK(pmap); 5633 rw_rlock(lock); 5634 if (md_gen != m->md.pv_gen) { 5635 PMAP_UNLOCK(pmap); 5636 goto restart; 5637 } 5638 } 5639 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5640 mask = 0; 5641 value = 0; 5642 if (modified) { 5643 mask |= ATTR_S1_AP_RW_BIT; 5644 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5645 } 5646 if (accessed) { 5647 mask |= ATTR_AF | ATTR_DESCR_MASK; 5648 value |= ATTR_AF | L3_PAGE; 5649 } 5650 rv = (pmap_load(pte) & mask) == value; 5651 PMAP_UNLOCK(pmap); 5652 if (rv) 5653 goto out; 5654 } 5655 if ((m->flags & PG_FICTITIOUS) == 0) { 5656 pvh = page_to_pvh(m); 5657 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5658 pmap = PV_PMAP(pv); 5659 PMAP_ASSERT_STAGE1(pmap); 5660 if (!PMAP_TRYLOCK(pmap)) { 5661 md_gen = m->md.pv_gen; 5662 pvh_gen = pvh->pv_gen; 5663 rw_runlock(lock); 5664 PMAP_LOCK(pmap); 5665 rw_rlock(lock); 5666 if (md_gen != m->md.pv_gen || 5667 pvh_gen != pvh->pv_gen) { 5668 PMAP_UNLOCK(pmap); 5669 goto restart; 5670 } 5671 } 5672 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5673 mask = 0; 5674 value = 0; 5675 if (modified) { 5676 mask |= ATTR_S1_AP_RW_BIT; 5677 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5678 } 5679 if (accessed) { 5680 mask |= ATTR_AF | ATTR_DESCR_MASK; 5681 value |= ATTR_AF | L2_BLOCK; 5682 } 5683 rv = (pmap_load(pte) & mask) == value; 5684 PMAP_UNLOCK(pmap); 5685 if (rv) 5686 goto out; 5687 } 5688 } 5689 out: 5690 rw_runlock(lock); 5691 return (rv); 5692 } 5693 5694 /* 5695 * pmap_is_modified: 5696 * 5697 * Return whether or not the specified physical page was modified 5698 * in any physical maps. 5699 */ 5700 boolean_t 5701 pmap_is_modified(vm_page_t m) 5702 { 5703 5704 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5705 ("pmap_is_modified: page %p is not managed", m)); 5706 5707 /* 5708 * If the page is not busied then this check is racy. 5709 */ 5710 if (!pmap_page_is_write_mapped(m)) 5711 return (FALSE); 5712 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5713 } 5714 5715 /* 5716 * pmap_is_prefaultable: 5717 * 5718 * Return whether or not the specified virtual address is eligible 5719 * for prefault. 5720 */ 5721 boolean_t 5722 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5723 { 5724 pd_entry_t *pde; 5725 pt_entry_t *pte; 5726 boolean_t rv; 5727 int lvl; 5728 5729 /* 5730 * Return TRUE if and only if the L3 entry for the specified virtual 5731 * address is allocated but invalid. 5732 */ 5733 rv = FALSE; 5734 PMAP_LOCK(pmap); 5735 pde = pmap_pde(pmap, addr, &lvl); 5736 if (pde != NULL && lvl == 2) { 5737 pte = pmap_l2_to_l3(pde, addr); 5738 rv = pmap_load(pte) == 0; 5739 } 5740 PMAP_UNLOCK(pmap); 5741 return (rv); 5742 } 5743 5744 /* 5745 * pmap_is_referenced: 5746 * 5747 * Return whether or not the specified physical page was referenced 5748 * in any physical maps. 
5749 */ 5750 boolean_t 5751 pmap_is_referenced(vm_page_t m) 5752 { 5753 5754 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5755 ("pmap_is_referenced: page %p is not managed", m)); 5756 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5757 } 5758 5759 /* 5760 * Clear the write and modified bits in each of the given page's mappings. 5761 */ 5762 void 5763 pmap_remove_write(vm_page_t m) 5764 { 5765 struct md_page *pvh; 5766 pmap_t pmap; 5767 struct rwlock *lock; 5768 pv_entry_t next_pv, pv; 5769 pt_entry_t oldpte, *pte; 5770 vm_offset_t va; 5771 int md_gen, pvh_gen; 5772 5773 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5774 ("pmap_remove_write: page %p is not managed", m)); 5775 vm_page_assert_busied(m); 5776 5777 if (!pmap_page_is_write_mapped(m)) 5778 return; 5779 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5780 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5781 rw_wlock(lock); 5782 retry: 5783 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5784 pmap = PV_PMAP(pv); 5785 PMAP_ASSERT_STAGE1(pmap); 5786 if (!PMAP_TRYLOCK(pmap)) { 5787 pvh_gen = pvh->pv_gen; 5788 rw_wunlock(lock); 5789 PMAP_LOCK(pmap); 5790 rw_wlock(lock); 5791 if (pvh_gen != pvh->pv_gen) { 5792 PMAP_UNLOCK(pmap); 5793 goto retry; 5794 } 5795 } 5796 va = pv->pv_va; 5797 pte = pmap_pte_exists(pmap, va, 2, __func__); 5798 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 5799 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 5800 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5801 ("inconsistent pv lock %p %p for page %p", 5802 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5803 PMAP_UNLOCK(pmap); 5804 } 5805 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5806 pmap = PV_PMAP(pv); 5807 PMAP_ASSERT_STAGE1(pmap); 5808 if (!PMAP_TRYLOCK(pmap)) { 5809 pvh_gen = pvh->pv_gen; 5810 md_gen = m->md.pv_gen; 5811 rw_wunlock(lock); 5812 PMAP_LOCK(pmap); 5813 rw_wlock(lock); 5814 if (pvh_gen != pvh->pv_gen || 5815 md_gen != m->md.pv_gen) { 5816 PMAP_UNLOCK(pmap); 5817 goto retry; 5818 } 5819 } 5820 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5821 oldpte = pmap_load(pte); 5822 if ((oldpte & ATTR_SW_DBM) != 0) { 5823 while (!atomic_fcmpset_64(pte, &oldpte, 5824 (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) 5825 cpu_spinwait(); 5826 if ((oldpte & ATTR_S1_AP_RW_BIT) == 5827 ATTR_S1_AP(ATTR_S1_AP_RW)) 5828 vm_page_dirty(m); 5829 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 5830 } 5831 PMAP_UNLOCK(pmap); 5832 } 5833 rw_wunlock(lock); 5834 vm_page_aflag_clear(m, PGA_WRITEABLE); 5835 } 5836 5837 /* 5838 * pmap_ts_referenced: 5839 * 5840 * Return a count of reference bits for a page, clearing those bits. 5841 * It is not necessary for every reference bit to be cleared, but it 5842 * is necessary that 0 only be returned when there are truly no 5843 * reference bits set. 5844 * 5845 * As an optimization, update the page's dirty field if a modified bit is 5846 * found while counting reference bits. This opportunistic update can be 5847 * performed at low cost and can eliminate the need for some future calls 5848 * to pmap_is_modified(). However, since this function stops after 5849 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5850 * dirty pages. Those dirty pages will only be detected by a future call 5851 * to pmap_is_modified(). 
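 * PMAP_TS_REFERENCED_MAX bounds the work performed here: once that many
 * reference bits have been counted, whether cleared or merely observed,
 * the scan stops and the count is returned.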
5852 */ 5853 int 5854 pmap_ts_referenced(vm_page_t m) 5855 { 5856 struct md_page *pvh; 5857 pv_entry_t pv, pvf; 5858 pmap_t pmap; 5859 struct rwlock *lock; 5860 pt_entry_t *pte, tpte; 5861 vm_offset_t va; 5862 vm_paddr_t pa; 5863 int cleared, md_gen, not_cleared, pvh_gen; 5864 struct spglist free; 5865 5866 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5867 ("pmap_ts_referenced: page %p is not managed", m)); 5868 SLIST_INIT(&free); 5869 cleared = 0; 5870 pa = VM_PAGE_TO_PHYS(m); 5871 lock = PHYS_TO_PV_LIST_LOCK(pa); 5872 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5873 rw_wlock(lock); 5874 retry: 5875 not_cleared = 0; 5876 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5877 goto small_mappings; 5878 pv = pvf; 5879 do { 5880 if (pvf == NULL) 5881 pvf = pv; 5882 pmap = PV_PMAP(pv); 5883 if (!PMAP_TRYLOCK(pmap)) { 5884 pvh_gen = pvh->pv_gen; 5885 rw_wunlock(lock); 5886 PMAP_LOCK(pmap); 5887 rw_wlock(lock); 5888 if (pvh_gen != pvh->pv_gen) { 5889 PMAP_UNLOCK(pmap); 5890 goto retry; 5891 } 5892 } 5893 va = pv->pv_va; 5894 pte = pmap_pte_exists(pmap, va, 2, __func__); 5895 tpte = pmap_load(pte); 5896 if (pmap_pte_dirty(pmap, tpte)) { 5897 /* 5898 * Although "tpte" is mapping a 2MB page, because 5899 * this function is called at a 4KB page granularity, 5900 * we only update the 4KB page under test. 5901 */ 5902 vm_page_dirty(m); 5903 } 5904 if ((tpte & ATTR_AF) != 0) { 5905 /* 5906 * Since this reference bit is shared by 512 4KB pages, 5907 * it should not be cleared every time it is tested. 5908 * Apply a simple "hash" function on the physical page 5909 * number, the virtual superpage number, and the pmap 5910 * address to select one 4KB page out of the 512 on 5911 * which testing the reference bit will result in 5912 * clearing that reference bit. This function is 5913 * designed to avoid the selection of the same 4KB page 5914 * for every 2MB page mapping. 5915 * 5916 * On demotion, a mapping that hasn't been referenced 5917 * is simply destroyed. To avoid the possibility of a 5918 * subsequent page fault on a demoted wired mapping, 5919 * always leave its reference bit set. Moreover, 5920 * since the superpage is wired, the current state of 5921 * its reference bit won't affect page replacement. 5922 */ 5923 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ 5924 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 5925 (tpte & ATTR_SW_WIRED) == 0) { 5926 pmap_clear_bits(pte, ATTR_AF); 5927 pmap_s1_invalidate_page(pmap, va, true); 5928 cleared++; 5929 } else 5930 not_cleared++; 5931 } 5932 PMAP_UNLOCK(pmap); 5933 /* Rotate the PV list if it has more than one entry. 
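		 * Moving the just-examined entry to the tail means that
		 * successive calls start with a different mapping, spreading
		 * the clearing of reference bits across the pmaps sharing
		 * this 2MB page.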
*/ 5934 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5935 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5936 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5937 pvh->pv_gen++; 5938 } 5939 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5940 goto out; 5941 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5942 small_mappings: 5943 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5944 goto out; 5945 pv = pvf; 5946 do { 5947 if (pvf == NULL) 5948 pvf = pv; 5949 pmap = PV_PMAP(pv); 5950 if (!PMAP_TRYLOCK(pmap)) { 5951 pvh_gen = pvh->pv_gen; 5952 md_gen = m->md.pv_gen; 5953 rw_wunlock(lock); 5954 PMAP_LOCK(pmap); 5955 rw_wlock(lock); 5956 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5957 PMAP_UNLOCK(pmap); 5958 goto retry; 5959 } 5960 } 5961 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5962 tpte = pmap_load(pte); 5963 if (pmap_pte_dirty(pmap, tpte)) 5964 vm_page_dirty(m); 5965 if ((tpte & ATTR_AF) != 0) { 5966 if ((tpte & ATTR_SW_WIRED) == 0) { 5967 pmap_clear_bits(pte, ATTR_AF); 5968 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 5969 cleared++; 5970 } else 5971 not_cleared++; 5972 } 5973 PMAP_UNLOCK(pmap); 5974 /* Rotate the PV list if it has more than one entry. */ 5975 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5976 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5977 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5978 m->md.pv_gen++; 5979 } 5980 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5981 not_cleared < PMAP_TS_REFERENCED_MAX); 5982 out: 5983 rw_wunlock(lock); 5984 vm_page_free_pages_toq(&free, true); 5985 return (cleared + not_cleared); 5986 } 5987 5988 /* 5989 * Apply the given advice to the specified range of addresses within the 5990 * given pmap. Depending on the advice, clear the referenced and/or 5991 * modified flags in each mapping and set the mapped page's dirty field. 5992 */ 5993 void 5994 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5995 { 5996 struct rwlock *lock; 5997 vm_offset_t va, va_next; 5998 vm_page_t m; 5999 pd_entry_t *l0, *l1, *l2, oldl2; 6000 pt_entry_t *l3, oldl3; 6001 6002 PMAP_ASSERT_STAGE1(pmap); 6003 6004 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6005 return; 6006 6007 PMAP_LOCK(pmap); 6008 for (; sva < eva; sva = va_next) { 6009 l0 = pmap_l0(pmap, sva); 6010 if (pmap_load(l0) == 0) { 6011 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 6012 if (va_next < sva) 6013 va_next = eva; 6014 continue; 6015 } 6016 6017 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 6018 if (va_next < sva) 6019 va_next = eva; 6020 l1 = pmap_l0_to_l1(l0, sva); 6021 if (pmap_load(l1) == 0) 6022 continue; 6023 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6024 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6025 continue; 6026 } 6027 6028 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 6029 if (va_next < sva) 6030 va_next = eva; 6031 l2 = pmap_l1_to_l2(l1, sva); 6032 oldl2 = pmap_load(l2); 6033 if (oldl2 == 0) 6034 continue; 6035 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 6036 if ((oldl2 & ATTR_SW_MANAGED) == 0) 6037 continue; 6038 lock = NULL; 6039 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 6040 if (lock != NULL) 6041 rw_wunlock(lock); 6042 6043 /* 6044 * The 2MB page mapping was destroyed. 6045 */ 6046 continue; 6047 } 6048 6049 /* 6050 * Unless the page mappings are wired, remove the 6051 * mapping to a single page so that a subsequent 6052 * access may repromote. Choosing the last page 6053 * within the address range [sva, min(va_next, eva)) 6054 * generally results in more repromotions. 
Since the 6055 * underlying page table page is fully populated, this 6056 * removal never frees a page table page. 6057 */ 6058 if ((oldl2 & ATTR_SW_WIRED) == 0) { 6059 va = eva; 6060 if (va > va_next) 6061 va = va_next; 6062 va -= PAGE_SIZE; 6063 KASSERT(va >= sva, 6064 ("pmap_advise: no address gap")); 6065 l3 = pmap_l2_to_l3(l2, va); 6066 KASSERT(pmap_load(l3) != 0, 6067 ("pmap_advise: invalid PTE")); 6068 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 6069 NULL, &lock); 6070 } 6071 if (lock != NULL) 6072 rw_wunlock(lock); 6073 } 6074 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 6075 ("pmap_advise: invalid L2 entry after demotion")); 6076 if (va_next > eva) 6077 va_next = eva; 6078 va = va_next; 6079 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 6080 sva += L3_SIZE) { 6081 oldl3 = pmap_load(l3); 6082 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 6083 (ATTR_SW_MANAGED | L3_PAGE)) 6084 goto maybe_invlrng; 6085 else if (pmap_pte_dirty(pmap, oldl3)) { 6086 if (advice == MADV_DONTNEED) { 6087 /* 6088 * Future calls to pmap_is_modified() 6089 * can be avoided by making the page 6090 * dirty now. 6091 */ 6092 m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); 6093 vm_page_dirty(m); 6094 } 6095 while (!atomic_fcmpset_long(l3, &oldl3, 6096 (oldl3 & ~ATTR_AF) | 6097 ATTR_S1_AP(ATTR_S1_AP_RO))) 6098 cpu_spinwait(); 6099 } else if ((oldl3 & ATTR_AF) != 0) 6100 pmap_clear_bits(l3, ATTR_AF); 6101 else 6102 goto maybe_invlrng; 6103 if (va == va_next) 6104 va = sva; 6105 continue; 6106 maybe_invlrng: 6107 if (va != va_next) { 6108 pmap_s1_invalidate_range(pmap, va, sva, true); 6109 va = va_next; 6110 } 6111 } 6112 if (va != va_next) 6113 pmap_s1_invalidate_range(pmap, va, sva, true); 6114 } 6115 PMAP_UNLOCK(pmap); 6116 } 6117 6118 /* 6119 * Clear the modify bits on the specified physical page. 6120 */ 6121 void 6122 pmap_clear_modify(vm_page_t m) 6123 { 6124 struct md_page *pvh; 6125 struct rwlock *lock; 6126 pmap_t pmap; 6127 pv_entry_t next_pv, pv; 6128 pd_entry_t *l2, oldl2; 6129 pt_entry_t *l3, oldl3; 6130 vm_offset_t va; 6131 int md_gen, pvh_gen; 6132 6133 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6134 ("pmap_clear_modify: page %p is not managed", m)); 6135 vm_page_assert_busied(m); 6136 6137 if (!pmap_page_is_write_mapped(m)) 6138 return; 6139 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 6140 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6141 rw_wlock(lock); 6142 restart: 6143 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6144 pmap = PV_PMAP(pv); 6145 PMAP_ASSERT_STAGE1(pmap); 6146 if (!PMAP_TRYLOCK(pmap)) { 6147 pvh_gen = pvh->pv_gen; 6148 rw_wunlock(lock); 6149 PMAP_LOCK(pmap); 6150 rw_wlock(lock); 6151 if (pvh_gen != pvh->pv_gen) { 6152 PMAP_UNLOCK(pmap); 6153 goto restart; 6154 } 6155 } 6156 va = pv->pv_va; 6157 l2 = pmap_l2(pmap, va); 6158 oldl2 = pmap_load(l2); 6159 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 6160 if ((oldl2 & ATTR_SW_DBM) != 0 && 6161 pmap_demote_l2_locked(pmap, l2, va, &lock) && 6162 (oldl2 & ATTR_SW_WIRED) == 0) { 6163 /* 6164 * Write protect the mapping to a single page so that 6165 * a subsequent write access may repromote. 
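			 * The page chosen is the one within the former 2MB
			 * mapping that corresponds to "m"; its virtual address
			 * is computed below from the offset of "m"'s physical
			 * address within the old L2 block.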
6166 */ 6167 va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); 6168 l3 = pmap_l2_to_l3(l2, va); 6169 oldl3 = pmap_load(l3); 6170 while (!atomic_fcmpset_long(l3, &oldl3, 6171 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 6172 cpu_spinwait(); 6173 vm_page_dirty(m); 6174 pmap_s1_invalidate_page(pmap, va, true); 6175 } 6176 PMAP_UNLOCK(pmap); 6177 } 6178 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6179 pmap = PV_PMAP(pv); 6180 PMAP_ASSERT_STAGE1(pmap); 6181 if (!PMAP_TRYLOCK(pmap)) { 6182 md_gen = m->md.pv_gen; 6183 pvh_gen = pvh->pv_gen; 6184 rw_wunlock(lock); 6185 PMAP_LOCK(pmap); 6186 rw_wlock(lock); 6187 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6188 PMAP_UNLOCK(pmap); 6189 goto restart; 6190 } 6191 } 6192 l2 = pmap_l2(pmap, pv->pv_va); 6193 l3 = pmap_l2_to_l3(l2, pv->pv_va); 6194 oldl3 = pmap_load(l3); 6195 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ 6196 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 6197 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 6198 } 6199 PMAP_UNLOCK(pmap); 6200 } 6201 rw_wunlock(lock); 6202 } 6203 6204 void * 6205 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6206 { 6207 struct pmap_preinit_mapping *ppim; 6208 vm_offset_t va, offset; 6209 pd_entry_t *pde; 6210 pt_entry_t *l2; 6211 int i, lvl, l2_blocks, free_l2_count, start_idx; 6212 6213 if (!vm_initialized) { 6214 /* 6215 * No L3 ptables so map entire L2 blocks where start VA is: 6216 * preinit_map_va + start_idx * L2_SIZE 6217 * There may be duplicate mappings (multiple VA -> same PA) but 6218 * ARM64 dcache is always PIPT so that's acceptable. 6219 */ 6220 if (size == 0) 6221 return (NULL); 6222 6223 /* Calculate how many L2 blocks are needed for the mapping */ 6224 l2_blocks = (roundup2(pa + size, L2_SIZE) - 6225 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 6226 6227 offset = pa & L2_OFFSET; 6228 6229 if (preinit_map_va == 0) 6230 return (NULL); 6231 6232 /* Map 2MiB L2 blocks from reserved VA space */ 6233 6234 free_l2_count = 0; 6235 start_idx = -1; 6236 /* Find enough free contiguous VA space */ 6237 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6238 ppim = pmap_preinit_mapping + i; 6239 if (free_l2_count > 0 && ppim->pa != 0) { 6240 /* Not enough space here */ 6241 free_l2_count = 0; 6242 start_idx = -1; 6243 continue; 6244 } 6245 6246 if (ppim->pa == 0) { 6247 /* Free L2 block */ 6248 if (start_idx == -1) 6249 start_idx = i; 6250 free_l2_count++; 6251 if (free_l2_count == l2_blocks) 6252 break; 6253 } 6254 } 6255 if (free_l2_count != l2_blocks) 6256 panic("%s: too many preinit mappings", __func__); 6257 6258 va = preinit_map_va + (start_idx * L2_SIZE); 6259 for (i = start_idx; i < start_idx + l2_blocks; i++) { 6260 /* Mark entries as allocated */ 6261 ppim = pmap_preinit_mapping + i; 6262 ppim->pa = pa; 6263 ppim->va = va + offset; 6264 ppim->size = size; 6265 } 6266 6267 /* Map L2 blocks */ 6268 pa = rounddown2(pa, L2_SIZE); 6269 for (i = 0; i < l2_blocks; i++) { 6270 pde = pmap_pde(kernel_pmap, va, &lvl); 6271 KASSERT(pde != NULL, 6272 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 6273 va)); 6274 KASSERT(lvl == 1, 6275 ("pmap_mapbios: Invalid level %d", lvl)); 6276 6277 /* Insert L2_BLOCK */ 6278 l2 = pmap_l1_to_l2(pde, va); 6279 pmap_load_store(l2, 6280 pa | ATTR_DEFAULT | ATTR_S1_XN | 6281 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 6282 6283 va += L2_SIZE; 6284 pa += L2_SIZE; 6285 } 6286 pmap_s1_invalidate_all(kernel_pmap); 6287 6288 va = preinit_map_va + (start_idx * L2_SIZE); 6289 6290 } else { 6291 /* kva_alloc may be used to map the pages */ 6292 
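		/*
		 * The request is rounded to whole pages and the memory
		 * attribute for the mapping is chosen by
		 * memory_mapping_mode() based on the physical address.
		 */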
offset = pa & PAGE_MASK; 6293 size = round_page(offset + size); 6294 6295 va = kva_alloc(size); 6296 if (va == 0) 6297 panic("%s: Couldn't allocate KVA", __func__); 6298 6299 pde = pmap_pde(kernel_pmap, va, &lvl); 6300 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 6301 6302 /* L3 table is linked */ 6303 va = trunc_page(va); 6304 pa = trunc_page(pa); 6305 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 6306 } 6307 6308 return ((void *)(va + offset)); 6309 } 6310 6311 void 6312 pmap_unmapbios(void *p, vm_size_t size) 6313 { 6314 struct pmap_preinit_mapping *ppim; 6315 vm_offset_t offset, tmpsize, va, va_trunc; 6316 pd_entry_t *pde; 6317 pt_entry_t *l2; 6318 int i, lvl, l2_blocks, block; 6319 bool preinit_map; 6320 6321 va = (vm_offset_t)p; 6322 l2_blocks = 6323 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 6324 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 6325 6326 /* Remove preinit mapping */ 6327 preinit_map = false; 6328 block = 0; 6329 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6330 ppim = pmap_preinit_mapping + i; 6331 if (ppim->va == va) { 6332 KASSERT(ppim->size == size, 6333 ("pmap_unmapbios: size mismatch")); 6334 ppim->va = 0; 6335 ppim->pa = 0; 6336 ppim->size = 0; 6337 preinit_map = true; 6338 offset = block * L2_SIZE; 6339 va_trunc = rounddown2(va, L2_SIZE) + offset; 6340 6341 /* Remove L2_BLOCK */ 6342 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 6343 KASSERT(pde != NULL, 6344 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 6345 va_trunc)); 6346 l2 = pmap_l1_to_l2(pde, va_trunc); 6347 pmap_clear(l2); 6348 6349 if (block == (l2_blocks - 1)) 6350 break; 6351 block++; 6352 } 6353 } 6354 if (preinit_map) { 6355 pmap_s1_invalidate_all(kernel_pmap); 6356 return; 6357 } 6358 6359 /* Unmap the pages reserved with kva_alloc. */ 6360 if (vm_initialized) { 6361 offset = va & PAGE_MASK; 6362 size = round_page(offset + size); 6363 va = trunc_page(va); 6364 6365 pde = pmap_pde(kernel_pmap, va, &lvl); 6366 KASSERT(pde != NULL, 6367 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); 6368 KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); 6369 6370 /* Unmap and invalidate the pages */ 6371 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6372 pmap_kremove(va + tmpsize); 6373 6374 kva_free(va, size); 6375 } 6376 } 6377 6378 /* 6379 * Sets the memory attribute for the specified page. 6380 */ 6381 void 6382 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6383 { 6384 6385 m->md.pv_memattr = ma; 6386 6387 /* 6388 * If "m" is a normal page, update its direct mapping. This update 6389 * can be relied upon to perform any cache operations that are 6390 * required for data coherence. 6391 */ 6392 if ((m->flags & PG_FICTITIOUS) == 0 && 6393 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6394 m->md.pv_memattr) != 0) 6395 panic("memory attribute change on the direct map failed"); 6396 } 6397 6398 /* 6399 * Changes the specified virtual address range's memory type to that given by 6400 * the parameter "mode". The specified virtual address range must be 6401 * completely contained within either the direct map or the kernel map. If 6402 * the virtual address range is contained within the kernel map, then the 6403 * memory type for each of the corresponding ranges of the direct map is also 6404 * changed. (The corresponding ranges of the direct map are those ranges that 6405 * map the same physical pages as the specified virtual address range.) 
These 6406 * changes to the direct map are necessary because Intel describes the 6407 * behavior of their processors as "undefined" if two or more mappings to the 6408 * same physical page have different memory types. 6409 * 6410 * Returns zero if the change completed successfully, and either EINVAL or 6411 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6412 * of the virtual address range was not mapped, and ENOMEM is returned if 6413 * there was insufficient memory available to complete the change. In the 6414 * latter case, the memory type may have been changed on some part of the 6415 * virtual address range or the direct map. 6416 */ 6417 int 6418 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6419 { 6420 int error; 6421 6422 PMAP_LOCK(kernel_pmap); 6423 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false); 6424 PMAP_UNLOCK(kernel_pmap); 6425 return (error); 6426 } 6427 6428 /* 6429 * Changes the specified virtual address range's protections to those 6430 * specified by "prot". Like pmap_change_attr(), protections for aliases 6431 * in the direct map are updated as well. Protections on aliasing mappings may 6432 * be a subset of the requested protections; for example, mappings in the direct 6433 * map are never executable. 6434 */ 6435 int 6436 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 6437 { 6438 int error; 6439 6440 /* Only supported within the kernel map. */ 6441 if (va < VM_MIN_KERNEL_ADDRESS) 6442 return (EINVAL); 6443 6444 PMAP_LOCK(kernel_pmap); 6445 error = pmap_change_props_locked(va, size, prot, -1, false); 6446 PMAP_UNLOCK(kernel_pmap); 6447 return (error); 6448 } 6449 6450 static int 6451 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 6452 int mode, bool skip_unmapped) 6453 { 6454 vm_offset_t base, offset, tmpva; 6455 vm_size_t pte_size; 6456 vm_paddr_t pa; 6457 pt_entry_t pte, *ptep, *newpte; 6458 pt_entry_t bits, mask; 6459 int lvl, rv; 6460 6461 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6462 base = trunc_page(va); 6463 offset = va & PAGE_MASK; 6464 size = round_page(offset + size); 6465 6466 if (!VIRT_IN_DMAP(base) && 6467 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 6468 return (EINVAL); 6469 6470 bits = 0; 6471 mask = 0; 6472 if (mode != -1) { 6473 bits = ATTR_S1_IDX(mode); 6474 mask = ATTR_S1_IDX_MASK; 6475 if (mode == VM_MEMATTR_DEVICE) { 6476 mask |= ATTR_S1_XN; 6477 bits |= ATTR_S1_XN; 6478 } 6479 } 6480 if (prot != VM_PROT_NONE) { 6481 /* Don't mark the DMAP as executable. It never is on arm64. */ 6482 if (VIRT_IN_DMAP(base)) { 6483 prot &= ~VM_PROT_EXECUTE; 6484 /* 6485 * XXX Mark the DMAP as writable for now. We rely 6486 * on this in ddb & dtrace to insert breakpoint 6487 * instructions. 6488 */ 6489 prot |= VM_PROT_WRITE; 6490 } 6491 6492 if ((prot & VM_PROT_WRITE) == 0) { 6493 bits |= ATTR_S1_AP(ATTR_S1_AP_RO); 6494 } 6495 if ((prot & VM_PROT_EXECUTE) == 0) { 6496 bits |= ATTR_S1_PXN; 6497 } 6498 bits |= ATTR_S1_UXN; 6499 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN; 6500 } 6501 6502 for (tmpva = base; tmpva < base + size; ) { 6503 ptep = pmap_pte(kernel_pmap, tmpva, &lvl); 6504 if (ptep == NULL && !skip_unmapped) { 6505 return (EINVAL); 6506 } else if ((ptep == NULL && skip_unmapped) || 6507 (pmap_load(ptep) & mask) == bits) { 6508 /* 6509 * We already have the correct attribute or there 6510 * is no memory mapped at this address and we are 6511 * skipping unmapped memory. 
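			 * In either case, "tmpva" is advanced below to the
			 * boundary implied by the reported level.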
6512 */ 6513 switch (lvl) { 6514 default: 6515 panic("Invalid DMAP table level: %d\n", lvl); 6516 case 1: 6517 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 6518 break; 6519 case 2: 6520 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 6521 break; 6522 case 3: 6523 tmpva += PAGE_SIZE; 6524 break; 6525 } 6526 } else { 6527 /* We can't demote/promote this entry */ 6528 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0); 6529 6530 /* 6531 * Split the entry to an level 3 table, then 6532 * set the new attribute. 6533 */ 6534 switch (lvl) { 6535 default: 6536 panic("Invalid DMAP table level: %d\n", lvl); 6537 case 1: 6538 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6539 if ((tmpva & L1_OFFSET) == 0 && 6540 (base + size - tmpva) >= L1_SIZE) { 6541 pte_size = L1_SIZE; 6542 break; 6543 } 6544 newpte = pmap_demote_l1(kernel_pmap, ptep, 6545 tmpva & ~L1_OFFSET); 6546 if (newpte == NULL) 6547 return (EINVAL); 6548 ptep = pmap_l1_to_l2(ptep, tmpva); 6549 /* FALLTHROUGH */ 6550 case 2: 6551 if ((tmpva & L2_OFFSET) == 0 && 6552 (base + size - tmpva) >= L2_SIZE) { 6553 pte_size = L2_SIZE; 6554 break; 6555 } 6556 newpte = pmap_demote_l2(kernel_pmap, ptep, 6557 tmpva); 6558 if (newpte == NULL) 6559 return (EINVAL); 6560 ptep = pmap_l2_to_l3(ptep, tmpva); 6561 /* FALLTHROUGH */ 6562 case 3: 6563 pte_size = PAGE_SIZE; 6564 break; 6565 } 6566 6567 /* Update the entry */ 6568 pte = pmap_load(ptep); 6569 pte &= ~mask; 6570 pte |= bits; 6571 6572 pmap_update_entry(kernel_pmap, ptep, pte, tmpva, 6573 pte_size); 6574 6575 pa = pte & ~ATTR_MASK; 6576 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) { 6577 /* 6578 * Keep the DMAP memory in sync. 6579 */ 6580 rv = pmap_change_props_locked( 6581 PHYS_TO_DMAP(pa), pte_size, 6582 prot, mode, true); 6583 if (rv != 0) 6584 return (rv); 6585 } 6586 6587 /* 6588 * If moving to a non-cacheable entry flush 6589 * the cache. 6590 */ 6591 if (mode == VM_MEMATTR_UNCACHEABLE) 6592 cpu_dcache_wbinv_range(tmpva, pte_size); 6593 tmpva += pte_size; 6594 } 6595 } 6596 6597 return (0); 6598 } 6599 6600 /* 6601 * Create an L2 table to map all addresses within an L1 mapping. 
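 * The new L2 entries inherit the attributes of the old L1 block entry; each
 * of the Ln_ENTRIES entries maps the corresponding 2MB portion of the
 * original range.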
6602 */ 6603 static pt_entry_t * 6604 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 6605 { 6606 pt_entry_t *l2, newl2, oldl1; 6607 vm_offset_t tmpl1; 6608 vm_paddr_t l2phys, phys; 6609 vm_page_t ml2; 6610 int i; 6611 6612 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6613 oldl1 = pmap_load(l1); 6614 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6615 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 6616 ("pmap_demote_l1: Demoting a non-block entry")); 6617 KASSERT((va & L1_OFFSET) == 0, 6618 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 6619 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 6620 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 6621 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0, 6622 ("pmap_demote_l1: Demoting entry with no-demote flag set")); 6623 6624 tmpl1 = 0; 6625 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 6626 tmpl1 = kva_alloc(PAGE_SIZE); 6627 if (tmpl1 == 0) 6628 return (NULL); 6629 } 6630 6631 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) == 6632 NULL) { 6633 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 6634 " in pmap %p", va, pmap); 6635 l2 = NULL; 6636 goto fail; 6637 } 6638 6639 l2phys = VM_PAGE_TO_PHYS(ml2); 6640 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 6641 6642 /* Address the range points at */ 6643 phys = oldl1 & ~ATTR_MASK; 6644 /* The attributed from the old l1 table to be copied */ 6645 newl2 = oldl1 & ATTR_MASK; 6646 6647 /* Create the new entries */ 6648 for (i = 0; i < Ln_ENTRIES; i++) { 6649 l2[i] = newl2 | phys; 6650 phys += L2_SIZE; 6651 } 6652 KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), 6653 ("Invalid l2 page (%lx != %lx)", l2[0], 6654 (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 6655 6656 if (tmpl1 != 0) { 6657 pmap_kenter(tmpl1, PAGE_SIZE, 6658 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, 6659 VM_MEMATTR_WRITE_BACK); 6660 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 6661 } 6662 6663 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 6664 6665 fail: 6666 if (tmpl1 != 0) { 6667 pmap_kremove(tmpl1); 6668 kva_free(tmpl1, PAGE_SIZE); 6669 } 6670 6671 return (l2); 6672 } 6673 6674 static void 6675 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 6676 { 6677 pt_entry_t *l3; 6678 6679 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 6680 *l3 = newl3; 6681 newl3 += L3_SIZE; 6682 } 6683 } 6684 6685 static void 6686 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 6687 struct rwlock **lockp) 6688 { 6689 struct spglist free; 6690 6691 SLIST_INIT(&free); 6692 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, 6693 lockp); 6694 vm_page_free_pages_toq(&free, true); 6695 } 6696 6697 /* 6698 * Create an L3 table to map all addresses within an L2 mapping. 
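 * Returns a pointer to the new L3 table on success and NULL on failure.  On
 * some failure paths the 2MB mapping is destroyed outright; see
 * pmap_demote_l2_abort() above.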
6699 */ 6700 static pt_entry_t * 6701 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 6702 struct rwlock **lockp) 6703 { 6704 pt_entry_t *l3, newl3, oldl2; 6705 vm_offset_t tmpl2; 6706 vm_paddr_t l3phys; 6707 vm_page_t ml3; 6708 6709 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6710 PMAP_ASSERT_STAGE1(pmap); 6711 KASSERT(ADDR_IS_CANONICAL(va), 6712 ("%s: Address not in canonical form: %lx", __func__, va)); 6713 6714 l3 = NULL; 6715 oldl2 = pmap_load(l2); 6716 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 6717 ("pmap_demote_l2: Demoting a non-block entry")); 6718 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0, 6719 ("pmap_demote_l2: Demoting entry with no-demote flag set")); 6720 va &= ~L2_OFFSET; 6721 6722 tmpl2 = 0; 6723 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 6724 tmpl2 = kva_alloc(PAGE_SIZE); 6725 if (tmpl2 == 0) 6726 return (NULL); 6727 } 6728 6729 /* 6730 * Invalidate the 2MB page mapping and return "failure" if the 6731 * mapping was never accessed. 6732 */ 6733 if ((oldl2 & ATTR_AF) == 0) { 6734 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6735 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 6736 pmap_demote_l2_abort(pmap, va, l2, lockp); 6737 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 6738 va, pmap); 6739 goto fail; 6740 } 6741 6742 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 6743 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6744 ("pmap_demote_l2: page table page for a wired mapping" 6745 " is missing")); 6746 6747 /* 6748 * If the page table page is missing and the mapping 6749 * is for a kernel address, the mapping must belong to 6750 * either the direct map or the early kernel memory. 6751 * Page table pages are preallocated for every other 6752 * part of the kernel address space, so the direct map 6753 * region and early kernel memory are the only parts of the 6754 * kernel address space that must be handled here. 6755 */ 6756 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || 6757 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), 6758 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 6759 6760 /* 6761 * If the 2MB page mapping belongs to the direct map 6762 * region of the kernel's address space, then the page 6763 * allocation request specifies the highest possible 6764 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6765 * priority is normal. 6766 */ 6767 ml3 = vm_page_alloc_noobj( 6768 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 6769 VM_ALLOC_WIRED); 6770 6771 /* 6772 * If the allocation of the new page table page fails, 6773 * invalidate the 2MB page mapping and return "failure". 6774 */ 6775 if (ml3 == NULL) { 6776 pmap_demote_l2_abort(pmap, va, l2, lockp); 6777 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 6778 " in pmap %p", va, pmap); 6779 goto fail; 6780 } 6781 ml3->pindex = pmap_l2_pindex(va); 6782 6783 if (!ADDR_IS_KERNEL(va)) { 6784 ml3->ref_count = NL3PG; 6785 pmap_resident_count_inc(pmap, 1); 6786 } 6787 } 6788 l3phys = VM_PAGE_TO_PHYS(ml3); 6789 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 6790 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 6791 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 6792 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 6793 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 6794 6795 /* 6796 * If the page table page is not leftover from an earlier promotion, 6797 * or the mapping attributes have changed, (re)initialize the L3 table. 6798 * 6799 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 6800 * performs a dsb(). 
That dsb() ensures that the stores for filling 6801 * "l3" are visible before "l3" is added to the page table. 6802 */ 6803 if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) 6804 pmap_fill_l3(l3, newl3); 6805 6806 /* 6807 * Map the temporary page so we don't lose access to the l2 table. 6808 */ 6809 if (tmpl2 != 0) { 6810 pmap_kenter(tmpl2, PAGE_SIZE, 6811 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 6812 VM_MEMATTR_WRITE_BACK); 6813 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 6814 } 6815 6816 /* 6817 * The spare PV entries must be reserved prior to demoting the 6818 * mapping, that is, prior to changing the PDE. Otherwise, the state 6819 * of the L2 and the PV lists will be inconsistent, which can result 6820 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6821 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 6822 * PV entry for the 2MB page mapping that is being demoted. 6823 */ 6824 if ((oldl2 & ATTR_SW_MANAGED) != 0) 6825 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 6826 6827 /* 6828 * Pass PAGE_SIZE so that a single TLB invalidation is performed on 6829 * the 2MB page mapping. 6830 */ 6831 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); 6832 6833 /* 6834 * Demote the PV entry. 6835 */ 6836 if ((oldl2 & ATTR_SW_MANAGED) != 0) 6837 pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); 6838 6839 atomic_add_long(&pmap_l2_demotions, 1); 6840 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" 6841 " in pmap %p %lx", va, pmap, l3[0]); 6842 6843 fail: 6844 if (tmpl2 != 0) { 6845 pmap_kremove(tmpl2); 6846 kva_free(tmpl2, PAGE_SIZE); 6847 } 6848 6849 return (l3); 6850 6851 } 6852 6853 static pt_entry_t * 6854 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 6855 { 6856 struct rwlock *lock; 6857 pt_entry_t *l3; 6858 6859 lock = NULL; 6860 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); 6861 if (lock != NULL) 6862 rw_wunlock(lock); 6863 return (l3); 6864 } 6865 6866 /* 6867 * Perform the pmap work for mincore(2). If the page is not both referenced and 6868 * modified by this pmap, returns its physical address so that the caller can 6869 * find other mappings. 
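 * The MINCORE_PSIND() field in the return value reflects the level of the
 * mapping that was found, so 2MB and 1GB block mappings are reported as
 * superpages.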
6870 */ 6871 int 6872 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 6873 { 6874 pt_entry_t *pte, tpte; 6875 vm_paddr_t mask, pa; 6876 int lvl, val; 6877 bool managed; 6878 6879 PMAP_ASSERT_STAGE1(pmap); 6880 PMAP_LOCK(pmap); 6881 pte = pmap_pte(pmap, addr, &lvl); 6882 if (pte != NULL) { 6883 tpte = pmap_load(pte); 6884 6885 switch (lvl) { 6886 case 3: 6887 mask = L3_OFFSET; 6888 break; 6889 case 2: 6890 mask = L2_OFFSET; 6891 break; 6892 case 1: 6893 mask = L1_OFFSET; 6894 break; 6895 default: 6896 panic("pmap_mincore: invalid level %d", lvl); 6897 } 6898 6899 managed = (tpte & ATTR_SW_MANAGED) != 0; 6900 val = MINCORE_INCORE; 6901 if (lvl != 3) 6902 val |= MINCORE_PSIND(3 - lvl); 6903 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 6904 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 6905 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6906 if ((tpte & ATTR_AF) == ATTR_AF) 6907 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6908 6909 pa = (tpte & ~ATTR_MASK) | (addr & mask); 6910 } else { 6911 managed = false; 6912 val = 0; 6913 } 6914 6915 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6916 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6917 *pap = pa; 6918 } 6919 PMAP_UNLOCK(pmap); 6920 return (val); 6921 } 6922 6923 /* 6924 * Garbage collect every ASID that is neither active on a processor nor 6925 * reserved. 6926 */ 6927 static void 6928 pmap_reset_asid_set(pmap_t pmap) 6929 { 6930 pmap_t curpmap; 6931 int asid, cpuid, epoch; 6932 struct asid_set *set; 6933 enum pmap_stage stage; 6934 6935 set = pmap->pm_asid_set; 6936 stage = pmap->pm_stage; 6937 6938 set = pmap->pm_asid_set; 6939 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 6940 mtx_assert(&set->asid_set_mutex, MA_OWNED); 6941 6942 /* 6943 * Ensure that the store to asid_epoch is globally visible before the 6944 * loads from pc_curpmap are performed. 6945 */ 6946 epoch = set->asid_epoch + 1; 6947 if (epoch == INT_MAX) 6948 epoch = 0; 6949 set->asid_epoch = epoch; 6950 dsb(ishst); 6951 if (stage == PM_STAGE1) { 6952 __asm __volatile("tlbi vmalle1is"); 6953 } else { 6954 KASSERT(pmap_clean_stage2_tlbi != NULL, 6955 ("%s: Unset stage 2 tlb invalidation callback\n", 6956 __func__)); 6957 pmap_clean_stage2_tlbi(); 6958 } 6959 dsb(ish); 6960 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 6961 set->asid_set_size - 1); 6962 CPU_FOREACH(cpuid) { 6963 if (cpuid == curcpu) 6964 continue; 6965 if (stage == PM_STAGE1) { 6966 curpmap = pcpu_find(cpuid)->pc_curpmap; 6967 PMAP_ASSERT_STAGE1(pmap); 6968 } else { 6969 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 6970 if (curpmap == NULL) 6971 continue; 6972 PMAP_ASSERT_STAGE2(pmap); 6973 } 6974 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 6975 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 6976 if (asid == -1) 6977 continue; 6978 bit_set(set->asid_set, asid); 6979 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 6980 } 6981 } 6982 6983 /* 6984 * Allocate a new ASID for the specified pmap. 6985 */ 6986 static void 6987 pmap_alloc_asid(pmap_t pmap) 6988 { 6989 struct asid_set *set; 6990 int new_asid; 6991 6992 set = pmap->pm_asid_set; 6993 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 6994 6995 mtx_lock_spin(&set->asid_set_mutex); 6996 6997 /* 6998 * While this processor was waiting to acquire the asid set mutex, 6999 * pmap_reset_asid_set() running on another processor might have 7000 * updated this pmap's cookie to the current epoch. In which case, we 7001 * don't need to allocate a new ASID. 
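	 * Otherwise, search the bitmap for a free ASID starting at
	 * asid_next, wrap around once, and reset the entire ASID set only if
	 * both searches fail.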
7002 */ 7003 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 7004 goto out; 7005 7006 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 7007 &new_asid); 7008 if (new_asid == -1) { 7009 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 7010 set->asid_next, &new_asid); 7011 if (new_asid == -1) { 7012 pmap_reset_asid_set(pmap); 7013 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 7014 set->asid_set_size, &new_asid); 7015 KASSERT(new_asid != -1, ("ASID allocation failure")); 7016 } 7017 } 7018 bit_set(set->asid_set, new_asid); 7019 set->asid_next = new_asid + 1; 7020 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 7021 out: 7022 mtx_unlock_spin(&set->asid_set_mutex); 7023 } 7024 7025 static uint64_t __read_mostly ttbr_flags; 7026 7027 /* 7028 * Compute the value that should be stored in ttbr0 to activate the specified 7029 * pmap. This value may change from time to time. 7030 */ 7031 uint64_t 7032 pmap_to_ttbr0(pmap_t pmap) 7033 { 7034 uint64_t ttbr; 7035 7036 ttbr = pmap->pm_ttbr; 7037 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 7038 ttbr |= ttbr_flags; 7039 7040 return (ttbr); 7041 } 7042 7043 static void 7044 pmap_set_cnp(void *arg) 7045 { 7046 uint64_t ttbr0, ttbr1; 7047 u_int cpuid; 7048 7049 cpuid = *(u_int *)arg; 7050 if (cpuid == curcpu) { 7051 /* 7052 * Set the flags while all CPUs are handling the 7053 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls 7054 * to pmap_to_ttbr0 after this will have the CnP flag set. 7055 * The dsb after invalidating the TLB will act as a barrier 7056 * to ensure all CPUs can observe this change. 7057 */ 7058 ttbr_flags |= TTBR_CnP; 7059 } 7060 7061 ttbr0 = READ_SPECIALREG(ttbr0_el1); 7062 ttbr0 |= TTBR_CnP; 7063 7064 ttbr1 = READ_SPECIALREG(ttbr1_el1); 7065 ttbr1 |= TTBR_CnP; 7066 7067 /* Update ttbr{0,1}_el1 with the CnP flag */ 7068 WRITE_SPECIALREG(ttbr0_el1, ttbr0); 7069 WRITE_SPECIALREG(ttbr1_el1, ttbr1); 7070 isb(); 7071 __asm __volatile("tlbi vmalle1is"); 7072 dsb(ish); 7073 isb(); 7074 } 7075 7076 /* 7077 * Defer enabling CnP until we have read the ID registers to know if it's 7078 * supported on all CPUs. 7079 */ 7080 static void 7081 pmap_init_cnp(void *dummy __unused) 7082 { 7083 uint64_t reg; 7084 u_int cpuid; 7085 7086 if (!get_kernel_reg(ID_AA64MMFR2_EL1, ®)) 7087 return; 7088 7089 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) { 7090 if (bootverbose) 7091 printf("Enabling CnP\n"); 7092 cpuid = curcpu; 7093 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid); 7094 } 7095 7096 } 7097 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL); 7098 7099 static bool 7100 pmap_activate_int(pmap_t pmap) 7101 { 7102 struct asid_set *set; 7103 int epoch; 7104 7105 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 7106 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 7107 7108 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 7109 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 7110 /* 7111 * Handle the possibility that the old thread was preempted 7112 * after an "ic" or "tlbi" instruction but before it performed 7113 * a "dsb" instruction. If the old thread migrates to a new 7114 * processor, its completion of a "dsb" instruction on that 7115 * new processor does not guarantee that the "ic" or "tlbi" 7116 * instructions performed on the old processor have completed. 
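		 * Hence the "dsb" below, even though neither ttbr0 nor the
		 * ASID needs to be updated on this early-return path.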
7117 */ 7118 dsb(ish); 7119 return (false); 7120 } 7121 7122 set = pmap->pm_asid_set; 7123 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 7124 7125 /* 7126 * Ensure that the store to curpmap is globally visible before the 7127 * load from asid_epoch is performed. 7128 */ 7129 if (pmap->pm_stage == PM_STAGE1) 7130 PCPU_SET(curpmap, pmap); 7131 else 7132 PCPU_SET(curvmpmap, pmap); 7133 dsb(ish); 7134 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 7135 if (epoch >= 0 && epoch != set->asid_epoch) 7136 pmap_alloc_asid(pmap); 7137 7138 if (pmap->pm_stage == PM_STAGE1) { 7139 set_ttbr0(pmap_to_ttbr0(pmap)); 7140 if (PCPU_GET(bcast_tlbi_workaround) != 0) 7141 invalidate_local_icache(); 7142 } 7143 return (true); 7144 } 7145 7146 void 7147 pmap_activate_vm(pmap_t pmap) 7148 { 7149 7150 PMAP_ASSERT_STAGE2(pmap); 7151 7152 (void)pmap_activate_int(pmap); 7153 } 7154 7155 void 7156 pmap_activate(struct thread *td) 7157 { 7158 pmap_t pmap; 7159 7160 pmap = vmspace_pmap(td->td_proc->p_vmspace); 7161 PMAP_ASSERT_STAGE1(pmap); 7162 critical_enter(); 7163 (void)pmap_activate_int(pmap); 7164 critical_exit(); 7165 } 7166 7167 /* 7168 * Activate the thread we are switching to. 7169 * To simplify the assembly in cpu_throw return the new threads pcb. 7170 */ 7171 struct pcb * 7172 pmap_switch(struct thread *new) 7173 { 7174 pcpu_bp_harden bp_harden; 7175 struct pcb *pcb; 7176 7177 /* Store the new curthread */ 7178 PCPU_SET(curthread, new); 7179 7180 /* And the new pcb */ 7181 pcb = new->td_pcb; 7182 PCPU_SET(curpcb, pcb); 7183 7184 /* 7185 * TODO: We may need to flush the cache here if switching 7186 * to a user process. 7187 */ 7188 7189 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { 7190 /* 7191 * Stop userspace from training the branch predictor against 7192 * other processes. This will call into a CPU specific 7193 * function that clears the branch predictor state. 7194 */ 7195 bp_harden = PCPU_GET(bp_harden); 7196 if (bp_harden != NULL) 7197 bp_harden(); 7198 } 7199 7200 return (pcb); 7201 } 7202 7203 void 7204 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 7205 { 7206 7207 PMAP_ASSERT_STAGE1(pmap); 7208 KASSERT(ADDR_IS_CANONICAL(va), 7209 ("%s: Address not in canonical form: %lx", __func__, va)); 7210 7211 if (ADDR_IS_KERNEL(va)) { 7212 cpu_icache_sync_range(va, sz); 7213 } else { 7214 u_int len, offset; 7215 vm_paddr_t pa; 7216 7217 /* Find the length of data in this page to flush */ 7218 offset = va & PAGE_MASK; 7219 len = imin(PAGE_SIZE - offset, sz); 7220 7221 while (sz != 0) { 7222 /* Extract the physical address & find it in the DMAP */ 7223 pa = pmap_extract(pmap, va); 7224 if (pa != 0) 7225 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); 7226 7227 /* Move to the next page */ 7228 sz -= len; 7229 va += len; 7230 /* Set the length for the next iteration */ 7231 len = imin(PAGE_SIZE, sz); 7232 } 7233 } 7234 } 7235 7236 static int 7237 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) 7238 { 7239 pd_entry_t *pdep; 7240 pt_entry_t *ptep, pte; 7241 int rv, lvl, dfsc; 7242 7243 PMAP_ASSERT_STAGE2(pmap); 7244 rv = KERN_FAILURE; 7245 7246 /* Data and insn aborts use same encoding for FSC field. 
*/
7247 	dfsc = esr & ISS_DATA_DFSC_MASK;
7248 	switch (dfsc) {
7249 	case ISS_DATA_DFSC_TF_L0:
7250 	case ISS_DATA_DFSC_TF_L1:
7251 	case ISS_DATA_DFSC_TF_L2:
7252 	case ISS_DATA_DFSC_TF_L3:
7253 		PMAP_LOCK(pmap);
7254 		pdep = pmap_pde(pmap, far, &lvl);
7255 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
7256 			PMAP_UNLOCK(pmap);
7257 			break;
7258 		}
7259 
7260 		switch (lvl) {
7261 		case 0:
7262 			ptep = pmap_l0_to_l1(pdep, far);
7263 			break;
7264 		case 1:
7265 			ptep = pmap_l1_to_l2(pdep, far);
7266 			break;
7267 		case 2:
7268 			ptep = pmap_l2_to_l3(pdep, far);
7269 			break;
7270 		default:
7271 			panic("%s: Invalid pde level %d", __func__, lvl);
7272 		}
7273 		goto fault_exec;
7274 
7275 	case ISS_DATA_DFSC_AFF_L1:
7276 	case ISS_DATA_DFSC_AFF_L2:
7277 	case ISS_DATA_DFSC_AFF_L3:
7278 		PMAP_LOCK(pmap);
7279 		ptep = pmap_pte(pmap, far, &lvl);
7280 fault_exec:
7281 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
7282 			if (icache_vmid) {
7283 				pmap_invalidate_vpipt_icache();
7284 			} else {
7285 				/*
7286 				 * If accessing an executable page invalidate
7287 				 * the I-cache so it will be valid when we
7288 				 * continue execution in the guest. The D-cache
7289 				 * is assumed to already be clean to the Point
7290 				 * of Coherency.
7291 				 */
7292 				if ((pte & ATTR_S2_XN_MASK) !=
7293 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
7294 					invalidate_icache();
7295 				}
7296 			}
7297 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
7298 			rv = KERN_SUCCESS;
7299 		}
7300 		PMAP_UNLOCK(pmap);
7301 		break;
7302 	}
7303 
7304 	return (rv);
7305 }
7306 
7307 int
7308 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7309 {
7310 	pt_entry_t pte, *ptep;
7311 	register_t intr;
7312 	uint64_t ec, par;
7313 	int lvl, rv;
7314 
7315 	rv = KERN_FAILURE;
7316 
7317 	ec = ESR_ELx_EXCEPTION(esr);
7318 	switch (ec) {
7319 	case EXCP_INSN_ABORT_L:
7320 	case EXCP_INSN_ABORT:
7321 	case EXCP_DATA_ABORT_L:
7322 	case EXCP_DATA_ABORT:
7323 		break;
7324 	default:
7325 		return (rv);
7326 	}
7327 
7328 	if (pmap->pm_stage == PM_STAGE2)
7329 		return (pmap_stage2_fault(pmap, esr, far));
7330 
7331 	/* Data and insn aborts use same encoding for FSC field. */
7332 	switch (esr & ISS_DATA_DFSC_MASK) {
7333 	case ISS_DATA_DFSC_AFF_L1:
7334 	case ISS_DATA_DFSC_AFF_L2:
7335 	case ISS_DATA_DFSC_AFF_L3:
7336 		PMAP_LOCK(pmap);
7337 		ptep = pmap_pte(pmap, far, &lvl);
7338 		if (ptep != NULL) {
7339 			pmap_set_bits(ptep, ATTR_AF);
7340 			rv = KERN_SUCCESS;
7341 			/*
7342 			 * XXXMJ as an optimization we could mark the entry
7343 			 * dirty if this is a write fault.
7344 			 */
7345 		}
7346 		PMAP_UNLOCK(pmap);
7347 		break;
7348 	case ISS_DATA_DFSC_PF_L1:
7349 	case ISS_DATA_DFSC_PF_L2:
7350 	case ISS_DATA_DFSC_PF_L3:
7351 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
7352 		    (esr & ISS_DATA_WnR) == 0)
7353 			return (rv);
7354 		PMAP_LOCK(pmap);
7355 		ptep = pmap_pte(pmap, far, &lvl);
7356 		if (ptep != NULL &&
7357 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
7358 			if ((pte & ATTR_S1_AP_RW_BIT) ==
7359 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
7360 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
7361 				pmap_s1_invalidate_page(pmap, far, true);
7362 			}
7363 			rv = KERN_SUCCESS;
7364 		}
7365 		PMAP_UNLOCK(pmap);
7366 		break;
7367 	case ISS_DATA_DFSC_TF_L0:
7368 	case ISS_DATA_DFSC_TF_L1:
7369 	case ISS_DATA_DFSC_TF_L2:
7370 	case ISS_DATA_DFSC_TF_L3:
7371 		/*
7372 		 * Retry the translation. A break-before-make sequence can
7373 		 * produce a transient fault.
7374 		 */
7375 		if (pmap == kernel_pmap) {
7376 			/*
7377 			 * The translation fault may have occurred within a
7378 			 * critical section. Therefore, we must check the
7379 			 * address without acquiring the kernel pmap's lock.
7380 */ 7381 if (pmap_klookup(far, NULL)) 7382 rv = KERN_SUCCESS; 7383 } else { 7384 PMAP_LOCK(pmap); 7385 /* Ask the MMU to check the address. */ 7386 intr = intr_disable(); 7387 par = arm64_address_translate_s1e0r(far); 7388 intr_restore(intr); 7389 PMAP_UNLOCK(pmap); 7390 7391 /* 7392 * If the translation was successful, then we can 7393 * return success to the trap handler. 7394 */ 7395 if (PAR_SUCCESS(par)) 7396 rv = KERN_SUCCESS; 7397 } 7398 break; 7399 } 7400 7401 return (rv); 7402 } 7403 7404 /* 7405 * Increase the starting virtual address of the given mapping if a 7406 * different alignment might result in more superpage mappings. 7407 */ 7408 void 7409 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 7410 vm_offset_t *addr, vm_size_t size) 7411 { 7412 vm_offset_t superpage_offset; 7413 7414 if (size < L2_SIZE) 7415 return; 7416 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 7417 offset += ptoa(object->pg_color); 7418 superpage_offset = offset & L2_OFFSET; 7419 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 7420 (*addr & L2_OFFSET) == superpage_offset) 7421 return; 7422 if ((*addr & L2_OFFSET) < superpage_offset) 7423 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 7424 else 7425 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 7426 } 7427 7428 /** 7429 * Get the kernel virtual address of a set of physical pages. If there are 7430 * physical addresses not covered by the DMAP perform a transient mapping 7431 * that will be removed when calling pmap_unmap_io_transient. 7432 * 7433 * \param page The pages the caller wishes to obtain the virtual 7434 * address on the kernel memory map. 7435 * \param vaddr On return contains the kernel virtual memory address 7436 * of the pages passed in the page parameter. 7437 * \param count Number of pages passed in. 7438 * \param can_fault TRUE if the thread using the mapped pages can take 7439 * page faults, FALSE otherwise. 7440 * 7441 * \returns TRUE if the caller must call pmap_unmap_io_transient when 7442 * finished or FALSE otherwise. 7443 * 7444 */ 7445 boolean_t 7446 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7447 boolean_t can_fault) 7448 { 7449 vm_paddr_t paddr; 7450 boolean_t needs_mapping; 7451 int error __diagused, i; 7452 7453 /* 7454 * Allocate any KVA space that we need, this is done in a separate 7455 * loop to prevent calling vmem_alloc while pinned. 
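	 * (vmem_alloc() is called with M_WAITOK and may sleep, so it cannot
	 * safely be called once sched_pin() has been called below.)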
7456 */ 7457 needs_mapping = FALSE; 7458 for (i = 0; i < count; i++) { 7459 paddr = VM_PAGE_TO_PHYS(page[i]); 7460 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 7461 error = vmem_alloc(kernel_arena, PAGE_SIZE, 7462 M_BESTFIT | M_WAITOK, &vaddr[i]); 7463 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 7464 needs_mapping = TRUE; 7465 } else { 7466 vaddr[i] = PHYS_TO_DMAP(paddr); 7467 } 7468 } 7469 7470 /* Exit early if everything is covered by the DMAP */ 7471 if (!needs_mapping) 7472 return (FALSE); 7473 7474 if (!can_fault) 7475 sched_pin(); 7476 for (i = 0; i < count; i++) { 7477 paddr = VM_PAGE_TO_PHYS(page[i]); 7478 if (!PHYS_IN_DMAP(paddr)) { 7479 panic( 7480 "pmap_map_io_transient: TODO: Map out of DMAP data"); 7481 } 7482 } 7483 7484 return (needs_mapping); 7485 } 7486 7487 void 7488 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7489 boolean_t can_fault) 7490 { 7491 vm_paddr_t paddr; 7492 int i; 7493 7494 if (!can_fault) 7495 sched_unpin(); 7496 for (i = 0; i < count; i++) { 7497 paddr = VM_PAGE_TO_PHYS(page[i]); 7498 if (!PHYS_IN_DMAP(paddr)) { 7499 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 7500 } 7501 } 7502 } 7503 7504 boolean_t 7505 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 7506 { 7507 7508 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 7509 } 7510 7511 /* 7512 * Track a range of the kernel's virtual address space that is contiguous 7513 * in various mapping attributes. 7514 */ 7515 struct pmap_kernel_map_range { 7516 vm_offset_t sva; 7517 pt_entry_t attrs; 7518 int l3pages; 7519 int l3contig; 7520 int l2blocks; 7521 int l1blocks; 7522 }; 7523 7524 static void 7525 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 7526 vm_offset_t eva) 7527 { 7528 const char *mode; 7529 int index; 7530 7531 if (eva <= range->sva) 7532 return; 7533 7534 index = range->attrs & ATTR_S1_IDX_MASK; 7535 switch (index) { 7536 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 7537 mode = "DEV"; 7538 break; 7539 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 7540 mode = "UC"; 7541 break; 7542 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 7543 mode = "WB"; 7544 break; 7545 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 7546 mode = "WT"; 7547 break; 7548 default: 7549 printf( 7550 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 7551 __func__, index, range->sva, eva); 7552 mode = "??"; 7553 break; 7554 } 7555 7556 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %3s %d %d %d %d\n", 7557 range->sva, eva, 7558 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 7559 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 7560 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 7561 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 7562 mode, range->l1blocks, range->l2blocks, range->l3contig, 7563 range->l3pages); 7564 7565 /* Reset to sentinel value. */ 7566 range->sva = 0xfffffffffffffffful; 7567 } 7568 7569 /* 7570 * Determine whether the attributes specified by a page table entry match those 7571 * being tracked by the current range. 
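 * Only the bits gathered by sysctl_kmaps_table_attrs() and
 * sysctl_kmaps_block_attrs() take part in the comparison.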
7572 */ 7573 static bool 7574 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 7575 { 7576 7577 return (range->attrs == attrs); 7578 } 7579 7580 static void 7581 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 7582 pt_entry_t attrs) 7583 { 7584 7585 memset(range, 0, sizeof(*range)); 7586 range->sva = va; 7587 range->attrs = attrs; 7588 } 7589 7590 /* Get the block/page attributes that correspond to the table attributes */ 7591 static pt_entry_t 7592 sysctl_kmaps_table_attrs(pd_entry_t table) 7593 { 7594 pt_entry_t attrs; 7595 7596 attrs = 0; 7597 if ((table & TATTR_UXN_TABLE) != 0) 7598 attrs |= ATTR_S1_UXN; 7599 if ((table & TATTR_PXN_TABLE) != 0) 7600 attrs |= ATTR_S1_PXN; 7601 if ((table & TATTR_AP_TABLE_RO) != 0) 7602 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 7603 7604 return (attrs); 7605 } 7606 7607 /* Read the block/page attributes we care about */ 7608 static pt_entry_t 7609 sysctl_kmaps_block_attrs(pt_entry_t block) 7610 { 7611 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK)); 7612 } 7613 7614 /* 7615 * Given a leaf PTE, derive the mapping's attributes. If they do not match 7616 * those of the current run, dump the address range and its attributes, and 7617 * begin a new run. 7618 */ 7619 static void 7620 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 7621 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 7622 pt_entry_t l3e) 7623 { 7624 pt_entry_t attrs; 7625 7626 attrs = sysctl_kmaps_table_attrs(l0e); 7627 7628 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 7629 attrs |= sysctl_kmaps_block_attrs(l1e); 7630 goto done; 7631 } 7632 attrs |= sysctl_kmaps_table_attrs(l1e); 7633 7634 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 7635 attrs |= sysctl_kmaps_block_attrs(l2e); 7636 goto done; 7637 } 7638 attrs |= sysctl_kmaps_table_attrs(l2e); 7639 attrs |= sysctl_kmaps_block_attrs(l3e); 7640 7641 done: 7642 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 7643 sysctl_kmaps_dump(sb, range, va); 7644 sysctl_kmaps_reinit(range, va, attrs); 7645 } 7646 } 7647 7648 static int 7649 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 7650 { 7651 struct pmap_kernel_map_range range; 7652 struct sbuf sbuf, *sb; 7653 pd_entry_t l0e, *l1, l1e, *l2, l2e; 7654 pt_entry_t *l3, l3e; 7655 vm_offset_t sva; 7656 vm_paddr_t pa; 7657 int error, i, j, k, l; 7658 7659 error = sysctl_wire_old_buffer(req, 0); 7660 if (error != 0) 7661 return (error); 7662 sb = &sbuf; 7663 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 7664 7665 /* Sentinel value. */ 7666 range.sva = 0xfffffffffffffffful; 7667 7668 /* 7669 * Iterate over the kernel page tables without holding the kernel pmap 7670 * lock. Kernel page table pages are never freed, so at worst we will 7671 * observe inconsistencies in the output. 
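	 * The walk below starts at the bottom of the upper (kernel) address
	 * region and visits every L0, L1, L2, and L3 entry in turn,
	 * accumulating runs of identically mapped virtual addresses.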
7672 */ 7673 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 7674 i++) { 7675 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 7676 sbuf_printf(sb, "\nDirect map:\n"); 7677 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 7678 sbuf_printf(sb, "\nKernel map:\n"); 7679 7680 l0e = kernel_pmap->pm_l0[i]; 7681 if ((l0e & ATTR_DESCR_VALID) == 0) { 7682 sysctl_kmaps_dump(sb, &range, sva); 7683 sva += L0_SIZE; 7684 continue; 7685 } 7686 pa = l0e & ~ATTR_MASK; 7687 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 7688 7689 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 7690 l1e = l1[j]; 7691 if ((l1e & ATTR_DESCR_VALID) == 0) { 7692 sysctl_kmaps_dump(sb, &range, sva); 7693 sva += L1_SIZE; 7694 continue; 7695 } 7696 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 7697 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 7698 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 7699 0, 0); 7700 range.l1blocks++; 7701 sva += L1_SIZE; 7702 continue; 7703 } 7704 pa = l1e & ~ATTR_MASK; 7705 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 7706 7707 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 7708 l2e = l2[k]; 7709 if ((l2e & ATTR_DESCR_VALID) == 0) { 7710 sysctl_kmaps_dump(sb, &range, sva); 7711 sva += L2_SIZE; 7712 continue; 7713 } 7714 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 7715 sysctl_kmaps_check(sb, &range, sva, 7716 l0e, l1e, l2e, 0); 7717 range.l2blocks++; 7718 sva += L2_SIZE; 7719 continue; 7720 } 7721 pa = l2e & ~ATTR_MASK; 7722 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 7723 7724 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 7725 l++, sva += L3_SIZE) { 7726 l3e = l3[l]; 7727 if ((l3e & ATTR_DESCR_VALID) == 0) { 7728 sysctl_kmaps_dump(sb, &range, 7729 sva); 7730 continue; 7731 } 7732 sysctl_kmaps_check(sb, &range, sva, 7733 l0e, l1e, l2e, l3e); 7734 if ((l3e & ATTR_CONTIGUOUS) != 0) 7735 range.l3contig += l % 16 == 0 ? 7736 1 : 0; 7737 else 7738 range.l3pages++; 7739 } 7740 } 7741 } 7742 } 7743 7744 error = sbuf_finish(sb); 7745 sbuf_delete(sb); 7746 return (error); 7747 } 7748 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 7749 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 7750 NULL, 0, sysctl_kmaps, "A", 7751 "Dump kernel address layout"); 7752