/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014-2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * This software was developed by Andrew Turner under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>

#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

static struct md_page *
pa_to_pvh(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return ((struct md_page *)seg->md_first +
			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
	}
	panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
}

static struct md_page *
page_to_pvh(vm_page_t m)
{
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return ((struct md_page *)seg->md_first +
	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
}

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
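
/*
 * A typical consumer of the macros above starts with no lock held and lets
 * CHANGE_PV_LIST_LOCK_TO_VM_PAGE() acquire, and switch between, the
 * per-bucket locks as it visits pages, e.g.:
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's pv list ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 *
 * Because the bucket is chosen by hashing the physical address, two pages
 * may share a lock; the macros only drop and reacquire a lock when the
 * bucket actually changes.
 */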

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;	/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Data for the pv entry allocation mechanism.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS);

extern pt_entry_t pagetable_l0_ttbr1[];

#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
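
/*
 * For example, COOKIE_FROM(5, 2) yields 0x0000000200000005: the ASID sits
 * in bits [31:0] and the epoch in bits [63:32], so COOKIE_TO_ASID()
 * recovers 5 and COOKIE_TO_EPOCH() recovers 2.  COOKIE_FROM(-1, INT_MIN)
 * yields 0x80000000ffffffff, from which COOKIE_TO_ASID() recovers -1, the
 * invalid ASID, and COOKIE_TO_EPOCH() recovers INT_MIN.
 */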

#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
#define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
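
/*
 * For example, pmap_load_clear() must be a single atomic swap: if the
 * entry were read and then zeroed in two steps, the System MMU could set
 * the access or dirty state in the entry between the two steps and that
 * update would be lost.  The swap returns the entry exactly as it was at
 * the instant it was cleared.
 */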

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
		return (NULL);

	return (pmap_l0_to_l1(l0, va));
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
	return (&l2p[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
	return (&l3p[pmap_l3_index(va)]);
}
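
/*
 * With 4KB pages and 48-bit virtual addresses, the walk helpers above
 * decode a virtual address as:
 *
 *	bits [47:39]	L0 index	(512GB per entry)
 *	bits [38:30]	L1 index	(1GB per entry or block)
 *	bits [29:21]	L2 index	(2MB per entry or block)
 *	bits [20:12]	L3 index	(4KB per page)
 *	bits [11:0]	offset within the page
 */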

/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
 */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l0, *l1, *l2, desc;

	l0 = pmap_l0(pmap, va);
	desc = pmap_load(l0) & ATTR_DESCR_MASK;
	if (desc != L0_TABLE) {
		*level = -1;
		return (NULL);
	}

	l1 = pmap_l0_to_l1(l0, va);
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc != L1_TABLE) {
		*level = 0;
		return (l0);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc != L2_TABLE) {
		*level = 1;
		return (l1);
	}

	*level = 2;
	return (l2);
}

/*
 * Returns the lowest valid pte block or table entry for a given virtual
 * address.  If there are no valid entries return NULL and set the level to
 * the first invalid level.
 */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l1, *l2, desc;
	pt_entry_t *l3;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL) {
		*level = 0;
		return (NULL);
	}
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc == L1_BLOCK) {
		*level = 1;
		return (l1);
	}

	if (desc != L1_TABLE) {
		*level = 1;
		return (NULL);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc == L2_BLOCK) {
		*level = 2;
		return (l2);
	}

	if (desc != L2_TABLE) {
		*level = 2;
		return (NULL);
	}

	*level = 3;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
		return (NULL);

	return (l3);
}
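
/*
 * For example, for a virtual address covered by a 2MB mapping, pmap_pte()
 * returns the L2_BLOCK entry with *level set to 2, whereas for an ordinary
 * 4KB mapping it returns the L3_PAGE entry with *level set to 3.  Callers
 * must consult *level to know how much of the virtual address is an offset
 * into the mapping.
 */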

/*
 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
 * level that maps the specified virtual address, then a pointer to that entry
 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
 * and a diagnostic message is provided, in which case this function panics.
 */
static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
{
	pd_entry_t *l0p, *l1p, *l2p;
	pt_entry_t desc, *l3p;
	int walk_level __diagused;

	KASSERT(level >= 0 && level < 4,
	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
	    level));
	l0p = pmap_l0(pmap, va);
	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
	if (desc == L0_TABLE && level > 0) {
		l1p = pmap_l0_to_l1(l0p, va);
		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
		if (desc == L1_BLOCK && level == 1)
			return (l1p);
		else if (desc == L1_TABLE && level > 1) {
			l2p = pmap_l1_to_l2(l1p, va);
			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
			if (desc == L2_BLOCK && level == 2)
				return (l2p);
			else if (desc == L2_TABLE && level > 2) {
				l3p = pmap_l2_to_l3(l2p, va);
				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
				if (desc == L3_PAGE && level == 3)
					return (l3p);
				else
					walk_level = 3;
			} else
				walk_level = 2;
		} else
			walk_level = 1;
	} else
		walk_level = 0;
	KASSERT(diag == NULL,
	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
	    diag, va, level, desc, walk_level));
	return (NULL);
}

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled != 0);
}

bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
    pd_entry_t **l2, pt_entry_t **l3)
{
	pd_entry_t *l0p, *l1p, *l2p;

	if (pmap->pm_l0 == NULL)
		return (false);

	l0p = pmap_l0(pmap, va);
	*l0 = l0p;

	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
		return (false);

	l1p = pmap_l0_to_l1(l0p, va);
	*l1 = l1p;

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
		return (false);

	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
		return (false);

	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}

static __inline int
pmap_l3_valid(pt_entry_t l3)
{

	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}

CTASSERT(L1_BLOCK == L2_BLOCK);

static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
{
	pt_entry_t val;

	if (pmap->pm_stage == PM_STAGE1) {
		val = ATTR_S1_IDX(memattr);
		if (memattr == VM_MEMATTR_DEVICE)
			val |= ATTR_S1_XN;
		return (val);
	}

	val = 0;

	switch (memattr) {
	case VM_MEMATTR_DEVICE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
		    ATTR_S2_XN(ATTR_S2_XN_ALL));
	case VM_MEMATTR_UNCACHEABLE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
	case VM_MEMATTR_WRITE_BACK:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
	case VM_MEMATTR_WRITE_THROUGH:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
	default:
		panic("%s: invalid memory attribute %x", __func__, memattr);
	}
}

static pt_entry_t
pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
{
	pt_entry_t val;

	val = 0;
	if (pmap->pm_stage == PM_STAGE1) {
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S1_XN;
		if ((prot & VM_PROT_WRITE) == 0)
			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
	} else {
		if ((prot & VM_PROT_WRITE) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
		if ((prot & VM_PROT_READ) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
	}

	return (val);
}

/*
 * Checks if the PTE is dirty.
 */
static inline int
pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
{

	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));

	if (pmap->pm_stage == PM_STAGE1) {
		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));

		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
	}

	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)
{
	vm_paddr_t pa_page;

	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
	return (pa_page | (va & PAR_LOW_MASK));
}

/* State of the bootstrapped DMAP page tables */
struct dmap_bootstrap_state {
	vm_offset_t	va;
	vm_paddr_t	pa;
	pt_entry_t	*l1;
	pt_entry_t	*l2;
	pt_entry_t	*l3;
	u_int		l0_slot;
	u_int		l1_slot;
	u_int		l2_slot;
	vm_offset_t	freemempos;
};

static void
pmap_bootstrap_dmap_l0_table(struct dmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES);

		/* Create a new L0 table entry */
		state->l0_slot = l0_slot;
		state->l1 = (pt_entry_t *)state->freemempos;
		memset(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], l1_pa |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}

static void
pmap_bootstrap_dmap_l1_table(struct dmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_dmap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES);

		/* Create a new L1 table entry */
		state->l1_slot = l1_slot;
		state->l2 = (pt_entry_t *)state->freemempos;
		memset(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], l2_pa | TATTR_PXN_TABLE |
		    L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}

static void
pmap_bootstrap_dmap_l2_table(struct dmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_dmap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES);

		/* Create a new L2 table entry */
		state->l2_slot = l2_slot;
		state->l3 = (pt_entry_t *)state->freemempos;
		memset(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], l3_pa | TATTR_PXN_TABLE |
		    L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}

static void
pmap_bootstrap_dmap_l2_block(struct dmap_bootstrap_state *state, int i)
{
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_dmap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], state->pa | ATTR_DEFAULT |
		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
		    L2_BLOCK);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}
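
/*
 * The DMAP is built from the largest blocks that fit: 1GB L1 blocks for
 * the bulk of a region, with 2MB L2 blocks and 4KB L3 pages filling in the
 * unaligned head and tail.  For example, a physical region from 3MB to
 * 2053MB is mapped with 4KB pages from 3MB to 4MB, 2MB blocks from 4MB to
 * 1024MB, a 1GB block from 1024MB to 2048MB, 2MB blocks from 2048MB to
 * 2052MB, and 4KB pages for the final 1MB.
 */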

static void
pmap_bootstrap_dmap_l3_page(struct dmap_bootstrap_state *state, int i)
{
	u_int l3_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_dmap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], state->pa | ATTR_DEFAULT |
		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
		    L3_PAGE);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static vm_offset_t
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
    vm_offset_t freemempos)
{
	struct dmap_bootstrap_state state;
	int i;

	dmap_phys_base = min_pa & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	state.l1 = state.l2 = state.l3 = NULL;
	state.l0_slot = L0_ENTRIES;
	state.l1_slot = Ln_ENTRIES;
	state.l2_slot = Ln_ENTRIES;
	state.freemempos = freemempos;

	for (i = 0; i < (physmap_idx * 2); i += 2) {
		state.pa = physmap[i] & ~L3_OFFSET;
		state.va = state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_dmap_l3_page(&state, i);
		MPASS(state.pa <= physmap[i + 1]);

		/* Create L2 mappings at the start of the region */
		if ((state.pa & L1_OFFSET) != 0)
			pmap_bootstrap_dmap_l2_block(&state, i);
		MPASS(state.pa <= physmap[i + 1]);

		/* Create the main L1 block mappings */
		for (; state.va < DMAP_MAX_ADDRESS &&
		    (physmap[i + 1] - state.pa) >= L1_SIZE;
		    state.va += L1_SIZE, state.pa += L1_SIZE) {
			/* Make sure there is a valid L1 table */
			pmap_bootstrap_dmap_l0_table(&state);
			MPASS((state.pa & L1_OFFSET) == 0);
			pmap_store(&state.l1[pmap_l1_index(state.va)],
			    state.pa | ATTR_DEFAULT | ATTR_S1_XN |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
			    L1_BLOCK);
		}
		MPASS(state.pa <= physmap[i + 1]);

		/* Create L2 mappings at the end of the region */
		pmap_bootstrap_dmap_l2_block(&state, i);
		MPASS(state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_dmap_l3_page(&state, i);
		MPASS(state.pa == physmap[i + 1]);

		if (state.pa > dmap_phys_max) {
			dmap_phys_max = state.pa;
			dmap_max_addr = state.va;
		}
	}

	cpu_tlb_flushID();

	return (state.freemempos);
}

static vm_offset_t
pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
{
	vm_offset_t l2pt;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;

	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	l1 = (pd_entry_t *)l1pt;
	l1_slot = pmap_l1_index(va);
	l2pt = l2_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		pa = pmap_early_vtophys(l2pt);
		pmap_store(&l1[l1_slot],
		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
		l2pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l2_start, 0, l2pt - l2_start);

	return l2pt;
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	vm_paddr_t pa;
	pd_entry_t *l2;
	u_int l2_slot;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l3pt);
		pmap_store(&l2[l2_slot],
		    (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return l3pt;
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
    vm_size_t kernlen)
{
	vm_offset_t freemempos;
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa, min_pa;
	uint64_t kern_delta;
	int i;

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	kern_delta = KERNBASE - kernstart;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
	printf("%lx\n", l1pt);
	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_l0_paddr = l0pt - kern_delta;
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	/* Assume the address we were loaded to is a valid physical address */
	min_pa = KERNBASE - kern_delta;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
	}

	freemempos = KERNBASE + kernlen;
	freemempos = roundup2(freemempos, PAGE_SIZE);

	/* Create a direct map region early so we can use it for pa -> va */
	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);

	start_pa = pa = KERNBASE - kern_delta;

	/*
	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that
	 * the loader allocated the first and only l2 page table page used to
	 * map the kernel, preloaded files and module metadata.
	 */
	freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos);
	/* And the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);

	cpu_tlb_flushID();

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);

	cpu_tlb_flushID();
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

static void
pmap_init_asids(struct asid_set *set, int bits)
{
	int i;

	set->asid_bits = bits;

	/*
	 * We may be too early in the overall initialization process to use
	 * bit_alloc().
	 */
	set->asid_set_size = 1 << set->asid_bits;
	set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size),
	    M_WAITOK | M_ZERO);
	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
		bit_set(set->asid_set, i);
	set->asid_next = ASID_FIRST_AVAILABLE;
	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct md_page *pvh;
	vm_size_t s;
	uint64_t mmfr1;
	int i, pv_npg, vmid_bits;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
	if (superpages_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = L2_SIZE;
		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
		    ("pmap_init: can't assign to pagesizes[2]"));
		pagesizes[2] = L1_SIZE;
	}

	/*
	 * Initialize the ASID allocator.
	 */
	pmap_init_asids(&asids,
	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);

	if (has_hyp()) {
		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
		vmid_bits = 8;

		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
		    ID_AA64MMFR1_VMIDBits_16)
			vmid_bits = 16;
		pmap_init_asids(&vmids, vmid_bits);
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
	}

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvh;
		pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvh--;
			}
		}
	}

	vm_initialized = 1;
}
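
/*
 * For example, segments spanning [0MB, 3MB) and [3MB, 8MB) round up to two
 * and three 2MB superpage slots respectively, but the slot covering
 * [2MB, 4MB) straddles both segments, so the loop above backs the second
 * segment's md_first up by one entry and the two segments share that slot.
 */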

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0, "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0, "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0, "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0, "2MB page promotions");

/*
 * If the given value for "final_only" is false, then any cached intermediate-
 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
 * Otherwise, just the cached final-level entry is invalidated.
 */
static __inline void
pmap_invalidate_kernel(uint64_t r, bool final_only)
{
	if (final_only)
		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
	else
		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
}

static __inline void
pmap_invalidate_user(uint64_t r, bool final_only)
{
	if (final_only)
		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
	else
		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
}

/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address in the given virtual address space.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	r = TLBI_VA(va);
	if (pmap == kernel_pmap) {
		pmap_invalidate_kernel(r, final_only);
	} else {
		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		pmap_invalidate_user(r, final_only);
	}
	dsb(ish);
	isb();
}

/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address range in the given virtual address space.
 */
static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    bool final_only)
{
	uint64_t end, r, start;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		start = TLBI_VA(sva);
		end = TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA_L3_INCR)
			pmap_invalidate_kernel(r, final_only);
	} else {
		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		start |= TLBI_VA(sva);
		end |= TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA_L3_INCR)
			pmap_invalidate_user(r, final_only);
	}
	dsb(ish);
	isb();
}

/*
 * Invalidates all cached intermediate- and final-level TLB entries for the
 * given virtual address space.
 */
static __inline void
pmap_invalidate_all(pmap_t pmap)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		__asm __volatile("tlbi vmalle1is");
	} else {
		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
	}
	dsb(ish);
	isb();
}
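
/*
 * Note that pmap_invalidate_range() issues one TLBI per 4KB page:
 * TLBI_VA_L3_INCR is L3_SIZE >> TLBI_VA_SHIFT, i.e., one step per page.
 * Invalidating a very large range this way can therefore be more expensive
 * than the single "tlbi vmalle1is" or "tlbi aside1is" that
 * pmap_invalidate_all() uses to flush the entire address space.
 */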

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte, tpte;
	vm_paddr_t pa;
	int lvl;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Find the block or page map for this virtual address. pmap_pte
	 * will return either a valid block/page entry, or NULL.
	 */
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);
		pa = tpte & ~ATTR_MASK;
		switch(lvl) {
		case 1:
			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
			    ("pmap_extract: Invalid L1 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L1_OFFSET);
			break;
		case 2:
			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
			    ("pmap_extract: Invalid L2 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L2_OFFSET);
			break;
		case 3:
			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
			    ("pmap_extract: Invalid L3 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L3_OFFSET);
			break;
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *pte, tpte;
	vm_offset_t off;
	vm_page_t m;
	int lvl;
	bool use;

	m = NULL;
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);

		KASSERT(lvl > 0 && lvl <= 3,
		    ("pmap_extract_and_hold: Invalid level %d", lvl));
		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
		    tpte & ATTR_DESCR_MASK));

		use = false;
		if ((prot & VM_PROT_WRITE) == 0)
			use = true;
		else if (pmap->pm_stage == PM_STAGE1 &&
		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
			use = true;
		else if (pmap->pm_stage == PM_STAGE2 &&
		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
			use = true;

		if (use) {
			switch (lvl) {
			case 1:
				off = va & L1_OFFSET;
				break;
			case 2:
				off = va & L2_OFFSET;
				break;
			case 3:
			default:
				off = 0;
			}
			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
			if (m != NULL && !vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}
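
/*
 * For example, a VM_PROT_READ lookup succeeds against any mapping, while a
 * VM_PROT_WRITE lookup only wires the page when the stage 1 mapping is
 * read/write (or the stage 2 S2AP field permits writes), so a caller
 * cannot obtain a held page through a protection the mapping does not
 * grant.
 */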

/*
 * Walks the page tables to translate a kernel virtual address to a
 * physical address. Returns true if the kva is valid and stores the
 * physical address in pa if it is not NULL.
 */
bool
pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
{
	pt_entry_t *pte, tpte;
	register_t intr;
	uint64_t par;

	/*
	 * Disable interrupts so we don't get interrupted between asking
	 * for address translation, and getting the result back.
	 */
	intr = intr_disable();
	par = arm64_address_translate_s1e1r(va);
	intr_restore(intr);

	if (PAR_SUCCESS(par)) {
		if (pa != NULL)
			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
		return (true);
	}

	/*
	 * Fall back to walking the page table. The address translation
	 * instruction may fail when the page is in a break-before-make
	 * sequence. As we only clear the valid bit in said sequence we
	 * can walk the page table to find the physical address.
	 */

	pte = pmap_l1(kernel_pmap, va);
	if (pte == NULL)
		return (false);

	/*
	 * A concurrent pmap_update_entry() will clear the entry's valid bit
	 * but leave the rest of the entry unchanged.  Therefore, we treat a
	 * non-zero entry as being valid, and we ignore the valid bit when
	 * determining whether the entry maps a block, page, or table.
	 */
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
		return (true);
	}
	pte = pmap_l1_to_l2(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
		return (true);
	}
	pte = pmap_l2_to_l3(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if (pa != NULL)
		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
	return (true);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return (DMAP_TO_PHYS(va));

	if (pmap_klookup(va, &pa) == false)
		return (0);
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
	pd_entry_t *pde;
	pt_entry_t *pte, attr;
	vm_offset_t va;
	int lvl;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter: Mapping is not page-sized"));

	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
	    ATTR_S1_IDX(mode) | L3_PAGE;
	va = sva;
	while (size != 0) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));

		pte = pmap_l2_to_l3(pde, va);
		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va, true);
}
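
/*
 * For example, mapping a 16KB device register window at a (hypothetical)
 * physical address 0x40000000 installs four L3 pages and then issues a
 * single ranged invalidation:
 *
 *	pmap_kenter(va, 4 * PAGE_SIZE, 0x40000000, VM_MEMATTR_DEVICE);
 *
 * which is what pmap_kenter_device() below does on the caller's behalf.
 */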

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{

	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}

/*
 * Remove a page from the kernel pagetables.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
	pmap_clear(pte);
	pmap_invalidate_page(kernel_pmap, va, true);
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *pte;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
		pmap_clear(pte);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va, true);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping. Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged. Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pd_entry_t *pde;
	pt_entry_t *pte, pa;
	vm_offset_t va;
	vm_page_t m;
	int i, lvl;

	va = sva;
	for (i = 0; i < count; i++) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2,
		    ("pmap_qenter: Invalid level %d", lvl));

		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
		pte = pmap_l2_to_l3(pde, va);
		pmap_load_store(pte, pa);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va, true);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *pte;
	vm_offset_t va;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));

	va = sva;
	while (count-- > 0) {
		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
		if (pte != NULL) {
			pmap_clear(pte);
		}

		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va, true);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
1839  */
1840 static inline boolean_t
1841 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1842 {
1843
1844 	--m->ref_count;
1845 	if (m->ref_count == 0) {
1846 		_pmap_unwire_l3(pmap, va, m, free);
1847 		return (TRUE);
1848 	} else
1849 		return (FALSE);
1850 }
1851
1852 static void
1853 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1854 {
1855
1856 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1857 	/*
1858 	 * Unmap the page table page.
1859 	 */
1860 	if (m->pindex >= (NUL2E + NUL1E)) {
1861 		/* l1 page */
1862 		pd_entry_t *l0;
1863
1864 		l0 = pmap_l0(pmap, va);
1865 		pmap_clear(l0);
1866 	} else if (m->pindex >= NUL2E) {
1867 		/* l2 page */
1868 		pd_entry_t *l1;
1869
1870 		l1 = pmap_l1(pmap, va);
1871 		pmap_clear(l1);
1872 	} else {
1873 		/* l3 page */
1874 		pd_entry_t *l2;
1875
1876 		l2 = pmap_l2(pmap, va);
1877 		pmap_clear(l2);
1878 	}
1879 	pmap_resident_count_dec(pmap, 1);
1880 	if (m->pindex < NUL2E) {
1881 		/* We just released an l3, unhold the matching l2 */
1882 		pd_entry_t *l1, tl1;
1883 		vm_page_t l2pg;
1884
1885 		l1 = pmap_l1(pmap, va);
1886 		tl1 = pmap_load(l1);
1887 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1888 		pmap_unwire_l3(pmap, va, l2pg, free);
1889 	} else if (m->pindex < (NUL2E + NUL1E)) {
1890 		/* We just released an l2, unhold the matching l1 */
1891 		pd_entry_t *l0, tl0;
1892 		vm_page_t l1pg;
1893
1894 		l0 = pmap_l0(pmap, va);
1895 		tl0 = pmap_load(l0);
1896 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1897 		pmap_unwire_l3(pmap, va, l1pg, free);
1898 	}
1899 	pmap_invalidate_page(pmap, va, false);
1900
1901 	/*
1902 	 * Put the page on a list so that it is released only after
1903 	 * *ALL* TLB shootdowns are done.
1904 	 */
1905 	pmap_add_delayed_free_list(m, free, TRUE);
1906 }
1907
1908 /*
1909  * After removing a page table entry, this routine is used to
1910  * conditionally free the page, and manage the reference count.
1911  */
1912 static int
1913 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1914     struct spglist *free)
1915 {
1916 	vm_page_t mpte;
1917
1918 	KASSERT(ADDR_IS_CANONICAL(va),
1919 	    ("%s: Address not in canonical form: %lx", __func__, va));
1920 	if (ADDR_IS_KERNEL(va))
1921 		return (0);
1922 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1923 	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1924 	return (pmap_unwire_l3(pmap, va, mpte, free));
1925 }
1926
1927 /*
1928  * Release a page table page reference after a failed attempt to create a
1929  * mapping.
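 *
 * This drops the reference taken when the page table page was allocated
 * for the attempted mapping; if that was the last reference, the
 * now-unused page is freed back to the physical memory manager.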
1930  */
1931 static void
1932 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1933 {
1934 	struct spglist free;
1935
1936 	SLIST_INIT(&free);
1937 	if (pmap_unwire_l3(pmap, va, mpte, &free))
1938 		vm_page_free_pages_toq(&free, true);
1939 }
1940
1941 void
1942 pmap_pinit0(pmap_t pmap)
1943 {
1944
1945 	PMAP_LOCK_INIT(pmap);
1946 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1947 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
1948 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1949 	vm_radix_init(&pmap->pm_root);
1950 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
1951 	pmap->pm_stage = PM_STAGE1;
1952 	pmap->pm_levels = 4;
1953 	pmap->pm_ttbr = pmap->pm_l0_paddr;
1954 	pmap->pm_asid_set = &asids;
1955
1956 	PCPU_SET(curpmap, pmap);
1957 }
1958
1959 int
1960 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
1961 {
1962 	vm_page_t m;
1963
1964 	/*
1965 	 * Allocate the l0 page.
1966 	 */
1967 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
1968 	    VM_ALLOC_ZERO);
1969 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
1970 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1971
1972 	vm_radix_init(&pmap->pm_root);
1973 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1974 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
1975
1976 	MPASS(levels == 3 || levels == 4);
1977 	pmap->pm_levels = levels;
1978 	pmap->pm_stage = stage;
1979 	switch (stage) {
1980 	case PM_STAGE1:
1981 		pmap->pm_asid_set = &asids;
1982 		break;
1983 	case PM_STAGE2:
1984 		pmap->pm_asid_set = &vmids;
1985 		break;
1986 	default:
1987 		panic("%s: Invalid pmap type %d", __func__, stage);
1988 		break;
1989 	}
1990
1991 	/* XXX Temporarily disable deferred ASID allocation. */
1992 	pmap_alloc_asid(pmap);
1993
1994 	/*
1995 	 * Allocate the level 1 table to use as the root. This will increase
1996 	 * the refcount on the level 1 page so it won't be removed until
1997 	 * pmap_release() is called.
1998 	 */
1999 	if (pmap->pm_levels == 3) {
2000 		PMAP_LOCK(pmap);
2001 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2002 		PMAP_UNLOCK(pmap);
2003 	}
2004 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2005
2006 	return (1);
2007 }
2008
2009 int
2010 pmap_pinit(pmap_t pmap)
2011 {
2012
2013 	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2014 }
2015
2016 /*
2017  * This routine is called if the desired page table page does not exist.
2018  *
2019  * If page table page allocation fails, this routine may sleep before
2020  * returning NULL. It sleeps only if a lock pointer was given.
2021  *
2022  * Note: If a page allocation fails at page table level two or three,
2023  * one or two pages may be held during the wait, only to be released
2024  * afterwards. This conservative approach avoids race conditions at
2025  * the cost of an occasional extraneous page allocation.
2026  */
2027 static vm_page_t
2028 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2029 {
2030 	vm_page_t m, l1pg, l2pg;
2031
2032 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2033
2034 	/*
2035 	 * Allocate a page table page.
2036 	 */
2037 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2038 		if (lockp != NULL) {
2039 			RELEASE_PV_LIST_LOCK(lockp);
2040 			PMAP_UNLOCK(pmap);
2041 			vm_wait(NULL);
2042 			PMAP_LOCK(pmap);
2043 		}
2044
2045 		/*
2046 		 * Indicate the need to retry. While waiting, the page table
2047 		 * page may have been allocated.
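		 * Callers that passed a lock pointer (e.g., pmap_alloc_l3())
		 * are expected to re-walk the page tables and retry, since
		 * the page may have appeared while we slept.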
2048 */ 2049 return (NULL); 2050 } 2051 m->pindex = ptepindex; 2052 2053 /* 2054 * Because of AArch64's weak memory consistency model, we must have a 2055 * barrier here to ensure that the stores for zeroing "m", whether by 2056 * pmap_zero_page() or an earlier function, are visible before adding 2057 * "m" to the page table. Otherwise, a page table walk by another 2058 * processor's MMU could see the mapping to "m" and a stale, non-zero 2059 * PTE within "m". 2060 */ 2061 dmb(ishst); 2062 2063 /* 2064 * Map the pagetable page into the process address space, if 2065 * it isn't already there. 2066 */ 2067 2068 if (ptepindex >= (NUL2E + NUL1E)) { 2069 pd_entry_t *l0p, l0e; 2070 vm_pindex_t l0index; 2071 2072 l0index = ptepindex - (NUL2E + NUL1E); 2073 l0p = &pmap->pm_l0[l0index]; 2074 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 2075 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 2076 l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; 2077 2078 /* 2079 * Mark all kernel memory as not accessible from userspace 2080 * and userspace memory as not executable from the kernel. 2081 * This has been done for the bootstrap L0 entries in 2082 * locore.S. 2083 */ 2084 if (pmap == kernel_pmap) 2085 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 2086 else 2087 l0e |= TATTR_PXN_TABLE; 2088 pmap_store(l0p, l0e); 2089 } else if (ptepindex >= NUL2E) { 2090 vm_pindex_t l0index, l1index; 2091 pd_entry_t *l0, *l1; 2092 pd_entry_t tl0; 2093 2094 l1index = ptepindex - NUL2E; 2095 l0index = l1index >> Ln_ENTRIES_SHIFT; 2096 2097 l0 = &pmap->pm_l0[l0index]; 2098 tl0 = pmap_load(l0); 2099 if (tl0 == 0) { 2100 /* recurse for allocating page dir */ 2101 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 2102 lockp) == NULL) { 2103 vm_page_unwire_noq(m); 2104 vm_page_free_zero(m); 2105 return (NULL); 2106 } 2107 } else { 2108 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 2109 l1pg->ref_count++; 2110 } 2111 2112 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); 2113 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 2114 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 2115 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 2116 pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); 2117 } else { 2118 vm_pindex_t l0index, l1index; 2119 pd_entry_t *l0, *l1, *l2; 2120 pd_entry_t tl0, tl1; 2121 2122 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 2123 l0index = l1index >> Ln_ENTRIES_SHIFT; 2124 2125 l0 = &pmap->pm_l0[l0index]; 2126 tl0 = pmap_load(l0); 2127 if (tl0 == 0) { 2128 /* recurse for allocating page dir */ 2129 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2130 lockp) == NULL) { 2131 vm_page_unwire_noq(m); 2132 vm_page_free_zero(m); 2133 return (NULL); 2134 } 2135 tl0 = pmap_load(l0); 2136 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 2137 l1 = &l1[l1index & Ln_ADDR_MASK]; 2138 } else { 2139 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 2140 l1 = &l1[l1index & Ln_ADDR_MASK]; 2141 tl1 = pmap_load(l1); 2142 if (tl1 == 0) { 2143 /* recurse for allocating page dir */ 2144 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2145 lockp) == NULL) { 2146 vm_page_unwire_noq(m); 2147 vm_page_free_zero(m); 2148 return (NULL); 2149 } 2150 } else { 2151 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 2152 l2pg->ref_count++; 2153 } 2154 } 2155 2156 l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); 2157 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 2158 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 2159 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 2160 pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); 2161 } 2162 2163 
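	/* The new page table page is itself counted as resident. */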
pmap_resident_count_inc(pmap, 1); 2164 2165 return (m); 2166 } 2167 2168 static pd_entry_t * 2169 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 2170 struct rwlock **lockp) 2171 { 2172 pd_entry_t *l1, *l2; 2173 vm_page_t l2pg; 2174 vm_pindex_t l2pindex; 2175 2176 KASSERT(ADDR_IS_CANONICAL(va), 2177 ("%s: Address not in canonical form: %lx", __func__, va)); 2178 2179 retry: 2180 l1 = pmap_l1(pmap, va); 2181 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 2182 l2 = pmap_l1_to_l2(l1, va); 2183 if (!ADDR_IS_KERNEL(va)) { 2184 /* Add a reference to the L2 page. */ 2185 l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); 2186 l2pg->ref_count++; 2187 } else 2188 l2pg = NULL; 2189 } else if (!ADDR_IS_KERNEL(va)) { 2190 /* Allocate a L2 page. */ 2191 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 2192 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 2193 if (l2pg == NULL) { 2194 if (lockp != NULL) 2195 goto retry; 2196 else 2197 return (NULL); 2198 } 2199 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2200 l2 = &l2[pmap_l2_index(va)]; 2201 } else 2202 panic("pmap_alloc_l2: missing page table page for va %#lx", 2203 va); 2204 *l2pgp = l2pg; 2205 return (l2); 2206 } 2207 2208 static vm_page_t 2209 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2210 { 2211 vm_pindex_t ptepindex; 2212 pd_entry_t *pde, tpde; 2213 #ifdef INVARIANTS 2214 pt_entry_t *pte; 2215 #endif 2216 vm_page_t m; 2217 int lvl; 2218 2219 /* 2220 * Calculate pagetable page index 2221 */ 2222 ptepindex = pmap_l2_pindex(va); 2223 retry: 2224 /* 2225 * Get the page directory entry 2226 */ 2227 pde = pmap_pde(pmap, va, &lvl); 2228 2229 /* 2230 * If the page table page is mapped, we just increment the hold count, 2231 * and activate it. If we get a level 2 pde it will point to a level 3 2232 * table. 2233 */ 2234 switch (lvl) { 2235 case -1: 2236 break; 2237 case 0: 2238 #ifdef INVARIANTS 2239 pte = pmap_l0_to_l1(pde, va); 2240 KASSERT(pmap_load(pte) == 0, 2241 ("pmap_alloc_l3: TODO: l0 superpages")); 2242 #endif 2243 break; 2244 case 1: 2245 #ifdef INVARIANTS 2246 pte = pmap_l1_to_l2(pde, va); 2247 KASSERT(pmap_load(pte) == 0, 2248 ("pmap_alloc_l3: TODO: l1 superpages")); 2249 #endif 2250 break; 2251 case 2: 2252 tpde = pmap_load(pde); 2253 if (tpde != 0) { 2254 m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); 2255 m->ref_count++; 2256 return (m); 2257 } 2258 break; 2259 default: 2260 panic("pmap_alloc_l3: Invalid level %d", lvl); 2261 } 2262 2263 /* 2264 * Here if the pte page isn't mapped, or if it has been deallocated. 2265 */ 2266 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2267 if (m == NULL && lockp != NULL) 2268 goto retry; 2269 2270 return (m); 2271 } 2272 2273 /*************************************************** 2274 * Pmap allocation/deallocation routines. 2275 ***************************************************/ 2276 2277 /* 2278 * Release any resources held by the given physical map. 2279 * Called when a pmap initialized by pmap_pinit is being released. 2280 * Should only be called if the map contains no valid mappings. 
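 *
 * For a 3-level (pm_levels == 3) stage 2 pmap, the extra L1 root page
 * taken in pmap_pinit_stage() is dropped first, before the common path
 * frees the L0 page and allows the pmap's ASID or VMID to be reused.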
2281  */
2282 void
2283 pmap_release(pmap_t pmap)
2284 {
2285 	boolean_t rv __diagused;
2286 	struct spglist free;
2287 	struct asid_set *set;
2288 	vm_page_t m;
2289 	int asid;
2290
2291 	if (pmap->pm_levels != 4) {
2292 		PMAP_ASSERT_STAGE2(pmap);
2293 		KASSERT(pmap->pm_stats.resident_count == 1,
2294 		    ("pmap_release: pmap resident count %ld != 1",
2295 		    pmap->pm_stats.resident_count));
2296 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2297 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2298
2299 		SLIST_INIT(&free);
2300 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2301 		PMAP_LOCK(pmap);
2302 		rv = pmap_unwire_l3(pmap, 0, m, &free);
2303 		PMAP_UNLOCK(pmap);
2304 		MPASS(rv == TRUE);
2305 		vm_page_free_pages_toq(&free, true);
2306 	}
2307
2308 	KASSERT(pmap->pm_stats.resident_count == 0,
2309 	    ("pmap_release: pmap resident count %ld != 0",
2310 	    pmap->pm_stats.resident_count));
2311 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2312 	    ("pmap_release: pmap has reserved page table page(s)"));
2313
2314 	set = pmap->pm_asid_set;
2315 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2316
2317 	/*
2318 	 * Allow the ASID to be reused. For stage 2 VMIDs we don't invalidate
2319 	 * the TLB entries when removing them, so we rely on a later TLB
2320 	 * invalidation; this happens when the VMID generation is updated.
2321 	 * Because of this we don't reuse VMIDs within a generation.
2322 	 */
2323 	if (pmap->pm_stage == PM_STAGE1) {
2324 		mtx_lock_spin(&set->asid_set_mutex);
2325 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2326 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2327 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2328 			    asid < set->asid_set_size,
2329 			    ("pmap_release: pmap cookie has out-of-range asid"));
2330 			bit_clear(set->asid_set, asid);
2331 		}
2332 		mtx_unlock_spin(&set->asid_set_mutex);
2333 	}
2334
2335 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2336 	vm_page_unwire_noq(m);
2337 	vm_page_free_zero(m);
2338 }
2339
2340 static int
2341 kvm_size(SYSCTL_HANDLER_ARGS)
2342 {
2343 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2344
2345 	return sysctl_handle_long(oidp, &ksize, 0, req);
2346 }
2347 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2348     0, 0, kvm_size, "LU",
2349     "Size of KVM");
2350
2351 static int
2352 kvm_free(SYSCTL_HANDLER_ARGS)
2353 {
2354 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2355
2356 	return sysctl_handle_long(oidp, &kfree, 0, req);
2357 }
2358 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2359     0, 0, kvm_free, "LU",
2360     "Amount of KVM free");
2361
2362 /*
2363  * Grow the number of kernel page table entries, if needed.
2364  */
2365 void
2366 pmap_growkernel(vm_offset_t addr)
2367 {
2368 	vm_paddr_t paddr;
2369 	vm_page_t nkpg;
2370 	pd_entry_t *l0, *l1, *l2;
2371
2372 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2373
2374 	addr = roundup2(addr, L2_SIZE);
2375 	if (addr - 1 >= vm_map_max(kernel_map))
2376 		addr = vm_map_max(kernel_map);
2377 	while (kernel_vm_end < addr) {
2378 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2379 		KASSERT(pmap_load(l0) != 0,
2380 		    ("pmap_growkernel: No level 0 kernel entry"));
2381
2382 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2383 		if (pmap_load(l1) == 0) {
2384 			/* We need a new L2 page table */
2385 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2386 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2387 			if (nkpg == NULL)
2388 				panic("pmap_growkernel: no memory to grow kernel");
2389 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2390 			/* See the dmb() in _pmap_alloc_l3().
 */
2391 			dmb(ishst);
2392 			paddr = VM_PAGE_TO_PHYS(nkpg);
2393 			pmap_store(l1, paddr | L1_TABLE);
2394 			continue; /* try again */
2395 		}
2396 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2397 		if (pmap_load(l2) != 0) {
2398 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2399 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2400 				kernel_vm_end = vm_map_max(kernel_map);
2401 				break;
2402 			}
2403 			continue;
2404 		}
2405
2406 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2407 		    VM_ALLOC_ZERO);
2408 		if (nkpg == NULL)
2409 			panic("pmap_growkernel: no memory to grow kernel");
2410 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2411 		/* See the dmb() in _pmap_alloc_l3(). */
2412 		dmb(ishst);
2413 		paddr = VM_PAGE_TO_PHYS(nkpg);
2414 		pmap_store(l2, paddr | L2_TABLE);
2415
2416 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2417 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2418 			kernel_vm_end = vm_map_max(kernel_map);
2419 			break;
2420 		}
2421 	}
2422 }
2423
2424 /***************************************************
2425  * Page management routines.
2426  ***************************************************/
2427
2428 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2429 CTASSERT(_NPCM == 3);
2430 CTASSERT(_NPCPV == 168);
2431
2432 static __inline struct pv_chunk *
2433 pv_to_chunk(pv_entry_t pv)
2434 {
2435
2436 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2437 }
2438
2439 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2440
2441 #define PC_FREE0 0xfffffffffffffffful
2442 #define PC_FREE1 0xfffffffffffffffful
2443 #define PC_FREE2 0x000000fffffffffful
2444
2445 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2446
2447 #ifdef PV_STATS
2448 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2449
2450 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2451     "Current number of pv entry chunks");
2452 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2453     "Current number of pv entry chunks allocated");
2454 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2455     "Current number of pv entry chunk frees");
2456 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2457     "Number of times we tried to get a chunk page but failed");
2458
2459 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2460 static int pv_entry_spare;
2461
2462 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2463     "Current number of pv entry frees");
2464 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2465     "Current number of pv entry allocs");
2466 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2467     "Current number of pv entries");
2468 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2469     "Current number of spare pv entries");
2470 #endif
2471
2472 /*
2473  * We are in a serious low memory condition. Resort to
2474  * drastic measures to free some pages so we can allocate
2475  * another pv entry chunk.
2476  *
2477  * Returns NULL if PV entries were reclaimed from the specified pmap.
2478  *
2479  * We do not, however, unmap 2mpages because subsequent accesses will
2480  * allocate per-page pv entries until repromotion occurs, thereby
2481  * exacerbating the shortage of free pv entries.
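 *
 * A sketch of the scan, assuming the marker scheme below: two marker
 * chunks bracket this invocation's traversal of the global pc_lru list,
 * so concurrent reclaimers neither rescan each other's chunks nor
 * rotate the list out from under one another.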
2482 */ 2483 static vm_page_t 2484 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2485 { 2486 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2487 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2488 struct md_page *pvh; 2489 pd_entry_t *pde; 2490 pmap_t next_pmap, pmap; 2491 pt_entry_t *pte, tpte; 2492 pv_entry_t pv; 2493 vm_offset_t va; 2494 vm_page_t m, m_pc; 2495 struct spglist free; 2496 uint64_t inuse; 2497 int bit, field, freed, lvl; 2498 static int active_reclaims = 0; 2499 2500 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2501 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2502 2503 pmap = NULL; 2504 m_pc = NULL; 2505 SLIST_INIT(&free); 2506 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2507 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2508 pc_marker = (struct pv_chunk *)&pc_marker_b; 2509 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2510 2511 mtx_lock(&pv_chunks_mutex); 2512 active_reclaims++; 2513 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 2514 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 2515 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2516 SLIST_EMPTY(&free)) { 2517 next_pmap = pc->pc_pmap; 2518 if (next_pmap == NULL) { 2519 /* 2520 * The next chunk is a marker. However, it is 2521 * not our marker, so active_reclaims must be 2522 * > 1. Consequently, the next_chunk code 2523 * will not rotate the pv_chunks list. 2524 */ 2525 goto next_chunk; 2526 } 2527 mtx_unlock(&pv_chunks_mutex); 2528 2529 /* 2530 * A pv_chunk can only be removed from the pc_lru list 2531 * when both pv_chunks_mutex is owned and the 2532 * corresponding pmap is locked. 2533 */ 2534 if (pmap != next_pmap) { 2535 if (pmap != NULL && pmap != locked_pmap) 2536 PMAP_UNLOCK(pmap); 2537 pmap = next_pmap; 2538 /* Avoid deadlock and lock recursion. */ 2539 if (pmap > locked_pmap) { 2540 RELEASE_PV_LIST_LOCK(lockp); 2541 PMAP_LOCK(pmap); 2542 mtx_lock(&pv_chunks_mutex); 2543 continue; 2544 } else if (pmap != locked_pmap) { 2545 if (PMAP_TRYLOCK(pmap)) { 2546 mtx_lock(&pv_chunks_mutex); 2547 continue; 2548 } else { 2549 pmap = NULL; /* pmap is not locked */ 2550 mtx_lock(&pv_chunks_mutex); 2551 pc = TAILQ_NEXT(pc_marker, pc_lru); 2552 if (pc == NULL || 2553 pc->pc_pmap != next_pmap) 2554 continue; 2555 goto next_chunk; 2556 } 2557 } 2558 } 2559 2560 /* 2561 * Destroy every non-wired, 4 KB page mapping in the chunk. 
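		 * Each pc_map[] word tracks 64 pv entries, a clear bit
		 * marking an entry in use; "~pc->pc_map[field] &
		 * pc_freemask[field]" thus yields the in-use entries,
		 * which ffsl() walks one bit at a time.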
2562 */ 2563 freed = 0; 2564 for (field = 0; field < _NPCM; field++) { 2565 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2566 inuse != 0; inuse &= ~(1UL << bit)) { 2567 bit = ffsl(inuse) - 1; 2568 pv = &pc->pc_pventry[field * 64 + bit]; 2569 va = pv->pv_va; 2570 pde = pmap_pde(pmap, va, &lvl); 2571 if (lvl != 2) 2572 continue; 2573 pte = pmap_l2_to_l3(pde, va); 2574 tpte = pmap_load(pte); 2575 if ((tpte & ATTR_SW_WIRED) != 0) 2576 continue; 2577 tpte = pmap_load_clear(pte); 2578 m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); 2579 if (pmap_pte_dirty(pmap, tpte)) 2580 vm_page_dirty(m); 2581 if ((tpte & ATTR_AF) != 0) { 2582 pmap_invalidate_page(pmap, va, true); 2583 vm_page_aflag_set(m, PGA_REFERENCED); 2584 } 2585 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2586 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2587 m->md.pv_gen++; 2588 if (TAILQ_EMPTY(&m->md.pv_list) && 2589 (m->flags & PG_FICTITIOUS) == 0) { 2590 pvh = page_to_pvh(m); 2591 if (TAILQ_EMPTY(&pvh->pv_list)) { 2592 vm_page_aflag_clear(m, 2593 PGA_WRITEABLE); 2594 } 2595 } 2596 pc->pc_map[field] |= 1UL << bit; 2597 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 2598 freed++; 2599 } 2600 } 2601 if (freed == 0) { 2602 mtx_lock(&pv_chunks_mutex); 2603 goto next_chunk; 2604 } 2605 /* Every freed mapping is for a 4 KB page. */ 2606 pmap_resident_count_dec(pmap, freed); 2607 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2608 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2609 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2610 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2611 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2612 pc->pc_map[2] == PC_FREE2) { 2613 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2614 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2615 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2616 /* Entire chunk is free; return it. */ 2617 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2618 dump_drop_page(m_pc->phys_addr); 2619 mtx_lock(&pv_chunks_mutex); 2620 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2621 break; 2622 } 2623 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2624 mtx_lock(&pv_chunks_mutex); 2625 /* One freed pv entry in locked_pmap is sufficient. */ 2626 if (pmap == locked_pmap) 2627 break; 2628 2629 next_chunk: 2630 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2631 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 2632 if (active_reclaims == 1 && pmap != NULL) { 2633 /* 2634 * Rotate the pv chunks list so that we do not 2635 * scan the same pv chunks that could not be 2636 * freed (because they contained a wired 2637 * and/or superpage mapping) on every 2638 * invocation of reclaim_pv_chunk(). 2639 */ 2640 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 2641 MPASS(pc->pc_pmap != NULL); 2642 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2643 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2644 } 2645 } 2646 } 2647 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2648 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 2649 active_reclaims--; 2650 mtx_unlock(&pv_chunks_mutex); 2651 if (pmap != NULL && pmap != locked_pmap) 2652 PMAP_UNLOCK(pmap); 2653 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2654 m_pc = SLIST_FIRST(&free); 2655 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2656 /* Recycle a freed page table page. 
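
		 * The page becomes the caller's new pv chunk page, so
		 * give it the single reference a freshly allocated
		 * wired page would have.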
*/ 2657 m_pc->ref_count = 1; 2658 } 2659 vm_page_free_pages_toq(&free, true); 2660 return (m_pc); 2661 } 2662 2663 /* 2664 * free the pv_entry back to the free list 2665 */ 2666 static void 2667 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2668 { 2669 struct pv_chunk *pc; 2670 int idx, field, bit; 2671 2672 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2673 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2674 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2675 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2676 pc = pv_to_chunk(pv); 2677 idx = pv - &pc->pc_pventry[0]; 2678 field = idx / 64; 2679 bit = idx % 64; 2680 pc->pc_map[field] |= 1ul << bit; 2681 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2682 pc->pc_map[2] != PC_FREE2) { 2683 /* 98% of the time, pc is already at the head of the list. */ 2684 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2685 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2686 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2687 } 2688 return; 2689 } 2690 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2691 free_pv_chunk(pc); 2692 } 2693 2694 static void 2695 free_pv_chunk(struct pv_chunk *pc) 2696 { 2697 vm_page_t m; 2698 2699 mtx_lock(&pv_chunks_mutex); 2700 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2701 mtx_unlock(&pv_chunks_mutex); 2702 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2703 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2704 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2705 /* entire chunk is free, return it */ 2706 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2707 dump_drop_page(m->phys_addr); 2708 vm_page_unwire_noq(m); 2709 vm_page_free(m); 2710 } 2711 2712 /* 2713 * Returns a new PV entry, allocating a new PV chunk from the system when 2714 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2715 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2716 * returned. 2717 * 2718 * The given PV list lock may be released. 
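 *
 * Chunk geometry, for reference: each chunk is a single PAGE_SIZE page
 * holding _NPCPV (168) pv entries, tracked by the three 64-bit pc_map
 * words; 168 = 64 + 64 + 40, which is why PC_FREE2 has only its low 40
 * bits set.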
2719 */ 2720 static pv_entry_t 2721 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2722 { 2723 int bit, field; 2724 pv_entry_t pv; 2725 struct pv_chunk *pc; 2726 vm_page_t m; 2727 2728 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2729 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2730 retry: 2731 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2732 if (pc != NULL) { 2733 for (field = 0; field < _NPCM; field++) { 2734 if (pc->pc_map[field]) { 2735 bit = ffsl(pc->pc_map[field]) - 1; 2736 break; 2737 } 2738 } 2739 if (field < _NPCM) { 2740 pv = &pc->pc_pventry[field * 64 + bit]; 2741 pc->pc_map[field] &= ~(1ul << bit); 2742 /* If this was the last item, move it to tail */ 2743 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2744 pc->pc_map[2] == 0) { 2745 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2746 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2747 pc_list); 2748 } 2749 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2750 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2751 return (pv); 2752 } 2753 } 2754 /* No free items, allocate another chunk */ 2755 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2756 if (m == NULL) { 2757 if (lockp == NULL) { 2758 PV_STAT(pc_chunk_tryfail++); 2759 return (NULL); 2760 } 2761 m = reclaim_pv_chunk(pmap, lockp); 2762 if (m == NULL) 2763 goto retry; 2764 } 2765 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2766 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2767 dump_add_page(m->phys_addr); 2768 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2769 pc->pc_pmap = pmap; 2770 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2771 pc->pc_map[1] = PC_FREE1; 2772 pc->pc_map[2] = PC_FREE2; 2773 mtx_lock(&pv_chunks_mutex); 2774 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2775 mtx_unlock(&pv_chunks_mutex); 2776 pv = &pc->pc_pventry[0]; 2777 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2778 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2779 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2780 return (pv); 2781 } 2782 2783 /* 2784 * Ensure that the number of spare PV entries in the specified pmap meets or 2785 * exceeds the given count, "needed". 2786 * 2787 * The given PV list lock may be released. 2788 */ 2789 static void 2790 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2791 { 2792 struct pch new_tail; 2793 struct pv_chunk *pc; 2794 vm_page_t m; 2795 int avail, free; 2796 bool reclaimed; 2797 2798 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2799 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2800 2801 /* 2802 * Newly allocated PV chunks must be stored in a private list until 2803 * the required number of PV chunks have been allocated. Otherwise, 2804 * reclaim_pv_chunk() could recycle one of these chunks. In 2805 * contrast, these chunks must be added to the pmap upon allocation. 
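	 * For example, demoting a managed 2MB mapping requires Ln_ENTRIES - 1
	 * spare pv entries, which pmap_pv_demote_l2() then consumes without
	 * allocating (note its "missing spare" assertion).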
2806 */ 2807 TAILQ_INIT(&new_tail); 2808 retry: 2809 avail = 0; 2810 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2811 bit_count((bitstr_t *)pc->pc_map, 0, 2812 sizeof(pc->pc_map) * NBBY, &free); 2813 if (free == 0) 2814 break; 2815 avail += free; 2816 if (avail >= needed) 2817 break; 2818 } 2819 for (reclaimed = false; avail < needed; avail += _NPCPV) { 2820 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2821 if (m == NULL) { 2822 m = reclaim_pv_chunk(pmap, lockp); 2823 if (m == NULL) 2824 goto retry; 2825 reclaimed = true; 2826 } 2827 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2828 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2829 dump_add_page(m->phys_addr); 2830 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2831 pc->pc_pmap = pmap; 2832 pc->pc_map[0] = PC_FREE0; 2833 pc->pc_map[1] = PC_FREE1; 2834 pc->pc_map[2] = PC_FREE2; 2835 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2836 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2837 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2838 2839 /* 2840 * The reclaim might have freed a chunk from the current pmap. 2841 * If that chunk contained available entries, we need to 2842 * re-count the number of available entries. 2843 */ 2844 if (reclaimed) 2845 goto retry; 2846 } 2847 if (!TAILQ_EMPTY(&new_tail)) { 2848 mtx_lock(&pv_chunks_mutex); 2849 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2850 mtx_unlock(&pv_chunks_mutex); 2851 } 2852 } 2853 2854 /* 2855 * First find and then remove the pv entry for the specified pmap and virtual 2856 * address from the specified pv list. Returns the pv entry if found and NULL 2857 * otherwise. This operation can be performed on pv lists for either 4KB or 2858 * 2MB page mappings. 2859 */ 2860 static __inline pv_entry_t 2861 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2862 { 2863 pv_entry_t pv; 2864 2865 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2866 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2867 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2868 pvh->pv_gen++; 2869 break; 2870 } 2871 } 2872 return (pv); 2873 } 2874 2875 /* 2876 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2877 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2878 * entries for each of the 4KB page mappings. 2879 */ 2880 static void 2881 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2882 struct rwlock **lockp) 2883 { 2884 struct md_page *pvh; 2885 struct pv_chunk *pc; 2886 pv_entry_t pv; 2887 vm_offset_t va_last; 2888 vm_page_t m; 2889 int bit, field; 2890 2891 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2892 KASSERT((va & L2_OFFSET) == 0, 2893 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 2894 KASSERT((pa & L2_OFFSET) == 0, 2895 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 2896 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2897 2898 /* 2899 * Transfer the 2mpage's pv entry for this mapping to the first 2900 * page's pv list. Once this transfer begins, the pv list lock 2901 * must not be released until the last pv entry is reinstantiated. 2902 */ 2903 pvh = pa_to_pvh(pa); 2904 pv = pmap_pvh_remove(pvh, pmap, va); 2905 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2906 m = PHYS_TO_VM_PAGE(pa); 2907 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2908 m->md.pv_gen++; 2909 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
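
	 * These come out of the pmap's spare pv entries, which the caller
	 * must have reserved beforehand (see reserve_pv_entries()); the
	 * KASSERT below fires if a spare is missing.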
*/ 2910 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 2911 va_last = va + L2_SIZE - PAGE_SIZE; 2912 for (;;) { 2913 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2914 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 2915 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 2916 for (field = 0; field < _NPCM; field++) { 2917 while (pc->pc_map[field]) { 2918 bit = ffsl(pc->pc_map[field]) - 1; 2919 pc->pc_map[field] &= ~(1ul << bit); 2920 pv = &pc->pc_pventry[field * 64 + bit]; 2921 va += PAGE_SIZE; 2922 pv->pv_va = va; 2923 m++; 2924 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2925 ("pmap_pv_demote_l2: page %p is not managed", m)); 2926 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2927 m->md.pv_gen++; 2928 if (va == va_last) 2929 goto out; 2930 } 2931 } 2932 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2933 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2934 } 2935 out: 2936 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 2937 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2938 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2939 } 2940 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 2941 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 2942 } 2943 2944 /* 2945 * First find and then destroy the pv entry for the specified pmap and virtual 2946 * address. This operation can be performed on pv lists for either 4KB or 2MB 2947 * page mappings. 2948 */ 2949 static void 2950 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2951 { 2952 pv_entry_t pv; 2953 2954 pv = pmap_pvh_remove(pvh, pmap, va); 2955 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2956 free_pv_entry(pmap, pv); 2957 } 2958 2959 /* 2960 * Conditionally create the PV entry for a 4KB page mapping if the required 2961 * memory can be allocated without resorting to reclamation. 2962 */ 2963 static boolean_t 2964 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2965 struct rwlock **lockp) 2966 { 2967 pv_entry_t pv; 2968 2969 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2970 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2971 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2972 pv->pv_va = va; 2973 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2974 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2975 m->md.pv_gen++; 2976 return (TRUE); 2977 } else 2978 return (FALSE); 2979 } 2980 2981 /* 2982 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2983 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2984 * false if the PV entry cannot be allocated without resorting to reclamation. 2985 */ 2986 static bool 2987 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2988 struct rwlock **lockp) 2989 { 2990 struct md_page *pvh; 2991 pv_entry_t pv; 2992 vm_paddr_t pa; 2993 2994 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2995 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2996 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
NULL : lockp)) == NULL)
2998 		return (false);
2999 	pv->pv_va = va;
3000 	pa = l2e & ~ATTR_MASK;
3001 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3002 	pvh = pa_to_pvh(pa);
3003 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3004 	pvh->pv_gen++;
3005 	return (true);
3006 }
3007
3008 static void
3009 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3010 {
3011 	pt_entry_t newl2, oldl2 __diagused;
3012 	vm_page_t ml3;
3013 	vm_paddr_t ml3pa;
3014
3015 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3016 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3017 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3018
3019 	ml3 = pmap_remove_pt_page(pmap, va);
3020 	if (ml3 == NULL)
3021 		panic("pmap_remove_kernel_l2: Missing pt page");
3022
3023 	ml3pa = VM_PAGE_TO_PHYS(ml3);
3024 	newl2 = ml3pa | L2_TABLE;
3025
3026 	/*
3027 	 * If this page table page was unmapped by a promotion, then it
3028 	 * contains valid mappings. Zero it to invalidate those mappings.
3029 	 */
3030 	if (ml3->valid != 0)
3031 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
3032
3033 	/*
3034 	 * Demote the mapping. The caller must have already invalidated the
3035 	 * mapping (i.e., the "break" in break-before-make).
3036 	 */
3037 	oldl2 = pmap_load_store(l2, newl2);
3038 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3039 	    __func__, l2, oldl2));
3040 }
3041
3042 /*
3043  * pmap_remove_l2: Unmap a level 2 superpage.
3044  */
3045 static int
3046 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3047     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3048 {
3049 	struct md_page *pvh;
3050 	pt_entry_t old_l2;
3051 	vm_page_t m, ml3, mt;
3052
3053 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3054 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3055 	old_l2 = pmap_load_clear(l2);
3056 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3057 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3058
3059 	/*
3060 	 * Since a promotion must break the 4KB page mappings before making
3061 	 * the 2MB page mapping, a pmap_invalidate_page() suffices.
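	 * That is, break-before-make guarantees that the TLB never holds
	 * 4KB and 2MB translations for this range at the same time, so
	 * invalidating only the 2MB entry is enough here.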
3062 	 */
3063 	pmap_invalidate_page(pmap, sva, true);
3064
3065 	if (old_l2 & ATTR_SW_WIRED)
3066 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3067 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3068 	if (old_l2 & ATTR_SW_MANAGED) {
3069 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3070 		pvh = page_to_pvh(m);
3071 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
3072 		pmap_pvh_free(pvh, pmap, sva);
3073 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3074 			if (pmap_pte_dirty(pmap, old_l2))
3075 				vm_page_dirty(mt);
3076 			if (old_l2 & ATTR_AF)
3077 				vm_page_aflag_set(mt, PGA_REFERENCED);
3078 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3079 			    TAILQ_EMPTY(&pvh->pv_list))
3080 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3081 		}
3082 	}
3083 	if (pmap == kernel_pmap) {
3084 		pmap_remove_kernel_l2(pmap, l2, sva);
3085 	} else {
3086 		ml3 = pmap_remove_pt_page(pmap, sva);
3087 		if (ml3 != NULL) {
3088 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
3089 			    ("pmap_remove_l2: l3 page not promoted"));
3090 			pmap_resident_count_dec(pmap, 1);
3091 			KASSERT(ml3->ref_count == NL3PG,
3092 			    ("pmap_remove_l2: l3 page ref count error"));
3093 			ml3->ref_count = 0;
3094 			pmap_add_delayed_free_list(ml3, free, FALSE);
3095 		}
3096 	}
3097 	return (pmap_unuse_pt(pmap, sva, l1e, free));
3098 }
3099
3100 /*
3101  * pmap_remove_l3: Unmap a single 4KB page from the given pmap.
3102  */
3103 static int
3104 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3105     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3106 {
3107 	struct md_page *pvh;
3108 	pt_entry_t old_l3;
3109 	vm_page_t m;
3110
3111 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3112 	old_l3 = pmap_load_clear(l3);
3113 	pmap_invalidate_page(pmap, va, true);
3114 	if (old_l3 & ATTR_SW_WIRED)
3115 		pmap->pm_stats.wired_count -= 1;
3116 	pmap_resident_count_dec(pmap, 1);
3117 	if (old_l3 & ATTR_SW_MANAGED) {
3118 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
3119 		if (pmap_pte_dirty(pmap, old_l3))
3120 			vm_page_dirty(m);
3121 		if (old_l3 & ATTR_AF)
3122 			vm_page_aflag_set(m, PGA_REFERENCED);
3123 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3124 		pmap_pvh_free(&m->md, pmap, va);
3125 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3126 		    (m->flags & PG_FICTITIOUS) == 0) {
3127 			pvh = page_to_pvh(m);
3128 			if (TAILQ_EMPTY(&pvh->pv_list))
3129 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3130 		}
3131 	}
3132 	return (pmap_unuse_pt(pmap, va, l2e, free));
3133 }
3134
3135 /*
3136  * Remove the specified range of addresses from the L3 page table that is
3137  * identified by the given L2 entry.
3138  */
3139 static void
3140 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3141     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3142 {
3143 	struct md_page *pvh;
3144 	struct rwlock *new_lock;
3145 	pt_entry_t *l3, old_l3;
3146 	vm_offset_t va;
3147 	vm_page_t l3pg, m;
3148
3149 	KASSERT(ADDR_IS_CANONICAL(sva),
3150 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3151 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3152 	    ("%s: End address not in canonical form: %lx", __func__, eva));
3153
3154 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3155 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3156 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3157 	l3pg = !ADDR_IS_KERNEL(sva) ?
PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL; 3158 va = eva; 3159 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 3160 if (!pmap_l3_valid(pmap_load(l3))) { 3161 if (va != eva) { 3162 pmap_invalidate_range(pmap, va, sva, true); 3163 va = eva; 3164 } 3165 continue; 3166 } 3167 old_l3 = pmap_load_clear(l3); 3168 if ((old_l3 & ATTR_SW_WIRED) != 0) 3169 pmap->pm_stats.wired_count--; 3170 pmap_resident_count_dec(pmap, 1); 3171 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 3172 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 3173 if (pmap_pte_dirty(pmap, old_l3)) 3174 vm_page_dirty(m); 3175 if ((old_l3 & ATTR_AF) != 0) 3176 vm_page_aflag_set(m, PGA_REFERENCED); 3177 new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); 3178 if (new_lock != *lockp) { 3179 if (*lockp != NULL) { 3180 /* 3181 * Pending TLB invalidations must be 3182 * performed before the PV list lock is 3183 * released. Otherwise, a concurrent 3184 * pmap_remove_all() on a physical page 3185 * could return while a stale TLB entry 3186 * still provides access to that page. 3187 */ 3188 if (va != eva) { 3189 pmap_invalidate_range(pmap, va, 3190 sva, true); 3191 va = eva; 3192 } 3193 rw_wunlock(*lockp); 3194 } 3195 *lockp = new_lock; 3196 rw_wlock(*lockp); 3197 } 3198 pmap_pvh_free(&m->md, pmap, sva); 3199 if (TAILQ_EMPTY(&m->md.pv_list) && 3200 (m->flags & PG_FICTITIOUS) == 0) { 3201 pvh = page_to_pvh(m); 3202 if (TAILQ_EMPTY(&pvh->pv_list)) 3203 vm_page_aflag_clear(m, PGA_WRITEABLE); 3204 } 3205 } 3206 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 3207 /* 3208 * _pmap_unwire_l3() has already invalidated the TLB 3209 * entries at all levels for "sva". So, we need not 3210 * perform "sva += L3_SIZE;" here. Moreover, we need 3211 * not perform "va = sva;" if "sva" is at the start 3212 * of a new valid range consisting of a single page. 3213 */ 3214 break; 3215 } 3216 if (va == eva) 3217 va = sva; 3218 } 3219 if (va != eva) 3220 pmap_invalidate_range(pmap, va, sva, true); 3221 } 3222 3223 /* 3224 * Remove the given range of addresses from the specified map. 3225 * 3226 * It is assumed that the start and end are properly 3227 * rounded to the page size. 3228 */ 3229 void 3230 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3231 { 3232 struct rwlock *lock; 3233 vm_offset_t va_next; 3234 pd_entry_t *l0, *l1, *l2; 3235 pt_entry_t l3_paddr; 3236 struct spglist free; 3237 3238 /* 3239 * Perform an unsynchronized read. This is, however, safe. 
3240 */ 3241 if (pmap->pm_stats.resident_count == 0) 3242 return; 3243 3244 SLIST_INIT(&free); 3245 3246 PMAP_LOCK(pmap); 3247 3248 lock = NULL; 3249 for (; sva < eva; sva = va_next) { 3250 if (pmap->pm_stats.resident_count == 0) 3251 break; 3252 3253 l0 = pmap_l0(pmap, sva); 3254 if (pmap_load(l0) == 0) { 3255 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3256 if (va_next < sva) 3257 va_next = eva; 3258 continue; 3259 } 3260 3261 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3262 if (va_next < sva) 3263 va_next = eva; 3264 l1 = pmap_l0_to_l1(l0, sva); 3265 if (pmap_load(l1) == 0) 3266 continue; 3267 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3268 KASSERT(va_next <= eva, 3269 ("partial update of non-transparent 1G page " 3270 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3271 pmap_load(l1), sva, eva, va_next)); 3272 MPASS(pmap != kernel_pmap); 3273 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3274 pmap_clear(l1); 3275 pmap_invalidate_page(pmap, sva, true); 3276 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 3277 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 3278 continue; 3279 } 3280 3281 /* 3282 * Calculate index for next page table. 3283 */ 3284 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3285 if (va_next < sva) 3286 va_next = eva; 3287 3288 l2 = pmap_l1_to_l2(l1, sva); 3289 if (l2 == NULL) 3290 continue; 3291 3292 l3_paddr = pmap_load(l2); 3293 3294 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 3295 if (sva + L2_SIZE == va_next && eva >= va_next) { 3296 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 3297 &free, &lock); 3298 continue; 3299 } else if (pmap_demote_l2_locked(pmap, l2, sva, 3300 &lock) == NULL) 3301 continue; 3302 l3_paddr = pmap_load(l2); 3303 } 3304 3305 /* 3306 * Weed out invalid mappings. 3307 */ 3308 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 3309 continue; 3310 3311 /* 3312 * Limit our scan to either the end of the va represented 3313 * by the current page table page, or to the end of the 3314 * range being removed. 3315 */ 3316 if (va_next > eva) 3317 va_next = eva; 3318 3319 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 3320 &lock); 3321 } 3322 if (lock != NULL) 3323 rw_wunlock(lock); 3324 PMAP_UNLOCK(pmap); 3325 vm_page_free_pages_toq(&free, true); 3326 } 3327 3328 /* 3329 * Routine: pmap_remove_all 3330 * Function: 3331 * Removes this physical page from 3332 * all physical maps in which it resides. 3333 * Reflects back modify bits to the pager. 3334 * 3335 * Notes: 3336 * Original versions of this routine were very 3337 * inefficient because they iteratively called 3338 * pmap_remove (slow...) 3339 */ 3340 3341 void 3342 pmap_remove_all(vm_page_t m) 3343 { 3344 struct md_page *pvh; 3345 pv_entry_t pv; 3346 pmap_t pmap; 3347 struct rwlock *lock; 3348 pd_entry_t *pde, tpde; 3349 pt_entry_t *pte, tpte; 3350 vm_offset_t va; 3351 struct spglist free; 3352 int lvl, pvh_gen, md_gen; 3353 3354 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3355 ("pmap_remove_all: page %p is not managed", m)); 3356 SLIST_INIT(&free); 3357 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3358 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : page_to_pvh(m); 3359 rw_wlock(lock); 3360 retry: 3361 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3362 pmap = PV_PMAP(pv); 3363 if (!PMAP_TRYLOCK(pmap)) { 3364 pvh_gen = pvh->pv_gen; 3365 rw_wunlock(lock); 3366 PMAP_LOCK(pmap); 3367 rw_wlock(lock); 3368 if (pvh_gen != pvh->pv_gen) { 3369 PMAP_UNLOCK(pmap); 3370 goto retry; 3371 } 3372 } 3373 va = pv->pv_va; 3374 pte = pmap_pte_exists(pmap, va, 2, __func__); 3375 pmap_demote_l2_locked(pmap, pte, va, &lock); 3376 PMAP_UNLOCK(pmap); 3377 } 3378 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3379 pmap = PV_PMAP(pv); 3380 PMAP_ASSERT_STAGE1(pmap); 3381 if (!PMAP_TRYLOCK(pmap)) { 3382 pvh_gen = pvh->pv_gen; 3383 md_gen = m->md.pv_gen; 3384 rw_wunlock(lock); 3385 PMAP_LOCK(pmap); 3386 rw_wlock(lock); 3387 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3388 PMAP_UNLOCK(pmap); 3389 goto retry; 3390 } 3391 } 3392 pmap_resident_count_dec(pmap, 1); 3393 3394 pde = pmap_pde(pmap, pv->pv_va, &lvl); 3395 KASSERT(pde != NULL, 3396 ("pmap_remove_all: no page directory entry found")); 3397 KASSERT(lvl == 2, 3398 ("pmap_remove_all: invalid pde level %d", lvl)); 3399 tpde = pmap_load(pde); 3400 3401 pte = pmap_l2_to_l3(pde, pv->pv_va); 3402 tpte = pmap_load_clear(pte); 3403 if (tpte & ATTR_SW_WIRED) 3404 pmap->pm_stats.wired_count--; 3405 if ((tpte & ATTR_AF) != 0) { 3406 pmap_invalidate_page(pmap, pv->pv_va, true); 3407 vm_page_aflag_set(m, PGA_REFERENCED); 3408 } 3409 3410 /* 3411 * Update the vm_page_t clean and reference bits. 3412 */ 3413 if (pmap_pte_dirty(pmap, tpte)) 3414 vm_page_dirty(m); 3415 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 3416 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3417 m->md.pv_gen++; 3418 free_pv_entry(pmap, pv); 3419 PMAP_UNLOCK(pmap); 3420 } 3421 vm_page_aflag_clear(m, PGA_WRITEABLE); 3422 rw_wunlock(lock); 3423 vm_page_free_pages_toq(&free, true); 3424 } 3425 3426 /* 3427 * pmap_protect_l2: do the things to protect a 2MB page in a pmap 3428 */ 3429 static void 3430 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 3431 pt_entry_t nbits) 3432 { 3433 pd_entry_t old_l2; 3434 vm_page_t m, mt; 3435 3436 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3437 PMAP_ASSERT_STAGE1(pmap); 3438 KASSERT((sva & L2_OFFSET) == 0, 3439 ("pmap_protect_l2: sva is not 2mpage aligned")); 3440 old_l2 = pmap_load(l2); 3441 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3442 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 3443 3444 /* 3445 * Return if the L2 entry already has the desired access restrictions 3446 * in place. 3447 */ 3448 if ((old_l2 & mask) == nbits) 3449 return; 3450 3451 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 3452 cpu_spinwait(); 3453 3454 /* 3455 * When a dirty read/write superpage mapping is write protected, 3456 * update the dirty field of each of the superpage's constituent 4KB 3457 * pages. 3458 */ 3459 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 3460 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3461 pmap_pte_dirty(pmap, old_l2)) { 3462 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 3463 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3464 vm_page_dirty(mt); 3465 } 3466 3467 /* 3468 * Since a promotion must break the 4KB page mappings before making 3469 * the 2MB page mapping, a pmap_invalidate_page() suffices. 3470 */ 3471 pmap_invalidate_page(pmap, sva, true); 3472 } 3473 3474 /* 3475 * Set the physical protection on the 3476 * specified range of this map as requested. 
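 *
 * Write access is revoked by switching the stage 1 access permissions
 * to read-only and clearing the software dirty bit modifier
 * (ATTR_SW_DBM); execute access is revoked by setting ATTR_S1_XN.
 * The mask/nbits pair computed below encodes exactly this.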
3477 */ 3478 void 3479 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3480 { 3481 vm_offset_t va, va_next; 3482 pd_entry_t *l0, *l1, *l2; 3483 pt_entry_t *l3p, l3, mask, nbits; 3484 3485 PMAP_ASSERT_STAGE1(pmap); 3486 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3487 if (prot == VM_PROT_NONE) { 3488 pmap_remove(pmap, sva, eva); 3489 return; 3490 } 3491 3492 mask = nbits = 0; 3493 if ((prot & VM_PROT_WRITE) == 0) { 3494 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 3495 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 3496 } 3497 if ((prot & VM_PROT_EXECUTE) == 0) { 3498 mask |= ATTR_S1_XN; 3499 nbits |= ATTR_S1_XN; 3500 } 3501 if (mask == 0) 3502 return; 3503 3504 PMAP_LOCK(pmap); 3505 for (; sva < eva; sva = va_next) { 3506 l0 = pmap_l0(pmap, sva); 3507 if (pmap_load(l0) == 0) { 3508 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3509 if (va_next < sva) 3510 va_next = eva; 3511 continue; 3512 } 3513 3514 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3515 if (va_next < sva) 3516 va_next = eva; 3517 l1 = pmap_l0_to_l1(l0, sva); 3518 if (pmap_load(l1) == 0) 3519 continue; 3520 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3521 KASSERT(va_next <= eva, 3522 ("partial update of non-transparent 1G page " 3523 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3524 pmap_load(l1), sva, eva, va_next)); 3525 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3526 if ((pmap_load(l1) & mask) != nbits) { 3527 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 3528 pmap_invalidate_page(pmap, sva, true); 3529 } 3530 continue; 3531 } 3532 3533 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3534 if (va_next < sva) 3535 va_next = eva; 3536 3537 l2 = pmap_l1_to_l2(l1, sva); 3538 if (pmap_load(l2) == 0) 3539 continue; 3540 3541 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 3542 if (sva + L2_SIZE == va_next && eva >= va_next) { 3543 pmap_protect_l2(pmap, l2, sva, mask, nbits); 3544 continue; 3545 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 3546 continue; 3547 } 3548 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 3549 ("pmap_protect: Invalid L2 entry after demotion")); 3550 3551 if (va_next > eva) 3552 va_next = eva; 3553 3554 va = va_next; 3555 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 3556 sva += L3_SIZE) { 3557 l3 = pmap_load(l3p); 3558 3559 /* 3560 * Go to the next L3 entry if the current one is 3561 * invalid or already has the desired access 3562 * restrictions in place. (The latter case occurs 3563 * frequently. For example, in a "buildworld" 3564 * workload, almost 1 out of 4 L3 entries already 3565 * have the desired restrictions.) 3566 */ 3567 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 3568 if (va != va_next) { 3569 pmap_invalidate_range(pmap, va, sva, 3570 true); 3571 va = va_next; 3572 } 3573 continue; 3574 } 3575 3576 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 3577 nbits)) 3578 cpu_spinwait(); 3579 3580 /* 3581 * When a dirty read/write mapping is write protected, 3582 * update the page's dirty field. 3583 */ 3584 if ((l3 & ATTR_SW_MANAGED) != 0 && 3585 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3586 pmap_pte_dirty(pmap, l3)) 3587 vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); 3588 3589 if (va == va_next) 3590 va = sva; 3591 } 3592 if (va != va_next) 3593 pmap_invalidate_range(pmap, va, sva, true); 3594 } 3595 PMAP_UNLOCK(pmap); 3596 } 3597 3598 /* 3599 * Inserts the specified page table page into the specified pmap's collection 3600 * of idle page table pages. 
Each of a pmap's page table pages is responsible 3601 * for mapping a distinct range of virtual addresses. The pmap's collection is 3602 * ordered by this virtual address range. 3603 * 3604 * If "promoted" is false, then the page table page "mpte" must be zero filled. 3605 */ 3606 static __inline int 3607 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 3608 { 3609 3610 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3611 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 3612 return (vm_radix_insert(&pmap->pm_root, mpte)); 3613 } 3614 3615 /* 3616 * Removes the page table page mapping the specified virtual address from the 3617 * specified pmap's collection of idle page table pages, and returns it. 3618 * Otherwise, returns NULL if there is no page table page corresponding to the 3619 * specified virtual address. 3620 */ 3621 static __inline vm_page_t 3622 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 3623 { 3624 3625 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3626 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 3627 } 3628 3629 /* 3630 * Performs a break-before-make update of a pmap entry. This is needed when 3631 * either promoting or demoting pages to ensure the TLB doesn't get into an 3632 * inconsistent state. 3633 */ 3634 static void 3635 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 3636 vm_offset_t va, vm_size_t size) 3637 { 3638 register_t intr; 3639 3640 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3641 3642 /* 3643 * Ensure we don't get switched out with the page table in an 3644 * inconsistent state. We also need to ensure no interrupts fire 3645 * as they may make use of an address we are about to invalidate. 3646 */ 3647 intr = intr_disable(); 3648 3649 /* 3650 * Clear the old mapping's valid bit, but leave the rest of the entry 3651 * unchanged, so that a lockless, concurrent pmap_kextract() can still 3652 * lookup the physical address. 3653 */ 3654 pmap_clear_bits(pte, ATTR_DESCR_VALID); 3655 3656 /* 3657 * When promoting, the L{1,2}_TABLE entry that is being replaced might 3658 * be cached, so we invalidate intermediate entries as well as final 3659 * entries. 3660 */ 3661 pmap_invalidate_range(pmap, va, va + size, false); 3662 3663 /* Create the new mapping */ 3664 pmap_store(pte, newpte); 3665 dsb(ishst); 3666 3667 intr_restore(intr); 3668 } 3669 3670 #if VM_NRESERVLEVEL > 0 3671 /* 3672 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3673 * replace the many pv entries for the 4KB page mappings by a single pv entry 3674 * for the 2MB page mapping. 3675 */ 3676 static void 3677 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3678 struct rwlock **lockp) 3679 { 3680 struct md_page *pvh; 3681 pv_entry_t pv; 3682 vm_offset_t va_last; 3683 vm_page_t m; 3684 3685 KASSERT((pa & L2_OFFSET) == 0, 3686 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 3687 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3688 3689 /* 3690 * Transfer the first page's pv entry for this mapping to the 2mpage's 3691 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3692 * a transfer avoids the possibility that get_pv_entry() calls 3693 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3694 * mappings that is being promoted. 
3695 */ 3696 m = PHYS_TO_VM_PAGE(pa); 3697 va = va & ~L2_OFFSET; 3698 pv = pmap_pvh_remove(&m->md, pmap, va); 3699 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 3700 pvh = page_to_pvh(m); 3701 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3702 pvh->pv_gen++; 3703 /* Free the remaining NPTEPG - 1 pv entries. */ 3704 va_last = va + L2_SIZE - PAGE_SIZE; 3705 do { 3706 m++; 3707 va += PAGE_SIZE; 3708 pmap_pvh_free(&m->md, pmap, va); 3709 } while (va < va_last); 3710 } 3711 3712 /* 3713 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3714 * single level 2 table entry to a single 2MB page mapping. For promotion 3715 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3716 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3717 * identical characteristics. 3718 */ 3719 static void 3720 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 3721 struct rwlock **lockp) 3722 { 3723 pt_entry_t *firstl3, *l3, newl2, oldl3, pa; 3724 vm_page_t mpte; 3725 vm_offset_t sva; 3726 3727 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3728 PMAP_ASSERT_STAGE1(pmap); 3729 3730 sva = va & ~L2_OFFSET; 3731 firstl3 = pmap_l2_to_l3(l2, sva); 3732 newl2 = pmap_load(firstl3); 3733 3734 if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { 3735 atomic_add_long(&pmap_l2_p_failures, 1); 3736 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3737 " in pmap %p", va, pmap); 3738 return; 3739 } 3740 3741 setl2: 3742 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3743 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3744 /* 3745 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 3746 * ATTR_SW_DBM can be cleared without a TLB invalidation. 3747 */ 3748 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 3749 goto setl2; 3750 newl2 &= ~ATTR_SW_DBM; 3751 } 3752 3753 pa = newl2 + L2_SIZE - PAGE_SIZE; 3754 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 3755 oldl3 = pmap_load(l3); 3756 setl3: 3757 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3758 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3759 /* 3760 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 3761 * set, ATTR_SW_DBM can be cleared without a TLB 3762 * invalidation. 3763 */ 3764 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 3765 ~ATTR_SW_DBM)) 3766 goto setl3; 3767 oldl3 &= ~ATTR_SW_DBM; 3768 } 3769 if (oldl3 != pa) { 3770 atomic_add_long(&pmap_l2_p_failures, 1); 3771 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3772 " in pmap %p", va, pmap); 3773 return; 3774 } 3775 pa -= PAGE_SIZE; 3776 } 3777 3778 /* 3779 * Save the page table page in its current state until the L2 3780 * mapping the superpage is demoted by pmap_demote_l2() or 3781 * destroyed by pmap_remove_l3(). 
3782 */ 3783 mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 3784 KASSERT(mpte >= vm_page_array && 3785 mpte < &vm_page_array[vm_page_array_size], 3786 ("pmap_promote_l2: page table page is out of range")); 3787 KASSERT(mpte->pindex == pmap_l2_pindex(va), 3788 ("pmap_promote_l2: page table page's pindex is wrong")); 3789 if (pmap_insert_pt_page(pmap, mpte, true)) { 3790 atomic_add_long(&pmap_l2_p_failures, 1); 3791 CTR2(KTR_PMAP, 3792 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 3793 pmap); 3794 return; 3795 } 3796 3797 if ((newl2 & ATTR_SW_MANAGED) != 0) 3798 pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); 3799 3800 newl2 &= ~ATTR_DESCR_MASK; 3801 newl2 |= L2_BLOCK; 3802 3803 pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); 3804 3805 atomic_add_long(&pmap_l2_promotions, 1); 3806 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 3807 pmap); 3808 } 3809 #endif /* VM_NRESERVLEVEL > 0 */ 3810 3811 static int 3812 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 3813 int psind) 3814 { 3815 pd_entry_t *l0p, *l1p, *l2p, origpte; 3816 vm_page_t mp; 3817 3818 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3819 KASSERT(psind > 0 && psind < MAXPAGESIZES, 3820 ("psind %d unexpected", psind)); 3821 KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0, 3822 ("unaligned phys address %#lx newpte %#lx psind %d", 3823 (newpte & ~ATTR_MASK), newpte, psind)); 3824 3825 restart: 3826 if (psind == 2) { 3827 l0p = pmap_l0(pmap, va); 3828 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 3829 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 3830 if (mp == NULL) { 3831 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 3832 return (KERN_RESOURCE_SHORTAGE); 3833 PMAP_UNLOCK(pmap); 3834 vm_wait(NULL); 3835 PMAP_LOCK(pmap); 3836 goto restart; 3837 } 3838 l1p = pmap_l0_to_l1(l0p, va); 3839 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 3840 origpte = pmap_load(l1p); 3841 } else { 3842 l1p = pmap_l0_to_l1(l0p, va); 3843 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 3844 origpte = pmap_load(l1p); 3845 if ((origpte & ATTR_DESCR_VALID) == 0) { 3846 mp = PHYS_TO_VM_PAGE(pmap_load(l0p) & 3847 ~ATTR_MASK); 3848 mp->ref_count++; 3849 } 3850 } 3851 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 3852 ((origpte & ATTR_DESCR_MASK) == L1_BLOCK && 3853 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 3854 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 3855 va, origpte, newpte)); 3856 pmap_store(l1p, newpte); 3857 } else /* (psind == 1) */ { 3858 l2p = pmap_l2(pmap, va); 3859 if (l2p == NULL) { 3860 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 3861 if (mp == NULL) { 3862 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 3863 return (KERN_RESOURCE_SHORTAGE); 3864 PMAP_UNLOCK(pmap); 3865 vm_wait(NULL); 3866 PMAP_LOCK(pmap); 3867 goto restart; 3868 } 3869 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 3870 l2p = &l2p[pmap_l2_index(va)]; 3871 origpte = pmap_load(l2p); 3872 } else { 3873 l1p = pmap_l1(pmap, va); 3874 origpte = pmap_load(l2p); 3875 if ((origpte & ATTR_DESCR_VALID) == 0) { 3876 mp = PHYS_TO_VM_PAGE(pmap_load(l1p) & 3877 ~ATTR_MASK); 3878 mp->ref_count++; 3879 } 3880 } 3881 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 3882 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 3883 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 3884 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 3885 va, origpte, newpte)); 3886 pmap_store(l2p, newpte); 3887 } 3888 dsb(ishst); 3889 3890 if ((origpte & ATTR_DESCR_VALID) == 0) 3891 
pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 3892 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 3893 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 3894 else if ((newpte & ATTR_SW_WIRED) == 0 && 3895 (origpte & ATTR_SW_WIRED) != 0) 3896 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 3897 3898 return (KERN_SUCCESS); 3899 } 3900 3901 /* 3902 * Insert the given physical page (p) at 3903 * the specified virtual address (v) in the 3904 * target physical map with the protection requested. 3905 * 3906 * If specified, the page will be wired down, meaning 3907 * that the related pte can not be reclaimed. 3908 * 3909 * NB: This is the only routine which MAY NOT lazy-evaluate 3910 * or lose information. That is, this routine must actually 3911 * insert this page into the given map NOW. 3912 */ 3913 int 3914 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3915 u_int flags, int8_t psind) 3916 { 3917 struct rwlock *lock; 3918 pd_entry_t *pde; 3919 pt_entry_t new_l3, orig_l3; 3920 pt_entry_t *l2, *l3; 3921 pv_entry_t pv; 3922 vm_paddr_t opa, pa; 3923 vm_page_t mpte, om; 3924 boolean_t nosleep; 3925 int lvl, rv; 3926 3927 KASSERT(ADDR_IS_CANONICAL(va), 3928 ("%s: Address not in canonical form: %lx", __func__, va)); 3929 3930 va = trunc_page(va); 3931 if ((m->oflags & VPO_UNMANAGED) == 0) 3932 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3933 pa = VM_PAGE_TO_PHYS(m); 3934 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE); 3935 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 3936 new_l3 |= pmap_pte_prot(pmap, prot); 3937 3938 if ((flags & PMAP_ENTER_WIRED) != 0) 3939 new_l3 |= ATTR_SW_WIRED; 3940 if (pmap->pm_stage == PM_STAGE1) { 3941 if (!ADDR_IS_KERNEL(va)) 3942 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 3943 else 3944 new_l3 |= ATTR_S1_UXN; 3945 if (pmap != kernel_pmap) 3946 new_l3 |= ATTR_S1_nG; 3947 } else { 3948 /* 3949 * Clear the access flag on executable mappings, this will be 3950 * set later when the page is accessed. The fault handler is 3951 * required to invalidate the I-cache. 3952 * 3953 * TODO: Switch to the valid flag to allow hardware management 3954 * of the access flag. Much of the pmap code assumes the 3955 * valid flag is set and fails to destroy the old page tables 3956 * correctly if it is clear. 3957 */ 3958 if (prot & VM_PROT_EXECUTE) 3959 new_l3 &= ~ATTR_AF; 3960 } 3961 if ((m->oflags & VPO_UNMANAGED) == 0) { 3962 new_l3 |= ATTR_SW_MANAGED; 3963 if ((prot & VM_PROT_WRITE) != 0) { 3964 new_l3 |= ATTR_SW_DBM; 3965 if ((flags & VM_PROT_WRITE) == 0) { 3966 if (pmap->pm_stage == PM_STAGE1) 3967 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 3968 else 3969 new_l3 &= 3970 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 3971 } 3972 } 3973 } 3974 3975 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 3976 3977 lock = NULL; 3978 PMAP_LOCK(pmap); 3979 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 3980 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 3981 ("managed largepage va %#lx flags %#x", va, flags)); 3982 new_l3 &= ~L3_PAGE; 3983 if (psind == 2) 3984 new_l3 |= L1_BLOCK; 3985 else /* (psind == 1) */ 3986 new_l3 |= L2_BLOCK; 3987 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 3988 goto out; 3989 } 3990 if (psind == 1) { 3991 /* Assert the required virtual and physical alignment. 
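	 * As an illustrative example, with 4KB base pages psind == 1
	 * denotes a 2MB mapping, so "va" must be 2MB aligned:
	 * va == 0x40200000 passes the check below, while va == 0x40201000
	 * would trigger the KASSERT because (va & L2_OFFSET) != 0.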
*/ 3992 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 3993 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3994 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 3995 flags, m, &lock); 3996 goto out; 3997 } 3998 mpte = NULL; 3999 4000 /* 4001 * In the case that a page table page is not 4002 * resident, we are creating it here. 4003 */ 4004 retry: 4005 pde = pmap_pde(pmap, va, &lvl); 4006 if (pde != NULL && lvl == 2) { 4007 l3 = pmap_l2_to_l3(pde, va); 4008 if (!ADDR_IS_KERNEL(va) && mpte == NULL) { 4009 mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 4010 mpte->ref_count++; 4011 } 4012 goto havel3; 4013 } else if (pde != NULL && lvl == 1) { 4014 l2 = pmap_l1_to_l2(pde, va); 4015 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 4016 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 4017 l3 = &l3[pmap_l3_index(va)]; 4018 if (!ADDR_IS_KERNEL(va)) { 4019 mpte = PHYS_TO_VM_PAGE( 4020 pmap_load(l2) & ~ATTR_MASK); 4021 mpte->ref_count++; 4022 } 4023 goto havel3; 4024 } 4025 /* We need to allocate an L3 table. */ 4026 } 4027 if (!ADDR_IS_KERNEL(va)) { 4028 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4029 4030 /* 4031 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 4032 * to handle the possibility that a superpage mapping for "va" 4033 * was created while we slept. 4034 */ 4035 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 4036 nosleep ? NULL : &lock); 4037 if (mpte == NULL && nosleep) { 4038 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 4039 rv = KERN_RESOURCE_SHORTAGE; 4040 goto out; 4041 } 4042 goto retry; 4043 } else 4044 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 4045 4046 havel3: 4047 orig_l3 = pmap_load(l3); 4048 opa = orig_l3 & ~ATTR_MASK; 4049 pv = NULL; 4050 4051 /* 4052 * Is the specified virtual address already mapped? 4053 */ 4054 if (pmap_l3_valid(orig_l3)) { 4055 /* 4056 * Only allow adding new entries on stage 2 tables for now. 4057 * This simplifies cache invalidation as we may need to call 4058 * into EL2 to perform such actions. 4059 */ 4060 PMAP_ASSERT_STAGE1(pmap); 4061 /* 4062 * Wiring change, just update stats. We don't worry about 4063 * wiring PT pages as they remain resident as long as there 4064 * are valid mappings in them. Hence, if a user page is wired, 4065 * the PT page will be also. 4066 */ 4067 if ((flags & PMAP_ENTER_WIRED) != 0 && 4068 (orig_l3 & ATTR_SW_WIRED) == 0) 4069 pmap->pm_stats.wired_count++; 4070 else if ((flags & PMAP_ENTER_WIRED) == 0 && 4071 (orig_l3 & ATTR_SW_WIRED) != 0) 4072 pmap->pm_stats.wired_count--; 4073 4074 /* 4075 * Remove the extra PT page reference. 4076 */ 4077 if (mpte != NULL) { 4078 mpte->ref_count--; 4079 KASSERT(mpte->ref_count > 0, 4080 ("pmap_enter: missing reference to page table page," 4081 " va: 0x%lx", va)); 4082 } 4083 4084 /* 4085 * Has the physical page changed? 4086 */ 4087 if (opa == pa) { 4088 /* 4089 * No, might be a protection or wiring change. 4090 */ 4091 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 4092 (new_l3 & ATTR_SW_DBM) != 0) 4093 vm_page_aflag_set(m, PGA_WRITEABLE); 4094 goto validate; 4095 } 4096 4097 /* 4098 * The physical page has changed. Temporarily invalidate 4099 * the mapping. 
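		 * (This is a break-before-make sequence: the old PTE is
		 * cleared and the TLB entry invalidated before the new PTE
		 * is written below, so no CPU can observe a mixture of the
		 * old and new mappings.)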
		 */
		orig_l3 = pmap_load_clear(l3);
		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
		    ("pmap_enter: unexpected pa update for %#lx", va));
		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
			om = PHYS_TO_VM_PAGE(opa);

			/*
			 * The pmap lock is sufficient to synchronize with
			 * concurrent calls to pmap_page_test_mappings() and
			 * pmap_ts_referenced().
			 */
			if (pmap_pte_dirty(pmap, orig_l3))
				vm_page_dirty(om);
			if ((orig_l3 & ATTR_AF) != 0) {
				pmap_invalidate_page(pmap, va, true);
				vm_page_aflag_set(om, PGA_REFERENCED);
			}
			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
			pv = pmap_pvh_remove(&om->md, pmap, va);
			if ((m->oflags & VPO_UNMANAGED) != 0)
				free_pv_entry(pmap, pv);
			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list) &&
			    ((om->flags & PG_FICTITIOUS) != 0 ||
			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
		} else {
			KASSERT((orig_l3 & ATTR_AF) != 0,
			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
			pmap_invalidate_page(pmap, va, true);
		}
		orig_l3 = 0;
	} else {
		/*
		 * Increment the counters.
		 */
		if ((new_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count++;
		pmap_resident_count_inc(pmap, 1);
	}
	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		if (pv == NULL) {
			pv = get_pv_entry(pmap, &lock);
			pv->pv_va = va;
		}
		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		if ((new_l3 & ATTR_SW_DBM) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}

validate:
	if (pmap->pm_stage == PM_STAGE1) {
		/*
		 * Sync the icache if the mapping has exec permission and
		 * the VM_MEMATTR_WRITE_BACK attribute.  Do it now, before
		 * the mapping is stored and made valid for hardware table
		 * walk.  If done later, others could access this page
		 * before the caches are properly synced.  Don't do it for
		 * kernel memory, which is mapped with exec permission even
		 * if the memory isn't going to hold executable code.  The
		 * only time an icache sync is needed is after a kernel
		 * module is loaded and its relocation info is processed,
		 * and that is done in elf_cpu_load_file().
		 */
		if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
			PMAP_ASSERT_STAGE1(pmap);
			cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
		}
	} else {
		cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
	}

	/*
	 * Update the L3 entry.
	 */
	if (pmap_l3_valid(orig_l3)) {
		PMAP_ASSERT_STAGE1(pmap);
		KASSERT(opa == pa, ("pmap_enter: invalid update"));
		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
			/* same PA, different attributes */
			orig_l3 = pmap_load_store(l3, new_l3);
			pmap_invalidate_page(pmap, va, true);
			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
			    pmap_pte_dirty(pmap, orig_l3))
				vm_page_dirty(m);
		} else {
			/*
			 * orig_l3 == new_l3
			 * This can happen if multiple threads simultaneously
			 * access a page that is not yet mapped.  It is bad
			 * for performance since it can cause a full
			 * demotion-NOP-promotion cycle.
			 * Other possible reasons are:
			 * - the VM and pmap memory layouts have diverged
			 * - a TLB flush is missing somewhere and the CPU
			 *   doesn't see the actual mapping
			 */
			CTR4(KTR_PMAP, "%s: already mapped page - "
			    "pmap %p va %#lx pte %#lx",
			    __func__, pmap, va, new_l3);
		}
	} else {
		/* New mapping */
		pmap_store(l3, new_l3);
		dsb(ishst);
	}

#if VM_NRESERVLEVEL > 0
	/*
	 * Try to promote from level 3 pages to a level 2 superpage.  This
	 * currently only works on stage 1 pmaps as pmap_promote_l2 looks
	 * at stage 1 specific fields and performs a break-before-make
	 * sequence that is incorrect for a stage 2 pmap.
	 */
	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
	    pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0) {
		pmap_promote_l2(pmap, pde, va, &lock);
	}
#endif

	rv = KERN_SUCCESS;
out:
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
 * true if successful.  Returns false if (1) a page table page cannot be
 * allocated without sleeping, (2) a mapping already exists at the
 * specified virtual address, or (3) a PV entry cannot be allocated
 * without reclaiming another PV entry.
 */
static bool
pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    struct rwlock **lockp)
{
	pd_entry_t new_l2;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
	    L2_BLOCK);
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		new_l2 |= ATTR_SW_MANAGED;
		new_l2 &= ~ATTR_AF;
	}
	if ((prot & VM_PROT_EXECUTE) == 0 ||
	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
		new_l2 |= ATTR_S1_XN;
	if (!ADDR_IS_KERNEL(va))
		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
	else
		new_l2 |= ATTR_S1_UXN;
	if (pmap != kernel_pmap)
		new_l2 |= ATTR_S1_nG;
	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp) ==
	    KERN_SUCCESS);
}

/*
 * Returns true if every page table entry in the specified page table is
 * zero.
 */
static bool
pmap_every_pte_zero(vm_paddr_t pa)
{
	pt_entry_t *pt_end, *pte;

	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
		if (*pte != 0)
			return (false);
	}
	return (true);
}

/*
 * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS
 * if the mapping was created, and either KERN_FAILURE or
 * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
 * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the
 * specified virtual address.  Returns KERN_RESOURCE_SHORTAGE if
 * PMAP_ENTER_NOSLEEP was specified and a page table page allocation
 * failed, or if PMAP_ENTER_NORECLAIM was specified and a PV entry
 * allocation failed.
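 *
 * A sketch of typical caller-side handling of these return values, where
 * the two handlers are hypothetical stand-ins for the caller's recovery
 * policy (compare pmap_enter_2mpage() above, which simply reports
 * failure to its caller):
 *
 *	rv = pmap_enter_l2(pmap, va, new_l2, flags, m, &lock);
 *	if (rv == KERN_FAILURE)
 *		rv = fall_back_to_4kb_mappings();
 *	else if (rv == KERN_RESOURCE_SHORTAGE)
 *		rv = sleep_or_reclaim_and_retry();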
4302 */ 4303 static int 4304 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 4305 vm_page_t m, struct rwlock **lockp) 4306 { 4307 struct spglist free; 4308 pd_entry_t *l2, old_l2; 4309 vm_page_t l2pg, mt; 4310 4311 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4312 KASSERT(ADDR_IS_CANONICAL(va), 4313 ("%s: Address not in canonical form: %lx", __func__, va)); 4314 4315 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 4316 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 4317 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 4318 va, pmap); 4319 return (KERN_RESOURCE_SHORTAGE); 4320 } 4321 4322 /* 4323 * If there are existing mappings, either abort or remove them. 4324 */ 4325 if ((old_l2 = pmap_load(l2)) != 0) { 4326 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 4327 ("pmap_enter_l2: l2pg's ref count is too low")); 4328 if ((flags & PMAP_ENTER_NOREPLACE) != 0 && 4329 (!ADDR_IS_KERNEL(va) || 4330 (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK || 4331 !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) { 4332 if (l2pg != NULL) 4333 l2pg->ref_count--; 4334 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx" 4335 " in pmap %p", va, pmap); 4336 return (KERN_FAILURE); 4337 } 4338 SLIST_INIT(&free); 4339 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 4340 (void)pmap_remove_l2(pmap, l2, va, 4341 pmap_load(pmap_l1(pmap, va)), &free, lockp); 4342 else 4343 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 4344 &free, lockp); 4345 if (!ADDR_IS_KERNEL(va)) { 4346 vm_page_free_pages_toq(&free, true); 4347 KASSERT(pmap_load(l2) == 0, 4348 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 4349 } else { 4350 KASSERT(SLIST_EMPTY(&free), 4351 ("pmap_enter_l2: freed kernel page table page")); 4352 4353 /* 4354 * Both pmap_remove_l2() and pmap_remove_l3_range() 4355 * will leave the kernel page table page zero filled. 4356 * Nonetheless, the TLB could have an intermediate 4357 * entry for the kernel page table page, so request 4358 * an invalidation at all levels after clearing 4359 * the L2_TABLE entry. 4360 */ 4361 mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 4362 if (pmap_insert_pt_page(pmap, mt, false)) 4363 panic("pmap_enter_l2: trie insert failed"); 4364 pmap_clear(l2); 4365 pmap_invalidate_page(pmap, va, false); 4366 } 4367 } 4368 4369 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 4370 /* 4371 * Abort this mapping if its PV entry could not be created. 4372 */ 4373 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 4374 if (l2pg != NULL) 4375 pmap_abort_ptp(pmap, va, l2pg); 4376 CTR2(KTR_PMAP, 4377 "pmap_enter_l2: failure for va %#lx in pmap %p", 4378 va, pmap); 4379 return (KERN_RESOURCE_SHORTAGE); 4380 } 4381 if ((new_l2 & ATTR_SW_DBM) != 0) 4382 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4383 vm_page_aflag_set(mt, PGA_WRITEABLE); 4384 } 4385 4386 /* 4387 * Increment counters. 4388 */ 4389 if ((new_l2 & ATTR_SW_WIRED) != 0) 4390 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 4391 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 4392 4393 /* 4394 * Conditionally sync the icache. See pmap_enter() for details. 4395 */ 4396 if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) != 4397 (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) && 4398 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 4399 cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK), 4400 L2_SIZE); 4401 } 4402 4403 /* 4404 * Map the superpage. 
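	 * A single L2_BLOCK descriptor now translates the whole 2MB range,
	 * so one TLB entry can cover what would otherwise require up to
	 * L2_SIZE / PAGE_SIZE (512) L3 page entries.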
	 */
	pmap_store(l2, new_l2);
	dsb(ishst);

	atomic_add_long(&pmap_l2_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
	    va, pmap);

	return (KERN_SUCCESS);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	struct rwlock *lock;
	vm_offset_t va;
	vm_page_t m, mpte;
	vm_pindex_t diff, psize;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	psize = atop(end - start);
	mpte = NULL;
	m = m_start;
	lock = NULL;
	PMAP_LOCK(pmap);
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		va = start + ptoa(diff);
		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
		    m->psind == 1 && pmap_ps_enabled(pmap) &&
		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
			m = &m[L2_SIZE / PAGE_SIZE - 1];
		else
			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
			    mpte, &lock);
		m = TAILQ_NEXT(m, listq);
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * This code makes some *MAJOR* assumptions:
 * 1. The current pmap and the given pmap exist.
 * 2. The mapping is not wired.
 * 3. Only read access is requested.
 * 4. No page table pages are needed.
 * It is *MUCH* faster than pmap_enter, though.
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	struct rwlock *lock;

	lock = NULL;
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
	pd_entry_t *pde;
	pt_entry_t *l2, *l3, l3_val;
	vm_paddr_t pa;
	int lvl;

	KASSERT(!VA_IS_CLEANMAP(va) ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean "
	    "submap"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
4503 */ 4504 if (!ADDR_IS_KERNEL(va)) { 4505 vm_pindex_t l2pindex; 4506 4507 /* 4508 * Calculate pagetable page index 4509 */ 4510 l2pindex = pmap_l2_pindex(va); 4511 if (mpte && (mpte->pindex == l2pindex)) { 4512 mpte->ref_count++; 4513 } else { 4514 /* 4515 * Get the l2 entry 4516 */ 4517 pde = pmap_pde(pmap, va, &lvl); 4518 4519 /* 4520 * If the page table page is mapped, we just increment 4521 * the hold count, and activate it. Otherwise, we 4522 * attempt to allocate a page table page. If this 4523 * attempt fails, we don't retry. Instead, we give up. 4524 */ 4525 if (lvl == 1) { 4526 l2 = pmap_l1_to_l2(pde, va); 4527 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 4528 L2_BLOCK) 4529 return (NULL); 4530 } 4531 if (lvl == 2 && pmap_load(pde) != 0) { 4532 mpte = 4533 PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 4534 mpte->ref_count++; 4535 } else { 4536 /* 4537 * Pass NULL instead of the PV list lock 4538 * pointer, because we don't intend to sleep. 4539 */ 4540 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 4541 if (mpte == NULL) 4542 return (mpte); 4543 } 4544 } 4545 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4546 l3 = &l3[pmap_l3_index(va)]; 4547 } else { 4548 mpte = NULL; 4549 pde = pmap_pde(kernel_pmap, va, &lvl); 4550 KASSERT(pde != NULL, 4551 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 4552 va)); 4553 KASSERT(lvl == 2, 4554 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 4555 l3 = pmap_l2_to_l3(pde, va); 4556 } 4557 4558 /* 4559 * Abort if a mapping already exists. 4560 */ 4561 if (pmap_load(l3) != 0) { 4562 if (mpte != NULL) 4563 mpte->ref_count--; 4564 return (NULL); 4565 } 4566 4567 /* 4568 * Enter on the PV list if part of our managed memory. 4569 */ 4570 if ((m->oflags & VPO_UNMANAGED) == 0 && 4571 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4572 if (mpte != NULL) 4573 pmap_abort_ptp(pmap, va, mpte); 4574 return (NULL); 4575 } 4576 4577 /* 4578 * Increment counters 4579 */ 4580 pmap_resident_count_inc(pmap, 1); 4581 4582 pa = VM_PAGE_TO_PHYS(m); 4583 l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 4584 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 4585 if ((prot & VM_PROT_EXECUTE) == 0 || 4586 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4587 l3_val |= ATTR_S1_XN; 4588 if (!ADDR_IS_KERNEL(va)) 4589 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4590 else 4591 l3_val |= ATTR_S1_UXN; 4592 if (pmap != kernel_pmap) 4593 l3_val |= ATTR_S1_nG; 4594 4595 /* 4596 * Now validate mapping with RO protection 4597 */ 4598 if ((m->oflags & VPO_UNMANAGED) == 0) { 4599 l3_val |= ATTR_SW_MANAGED; 4600 l3_val &= ~ATTR_AF; 4601 } 4602 4603 /* Sync icache before the mapping is stored to PTE */ 4604 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4605 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 4606 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4607 4608 pmap_store(l3, l3_val); 4609 dsb(ishst); 4610 4611 return (mpte); 4612 } 4613 4614 /* 4615 * This code maps large physical mmap regions into the 4616 * processor address space. Note that some shortcuts 4617 * are taken, but the code works. 4618 */ 4619 void 4620 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4621 vm_pindex_t pindex, vm_size_t size) 4622 { 4623 4624 VM_OBJECT_ASSERT_WLOCKED(object); 4625 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4626 ("pmap_object_init_pt: non-device object")); 4627 } 4628 4629 /* 4630 * Clear the wired attribute from the mappings for the specified range of 4631 * addresses in the given pmap. 
Every valid mapping within that range 4632 * must have the wired attribute set. In contrast, invalid mappings 4633 * cannot have the wired attribute set, so they are ignored. 4634 * 4635 * The wired attribute of the page table entry is not a hardware feature, 4636 * so there is no need to invalidate any TLB entries. 4637 */ 4638 void 4639 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4640 { 4641 vm_offset_t va_next; 4642 pd_entry_t *l0, *l1, *l2; 4643 pt_entry_t *l3; 4644 4645 PMAP_LOCK(pmap); 4646 for (; sva < eva; sva = va_next) { 4647 l0 = pmap_l0(pmap, sva); 4648 if (pmap_load(l0) == 0) { 4649 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4650 if (va_next < sva) 4651 va_next = eva; 4652 continue; 4653 } 4654 4655 l1 = pmap_l0_to_l1(l0, sva); 4656 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4657 if (va_next < sva) 4658 va_next = eva; 4659 if (pmap_load(l1) == 0) 4660 continue; 4661 4662 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4663 KASSERT(va_next <= eva, 4664 ("partial update of non-transparent 1G page " 4665 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4666 pmap_load(l1), sva, eva, va_next)); 4667 MPASS(pmap != kernel_pmap); 4668 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 4669 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 4670 pmap_clear_bits(l1, ATTR_SW_WIRED); 4671 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 4672 continue; 4673 } 4674 4675 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4676 if (va_next < sva) 4677 va_next = eva; 4678 4679 l2 = pmap_l1_to_l2(l1, sva); 4680 if (pmap_load(l2) == 0) 4681 continue; 4682 4683 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4684 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 4685 panic("pmap_unwire: l2 %#jx is missing " 4686 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 4687 4688 /* 4689 * Are we unwiring the entire large page? If not, 4690 * demote the mapping and fall through. 4691 */ 4692 if (sva + L2_SIZE == va_next && eva >= va_next) { 4693 pmap_clear_bits(l2, ATTR_SW_WIRED); 4694 pmap->pm_stats.wired_count -= L2_SIZE / 4695 PAGE_SIZE; 4696 continue; 4697 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 4698 panic("pmap_unwire: demotion failed"); 4699 } 4700 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4701 ("pmap_unwire: Invalid l2 entry after demotion")); 4702 4703 if (va_next > eva) 4704 va_next = eva; 4705 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 4706 sva += L3_SIZE) { 4707 if (pmap_load(l3) == 0) 4708 continue; 4709 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 4710 panic("pmap_unwire: l3 %#jx is missing " 4711 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 4712 4713 /* 4714 * ATTR_SW_WIRED must be cleared atomically. Although 4715 * the pmap lock synchronizes access to ATTR_SW_WIRED, 4716 * the System MMU may write to the entry concurrently. 4717 */ 4718 pmap_clear_bits(l3, ATTR_SW_WIRED); 4719 pmap->pm_stats.wired_count--; 4720 } 4721 } 4722 PMAP_UNLOCK(pmap); 4723 } 4724 4725 /* 4726 * Copy the range specified by src_addr/len 4727 * from the source map to the range dst_addr/len 4728 * in the destination map. 4729 * 4730 * This routine is only advisory and need not do anything. 4731 * 4732 * Because the executable mappings created by this routine are copied, 4733 * it should not have to flush the instruction cache. 
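 *
 * The copy is only attempted when "dst_addr" equals "src_addr", as is
 * the case when an address space is duplicated at fork() time; the
 * early return below otherwise turns this routine into a no-op.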
4734 */ 4735 void 4736 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4737 vm_offset_t src_addr) 4738 { 4739 struct rwlock *lock; 4740 pd_entry_t *l0, *l1, *l2, srcptepaddr; 4741 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 4742 vm_offset_t addr, end_addr, va_next; 4743 vm_page_t dst_m, dstmpte, srcmpte; 4744 4745 PMAP_ASSERT_STAGE1(dst_pmap); 4746 PMAP_ASSERT_STAGE1(src_pmap); 4747 4748 if (dst_addr != src_addr) 4749 return; 4750 end_addr = src_addr + len; 4751 lock = NULL; 4752 if (dst_pmap < src_pmap) { 4753 PMAP_LOCK(dst_pmap); 4754 PMAP_LOCK(src_pmap); 4755 } else { 4756 PMAP_LOCK(src_pmap); 4757 PMAP_LOCK(dst_pmap); 4758 } 4759 for (addr = src_addr; addr < end_addr; addr = va_next) { 4760 l0 = pmap_l0(src_pmap, addr); 4761 if (pmap_load(l0) == 0) { 4762 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 4763 if (va_next < addr) 4764 va_next = end_addr; 4765 continue; 4766 } 4767 4768 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 4769 if (va_next < addr) 4770 va_next = end_addr; 4771 l1 = pmap_l0_to_l1(l0, addr); 4772 if (pmap_load(l1) == 0) 4773 continue; 4774 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4775 KASSERT(va_next <= end_addr, 4776 ("partial update of non-transparent 1G page " 4777 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 4778 pmap_load(l1), addr, end_addr, va_next)); 4779 srcptepaddr = pmap_load(l1); 4780 l1 = pmap_l1(dst_pmap, addr); 4781 if (l1 == NULL) { 4782 if (_pmap_alloc_l3(dst_pmap, 4783 pmap_l0_pindex(addr), NULL) == NULL) 4784 break; 4785 l1 = pmap_l1(dst_pmap, addr); 4786 } else { 4787 l0 = pmap_l0(dst_pmap, addr); 4788 dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) & 4789 ~ATTR_MASK); 4790 dst_m->ref_count++; 4791 } 4792 KASSERT(pmap_load(l1) == 0, 4793 ("1G mapping present in dst pmap " 4794 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 4795 pmap_load(l1), addr, end_addr, va_next)); 4796 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 4797 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 4798 continue; 4799 } 4800 4801 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 4802 if (va_next < addr) 4803 va_next = end_addr; 4804 l2 = pmap_l1_to_l2(l1, addr); 4805 srcptepaddr = pmap_load(l2); 4806 if (srcptepaddr == 0) 4807 continue; 4808 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4809 /* 4810 * We can only virtual copy whole superpages. 4811 */ 4812 if ((addr & L2_OFFSET) != 0 || 4813 addr + L2_SIZE > end_addr) 4814 continue; 4815 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 4816 if (l2 == NULL) 4817 break; 4818 if (pmap_load(l2) == 0 && 4819 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 4820 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 4821 PMAP_ENTER_NORECLAIM, &lock))) { 4822 /* 4823 * We leave the dirty bit unchanged because 4824 * managed read/write superpage mappings are 4825 * required to be dirty. However, managed 4826 * superpage mappings are not required to 4827 * have their accessed bit set, so we clear 4828 * it because we don't know if this mapping 4829 * will be used. 
4830 */ 4831 srcptepaddr &= ~ATTR_SW_WIRED; 4832 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 4833 srcptepaddr &= ~ATTR_AF; 4834 pmap_store(l2, srcptepaddr); 4835 pmap_resident_count_inc(dst_pmap, L2_SIZE / 4836 PAGE_SIZE); 4837 atomic_add_long(&pmap_l2_mappings, 1); 4838 } else 4839 pmap_abort_ptp(dst_pmap, addr, dst_m); 4840 continue; 4841 } 4842 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 4843 ("pmap_copy: invalid L2 entry")); 4844 srcptepaddr &= ~ATTR_MASK; 4845 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4846 KASSERT(srcmpte->ref_count > 0, 4847 ("pmap_copy: source page table page is unused")); 4848 if (va_next > end_addr) 4849 va_next = end_addr; 4850 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4851 src_pte = &src_pte[pmap_l3_index(addr)]; 4852 dstmpte = NULL; 4853 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 4854 ptetemp = pmap_load(src_pte); 4855 4856 /* 4857 * We only virtual copy managed pages. 4858 */ 4859 if ((ptetemp & ATTR_SW_MANAGED) == 0) 4860 continue; 4861 4862 if (dstmpte != NULL) { 4863 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 4864 ("dstmpte pindex/addr mismatch")); 4865 dstmpte->ref_count++; 4866 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 4867 NULL)) == NULL) 4868 goto out; 4869 dst_pte = (pt_entry_t *) 4870 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4871 dst_pte = &dst_pte[pmap_l3_index(addr)]; 4872 if (pmap_load(dst_pte) == 0 && 4873 pmap_try_insert_pv_entry(dst_pmap, addr, 4874 PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { 4875 /* 4876 * Clear the wired, modified, and accessed 4877 * (referenced) bits during the copy. 4878 */ 4879 mask = ATTR_AF | ATTR_SW_WIRED; 4880 nbits = 0; 4881 if ((ptetemp & ATTR_SW_DBM) != 0) 4882 nbits |= ATTR_S1_AP_RW_BIT; 4883 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 4884 pmap_resident_count_inc(dst_pmap, 1); 4885 } else { 4886 pmap_abort_ptp(dst_pmap, addr, dstmpte); 4887 goto out; 4888 } 4889 /* Have we copied all of the valid mappings? */ 4890 if (dstmpte->ref_count >= srcmpte->ref_count) 4891 break; 4892 } 4893 } 4894 out: 4895 /* 4896 * XXX This barrier may not be needed because the destination pmap is 4897 * not active. 4898 */ 4899 dsb(ishst); 4900 4901 if (lock != NULL) 4902 rw_wunlock(lock); 4903 PMAP_UNLOCK(src_pmap); 4904 PMAP_UNLOCK(dst_pmap); 4905 } 4906 4907 /* 4908 * pmap_zero_page zeros the specified hardware page by mapping 4909 * the page into KVM and using bzero to clear its contents. 4910 */ 4911 void 4912 pmap_zero_page(vm_page_t m) 4913 { 4914 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4915 4916 pagezero((void *)va); 4917 } 4918 4919 /* 4920 * pmap_zero_page_area zeros the specified hardware page by mapping 4921 * the page into KVM and using bzero to clear its contents. 4922 * 4923 * off and size may not cover an area beyond a single hardware page. 4924 */ 4925 void 4926 pmap_zero_page_area(vm_page_t m, int off, int size) 4927 { 4928 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4929 4930 if (off == 0 && size == PAGE_SIZE) 4931 pagezero((void *)va); 4932 else 4933 bzero((char *)va + off, size); 4934 } 4935 4936 /* 4937 * pmap_copy_page copies the specified (machine independent) 4938 * page by mapping the page into virtual memory and using 4939 * bcopy to copy the page, one machine dependent page at a 4940 * time. 
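 *
 * (On arm64 no transient mapping is actually needed: like
 * pmap_zero_page() and pmap_zero_page_area() above, this routine
 * addresses both pages through the direct map, so the wording above is
 * historical.)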
4941 */ 4942 void 4943 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 4944 { 4945 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 4946 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 4947 4948 pagecopy((void *)src, (void *)dst); 4949 } 4950 4951 int unmapped_buf_allowed = 1; 4952 4953 void 4954 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4955 vm_offset_t b_offset, int xfersize) 4956 { 4957 void *a_cp, *b_cp; 4958 vm_page_t m_a, m_b; 4959 vm_paddr_t p_a, p_b; 4960 vm_offset_t a_pg_offset, b_pg_offset; 4961 int cnt; 4962 4963 while (xfersize > 0) { 4964 a_pg_offset = a_offset & PAGE_MASK; 4965 m_a = ma[a_offset >> PAGE_SHIFT]; 4966 p_a = m_a->phys_addr; 4967 b_pg_offset = b_offset & PAGE_MASK; 4968 m_b = mb[b_offset >> PAGE_SHIFT]; 4969 p_b = m_b->phys_addr; 4970 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4971 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4972 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 4973 panic("!DMAP a %lx", p_a); 4974 } else { 4975 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 4976 } 4977 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 4978 panic("!DMAP b %lx", p_b); 4979 } else { 4980 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 4981 } 4982 bcopy(a_cp, b_cp, cnt); 4983 a_offset += cnt; 4984 b_offset += cnt; 4985 xfersize -= cnt; 4986 } 4987 } 4988 4989 vm_offset_t 4990 pmap_quick_enter_page(vm_page_t m) 4991 { 4992 4993 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 4994 } 4995 4996 void 4997 pmap_quick_remove_page(vm_offset_t addr) 4998 { 4999 } 5000 5001 /* 5002 * Returns true if the pmap's pv is one of the first 5003 * 16 pvs linked to from this page. This count may 5004 * be changed upwards or downwards in the future; it 5005 * is only necessary that true be returned for a small 5006 * subset of pmaps for proper page aging. 5007 */ 5008 boolean_t 5009 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5010 { 5011 struct md_page *pvh; 5012 struct rwlock *lock; 5013 pv_entry_t pv; 5014 int loops = 0; 5015 boolean_t rv; 5016 5017 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5018 ("pmap_page_exists_quick: page %p is not managed", m)); 5019 rv = FALSE; 5020 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5021 rw_rlock(lock); 5022 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5023 if (PV_PMAP(pv) == pmap) { 5024 rv = TRUE; 5025 break; 5026 } 5027 loops++; 5028 if (loops >= 16) 5029 break; 5030 } 5031 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5032 pvh = page_to_pvh(m); 5033 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5034 if (PV_PMAP(pv) == pmap) { 5035 rv = TRUE; 5036 break; 5037 } 5038 loops++; 5039 if (loops >= 16) 5040 break; 5041 } 5042 } 5043 rw_runlock(lock); 5044 return (rv); 5045 } 5046 5047 /* 5048 * pmap_page_wired_mappings: 5049 * 5050 * Return the number of managed mappings to the given physical page 5051 * that are wired. 
5052 */ 5053 int 5054 pmap_page_wired_mappings(vm_page_t m) 5055 { 5056 struct rwlock *lock; 5057 struct md_page *pvh; 5058 pmap_t pmap; 5059 pt_entry_t *pte; 5060 pv_entry_t pv; 5061 int count, md_gen, pvh_gen; 5062 5063 if ((m->oflags & VPO_UNMANAGED) != 0) 5064 return (0); 5065 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5066 rw_rlock(lock); 5067 restart: 5068 count = 0; 5069 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5070 pmap = PV_PMAP(pv); 5071 if (!PMAP_TRYLOCK(pmap)) { 5072 md_gen = m->md.pv_gen; 5073 rw_runlock(lock); 5074 PMAP_LOCK(pmap); 5075 rw_rlock(lock); 5076 if (md_gen != m->md.pv_gen) { 5077 PMAP_UNLOCK(pmap); 5078 goto restart; 5079 } 5080 } 5081 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5082 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 5083 count++; 5084 PMAP_UNLOCK(pmap); 5085 } 5086 if ((m->flags & PG_FICTITIOUS) == 0) { 5087 pvh = page_to_pvh(m); 5088 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5089 pmap = PV_PMAP(pv); 5090 if (!PMAP_TRYLOCK(pmap)) { 5091 md_gen = m->md.pv_gen; 5092 pvh_gen = pvh->pv_gen; 5093 rw_runlock(lock); 5094 PMAP_LOCK(pmap); 5095 rw_rlock(lock); 5096 if (md_gen != m->md.pv_gen || 5097 pvh_gen != pvh->pv_gen) { 5098 PMAP_UNLOCK(pmap); 5099 goto restart; 5100 } 5101 } 5102 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5103 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 5104 count++; 5105 PMAP_UNLOCK(pmap); 5106 } 5107 } 5108 rw_runlock(lock); 5109 return (count); 5110 } 5111 5112 /* 5113 * Returns true if the given page is mapped individually or as part of 5114 * a 2mpage. Otherwise, returns false. 5115 */ 5116 bool 5117 pmap_page_is_mapped(vm_page_t m) 5118 { 5119 struct rwlock *lock; 5120 bool rv; 5121 5122 if ((m->oflags & VPO_UNMANAGED) != 0) 5123 return (false); 5124 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5125 rw_rlock(lock); 5126 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5127 ((m->flags & PG_FICTITIOUS) == 0 && 5128 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 5129 rw_runlock(lock); 5130 return (rv); 5131 } 5132 5133 /* 5134 * Destroy all managed, non-wired mappings in the given user-space 5135 * pmap. This pmap cannot be active on any processor besides the 5136 * caller. 5137 * 5138 * This function cannot be applied to the kernel pmap. Moreover, it 5139 * is not intended for general use. It is only to be used during 5140 * process termination. Consequently, it can be implemented in ways 5141 * that make it faster than pmap_remove(). First, it can more quickly 5142 * destroy mappings by iterating over the pmap's collection of PV 5143 * entries, rather than searching the page table. Second, it doesn't 5144 * have to test and clear the page table entries atomically, because 5145 * no processor is currently accessing the user address space. In 5146 * particular, a page table entry's dirty bit won't change state once 5147 * this function starts. 
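 *
 * Each pv_chunk tracks its live pv entries in the inverted bitmap
 * pc_map[].  A worked example of the scan performed below: if
 * (~pc_map[1] & pc_freemask[1]) == 0x5, then ffsl() first yields
 * bit 0 (idx = 1 * 64 + 0 = 64) and then bit 2 (idx = 66), so pv
 * entries 64 and 66 of the chunk are the ones destroyed.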
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, freed, idx, lvl;
	vm_paddr_t pa;

	lock = NULL;

	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				switch (lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					    "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = tpte & ~ATTR_MASK;

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch "
				    "%016jx %016jx", m,
				    (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
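				 * For lvl == 1 the mapping is a 2MB L2
				 * block, so the dirty state fans out to
				 * all L2_SIZE / PAGE_SIZE constituent 4KB
				 * pages below; for lvl == 2 only the
				 * single 4KB page is dirtied.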
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						for (mt = m; mt <
						    &m[L2_SIZE / PAGE_SIZE];
						    mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				switch (lvl) {
				case 1:
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,
					    pv_next);
					pvh->pv_gen++;
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt <
						    &m[L2_SIZE / PAGE_SIZE];
						    mt++)
							if ((mt->a.flags &
							    PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(
							    &mt->md.pv_list))
								vm_page_aflag_clear(
								    mt, PGA_WRITEABLE);
					}
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(ml3->valid ==
						    VM_PAGE_BITS_ALL,
						    ("pmap_remove_pages: l3 "
						    "page not promoted"));
						pmap_resident_count_dec(pmap,
						    1);
						KASSERT(ml3->ref_count ==
						    NL3PG,
						    ("pmap_remove_pages: l3 "
						    "page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, FALSE);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}

/*
 * This is used to check if a page has been accessed or modified.
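 *
 * Each PV entry is checked with a single masked compare; a sketch of
 * the logic assembled below:
 *
 *	modified:  (pte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)
 *	accessed:  (pte & (ATTR_AF | ATTR_DESCR_MASK)) == (ATTR_AF | L3_PAGE)
 *
 * with L2_BLOCK replacing L3_PAGE for the 2MB mappings in the second
 * loop.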
5319 */ 5320 static boolean_t 5321 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5322 { 5323 struct rwlock *lock; 5324 pv_entry_t pv; 5325 struct md_page *pvh; 5326 pt_entry_t *pte, mask, value; 5327 pmap_t pmap; 5328 int md_gen, pvh_gen; 5329 boolean_t rv; 5330 5331 rv = FALSE; 5332 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5333 rw_rlock(lock); 5334 restart: 5335 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5336 pmap = PV_PMAP(pv); 5337 PMAP_ASSERT_STAGE1(pmap); 5338 if (!PMAP_TRYLOCK(pmap)) { 5339 md_gen = m->md.pv_gen; 5340 rw_runlock(lock); 5341 PMAP_LOCK(pmap); 5342 rw_rlock(lock); 5343 if (md_gen != m->md.pv_gen) { 5344 PMAP_UNLOCK(pmap); 5345 goto restart; 5346 } 5347 } 5348 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5349 mask = 0; 5350 value = 0; 5351 if (modified) { 5352 mask |= ATTR_S1_AP_RW_BIT; 5353 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5354 } 5355 if (accessed) { 5356 mask |= ATTR_AF | ATTR_DESCR_MASK; 5357 value |= ATTR_AF | L3_PAGE; 5358 } 5359 rv = (pmap_load(pte) & mask) == value; 5360 PMAP_UNLOCK(pmap); 5361 if (rv) 5362 goto out; 5363 } 5364 if ((m->flags & PG_FICTITIOUS) == 0) { 5365 pvh = page_to_pvh(m); 5366 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5367 pmap = PV_PMAP(pv); 5368 PMAP_ASSERT_STAGE1(pmap); 5369 if (!PMAP_TRYLOCK(pmap)) { 5370 md_gen = m->md.pv_gen; 5371 pvh_gen = pvh->pv_gen; 5372 rw_runlock(lock); 5373 PMAP_LOCK(pmap); 5374 rw_rlock(lock); 5375 if (md_gen != m->md.pv_gen || 5376 pvh_gen != pvh->pv_gen) { 5377 PMAP_UNLOCK(pmap); 5378 goto restart; 5379 } 5380 } 5381 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5382 mask = 0; 5383 value = 0; 5384 if (modified) { 5385 mask |= ATTR_S1_AP_RW_BIT; 5386 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5387 } 5388 if (accessed) { 5389 mask |= ATTR_AF | ATTR_DESCR_MASK; 5390 value |= ATTR_AF | L2_BLOCK; 5391 } 5392 rv = (pmap_load(pte) & mask) == value; 5393 PMAP_UNLOCK(pmap); 5394 if (rv) 5395 goto out; 5396 } 5397 } 5398 out: 5399 rw_runlock(lock); 5400 return (rv); 5401 } 5402 5403 /* 5404 * pmap_is_modified: 5405 * 5406 * Return whether or not the specified physical page was modified 5407 * in any physical maps. 5408 */ 5409 boolean_t 5410 pmap_is_modified(vm_page_t m) 5411 { 5412 5413 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5414 ("pmap_is_modified: page %p is not managed", m)); 5415 5416 /* 5417 * If the page is not busied then this check is racy. 5418 */ 5419 if (!pmap_page_is_write_mapped(m)) 5420 return (FALSE); 5421 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5422 } 5423 5424 /* 5425 * pmap_is_prefaultable: 5426 * 5427 * Return whether or not the specified virtual address is eligible 5428 * for prefault. 5429 */ 5430 boolean_t 5431 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5432 { 5433 pd_entry_t *pde; 5434 pt_entry_t *pte; 5435 boolean_t rv; 5436 int lvl; 5437 5438 /* 5439 * Return TRUE if and only if the L3 entry for the specified virtual 5440 * address is allocated but invalid. 5441 */ 5442 rv = FALSE; 5443 PMAP_LOCK(pmap); 5444 pde = pmap_pde(pmap, addr, &lvl); 5445 if (pde != NULL && lvl == 2) { 5446 pte = pmap_l2_to_l3(pde, addr); 5447 rv = pmap_load(pte) == 0; 5448 } 5449 PMAP_UNLOCK(pmap); 5450 return (rv); 5451 } 5452 5453 /* 5454 * pmap_is_referenced: 5455 * 5456 * Return whether or not the specified physical page was referenced 5457 * in any physical maps. 
5458 */ 5459 boolean_t 5460 pmap_is_referenced(vm_page_t m) 5461 { 5462 5463 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5464 ("pmap_is_referenced: page %p is not managed", m)); 5465 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5466 } 5467 5468 /* 5469 * Clear the write and modified bits in each of the given page's mappings. 5470 */ 5471 void 5472 pmap_remove_write(vm_page_t m) 5473 { 5474 struct md_page *pvh; 5475 pmap_t pmap; 5476 struct rwlock *lock; 5477 pv_entry_t next_pv, pv; 5478 pt_entry_t oldpte, *pte; 5479 vm_offset_t va; 5480 int md_gen, pvh_gen; 5481 5482 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5483 ("pmap_remove_write: page %p is not managed", m)); 5484 vm_page_assert_busied(m); 5485 5486 if (!pmap_page_is_write_mapped(m)) 5487 return; 5488 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5489 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5490 rw_wlock(lock); 5491 retry: 5492 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5493 pmap = PV_PMAP(pv); 5494 PMAP_ASSERT_STAGE1(pmap); 5495 if (!PMAP_TRYLOCK(pmap)) { 5496 pvh_gen = pvh->pv_gen; 5497 rw_wunlock(lock); 5498 PMAP_LOCK(pmap); 5499 rw_wlock(lock); 5500 if (pvh_gen != pvh->pv_gen) { 5501 PMAP_UNLOCK(pmap); 5502 goto retry; 5503 } 5504 } 5505 va = pv->pv_va; 5506 pte = pmap_pte_exists(pmap, va, 2, __func__); 5507 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 5508 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 5509 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5510 ("inconsistent pv lock %p %p for page %p", 5511 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5512 PMAP_UNLOCK(pmap); 5513 } 5514 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5515 pmap = PV_PMAP(pv); 5516 PMAP_ASSERT_STAGE1(pmap); 5517 if (!PMAP_TRYLOCK(pmap)) { 5518 pvh_gen = pvh->pv_gen; 5519 md_gen = m->md.pv_gen; 5520 rw_wunlock(lock); 5521 PMAP_LOCK(pmap); 5522 rw_wlock(lock); 5523 if (pvh_gen != pvh->pv_gen || 5524 md_gen != m->md.pv_gen) { 5525 PMAP_UNLOCK(pmap); 5526 goto retry; 5527 } 5528 } 5529 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5530 oldpte = pmap_load(pte); 5531 if ((oldpte & ATTR_SW_DBM) != 0) { 5532 while (!atomic_fcmpset_64(pte, &oldpte, 5533 (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) 5534 cpu_spinwait(); 5535 if ((oldpte & ATTR_S1_AP_RW_BIT) == 5536 ATTR_S1_AP(ATTR_S1_AP_RW)) 5537 vm_page_dirty(m); 5538 pmap_invalidate_page(pmap, pv->pv_va, true); 5539 } 5540 PMAP_UNLOCK(pmap); 5541 } 5542 rw_wunlock(lock); 5543 vm_page_aflag_clear(m, PGA_WRITEABLE); 5544 } 5545 5546 /* 5547 * pmap_ts_referenced: 5548 * 5549 * Return a count of reference bits for a page, clearing those bits. 5550 * It is not necessary for every reference bit to be cleared, but it 5551 * is necessary that 0 only be returned when there are truly no 5552 * reference bits set. 5553 * 5554 * As an optimization, update the page's dirty field if a modified bit is 5555 * found while counting reference bits. This opportunistic update can be 5556 * performed at low cost and can eliminate the need for some future calls 5557 * to pmap_is_modified(). However, since this function stops after 5558 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5559 * dirty pages. Those dirty pages will only be detected by a future call 5560 * to pmap_is_modified(). 
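 *
 * For 2MB mappings, one 4KB page out of Ln_ENTRIES (512) is selected
 * for clearing by a cheap hash of the physical page number, the
 * virtual superpage number, and the pmap address (see the comment in
 * the loop below).  An illustrative example, ignoring the pmap term:
 * pa = 0x40200000 and va = 0x80000000 give
 * (pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) == 0x40200 ^ 0x400 == 0x40600,
 * whose low nine bits are zero, so that mapping's ATTR_AF would be
 * cleared.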
5561 */ 5562 int 5563 pmap_ts_referenced(vm_page_t m) 5564 { 5565 struct md_page *pvh; 5566 pv_entry_t pv, pvf; 5567 pmap_t pmap; 5568 struct rwlock *lock; 5569 pt_entry_t *pte, tpte; 5570 vm_offset_t va; 5571 vm_paddr_t pa; 5572 int cleared, md_gen, not_cleared, pvh_gen; 5573 struct spglist free; 5574 5575 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5576 ("pmap_ts_referenced: page %p is not managed", m)); 5577 SLIST_INIT(&free); 5578 cleared = 0; 5579 pa = VM_PAGE_TO_PHYS(m); 5580 lock = PHYS_TO_PV_LIST_LOCK(pa); 5581 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5582 rw_wlock(lock); 5583 retry: 5584 not_cleared = 0; 5585 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5586 goto small_mappings; 5587 pv = pvf; 5588 do { 5589 if (pvf == NULL) 5590 pvf = pv; 5591 pmap = PV_PMAP(pv); 5592 if (!PMAP_TRYLOCK(pmap)) { 5593 pvh_gen = pvh->pv_gen; 5594 rw_wunlock(lock); 5595 PMAP_LOCK(pmap); 5596 rw_wlock(lock); 5597 if (pvh_gen != pvh->pv_gen) { 5598 PMAP_UNLOCK(pmap); 5599 goto retry; 5600 } 5601 } 5602 va = pv->pv_va; 5603 pte = pmap_pte_exists(pmap, va, 2, __func__); 5604 tpte = pmap_load(pte); 5605 if (pmap_pte_dirty(pmap, tpte)) { 5606 /* 5607 * Although "tpte" is mapping a 2MB page, because 5608 * this function is called at a 4KB page granularity, 5609 * we only update the 4KB page under test. 5610 */ 5611 vm_page_dirty(m); 5612 } 5613 if ((tpte & ATTR_AF) != 0) { 5614 /* 5615 * Since this reference bit is shared by 512 4KB pages, 5616 * it should not be cleared every time it is tested. 5617 * Apply a simple "hash" function on the physical page 5618 * number, the virtual superpage number, and the pmap 5619 * address to select one 4KB page out of the 512 on 5620 * which testing the reference bit will result in 5621 * clearing that reference bit. This function is 5622 * designed to avoid the selection of the same 4KB page 5623 * for every 2MB page mapping. 5624 * 5625 * On demotion, a mapping that hasn't been referenced 5626 * is simply destroyed. To avoid the possibility of a 5627 * subsequent page fault on a demoted wired mapping, 5628 * always leave its reference bit set. Moreover, 5629 * since the superpage is wired, the current state of 5630 * its reference bit won't affect page replacement. 5631 */ 5632 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ 5633 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 5634 (tpte & ATTR_SW_WIRED) == 0) { 5635 pmap_clear_bits(pte, ATTR_AF); 5636 pmap_invalidate_page(pmap, va, true); 5637 cleared++; 5638 } else 5639 not_cleared++; 5640 } 5641 PMAP_UNLOCK(pmap); 5642 /* Rotate the PV list if it has more than one entry. 
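		 * Rotating spreads the clearing over all of the page's
		 * mappings across successive calls, instead of always
		 * sampling the same pv entry first.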
*/ 5643 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5644 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5645 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5646 pvh->pv_gen++; 5647 } 5648 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5649 goto out; 5650 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5651 small_mappings: 5652 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5653 goto out; 5654 pv = pvf; 5655 do { 5656 if (pvf == NULL) 5657 pvf = pv; 5658 pmap = PV_PMAP(pv); 5659 if (!PMAP_TRYLOCK(pmap)) { 5660 pvh_gen = pvh->pv_gen; 5661 md_gen = m->md.pv_gen; 5662 rw_wunlock(lock); 5663 PMAP_LOCK(pmap); 5664 rw_wlock(lock); 5665 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5666 PMAP_UNLOCK(pmap); 5667 goto retry; 5668 } 5669 } 5670 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5671 tpte = pmap_load(pte); 5672 if (pmap_pte_dirty(pmap, tpte)) 5673 vm_page_dirty(m); 5674 if ((tpte & ATTR_AF) != 0) { 5675 if ((tpte & ATTR_SW_WIRED) == 0) { 5676 pmap_clear_bits(pte, ATTR_AF); 5677 pmap_invalidate_page(pmap, pv->pv_va, true); 5678 cleared++; 5679 } else 5680 not_cleared++; 5681 } 5682 PMAP_UNLOCK(pmap); 5683 /* Rotate the PV list if it has more than one entry. */ 5684 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5685 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5686 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5687 m->md.pv_gen++; 5688 } 5689 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5690 not_cleared < PMAP_TS_REFERENCED_MAX); 5691 out: 5692 rw_wunlock(lock); 5693 vm_page_free_pages_toq(&free, true); 5694 return (cleared + not_cleared); 5695 } 5696 5697 /* 5698 * Apply the given advice to the specified range of addresses within the 5699 * given pmap. Depending on the advice, clear the referenced and/or 5700 * modified flags in each mapping and set the mapped page's dirty field. 5701 */ 5702 void 5703 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5704 { 5705 struct rwlock *lock; 5706 vm_offset_t va, va_next; 5707 vm_page_t m; 5708 pd_entry_t *l0, *l1, *l2, oldl2; 5709 pt_entry_t *l3, oldl3; 5710 5711 PMAP_ASSERT_STAGE1(pmap); 5712 5713 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5714 return; 5715 5716 PMAP_LOCK(pmap); 5717 for (; sva < eva; sva = va_next) { 5718 l0 = pmap_l0(pmap, sva); 5719 if (pmap_load(l0) == 0) { 5720 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 5721 if (va_next < sva) 5722 va_next = eva; 5723 continue; 5724 } 5725 5726 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 5727 if (va_next < sva) 5728 va_next = eva; 5729 l1 = pmap_l0_to_l1(l0, sva); 5730 if (pmap_load(l1) == 0) 5731 continue; 5732 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 5733 KASSERT(va_next <= eva, 5734 ("partial update of non-transparent 1G page " 5735 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 5736 pmap_load(l1), sva, eva, va_next)); 5737 continue; 5738 } 5739 5740 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 5741 if (va_next < sva) 5742 va_next = eva; 5743 l2 = pmap_l1_to_l2(l1, sva); 5744 oldl2 = pmap_load(l2); 5745 if (oldl2 == 0) 5746 continue; 5747 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5748 if ((oldl2 & ATTR_SW_MANAGED) == 0) 5749 continue; 5750 lock = NULL; 5751 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 5752 if (lock != NULL) 5753 rw_wunlock(lock); 5754 5755 /* 5756 * The 2MB page mapping was destroyed. 5757 */ 5758 continue; 5759 } 5760 5761 /* 5762 * Unless the page mappings are wired, remove the 5763 * mapping to a single page so that a subsequent 5764 * access may repromote. 
Choosing the last page 5765 * within the address range [sva, min(va_next, eva)) 5766 * generally results in more repromotions. Since the 5767 * underlying page table page is fully populated, this 5768 * removal never frees a page table page. 5769 */ 5770 if ((oldl2 & ATTR_SW_WIRED) == 0) { 5771 va = eva; 5772 if (va > va_next) 5773 va = va_next; 5774 va -= PAGE_SIZE; 5775 KASSERT(va >= sva, 5776 ("pmap_advise: no address gap")); 5777 l3 = pmap_l2_to_l3(l2, va); 5778 KASSERT(pmap_load(l3) != 0, 5779 ("pmap_advise: invalid PTE")); 5780 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 5781 NULL, &lock); 5782 } 5783 if (lock != NULL) 5784 rw_wunlock(lock); 5785 } 5786 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 5787 ("pmap_advise: invalid L2 entry after demotion")); 5788 if (va_next > eva) 5789 va_next = eva; 5790 va = va_next; 5791 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 5792 sva += L3_SIZE) { 5793 oldl3 = pmap_load(l3); 5794 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 5795 (ATTR_SW_MANAGED | L3_PAGE)) 5796 goto maybe_invlrng; 5797 else if (pmap_pte_dirty(pmap, oldl3)) { 5798 if (advice == MADV_DONTNEED) { 5799 /* 5800 * Future calls to pmap_is_modified() 5801 * can be avoided by making the page 5802 * dirty now. 5803 */ 5804 m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); 5805 vm_page_dirty(m); 5806 } 5807 while (!atomic_fcmpset_long(l3, &oldl3, 5808 (oldl3 & ~ATTR_AF) | 5809 ATTR_S1_AP(ATTR_S1_AP_RO))) 5810 cpu_spinwait(); 5811 } else if ((oldl3 & ATTR_AF) != 0) 5812 pmap_clear_bits(l3, ATTR_AF); 5813 else 5814 goto maybe_invlrng; 5815 if (va == va_next) 5816 va = sva; 5817 continue; 5818 maybe_invlrng: 5819 if (va != va_next) { 5820 pmap_invalidate_range(pmap, va, sva, true); 5821 va = va_next; 5822 } 5823 } 5824 if (va != va_next) 5825 pmap_invalidate_range(pmap, va, sva, true); 5826 } 5827 PMAP_UNLOCK(pmap); 5828 } 5829 5830 /* 5831 * Clear the modify bits on the specified physical page. 5832 */ 5833 void 5834 pmap_clear_modify(vm_page_t m) 5835 { 5836 struct md_page *pvh; 5837 struct rwlock *lock; 5838 pmap_t pmap; 5839 pv_entry_t next_pv, pv; 5840 pd_entry_t *l2, oldl2; 5841 pt_entry_t *l3, oldl3; 5842 vm_offset_t va; 5843 int md_gen, pvh_gen; 5844 5845 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5846 ("pmap_clear_modify: page %p is not managed", m)); 5847 vm_page_assert_busied(m); 5848 5849 if (!pmap_page_is_write_mapped(m)) 5850 return; 5851 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5852 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5853 rw_wlock(lock); 5854 restart: 5855 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5856 pmap = PV_PMAP(pv); 5857 PMAP_ASSERT_STAGE1(pmap); 5858 if (!PMAP_TRYLOCK(pmap)) { 5859 pvh_gen = pvh->pv_gen; 5860 rw_wunlock(lock); 5861 PMAP_LOCK(pmap); 5862 rw_wlock(lock); 5863 if (pvh_gen != pvh->pv_gen) { 5864 PMAP_UNLOCK(pmap); 5865 goto restart; 5866 } 5867 } 5868 va = pv->pv_va; 5869 l2 = pmap_l2(pmap, va); 5870 oldl2 = pmap_load(l2); 5871 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 5872 if ((oldl2 & ATTR_SW_DBM) != 0 && 5873 pmap_demote_l2_locked(pmap, l2, va, &lock) && 5874 (oldl2 & ATTR_SW_WIRED) == 0) { 5875 /* 5876 * Write protect the mapping to a single page so that 5877 * a subsequent write access may repromote. 
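			 * Only this 4KB mapping is write protected; the
			 * other mappings produced by the demotion retain
			 * their original protections.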
5878 */ 5879 va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); 5880 l3 = pmap_l2_to_l3(l2, va); 5881 oldl3 = pmap_load(l3); 5882 while (!atomic_fcmpset_long(l3, &oldl3, 5883 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 5884 cpu_spinwait(); 5885 vm_page_dirty(m); 5886 pmap_invalidate_page(pmap, va, true); 5887 } 5888 PMAP_UNLOCK(pmap); 5889 } 5890 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5891 pmap = PV_PMAP(pv); 5892 PMAP_ASSERT_STAGE1(pmap); 5893 if (!PMAP_TRYLOCK(pmap)) { 5894 md_gen = m->md.pv_gen; 5895 pvh_gen = pvh->pv_gen; 5896 rw_wunlock(lock); 5897 PMAP_LOCK(pmap); 5898 rw_wlock(lock); 5899 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5900 PMAP_UNLOCK(pmap); 5901 goto restart; 5902 } 5903 } 5904 l2 = pmap_l2(pmap, pv->pv_va); 5905 l3 = pmap_l2_to_l3(l2, pv->pv_va); 5906 oldl3 = pmap_load(l3); 5907 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ 5908 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 5909 pmap_invalidate_page(pmap, pv->pv_va, true); 5910 } 5911 PMAP_UNLOCK(pmap); 5912 } 5913 rw_wunlock(lock); 5914 } 5915 5916 void * 5917 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5918 { 5919 struct pmap_preinit_mapping *ppim; 5920 vm_offset_t va, offset; 5921 pd_entry_t *pde; 5922 pt_entry_t *l2; 5923 int i, lvl, l2_blocks, free_l2_count, start_idx; 5924 5925 if (!vm_initialized) { 5926 /* 5927 * No L3 ptables so map entire L2 blocks where start VA is: 5928 * preinit_map_va + start_idx * L2_SIZE 5929 * There may be duplicate mappings (multiple VA -> same PA) but 5930 * ARM64 dcache is always PIPT so that's acceptable. 5931 */ 5932 if (size == 0) 5933 return (NULL); 5934 5935 /* Calculate how many L2 blocks are needed for the mapping */ 5936 l2_blocks = (roundup2(pa + size, L2_SIZE) - 5937 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 5938 5939 offset = pa & L2_OFFSET; 5940 5941 if (preinit_map_va == 0) 5942 return (NULL); 5943 5944 /* Map 2MiB L2 blocks from reserved VA space */ 5945 5946 free_l2_count = 0; 5947 start_idx = -1; 5948 /* Find enough free contiguous VA space */ 5949 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5950 ppim = pmap_preinit_mapping + i; 5951 if (free_l2_count > 0 && ppim->pa != 0) { 5952 /* Not enough space here */ 5953 free_l2_count = 0; 5954 start_idx = -1; 5955 continue; 5956 } 5957 5958 if (ppim->pa == 0) { 5959 /* Free L2 block */ 5960 if (start_idx == -1) 5961 start_idx = i; 5962 free_l2_count++; 5963 if (free_l2_count == l2_blocks) 5964 break; 5965 } 5966 } 5967 if (free_l2_count != l2_blocks) 5968 panic("%s: too many preinit mappings", __func__); 5969 5970 va = preinit_map_va + (start_idx * L2_SIZE); 5971 for (i = start_idx; i < start_idx + l2_blocks; i++) { 5972 /* Mark entries as allocated */ 5973 ppim = pmap_preinit_mapping + i; 5974 ppim->pa = pa; 5975 ppim->va = va + offset; 5976 ppim->size = size; 5977 } 5978 5979 /* Map L2 blocks */ 5980 pa = rounddown2(pa, L2_SIZE); 5981 for (i = 0; i < l2_blocks; i++) { 5982 pde = pmap_pde(kernel_pmap, va, &lvl); 5983 KASSERT(pde != NULL, 5984 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 5985 va)); 5986 KASSERT(lvl == 1, 5987 ("pmap_mapbios: Invalid level %d", lvl)); 5988 5989 /* Insert L2_BLOCK */ 5990 l2 = pmap_l1_to_l2(pde, va); 5991 pmap_load_store(l2, 5992 pa | ATTR_DEFAULT | ATTR_S1_XN | 5993 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 5994 5995 va += L2_SIZE; 5996 pa += L2_SIZE; 5997 } 5998 pmap_invalidate_all(kernel_pmap); 5999 6000 va = preinit_map_va + (start_idx * L2_SIZE); 6001 6002 } else { 6003 /* kva_alloc may be used to map the pages */ 6004 offset = pa 
& PAGE_MASK; 6005 size = round_page(offset + size); 6006 6007 va = kva_alloc(size); 6008 if (va == 0) 6009 panic("%s: Couldn't allocate KVA", __func__); 6010 6011 pde = pmap_pde(kernel_pmap, va, &lvl); 6012 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 6013 6014 /* L3 table is linked */ 6015 va = trunc_page(va); 6016 pa = trunc_page(pa); 6017 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 6018 } 6019 6020 return ((void *)(va + offset)); 6021 } 6022 6023 void 6024 pmap_unmapbios(vm_offset_t va, vm_size_t size) 6025 { 6026 struct pmap_preinit_mapping *ppim; 6027 vm_offset_t offset, tmpsize, va_trunc; 6028 pd_entry_t *pde; 6029 pt_entry_t *l2; 6030 int i, lvl, l2_blocks, block; 6031 bool preinit_map; 6032 6033 l2_blocks = 6034 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 6035 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 6036 6037 /* Remove preinit mapping */ 6038 preinit_map = false; 6039 block = 0; 6040 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6041 ppim = pmap_preinit_mapping + i; 6042 if (ppim->va == va) { 6043 KASSERT(ppim->size == size, 6044 ("pmap_unmapbios: size mismatch")); 6045 ppim->va = 0; 6046 ppim->pa = 0; 6047 ppim->size = 0; 6048 preinit_map = true; 6049 offset = block * L2_SIZE; 6050 va_trunc = rounddown2(va, L2_SIZE) + offset; 6051 6052 /* Remove L2_BLOCK */ 6053 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 6054 KASSERT(pde != NULL, 6055 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 6056 va_trunc)); 6057 l2 = pmap_l1_to_l2(pde, va_trunc); 6058 pmap_clear(l2); 6059 6060 if (block == (l2_blocks - 1)) 6061 break; 6062 block++; 6063 } 6064 } 6065 if (preinit_map) { 6066 pmap_invalidate_all(kernel_pmap); 6067 return; 6068 } 6069 6070 /* Unmap the pages reserved with kva_alloc. */ 6071 if (vm_initialized) { 6072 offset = va & PAGE_MASK; 6073 size = round_page(offset + size); 6074 va = trunc_page(va); 6075 6076 pde = pmap_pde(kernel_pmap, va, &lvl); 6077 KASSERT(pde != NULL, 6078 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); 6079 KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); 6080 6081 /* Unmap and invalidate the pages */ 6082 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6083 pmap_kremove(va + tmpsize); 6084 6085 kva_free(va, size); 6086 } 6087 } 6088 6089 /* 6090 * Sets the memory attribute for the specified page. 6091 */ 6092 void 6093 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6094 { 6095 6096 m->md.pv_memattr = ma; 6097 6098 /* 6099 * If "m" is a normal page, update its direct mapping. This update 6100 * can be relied upon to perform any cache operations that are 6101 * required for data coherence. 6102 */ 6103 if ((m->flags & PG_FICTITIOUS) == 0 && 6104 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6105 m->md.pv_memattr) != 0) 6106 panic("memory attribute change on the direct map failed"); 6107 } 6108 6109 /* 6110 * Changes the specified virtual address range's memory type to that given by 6111 * the parameter "mode". The specified virtual address range must be 6112 * completely contained within either the direct map or the kernel map. If 6113 * the virtual address range is contained within the kernel map, then the 6114 * memory type for each of the corresponding ranges of the direct map is also 6115 * changed. (The corresponding ranges of the direct map are those ranges that 6116 * map the same physical pages as the specified virtual address range.) 
These
 * changes to the direct map are necessary because the architecture
 * leaves the behavior undefined when two or more mappings to the same
 * physical page have different memory types.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.  In the
 * latter case, the memory type may have been changed on some part of the
 * virtual address range or the direct map.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	int error;

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Changes the specified virtual address range's protections to those
 * specified by "prot".  Like pmap_change_attr(), protections for aliases
 * in the direct map are updated as well.  Protections on aliasing mappings
 * may be a subset of the requested protections; for example, mappings in
 * the direct map are never executable.
 */
int
pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
{
	int error;

	/* Only supported within the kernel map. */
	if (va < VM_MIN_KERNEL_ADDRESS)
		return (EINVAL);

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(va, size, prot, -1, false);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

static int
pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
    int mode, bool skip_unmapped)
{
	vm_offset_t base, offset, tmpva;
	vm_size_t pte_size;
	vm_paddr_t pa;
	pt_entry_t pte, *ptep, *newpte;
	pt_entry_t bits, mask;
	int lvl, rv;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	if (!VIRT_IN_DMAP(base) &&
	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
		return (EINVAL);

	bits = 0;
	mask = 0;
	if (mode != -1) {
		bits = ATTR_S1_IDX(mode);
		mask = ATTR_S1_IDX_MASK;
		if (mode == VM_MEMATTR_DEVICE) {
			mask |= ATTR_S1_XN;
			bits |= ATTR_S1_XN;
		}
	}
	if (prot != VM_PROT_NONE) {
		/* Don't mark the DMAP as executable. It never is on arm64. */
		if (VIRT_IN_DMAP(base)) {
			prot &= ~VM_PROT_EXECUTE;
			/*
			 * XXX Mark the DMAP as writable for now. We rely
			 * on this in ddb & dtrace to insert breakpoint
			 * instructions.
			 */
			prot |= VM_PROT_WRITE;
		}

		if ((prot & VM_PROT_WRITE) == 0) {
			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
		}
		if ((prot & VM_PROT_EXECUTE) == 0) {
			bits |= ATTR_S1_PXN;
		}
		bits |= ATTR_S1_UXN;
		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
	}

	for (tmpva = base; tmpva < base + size; ) {
		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
		if (ptep == NULL && !skip_unmapped) {
			return (EINVAL);
		} else if ((ptep == NULL && skip_unmapped) ||
		    (pmap_load(ptep) & mask) == bits) {
			/*
			 * We already have the correct attribute or there
			 * is no memory mapped at this address and we are
			 * skipping unmapped memory.
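			 * In either case, advance "tmpva" to the start of
			 * the next entry at the level where the lookup
			 * stopped.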
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/*
			 * Split the entry to a level 3 table, then
			 * set the new attribute.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				pte_size = PAGE_SIZE;
				break;
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
			    pte_size);

			pa = pte & ~ATTR_MASK;
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				/*
				 * Keep the DMAP memory in sync.
				 */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry, flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range(tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}

/*
 * Create an L2 table to map all addresses within an L1 mapping.
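 * The resulting table replaces the single 1GB block entry with 512 2MB
 * block entries that carry the same attributes, so the demotion changes
 * only the granularity of the mapping, not what is mapped.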
 */
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
	pt_entry_t *l2, newl2, oldl1;
	vm_offset_t tmpl1;
	vm_paddr_t l2phys, phys;
	vm_page_t ml2;
	int i;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldl1 = pmap_load(l1);
	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
	    ("pmap_demote_l1: Demoting a non-block entry"));
	KASSERT((va & L1_OFFSET) == 0,
	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));

	tmpl1 = 0;
	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
		tmpl1 = kva_alloc(PAGE_SIZE);
		if (tmpl1 == 0)
			return (NULL);
	}

	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
	    NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
		    " in pmap %p", va, pmap);
		l2 = NULL;
		goto fail;
	}

	l2phys = VM_PAGE_TO_PHYS(ml2);
	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);

	/* The address that the range points at */
	phys = oldl1 & ~ATTR_MASK;
	/* The attributes from the old l1 table to be copied */
	newl2 = oldl1 & ATTR_MASK;

	/* Create the new entries */
	for (i = 0; i < Ln_ENTRIES; i++) {
		l2[i] = newl2 | phys;
		phys += L2_SIZE;
	}
	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
	    ("Invalid l2 page (%lx != %lx)", l2[0],
	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));

	if (tmpl1 != 0) {
		pmap_kenter(tmpl1, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
	}

	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);

fail:
	if (tmpl1 != 0) {
		pmap_kremove(tmpl1);
		kva_free(tmpl1, PAGE_SIZE);
	}

	return (l2);
}

static void
pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
{
	pt_entry_t *l3;

	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
		*l3 = newl3;
		newl3 += L3_SIZE;
	}
}

static void
pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
    struct rwlock **lockp)
{
	struct spglist free;

	SLIST_INIT(&free);
	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
	    lockp);
	vm_page_free_pages_toq(&free, true);
}

/*
 * Create an L3 table to map all addresses within an L2 mapping.
 */
static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
    struct rwlock **lockp)
{
	pt_entry_t *l3, newl3, oldl2;
	vm_offset_t tmpl2;
	vm_paddr_t l3phys;
	vm_page_t ml3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	l3 = NULL;
	oldl2 = pmap_load(l2);
	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("pmap_demote_l2: Demoting a non-block entry"));
	va &= ~L2_OFFSET;

	tmpl2 = 0;
	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
		tmpl2 = kva_alloc(PAGE_SIZE);
		if (tmpl2 == 0)
			return (NULL);
	}

	/*
	 * Invalidate the 2MB page mapping and return "failure" if the
	 * mapping was never accessed.
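	 * A mapping that lacks ATTR_AF cannot be wired, because wired
	 * mappings always have their access flag set; the KASSERT below
	 * relies on this invariant, and it is what makes destroying the
	 * unaccessed mapping safe.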
6434 */ 6435 if ((oldl2 & ATTR_AF) == 0) { 6436 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6437 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 6438 pmap_demote_l2_abort(pmap, va, l2, lockp); 6439 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 6440 va, pmap); 6441 goto fail; 6442 } 6443 6444 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 6445 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6446 ("pmap_demote_l2: page table page for a wired mapping" 6447 " is missing")); 6448 6449 /* 6450 * If the page table page is missing and the mapping 6451 * is for a kernel address, the mapping must belong to 6452 * either the direct map or the early kernel memory. 6453 * Page table pages are preallocated for every other 6454 * part of the kernel address space, so the direct map 6455 * region and early kernel memory are the only parts of the 6456 * kernel address space that must be handled here. 6457 */ 6458 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || 6459 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), 6460 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 6461 6462 /* 6463 * If the 2MB page mapping belongs to the direct map 6464 * region of the kernel's address space, then the page 6465 * allocation request specifies the highest possible 6466 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6467 * priority is normal. 6468 */ 6469 ml3 = vm_page_alloc_noobj( 6470 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 6471 VM_ALLOC_WIRED); 6472 6473 /* 6474 * If the allocation of the new page table page fails, 6475 * invalidate the 2MB page mapping and return "failure". 6476 */ 6477 if (ml3 == NULL) { 6478 pmap_demote_l2_abort(pmap, va, l2, lockp); 6479 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 6480 " in pmap %p", va, pmap); 6481 goto fail; 6482 } 6483 ml3->pindex = pmap_l2_pindex(va); 6484 6485 if (!ADDR_IS_KERNEL(va)) { 6486 ml3->ref_count = NL3PG; 6487 pmap_resident_count_inc(pmap, 1); 6488 } 6489 } 6490 l3phys = VM_PAGE_TO_PHYS(ml3); 6491 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 6492 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 6493 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 6494 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 6495 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 6496 6497 /* 6498 * If the page table page is not leftover from an earlier promotion, 6499 * or the mapping attributes have changed, (re)initialize the L3 table. 6500 * 6501 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 6502 * performs a dsb(). That dsb() ensures that the stores for filling 6503 * "l3" are visible before "l3" is added to the page table. 6504 */ 6505 if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) 6506 pmap_fill_l3(l3, newl3); 6507 6508 /* 6509 * Map the temporary page so we don't lose access to the l2 table. 6510 */ 6511 if (tmpl2 != 0) { 6512 pmap_kenter(tmpl2, PAGE_SIZE, 6513 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 6514 VM_MEMATTR_WRITE_BACK); 6515 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 6516 } 6517 6518 /* 6519 * The spare PV entries must be reserved prior to demoting the 6520 * mapping, that is, prior to changing the PDE. Otherwise, the state 6521 * of the L2 and the PV lists will be inconsistent, which can result 6522 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6523 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 6524 * PV entry for the 2MB page mapping that is being demoted. 
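	 * Ln_ENTRIES - 1 entries suffice because the existing PV entry for
	 * the 2MB mapping is converted in place into the PV entry for the
	 * first of the 4KB mappings.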
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);

	/*
	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
	 * the 2MB page mapping.
	 */
	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);

	/*
	 * Demote the PV entry.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);

	atomic_add_long(&pmap_l2_demotions, 1);
	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
	    " in pmap %p %lx", va, pmap, l3[0]);

fail:
	if (tmpl2 != 0) {
		pmap_kremove(tmpl2);
		kva_free(tmpl2, PAGE_SIZE);
	}

	return (l3);
}

static pt_entry_t *
pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
	struct rwlock *lock;
	pt_entry_t *l3;

	lock = NULL;
	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	return (l3);
}

/*
 * Perform the pmap work for mincore(2).  If the page is not both referenced
 * and modified by this pmap, returns its physical address so that the caller
 * can find other mappings.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
	pt_entry_t *pte, tpte;
	vm_paddr_t mask, pa;
	int lvl, val;
	bool managed;

	PMAP_ASSERT_STAGE1(pmap);
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, addr, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);

		switch (lvl) {
		case 3:
			mask = L3_OFFSET;
			break;
		case 2:
			mask = L2_OFFSET;
			break;
		case 1:
			mask = L1_OFFSET;
			break;
		default:
			panic("pmap_mincore: invalid level %d", lvl);
		}

		managed = (tpte & ATTR_SW_MANAGED) != 0;
		val = MINCORE_INCORE;
		if (lvl != 3)
			val |= MINCORE_PSIND(3 - lvl);
		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((tpte & ATTR_AF) == ATTR_AF)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;

		pa = (tpte & ~ATTR_MASK) | (addr & mask);
	} else {
		managed = false;
		val = 0;
	}

	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
		*pap = pa;
	}
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Garbage collect every ASID that is neither active on a processor nor
 * reserved.
 */
static void
pmap_reset_asid_set(pmap_t pmap)
{
	pmap_t curpmap;
	int asid, cpuid, epoch;
	struct asid_set *set;
	enum pmap_stage stage;

	set = pmap->pm_asid_set;
	stage = pmap->pm_stage;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
	mtx_assert(&set->asid_set_mutex, MA_OWNED);

	/*
	 * Ensure that the store to asid_epoch is globally visible before the
	 * loads from pc_curpmap are performed.
	 */
	epoch = set->asid_epoch + 1;
	if (epoch == INT_MAX)
		epoch = 0;
	set->asid_epoch = epoch;
	dsb(ishst);
	if (stage == PM_STAGE1) {
		__asm __volatile("tlbi vmalle1is");
	} else {
		KASSERT(pmap_clean_stage2_tlbi != NULL,
		    ("%s: Unset stage 2 tlb invalidation callback\n",
		    __func__));
		pmap_clean_stage2_tlbi();
	}
	dsb(ish);
	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
	    set->asid_set_size - 1);
	CPU_FOREACH(cpuid) {
		if (cpuid == curcpu)
			continue;
		if (stage == PM_STAGE1) {
			curpmap = pcpu_find(cpuid)->pc_curpmap;
			PMAP_ASSERT_STAGE1(pmap);
		} else {
			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
			if (curpmap == NULL)
				continue;
			PMAP_ASSERT_STAGE2(pmap);
		}
		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
		if (asid == -1)
			continue;
		bit_set(set->asid_set, asid);
		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
	}
}

/*
 * Allocate a new ASID for the specified pmap.
 */
static void
pmap_alloc_asid(pmap_t pmap)
{
	struct asid_set *set;
	int new_asid;

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	mtx_lock_spin(&set->asid_set_mutex);

	/*
	 * While this processor was waiting to acquire the asid set mutex,
	 * pmap_reset_asid_set() running on another processor might have
	 * updated this pmap's cookie to the current epoch, in which case
	 * there is no need to allocate a new ASID.
	 */
	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
		goto out;

	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
	    &new_asid);
	if (new_asid == -1) {
		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
		    set->asid_next, &new_asid);
		if (new_asid == -1) {
			pmap_reset_asid_set(pmap);
			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
			    set->asid_set_size, &new_asid);
			KASSERT(new_asid != -1, ("ASID allocation failure"));
		}
	}
	bit_set(set->asid_set, new_asid);
	set->asid_next = new_asid + 1;
	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
out:
	mtx_unlock_spin(&set->asid_set_mutex);
}

/*
 * Compute the value that should be stored in ttbr0 to activate the specified
 * pmap.  This value may change from time to time.
 */
uint64_t
pmap_to_ttbr0(pmap_t pmap)
{

	return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) |
	    pmap->pm_ttbr);
}

static bool
pmap_activate_int(pmap_t pmap)
{
	struct asid_set *set;
	int epoch;

	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));

	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
		/*
		 * Handle the possibility that the old thread was preempted
		 * after an "ic" or "tlbi" instruction but before it performed
		 * a "dsb" instruction.  If the old thread migrates to a new
		 * processor, its completion of a "dsb" instruction on that
		 * new processor does not guarantee that the "ic" or "tlbi"
		 * instructions performed on the old processor have completed.
		 */
		dsb(ish);
		return (false);
	}

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Ensure that the store to curpmap is globally visible before the
	 * load from asid_epoch is performed.
	 */
	if (pmap->pm_stage == PM_STAGE1)
		PCPU_SET(curpmap, pmap);
	else
		PCPU_SET(curvmpmap, pmap);
	dsb(ish);
	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
	if (epoch >= 0 && epoch != set->asid_epoch)
		pmap_alloc_asid(pmap);

	if (pmap->pm_stage == PM_STAGE1) {
		set_ttbr0(pmap_to_ttbr0(pmap));
		if (PCPU_GET(bcast_tlbi_workaround) != 0)
			invalidate_local_icache();
	}
	return (true);
}

void
pmap_activate_vm(pmap_t pmap)
{

	PMAP_ASSERT_STAGE2(pmap);

	(void)pmap_activate_int(pmap);
}

void
pmap_activate(struct thread *td)
{
	pmap_t pmap;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	PMAP_ASSERT_STAGE1(pmap);
	critical_enter();
	(void)pmap_activate_int(pmap);
	critical_exit();
}

/*
 * Activate the thread we are switching to.
 * To simplify the assembly in cpu_throw, return the new thread's pcb.
 */
struct pcb *
pmap_switch(struct thread *new)
{
	pcpu_bp_harden bp_harden;
	struct pcb *pcb;

	/* Store the new curthread */
	PCPU_SET(curthread, new);
#if defined(PERTHREAD_SSP)
	/* Set the new thread's SSP canary */
	__asm("msr sp_el0, %0" :: "r"(&new->td_md.md_canary));
#endif

	/* And the new pcb */
	pcb = new->td_pcb;
	PCPU_SET(curpcb, pcb);

	/*
	 * TODO: We may need to flush the cache here if switching
	 * to a user process.
	 */

	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
		/*
		 * Stop userspace from training the branch predictor against
		 * other processes.  This will call into a CPU specific
		 * function that clears the branch predictor state.
		 */
		bp_harden = PCPU_GET(bp_harden);
		if (bp_harden != NULL)
			bp_harden();
	}

	return (pcb);
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{

	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	if (ADDR_IS_KERNEL(va)) {
		cpu_icache_sync_range(va, sz);
	} else {
		u_int len, offset;
		vm_paddr_t pa;

		/* Find the length of data in this page to flush */
		offset = va & PAGE_MASK;
		len = imin(PAGE_SIZE - offset, sz);

		while (sz != 0) {
			/* Extract the physical address & find it in the DMAP */
			pa = pmap_extract(pmap, va);
			if (pa != 0)
				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);

			/* Move to the next page */
			sz -= len;
			va += len;
			/* Set the length for the next iteration */
			len = imin(PAGE_SIZE, sz);
		}
	}
}

static int
pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	int rv, lvl, dfsc;

	PMAP_ASSERT_STAGE2(pmap);
	rv = KERN_FAILURE;

	/* Data and insn aborts use same encoding for FSC field.
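	 * Only translation faults and access flag faults are handled below;
	 * any other fault status leaves "rv" as KERN_FAILURE.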
	 */
	dfsc = esr & ISS_DATA_DFSC_MASK;
	switch (dfsc) {
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		PMAP_LOCK(pmap);
		pdep = pmap_pde(pmap, far, &lvl);
		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
			PMAP_UNLOCK(pmap);
			break;
		}

		switch (lvl) {
		case 0:
			ptep = pmap_l0_to_l1(pdep, far);
			break;
		case 1:
			ptep = pmap_l1_to_l2(pdep, far);
			break;
		case 2:
			ptep = pmap_l2_to_l3(pdep, far);
			break;
		default:
			panic("%s: Invalid pde level %d", __func__, lvl);
		}
		goto fault_exec;

	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
fault_exec:
		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
			if (icache_vmid) {
				pmap_invalidate_vpipt_icache();
			} else {
				/*
				 * If accessing an executable page invalidate
				 * the I-cache so it will be valid when we
				 * continue execution in the guest. The D-cache
				 * is assumed to already be clean to the Point
				 * of Coherency.
				 */
				if ((pte & ATTR_S2_XN_MASK) !=
				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
					invalidate_icache();
				}
			}
			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	}

	return (rv);
}

int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use same encoding for FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_invalidate_page(pmap, far, true);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation.  A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section.  Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
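			 * pmap_klookup() is used because it walks the
			 * kernel page tables without acquiring any locks.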
			 */
			if (pmap_klookup(far, NULL))
				rv = KERN_SUCCESS;
		} else {
			PMAP_LOCK(pmap);
			/* Ask the MMU to check the address. */
			intr = intr_disable();
			par = arm64_address_translate_s1e0r(far);
			intr_restore(intr);
			PMAP_UNLOCK(pmap);

			/*
			 * If the translation was successful, then we can
			 * return success to the trap handler.
			 */
			if (PAR_SUCCESS(par))
				rv = KERN_SUCCESS;
		}
		break;
	}

	return (rv);
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < L2_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L2_OFFSET;
	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
	    (*addr & L2_OFFSET) == superpage_offset)
		return;
	if ((*addr & L2_OFFSET) < superpage_offset)
		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
	else
		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}

/**
 * Get the kernel virtual address of a set of physical pages.  If there are
 * physical addresses not covered by the DMAP, perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page       The pages whose kernel virtual addresses the caller
 *                   wishes to obtain.
 * \param vaddr      On return contains the kernel virtual memory address
 *                   of the pages passed in the page parameter.
 * \param count      Number of pages passed in.
 * \param can_fault  TRUE if the thread using the mapped pages can take
 *                   page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient when
 *          finished or FALSE otherwise.
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	int error __diagused, i;

	/*
	 * Allocate any KVA space that we need; this is done in a separate
	 * loop to prevent calling vmem_alloc while pinned.
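	 * vmem_alloc() with M_WAITOK may sleep, and sleeping is not
	 * permitted while the thread is pinned by sched_pin().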
7100 */ 7101 needs_mapping = FALSE; 7102 for (i = 0; i < count; i++) { 7103 paddr = VM_PAGE_TO_PHYS(page[i]); 7104 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 7105 error = vmem_alloc(kernel_arena, PAGE_SIZE, 7106 M_BESTFIT | M_WAITOK, &vaddr[i]); 7107 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 7108 needs_mapping = TRUE; 7109 } else { 7110 vaddr[i] = PHYS_TO_DMAP(paddr); 7111 } 7112 } 7113 7114 /* Exit early if everything is covered by the DMAP */ 7115 if (!needs_mapping) 7116 return (FALSE); 7117 7118 if (!can_fault) 7119 sched_pin(); 7120 for (i = 0; i < count; i++) { 7121 paddr = VM_PAGE_TO_PHYS(page[i]); 7122 if (!PHYS_IN_DMAP(paddr)) { 7123 panic( 7124 "pmap_map_io_transient: TODO: Map out of DMAP data"); 7125 } 7126 } 7127 7128 return (needs_mapping); 7129 } 7130 7131 void 7132 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7133 boolean_t can_fault) 7134 { 7135 vm_paddr_t paddr; 7136 int i; 7137 7138 if (!can_fault) 7139 sched_unpin(); 7140 for (i = 0; i < count; i++) { 7141 paddr = VM_PAGE_TO_PHYS(page[i]); 7142 if (!PHYS_IN_DMAP(paddr)) { 7143 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 7144 } 7145 } 7146 } 7147 7148 boolean_t 7149 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 7150 { 7151 7152 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 7153 } 7154 7155 /* 7156 * Track a range of the kernel's virtual address space that is contiguous 7157 * in various mapping attributes. 7158 */ 7159 struct pmap_kernel_map_range { 7160 vm_offset_t sva; 7161 pt_entry_t attrs; 7162 int l3pages; 7163 int l3contig; 7164 int l2blocks; 7165 int l1blocks; 7166 }; 7167 7168 static void 7169 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 7170 vm_offset_t eva) 7171 { 7172 const char *mode; 7173 int index; 7174 7175 if (eva <= range->sva) 7176 return; 7177 7178 index = range->attrs & ATTR_S1_IDX_MASK; 7179 switch (index) { 7180 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 7181 mode = "DEV"; 7182 break; 7183 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 7184 mode = "UC"; 7185 break; 7186 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 7187 mode = "WB"; 7188 break; 7189 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 7190 mode = "WT"; 7191 break; 7192 default: 7193 printf( 7194 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 7195 __func__, index, range->sva, eva); 7196 mode = "??"; 7197 break; 7198 } 7199 7200 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %3s %d %d %d %d\n", 7201 range->sva, eva, 7202 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 7203 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 7204 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 7205 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 7206 mode, range->l1blocks, range->l2blocks, range->l3contig, 7207 range->l3pages); 7208 7209 /* Reset to sentinel value. */ 7210 range->sva = 0xfffffffffffffffful; 7211 } 7212 7213 /* 7214 * Determine whether the attributes specified by a page table entry match those 7215 * being tracked by the current range. 
7216 */ 7217 static bool 7218 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 7219 { 7220 7221 return (range->attrs == attrs); 7222 } 7223 7224 static void 7225 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 7226 pt_entry_t attrs) 7227 { 7228 7229 memset(range, 0, sizeof(*range)); 7230 range->sva = va; 7231 range->attrs = attrs; 7232 } 7233 7234 /* Get the block/page attributes that correspond to the table attributes */ 7235 static pt_entry_t 7236 sysctl_kmaps_table_attrs(pd_entry_t table) 7237 { 7238 pt_entry_t attrs; 7239 7240 attrs = 0; 7241 if ((table & TATTR_UXN_TABLE) != 0) 7242 attrs |= ATTR_S1_UXN; 7243 if ((table & TATTR_PXN_TABLE) != 0) 7244 attrs |= ATTR_S1_PXN; 7245 if ((table & TATTR_AP_TABLE_RO) != 0) 7246 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 7247 7248 return (attrs); 7249 } 7250 7251 /* Read the block/page attributes we care about */ 7252 static pt_entry_t 7253 sysctl_kmaps_block_attrs(pt_entry_t block) 7254 { 7255 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK)); 7256 } 7257 7258 /* 7259 * Given a leaf PTE, derive the mapping's attributes. If they do not match 7260 * those of the current run, dump the address range and its attributes, and 7261 * begin a new run. 7262 */ 7263 static void 7264 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 7265 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 7266 pt_entry_t l3e) 7267 { 7268 pt_entry_t attrs; 7269 7270 attrs = sysctl_kmaps_table_attrs(l0e); 7271 7272 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 7273 attrs |= sysctl_kmaps_block_attrs(l1e); 7274 goto done; 7275 } 7276 attrs |= sysctl_kmaps_table_attrs(l1e); 7277 7278 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 7279 attrs |= sysctl_kmaps_block_attrs(l2e); 7280 goto done; 7281 } 7282 attrs |= sysctl_kmaps_table_attrs(l2e); 7283 attrs |= sysctl_kmaps_block_attrs(l3e); 7284 7285 done: 7286 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 7287 sysctl_kmaps_dump(sb, range, va); 7288 sysctl_kmaps_reinit(range, va, attrs); 7289 } 7290 } 7291 7292 static int 7293 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 7294 { 7295 struct pmap_kernel_map_range range; 7296 struct sbuf sbuf, *sb; 7297 pd_entry_t l0e, *l1, l1e, *l2, l2e; 7298 pt_entry_t *l3, l3e; 7299 vm_offset_t sva; 7300 vm_paddr_t pa; 7301 int error, i, j, k, l; 7302 7303 error = sysctl_wire_old_buffer(req, 0); 7304 if (error != 0) 7305 return (error); 7306 sb = &sbuf; 7307 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 7308 7309 /* Sentinel value. */ 7310 range.sva = 0xfffffffffffffffful; 7311 7312 /* 7313 * Iterate over the kernel page tables without holding the kernel pmap 7314 * lock. Kernel page table pages are never freed, so at worst we will 7315 * observe inconsistencies in the output. 
7316 */ 7317 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 7318 i++) { 7319 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 7320 sbuf_printf(sb, "\nDirect map:\n"); 7321 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 7322 sbuf_printf(sb, "\nKernel map:\n"); 7323 7324 l0e = kernel_pmap->pm_l0[i]; 7325 if ((l0e & ATTR_DESCR_VALID) == 0) { 7326 sysctl_kmaps_dump(sb, &range, sva); 7327 sva += L0_SIZE; 7328 continue; 7329 } 7330 pa = l0e & ~ATTR_MASK; 7331 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 7332 7333 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 7334 l1e = l1[j]; 7335 if ((l1e & ATTR_DESCR_VALID) == 0) { 7336 sysctl_kmaps_dump(sb, &range, sva); 7337 sva += L1_SIZE; 7338 continue; 7339 } 7340 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 7341 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 7342 0, 0); 7343 range.l1blocks++; 7344 sva += L1_SIZE; 7345 continue; 7346 } 7347 pa = l1e & ~ATTR_MASK; 7348 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 7349 7350 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 7351 l2e = l2[k]; 7352 if ((l2e & ATTR_DESCR_VALID) == 0) { 7353 sysctl_kmaps_dump(sb, &range, sva); 7354 sva += L2_SIZE; 7355 continue; 7356 } 7357 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 7358 sysctl_kmaps_check(sb, &range, sva, 7359 l0e, l1e, l2e, 0); 7360 range.l2blocks++; 7361 sva += L2_SIZE; 7362 continue; 7363 } 7364 pa = l2e & ~ATTR_MASK; 7365 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 7366 7367 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 7368 l++, sva += L3_SIZE) { 7369 l3e = l3[l]; 7370 if ((l3e & ATTR_DESCR_VALID) == 0) { 7371 sysctl_kmaps_dump(sb, &range, 7372 sva); 7373 continue; 7374 } 7375 sysctl_kmaps_check(sb, &range, sva, 7376 l0e, l1e, l2e, l3e); 7377 if ((l3e & ATTR_CONTIGUOUS) != 0) 7378 range.l3contig += l % 16 == 0 ? 7379 1 : 0; 7380 else 7381 range.l3pages++; 7382 } 7383 } 7384 } 7385 } 7386 7387 error = sbuf_finish(sb); 7388 sbuf_delete(sb); 7389 return (error); 7390 } 7391 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 7392 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 7393 NULL, 0, sysctl_kmaps, "A", 7394 "Dump kernel address layout"); 7395