/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014-2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * This software was developed by Andrew Turner under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>

#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

static struct md_page *
pa_to_pvh(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return ((struct md_page *)seg->md_first +
			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
	}
	panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
}

static struct md_page *
page_to_pvh(vm_page_t m)
{
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return ((struct md_page *)seg->md_first +
	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
}

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
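
/*
 * Example (illustrative, not from the original sources): callers that walk
 * pv lists keep a single cursor lock and retarget it with these macros as
 * they cross from one lock domain to another:
 *
 *	struct rwlock *lock = NULL;
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);  (lock m's pv list)
 *	... mutate the pv list ...
 *	RELEASE_PV_LIST_LOCK(&lock);               (unlock, reset to NULL)
 *
 * Because PHYS_TO_PV_LIST_LOCK() hashes the physical address into one of
 * NPV_LIST_LOCKS buckets, distinct pages may share a lock; the macros only
 * switch locks when the bucket actually changes.
 */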
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;	/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Data for the pv entry allocation mechanism.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS);

#define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
extern pt_entry_t pagetable_dmap[];

#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
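
/*
 * Rough sketch of the allocator's behavior (illustrative; the details live
 * in pmap_alloc_asid() and pmap_reset_asid_set()): with 8-bit ASIDs,
 * asid_set_size is 256.  Allocation scans forward from asid_next for a
 * clear bit and sets it.  When the scan wraps without finding a free ASID,
 * the allocator increments asid_epoch, clears the bit vector except for
 * reserved and currently-active ASIDs, and invalidates the TLB, which
 * implicitly revokes every cookie minted under the old epoch.
 */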
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
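
/*
 * Worked example (illustrative): COOKIE_FROM(42, 7) yields 0x70000002a;
 * COOKIE_TO_ASID() recovers 42 from the low 32 bits and COOKIE_TO_EPOCH()
 * recovers 7 from the high 32 bits.  For COOKIE_FROM(-1, INT_MIN),
 * COOKIE_TO_EPOCH() returns INT_MIN, matching the reserved-ASID convention
 * described above.
 */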

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
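
/*
 * Sketch of the break-before-make pattern these primitives support
 * (illustrative; see pmap_update_entry() for the real sequence):
 *
 *	pmap_clear(pte);		(break: atomically zero the entry)
 *	... TLB invalidation for the VA ...
 *	pmap_store(pte, new);		(make: publish the new entry)
 *
 * A non-atomic read-modify-write could race with a hardware table walker
 * that updates access/dirty state in the same entry concurrently.
 */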
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
		return (NULL);

	return (pmap_l0_to_l1(l0, va));
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
	return (&l2p[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
	return (&l3p[pmap_l3_index(va)]);
}

/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
 */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l0, *l1, *l2, desc;

	l0 = pmap_l0(pmap, va);
	desc = pmap_load(l0) & ATTR_DESCR_MASK;
	if (desc != L0_TABLE) {
		*level = -1;
		return (NULL);
	}

	l1 = pmap_l0_to_l1(l0, va);
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc != L1_TABLE) {
		*level = 0;
		return (l0);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc != L2_TABLE) {
		*level = 1;
		return (l1);
	}

	*level = 2;
	return (l2);
}

/*
 * Returns the lowest valid pte block or table entry for a given virtual
 * address.  If there are no valid entries return NULL and set the level to
 * the first invalid level.
 */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l1, *l2, desc;
	pt_entry_t *l3;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL) {
		*level = 0;
		return (NULL);
	}
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc == L1_BLOCK) {
		*level = 1;
		return (l1);
	}

	if (desc != L1_TABLE) {
		*level = 1;
		return (NULL);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc == L2_BLOCK) {
		*level = 2;
		return (l2);
	}

	if (desc != L2_TABLE) {
		*level = 2;
		return (NULL);
	}

	*level = 3;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
		return (NULL);

	return (l3);
}
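
/*
 * Example of the level conventions above (illustrative): for a virtual
 * address mapped by a 2MB L2 block, pmap_pde() returns a pointer to the
 * L1 table entry and sets *level to 1 (the lowest valid directory entry),
 * while pmap_pte() returns a pointer to the L2 block entry itself and
 * sets *level to 2 (the lowest valid leaf entry).
 */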

/*
 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
 * level that maps the specified virtual address, then a pointer to that entry
 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
 * and a diagnostic message is provided, in which case this function panics.
 */
static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
{
	pd_entry_t *l0p, *l1p, *l2p;
	pt_entry_t desc, *l3p;
	int walk_level __diagused;

	KASSERT(level >= 0 && level < 4,
	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
	    level));
	l0p = pmap_l0(pmap, va);
	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
	if (desc == L0_TABLE && level > 0) {
		l1p = pmap_l0_to_l1(l0p, va);
		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
		if (desc == L1_BLOCK && level == 1)
			return (l1p);
		else if (desc == L1_TABLE && level > 1) {
			l2p = pmap_l1_to_l2(l1p, va);
			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
			if (desc == L2_BLOCK && level == 2)
				return (l2p);
			else if (desc == L2_TABLE && level > 2) {
				l3p = pmap_l2_to_l3(l2p, va);
				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
				if (desc == L3_PAGE && level == 3)
					return (l3p);
				else
					walk_level = 3;
			} else
				walk_level = 2;
		} else
			walk_level = 1;
	} else
		walk_level = 0;
	KASSERT(diag == NULL,
	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
	    diag, va, level, desc, walk_level));
	return (NULL);
}
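
/*
 * Usage note (illustrative): pmap_kremove() below passes diag = __func__,
 * demanding a valid L3 mapping and panicking under INVARIANTS when none
 * exists, whereas pmap_qremove() passes a NULL diag and simply skips
 * unmapped addresses.
 */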

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled != 0);
}

bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
    pd_entry_t **l2, pt_entry_t **l3)
{
	pd_entry_t *l0p, *l1p, *l2p;

	if (pmap->pm_l0 == NULL)
		return (false);

	l0p = pmap_l0(pmap, va);
	*l0 = l0p;

	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
		return (false);

	l1p = pmap_l0_to_l1(l0p, va);
	*l1 = l1p;

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
		return (false);

	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
		return (false);

	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}

static __inline int
pmap_l3_valid(pt_entry_t l3)
{

	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}

CTASSERT(L1_BLOCK == L2_BLOCK);

static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
{
	pt_entry_t val;

	if (pmap->pm_stage == PM_STAGE1) {
		val = ATTR_S1_IDX(memattr);
		if (memattr == VM_MEMATTR_DEVICE)
			val |= ATTR_S1_XN;
		return (val);
	}

	val = 0;

	switch (memattr) {
	case VM_MEMATTR_DEVICE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
		    ATTR_S2_XN(ATTR_S2_XN_ALL));
	case VM_MEMATTR_UNCACHEABLE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
	case VM_MEMATTR_WRITE_BACK:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
	case VM_MEMATTR_WRITE_THROUGH:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
	default:
		panic("%s: invalid memory attribute %x", __func__, memattr);
	}
}

static pt_entry_t
pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
{
	pt_entry_t val;

	val = 0;
	if (pmap->pm_stage == PM_STAGE1) {
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S1_XN;
		if ((prot & VM_PROT_WRITE) == 0)
			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
	} else {
		if ((prot & VM_PROT_WRITE) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
		if ((prot & VM_PROT_READ) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
	}

	return (val);
}

/*
 * Checks if the PTE is dirty.
 */
static inline int
pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
{

	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));

	if (pmap->pm_stage == PM_STAGE1) {
		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));

		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
	}

	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
}
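
/*
 * Stage 1 states distinguished by the check above (illustrative):
 *
 *	ATTR_SW_DBM set, ATTR_S1_AP_RO set	writeable, clean
 *	ATTR_SW_DBM set, ATTR_S1_AP_RO clear	writeable, dirty
 *	ATTR_SW_DBM clear, ATTR_S1_AP_RO set	read-only
 *
 * On hardware with DBM support the MMU clears the read-only bit on the
 * first write; on ARMv8.0 the same transition is performed in software
 * from the permission-fault handler.
 */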

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	vm_paddr_t pa_page;

	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
	return (pa_page | (va & PAR_LOW_MASK));
}

static vm_offset_t
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
    vm_offset_t freemempos)
{
	pt_entry_t *l2;
	vm_offset_t va;
	vm_paddr_t l2_pa, pa;
	u_int l1_slot, l2_slot, prev_l1_slot;
	int i;

	dmap_phys_base = min_pa & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;
	l2 = NULL;
	prev_l1_slot = -1;

	memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);

	for (i = 0; i < (physmap_idx * 2); i += 2) {
		pa = physmap[i] & ~L2_OFFSET;
		va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L2 mappings at the start of the region */
		if ((pa & L1_OFFSET) != 0) {
			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
			if (l1_slot != prev_l1_slot) {
				prev_l1_slot = l1_slot;
				l2 = (pt_entry_t *)freemempos;
				l2_pa = pmap_early_vtophys(kern_l1,
				    (vm_offset_t)l2);
				freemempos += PAGE_SIZE;

				pmap_store(&pagetable_dmap[l1_slot],
				    (l2_pa & ~Ln_TABLE_MASK) |
				    TATTR_PXN_TABLE | L1_TABLE);

				memset(l2, 0, PAGE_SIZE);
			}
			KASSERT(l2 != NULL,
			    ("pmap_bootstrap_dmap: NULL l2 map"));
			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
			    pa += L2_SIZE, va += L2_SIZE) {
				/*
				 * We are on a boundary, stop to
				 * create a level 1 block
				 */
				if ((pa & L1_OFFSET) == 0)
					break;

				l2_slot = pmap_l2_index(va);
				KASSERT(l2_slot != 0, ("..."));
				pmap_store(&l2[l2_slot],
				    (pa & ~L2_OFFSET) | ATTR_DEFAULT |
				    ATTR_S1_XN |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    L2_BLOCK);
			}
			KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
			    ("..."));
		}

		for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
		    (physmap[i + 1] - pa) >= L1_SIZE;
		    pa += L1_SIZE, va += L1_SIZE) {
			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
			pmap_store(&pagetable_dmap[l1_slot],
			    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK);
		}

		/* Create L2 mappings at the end of the region */
		if (pa < physmap[i + 1]) {
			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
			if (l1_slot != prev_l1_slot) {
				prev_l1_slot = l1_slot;
				l2 = (pt_entry_t *)freemempos;
				l2_pa = pmap_early_vtophys(kern_l1,
				    (vm_offset_t)l2);
				freemempos += PAGE_SIZE;

				pmap_store(&pagetable_dmap[l1_slot],
				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);

				memset(l2, 0, PAGE_SIZE);
			}
			KASSERT(l2 != NULL,
			    ("pmap_bootstrap_dmap: NULL l2 map"));
			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
			    pa += L2_SIZE, va += L2_SIZE) {
				l2_slot = pmap_l2_index(va);
				pmap_store(&l2[l2_slot],
				    (pa & ~L2_OFFSET) | ATTR_DEFAULT |
				    ATTR_S1_XN |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    L2_BLOCK);
			}
		}

		if (pa > dmap_phys_max) {
			dmap_phys_max = pa;
			dmap_max_addr = va;
		}
	}

	cpu_tlb_flushID();

	return (freemempos);
}

static vm_offset_t
pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
{
	vm_offset_t l2pt;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;

	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	l1 = (pd_entry_t *)l1pt;
	l1_slot = pmap_l1_index(va);
	l2pt = l2_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		pa = pmap_early_vtophys(l1pt, l2pt);
		pmap_store(&l1[l1_slot],
		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
		l2pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l2_start, 0, l2pt - l2_start);

	return l2pt;
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	vm_paddr_t pa;
	pd_entry_t *l2;
	u_int l2_slot;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pmap_store(&l2[l2_slot],
		    (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L3 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return l3pt;
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
    vm_size_t kernlen)
{
	vm_offset_t freemempos;
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa, min_pa;
	uint64_t kern_delta;
	int i;

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	kern_delta = KERNBASE - kernstart;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
	printf("%lx\n", l1pt);
	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_l0_paddr = l0pt - kern_delta;
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	/* Assume the address we were loaded to is a valid physical address */
	min_pa = KERNBASE - kern_delta;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
	}

	freemempos = KERNBASE + kernlen;
	freemempos = roundup2(freemempos, PAGE_SIZE);

	/* Create a direct map region early so we can use it for pa -> va */
	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);

	start_pa = pa = KERNBASE - kern_delta;

	/*
	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
	 * loader allocated the first and only l2 page table page used to map
	 * the kernel, preloaded files and module metadata.
	 */
	freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos);
	/* And the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);

	cpu_tlb_flushID();

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(l1pt, freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);

	cpu_tlb_flushID();
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

static void
pmap_init_asids(struct asid_set *set, int bits)
{
	int i;

	set->asid_bits = bits;

	/*
	 * We may be too early in the overall initialization process to use
	 * bit_alloc().
	 */
	set->asid_set_size = 1 << set->asid_bits;
	set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size),
	    M_WAITOK | M_ZERO);
	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
		bit_set(set->asid_set, i);
	set->asid_next = ASID_FIRST_AVAILABLE;
	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct md_page *pvh;
	vm_size_t s;
	uint64_t mmfr1;
	int i, pv_npg, vmid_bits;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
	if (superpages_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = L2_SIZE;
		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
		    ("pmap_init: can't assign to pagesizes[2]"));
		pagesizes[2] = L1_SIZE;
	}

	/*
	 * Initialize the ASID allocator.
	 */
	pmap_init_asids(&asids,
	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);

	if (has_hyp()) {
		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
		vmid_bits = 8;

		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
		    ID_AA64MMFR1_VMIDBits_16)
			vmid_bits = 16;
		pmap_init_asids(&vmids, vmid_bits);
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
	}

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);
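
	/*
	 * Worked example of the sharing handled below (illustrative): if one
	 * segment ends at 0x40100000 and the next begins at 0x40180000, both
	 * addresses fall within the same 2MB superpage frame
	 * [0x40000000, 0x40200000), so the final pv_table entry of the first
	 * segment must double as the first entry of the next; md_first for
	 * the second segment is therefore backed up by one entry.
	 */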
1193 */ 1194 if (i + 1 < vm_phys_nsegs) { 1195 next_seg = &vm_phys_segs[i + 1]; 1196 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 == 1197 pmap_l2_pindex(next_seg->start)) { 1198 pvh--; 1199 } 1200 } 1201 } 1202 1203 vm_initialized = 1; 1204 } 1205 1206 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1207 "2MB page mapping counters"); 1208 1209 static u_long pmap_l2_demotions; 1210 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 1211 &pmap_l2_demotions, 0, "2MB page demotions"); 1212 1213 static u_long pmap_l2_mappings; 1214 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 1215 &pmap_l2_mappings, 0, "2MB page mappings"); 1216 1217 static u_long pmap_l2_p_failures; 1218 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 1219 &pmap_l2_p_failures, 0, "2MB page promotion failures"); 1220 1221 static u_long pmap_l2_promotions; 1222 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 1223 &pmap_l2_promotions, 0, "2MB page promotions"); 1224 1225 /* 1226 * If the given value for "final_only" is false, then any cached intermediate- 1227 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to 1228 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry. 1229 * Otherwise, just the cached final-level entry is invalidated. 1230 */ 1231 static __inline void 1232 pmap_invalidate_kernel(uint64_t r, bool final_only) 1233 { 1234 if (final_only) 1235 __asm __volatile("tlbi vaale1is, %0" : : "r" (r)); 1236 else 1237 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1238 } 1239 1240 static __inline void 1241 pmap_invalidate_user(uint64_t r, bool final_only) 1242 { 1243 if (final_only) 1244 __asm __volatile("tlbi vale1is, %0" : : "r" (r)); 1245 else 1246 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1247 } 1248 1249 /* 1250 * Invalidates any cached final- and optionally intermediate-level TLB entries 1251 * for the specified virtual address in the given virtual address space. 1252 */ 1253 static __inline void 1254 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1255 { 1256 uint64_t r; 1257 1258 PMAP_ASSERT_STAGE1(pmap); 1259 1260 dsb(ishst); 1261 if (pmap == kernel_pmap) { 1262 r = atop(va); 1263 pmap_invalidate_kernel(r, final_only); 1264 } else { 1265 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | atop(va); 1266 pmap_invalidate_user(r, final_only); 1267 } 1268 dsb(ish); 1269 isb(); 1270 } 1271 1272 /* 1273 * Invalidates any cached final- and optionally intermediate-level TLB entries 1274 * for the specified virtual address range in the given virtual address space. 1275 */ 1276 static __inline void 1277 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1278 bool final_only) 1279 { 1280 uint64_t end, r, start; 1281 1282 PMAP_ASSERT_STAGE1(pmap); 1283 1284 dsb(ishst); 1285 if (pmap == kernel_pmap) { 1286 start = atop(sva); 1287 end = atop(eva); 1288 for (r = start; r < end; r++) 1289 pmap_invalidate_kernel(r, final_only); 1290 } else { 1291 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1292 start |= atop(sva); 1293 end |= atop(eva); 1294 for (r = start; r < end; r++) 1295 pmap_invalidate_user(r, final_only); 1296 } 1297 dsb(ish); 1298 isb(); 1299 } 1300 1301 /* 1302 * Invalidates all cached intermediate- and final-level TLB entries for the 1303 * given virtual address space. 

/*
 * Invalidates all cached intermediate- and final-level TLB entries for the
 * given virtual address space.
 */
static __inline void
pmap_invalidate_all(pmap_t pmap)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		__asm __volatile("tlbi vmalle1is");
	} else {
		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
	}
	dsb(ish);
	isb();
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte, tpte;
	vm_paddr_t pa;
	int lvl;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Find the block or page map for this virtual address. pmap_pte
	 * will return either a valid block/page entry, or NULL.
	 */
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);
		pa = tpte & ~ATTR_MASK;
		switch(lvl) {
		case 1:
			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
			    ("pmap_extract: Invalid L1 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L1_OFFSET);
			break;
		case 2:
			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
			    ("pmap_extract: Invalid L2 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L2_OFFSET);
			break;
		case 3:
			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
			    ("pmap_extract: Invalid L3 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L3_OFFSET);
			break;
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *pte, tpte;
	vm_offset_t off;
	vm_page_t m;
	int lvl;
	bool use;

	m = NULL;
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);

		KASSERT(lvl > 0 && lvl <= 3,
		    ("pmap_extract_and_hold: Invalid level %d", lvl));
		CTASSERT(L1_BLOCK == L2_BLOCK);
		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
		    tpte & ATTR_DESCR_MASK));

		use = false;
		if ((prot & VM_PROT_WRITE) == 0)
			use = true;
		else if (pmap->pm_stage == PM_STAGE1 &&
		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
			use = true;
		else if (pmap->pm_stage == PM_STAGE2 &&
		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
		    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
			use = true;

		if (use) {
			switch (lvl) {
			case 1:
				off = va & L1_OFFSET;
				break;
			case 2:
				off = va & L2_OFFSET;
				break;
			case 3:
			default:
				off = 0;
			}
			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
			if (m != NULL && !vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

/*
 * Walks the page tables to translate a kernel virtual address to a
 * physical address. Returns true if the kva is valid and stores the
 * physical address in pa if it is not NULL.
 */
bool
pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
{
	pt_entry_t *pte, tpte;
	register_t intr;
	uint64_t par;

	/*
	 * Disable interrupts so we don't get interrupted between asking
	 * for address translation, and getting the result back.
	 */
	intr = intr_disable();
	par = arm64_address_translate_s1e1r(va);
	intr_restore(intr);

	if (PAR_SUCCESS(par)) {
		if (pa != NULL)
			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
		return (true);
	}

	/*
	 * Fall back to walking the page table. The address translation
	 * instruction may fail when the page is in a break-before-make
	 * sequence. As we only clear the valid bit in said sequence we
	 * can walk the page table to find the physical address.
	 */

	pte = pmap_l1(kernel_pmap, va);
	if (pte == NULL)
		return (false);

	/*
	 * A concurrent pmap_update_entry() will clear the entry's valid bit
	 * but leave the rest of the entry unchanged.  Therefore, we treat a
	 * non-zero entry as being valid, and we ignore the valid bit when
	 * determining whether the entry maps a block, page, or table.
	 */
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
		return (true);
	}
	pte = pmap_l1_to_l2(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
		return (true);
	}
	pte = pmap_l2_to_l3(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if (pa != NULL)
		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
	return (true);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return (DMAP_TO_PHYS(va));

	if (pmap_klookup(va, &pa) == false)
		return (0);
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/
1517 ***************************************************/ 1518 1519 void 1520 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 1521 { 1522 pd_entry_t *pde; 1523 pt_entry_t *pte, attr; 1524 vm_offset_t va; 1525 int lvl; 1526 1527 KASSERT((pa & L3_OFFSET) == 0, 1528 ("pmap_kenter: Invalid physical address")); 1529 KASSERT((sva & L3_OFFSET) == 0, 1530 ("pmap_kenter: Invalid virtual address")); 1531 KASSERT((size & PAGE_MASK) == 0, 1532 ("pmap_kenter: Mapping is not page-sized")); 1533 1534 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 1535 ATTR_S1_IDX(mode) | L3_PAGE; 1536 va = sva; 1537 while (size != 0) { 1538 pde = pmap_pde(kernel_pmap, va, &lvl); 1539 KASSERT(pde != NULL, 1540 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 1541 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 1542 1543 pte = pmap_l2_to_l3(pde, va); 1544 pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); 1545 1546 va += PAGE_SIZE; 1547 pa += PAGE_SIZE; 1548 size -= PAGE_SIZE; 1549 } 1550 pmap_invalidate_range(kernel_pmap, sva, va, true); 1551 } 1552 1553 void 1554 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1555 { 1556 1557 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1558 } 1559 1560 /* 1561 * Remove a page from the kernel pagetables. 1562 */ 1563 PMAP_INLINE void 1564 pmap_kremove(vm_offset_t va) 1565 { 1566 pt_entry_t *pte; 1567 1568 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 1569 pmap_clear(pte); 1570 pmap_invalidate_page(kernel_pmap, va, true); 1571 } 1572 1573 void 1574 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1575 { 1576 pt_entry_t *pte; 1577 vm_offset_t va; 1578 1579 KASSERT((sva & L3_OFFSET) == 0, 1580 ("pmap_kremove_device: Invalid virtual address")); 1581 KASSERT((size & PAGE_MASK) == 0, 1582 ("pmap_kremove_device: Mapping is not page-sized")); 1583 1584 va = sva; 1585 while (size != 0) { 1586 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 1587 pmap_clear(pte); 1588 1589 va += PAGE_SIZE; 1590 size -= PAGE_SIZE; 1591 } 1592 pmap_invalidate_range(kernel_pmap, sva, va, true); 1593 } 1594 1595 /* 1596 * Used to map a range of physical addresses into kernel 1597 * virtual address space. 1598 * 1599 * The value passed in '*virt' is a suggested virtual address for 1600 * the mapping. Architectures which can support a direct-mapped 1601 * physical to virtual region can return the appropriate address 1602 * within that region, leaving '*virt' unchanged. Other 1603 * architectures should map the pages starting at '*virt' and 1604 * update '*virt' with the first usable address after the mapped 1605 * region. 1606 */ 1607 vm_offset_t 1608 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1609 { 1610 return PHYS_TO_DMAP(start); 1611 } 1612 1613 /* 1614 * Add a list of wired pages to the kva 1615 * this routine is only used for temporary 1616 * kernel mappings that do not need to have 1617 * page modification or references recorded. 1618 * Note that old mappings are simply written 1619 * over. The page *must* be wired. 1620 * Note: SMP coherent. Uses a ranged shootdown IPI. 
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pd_entry_t *pde;
	pt_entry_t *pte, pa;
	vm_offset_t va;
	vm_page_t m;
	int i, lvl;

	va = sva;
	for (i = 0; i < count; i++) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2,
		    ("pmap_qenter: Invalid level %d", lvl));

		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
		pte = pmap_l2_to_l3(pde, va);
		pmap_load_store(pte, pa);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va, true);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *pte;
	vm_offset_t va;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));

	va = sva;
	while (count-- > 0) {
		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
		if (pte != NULL) {
			pmap_clear(pte);
		}

		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va, true);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->ref_count;
	if (m->ref_count == 0) {
		_pmap_unwire_l3(pmap, va, m, free);
		return (TRUE);
	} else
		return (FALSE);
}

static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
	} else {
		/* l3 page */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	pmap_invalidate_page(pmap, va, false);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the reference count.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	if (ADDR_IS_KERNEL(va))
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
	return (pmap_unwire_l3(pmap, va, mpte, free));
}

/*
 * Release a page table page reference after a failed attempt to create a
 * mapping.
 */
1793 */ 1794 static void 1795 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1796 { 1797 struct spglist free; 1798 1799 SLIST_INIT(&free); 1800 if (pmap_unwire_l3(pmap, va, mpte, &free)) 1801 vm_page_free_pages_toq(&free, true); 1802 } 1803 1804 void 1805 pmap_pinit0(pmap_t pmap) 1806 { 1807 1808 PMAP_LOCK_INIT(pmap); 1809 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1810 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 1811 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 1812 vm_radix_init(&pmap->pm_root); 1813 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 1814 pmap->pm_stage = PM_STAGE1; 1815 pmap->pm_levels = 4; 1816 pmap->pm_ttbr = pmap->pm_l0_paddr; 1817 pmap->pm_asid_set = &asids; 1818 1819 PCPU_SET(curpmap, pmap); 1820 } 1821 1822 int 1823 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 1824 { 1825 vm_page_t m; 1826 1827 /* 1828 * allocate the l0 page 1829 */ 1830 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | 1831 VM_ALLOC_ZERO); 1832 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 1833 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 1834 1835 vm_radix_init(&pmap->pm_root); 1836 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1837 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 1838 1839 MPASS(levels == 3 || levels == 4); 1840 pmap->pm_levels = levels; 1841 pmap->pm_stage = stage; 1842 switch (stage) { 1843 case PM_STAGE1: 1844 pmap->pm_asid_set = &asids; 1845 break; 1846 case PM_STAGE2: 1847 pmap->pm_asid_set = &vmids; 1848 break; 1849 default: 1850 panic("%s: Invalid pmap type %d", __func__, stage); 1851 break; 1852 } 1853 1854 /* XXX Temporarily disable deferred ASID allocation. */ 1855 pmap_alloc_asid(pmap); 1856 1857 /* 1858 * Allocate the level 1 entry to use as the root. This will increase 1859 * the refcount on the level 1 page so it won't be removed until 1860 * pmap_release() is called. 1861 */ 1862 if (pmap->pm_levels == 3) { 1863 PMAP_LOCK(pmap); 1864 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 1865 PMAP_UNLOCK(pmap); 1866 } 1867 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 1868 1869 return (1); 1870 } 1871 1872 int 1873 pmap_pinit(pmap_t pmap) 1874 { 1875 1876 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 1877 } 1878 1879 /* 1880 * This routine is called if the desired page table page does not exist. 1881 * 1882 * If page table page allocation fails, this routine may sleep before 1883 * returning NULL. It sleeps only if a lock pointer was given. 1884 * 1885 * Note: If a page allocation fails at page table level two or three, 1886 * one or two pages may be held during the wait, only to be released 1887 * afterwards. This conservative approach is easily argued to avoid 1888 * race conditions. 1889 */ 1890 static vm_page_t 1891 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1892 { 1893 vm_page_t m, l1pg, l2pg; 1894 1895 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1896 1897 /* 1898 * Allocate a page table page. 1899 */ 1900 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1901 if (lockp != NULL) { 1902 RELEASE_PV_LIST_LOCK(lockp); 1903 PMAP_UNLOCK(pmap); 1904 vm_wait(NULL); 1905 PMAP_LOCK(pmap); 1906 } 1907 1908 /* 1909 * Indicate the need to retry. While waiting, the page table 1910 * page may have been allocated. 
1911 */ 1912 return (NULL); 1913 } 1914 m->pindex = ptepindex; 1915 1916 /* 1917 * Because of AArch64's weak memory consistency model, we must have a 1918 * barrier here to ensure that the stores for zeroing "m", whether by 1919 * pmap_zero_page() or an earlier function, are visible before adding 1920 * "m" to the page table. Otherwise, a page table walk by another 1921 * processor's MMU could see the mapping to "m" and a stale, non-zero 1922 * PTE within "m". 1923 */ 1924 dmb(ishst); 1925 1926 /* 1927 * Map the pagetable page into the process address space, if 1928 * it isn't already there. 1929 */ 1930 1931 if (ptepindex >= (NUL2E + NUL1E)) { 1932 pd_entry_t *l0p, l0e; 1933 vm_pindex_t l0index; 1934 1935 l0index = ptepindex - (NUL2E + NUL1E); 1936 l0p = &pmap->pm_l0[l0index]; 1937 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 1938 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 1939 l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; 1940 1941 /* 1942 * Mark all kernel memory as not accessible from userspace 1943 * and userspace memory as not executable from the kernel. 1944 * This has been done for the bootstrap L0 entries in 1945 * locore.S. 1946 */ 1947 if (pmap == kernel_pmap) 1948 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 1949 else 1950 l0e |= TATTR_PXN_TABLE; 1951 pmap_store(l0p, l0e); 1952 } else if (ptepindex >= NUL2E) { 1953 vm_pindex_t l0index, l1index; 1954 pd_entry_t *l0, *l1; 1955 pd_entry_t tl0; 1956 1957 l1index = ptepindex - NUL2E; 1958 l0index = l1index >> L0_ENTRIES_SHIFT; 1959 1960 l0 = &pmap->pm_l0[l0index]; 1961 tl0 = pmap_load(l0); 1962 if (tl0 == 0) { 1963 /* recurse for allocating page dir */ 1964 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 1965 lockp) == NULL) { 1966 vm_page_unwire_noq(m); 1967 vm_page_free_zero(m); 1968 return (NULL); 1969 } 1970 } else { 1971 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 1972 l1pg->ref_count++; 1973 } 1974 1975 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); 1976 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1977 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 1978 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1979 pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); 1980 } else { 1981 vm_pindex_t l0index, l1index; 1982 pd_entry_t *l0, *l1, *l2; 1983 pd_entry_t tl0, tl1; 1984 1985 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 1986 l0index = l1index >> L0_ENTRIES_SHIFT; 1987 1988 l0 = &pmap->pm_l0[l0index]; 1989 tl0 = pmap_load(l0); 1990 if (tl0 == 0) { 1991 /* recurse for allocating page dir */ 1992 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1993 lockp) == NULL) { 1994 vm_page_unwire_noq(m); 1995 vm_page_free_zero(m); 1996 return (NULL); 1997 } 1998 tl0 = pmap_load(l0); 1999 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 2000 l1 = &l1[l1index & Ln_ADDR_MASK]; 2001 } else { 2002 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 2003 l1 = &l1[l1index & Ln_ADDR_MASK]; 2004 tl1 = pmap_load(l1); 2005 if (tl1 == 0) { 2006 /* recurse for allocating page dir */ 2007 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2008 lockp) == NULL) { 2009 vm_page_unwire_noq(m); 2010 vm_page_free_zero(m); 2011 return (NULL); 2012 } 2013 } else { 2014 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 2015 l2pg->ref_count++; 2016 } 2017 } 2018 2019 l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); 2020 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 2021 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 2022 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 2023 pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); 2024 } 2025 2026 
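	/* Account for the new page table page in the resident count. */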
pmap_resident_count_inc(pmap, 1); 2027 2028 return (m); 2029 } 2030 2031 static pd_entry_t * 2032 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 2033 struct rwlock **lockp) 2034 { 2035 pd_entry_t *l1, *l2; 2036 vm_page_t l2pg; 2037 vm_pindex_t l2pindex; 2038 2039 KASSERT(ADDR_IS_CANONICAL(va), 2040 ("%s: Address not in canonical form: %lx", __func__, va)); 2041 2042 retry: 2043 l1 = pmap_l1(pmap, va); 2044 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 2045 l2 = pmap_l1_to_l2(l1, va); 2046 if (!ADDR_IS_KERNEL(va)) { 2047 /* Add a reference to the L2 page. */ 2048 l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); 2049 l2pg->ref_count++; 2050 } else 2051 l2pg = NULL; 2052 } else if (!ADDR_IS_KERNEL(va)) { 2053 /* Allocate a L2 page. */ 2054 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 2055 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 2056 if (l2pg == NULL) { 2057 if (lockp != NULL) 2058 goto retry; 2059 else 2060 return (NULL); 2061 } 2062 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2063 l2 = &l2[pmap_l2_index(va)]; 2064 } else 2065 panic("pmap_alloc_l2: missing page table page for va %#lx", 2066 va); 2067 *l2pgp = l2pg; 2068 return (l2); 2069 } 2070 2071 static vm_page_t 2072 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2073 { 2074 vm_pindex_t ptepindex; 2075 pd_entry_t *pde, tpde; 2076 #ifdef INVARIANTS 2077 pt_entry_t *pte; 2078 #endif 2079 vm_page_t m; 2080 int lvl; 2081 2082 /* 2083 * Calculate pagetable page index 2084 */ 2085 ptepindex = pmap_l2_pindex(va); 2086 retry: 2087 /* 2088 * Get the page directory entry 2089 */ 2090 pde = pmap_pde(pmap, va, &lvl); 2091 2092 /* 2093 * If the page table page is mapped, we just increment the hold count, 2094 * and activate it. If we get a level 2 pde it will point to a level 3 2095 * table. 2096 */ 2097 switch (lvl) { 2098 case -1: 2099 break; 2100 case 0: 2101 #ifdef INVARIANTS 2102 pte = pmap_l0_to_l1(pde, va); 2103 KASSERT(pmap_load(pte) == 0, 2104 ("pmap_alloc_l3: TODO: l0 superpages")); 2105 #endif 2106 break; 2107 case 1: 2108 #ifdef INVARIANTS 2109 pte = pmap_l1_to_l2(pde, va); 2110 KASSERT(pmap_load(pte) == 0, 2111 ("pmap_alloc_l3: TODO: l1 superpages")); 2112 #endif 2113 break; 2114 case 2: 2115 tpde = pmap_load(pde); 2116 if (tpde != 0) { 2117 m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); 2118 m->ref_count++; 2119 return (m); 2120 } 2121 break; 2122 default: 2123 panic("pmap_alloc_l3: Invalid level %d", lvl); 2124 } 2125 2126 /* 2127 * Here if the pte page isn't mapped, or if it has been deallocated. 2128 */ 2129 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2130 if (m == NULL && lockp != NULL) 2131 goto retry; 2132 2133 return (m); 2134 } 2135 2136 /*************************************************** 2137 * Pmap allocation/deallocation routines. 2138 ***************************************************/ 2139 2140 /* 2141 * Release any resources held by the given physical map. 2142 * Called when a pmap initialized by pmap_pinit is being released. 2143 * Should only be called if the map contains no valid mappings. 
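 * For a 3-level stage 2 pmap, this also frees the preallocated L1 root
 * page before the L0 page itself is released.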
 */
void
pmap_release(pmap_t pmap)
{
	boolean_t rv __diagused;
	struct spglist free;
	struct asid_set *set;
	vm_page_t m;
	int asid;

	if (pmap->pm_levels != 4) {
		PMAP_ASSERT_STAGE2(pmap);
		KASSERT(pmap->pm_stats.resident_count == 1,
		    ("pmap_release: pmap resident count %ld != 1",
		    pmap->pm_stats.resident_count));
		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));

		SLIST_INIT(&free);
		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
		PMAP_LOCK(pmap);
		rv = pmap_unwire_l3(pmap, 0, m, &free);
		PMAP_UNLOCK(pmap);
		MPASS(rv == TRUE);
		vm_page_free_pages_toq(&free, true);
	}

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap has reserved page table page(s)"));

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Allow the ASID to be reused.  In stage 2 we don't invalidate the
	 * VMID entries when removing them, but instead rely on a later TLB
	 * invalidation that happens when the VMID generation is updated.
	 * Because of this we don't reuse VMIDs within a generation.
	 */
	if (pmap->pm_stage == PM_STAGE1) {
		mtx_lock_spin(&set->asid_set_mutex);
		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
			asid = COOKIE_TO_ASID(pmap->pm_cookie);
			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
			    asid < set->asid_set_size,
			    ("pmap_release: pmap cookie has out-of-range asid"));
			bit_clear(set->asid_set, asid);
		}
		mtx_unlock_spin(&set->asid_set_mutex);
	}

	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
	vm_page_unwire_noq(m);
	vm_page_free_zero(m);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return (sysctl_handle_long(oidp, &ksize, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return (sysctl_handle_long(oidp, &kfree, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

/*
 * Grow the number of kernel page table entries, if needed.
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l0, *l1, *l2;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
		KASSERT(pmap_load(l0) != 0,
		    ("pmap_growkernel: No level 0 kernel entry"));

		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new L1 table. */
			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
			/*
			 * See the dmb() in _pmap_alloc_l3().
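			 * The barrier orders the zeroing of the new page
			 * table page before its installation, so the
			 * hardware walker never sees stale contents.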
			 */
			dmb(ishst);
			paddr = VM_PAGE_TO_PHYS(nkpg);
			pmap_store(l1, paddr | L1_TABLE);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if (pmap_load(l2) != 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
		/* See the dmb() in _pmap_alloc_l3(). */
		dmb(ishst);
		paddr = VM_PAGE_TO_PHYS(nkpg);
		pmap_store(l2, paddr | L2_TABLE);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}

/***************************************************
 * Page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define	PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Total number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Total number of pv entry chunks freed");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
    "Number of failed attempts to allocate a pv entry chunk page");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Total number of pv entries freed");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
    "Total number of pv entries allocated");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
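 *
 * The markers inserted into the pc_lru list below bound this call's
 * scan, allowing the traversal to resume safely even though
 * pv_chunks_mutex is dropped while individual pmaps are locked.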
2345 */ 2346 static vm_page_t 2347 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2348 { 2349 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2350 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2351 struct md_page *pvh; 2352 pd_entry_t *pde; 2353 pmap_t next_pmap, pmap; 2354 pt_entry_t *pte, tpte; 2355 pv_entry_t pv; 2356 vm_offset_t va; 2357 vm_page_t m, m_pc; 2358 struct spglist free; 2359 uint64_t inuse; 2360 int bit, field, freed, lvl; 2361 static int active_reclaims = 0; 2362 2363 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2364 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2365 2366 pmap = NULL; 2367 m_pc = NULL; 2368 SLIST_INIT(&free); 2369 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2370 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2371 pc_marker = (struct pv_chunk *)&pc_marker_b; 2372 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2373 2374 mtx_lock(&pv_chunks_mutex); 2375 active_reclaims++; 2376 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 2377 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 2378 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2379 SLIST_EMPTY(&free)) { 2380 next_pmap = pc->pc_pmap; 2381 if (next_pmap == NULL) { 2382 /* 2383 * The next chunk is a marker. However, it is 2384 * not our marker, so active_reclaims must be 2385 * > 1. Consequently, the next_chunk code 2386 * will not rotate the pv_chunks list. 2387 */ 2388 goto next_chunk; 2389 } 2390 mtx_unlock(&pv_chunks_mutex); 2391 2392 /* 2393 * A pv_chunk can only be removed from the pc_lru list 2394 * when both pv_chunks_mutex is owned and the 2395 * corresponding pmap is locked. 2396 */ 2397 if (pmap != next_pmap) { 2398 if (pmap != NULL && pmap != locked_pmap) 2399 PMAP_UNLOCK(pmap); 2400 pmap = next_pmap; 2401 /* Avoid deadlock and lock recursion. */ 2402 if (pmap > locked_pmap) { 2403 RELEASE_PV_LIST_LOCK(lockp); 2404 PMAP_LOCK(pmap); 2405 mtx_lock(&pv_chunks_mutex); 2406 continue; 2407 } else if (pmap != locked_pmap) { 2408 if (PMAP_TRYLOCK(pmap)) { 2409 mtx_lock(&pv_chunks_mutex); 2410 continue; 2411 } else { 2412 pmap = NULL; /* pmap is not locked */ 2413 mtx_lock(&pv_chunks_mutex); 2414 pc = TAILQ_NEXT(pc_marker, pc_lru); 2415 if (pc == NULL || 2416 pc->pc_pmap != next_pmap) 2417 continue; 2418 goto next_chunk; 2419 } 2420 } 2421 } 2422 2423 /* 2424 * Destroy every non-wired, 4 KB page mapping in the chunk. 
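		 * Each clear bit in pc_map identifies an in-use pv entry.
		 * Wired mappings, and entries that do not resolve to a 4 KB
		 * page mapping, are skipped so that they survive the
		 * reclamation.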
2425 */ 2426 freed = 0; 2427 for (field = 0; field < _NPCM; field++) { 2428 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2429 inuse != 0; inuse &= ~(1UL << bit)) { 2430 bit = ffsl(inuse) - 1; 2431 pv = &pc->pc_pventry[field * 64 + bit]; 2432 va = pv->pv_va; 2433 pde = pmap_pde(pmap, va, &lvl); 2434 if (lvl != 2) 2435 continue; 2436 pte = pmap_l2_to_l3(pde, va); 2437 tpte = pmap_load(pte); 2438 if ((tpte & ATTR_SW_WIRED) != 0) 2439 continue; 2440 tpte = pmap_load_clear(pte); 2441 m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); 2442 if (pmap_pte_dirty(pmap, tpte)) 2443 vm_page_dirty(m); 2444 if ((tpte & ATTR_AF) != 0) { 2445 pmap_invalidate_page(pmap, va, true); 2446 vm_page_aflag_set(m, PGA_REFERENCED); 2447 } 2448 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2449 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2450 m->md.pv_gen++; 2451 if (TAILQ_EMPTY(&m->md.pv_list) && 2452 (m->flags & PG_FICTITIOUS) == 0) { 2453 pvh = page_to_pvh(m); 2454 if (TAILQ_EMPTY(&pvh->pv_list)) { 2455 vm_page_aflag_clear(m, 2456 PGA_WRITEABLE); 2457 } 2458 } 2459 pc->pc_map[field] |= 1UL << bit; 2460 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 2461 freed++; 2462 } 2463 } 2464 if (freed == 0) { 2465 mtx_lock(&pv_chunks_mutex); 2466 goto next_chunk; 2467 } 2468 /* Every freed mapping is for a 4 KB page. */ 2469 pmap_resident_count_dec(pmap, freed); 2470 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2471 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2472 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2473 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2474 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2475 pc->pc_map[2] == PC_FREE2) { 2476 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2477 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2478 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2479 /* Entire chunk is free; return it. */ 2480 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2481 dump_drop_page(m_pc->phys_addr); 2482 mtx_lock(&pv_chunks_mutex); 2483 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2484 break; 2485 } 2486 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2487 mtx_lock(&pv_chunks_mutex); 2488 /* One freed pv entry in locked_pmap is sufficient. */ 2489 if (pmap == locked_pmap) 2490 break; 2491 2492 next_chunk: 2493 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2494 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 2495 if (active_reclaims == 1 && pmap != NULL) { 2496 /* 2497 * Rotate the pv chunks list so that we do not 2498 * scan the same pv chunks that could not be 2499 * freed (because they contained a wired 2500 * and/or superpage mapping) on every 2501 * invocation of reclaim_pv_chunk(). 2502 */ 2503 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 2504 MPASS(pc->pc_pmap != NULL); 2505 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2506 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2507 } 2508 } 2509 } 2510 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2511 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 2512 active_reclaims--; 2513 mtx_unlock(&pv_chunks_mutex); 2514 if (pmap != NULL && pmap != locked_pmap) 2515 PMAP_UNLOCK(pmap); 2516 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2517 m_pc = SLIST_FIRST(&free); 2518 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2519 /* Recycle a freed page table page. 
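		 * The page is given a reference count of one so that the
		 * caller can immediately reuse it as a pv chunk page.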
*/ 2520 m_pc->ref_count = 1; 2521 } 2522 vm_page_free_pages_toq(&free, true); 2523 return (m_pc); 2524 } 2525 2526 /* 2527 * free the pv_entry back to the free list 2528 */ 2529 static void 2530 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2531 { 2532 struct pv_chunk *pc; 2533 int idx, field, bit; 2534 2535 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2536 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2537 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2538 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2539 pc = pv_to_chunk(pv); 2540 idx = pv - &pc->pc_pventry[0]; 2541 field = idx / 64; 2542 bit = idx % 64; 2543 pc->pc_map[field] |= 1ul << bit; 2544 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2545 pc->pc_map[2] != PC_FREE2) { 2546 /* 98% of the time, pc is already at the head of the list. */ 2547 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2548 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2549 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2550 } 2551 return; 2552 } 2553 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2554 free_pv_chunk(pc); 2555 } 2556 2557 static void 2558 free_pv_chunk(struct pv_chunk *pc) 2559 { 2560 vm_page_t m; 2561 2562 mtx_lock(&pv_chunks_mutex); 2563 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2564 mtx_unlock(&pv_chunks_mutex); 2565 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2566 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2567 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2568 /* entire chunk is free, return it */ 2569 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2570 dump_drop_page(m->phys_addr); 2571 vm_page_unwire_noq(m); 2572 vm_page_free(m); 2573 } 2574 2575 /* 2576 * Returns a new PV entry, allocating a new PV chunk from the system when 2577 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2578 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2579 * returned. 2580 * 2581 * The given PV list lock may be released. 
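 *
 * Reclamation, when permitted, may recycle a page from another pmap's
 * pv chunk to back the new allocation.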
2582 */ 2583 static pv_entry_t 2584 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2585 { 2586 int bit, field; 2587 pv_entry_t pv; 2588 struct pv_chunk *pc; 2589 vm_page_t m; 2590 2591 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2592 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2593 retry: 2594 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2595 if (pc != NULL) { 2596 for (field = 0; field < _NPCM; field++) { 2597 if (pc->pc_map[field]) { 2598 bit = ffsl(pc->pc_map[field]) - 1; 2599 break; 2600 } 2601 } 2602 if (field < _NPCM) { 2603 pv = &pc->pc_pventry[field * 64 + bit]; 2604 pc->pc_map[field] &= ~(1ul << bit); 2605 /* If this was the last item, move it to tail */ 2606 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2607 pc->pc_map[2] == 0) { 2608 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2609 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2610 pc_list); 2611 } 2612 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2613 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2614 return (pv); 2615 } 2616 } 2617 /* No free items, allocate another chunk */ 2618 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2619 if (m == NULL) { 2620 if (lockp == NULL) { 2621 PV_STAT(pc_chunk_tryfail++); 2622 return (NULL); 2623 } 2624 m = reclaim_pv_chunk(pmap, lockp); 2625 if (m == NULL) 2626 goto retry; 2627 } 2628 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2629 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2630 dump_add_page(m->phys_addr); 2631 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2632 pc->pc_pmap = pmap; 2633 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2634 pc->pc_map[1] = PC_FREE1; 2635 pc->pc_map[2] = PC_FREE2; 2636 mtx_lock(&pv_chunks_mutex); 2637 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2638 mtx_unlock(&pv_chunks_mutex); 2639 pv = &pc->pc_pventry[0]; 2640 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2641 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2642 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2643 return (pv); 2644 } 2645 2646 /* 2647 * Ensure that the number of spare PV entries in the specified pmap meets or 2648 * exceeds the given count, "needed". 2649 * 2650 * The given PV list lock may be released. 2651 */ 2652 static void 2653 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2654 { 2655 struct pch new_tail; 2656 struct pv_chunk *pc; 2657 vm_page_t m; 2658 int avail, free; 2659 bool reclaimed; 2660 2661 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2662 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2663 2664 /* 2665 * Newly allocated PV chunks must be stored in a private list until 2666 * the required number of PV chunks have been allocated. Otherwise, 2667 * reclaim_pv_chunk() could recycle one of these chunks. In 2668 * contrast, these chunks must be added to the pmap upon allocation. 
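	 * Linking each chunk into pm_pvchunk immediately is what makes its
	 * spare entries visible to get_pv_entry() and free_pv_entry().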
2669 */ 2670 TAILQ_INIT(&new_tail); 2671 retry: 2672 avail = 0; 2673 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2674 bit_count((bitstr_t *)pc->pc_map, 0, 2675 sizeof(pc->pc_map) * NBBY, &free); 2676 if (free == 0) 2677 break; 2678 avail += free; 2679 if (avail >= needed) 2680 break; 2681 } 2682 for (reclaimed = false; avail < needed; avail += _NPCPV) { 2683 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2684 if (m == NULL) { 2685 m = reclaim_pv_chunk(pmap, lockp); 2686 if (m == NULL) 2687 goto retry; 2688 reclaimed = true; 2689 } 2690 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2691 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2692 dump_add_page(m->phys_addr); 2693 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2694 pc->pc_pmap = pmap; 2695 pc->pc_map[0] = PC_FREE0; 2696 pc->pc_map[1] = PC_FREE1; 2697 pc->pc_map[2] = PC_FREE2; 2698 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2699 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2700 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2701 2702 /* 2703 * The reclaim might have freed a chunk from the current pmap. 2704 * If that chunk contained available entries, we need to 2705 * re-count the number of available entries. 2706 */ 2707 if (reclaimed) 2708 goto retry; 2709 } 2710 if (!TAILQ_EMPTY(&new_tail)) { 2711 mtx_lock(&pv_chunks_mutex); 2712 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2713 mtx_unlock(&pv_chunks_mutex); 2714 } 2715 } 2716 2717 /* 2718 * First find and then remove the pv entry for the specified pmap and virtual 2719 * address from the specified pv list. Returns the pv entry if found and NULL 2720 * otherwise. This operation can be performed on pv lists for either 4KB or 2721 * 2MB page mappings. 2722 */ 2723 static __inline pv_entry_t 2724 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2725 { 2726 pv_entry_t pv; 2727 2728 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2729 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2730 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2731 pvh->pv_gen++; 2732 break; 2733 } 2734 } 2735 return (pv); 2736 } 2737 2738 /* 2739 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2740 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2741 * entries for each of the 4KB page mappings. 2742 */ 2743 static void 2744 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2745 struct rwlock **lockp) 2746 { 2747 struct md_page *pvh; 2748 struct pv_chunk *pc; 2749 pv_entry_t pv; 2750 vm_offset_t va_last; 2751 vm_page_t m; 2752 int bit, field; 2753 2754 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2755 KASSERT((va & L2_OFFSET) == 0, 2756 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 2757 KASSERT((pa & L2_OFFSET) == 0, 2758 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 2759 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2760 2761 /* 2762 * Transfer the 2mpage's pv entry for this mapping to the first 2763 * page's pv list. Once this transfer begins, the pv list lock 2764 * must not be released until the last pv entry is reinstantiated. 2765 */ 2766 pvh = pa_to_pvh(pa); 2767 pv = pmap_pvh_remove(pvh, pmap, va); 2768 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2769 m = PHYS_TO_VM_PAGE(pa); 2770 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2771 m->md.pv_gen++; 2772 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
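	 * The spare entries consumed here must have been preallocated,
	 * e.g. by reserve_pv_entries() in the demotion path; the KASSERT
	 * below enforces this.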
*/ 2773 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 2774 va_last = va + L2_SIZE - PAGE_SIZE; 2775 for (;;) { 2776 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2777 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 2778 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 2779 for (field = 0; field < _NPCM; field++) { 2780 while (pc->pc_map[field]) { 2781 bit = ffsl(pc->pc_map[field]) - 1; 2782 pc->pc_map[field] &= ~(1ul << bit); 2783 pv = &pc->pc_pventry[field * 64 + bit]; 2784 va += PAGE_SIZE; 2785 pv->pv_va = va; 2786 m++; 2787 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2788 ("pmap_pv_demote_l2: page %p is not managed", m)); 2789 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2790 m->md.pv_gen++; 2791 if (va == va_last) 2792 goto out; 2793 } 2794 } 2795 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2796 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2797 } 2798 out: 2799 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 2800 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2801 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2802 } 2803 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 2804 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 2805 } 2806 2807 /* 2808 * First find and then destroy the pv entry for the specified pmap and virtual 2809 * address. This operation can be performed on pv lists for either 4KB or 2MB 2810 * page mappings. 2811 */ 2812 static void 2813 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2814 { 2815 pv_entry_t pv; 2816 2817 pv = pmap_pvh_remove(pvh, pmap, va); 2818 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2819 free_pv_entry(pmap, pv); 2820 } 2821 2822 /* 2823 * Conditionally create the PV entry for a 4KB page mapping if the required 2824 * memory can be allocated without resorting to reclamation. 2825 */ 2826 static boolean_t 2827 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2828 struct rwlock **lockp) 2829 { 2830 pv_entry_t pv; 2831 2832 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2833 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2834 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2835 pv->pv_va = va; 2836 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2837 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2838 m->md.pv_gen++; 2839 return (TRUE); 2840 } else 2841 return (FALSE); 2842 } 2843 2844 /* 2845 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2846 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2847 * false if the PV entry cannot be allocated without resorting to reclamation. 2848 */ 2849 static bool 2850 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2851 struct rwlock **lockp) 2852 { 2853 struct md_page *pvh; 2854 pv_entry_t pv; 2855 vm_paddr_t pa; 2856 2857 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2858 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2859 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
2860 NULL : lockp)) == NULL) 2861 return (false); 2862 pv->pv_va = va; 2863 pa = l2e & ~ATTR_MASK; 2864 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2865 pvh = pa_to_pvh(pa); 2866 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2867 pvh->pv_gen++; 2868 return (true); 2869 } 2870 2871 static void 2872 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2873 { 2874 pt_entry_t newl2, oldl2 __diagused; 2875 vm_page_t ml3; 2876 vm_paddr_t ml3pa; 2877 2878 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2879 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2880 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2881 2882 ml3 = pmap_remove_pt_page(pmap, va); 2883 if (ml3 == NULL) 2884 panic("pmap_remove_kernel_l2: Missing pt page"); 2885 2886 ml3pa = VM_PAGE_TO_PHYS(ml3); 2887 newl2 = ml3pa | L2_TABLE; 2888 2889 /* 2890 * If this page table page was unmapped by a promotion, then it 2891 * contains valid mappings. Zero it to invalidate those mappings. 2892 */ 2893 if (ml3->valid != 0) 2894 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2895 2896 /* 2897 * Demote the mapping. The caller must have already invalidated the 2898 * mapping (i.e., the "break" in break-before-make). 2899 */ 2900 oldl2 = pmap_load_store(l2, newl2); 2901 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2902 __func__, l2, oldl2)); 2903 } 2904 2905 /* 2906 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2907 */ 2908 static int 2909 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2910 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2911 { 2912 struct md_page *pvh; 2913 pt_entry_t old_l2; 2914 vm_page_t m, ml3, mt; 2915 2916 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2917 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2918 old_l2 = pmap_load_clear(l2); 2919 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 2920 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 2921 2922 /* 2923 * Since a promotion must break the 4KB page mappings before making 2924 * the 2MB page mapping, a pmap_invalidate_page() suffices. 
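	 * That is, no 4 KB TLB entries can exist for this range, so a single
	 * invalidation of the 2 MB entry is sufficient.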
2925 */ 2926 pmap_invalidate_page(pmap, sva, true); 2927 2928 if (old_l2 & ATTR_SW_WIRED) 2929 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2930 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2931 if (old_l2 & ATTR_SW_MANAGED) { 2932 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 2933 pvh = page_to_pvh(m); 2934 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); 2935 pmap_pvh_free(pvh, pmap, sva); 2936 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 2937 if (pmap_pte_dirty(pmap, old_l2)) 2938 vm_page_dirty(mt); 2939 if (old_l2 & ATTR_AF) 2940 vm_page_aflag_set(mt, PGA_REFERENCED); 2941 if (TAILQ_EMPTY(&mt->md.pv_list) && 2942 TAILQ_EMPTY(&pvh->pv_list)) 2943 vm_page_aflag_clear(mt, PGA_WRITEABLE); 2944 } 2945 } 2946 if (pmap == kernel_pmap) { 2947 pmap_remove_kernel_l2(pmap, l2, sva); 2948 } else { 2949 ml3 = pmap_remove_pt_page(pmap, sva); 2950 if (ml3 != NULL) { 2951 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2952 ("pmap_remove_l2: l3 page not promoted")); 2953 pmap_resident_count_dec(pmap, 1); 2954 KASSERT(ml3->ref_count == NL3PG, 2955 ("pmap_remove_l2: l3 page ref count error")); 2956 ml3->ref_count = 0; 2957 pmap_add_delayed_free_list(ml3, free, FALSE); 2958 } 2959 } 2960 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2961 } 2962 2963 /* 2964 * pmap_remove_l3: do the things to unmap a page in a process 2965 */ 2966 static int 2967 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2968 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2969 { 2970 struct md_page *pvh; 2971 pt_entry_t old_l3; 2972 vm_page_t m; 2973 2974 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2975 old_l3 = pmap_load_clear(l3); 2976 pmap_invalidate_page(pmap, va, true); 2977 if (old_l3 & ATTR_SW_WIRED) 2978 pmap->pm_stats.wired_count -= 1; 2979 pmap_resident_count_dec(pmap, 1); 2980 if (old_l3 & ATTR_SW_MANAGED) { 2981 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 2982 if (pmap_pte_dirty(pmap, old_l3)) 2983 vm_page_dirty(m); 2984 if (old_l3 & ATTR_AF) 2985 vm_page_aflag_set(m, PGA_REFERENCED); 2986 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2987 pmap_pvh_free(&m->md, pmap, va); 2988 if (TAILQ_EMPTY(&m->md.pv_list) && 2989 (m->flags & PG_FICTITIOUS) == 0) { 2990 pvh = page_to_pvh(m); 2991 if (TAILQ_EMPTY(&pvh->pv_list)) 2992 vm_page_aflag_clear(m, PGA_WRITEABLE); 2993 } 2994 } 2995 return (pmap_unuse_pt(pmap, va, l2e, free)); 2996 } 2997 2998 /* 2999 * Remove the specified range of addresses from the L3 page table that is 3000 * identified by the given L2 entry. 3001 */ 3002 static void 3003 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 3004 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 3005 { 3006 struct md_page *pvh; 3007 struct rwlock *new_lock; 3008 pt_entry_t *l3, old_l3; 3009 vm_offset_t va; 3010 vm_page_t l3pg, m; 3011 3012 KASSERT(ADDR_IS_CANONICAL(sva), 3013 ("%s: Start address not in canonical form: %lx", __func__, sva)); 3014 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 3015 ("%s: End address not in canonical form: %lx", __func__, eva)); 3016 3017 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3018 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 3019 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 3020 l3pg = !ADDR_IS_KERNEL(sva) ? 
PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL; 3021 va = eva; 3022 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 3023 if (!pmap_l3_valid(pmap_load(l3))) { 3024 if (va != eva) { 3025 pmap_invalidate_range(pmap, va, sva, true); 3026 va = eva; 3027 } 3028 continue; 3029 } 3030 old_l3 = pmap_load_clear(l3); 3031 if ((old_l3 & ATTR_SW_WIRED) != 0) 3032 pmap->pm_stats.wired_count--; 3033 pmap_resident_count_dec(pmap, 1); 3034 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 3035 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 3036 if (pmap_pte_dirty(pmap, old_l3)) 3037 vm_page_dirty(m); 3038 if ((old_l3 & ATTR_AF) != 0) 3039 vm_page_aflag_set(m, PGA_REFERENCED); 3040 new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); 3041 if (new_lock != *lockp) { 3042 if (*lockp != NULL) { 3043 /* 3044 * Pending TLB invalidations must be 3045 * performed before the PV list lock is 3046 * released. Otherwise, a concurrent 3047 * pmap_remove_all() on a physical page 3048 * could return while a stale TLB entry 3049 * still provides access to that page. 3050 */ 3051 if (va != eva) { 3052 pmap_invalidate_range(pmap, va, 3053 sva, true); 3054 va = eva; 3055 } 3056 rw_wunlock(*lockp); 3057 } 3058 *lockp = new_lock; 3059 rw_wlock(*lockp); 3060 } 3061 pmap_pvh_free(&m->md, pmap, sva); 3062 if (TAILQ_EMPTY(&m->md.pv_list) && 3063 (m->flags & PG_FICTITIOUS) == 0) { 3064 pvh = page_to_pvh(m); 3065 if (TAILQ_EMPTY(&pvh->pv_list)) 3066 vm_page_aflag_clear(m, PGA_WRITEABLE); 3067 } 3068 } 3069 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 3070 /* 3071 * _pmap_unwire_l3() has already invalidated the TLB 3072 * entries at all levels for "sva". So, we need not 3073 * perform "sva += L3_SIZE;" here. Moreover, we need 3074 * not perform "va = sva;" if "sva" is at the start 3075 * of a new valid range consisting of a single page. 3076 */ 3077 break; 3078 } 3079 if (va == eva) 3080 va = sva; 3081 } 3082 if (va != eva) 3083 pmap_invalidate_range(pmap, va, sva, true); 3084 } 3085 3086 /* 3087 * Remove the given range of addresses from the specified map. 3088 * 3089 * It is assumed that the start and end are properly 3090 * rounded to the page size. 3091 */ 3092 void 3093 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3094 { 3095 struct rwlock *lock; 3096 vm_offset_t va_next; 3097 pd_entry_t *l0, *l1, *l2; 3098 pt_entry_t l3_paddr; 3099 struct spglist free; 3100 3101 /* 3102 * Perform an unsynchronized read. This is, however, safe. 
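	 * A count of zero means that the pmap maps nothing at all; the
	 * caller is responsible for serializing updates to the given range.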
3103 */ 3104 if (pmap->pm_stats.resident_count == 0) 3105 return; 3106 3107 SLIST_INIT(&free); 3108 3109 PMAP_LOCK(pmap); 3110 3111 lock = NULL; 3112 for (; sva < eva; sva = va_next) { 3113 if (pmap->pm_stats.resident_count == 0) 3114 break; 3115 3116 l0 = pmap_l0(pmap, sva); 3117 if (pmap_load(l0) == 0) { 3118 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3119 if (va_next < sva) 3120 va_next = eva; 3121 continue; 3122 } 3123 3124 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3125 if (va_next < sva) 3126 va_next = eva; 3127 l1 = pmap_l0_to_l1(l0, sva); 3128 if (pmap_load(l1) == 0) 3129 continue; 3130 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3131 KASSERT(va_next <= eva, 3132 ("partial update of non-transparent 1G page " 3133 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3134 pmap_load(l1), sva, eva, va_next)); 3135 MPASS(pmap != kernel_pmap); 3136 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3137 pmap_clear(l1); 3138 pmap_invalidate_page(pmap, sva, true); 3139 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 3140 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 3141 continue; 3142 } 3143 3144 /* 3145 * Calculate index for next page table. 3146 */ 3147 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3148 if (va_next < sva) 3149 va_next = eva; 3150 3151 l2 = pmap_l1_to_l2(l1, sva); 3152 if (l2 == NULL) 3153 continue; 3154 3155 l3_paddr = pmap_load(l2); 3156 3157 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 3158 if (sva + L2_SIZE == va_next && eva >= va_next) { 3159 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 3160 &free, &lock); 3161 continue; 3162 } else if (pmap_demote_l2_locked(pmap, l2, sva, 3163 &lock) == NULL) 3164 continue; 3165 l3_paddr = pmap_load(l2); 3166 } 3167 3168 /* 3169 * Weed out invalid mappings. 3170 */ 3171 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 3172 continue; 3173 3174 /* 3175 * Limit our scan to either the end of the va represented 3176 * by the current page table page, or to the end of the 3177 * range being removed. 3178 */ 3179 if (va_next > eva) 3180 va_next = eva; 3181 3182 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 3183 &lock); 3184 } 3185 if (lock != NULL) 3186 rw_wunlock(lock); 3187 PMAP_UNLOCK(pmap); 3188 vm_page_free_pages_toq(&free, true); 3189 } 3190 3191 /* 3192 * Routine: pmap_remove_all 3193 * Function: 3194 * Removes this physical page from 3195 * all physical maps in which it resides. 3196 * Reflects back modify bits to the pager. 3197 * 3198 * Notes: 3199 * Original versions of this routine were very 3200 * inefficient because they iteratively called 3201 * pmap_remove (slow...) 3202 */ 3203 3204 void 3205 pmap_remove_all(vm_page_t m) 3206 { 3207 struct md_page *pvh; 3208 pv_entry_t pv; 3209 pmap_t pmap; 3210 struct rwlock *lock; 3211 pd_entry_t *pde, tpde; 3212 pt_entry_t *pte, tpte; 3213 vm_offset_t va; 3214 struct spglist free; 3215 int lvl, pvh_gen, md_gen; 3216 3217 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3218 ("pmap_remove_all: page %p is not managed", m)); 3219 SLIST_INIT(&free); 3220 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3221 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : page_to_pvh(m); 3222 rw_wlock(lock); 3223 retry: 3224 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3225 pmap = PV_PMAP(pv); 3226 if (!PMAP_TRYLOCK(pmap)) { 3227 pvh_gen = pvh->pv_gen; 3228 rw_wunlock(lock); 3229 PMAP_LOCK(pmap); 3230 rw_wlock(lock); 3231 if (pvh_gen != pvh->pv_gen) { 3232 PMAP_UNLOCK(pmap); 3233 goto retry; 3234 } 3235 } 3236 va = pv->pv_va; 3237 pte = pmap_pte_exists(pmap, va, 2, __func__); 3238 pmap_demote_l2_locked(pmap, pte, va, &lock); 3239 PMAP_UNLOCK(pmap); 3240 } 3241 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3242 pmap = PV_PMAP(pv); 3243 PMAP_ASSERT_STAGE1(pmap); 3244 if (!PMAP_TRYLOCK(pmap)) { 3245 pvh_gen = pvh->pv_gen; 3246 md_gen = m->md.pv_gen; 3247 rw_wunlock(lock); 3248 PMAP_LOCK(pmap); 3249 rw_wlock(lock); 3250 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3251 PMAP_UNLOCK(pmap); 3252 goto retry; 3253 } 3254 } 3255 pmap_resident_count_dec(pmap, 1); 3256 3257 pde = pmap_pde(pmap, pv->pv_va, &lvl); 3258 KASSERT(pde != NULL, 3259 ("pmap_remove_all: no page directory entry found")); 3260 KASSERT(lvl == 2, 3261 ("pmap_remove_all: invalid pde level %d", lvl)); 3262 tpde = pmap_load(pde); 3263 3264 pte = pmap_l2_to_l3(pde, pv->pv_va); 3265 tpte = pmap_load_clear(pte); 3266 if (tpte & ATTR_SW_WIRED) 3267 pmap->pm_stats.wired_count--; 3268 if ((tpte & ATTR_AF) != 0) { 3269 pmap_invalidate_page(pmap, pv->pv_va, true); 3270 vm_page_aflag_set(m, PGA_REFERENCED); 3271 } 3272 3273 /* 3274 * Update the vm_page_t clean and reference bits. 3275 */ 3276 if (pmap_pte_dirty(pmap, tpte)) 3277 vm_page_dirty(m); 3278 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 3279 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3280 m->md.pv_gen++; 3281 free_pv_entry(pmap, pv); 3282 PMAP_UNLOCK(pmap); 3283 } 3284 vm_page_aflag_clear(m, PGA_WRITEABLE); 3285 rw_wunlock(lock); 3286 vm_page_free_pages_toq(&free, true); 3287 } 3288 3289 /* 3290 * pmap_protect_l2: do the things to protect a 2MB page in a pmap 3291 */ 3292 static void 3293 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 3294 pt_entry_t nbits) 3295 { 3296 pd_entry_t old_l2; 3297 vm_page_t m, mt; 3298 3299 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3300 PMAP_ASSERT_STAGE1(pmap); 3301 KASSERT((sva & L2_OFFSET) == 0, 3302 ("pmap_protect_l2: sva is not 2mpage aligned")); 3303 old_l2 = pmap_load(l2); 3304 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3305 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 3306 3307 /* 3308 * Return if the L2 entry already has the desired access restrictions 3309 * in place. 3310 */ 3311 if ((old_l2 & mask) == nbits) 3312 return; 3313 3314 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 3315 cpu_spinwait(); 3316 3317 /* 3318 * When a dirty read/write superpage mapping is write protected, 3319 * update the dirty field of each of the superpage's constituent 4KB 3320 * pages. 3321 */ 3322 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 3323 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3324 pmap_pte_dirty(pmap, old_l2)) { 3325 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 3326 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3327 vm_page_dirty(mt); 3328 } 3329 3330 /* 3331 * Since a promotion must break the 4KB page mappings before making 3332 * the 2MB page mapping, a pmap_invalidate_page() suffices. 3333 */ 3334 pmap_invalidate_page(pmap, sva, true); 3335 } 3336 3337 /* 3338 * Set the physical protection on the 3339 * specified range of this map as requested. 
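 * Protections are only ever reduced by this function; VM_PROT_NONE is
 * handled by removing the range outright.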
3340 */ 3341 void 3342 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3343 { 3344 vm_offset_t va, va_next; 3345 pd_entry_t *l0, *l1, *l2; 3346 pt_entry_t *l3p, l3, mask, nbits; 3347 3348 PMAP_ASSERT_STAGE1(pmap); 3349 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3350 if (prot == VM_PROT_NONE) { 3351 pmap_remove(pmap, sva, eva); 3352 return; 3353 } 3354 3355 mask = nbits = 0; 3356 if ((prot & VM_PROT_WRITE) == 0) { 3357 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 3358 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 3359 } 3360 if ((prot & VM_PROT_EXECUTE) == 0) { 3361 mask |= ATTR_S1_XN; 3362 nbits |= ATTR_S1_XN; 3363 } 3364 if (mask == 0) 3365 return; 3366 3367 PMAP_LOCK(pmap); 3368 for (; sva < eva; sva = va_next) { 3369 l0 = pmap_l0(pmap, sva); 3370 if (pmap_load(l0) == 0) { 3371 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3372 if (va_next < sva) 3373 va_next = eva; 3374 continue; 3375 } 3376 3377 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3378 if (va_next < sva) 3379 va_next = eva; 3380 l1 = pmap_l0_to_l1(l0, sva); 3381 if (pmap_load(l1) == 0) 3382 continue; 3383 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3384 KASSERT(va_next <= eva, 3385 ("partial update of non-transparent 1G page " 3386 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3387 pmap_load(l1), sva, eva, va_next)); 3388 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3389 if ((pmap_load(l1) & mask) != nbits) { 3390 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 3391 pmap_invalidate_page(pmap, sva, true); 3392 } 3393 continue; 3394 } 3395 3396 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3397 if (va_next < sva) 3398 va_next = eva; 3399 3400 l2 = pmap_l1_to_l2(l1, sva); 3401 if (pmap_load(l2) == 0) 3402 continue; 3403 3404 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 3405 if (sva + L2_SIZE == va_next && eva >= va_next) { 3406 pmap_protect_l2(pmap, l2, sva, mask, nbits); 3407 continue; 3408 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 3409 continue; 3410 } 3411 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 3412 ("pmap_protect: Invalid L2 entry after demotion")); 3413 3414 if (va_next > eva) 3415 va_next = eva; 3416 3417 va = va_next; 3418 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 3419 sva += L3_SIZE) { 3420 l3 = pmap_load(l3p); 3421 3422 /* 3423 * Go to the next L3 entry if the current one is 3424 * invalid or already has the desired access 3425 * restrictions in place. (The latter case occurs 3426 * frequently. For example, in a "buildworld" 3427 * workload, almost 1 out of 4 L3 entries already 3428 * have the desired restrictions.) 3429 */ 3430 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 3431 if (va != va_next) { 3432 pmap_invalidate_range(pmap, va, sva, 3433 true); 3434 va = va_next; 3435 } 3436 continue; 3437 } 3438 3439 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 3440 nbits)) 3441 cpu_spinwait(); 3442 3443 /* 3444 * When a dirty read/write mapping is write protected, 3445 * update the page's dirty field. 3446 */ 3447 if ((l3 & ATTR_SW_MANAGED) != 0 && 3448 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3449 pmap_pte_dirty(pmap, l3)) 3450 vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); 3451 3452 if (va == va_next) 3453 va = sva; 3454 } 3455 if (va != va_next) 3456 pmap_invalidate_range(pmap, va, sva, true); 3457 } 3458 PMAP_UNLOCK(pmap); 3459 } 3460 3461 /* 3462 * Inserts the specified page table page into the specified pmap's collection 3463 * of idle page table pages. 
Each of a pmap's page table pages is responsible 3464 * for mapping a distinct range of virtual addresses. The pmap's collection is 3465 * ordered by this virtual address range. 3466 * 3467 * If "promoted" is false, then the page table page "mpte" must be zero filled. 3468 */ 3469 static __inline int 3470 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 3471 { 3472 3473 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3474 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 3475 return (vm_radix_insert(&pmap->pm_root, mpte)); 3476 } 3477 3478 /* 3479 * Removes the page table page mapping the specified virtual address from the 3480 * specified pmap's collection of idle page table pages, and returns it. 3481 * Otherwise, returns NULL if there is no page table page corresponding to the 3482 * specified virtual address. 3483 */ 3484 static __inline vm_page_t 3485 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 3486 { 3487 3488 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3489 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 3490 } 3491 3492 /* 3493 * Performs a break-before-make update of a pmap entry. This is needed when 3494 * either promoting or demoting pages to ensure the TLB doesn't get into an 3495 * inconsistent state. 3496 */ 3497 static void 3498 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 3499 vm_offset_t va, vm_size_t size) 3500 { 3501 register_t intr; 3502 3503 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3504 3505 /* 3506 * Ensure we don't get switched out with the page table in an 3507 * inconsistent state. We also need to ensure no interrupts fire 3508 * as they may make use of an address we are about to invalidate. 3509 */ 3510 intr = intr_disable(); 3511 3512 /* 3513 * Clear the old mapping's valid bit, but leave the rest of the entry 3514 * unchanged, so that a lockless, concurrent pmap_kextract() can still 3515 * lookup the physical address. 3516 */ 3517 pmap_clear_bits(pte, ATTR_DESCR_VALID); 3518 3519 /* 3520 * When promoting, the L{1,2}_TABLE entry that is being replaced might 3521 * be cached, so we invalidate intermediate entries as well as final 3522 * entries. 3523 */ 3524 pmap_invalidate_range(pmap, va, va + size, false); 3525 3526 /* Create the new mapping */ 3527 pmap_store(pte, newpte); 3528 dsb(ishst); 3529 3530 intr_restore(intr); 3531 } 3532 3533 #if VM_NRESERVLEVEL > 0 3534 /* 3535 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3536 * replace the many pv entries for the 4KB page mappings by a single pv entry 3537 * for the 2MB page mapping. 3538 */ 3539 static void 3540 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3541 struct rwlock **lockp) 3542 { 3543 struct md_page *pvh; 3544 pv_entry_t pv; 3545 vm_offset_t va_last; 3546 vm_page_t m; 3547 3548 KASSERT((pa & L2_OFFSET) == 0, 3549 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 3550 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3551 3552 /* 3553 * Transfer the first page's pv entry for this mapping to the 2mpage's 3554 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3555 * a transfer avoids the possibility that get_pv_entry() calls 3556 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3557 * mappings that is being promoted. 
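	 *
	 * The remaining 4 KB pv entries are freed below; only the single
	 * 2 MB pv entry survives the promotion.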
3558 */ 3559 m = PHYS_TO_VM_PAGE(pa); 3560 va = va & ~L2_OFFSET; 3561 pv = pmap_pvh_remove(&m->md, pmap, va); 3562 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 3563 pvh = page_to_pvh(m); 3564 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3565 pvh->pv_gen++; 3566 /* Free the remaining NPTEPG - 1 pv entries. */ 3567 va_last = va + L2_SIZE - PAGE_SIZE; 3568 do { 3569 m++; 3570 va += PAGE_SIZE; 3571 pmap_pvh_free(&m->md, pmap, va); 3572 } while (va < va_last); 3573 } 3574 3575 /* 3576 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3577 * single level 2 table entry to a single 2MB page mapping. For promotion 3578 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3579 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3580 * identical characteristics. 3581 */ 3582 static void 3583 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 3584 struct rwlock **lockp) 3585 { 3586 pt_entry_t *firstl3, *l3, newl2, oldl3, pa; 3587 vm_page_t mpte; 3588 vm_offset_t sva; 3589 3590 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3591 PMAP_ASSERT_STAGE1(pmap); 3592 3593 sva = va & ~L2_OFFSET; 3594 firstl3 = pmap_l2_to_l3(l2, sva); 3595 newl2 = pmap_load(firstl3); 3596 3597 if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { 3598 atomic_add_long(&pmap_l2_p_failures, 1); 3599 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3600 " in pmap %p", va, pmap); 3601 return; 3602 } 3603 3604 setl2: 3605 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3606 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3607 /* 3608 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 3609 * ATTR_SW_DBM can be cleared without a TLB invalidation. 3610 */ 3611 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 3612 goto setl2; 3613 newl2 &= ~ATTR_SW_DBM; 3614 } 3615 3616 pa = newl2 + L2_SIZE - PAGE_SIZE; 3617 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 3618 oldl3 = pmap_load(l3); 3619 setl3: 3620 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3621 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3622 /* 3623 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 3624 * set, ATTR_SW_DBM can be cleared without a TLB 3625 * invalidation. 3626 */ 3627 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 3628 ~ATTR_SW_DBM)) 3629 goto setl3; 3630 oldl3 &= ~ATTR_SW_DBM; 3631 } 3632 if (oldl3 != pa) { 3633 atomic_add_long(&pmap_l2_p_failures, 1); 3634 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3635 " in pmap %p", va, pmap); 3636 return; 3637 } 3638 pa -= PAGE_SIZE; 3639 } 3640 3641 /* 3642 * Save the page table page in its current state until the L2 3643 * mapping the superpage is demoted by pmap_demote_l2() or 3644 * destroyed by pmap_remove_l3(). 
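	 * Preserving the filled L3 page allows a later demotion to
	 * reinstall it without having to repopulate a fresh page table
	 * page.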
3645 */ 3646 mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 3647 KASSERT(mpte >= vm_page_array && 3648 mpte < &vm_page_array[vm_page_array_size], 3649 ("pmap_promote_l2: page table page is out of range")); 3650 KASSERT(mpte->pindex == pmap_l2_pindex(va), 3651 ("pmap_promote_l2: page table page's pindex is wrong")); 3652 if (pmap_insert_pt_page(pmap, mpte, true)) { 3653 atomic_add_long(&pmap_l2_p_failures, 1); 3654 CTR2(KTR_PMAP, 3655 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 3656 pmap); 3657 return; 3658 } 3659 3660 if ((newl2 & ATTR_SW_MANAGED) != 0) 3661 pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); 3662 3663 newl2 &= ~ATTR_DESCR_MASK; 3664 newl2 |= L2_BLOCK; 3665 3666 pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); 3667 3668 atomic_add_long(&pmap_l2_promotions, 1); 3669 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 3670 pmap); 3671 } 3672 #endif /* VM_NRESERVLEVEL > 0 */ 3673 3674 static int 3675 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 3676 int psind) 3677 { 3678 pd_entry_t *l0p, *l1p, *l2p, origpte; 3679 vm_page_t mp; 3680 3681 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3682 KASSERT(psind > 0 && psind < MAXPAGESIZES, 3683 ("psind %d unexpected", psind)); 3684 KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0, 3685 ("unaligned phys address %#lx newpte %#lx psind %d", 3686 (newpte & ~ATTR_MASK), newpte, psind)); 3687 3688 restart: 3689 if (psind == 2) { 3690 l0p = pmap_l0(pmap, va); 3691 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 3692 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 3693 if (mp == NULL) { 3694 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 3695 return (KERN_RESOURCE_SHORTAGE); 3696 PMAP_UNLOCK(pmap); 3697 vm_wait(NULL); 3698 PMAP_LOCK(pmap); 3699 goto restart; 3700 } 3701 l1p = pmap_l0_to_l1(l0p, va); 3702 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 3703 origpte = pmap_load(l1p); 3704 } else { 3705 l1p = pmap_l0_to_l1(l0p, va); 3706 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 3707 origpte = pmap_load(l1p); 3708 if ((origpte & ATTR_DESCR_VALID) == 0) { 3709 mp = PHYS_TO_VM_PAGE(pmap_load(l0p) & 3710 ~ATTR_MASK); 3711 mp->ref_count++; 3712 } 3713 } 3714 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 3715 ((origpte & ATTR_DESCR_MASK) == L1_BLOCK && 3716 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 3717 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 3718 va, origpte, newpte)); 3719 pmap_store(l1p, newpte); 3720 } else /* (psind == 1) */ { 3721 l2p = pmap_l2(pmap, va); 3722 if (l2p == NULL) { 3723 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 3724 if (mp == NULL) { 3725 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 3726 return (KERN_RESOURCE_SHORTAGE); 3727 PMAP_UNLOCK(pmap); 3728 vm_wait(NULL); 3729 PMAP_LOCK(pmap); 3730 goto restart; 3731 } 3732 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 3733 l2p = &l2p[pmap_l2_index(va)]; 3734 origpte = pmap_load(l2p); 3735 } else { 3736 l1p = pmap_l1(pmap, va); 3737 origpte = pmap_load(l2p); 3738 if ((origpte & ATTR_DESCR_VALID) == 0) { 3739 mp = PHYS_TO_VM_PAGE(pmap_load(l1p) & 3740 ~ATTR_MASK); 3741 mp->ref_count++; 3742 } 3743 } 3744 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 3745 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 3746 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 3747 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 3748 va, origpte, newpte)); 3749 pmap_store(l2p, newpte); 3750 } 3751 dsb(ishst); 3752 3753 if ((origpte & ATTR_DESCR_VALID) == 0) 3754 
pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 3755 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 3756 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 3757 else if ((newpte & ATTR_SW_WIRED) == 0 && 3758 (origpte & ATTR_SW_WIRED) != 0) 3759 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 3760 3761 return (KERN_SUCCESS); 3762 } 3763 3764 /* 3765 * Insert the given physical page (p) at 3766 * the specified virtual address (v) in the 3767 * target physical map with the protection requested. 3768 * 3769 * If specified, the page will be wired down, meaning 3770 * that the related pte can not be reclaimed. 3771 * 3772 * NB: This is the only routine which MAY NOT lazy-evaluate 3773 * or lose information. That is, this routine must actually 3774 * insert this page into the given map NOW. 3775 */ 3776 int 3777 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3778 u_int flags, int8_t psind) 3779 { 3780 struct rwlock *lock; 3781 pd_entry_t *pde; 3782 pt_entry_t new_l3, orig_l3; 3783 pt_entry_t *l2, *l3; 3784 pv_entry_t pv; 3785 vm_paddr_t opa, pa; 3786 vm_page_t mpte, om; 3787 boolean_t nosleep; 3788 int lvl, rv; 3789 3790 KASSERT(ADDR_IS_CANONICAL(va), 3791 ("%s: Address not in canonical form: %lx", __func__, va)); 3792 3793 va = trunc_page(va); 3794 if ((m->oflags & VPO_UNMANAGED) == 0) 3795 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3796 pa = VM_PAGE_TO_PHYS(m); 3797 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE); 3798 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 3799 new_l3 |= pmap_pte_prot(pmap, prot); 3800 3801 if ((flags & PMAP_ENTER_WIRED) != 0) 3802 new_l3 |= ATTR_SW_WIRED; 3803 if (pmap->pm_stage == PM_STAGE1) { 3804 if (!ADDR_IS_KERNEL(va)) 3805 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 3806 else 3807 new_l3 |= ATTR_S1_UXN; 3808 if (pmap != kernel_pmap) 3809 new_l3 |= ATTR_S1_nG; 3810 } else { 3811 /* 3812 * Clear the access flag on executable mappings, this will be 3813 * set later when the page is accessed. The fault handler is 3814 * required to invalidate the I-cache. 3815 * 3816 * TODO: Switch to the valid flag to allow hardware management 3817 * of the access flag. Much of the pmap code assumes the 3818 * valid flag is set and fails to destroy the old page tables 3819 * correctly if it is clear. 3820 */ 3821 if (prot & VM_PROT_EXECUTE) 3822 new_l3 &= ~ATTR_AF; 3823 } 3824 if ((m->oflags & VPO_UNMANAGED) == 0) { 3825 new_l3 |= ATTR_SW_MANAGED; 3826 if ((prot & VM_PROT_WRITE) != 0) { 3827 new_l3 |= ATTR_SW_DBM; 3828 if ((flags & VM_PROT_WRITE) == 0) { 3829 if (pmap->pm_stage == PM_STAGE1) 3830 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 3831 else 3832 new_l3 &= 3833 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 3834 } 3835 } 3836 } 3837 3838 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 3839 3840 lock = NULL; 3841 PMAP_LOCK(pmap); 3842 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 3843 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 3844 ("managed largepage va %#lx flags %#x", va, flags)); 3845 new_l3 &= ~L3_PAGE; 3846 if (psind == 2) 3847 new_l3 |= L1_BLOCK; 3848 else /* (psind == 1) */ 3849 new_l3 |= L2_BLOCK; 3850 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 3851 goto out; 3852 } 3853 if (psind == 1) { 3854 /* Assert the required virtual and physical alignment. 
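 * For psind == 1 this means a 2MB-aligned "va" and a page "m" that
 * begins a fully populated 2MB physical run (m->psind > 0).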
*/ 3855 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 3856 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3857 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 3858 flags, m, &lock); 3859 goto out; 3860 } 3861 mpte = NULL; 3862 3863 /* 3864 * In the case that a page table page is not 3865 * resident, we are creating it here. 3866 */ 3867 retry: 3868 pde = pmap_pde(pmap, va, &lvl); 3869 if (pde != NULL && lvl == 2) { 3870 l3 = pmap_l2_to_l3(pde, va); 3871 if (!ADDR_IS_KERNEL(va) && mpte == NULL) { 3872 mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 3873 mpte->ref_count++; 3874 } 3875 goto havel3; 3876 } else if (pde != NULL && lvl == 1) { 3877 l2 = pmap_l1_to_l2(pde, va); 3878 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 3879 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 3880 l3 = &l3[pmap_l3_index(va)]; 3881 if (!ADDR_IS_KERNEL(va)) { 3882 mpte = PHYS_TO_VM_PAGE( 3883 pmap_load(l2) & ~ATTR_MASK); 3884 mpte->ref_count++; 3885 } 3886 goto havel3; 3887 } 3888 /* We need to allocate an L3 table. */ 3889 } 3890 if (!ADDR_IS_KERNEL(va)) { 3891 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 3892 3893 /* 3894 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 3895 * to handle the possibility that a superpage mapping for "va" 3896 * was created while we slept. 3897 */ 3898 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 3899 nosleep ? NULL : &lock); 3900 if (mpte == NULL && nosleep) { 3901 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 3902 rv = KERN_RESOURCE_SHORTAGE; 3903 goto out; 3904 } 3905 goto retry; 3906 } else 3907 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 3908 3909 havel3: 3910 orig_l3 = pmap_load(l3); 3911 opa = orig_l3 & ~ATTR_MASK; 3912 pv = NULL; 3913 3914 /* 3915 * Is the specified virtual address already mapped? 3916 */ 3917 if (pmap_l3_valid(orig_l3)) { 3918 /* 3919 * Only allow adding new entries on stage 2 tables for now. 3920 * This simplifies cache invalidation as we may need to call 3921 * into EL2 to perform such actions. 3922 */ 3923 PMAP_ASSERT_STAGE1(pmap); 3924 /* 3925 * Wiring change, just update stats. We don't worry about 3926 * wiring PT pages as they remain resident as long as there 3927 * are valid mappings in them. Hence, if a user page is wired, 3928 * the PT page will be also. 3929 */ 3930 if ((flags & PMAP_ENTER_WIRED) != 0 && 3931 (orig_l3 & ATTR_SW_WIRED) == 0) 3932 pmap->pm_stats.wired_count++; 3933 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3934 (orig_l3 & ATTR_SW_WIRED) != 0) 3935 pmap->pm_stats.wired_count--; 3936 3937 /* 3938 * Remove the extra PT page reference. 3939 */ 3940 if (mpte != NULL) { 3941 mpte->ref_count--; 3942 KASSERT(mpte->ref_count > 0, 3943 ("pmap_enter: missing reference to page table page," 3944 " va: 0x%lx", va)); 3945 } 3946 3947 /* 3948 * Has the physical page changed? 3949 */ 3950 if (opa == pa) { 3951 /* 3952 * No, might be a protection or wiring change. 3953 */ 3954 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 3955 (new_l3 & ATTR_SW_DBM) != 0) 3956 vm_page_aflag_set(m, PGA_WRITEABLE); 3957 goto validate; 3958 } 3959 3960 /* 3961 * The physical page has changed. Temporarily invalidate 3962 * the mapping. 
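 * (This follows the Arm architecture's break-before-make rule: the old
 * translation must pass through an invalid state before a new output
 * address is installed, or the TLB could briefly hold two conflicting
 * entries for "va".)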
3963 */ 3964 orig_l3 = pmap_load_clear(l3); 3965 KASSERT((orig_l3 & ~ATTR_MASK) == opa, 3966 ("pmap_enter: unexpected pa update for %#lx", va)); 3967 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 3968 om = PHYS_TO_VM_PAGE(opa); 3969 3970 /* 3971 * The pmap lock is sufficient to synchronize with 3972 * concurrent calls to pmap_page_test_mappings() and 3973 * pmap_ts_referenced(). 3974 */ 3975 if (pmap_pte_dirty(pmap, orig_l3)) 3976 vm_page_dirty(om); 3977 if ((orig_l3 & ATTR_AF) != 0) { 3978 pmap_invalidate_page(pmap, va, true); 3979 vm_page_aflag_set(om, PGA_REFERENCED); 3980 } 3981 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3982 pv = pmap_pvh_remove(&om->md, pmap, va); 3983 if ((m->oflags & VPO_UNMANAGED) != 0) 3984 free_pv_entry(pmap, pv); 3985 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3986 TAILQ_EMPTY(&om->md.pv_list) && 3987 ((om->flags & PG_FICTITIOUS) != 0 || 3988 TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) 3989 vm_page_aflag_clear(om, PGA_WRITEABLE); 3990 } else { 3991 KASSERT((orig_l3 & ATTR_AF) != 0, 3992 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 3993 pmap_invalidate_page(pmap, va, true); 3994 } 3995 orig_l3 = 0; 3996 } else { 3997 /* 3998 * Increment the counters. 3999 */ 4000 if ((new_l3 & ATTR_SW_WIRED) != 0) 4001 pmap->pm_stats.wired_count++; 4002 pmap_resident_count_inc(pmap, 1); 4003 } 4004 /* 4005 * Enter on the PV list if part of our managed memory. 4006 */ 4007 if ((m->oflags & VPO_UNMANAGED) == 0) { 4008 if (pv == NULL) { 4009 pv = get_pv_entry(pmap, &lock); 4010 pv->pv_va = va; 4011 } 4012 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4013 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4014 m->md.pv_gen++; 4015 if ((new_l3 & ATTR_SW_DBM) != 0) 4016 vm_page_aflag_set(m, PGA_WRITEABLE); 4017 } 4018 4019 validate: 4020 if (pmap->pm_stage == PM_STAGE1) { 4021 /* 4022 * Sync the icache if the mapping has exec permission and the 4023 * memory attribute is VM_MEMATTR_WRITE_BACK. Do it now, before 4024 * the mapping is stored and made valid for the hardware table 4025 * walk; if done later, another CPU could access this page 4026 * before the caches are properly synced. Don't do it for 4027 * kernel memory, which is mapped with exec permission even if 4028 * it isn't going to hold executable code; there, an icache 4029 * sync is only needed after a kernel module is loaded and its 4030 * relocation info is processed, in elf_cpu_load_file(). 4031 */ 4032 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4033 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && 4034 (opa != pa || (orig_l3 & ATTR_S1_XN))) { 4035 PMAP_ASSERT_STAGE1(pmap); 4036 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4037 } 4038 } else { 4039 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4040 } 4041 4042 /* 4043 * Update the L3 entry. 4044 */ 4045 if (pmap_l3_valid(orig_l3)) { 4046 PMAP_ASSERT_STAGE1(pmap); 4047 KASSERT(opa == pa, ("pmap_enter: invalid update")); 4048 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { 4049 /* same PA, different attributes */ 4050 orig_l3 = pmap_load_store(l3, new_l3); 4051 pmap_invalidate_page(pmap, va, true); 4052 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 4053 pmap_pte_dirty(pmap, orig_l3)) 4054 vm_page_dirty(m); 4055 } else { 4056 /* 4057 * orig_l3 == new_l3 4058 * This can happen if multiple threads simultaneously 4059 * access a not-yet-mapped page. It is bad for 4060 * performance since it can cause a full 4061 * demotion-NOP-promotion cycle.
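 * (For example, two threads faulting on the same unmapped page can
 * both reach pmap_enter(); the second simply finds the identical
 * entry already installed.)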
4062 * Other possible reasons are: 4063 * - the VM and pmap memory layouts have diverged, or 4064 * - a TLB flush is missing somewhere and the CPU doesn't 4065 * see the actual mapping. 4066 */ 4067 CTR4(KTR_PMAP, "%s: already mapped page - " 4068 "pmap %p va %#lx pte 0x%lx", 4069 __func__, pmap, va, new_l3); 4070 } 4071 } else { 4072 /* New mapping */ 4073 pmap_store(l3, new_l3); 4074 dsb(ishst); 4075 } 4076 4077 #if VM_NRESERVLEVEL > 0 4078 /* 4079 * Try to promote from level 3 pages to a level 2 superpage. This 4080 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at 4081 * stage 1 specific fields and performs a break-before-make sequence 4082 * that is incorrect for a stage 2 pmap. 4083 */ 4084 if ((mpte == NULL || mpte->ref_count == NL3PG) && 4085 pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 && 4086 (m->flags & PG_FICTITIOUS) == 0 && 4087 vm_reserv_level_iffullpop(m) == 0) { 4088 pmap_promote_l2(pmap, pde, va, &lock); 4089 } 4090 #endif 4091 4092 rv = KERN_SUCCESS; 4093 out: 4094 if (lock != NULL) 4095 rw_wunlock(lock); 4096 PMAP_UNLOCK(pmap); 4097 return (rv); 4098 } 4099 4100 /* 4101 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 4102 * if successful. Returns false if (1) a page table page cannot be allocated 4103 * without sleeping, (2) a mapping already exists at the specified virtual 4104 * address, or (3) a PV entry cannot be allocated without reclaiming another 4105 * PV entry. 4106 */ 4107 static bool 4108 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4109 struct rwlock **lockp) 4110 { 4111 pd_entry_t new_l2; 4112 4113 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4114 PMAP_ASSERT_STAGE1(pmap); 4115 KASSERT(ADDR_IS_CANONICAL(va), 4116 ("%s: Address not in canonical form: %lx", __func__, va)); 4117 4118 new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | 4119 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 4120 L2_BLOCK); 4121 if ((m->oflags & VPO_UNMANAGED) == 0) { 4122 new_l2 |= ATTR_SW_MANAGED; 4123 new_l2 &= ~ATTR_AF; 4124 } 4125 if ((prot & VM_PROT_EXECUTE) == 0 || 4126 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4127 new_l2 |= ATTR_S1_XN; 4128 if (!ADDR_IS_KERNEL(va)) 4129 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4130 else 4131 new_l2 |= ATTR_S1_UXN; 4132 if (pmap != kernel_pmap) 4133 new_l2 |= ATTR_S1_nG; 4134 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 4135 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp) == 4136 KERN_SUCCESS); 4137 } 4138 4139 /* 4140 * Returns true if every page table entry in the specified page table is 4141 * zero. 4142 */ 4143 static bool 4144 pmap_every_pte_zero(vm_paddr_t pa) 4145 { 4146 pt_entry_t *pt_end, *pte; 4147 4148 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 4149 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 4150 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 4151 if (*pte != 0) 4152 return (false); 4153 } 4154 return (true); 4155 } 4156 4157 /* 4158 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 4159 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4160 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4161 * a mapping already exists at the specified virtual address. Returns 4162 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 4163 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 4164 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
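 * A minimal usage sketch (mirroring pmap_enter_2mpage() above; the
 * pmap lock must be held and "va" must be 2MB-aligned, and a real
 * caller would set further attribute bits in "new_l2"):
 *
 *	new_l2 = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | L2_BLOCK;
 *	rv = pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
 *	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, &lock);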
4165 */ 4166 static int 4167 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 4168 vm_page_t m, struct rwlock **lockp) 4169 { 4170 struct spglist free; 4171 pd_entry_t *l2, old_l2; 4172 vm_page_t l2pg, mt; 4173 4174 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4175 KASSERT(ADDR_IS_CANONICAL(va), 4176 ("%s: Address not in canonical form: %lx", __func__, va)); 4177 4178 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 4179 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 4180 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 4181 va, pmap); 4182 return (KERN_RESOURCE_SHORTAGE); 4183 } 4184 4185 /* 4186 * If there are existing mappings, either abort or remove them. 4187 */ 4188 if ((old_l2 = pmap_load(l2)) != 0) { 4189 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 4190 ("pmap_enter_l2: l2pg's ref count is too low")); 4191 if ((flags & PMAP_ENTER_NOREPLACE) != 0 && 4192 (!ADDR_IS_KERNEL(va) || 4193 (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK || 4194 !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) { 4195 if (l2pg != NULL) 4196 l2pg->ref_count--; 4197 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx" 4198 " in pmap %p", va, pmap); 4199 return (KERN_FAILURE); 4200 } 4201 SLIST_INIT(&free); 4202 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 4203 (void)pmap_remove_l2(pmap, l2, va, 4204 pmap_load(pmap_l1(pmap, va)), &free, lockp); 4205 else 4206 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 4207 &free, lockp); 4208 if (!ADDR_IS_KERNEL(va)) { 4209 vm_page_free_pages_toq(&free, true); 4210 KASSERT(pmap_load(l2) == 0, 4211 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 4212 } else { 4213 KASSERT(SLIST_EMPTY(&free), 4214 ("pmap_enter_l2: freed kernel page table page")); 4215 4216 /* 4217 * Both pmap_remove_l2() and pmap_remove_l3_range() 4218 * will leave the kernel page table page zero filled. 4219 * Nonetheless, the TLB could have an intermediate 4220 * entry for the kernel page table page, so request 4221 * an invalidation at all levels after clearing 4222 * the L2_TABLE entry. 4223 */ 4224 mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 4225 if (pmap_insert_pt_page(pmap, mt, false)) 4226 panic("pmap_enter_l2: trie insert failed"); 4227 pmap_clear(l2); 4228 pmap_invalidate_page(pmap, va, false); 4229 } 4230 } 4231 4232 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 4233 /* 4234 * Abort this mapping if its PV entry could not be created. 4235 */ 4236 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 4237 if (l2pg != NULL) 4238 pmap_abort_ptp(pmap, va, l2pg); 4239 CTR2(KTR_PMAP, 4240 "pmap_enter_l2: failure for va %#lx in pmap %p", 4241 va, pmap); 4242 return (KERN_RESOURCE_SHORTAGE); 4243 } 4244 if ((new_l2 & ATTR_SW_DBM) != 0) 4245 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4246 vm_page_aflag_set(mt, PGA_WRITEABLE); 4247 } 4248 4249 /* 4250 * Increment counters. 4251 */ 4252 if ((new_l2 & ATTR_SW_WIRED) != 0) 4253 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 4254 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 4255 4256 /* 4257 * Conditionally sync the icache. See pmap_enter() for details. 4258 */ 4259 if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) != 4260 (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) && 4261 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 4262 cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK), 4263 L2_SIZE); 4264 } 4265 4266 /* 4267 * Map the superpage. 
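 * (pmap_store() publishes the L2 block entry with a single store; the
 * dsb(ishst) that follows orders the page table update before any
 * subsequent use of the new mapping by the table walker.)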
4268 */ 4269 pmap_store(l2, new_l2); 4270 dsb(ishst); 4271 4272 atomic_add_long(&pmap_l2_mappings, 1); 4273 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 4274 va, pmap); 4275 4276 return (KERN_SUCCESS); 4277 } 4278 4279 /* 4280 * Maps a sequence of resident pages belonging to the same object. 4281 * The sequence begins with the given page m_start. This page is 4282 * mapped at the given virtual address start. Each subsequent page is 4283 * mapped at a virtual address that is offset from start by the same 4284 * amount as the page is offset from m_start within the object. The 4285 * last page in the sequence is the page with the largest offset from 4286 * m_start that can be mapped at a virtual address less than the given 4287 * virtual address end. Not every virtual page between start and end 4288 * is mapped; only those for which a resident page exists with the 4289 * corresponding offset from m_start are mapped. 4290 */ 4291 void 4292 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4293 vm_page_t m_start, vm_prot_t prot) 4294 { 4295 struct rwlock *lock; 4296 vm_offset_t va; 4297 vm_page_t m, mpte; 4298 vm_pindex_t diff, psize; 4299 4300 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4301 4302 psize = atop(end - start); 4303 mpte = NULL; 4304 m = m_start; 4305 lock = NULL; 4306 PMAP_LOCK(pmap); 4307 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4308 va = start + ptoa(diff); 4309 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 4310 m->psind == 1 && pmap_ps_enabled(pmap) && 4311 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 4312 m = &m[L2_SIZE / PAGE_SIZE - 1]; 4313 else 4314 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 4315 &lock); 4316 m = TAILQ_NEXT(m, listq); 4317 } 4318 if (lock != NULL) 4319 rw_wunlock(lock); 4320 PMAP_UNLOCK(pmap); 4321 } 4322 4323 /* 4324 * This code makes some *MAJOR* assumptions: 4325 * 1. The current pmap and the given pmap exist. 4326 * 2. Not wired. 4327 * 3. Read access. 4328 * 4. No page table pages. 4329 * but it is *MUCH* faster than pmap_enter... 4330 */ 4331 4332 void 4333 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4334 { 4335 struct rwlock *lock; 4336 4337 lock = NULL; 4338 PMAP_LOCK(pmap); 4339 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4340 if (lock != NULL) 4341 rw_wunlock(lock); 4342 PMAP_UNLOCK(pmap); 4343 } 4344 4345 static vm_page_t 4346 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4347 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4348 { 4349 pd_entry_t *pde; 4350 pt_entry_t *l2, *l3, l3_val; 4351 vm_paddr_t pa; 4352 int lvl; 4353 4354 KASSERT(!VA_IS_CLEANMAP(va) || 4355 (m->oflags & VPO_UNMANAGED) != 0, 4356 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4357 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4358 PMAP_ASSERT_STAGE1(pmap); 4359 KASSERT(ADDR_IS_CANONICAL(va), 4360 ("%s: Address not in canonical form: %lx", __func__, va)); 4361 4362 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 4363 /* 4364 * In the case that a page table page is not 4365 * resident, we are creating it here.
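 * Unlike pmap_enter(), an allocation failure here is not retried;
 * these mappings are speculative, so the caller simply proceeds
 * without the mapping when NULL is returned.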
4366 */ 4367 if (!ADDR_IS_KERNEL(va)) { 4368 vm_pindex_t l2pindex; 4369 4370 /* 4371 * Calculate pagetable page index 4372 */ 4373 l2pindex = pmap_l2_pindex(va); 4374 if (mpte && (mpte->pindex == l2pindex)) { 4375 mpte->ref_count++; 4376 } else { 4377 /* 4378 * Get the l2 entry 4379 */ 4380 pde = pmap_pde(pmap, va, &lvl); 4381 4382 /* 4383 * If the page table page is mapped, we just increment 4384 * the hold count, and activate it. Otherwise, we 4385 * attempt to allocate a page table page. If this 4386 * attempt fails, we don't retry. Instead, we give up. 4387 */ 4388 if (lvl == 1) { 4389 l2 = pmap_l1_to_l2(pde, va); 4390 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 4391 L2_BLOCK) 4392 return (NULL); 4393 } 4394 if (lvl == 2 && pmap_load(pde) != 0) { 4395 mpte = 4396 PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 4397 mpte->ref_count++; 4398 } else { 4399 /* 4400 * Pass NULL instead of the PV list lock 4401 * pointer, because we don't intend to sleep. 4402 */ 4403 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 4404 if (mpte == NULL) 4405 return (mpte); 4406 } 4407 } 4408 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4409 l3 = &l3[pmap_l3_index(va)]; 4410 } else { 4411 mpte = NULL; 4412 pde = pmap_pde(kernel_pmap, va, &lvl); 4413 KASSERT(pde != NULL, 4414 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 4415 va)); 4416 KASSERT(lvl == 2, 4417 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 4418 l3 = pmap_l2_to_l3(pde, va); 4419 } 4420 4421 /* 4422 * Abort if a mapping already exists. 4423 */ 4424 if (pmap_load(l3) != 0) { 4425 if (mpte != NULL) 4426 mpte->ref_count--; 4427 return (NULL); 4428 } 4429 4430 /* 4431 * Enter on the PV list if part of our managed memory. 4432 */ 4433 if ((m->oflags & VPO_UNMANAGED) == 0 && 4434 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4435 if (mpte != NULL) 4436 pmap_abort_ptp(pmap, va, mpte); 4437 return (NULL); 4438 } 4439 4440 /* 4441 * Increment counters 4442 */ 4443 pmap_resident_count_inc(pmap, 1); 4444 4445 pa = VM_PAGE_TO_PHYS(m); 4446 l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 4447 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 4448 if ((prot & VM_PROT_EXECUTE) == 0 || 4449 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4450 l3_val |= ATTR_S1_XN; 4451 if (!ADDR_IS_KERNEL(va)) 4452 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4453 else 4454 l3_val |= ATTR_S1_UXN; 4455 if (pmap != kernel_pmap) 4456 l3_val |= ATTR_S1_nG; 4457 4458 /* 4459 * Now validate mapping with RO protection 4460 */ 4461 if ((m->oflags & VPO_UNMANAGED) == 0) { 4462 l3_val |= ATTR_SW_MANAGED; 4463 l3_val &= ~ATTR_AF; 4464 } 4465 4466 /* Sync icache before the mapping is stored to PTE */ 4467 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4468 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 4469 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4470 4471 pmap_store(l3, l3_val); 4472 dsb(ishst); 4473 4474 return (mpte); 4475 } 4476 4477 /* 4478 * This code maps large physical mmap regions into the 4479 * processor address space. Note that some shortcuts 4480 * are taken, but the code works. 4481 */ 4482 void 4483 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4484 vm_pindex_t pindex, vm_size_t size) 4485 { 4486 4487 VM_OBJECT_ASSERT_WLOCKED(object); 4488 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4489 ("pmap_object_init_pt: non-device object")); 4490 } 4491 4492 /* 4493 * Clear the wired attribute from the mappings for the specified range of 4494 * addresses in the given pmap. 
Every valid mapping within that range 4495 * must have the wired attribute set. In contrast, invalid mappings 4496 * cannot have the wired attribute set, so they are ignored. 4497 * 4498 * The wired attribute of the page table entry is not a hardware feature, 4499 * so there is no need to invalidate any TLB entries. 4500 */ 4501 void 4502 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4503 { 4504 vm_offset_t va_next; 4505 pd_entry_t *l0, *l1, *l2; 4506 pt_entry_t *l3; 4507 4508 PMAP_LOCK(pmap); 4509 for (; sva < eva; sva = va_next) { 4510 l0 = pmap_l0(pmap, sva); 4511 if (pmap_load(l0) == 0) { 4512 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4513 if (va_next < sva) 4514 va_next = eva; 4515 continue; 4516 } 4517 4518 l1 = pmap_l0_to_l1(l0, sva); 4519 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4520 if (va_next < sva) 4521 va_next = eva; 4522 if (pmap_load(l1) == 0) 4523 continue; 4524 4525 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4526 KASSERT(va_next <= eva, 4527 ("partial update of non-transparent 1G page " 4528 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4529 pmap_load(l1), sva, eva, va_next)); 4530 MPASS(pmap != kernel_pmap); 4531 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 4532 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 4533 pmap_clear_bits(l1, ATTR_SW_WIRED); 4534 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 4535 continue; 4536 } 4537 4538 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4539 if (va_next < sva) 4540 va_next = eva; 4541 4542 l2 = pmap_l1_to_l2(l1, sva); 4543 if (pmap_load(l2) == 0) 4544 continue; 4545 4546 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4547 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 4548 panic("pmap_unwire: l2 %#jx is missing " 4549 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 4550 4551 /* 4552 * Are we unwiring the entire large page? If not, 4553 * demote the mapping and fall through. 4554 */ 4555 if (sva + L2_SIZE == va_next && eva >= va_next) { 4556 pmap_clear_bits(l2, ATTR_SW_WIRED); 4557 pmap->pm_stats.wired_count -= L2_SIZE / 4558 PAGE_SIZE; 4559 continue; 4560 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 4561 panic("pmap_unwire: demotion failed"); 4562 } 4563 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4564 ("pmap_unwire: Invalid l2 entry after demotion")); 4565 4566 if (va_next > eva) 4567 va_next = eva; 4568 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 4569 sva += L3_SIZE) { 4570 if (pmap_load(l3) == 0) 4571 continue; 4572 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 4573 panic("pmap_unwire: l3 %#jx is missing " 4574 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 4575 4576 /* 4577 * ATTR_SW_WIRED must be cleared atomically. Although 4578 * the pmap lock synchronizes access to ATTR_SW_WIRED, 4579 * the System MMU may write to the entry concurrently. 4580 */ 4581 pmap_clear_bits(l3, ATTR_SW_WIRED); 4582 pmap->pm_stats.wired_count--; 4583 } 4584 } 4585 PMAP_UNLOCK(pmap); 4586 } 4587 4588 /* 4589 * Copy the range specified by src_addr/len 4590 * from the source map to the range dst_addr/len 4591 * in the destination map. 4592 * 4593 * This routine is only advisory and need not do anything. 4594 * 4595 * Because the executable mappings created by this routine are copied, 4596 * it should not have to flush the instruction cache. 
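 * (The usual caller is the fork path: vmspace_fork() copies each map
 * entry and may call pmap_copy() to prefault the child's page tables.
 * Being advisory, the routine may stop early, e.g., when a page table
 * page or a PV entry cannot be allocated.)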
4597 */ 4598 void 4599 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4600 vm_offset_t src_addr) 4601 { 4602 struct rwlock *lock; 4603 pd_entry_t *l0, *l1, *l2, srcptepaddr; 4604 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 4605 vm_offset_t addr, end_addr, va_next; 4606 vm_page_t dst_m, dstmpte, srcmpte; 4607 4608 PMAP_ASSERT_STAGE1(dst_pmap); 4609 PMAP_ASSERT_STAGE1(src_pmap); 4610 4611 if (dst_addr != src_addr) 4612 return; 4613 end_addr = src_addr + len; 4614 lock = NULL; 4615 if (dst_pmap < src_pmap) { 4616 PMAP_LOCK(dst_pmap); 4617 PMAP_LOCK(src_pmap); 4618 } else { 4619 PMAP_LOCK(src_pmap); 4620 PMAP_LOCK(dst_pmap); 4621 } 4622 for (addr = src_addr; addr < end_addr; addr = va_next) { 4623 l0 = pmap_l0(src_pmap, addr); 4624 if (pmap_load(l0) == 0) { 4625 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 4626 if (va_next < addr) 4627 va_next = end_addr; 4628 continue; 4629 } 4630 4631 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 4632 if (va_next < addr) 4633 va_next = end_addr; 4634 l1 = pmap_l0_to_l1(l0, addr); 4635 if (pmap_load(l1) == 0) 4636 continue; 4637 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4638 KASSERT(va_next <= end_addr, 4639 ("partial update of non-transparent 1G page " 4640 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 4641 pmap_load(l1), addr, end_addr, va_next)); 4642 srcptepaddr = pmap_load(l1); 4643 l1 = pmap_l1(dst_pmap, addr); 4644 if (l1 == NULL) { 4645 if (_pmap_alloc_l3(dst_pmap, 4646 pmap_l0_pindex(addr), NULL) == NULL) 4647 break; 4648 l1 = pmap_l1(dst_pmap, addr); 4649 } else { 4650 l0 = pmap_l0(dst_pmap, addr); 4651 dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) & 4652 ~ATTR_MASK); 4653 dst_m->ref_count++; 4654 } 4655 KASSERT(pmap_load(l1) == 0, 4656 ("1G mapping present in dst pmap " 4657 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 4658 pmap_load(l1), addr, end_addr, va_next)); 4659 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 4660 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 4661 continue; 4662 } 4663 4664 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 4665 if (va_next < addr) 4666 va_next = end_addr; 4667 l2 = pmap_l1_to_l2(l1, addr); 4668 srcptepaddr = pmap_load(l2); 4669 if (srcptepaddr == 0) 4670 continue; 4671 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4672 /* 4673 * We can only virtual copy whole superpages. 4674 */ 4675 if ((addr & L2_OFFSET) != 0 || 4676 addr + L2_SIZE > end_addr) 4677 continue; 4678 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 4679 if (l2 == NULL) 4680 break; 4681 if (pmap_load(l2) == 0 && 4682 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 4683 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 4684 PMAP_ENTER_NORECLAIM, &lock))) { 4685 /* 4686 * We leave the dirty bit unchanged because 4687 * managed read/write superpage mappings are 4688 * required to be dirty. However, managed 4689 * superpage mappings are not required to 4690 * have their accessed bit set, so we clear 4691 * it because we don't know if this mapping 4692 * will be used. 
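 * (A cleared ATTR_AF means the copy starts out "not referenced"; the
 * first access will set it again, via an access-flag fault or, on
 * hardware that manages the access flag, a hardware update.)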
4693 */ 4694 srcptepaddr &= ~ATTR_SW_WIRED; 4695 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 4696 srcptepaddr &= ~ATTR_AF; 4697 pmap_store(l2, srcptepaddr); 4698 pmap_resident_count_inc(dst_pmap, L2_SIZE / 4699 PAGE_SIZE); 4700 atomic_add_long(&pmap_l2_mappings, 1); 4701 } else 4702 pmap_abort_ptp(dst_pmap, addr, dst_m); 4703 continue; 4704 } 4705 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 4706 ("pmap_copy: invalid L2 entry")); 4707 srcptepaddr &= ~ATTR_MASK; 4708 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4709 KASSERT(srcmpte->ref_count > 0, 4710 ("pmap_copy: source page table page is unused")); 4711 if (va_next > end_addr) 4712 va_next = end_addr; 4713 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4714 src_pte = &src_pte[pmap_l3_index(addr)]; 4715 dstmpte = NULL; 4716 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 4717 ptetemp = pmap_load(src_pte); 4718 4719 /* 4720 * We only virtual copy managed pages. 4721 */ 4722 if ((ptetemp & ATTR_SW_MANAGED) == 0) 4723 continue; 4724 4725 if (dstmpte != NULL) { 4726 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 4727 ("dstmpte pindex/addr mismatch")); 4728 dstmpte->ref_count++; 4729 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 4730 NULL)) == NULL) 4731 goto out; 4732 dst_pte = (pt_entry_t *) 4733 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4734 dst_pte = &dst_pte[pmap_l3_index(addr)]; 4735 if (pmap_load(dst_pte) == 0 && 4736 pmap_try_insert_pv_entry(dst_pmap, addr, 4737 PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { 4738 /* 4739 * Clear the wired, modified, and accessed 4740 * (referenced) bits during the copy. 4741 */ 4742 mask = ATTR_AF | ATTR_SW_WIRED; 4743 nbits = 0; 4744 if ((ptetemp & ATTR_SW_DBM) != 0) 4745 nbits |= ATTR_S1_AP_RW_BIT; 4746 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 4747 pmap_resident_count_inc(dst_pmap, 1); 4748 } else { 4749 pmap_abort_ptp(dst_pmap, addr, dstmpte); 4750 goto out; 4751 } 4752 /* Have we copied all of the valid mappings? */ 4753 if (dstmpte->ref_count >= srcmpte->ref_count) 4754 break; 4755 } 4756 } 4757 out: 4758 /* 4759 * XXX This barrier may not be needed because the destination pmap is 4760 * not active. 4761 */ 4762 dsb(ishst); 4763 4764 if (lock != NULL) 4765 rw_wunlock(lock); 4766 PMAP_UNLOCK(src_pmap); 4767 PMAP_UNLOCK(dst_pmap); 4768 } 4769 4770 /* 4771 * pmap_zero_page zeros the specified hardware page via the direct 4772 * map, using pagezero() to clear its contents. 4773 */ 4774 void 4775 pmap_zero_page(vm_page_t m) 4776 { 4777 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4778 4779 pagezero((void *)va); 4780 } 4781 4782 /* 4783 * pmap_zero_page_area zeros the specified area of a hardware page 4784 * via the direct map. 4785 * 4786 * off and size may not cover an area beyond a single hardware page. 4787 */ 4788 void 4789 pmap_zero_page_area(vm_page_t m, int off, int size) 4790 { 4791 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4792 4793 if (off == 0 && size == PAGE_SIZE) 4794 pagezero((void *)va); 4795 else 4796 bzero((char *)va + off, size); 4797 } 4798 4799 /* 4800 * pmap_copy_page copies the specified (machine independent) page. 4801 * On arm64 both the source and the destination are addressed 4802 * through the direct map, so the copy needs no transient kernel 4803 * mapping.
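 * (Using the direct map avoids creating and tearing down transient
 * kernel mappings and the TLB maintenance that would entail.)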
4804 */ 4805 void 4806 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 4807 { 4808 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 4809 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 4810 4811 pagecopy((void *)src, (void *)dst); 4812 } 4813 4814 int unmapped_buf_allowed = 1; 4815 4816 void 4817 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4818 vm_offset_t b_offset, int xfersize) 4819 { 4820 void *a_cp, *b_cp; 4821 vm_page_t m_a, m_b; 4822 vm_paddr_t p_a, p_b; 4823 vm_offset_t a_pg_offset, b_pg_offset; 4824 int cnt; 4825 4826 while (xfersize > 0) { 4827 a_pg_offset = a_offset & PAGE_MASK; 4828 m_a = ma[a_offset >> PAGE_SHIFT]; 4829 p_a = m_a->phys_addr; 4830 b_pg_offset = b_offset & PAGE_MASK; 4831 m_b = mb[b_offset >> PAGE_SHIFT]; 4832 p_b = m_b->phys_addr; 4833 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4834 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4835 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 4836 panic("!DMAP a %lx", p_a); 4837 } else { 4838 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 4839 } 4840 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 4841 panic("!DMAP b %lx", p_b); 4842 } else { 4843 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 4844 } 4845 bcopy(a_cp, b_cp, cnt); 4846 a_offset += cnt; 4847 b_offset += cnt; 4848 xfersize -= cnt; 4849 } 4850 } 4851 4852 vm_offset_t 4853 pmap_quick_enter_page(vm_page_t m) 4854 { 4855 4856 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 4857 } 4858 4859 void 4860 pmap_quick_remove_page(vm_offset_t addr) 4861 { 4862 } 4863 4864 /* 4865 * Returns true if the pmap's pv is one of the first 4866 * 16 pvs linked to from this page. This count may 4867 * be changed upwards or downwards in the future; it 4868 * is only necessary that true be returned for a small 4869 * subset of pmaps for proper page aging. 4870 */ 4871 boolean_t 4872 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4873 { 4874 struct md_page *pvh; 4875 struct rwlock *lock; 4876 pv_entry_t pv; 4877 int loops = 0; 4878 boolean_t rv; 4879 4880 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4881 ("pmap_page_exists_quick: page %p is not managed", m)); 4882 rv = FALSE; 4883 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4884 rw_rlock(lock); 4885 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4886 if (PV_PMAP(pv) == pmap) { 4887 rv = TRUE; 4888 break; 4889 } 4890 loops++; 4891 if (loops >= 16) 4892 break; 4893 } 4894 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4895 pvh = page_to_pvh(m); 4896 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4897 if (PV_PMAP(pv) == pmap) { 4898 rv = TRUE; 4899 break; 4900 } 4901 loops++; 4902 if (loops >= 16) 4903 break; 4904 } 4905 } 4906 rw_runlock(lock); 4907 return (rv); 4908 } 4909 4910 /* 4911 * pmap_page_wired_mappings: 4912 * 4913 * Return the number of managed mappings to the given physical page 4914 * that are wired. 
4915 */ 4916 int 4917 pmap_page_wired_mappings(vm_page_t m) 4918 { 4919 struct rwlock *lock; 4920 struct md_page *pvh; 4921 pmap_t pmap; 4922 pt_entry_t *pte; 4923 pv_entry_t pv; 4924 int count, md_gen, pvh_gen; 4925 4926 if ((m->oflags & VPO_UNMANAGED) != 0) 4927 return (0); 4928 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4929 rw_rlock(lock); 4930 restart: 4931 count = 0; 4932 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4933 pmap = PV_PMAP(pv); 4934 if (!PMAP_TRYLOCK(pmap)) { 4935 md_gen = m->md.pv_gen; 4936 rw_runlock(lock); 4937 PMAP_LOCK(pmap); 4938 rw_rlock(lock); 4939 if (md_gen != m->md.pv_gen) { 4940 PMAP_UNLOCK(pmap); 4941 goto restart; 4942 } 4943 } 4944 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 4945 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 4946 count++; 4947 PMAP_UNLOCK(pmap); 4948 } 4949 if ((m->flags & PG_FICTITIOUS) == 0) { 4950 pvh = page_to_pvh(m); 4951 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4952 pmap = PV_PMAP(pv); 4953 if (!PMAP_TRYLOCK(pmap)) { 4954 md_gen = m->md.pv_gen; 4955 pvh_gen = pvh->pv_gen; 4956 rw_runlock(lock); 4957 PMAP_LOCK(pmap); 4958 rw_rlock(lock); 4959 if (md_gen != m->md.pv_gen || 4960 pvh_gen != pvh->pv_gen) { 4961 PMAP_UNLOCK(pmap); 4962 goto restart; 4963 } 4964 } 4965 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 4966 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 4967 count++; 4968 PMAP_UNLOCK(pmap); 4969 } 4970 } 4971 rw_runlock(lock); 4972 return (count); 4973 } 4974 4975 /* 4976 * Returns true if the given page is mapped individually or as part of 4977 * a 2mpage. Otherwise, returns false. 4978 */ 4979 bool 4980 pmap_page_is_mapped(vm_page_t m) 4981 { 4982 struct rwlock *lock; 4983 bool rv; 4984 4985 if ((m->oflags & VPO_UNMANAGED) != 0) 4986 return (false); 4987 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4988 rw_rlock(lock); 4989 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4990 ((m->flags & PG_FICTITIOUS) == 0 && 4991 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 4992 rw_runlock(lock); 4993 return (rv); 4994 } 4995 4996 /* 4997 * Destroy all managed, non-wired mappings in the given user-space 4998 * pmap. This pmap cannot be active on any processor besides the 4999 * caller. 5000 * 5001 * This function cannot be applied to the kernel pmap. Moreover, it 5002 * is not intended for general use. It is only to be used during 5003 * process termination. Consequently, it can be implemented in ways 5004 * that make it faster than pmap_remove(). First, it can more quickly 5005 * destroy mappings by iterating over the pmap's collection of PV 5006 * entries, rather than searching the page table. Second, it doesn't 5007 * have to test and clear the page table entries atomically, because 5008 * no processor is currently accessing the user address space. In 5009 * particular, a page table entry's dirty bit won't change state once 5010 * this function starts. 
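 * (Concretely, the loop below walks the pmap's PV chunks rather than
 * the page tables and uses plain, non-atomic PTE loads and stores,
 * which is safe only because of the conditions above.)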
5011 */ 5012 void 5013 pmap_remove_pages(pmap_t pmap) 5014 { 5015 pd_entry_t *pde; 5016 pt_entry_t *pte, tpte; 5017 struct spglist free; 5018 vm_page_t m, ml3, mt; 5019 pv_entry_t pv; 5020 struct md_page *pvh; 5021 struct pv_chunk *pc, *npc; 5022 struct rwlock *lock; 5023 int64_t bit; 5024 uint64_t inuse, bitmask; 5025 int allfree, field, freed, idx, lvl; 5026 vm_paddr_t pa; 5027 5028 lock = NULL; 5029 5030 SLIST_INIT(&free); 5031 PMAP_LOCK(pmap); 5032 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5033 allfree = 1; 5034 freed = 0; 5035 for (field = 0; field < _NPCM; field++) { 5036 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5037 while (inuse != 0) { 5038 bit = ffsl(inuse) - 1; 5039 bitmask = 1UL << bit; 5040 idx = field * 64 + bit; 5041 pv = &pc->pc_pventry[idx]; 5042 inuse &= ~bitmask; 5043 5044 pde = pmap_pde(pmap, pv->pv_va, &lvl); 5045 KASSERT(pde != NULL, 5046 ("Attempting to remove an unmapped page")); 5047 5048 switch(lvl) { 5049 case 1: 5050 pte = pmap_l1_to_l2(pde, pv->pv_va); 5051 tpte = pmap_load(pte); 5052 KASSERT((tpte & ATTR_DESCR_MASK) == 5053 L2_BLOCK, 5054 ("Attempting to remove an invalid " 5055 "block: %lx", tpte)); 5056 break; 5057 case 2: 5058 pte = pmap_l2_to_l3(pde, pv->pv_va); 5059 tpte = pmap_load(pte); 5060 KASSERT((tpte & ATTR_DESCR_MASK) == 5061 L3_PAGE, 5062 ("Attempting to remove an invalid " 5063 "page: %lx", tpte)); 5064 break; 5065 default: 5066 panic( 5067 "Invalid page directory level: %d", 5068 lvl); 5069 } 5070 5071 /* 5072 * We cannot remove wired pages from a process' mapping at this time 5073 */ 5074 if (tpte & ATTR_SW_WIRED) { 5075 allfree = 0; 5076 continue; 5077 } 5078 5079 /* Mark free */ 5080 pc->pc_map[field] |= bitmask; 5081 5082 /* 5083 * Because this pmap is not active on other 5084 * processors, the dirty bit cannot have 5085 * changed state since we last loaded pte. 5086 */ 5087 pmap_clear(pte); 5088 5089 pa = tpte & ~ATTR_MASK; 5090 5091 m = PHYS_TO_VM_PAGE(pa); 5092 KASSERT(m->phys_addr == pa, 5093 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5094 m, (uintmax_t)m->phys_addr, 5095 (uintmax_t)tpte)); 5096 5097 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5098 m < &vm_page_array[vm_page_array_size], 5099 ("pmap_remove_pages: bad pte %#jx", 5100 (uintmax_t)tpte)); 5101 5102 /* 5103 * Update the vm_page_t clean/reference bits. 
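 * (For an L2 block, every 4KB page within the 2MB range is dirtied;
 * for an L3 page, only the single page is.)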
5104 */ 5105 if (pmap_pte_dirty(pmap, tpte)) { 5106 switch (lvl) { 5107 case 1: 5108 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5109 vm_page_dirty(mt); 5110 break; 5111 case 2: 5112 vm_page_dirty(m); 5113 break; 5114 } 5115 } 5116 5117 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5118 5119 switch (lvl) { 5120 case 1: 5121 pmap_resident_count_dec(pmap, 5122 L2_SIZE / PAGE_SIZE); 5123 pvh = page_to_pvh(m); 5124 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 5125 pvh->pv_gen++; 5126 if (TAILQ_EMPTY(&pvh->pv_list)) { 5127 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5128 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5129 TAILQ_EMPTY(&mt->md.pv_list)) 5130 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5131 } 5132 ml3 = pmap_remove_pt_page(pmap, 5133 pv->pv_va); 5134 if (ml3 != NULL) { 5135 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 5136 ("pmap_remove_pages: l3 page not promoted")); 5137 pmap_resident_count_dec(pmap,1); 5138 KASSERT(ml3->ref_count == NL3PG, 5139 ("pmap_remove_pages: l3 page ref count error")); 5140 ml3->ref_count = 0; 5141 pmap_add_delayed_free_list(ml3, 5142 &free, FALSE); 5143 } 5144 break; 5145 case 2: 5146 pmap_resident_count_dec(pmap, 1); 5147 TAILQ_REMOVE(&m->md.pv_list, pv, 5148 pv_next); 5149 m->md.pv_gen++; 5150 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5151 TAILQ_EMPTY(&m->md.pv_list) && 5152 (m->flags & PG_FICTITIOUS) == 0) { 5153 pvh = page_to_pvh(m); 5154 if (TAILQ_EMPTY(&pvh->pv_list)) 5155 vm_page_aflag_clear(m, 5156 PGA_WRITEABLE); 5157 } 5158 break; 5159 } 5160 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 5161 &free); 5162 freed++; 5163 } 5164 } 5165 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5166 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5167 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5168 if (allfree) { 5169 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5170 free_pv_chunk(pc); 5171 } 5172 } 5173 if (lock != NULL) 5174 rw_wunlock(lock); 5175 pmap_invalidate_all(pmap); 5176 PMAP_UNLOCK(pmap); 5177 vm_page_free_pages_toq(&free, true); 5178 } 5179 5180 /* 5181 * This is used to check if a page has been accessed or modified. 
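 * (pmap_is_modified() and pmap_is_referenced() below are thin
 * wrappers around this routine, selecting the "modified" and
 * "accessed" tests respectively.)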
5182 */ 5183 static boolean_t 5184 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5185 { 5186 struct rwlock *lock; 5187 pv_entry_t pv; 5188 struct md_page *pvh; 5189 pt_entry_t *pte, mask, value; 5190 pmap_t pmap; 5191 int md_gen, pvh_gen; 5192 boolean_t rv; 5193 5194 rv = FALSE; 5195 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5196 rw_rlock(lock); 5197 restart: 5198 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5199 pmap = PV_PMAP(pv); 5200 PMAP_ASSERT_STAGE1(pmap); 5201 if (!PMAP_TRYLOCK(pmap)) { 5202 md_gen = m->md.pv_gen; 5203 rw_runlock(lock); 5204 PMAP_LOCK(pmap); 5205 rw_rlock(lock); 5206 if (md_gen != m->md.pv_gen) { 5207 PMAP_UNLOCK(pmap); 5208 goto restart; 5209 } 5210 } 5211 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5212 mask = 0; 5213 value = 0; 5214 if (modified) { 5215 mask |= ATTR_S1_AP_RW_BIT; 5216 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5217 } 5218 if (accessed) { 5219 mask |= ATTR_AF | ATTR_DESCR_MASK; 5220 value |= ATTR_AF | L3_PAGE; 5221 } 5222 rv = (pmap_load(pte) & mask) == value; 5223 PMAP_UNLOCK(pmap); 5224 if (rv) 5225 goto out; 5226 } 5227 if ((m->flags & PG_FICTITIOUS) == 0) { 5228 pvh = page_to_pvh(m); 5229 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5230 pmap = PV_PMAP(pv); 5231 PMAP_ASSERT_STAGE1(pmap); 5232 if (!PMAP_TRYLOCK(pmap)) { 5233 md_gen = m->md.pv_gen; 5234 pvh_gen = pvh->pv_gen; 5235 rw_runlock(lock); 5236 PMAP_LOCK(pmap); 5237 rw_rlock(lock); 5238 if (md_gen != m->md.pv_gen || 5239 pvh_gen != pvh->pv_gen) { 5240 PMAP_UNLOCK(pmap); 5241 goto restart; 5242 } 5243 } 5244 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 5245 mask = 0; 5246 value = 0; 5247 if (modified) { 5248 mask |= ATTR_S1_AP_RW_BIT; 5249 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5250 } 5251 if (accessed) { 5252 mask |= ATTR_AF | ATTR_DESCR_MASK; 5253 value |= ATTR_AF | L2_BLOCK; 5254 } 5255 rv = (pmap_load(pte) & mask) == value; 5256 PMAP_UNLOCK(pmap); 5257 if (rv) 5258 goto out; 5259 } 5260 } 5261 out: 5262 rw_runlock(lock); 5263 return (rv); 5264 } 5265 5266 /* 5267 * pmap_is_modified: 5268 * 5269 * Return whether or not the specified physical page was modified 5270 * in any physical maps. 5271 */ 5272 boolean_t 5273 pmap_is_modified(vm_page_t m) 5274 { 5275 5276 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5277 ("pmap_is_modified: page %p is not managed", m)); 5278 5279 /* 5280 * If the page is not busied then this check is racy. 5281 */ 5282 if (!pmap_page_is_write_mapped(m)) 5283 return (FALSE); 5284 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5285 } 5286 5287 /* 5288 * pmap_is_prefaultable: 5289 * 5290 * Return whether or not the specified virtual address is eligible 5291 * for prefault. 5292 */ 5293 boolean_t 5294 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5295 { 5296 pd_entry_t *pde; 5297 pt_entry_t *pte; 5298 boolean_t rv; 5299 int lvl; 5300 5301 /* 5302 * Return TRUE if and only if the L3 entry for the specified virtual 5303 * address is allocated but invalid. 5304 */ 5305 rv = FALSE; 5306 PMAP_LOCK(pmap); 5307 pde = pmap_pde(pmap, addr, &lvl); 5308 if (pde != NULL && lvl == 2) { 5309 pte = pmap_l2_to_l3(pde, addr); 5310 rv = pmap_load(pte) == 0; 5311 } 5312 PMAP_UNLOCK(pmap); 5313 return (rv); 5314 } 5315 5316 /* 5317 * pmap_is_referenced: 5318 * 5319 * Return whether or not the specified physical page was referenced 5320 * in any physical maps. 
5321 */ 5322 boolean_t 5323 pmap_is_referenced(vm_page_t m) 5324 { 5325 5326 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5327 ("pmap_is_referenced: page %p is not managed", m)); 5328 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5329 } 5330 5331 /* 5332 * Clear the write and modified bits in each of the given page's mappings. 5333 */ 5334 void 5335 pmap_remove_write(vm_page_t m) 5336 { 5337 struct md_page *pvh; 5338 pmap_t pmap; 5339 struct rwlock *lock; 5340 pv_entry_t next_pv, pv; 5341 pt_entry_t oldpte, *pte; 5342 vm_offset_t va; 5343 int md_gen, pvh_gen; 5344 5345 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5346 ("pmap_remove_write: page %p is not managed", m)); 5347 vm_page_assert_busied(m); 5348 5349 if (!pmap_page_is_write_mapped(m)) 5350 return; 5351 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5352 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5353 rw_wlock(lock); 5354 retry: 5355 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5356 pmap = PV_PMAP(pv); 5357 PMAP_ASSERT_STAGE1(pmap); 5358 if (!PMAP_TRYLOCK(pmap)) { 5359 pvh_gen = pvh->pv_gen; 5360 rw_wunlock(lock); 5361 PMAP_LOCK(pmap); 5362 rw_wlock(lock); 5363 if (pvh_gen != pvh->pv_gen) { 5364 PMAP_UNLOCK(pmap); 5365 goto retry; 5366 } 5367 } 5368 va = pv->pv_va; 5369 pte = pmap_pte_exists(pmap, va, 2, __func__); 5370 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 5371 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 5372 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5373 ("inconsistent pv lock %p %p for page %p", 5374 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5375 PMAP_UNLOCK(pmap); 5376 } 5377 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5378 pmap = PV_PMAP(pv); 5379 PMAP_ASSERT_STAGE1(pmap); 5380 if (!PMAP_TRYLOCK(pmap)) { 5381 pvh_gen = pvh->pv_gen; 5382 md_gen = m->md.pv_gen; 5383 rw_wunlock(lock); 5384 PMAP_LOCK(pmap); 5385 rw_wlock(lock); 5386 if (pvh_gen != pvh->pv_gen || 5387 md_gen != m->md.pv_gen) { 5388 PMAP_UNLOCK(pmap); 5389 goto retry; 5390 } 5391 } 5392 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5393 oldpte = pmap_load(pte); 5394 if ((oldpte & ATTR_SW_DBM) != 0) { 5395 while (!atomic_fcmpset_64(pte, &oldpte, 5396 (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) 5397 cpu_spinwait(); 5398 if ((oldpte & ATTR_S1_AP_RW_BIT) == 5399 ATTR_S1_AP(ATTR_S1_AP_RW)) 5400 vm_page_dirty(m); 5401 pmap_invalidate_page(pmap, pv->pv_va, true); 5402 } 5403 PMAP_UNLOCK(pmap); 5404 } 5405 rw_wunlock(lock); 5406 vm_page_aflag_clear(m, PGA_WRITEABLE); 5407 } 5408 5409 /* 5410 * pmap_ts_referenced: 5411 * 5412 * Return a count of reference bits for a page, clearing those bits. 5413 * It is not necessary for every reference bit to be cleared, but it 5414 * is necessary that 0 only be returned when there are truly no 5415 * reference bits set. 5416 * 5417 * As an optimization, update the page's dirty field if a modified bit is 5418 * found while counting reference bits. This opportunistic update can be 5419 * performed at low cost and can eliminate the need for some future calls 5420 * to pmap_is_modified(). However, since this function stops after 5421 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5422 * dirty pages. Those dirty pages will only be detected by a future call 5423 * to pmap_is_modified(). 
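 * (PMAP_TS_REFERENCED_MAX bounds the work done per call; the PV
 * lists are rotated below so that successive calls start with
 * different mappings.)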
5424 */ 5425 int 5426 pmap_ts_referenced(vm_page_t m) 5427 { 5428 struct md_page *pvh; 5429 pv_entry_t pv, pvf; 5430 pmap_t pmap; 5431 struct rwlock *lock; 5432 pt_entry_t *pte, tpte; 5433 vm_offset_t va; 5434 vm_paddr_t pa; 5435 int cleared, md_gen, not_cleared, pvh_gen; 5436 struct spglist free; 5437 5438 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5439 ("pmap_ts_referenced: page %p is not managed", m)); 5440 SLIST_INIT(&free); 5441 cleared = 0; 5442 pa = VM_PAGE_TO_PHYS(m); 5443 lock = PHYS_TO_PV_LIST_LOCK(pa); 5444 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5445 rw_wlock(lock); 5446 retry: 5447 not_cleared = 0; 5448 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5449 goto small_mappings; 5450 pv = pvf; 5451 do { 5452 if (pvf == NULL) 5453 pvf = pv; 5454 pmap = PV_PMAP(pv); 5455 if (!PMAP_TRYLOCK(pmap)) { 5456 pvh_gen = pvh->pv_gen; 5457 rw_wunlock(lock); 5458 PMAP_LOCK(pmap); 5459 rw_wlock(lock); 5460 if (pvh_gen != pvh->pv_gen) { 5461 PMAP_UNLOCK(pmap); 5462 goto retry; 5463 } 5464 } 5465 va = pv->pv_va; 5466 pte = pmap_pte_exists(pmap, va, 2, __func__); 5467 tpte = pmap_load(pte); 5468 if (pmap_pte_dirty(pmap, tpte)) { 5469 /* 5470 * Although "tpte" is mapping a 2MB page, because 5471 * this function is called at a 4KB page granularity, 5472 * we only update the 4KB page under test. 5473 */ 5474 vm_page_dirty(m); 5475 } 5476 if ((tpte & ATTR_AF) != 0) { 5477 /* 5478 * Since this reference bit is shared by 512 4KB pages, 5479 * it should not be cleared every time it is tested. 5480 * Apply a simple "hash" function on the physical page 5481 * number, the virtual superpage number, and the pmap 5482 * address to select one 4KB page out of the 512 on 5483 * which testing the reference bit will result in 5484 * clearing that reference bit. This function is 5485 * designed to avoid the selection of the same 4KB page 5486 * for every 2MB page mapping. 5487 * 5488 * On demotion, a mapping that hasn't been referenced 5489 * is simply destroyed. To avoid the possibility of a 5490 * subsequent page fault on a demoted wired mapping, 5491 * always leave its reference bit set. Moreover, 5492 * since the superpage is wired, the current state of 5493 * its reference bit won't affect page replacement. 5494 */ 5495 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ 5496 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 5497 (tpte & ATTR_SW_WIRED) == 0) { 5498 pmap_clear_bits(pte, ATTR_AF); 5499 pmap_invalidate_page(pmap, va, true); 5500 cleared++; 5501 } else 5502 not_cleared++; 5503 } 5504 PMAP_UNLOCK(pmap); 5505 /* Rotate the PV list if it has more than one entry. 
*/ 5506 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5507 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5508 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5509 pvh->pv_gen++; 5510 } 5511 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5512 goto out; 5513 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5514 small_mappings: 5515 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5516 goto out; 5517 pv = pvf; 5518 do { 5519 if (pvf == NULL) 5520 pvf = pv; 5521 pmap = PV_PMAP(pv); 5522 if (!PMAP_TRYLOCK(pmap)) { 5523 pvh_gen = pvh->pv_gen; 5524 md_gen = m->md.pv_gen; 5525 rw_wunlock(lock); 5526 PMAP_LOCK(pmap); 5527 rw_wlock(lock); 5528 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5529 PMAP_UNLOCK(pmap); 5530 goto retry; 5531 } 5532 } 5533 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 5534 tpte = pmap_load(pte); 5535 if (pmap_pte_dirty(pmap, tpte)) 5536 vm_page_dirty(m); 5537 if ((tpte & ATTR_AF) != 0) { 5538 if ((tpte & ATTR_SW_WIRED) == 0) { 5539 pmap_clear_bits(pte, ATTR_AF); 5540 pmap_invalidate_page(pmap, pv->pv_va, true); 5541 cleared++; 5542 } else 5543 not_cleared++; 5544 } 5545 PMAP_UNLOCK(pmap); 5546 /* Rotate the PV list if it has more than one entry. */ 5547 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5548 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5549 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5550 m->md.pv_gen++; 5551 } 5552 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5553 not_cleared < PMAP_TS_REFERENCED_MAX); 5554 out: 5555 rw_wunlock(lock); 5556 vm_page_free_pages_toq(&free, true); 5557 return (cleared + not_cleared); 5558 } 5559 5560 /* 5561 * Apply the given advice to the specified range of addresses within the 5562 * given pmap. Depending on the advice, clear the referenced and/or 5563 * modified flags in each mapping and set the mapped page's dirty field. 5564 */ 5565 void 5566 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5567 { 5568 struct rwlock *lock; 5569 vm_offset_t va, va_next; 5570 vm_page_t m; 5571 pd_entry_t *l0, *l1, *l2, oldl2; 5572 pt_entry_t *l3, oldl3; 5573 5574 PMAP_ASSERT_STAGE1(pmap); 5575 5576 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5577 return; 5578 5579 PMAP_LOCK(pmap); 5580 for (; sva < eva; sva = va_next) { 5581 l0 = pmap_l0(pmap, sva); 5582 if (pmap_load(l0) == 0) { 5583 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 5584 if (va_next < sva) 5585 va_next = eva; 5586 continue; 5587 } 5588 5589 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 5590 if (va_next < sva) 5591 va_next = eva; 5592 l1 = pmap_l0_to_l1(l0, sva); 5593 if (pmap_load(l1) == 0) 5594 continue; 5595 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 5596 KASSERT(va_next <= eva, 5597 ("partial update of non-transparent 1G page " 5598 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 5599 pmap_load(l1), sva, eva, va_next)); 5600 continue; 5601 } 5602 5603 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 5604 if (va_next < sva) 5605 va_next = eva; 5606 l2 = pmap_l1_to_l2(l1, sva); 5607 oldl2 = pmap_load(l2); 5608 if (oldl2 == 0) 5609 continue; 5610 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5611 if ((oldl2 & ATTR_SW_MANAGED) == 0) 5612 continue; 5613 lock = NULL; 5614 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 5615 if (lock != NULL) 5616 rw_wunlock(lock); 5617 5618 /* 5619 * The 2MB page mapping was destroyed. 5620 */ 5621 continue; 5622 } 5623 5624 /* 5625 * Unless the page mappings are wired, remove the 5626 * mapping to a single page so that a subsequent 5627 * access may repromote. 
Choosing the last page 5628 * within the address range [sva, min(va_next, eva)) 5629 * generally results in more repromotions. Since the 5630 * underlying page table page is fully populated, this 5631 * removal never frees a page table page. 5632 */ 5633 if ((oldl2 & ATTR_SW_WIRED) == 0) { 5634 va = eva; 5635 if (va > va_next) 5636 va = va_next; 5637 va -= PAGE_SIZE; 5638 KASSERT(va >= sva, 5639 ("pmap_advise: no address gap")); 5640 l3 = pmap_l2_to_l3(l2, va); 5641 KASSERT(pmap_load(l3) != 0, 5642 ("pmap_advise: invalid PTE")); 5643 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 5644 NULL, &lock); 5645 } 5646 if (lock != NULL) 5647 rw_wunlock(lock); 5648 } 5649 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 5650 ("pmap_advise: invalid L2 entry after demotion")); 5651 if (va_next > eva) 5652 va_next = eva; 5653 va = va_next; 5654 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 5655 sva += L3_SIZE) { 5656 oldl3 = pmap_load(l3); 5657 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 5658 (ATTR_SW_MANAGED | L3_PAGE)) 5659 goto maybe_invlrng; 5660 else if (pmap_pte_dirty(pmap, oldl3)) { 5661 if (advice == MADV_DONTNEED) { 5662 /* 5663 * Future calls to pmap_is_modified() 5664 * can be avoided by making the page 5665 * dirty now. 5666 */ 5667 m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); 5668 vm_page_dirty(m); 5669 } 5670 while (!atomic_fcmpset_long(l3, &oldl3, 5671 (oldl3 & ~ATTR_AF) | 5672 ATTR_S1_AP(ATTR_S1_AP_RO))) 5673 cpu_spinwait(); 5674 } else if ((oldl3 & ATTR_AF) != 0) 5675 pmap_clear_bits(l3, ATTR_AF); 5676 else 5677 goto maybe_invlrng; 5678 if (va == va_next) 5679 va = sva; 5680 continue; 5681 maybe_invlrng: 5682 if (va != va_next) { 5683 pmap_invalidate_range(pmap, va, sva, true); 5684 va = va_next; 5685 } 5686 } 5687 if (va != va_next) 5688 pmap_invalidate_range(pmap, va, sva, true); 5689 } 5690 PMAP_UNLOCK(pmap); 5691 } 5692 5693 /* 5694 * Clear the modify bits on the specified physical page. 5695 */ 5696 void 5697 pmap_clear_modify(vm_page_t m) 5698 { 5699 struct md_page *pvh; 5700 struct rwlock *lock; 5701 pmap_t pmap; 5702 pv_entry_t next_pv, pv; 5703 pd_entry_t *l2, oldl2; 5704 pt_entry_t *l3, oldl3; 5705 vm_offset_t va; 5706 int md_gen, pvh_gen; 5707 5708 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5709 ("pmap_clear_modify: page %p is not managed", m)); 5710 vm_page_assert_busied(m); 5711 5712 if (!pmap_page_is_write_mapped(m)) 5713 return; 5714 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5715 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5716 rw_wlock(lock); 5717 restart: 5718 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5719 pmap = PV_PMAP(pv); 5720 PMAP_ASSERT_STAGE1(pmap); 5721 if (!PMAP_TRYLOCK(pmap)) { 5722 pvh_gen = pvh->pv_gen; 5723 rw_wunlock(lock); 5724 PMAP_LOCK(pmap); 5725 rw_wlock(lock); 5726 if (pvh_gen != pvh->pv_gen) { 5727 PMAP_UNLOCK(pmap); 5728 goto restart; 5729 } 5730 } 5731 va = pv->pv_va; 5732 l2 = pmap_l2(pmap, va); 5733 oldl2 = pmap_load(l2); 5734 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 5735 if ((oldl2 & ATTR_SW_DBM) != 0 && 5736 pmap_demote_l2_locked(pmap, l2, va, &lock) && 5737 (oldl2 & ATTR_SW_WIRED) == 0) { 5738 /* 5739 * Write protect the mapping to a single page so that 5740 * a subsequent write access may repromote. 
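 * (The page chosen is the one backing "m" itself: the va computation
 * below adds m's offset within the 2MB block to the block's base
 * virtual address.)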
5741 */ 5742 va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); 5743 l3 = pmap_l2_to_l3(l2, va); 5744 oldl3 = pmap_load(l3); 5745 while (!atomic_fcmpset_long(l3, &oldl3, 5746 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 5747 cpu_spinwait(); 5748 vm_page_dirty(m); 5749 pmap_invalidate_page(pmap, va, true); 5750 } 5751 PMAP_UNLOCK(pmap); 5752 } 5753 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5754 pmap = PV_PMAP(pv); 5755 PMAP_ASSERT_STAGE1(pmap); 5756 if (!PMAP_TRYLOCK(pmap)) { 5757 md_gen = m->md.pv_gen; 5758 pvh_gen = pvh->pv_gen; 5759 rw_wunlock(lock); 5760 PMAP_LOCK(pmap); 5761 rw_wlock(lock); 5762 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5763 PMAP_UNLOCK(pmap); 5764 goto restart; 5765 } 5766 } 5767 l2 = pmap_l2(pmap, pv->pv_va); 5768 l3 = pmap_l2_to_l3(l2, pv->pv_va); 5769 oldl3 = pmap_load(l3); 5770 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ 5771 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 5772 pmap_invalidate_page(pmap, pv->pv_va, true); 5773 } 5774 PMAP_UNLOCK(pmap); 5775 } 5776 rw_wunlock(lock); 5777 } 5778 5779 void * 5780 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5781 { 5782 struct pmap_preinit_mapping *ppim; 5783 vm_offset_t va, offset; 5784 pd_entry_t *pde; 5785 pt_entry_t *l2; 5786 int i, lvl, l2_blocks, free_l2_count, start_idx; 5787 5788 if (!vm_initialized) { 5789 /* 5790 * No L3 ptables so map entire L2 blocks where start VA is: 5791 * preinit_map_va + start_idx * L2_SIZE 5792 * There may be duplicate mappings (multiple VA -> same PA) but 5793 * ARM64 dcache is always PIPT so that's acceptable. 5794 */ 5795 if (size == 0) 5796 return (NULL); 5797 5798 /* Calculate how many L2 blocks are needed for the mapping */ 5799 l2_blocks = (roundup2(pa + size, L2_SIZE) - 5800 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 5801 5802 offset = pa & L2_OFFSET; 5803 5804 if (preinit_map_va == 0) 5805 return (NULL); 5806 5807 /* Map 2MiB L2 blocks from reserved VA space */ 5808 5809 free_l2_count = 0; 5810 start_idx = -1; 5811 /* Find enough free contiguous VA space */ 5812 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5813 ppim = pmap_preinit_mapping + i; 5814 if (free_l2_count > 0 && ppim->pa != 0) { 5815 /* Not enough space here */ 5816 free_l2_count = 0; 5817 start_idx = -1; 5818 continue; 5819 } 5820 5821 if (ppim->pa == 0) { 5822 /* Free L2 block */ 5823 if (start_idx == -1) 5824 start_idx = i; 5825 free_l2_count++; 5826 if (free_l2_count == l2_blocks) 5827 break; 5828 } 5829 } 5830 if (free_l2_count != l2_blocks) 5831 panic("%s: too many preinit mappings", __func__); 5832 5833 va = preinit_map_va + (start_idx * L2_SIZE); 5834 for (i = start_idx; i < start_idx + l2_blocks; i++) { 5835 /* Mark entries as allocated */ 5836 ppim = pmap_preinit_mapping + i; 5837 ppim->pa = pa; 5838 ppim->va = va + offset; 5839 ppim->size = size; 5840 } 5841 5842 /* Map L2 blocks */ 5843 pa = rounddown2(pa, L2_SIZE); 5844 for (i = 0; i < l2_blocks; i++) { 5845 pde = pmap_pde(kernel_pmap, va, &lvl); 5846 KASSERT(pde != NULL, 5847 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 5848 va)); 5849 KASSERT(lvl == 1, 5850 ("pmap_mapbios: Invalid level %d", lvl)); 5851 5852 /* Insert L2_BLOCK */ 5853 l2 = pmap_l1_to_l2(pde, va); 5854 pmap_load_store(l2, 5855 pa | ATTR_DEFAULT | ATTR_S1_XN | 5856 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 5857 5858 va += L2_SIZE; 5859 pa += L2_SIZE; 5860 } 5861 pmap_invalidate_all(kernel_pmap); 5862 5863 va = preinit_map_va + (start_idx * L2_SIZE); 5864 5865 } else { 5866 /* kva_alloc may be used to map the pages */ 5867 offset = pa 
& PAGE_MASK; 5868 size = round_page(offset + size); 5869 5870 va = kva_alloc(size); 5871 if (va == 0) 5872 panic("%s: Couldn't allocate KVA", __func__); 5873 5874 pde = pmap_pde(kernel_pmap, va, &lvl); 5875 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 5876 5877 /* L3 table is linked */ 5878 va = trunc_page(va); 5879 pa = trunc_page(pa); 5880 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 5881 } 5882 5883 return ((void *)(va + offset)); 5884 } 5885 5886 void 5887 pmap_unmapbios(vm_offset_t va, vm_size_t size) 5888 { 5889 struct pmap_preinit_mapping *ppim; 5890 vm_offset_t offset, tmpsize, va_trunc; 5891 pd_entry_t *pde; 5892 pt_entry_t *l2; 5893 int i, lvl, l2_blocks, block; 5894 bool preinit_map; 5895 5896 l2_blocks = 5897 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 5898 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 5899 5900 /* Remove preinit mapping */ 5901 preinit_map = false; 5902 block = 0; 5903 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5904 ppim = pmap_preinit_mapping + i; 5905 if (ppim->va == va) { 5906 KASSERT(ppim->size == size, 5907 ("pmap_unmapbios: size mismatch")); 5908 ppim->va = 0; 5909 ppim->pa = 0; 5910 ppim->size = 0; 5911 preinit_map = true; 5912 offset = block * L2_SIZE; 5913 va_trunc = rounddown2(va, L2_SIZE) + offset; 5914 5915 /* Remove L2_BLOCK */ 5916 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 5917 KASSERT(pde != NULL, 5918 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 5919 va_trunc)); 5920 l2 = pmap_l1_to_l2(pde, va_trunc); 5921 pmap_clear(l2); 5922 5923 if (block == (l2_blocks - 1)) 5924 break; 5925 block++; 5926 } 5927 } 5928 if (preinit_map) { 5929 pmap_invalidate_all(kernel_pmap); 5930 return; 5931 } 5932 5933 /* Unmap the pages reserved with kva_alloc. */ 5934 if (vm_initialized) { 5935 offset = va & PAGE_MASK; 5936 size = round_page(offset + size); 5937 va = trunc_page(va); 5938 5939 pde = pmap_pde(kernel_pmap, va, &lvl); 5940 KASSERT(pde != NULL, 5941 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); 5942 KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); 5943 5944 /* Unmap and invalidate the pages */ 5945 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5946 pmap_kremove(va + tmpsize); 5947 5948 kva_free(va, size); 5949 } 5950 } 5951 5952 /* 5953 * Sets the memory attribute for the specified page. 5954 */ 5955 void 5956 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5957 { 5958 5959 m->md.pv_memattr = ma; 5960 5961 /* 5962 * If "m" is a normal page, update its direct mapping. This update 5963 * can be relied upon to perform any cache operations that are 5964 * required for data coherence. 5965 */ 5966 if ((m->flags & PG_FICTITIOUS) == 0 && 5967 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 5968 m->md.pv_memattr) != 0) 5969 panic("memory attribute change on the direct map failed"); 5970 } 5971 5972 /* 5973 * Changes the specified virtual address range's memory type to that given by 5974 * the parameter "mode". The specified virtual address range must be 5975 * completely contained within either the direct map or the kernel map. If 5976 * the virtual address range is contained within the kernel map, then the 5977 * memory type for each of the corresponding ranges of the direct map is also 5978 * changed. (The corresponding ranges of the direct map are those ranges that 5979 * map the same physical pages as the specified virtual address range.) 
These 5980 * changes to the direct map are necessary because Intel describes the 5981 * behavior of their processors as "undefined" if two or more mappings to the 5982 * same physical page have different memory types. 5983 * 5984 * Returns zero if the change completed successfully, and either EINVAL or 5985 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5986 * of the virtual address range was not mapped, and ENOMEM is returned if 5987 * there was insufficient memory available to complete the change. In the 5988 * latter case, the memory type may have been changed on some part of the 5989 * virtual address range or the direct map. 5990 */ 5991 int 5992 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5993 { 5994 int error; 5995 5996 PMAP_LOCK(kernel_pmap); 5997 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false); 5998 PMAP_UNLOCK(kernel_pmap); 5999 return (error); 6000 } 6001 6002 /* 6003 * Changes the specified virtual address range's protections to those 6004 * specified by "prot". Like pmap_change_attr(), protections for aliases 6005 * in the direct map are updated as well. Protections on aliasing mappings may 6006 * be a subset of the requested protections; for example, mappings in the direct 6007 * map are never executable. 6008 */ 6009 int 6010 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 6011 { 6012 int error; 6013 6014 /* Only supported within the kernel map. */ 6015 if (va < VM_MIN_KERNEL_ADDRESS) 6016 return (EINVAL); 6017 6018 PMAP_LOCK(kernel_pmap); 6019 error = pmap_change_props_locked(va, size, prot, -1, false); 6020 PMAP_UNLOCK(kernel_pmap); 6021 return (error); 6022 } 6023 6024 static int 6025 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 6026 int mode, bool skip_unmapped) 6027 { 6028 vm_offset_t base, offset, tmpva; 6029 vm_size_t pte_size; 6030 vm_paddr_t pa; 6031 pt_entry_t pte, *ptep, *newpte; 6032 pt_entry_t bits, mask; 6033 int lvl, rv; 6034 6035 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6036 base = trunc_page(va); 6037 offset = va & PAGE_MASK; 6038 size = round_page(offset + size); 6039 6040 if (!VIRT_IN_DMAP(base) && 6041 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 6042 return (EINVAL); 6043 6044 bits = 0; 6045 mask = 0; 6046 if (mode != -1) { 6047 bits = ATTR_S1_IDX(mode); 6048 mask = ATTR_S1_IDX_MASK; 6049 if (mode == VM_MEMATTR_DEVICE) { 6050 mask |= ATTR_S1_XN; 6051 bits |= ATTR_S1_XN; 6052 } 6053 } 6054 if (prot != VM_PROT_NONE) { 6055 /* Don't mark the DMAP as executable. It never is on arm64. */ 6056 if (VIRT_IN_DMAP(base)) { 6057 prot &= ~VM_PROT_EXECUTE; 6058 /* 6059 * XXX Mark the DMAP as writable for now. We rely 6060 * on this in ddb & dtrace to insert breakpoint 6061 * instructions. 6062 */ 6063 prot |= VM_PROT_WRITE; 6064 } 6065 6066 if ((prot & VM_PROT_WRITE) == 0) { 6067 bits |= ATTR_S1_AP(ATTR_S1_AP_RO); 6068 } 6069 if ((prot & VM_PROT_EXECUTE) == 0) { 6070 bits |= ATTR_S1_PXN; 6071 } 6072 bits |= ATTR_S1_UXN; 6073 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN; 6074 } 6075 6076 for (tmpva = base; tmpva < base + size; ) { 6077 ptep = pmap_pte(kernel_pmap, tmpva, &lvl); 6078 if (ptep == NULL && !skip_unmapped) { 6079 return (EINVAL); 6080 } else if ((ptep == NULL && skip_unmapped) || 6081 (pmap_load(ptep) & mask) == bits) { 6082 /* 6083 * We already have the correct attribute or there 6084 * is no memory mapped at this address and we are 6085 * skipping unmapped memory. 
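			 *
			 * With 4KB granules the skip distances below are
			 * L1_SIZE (1GB), L2_SIZE (2MB), and PAGE_SIZE (4KB);
			 * for example, an L2 block that already carries the
			 * requested attributes simply advances "tmpva" to the
			 * next 2MB boundary.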
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/*
			 * Split the entry to a level 3 table, then
			 * set the new attribute.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				pte_size = PAGE_SIZE;
				break;
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
			    pte_size);

			pa = pte & ~ATTR_MASK;
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				/*
				 * Keep the DMAP memory in sync.
				 */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry, flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range(tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}

/*
 * Create an L2 table to map all addresses within an L1 mapping.
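 *
 * In outline (a sketch of the loop below, not additional behavior): the
 * single 1GB L1 block entry is replaced by a page of Ln_ENTRIES (512) L2
 * block entries computed as
 *
 *	l2[i] = (oldl1 & ATTR_MASK) | ((oldl1 & ~ATTR_MASK) + i * L2_SIZE)
 *
 * so each 2MB entry inherits every attribute of the old mapping and the
 * net translation is unchanged.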
 */
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
	pt_entry_t *l2, newl2, oldl1;
	vm_offset_t tmpl1;
	vm_paddr_t l2phys, phys;
	vm_page_t ml2;
	int i;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldl1 = pmap_load(l1);
	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
	    ("pmap_demote_l1: Demoting a non-block entry"));
	KASSERT((va & L1_OFFSET) == 0,
	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));

	tmpl1 = 0;
	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
		tmpl1 = kva_alloc(PAGE_SIZE);
		if (tmpl1 == 0)
			return (NULL);
	}

	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
	    NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
		    " in pmap %p", va, pmap);
		l2 = NULL;
		goto fail;
	}

	l2phys = VM_PAGE_TO_PHYS(ml2);
	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);

	/* The address the range points at */
	phys = oldl1 & ~ATTR_MASK;
	/* The attributes from the old L1 entry to be copied */
	newl2 = oldl1 & ATTR_MASK;

	/* Create the new entries */
	for (i = 0; i < Ln_ENTRIES; i++) {
		l2[i] = newl2 | phys;
		phys += L2_SIZE;
	}
	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
	    ("Invalid l2 page (%lx != %lx)", l2[0],
	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));

	if (tmpl1 != 0) {
		pmap_kenter(tmpl1, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
	}

	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);

fail:
	if (tmpl1 != 0) {
		pmap_kremove(tmpl1);
		kva_free(tmpl1, PAGE_SIZE);
	}

	return (l2);
}

static void
pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
{
	pt_entry_t *l3;

	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
		*l3 = newl3;
		newl3 += L3_SIZE;
	}
}

static void
pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
    struct rwlock **lockp)
{
	struct spglist free;

	SLIST_INIT(&free);
	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
	    lockp);
	vm_page_free_pages_toq(&free, true);
}

/*
 * Create an L3 table to map all addresses within an L2 mapping.
 */
static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
    struct rwlock **lockp)
{
	pt_entry_t *l3, newl3, oldl2;
	vm_offset_t tmpl2;
	vm_paddr_t l3phys;
	vm_page_t ml3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	l3 = NULL;
	oldl2 = pmap_load(l2);
	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("pmap_demote_l2: Demoting a non-block entry"));
	va &= ~L2_OFFSET;

	tmpl2 = 0;
	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
		tmpl2 = kva_alloc(PAGE_SIZE);
		if (tmpl2 == 0)
			return (NULL);
	}

	/*
	 * Invalidate the 2MB page mapping and return "failure" if the
	 * mapping was never accessed.
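	 *
	 * (A mapping with ATTR_AF clear has never been used, so it is
	 * cheaper to destroy it and let a later access recreate it through
	 * pmap_enter() than to allocate an L3 page table page for it here.)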
6297 */ 6298 if ((oldl2 & ATTR_AF) == 0) { 6299 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6300 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 6301 pmap_demote_l2_abort(pmap, va, l2, lockp); 6302 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 6303 va, pmap); 6304 goto fail; 6305 } 6306 6307 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 6308 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6309 ("pmap_demote_l2: page table page for a wired mapping" 6310 " is missing")); 6311 6312 /* 6313 * If the page table page is missing and the mapping 6314 * is for a kernel address, the mapping must belong to 6315 * either the direct map or the early kernel memory. 6316 * Page table pages are preallocated for every other 6317 * part of the kernel address space, so the direct map 6318 * region and early kernel memory are the only parts of the 6319 * kernel address space that must be handled here. 6320 */ 6321 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || 6322 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), 6323 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 6324 6325 /* 6326 * If the 2MB page mapping belongs to the direct map 6327 * region of the kernel's address space, then the page 6328 * allocation request specifies the highest possible 6329 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6330 * priority is normal. 6331 */ 6332 ml3 = vm_page_alloc_noobj( 6333 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 6334 VM_ALLOC_WIRED); 6335 6336 /* 6337 * If the allocation of the new page table page fails, 6338 * invalidate the 2MB page mapping and return "failure". 6339 */ 6340 if (ml3 == NULL) { 6341 pmap_demote_l2_abort(pmap, va, l2, lockp); 6342 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 6343 " in pmap %p", va, pmap); 6344 goto fail; 6345 } 6346 ml3->pindex = pmap_l2_pindex(va); 6347 6348 if (!ADDR_IS_KERNEL(va)) { 6349 ml3->ref_count = NL3PG; 6350 pmap_resident_count_inc(pmap, 1); 6351 } 6352 } 6353 l3phys = VM_PAGE_TO_PHYS(ml3); 6354 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 6355 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 6356 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 6357 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 6358 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 6359 6360 /* 6361 * If the page table page is not leftover from an earlier promotion, 6362 * or the mapping attributes have changed, (re)initialize the L3 table. 6363 * 6364 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 6365 * performs a dsb(). That dsb() ensures that the stores for filling 6366 * "l3" are visible before "l3" is added to the page table. 6367 */ 6368 if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) 6369 pmap_fill_l3(l3, newl3); 6370 6371 /* 6372 * Map the temporary page so we don't lose access to the l2 table. 6373 */ 6374 if (tmpl2 != 0) { 6375 pmap_kenter(tmpl2, PAGE_SIZE, 6376 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 6377 VM_MEMATTR_WRITE_BACK); 6378 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 6379 } 6380 6381 /* 6382 * The spare PV entries must be reserved prior to demoting the 6383 * mapping, that is, prior to changing the PDE. Otherwise, the state 6384 * of the L2 and the PV lists will be inconsistent, which can result 6385 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6386 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 6387 * PV entry for the 2MB page mapping that is being demoted. 
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);

	/*
	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
	 * the 2MB page mapping.
	 */
	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);

	/*
	 * Demote the PV entry.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);

	atomic_add_long(&pmap_l2_demotions, 1);
	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
	    " in pmap %p %lx", va, pmap, l3[0]);

fail:
	if (tmpl2 != 0) {
		pmap_kremove(tmpl2);
		kva_free(tmpl2, PAGE_SIZE);
	}

	return (l3);
}

static pt_entry_t *
pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
	struct rwlock *lock;
	pt_entry_t *l3;

	lock = NULL;
	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	return (l3);
}

/*
 * Perform the pmap work for mincore(2). If the page is not both referenced
 * and modified by this pmap, returns its physical address so that the caller
 * can find other mappings.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
	pt_entry_t *pte, tpte;
	vm_paddr_t mask, pa;
	int lvl, val;
	bool managed;

	PMAP_ASSERT_STAGE1(pmap);
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, addr, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);

		switch (lvl) {
		case 3:
			mask = L3_OFFSET;
			break;
		case 2:
			mask = L2_OFFSET;
			break;
		case 1:
			mask = L1_OFFSET;
			break;
		default:
			panic("pmap_mincore: invalid level %d", lvl);
		}

		managed = (tpte & ATTR_SW_MANAGED) != 0;
		val = MINCORE_INCORE;
		if (lvl != 3)
			val |= MINCORE_PSIND(3 - lvl);
		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((tpte & ATTR_AF) == ATTR_AF)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;

		pa = (tpte & ~ATTR_MASK) | (addr & mask);
	} else {
		managed = false;
		val = 0;
	}

	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
		*pap = pa;
	}
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Garbage collect every ASID that is neither active on a processor nor
 * reserved.
 */
static void
pmap_reset_asid_set(pmap_t pmap)
{
	pmap_t curpmap;
	int asid, cpuid, epoch;
	struct asid_set *set;
	enum pmap_stage stage;

	set = pmap->pm_asid_set;
	stage = pmap->pm_stage;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
	mtx_assert(&set->asid_set_mutex, MA_OWNED);

	/*
	 * Ensure that the store to asid_epoch is globally visible before the
	 * loads from pc_curpmap are performed.
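	 *
	 * In outline, the sequence below is:
	 *
	 *	set->asid_epoch = epoch;   (advance the epoch)
	 *	dsb(ishst);                (order the store before the TLBI)
	 *	tlbi vmalle1is, or the stage 2 callback
	 *	dsb(ish);                  (wait for completion)
	 *
	 * and only then are the pc_curpmap/pc_curvmpmap values re-read.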
6510 */ 6511 epoch = set->asid_epoch + 1; 6512 if (epoch == INT_MAX) 6513 epoch = 0; 6514 set->asid_epoch = epoch; 6515 dsb(ishst); 6516 if (stage == PM_STAGE1) { 6517 __asm __volatile("tlbi vmalle1is"); 6518 } else { 6519 KASSERT(pmap_clean_stage2_tlbi != NULL, 6520 ("%s: Unset stage 2 tlb invalidation callback\n", 6521 __func__)); 6522 pmap_clean_stage2_tlbi(); 6523 } 6524 dsb(ish); 6525 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 6526 set->asid_set_size - 1); 6527 CPU_FOREACH(cpuid) { 6528 if (cpuid == curcpu) 6529 continue; 6530 if (stage == PM_STAGE1) { 6531 curpmap = pcpu_find(cpuid)->pc_curpmap; 6532 PMAP_ASSERT_STAGE1(pmap); 6533 } else { 6534 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 6535 if (curpmap == NULL) 6536 continue; 6537 PMAP_ASSERT_STAGE2(pmap); 6538 } 6539 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 6540 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 6541 if (asid == -1) 6542 continue; 6543 bit_set(set->asid_set, asid); 6544 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 6545 } 6546 } 6547 6548 /* 6549 * Allocate a new ASID for the specified pmap. 6550 */ 6551 static void 6552 pmap_alloc_asid(pmap_t pmap) 6553 { 6554 struct asid_set *set; 6555 int new_asid; 6556 6557 set = pmap->pm_asid_set; 6558 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 6559 6560 mtx_lock_spin(&set->asid_set_mutex); 6561 6562 /* 6563 * While this processor was waiting to acquire the asid set mutex, 6564 * pmap_reset_asid_set() running on another processor might have 6565 * updated this pmap's cookie to the current epoch. In which case, we 6566 * don't need to allocate a new ASID. 6567 */ 6568 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 6569 goto out; 6570 6571 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 6572 &new_asid); 6573 if (new_asid == -1) { 6574 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 6575 set->asid_next, &new_asid); 6576 if (new_asid == -1) { 6577 pmap_reset_asid_set(pmap); 6578 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 6579 set->asid_set_size, &new_asid); 6580 KASSERT(new_asid != -1, ("ASID allocation failure")); 6581 } 6582 } 6583 bit_set(set->asid_set, new_asid); 6584 set->asid_next = new_asid + 1; 6585 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 6586 out: 6587 mtx_unlock_spin(&set->asid_set_mutex); 6588 } 6589 6590 /* 6591 * Compute the value that should be stored in ttbr0 to activate the specified 6592 * pmap. This value may change from time to time. 6593 */ 6594 uint64_t 6595 pmap_to_ttbr0(pmap_t pmap) 6596 { 6597 6598 return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | 6599 pmap->pm_ttbr); 6600 } 6601 6602 static bool 6603 pmap_activate_int(pmap_t pmap) 6604 { 6605 struct asid_set *set; 6606 int epoch; 6607 6608 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 6609 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 6610 6611 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 6612 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 6613 /* 6614 * Handle the possibility that the old thread was preempted 6615 * after an "ic" or "tlbi" instruction but before it performed 6616 * a "dsb" instruction. If the old thread migrates to a new 6617 * processor, its completion of a "dsb" instruction on that 6618 * new processor does not guarantee that the "ic" or "tlbi" 6619 * instructions performed on the old processor have completed. 
		 */
		dsb(ish);
		return (false);
	}

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Ensure that the store to curpmap is globally visible before the
	 * load from asid_epoch is performed.
	 */
	if (pmap->pm_stage == PM_STAGE1)
		PCPU_SET(curpmap, pmap);
	else
		PCPU_SET(curvmpmap, pmap);
	dsb(ish);
	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
	if (epoch >= 0 && epoch != set->asid_epoch)
		pmap_alloc_asid(pmap);

	if (pmap->pm_stage == PM_STAGE1) {
		set_ttbr0(pmap_to_ttbr0(pmap));
		if (PCPU_GET(bcast_tlbi_workaround) != 0)
			invalidate_local_icache();
	}
	return (true);
}

void
pmap_activate_vm(pmap_t pmap)
{

	PMAP_ASSERT_STAGE2(pmap);

	(void)pmap_activate_int(pmap);
}

void
pmap_activate(struct thread *td)
{
	pmap_t pmap;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	PMAP_ASSERT_STAGE1(pmap);
	critical_enter();
	(void)pmap_activate_int(pmap);
	critical_exit();
}

/*
 * Activate the thread we are switching to.
 * To simplify the assembly in cpu_throw, return the new thread's pcb.
 */
struct pcb *
pmap_switch(struct thread *new)
{
	pcpu_bp_harden bp_harden;
	struct pcb *pcb;

	/* Store the new curthread */
	PCPU_SET(curthread, new);
#if defined(PERTHREAD_SSP)
	/* Set the new thread's SSP canary */
	__asm("msr sp_el0, %0" :: "r"(&new->td_md.md_canary));
#endif

	/* And the new pcb */
	pcb = new->td_pcb;
	PCPU_SET(curpcb, pcb);

	/*
	 * TODO: We may need to flush the cache here if switching
	 * to a user process.
	 */

	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
		/*
		 * Stop userspace from training the branch predictor against
		 * other processes. This will call into a CPU specific
		 * function that clears the branch predictor state.
		 */
		bp_harden = PCPU_GET(bp_harden);
		if (bp_harden != NULL)
			bp_harden();
	}

	return (pcb);
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{

	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	if (ADDR_IS_KERNEL(va)) {
		cpu_icache_sync_range(va, sz);
	} else {
		u_int len, offset;
		vm_paddr_t pa;

		/* Find the length of data in this page to flush */
		offset = va & PAGE_MASK;
		len = imin(PAGE_SIZE - offset, sz);

		while (sz != 0) {
			/* Extract the physical address & find it in the DMAP */
			pa = pmap_extract(pmap, va);
			if (pa != 0)
				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);

			/* Move to the next page */
			sz -= len;
			va += len;
			/* Set the length for the next iteration */
			len = imin(PAGE_SIZE, sz);
		}
	}
}

static int
pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	int rv, lvl, dfsc;

	PMAP_ASSERT_STAGE2(pmap);
	rv = KERN_FAILURE;

	/*
	 * Data and insn aborts use the same encoding for the FSC field.
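	 * The low six bits give the fault class and the level at which the
	 * translation walk faulted: ISS_DATA_DFSC_TF_Lx is a translation
	 * fault (no valid entry at level x) and ISS_DATA_DFSC_AFF_Lx is an
	 * access flag fault (ATTR_AF clear). Only these two classes are
	 * handled for stage 2 faults below.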
	 */
	dfsc = esr & ISS_DATA_DFSC_MASK;
	switch (dfsc) {
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		PMAP_LOCK(pmap);
		pdep = pmap_pde(pmap, far, &lvl);
		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
			PMAP_UNLOCK(pmap);
			break;
		}

		switch (lvl) {
		case 0:
			ptep = pmap_l0_to_l1(pdep, far);
			break;
		case 1:
			ptep = pmap_l1_to_l2(pdep, far);
			break;
		case 2:
			ptep = pmap_l2_to_l3(pdep, far);
			break;
		default:
			panic("%s: Invalid pde level %d", __func__, lvl);
		}
		goto fault_exec;

	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
fault_exec:
		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
			if (icache_vmid) {
				pmap_invalidate_vpipt_icache();
			} else {
				/*
				 * If accessing an executable page, invalidate
				 * the I-cache so it will be valid when we
				 * continue execution in the guest. The D-cache
				 * is assumed to already be clean to the Point
				 * of Coherency.
				 */
				if ((pte & ATTR_S2_XN_MASK) !=
				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
					invalidate_icache();
				}
			}
			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	}

	return (rv);
}

int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use the same encoding for the FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_invalidate_page(pmap, far, true);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation. A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section. Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
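			 *
			 * pmap_klookup() is safe here because it walks the
			 * kernel page tables using only atomic loads and
			 * never acquires the pmap lock.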
			 */
			if (pmap_klookup(far, NULL))
				rv = KERN_SUCCESS;
		} else {
			PMAP_LOCK(pmap);
			/* Ask the MMU to check the address. */
			intr = intr_disable();
			par = arm64_address_translate_s1e0r(far);
			intr_restore(intr);
			PMAP_UNLOCK(pmap);

			/*
			 * If the translation was successful, then we can
			 * return success to the trap handler.
			 */
			if (PAR_SUCCESS(par))
				rv = KERN_SUCCESS;
		}
		break;
	}

	return (rv);
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < L2_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L2_OFFSET;
	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
	    (*addr & L2_OFFSET) == superpage_offset)
		return;
	if ((*addr & L2_OFFSET) < superpage_offset)
		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
	else
		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}

/**
 * Get the kernel virtual address of a set of physical pages. If there are
 * physical addresses not covered by the DMAP, perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page	The pages for which the caller wishes to obtain kernel
 *		virtual addresses.
 * \param vaddr	On return contains the kernel virtual memory address
 *		of the pages passed in the page parameter.
 * \param count	Number of pages passed in.
 * \param can_fault	TRUE if the thread using the mapped pages can take
 *		page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient when
 *	    finished or FALSE otherwise.
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	int error __diagused, i;

	/*
	 * Allocate any KVA space that we need; this is done in a separate
	 * loop to avoid calling vmem_alloc() while pinned.
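	 * (vmem_alloc() with M_WAITOK may sleep, which would be undesirable
	 * while sched_pin() below keeps the thread bound to one CPU.)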
6963 */ 6964 needs_mapping = FALSE; 6965 for (i = 0; i < count; i++) { 6966 paddr = VM_PAGE_TO_PHYS(page[i]); 6967 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 6968 error = vmem_alloc(kernel_arena, PAGE_SIZE, 6969 M_BESTFIT | M_WAITOK, &vaddr[i]); 6970 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 6971 needs_mapping = TRUE; 6972 } else { 6973 vaddr[i] = PHYS_TO_DMAP(paddr); 6974 } 6975 } 6976 6977 /* Exit early if everything is covered by the DMAP */ 6978 if (!needs_mapping) 6979 return (FALSE); 6980 6981 if (!can_fault) 6982 sched_pin(); 6983 for (i = 0; i < count; i++) { 6984 paddr = VM_PAGE_TO_PHYS(page[i]); 6985 if (!PHYS_IN_DMAP(paddr)) { 6986 panic( 6987 "pmap_map_io_transient: TODO: Map out of DMAP data"); 6988 } 6989 } 6990 6991 return (needs_mapping); 6992 } 6993 6994 void 6995 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 6996 boolean_t can_fault) 6997 { 6998 vm_paddr_t paddr; 6999 int i; 7000 7001 if (!can_fault) 7002 sched_unpin(); 7003 for (i = 0; i < count; i++) { 7004 paddr = VM_PAGE_TO_PHYS(page[i]); 7005 if (!PHYS_IN_DMAP(paddr)) { 7006 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 7007 } 7008 } 7009 } 7010 7011 boolean_t 7012 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 7013 { 7014 7015 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 7016 } 7017 7018 /* 7019 * Track a range of the kernel's virtual address space that is contiguous 7020 * in various mapping attributes. 7021 */ 7022 struct pmap_kernel_map_range { 7023 vm_offset_t sva; 7024 pt_entry_t attrs; 7025 int l3pages; 7026 int l3contig; 7027 int l2blocks; 7028 int l1blocks; 7029 }; 7030 7031 static void 7032 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 7033 vm_offset_t eva) 7034 { 7035 const char *mode; 7036 int index; 7037 7038 if (eva <= range->sva) 7039 return; 7040 7041 index = range->attrs & ATTR_S1_IDX_MASK; 7042 switch (index) { 7043 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 7044 mode = "DEV"; 7045 break; 7046 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 7047 mode = "UC"; 7048 break; 7049 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 7050 mode = "WB"; 7051 break; 7052 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 7053 mode = "WT"; 7054 break; 7055 default: 7056 printf( 7057 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 7058 __func__, index, range->sva, eva); 7059 mode = "??"; 7060 break; 7061 } 7062 7063 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %3s %d %d %d %d\n", 7064 range->sva, eva, 7065 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 7066 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 7067 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 7068 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 7069 mode, range->l1blocks, range->l2blocks, range->l3contig, 7070 range->l3pages); 7071 7072 /* Reset to sentinel value. */ 7073 range->sva = 0xfffffffffffffffful; 7074 } 7075 7076 /* 7077 * Determine whether the attributes specified by a page table entry match those 7078 * being tracked by the current range. 
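 *
 * The comparison is exact equality: any difference in the access
 * permission bits, the execute-never bits, or the memory attribute index
 * starts a new output range.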
7079 */ 7080 static bool 7081 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 7082 { 7083 7084 return (range->attrs == attrs); 7085 } 7086 7087 static void 7088 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 7089 pt_entry_t attrs) 7090 { 7091 7092 memset(range, 0, sizeof(*range)); 7093 range->sva = va; 7094 range->attrs = attrs; 7095 } 7096 7097 /* Get the block/page attributes that correspond to the table attributes */ 7098 static pt_entry_t 7099 sysctl_kmaps_table_attrs(pd_entry_t table) 7100 { 7101 pt_entry_t attrs; 7102 7103 attrs = 0; 7104 if ((table & TATTR_UXN_TABLE) != 0) 7105 attrs |= ATTR_S1_UXN; 7106 if ((table & TATTR_PXN_TABLE) != 0) 7107 attrs |= ATTR_S1_PXN; 7108 if ((table & TATTR_AP_TABLE_RO) != 0) 7109 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 7110 7111 return (attrs); 7112 } 7113 7114 /* Read the block/page attributes we care about */ 7115 static pt_entry_t 7116 sysctl_kmaps_block_attrs(pt_entry_t block) 7117 { 7118 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK)); 7119 } 7120 7121 /* 7122 * Given a leaf PTE, derive the mapping's attributes. If they do not match 7123 * those of the current run, dump the address range and its attributes, and 7124 * begin a new run. 7125 */ 7126 static void 7127 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 7128 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 7129 pt_entry_t l3e) 7130 { 7131 pt_entry_t attrs; 7132 7133 attrs = sysctl_kmaps_table_attrs(l0e); 7134 7135 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 7136 attrs |= sysctl_kmaps_block_attrs(l1e); 7137 goto done; 7138 } 7139 attrs |= sysctl_kmaps_table_attrs(l1e); 7140 7141 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 7142 attrs |= sysctl_kmaps_block_attrs(l2e); 7143 goto done; 7144 } 7145 attrs |= sysctl_kmaps_table_attrs(l2e); 7146 attrs |= sysctl_kmaps_block_attrs(l3e); 7147 7148 done: 7149 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 7150 sysctl_kmaps_dump(sb, range, va); 7151 sysctl_kmaps_reinit(range, va, attrs); 7152 } 7153 } 7154 7155 static int 7156 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 7157 { 7158 struct pmap_kernel_map_range range; 7159 struct sbuf sbuf, *sb; 7160 pd_entry_t l0e, *l1, l1e, *l2, l2e; 7161 pt_entry_t *l3, l3e; 7162 vm_offset_t sva; 7163 vm_paddr_t pa; 7164 int error, i, j, k, l; 7165 7166 error = sysctl_wire_old_buffer(req, 0); 7167 if (error != 0) 7168 return (error); 7169 sb = &sbuf; 7170 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 7171 7172 /* Sentinel value. */ 7173 range.sva = 0xfffffffffffffffful; 7174 7175 /* 7176 * Iterate over the kernel page tables without holding the kernel pmap 7177 * lock. Kernel page table pages are never freed, so at worst we will 7178 * observe inconsistencies in the output. 
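	 * A concurrent promotion or demotion may therefore be reported as a
	 * torn or duplicated range, but the walk never follows a pointer to
	 * a freed page table page.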
7179 */ 7180 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 7181 i++) { 7182 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 7183 sbuf_printf(sb, "\nDirect map:\n"); 7184 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 7185 sbuf_printf(sb, "\nKernel map:\n"); 7186 7187 l0e = kernel_pmap->pm_l0[i]; 7188 if ((l0e & ATTR_DESCR_VALID) == 0) { 7189 sysctl_kmaps_dump(sb, &range, sva); 7190 sva += L0_SIZE; 7191 continue; 7192 } 7193 pa = l0e & ~ATTR_MASK; 7194 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 7195 7196 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 7197 l1e = l1[j]; 7198 if ((l1e & ATTR_DESCR_VALID) == 0) { 7199 sysctl_kmaps_dump(sb, &range, sva); 7200 sva += L1_SIZE; 7201 continue; 7202 } 7203 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 7204 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 7205 0, 0); 7206 range.l1blocks++; 7207 sva += L1_SIZE; 7208 continue; 7209 } 7210 pa = l1e & ~ATTR_MASK; 7211 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 7212 7213 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 7214 l2e = l2[k]; 7215 if ((l2e & ATTR_DESCR_VALID) == 0) { 7216 sysctl_kmaps_dump(sb, &range, sva); 7217 sva += L2_SIZE; 7218 continue; 7219 } 7220 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 7221 sysctl_kmaps_check(sb, &range, sva, 7222 l0e, l1e, l2e, 0); 7223 range.l2blocks++; 7224 sva += L2_SIZE; 7225 continue; 7226 } 7227 pa = l2e & ~ATTR_MASK; 7228 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 7229 7230 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 7231 l++, sva += L3_SIZE) { 7232 l3e = l3[l]; 7233 if ((l3e & ATTR_DESCR_VALID) == 0) { 7234 sysctl_kmaps_dump(sb, &range, 7235 sva); 7236 continue; 7237 } 7238 sysctl_kmaps_check(sb, &range, sva, 7239 l0e, l1e, l2e, l3e); 7240 if ((l3e & ATTR_CONTIGUOUS) != 0) 7241 range.l3contig += l % 16 == 0 ? 7242 1 : 0; 7243 else 7244 range.l3pages++; 7245 } 7246 } 7247 } 7248 } 7249 7250 error = sbuf_finish(sb); 7251 sbuf_delete(sb); 7252 return (error); 7253 } 7254 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 7255 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 7256 NULL, 0, sysctl_kmaps, "A", 7257 "Dump kernel address layout"); 7258
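
/*
 * Usage note (illustrative): because the OID above sets CTLFLAG_SKIP, the
 * dump is not shown by a plain tree walk and must be requested by name,
 * e.g. "sysctl vm.pmap.kernel_maps". A minimal userland reader using
 * sysctlbyname(3) might look like the following sketch (not built as part
 * of this file).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char *buf;
	size_t len;

	/* Size the buffer, then fetch the dump. */
	if (sysctlbyname("vm.pmap.kernel_maps", NULL, &len, NULL, 0) != 0)
		err(1, "sysctlbyname");
	if ((buf = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("vm.pmap.kernel_maps", buf, &len, NULL, 0) != 0)
		err(1, "sysctlbyname");
	fwrite(buf, 1, len, stdout);
	free(buf);
	return (0);
}
#endif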