/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014-2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * This software was developed by Andrew Turner under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>

#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

static struct md_page *
pa_to_pvh(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return ((struct md_page *)seg->md_first +
			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
	}
	panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
}

static struct md_page *
page_to_pvh(vm_page_t m)
{
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return ((struct md_page *)seg->md_first +
	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
}

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;	/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Data for the pv entry allocation mechanism.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS);

#define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
extern pt_entry_t pagetable_dmap[];

#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to optimize
 * its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
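 *
 * The set's size is 1 << asid_bits entries: 256 ASIDs when TCR_EL1.AS
 * selects 8-bit ASIDs and 65536 when it selects 16-bit ASIDs (see
 * pmap_init_asids() and pmap_init()).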
 */
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
		return (NULL);

	return (pmap_l0_to_l1(l0, va));
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(va >= VM_MAX_USER_ADDRESS || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
	return (&l2p[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(va >= VM_MAX_USER_ADDRESS || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
	return (&l3p[pmap_l3_index(va)]);
}

/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
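 *
 * For example, a virtual address mapped by a 4KB page is reached through
 * an L2 table entry, so pmap_pde() returns a pointer to that L2 entry and
 * sets *level to 2; if even the L0 entry is invalid it returns NULL and
 * sets *level to -1.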
516 */ 517 static __inline pd_entry_t * 518 pmap_pde(pmap_t pmap, vm_offset_t va, int *level) 519 { 520 pd_entry_t *l0, *l1, *l2, desc; 521 522 l0 = pmap_l0(pmap, va); 523 desc = pmap_load(l0) & ATTR_DESCR_MASK; 524 if (desc != L0_TABLE) { 525 *level = -1; 526 return (NULL); 527 } 528 529 l1 = pmap_l0_to_l1(l0, va); 530 desc = pmap_load(l1) & ATTR_DESCR_MASK; 531 if (desc != L1_TABLE) { 532 *level = 0; 533 return (l0); 534 } 535 536 l2 = pmap_l1_to_l2(l1, va); 537 desc = pmap_load(l2) & ATTR_DESCR_MASK; 538 if (desc != L2_TABLE) { 539 *level = 1; 540 return (l1); 541 } 542 543 *level = 2; 544 return (l2); 545 } 546 547 /* 548 * Returns the lowest valid pte block or table entry for a given virtual 549 * address. If there are no valid entries return NULL and set the level to 550 * the first invalid level. 551 */ 552 static __inline pt_entry_t * 553 pmap_pte(pmap_t pmap, vm_offset_t va, int *level) 554 { 555 pd_entry_t *l1, *l2, desc; 556 pt_entry_t *l3; 557 558 l1 = pmap_l1(pmap, va); 559 if (l1 == NULL) { 560 *level = 0; 561 return (NULL); 562 } 563 desc = pmap_load(l1) & ATTR_DESCR_MASK; 564 if (desc == L1_BLOCK) { 565 *level = 1; 566 return (l1); 567 } 568 569 if (desc != L1_TABLE) { 570 *level = 1; 571 return (NULL); 572 } 573 574 l2 = pmap_l1_to_l2(l1, va); 575 desc = pmap_load(l2) & ATTR_DESCR_MASK; 576 if (desc == L2_BLOCK) { 577 *level = 2; 578 return (l2); 579 } 580 581 if (desc != L2_TABLE) { 582 *level = 2; 583 return (NULL); 584 } 585 586 *level = 3; 587 l3 = pmap_l2_to_l3(l2, va); 588 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) 589 return (NULL); 590 591 return (l3); 592 } 593 594 bool 595 pmap_ps_enabled(pmap_t pmap __unused) 596 { 597 598 return (superpages_enabled != 0); 599 } 600 601 bool 602 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, 603 pd_entry_t **l2, pt_entry_t **l3) 604 { 605 pd_entry_t *l0p, *l1p, *l2p; 606 607 if (pmap->pm_l0 == NULL) 608 return (false); 609 610 l0p = pmap_l0(pmap, va); 611 *l0 = l0p; 612 613 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) 614 return (false); 615 616 l1p = pmap_l0_to_l1(l0p, va); 617 *l1 = l1p; 618 619 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { 620 *l2 = NULL; 621 *l3 = NULL; 622 return (true); 623 } 624 625 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) 626 return (false); 627 628 l2p = pmap_l1_to_l2(l1p, va); 629 *l2 = l2p; 630 631 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { 632 *l3 = NULL; 633 return (true); 634 } 635 636 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) 637 return (false); 638 639 *l3 = pmap_l2_to_l3(l2p, va); 640 641 return (true); 642 } 643 644 static __inline int 645 pmap_l3_valid(pt_entry_t l3) 646 { 647 648 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); 649 } 650 651 CTASSERT(L1_BLOCK == L2_BLOCK); 652 653 static pt_entry_t 654 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) 655 { 656 pt_entry_t val; 657 658 if (pmap->pm_stage == PM_STAGE1) { 659 val = ATTR_S1_IDX(memattr); 660 if (memattr == VM_MEMATTR_DEVICE) 661 val |= ATTR_S1_XN; 662 return (val); 663 } 664 665 val = 0; 666 667 switch (memattr) { 668 case VM_MEMATTR_DEVICE: 669 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | 670 ATTR_S2_XN(ATTR_S2_XN_ALL)); 671 case VM_MEMATTR_UNCACHEABLE: 672 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); 673 case VM_MEMATTR_WRITE_BACK: 674 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); 675 case VM_MEMATTR_WRITE_THROUGH: 676 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); 677 default: 678 panic("%s: invalid memory attribute %x", __func__, 
memattr); 679 } 680 } 681 682 static pt_entry_t 683 pmap_pte_prot(pmap_t pmap, vm_prot_t prot) 684 { 685 pt_entry_t val; 686 687 val = 0; 688 if (pmap->pm_stage == PM_STAGE1) { 689 if ((prot & VM_PROT_EXECUTE) == 0) 690 val |= ATTR_S1_XN; 691 if ((prot & VM_PROT_WRITE) == 0) 692 val |= ATTR_S1_AP(ATTR_S1_AP_RO); 693 } else { 694 if ((prot & VM_PROT_WRITE) != 0) 695 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 696 if ((prot & VM_PROT_READ) != 0) 697 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); 698 if ((prot & VM_PROT_EXECUTE) == 0) 699 val |= ATTR_S2_XN(ATTR_S2_XN_ALL); 700 } 701 702 return (val); 703 } 704 705 /* 706 * Checks if the PTE is dirty. 707 */ 708 static inline int 709 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) 710 { 711 712 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); 713 714 if (pmap->pm_stage == PM_STAGE1) { 715 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, 716 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); 717 718 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 719 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); 720 } 721 722 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 723 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); 724 } 725 726 static __inline void 727 pmap_resident_count_inc(pmap_t pmap, int count) 728 { 729 730 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 731 pmap->pm_stats.resident_count += count; 732 } 733 734 static __inline void 735 pmap_resident_count_dec(pmap_t pmap, int count) 736 { 737 738 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 739 KASSERT(pmap->pm_stats.resident_count >= count, 740 ("pmap %p resident count underflow %ld %d", pmap, 741 pmap->pm_stats.resident_count, count)); 742 pmap->pm_stats.resident_count -= count; 743 } 744 745 static vm_paddr_t 746 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 747 { 748 vm_paddr_t pa_page; 749 750 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK; 751 return (pa_page | (va & PAR_LOW_MASK)); 752 } 753 754 static vm_offset_t 755 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, 756 vm_offset_t freemempos) 757 { 758 pt_entry_t *l2; 759 vm_offset_t va; 760 vm_paddr_t l2_pa, pa; 761 u_int l1_slot, l2_slot, prev_l1_slot; 762 int i; 763 764 dmap_phys_base = min_pa & ~L1_OFFSET; 765 dmap_phys_max = 0; 766 dmap_max_addr = 0; 767 l2 = NULL; 768 prev_l1_slot = -1; 769 770 #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) 771 memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); 772 773 for (i = 0; i < (physmap_idx * 2); i += 2) { 774 pa = physmap[i] & ~L2_OFFSET; 775 va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; 776 777 /* Create L2 mappings at the start of the region */ 778 if ((pa & L1_OFFSET) != 0) { 779 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); 780 if (l1_slot != prev_l1_slot) { 781 prev_l1_slot = l1_slot; 782 l2 = (pt_entry_t *)freemempos; 783 l2_pa = pmap_early_vtophys(kern_l1, 784 (vm_offset_t)l2); 785 freemempos += PAGE_SIZE; 786 787 pmap_store(&pagetable_dmap[l1_slot], 788 (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); 789 790 memset(l2, 0, PAGE_SIZE); 791 } 792 KASSERT(l2 != NULL, 793 ("pmap_bootstrap_dmap: NULL l2 map")); 794 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; 795 pa += L2_SIZE, va += L2_SIZE) { 796 /* 797 * We are on a boundary, stop to 798 * create a level 1 block 799 */ 800 if ((pa & L1_OFFSET) == 0) 801 break; 802 803 l2_slot = pmap_l2_index(va); 804 KASSERT(l2_slot != 0, ("...")); 805 pmap_store(&l2[l2_slot], 806 (pa & ~L2_OFFSET) | ATTR_DEFAULT | 807 ATTR_S1_XN | 808 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 809 L2_BLOCK); 810 } 811 KASSERT(va 
== (pa - dmap_phys_base + DMAP_MIN_ADDRESS), 812 ("...")); 813 } 814 815 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && 816 (physmap[i + 1] - pa) >= L1_SIZE; 817 pa += L1_SIZE, va += L1_SIZE) { 818 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); 819 pmap_store(&pagetable_dmap[l1_slot], 820 (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | 821 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK); 822 } 823 824 /* Create L2 mappings at the end of the region */ 825 if (pa < physmap[i + 1]) { 826 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); 827 if (l1_slot != prev_l1_slot) { 828 prev_l1_slot = l1_slot; 829 l2 = (pt_entry_t *)freemempos; 830 l2_pa = pmap_early_vtophys(kern_l1, 831 (vm_offset_t)l2); 832 freemempos += PAGE_SIZE; 833 834 pmap_store(&pagetable_dmap[l1_slot], 835 (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); 836 837 memset(l2, 0, PAGE_SIZE); 838 } 839 KASSERT(l2 != NULL, 840 ("pmap_bootstrap_dmap: NULL l2 map")); 841 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; 842 pa += L2_SIZE, va += L2_SIZE) { 843 l2_slot = pmap_l2_index(va); 844 pmap_store(&l2[l2_slot], 845 (pa & ~L2_OFFSET) | ATTR_DEFAULT | 846 ATTR_S1_XN | 847 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 848 L2_BLOCK); 849 } 850 } 851 852 if (pa > dmap_phys_max) { 853 dmap_phys_max = pa; 854 dmap_max_addr = va; 855 } 856 } 857 858 cpu_tlb_flushID(); 859 860 return (freemempos); 861 } 862 863 static vm_offset_t 864 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) 865 { 866 vm_offset_t l2pt; 867 vm_paddr_t pa; 868 pd_entry_t *l1; 869 u_int l1_slot; 870 871 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); 872 873 l1 = (pd_entry_t *)l1pt; 874 l1_slot = pmap_l1_index(va); 875 l2pt = l2_start; 876 877 for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { 878 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 879 880 pa = pmap_early_vtophys(l1pt, l2pt); 881 pmap_store(&l1[l1_slot], 882 (pa & ~Ln_TABLE_MASK) | L1_TABLE); 883 l2pt += PAGE_SIZE; 884 } 885 886 /* Clean the L2 page table */ 887 memset((void *)l2_start, 0, l2pt - l2_start); 888 889 return l2pt; 890 } 891 892 static vm_offset_t 893 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 894 { 895 vm_offset_t l3pt; 896 vm_paddr_t pa; 897 pd_entry_t *l2; 898 u_int l2_slot; 899 900 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 901 902 l2 = pmap_l2(kernel_pmap, va); 903 l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); 904 l2_slot = pmap_l2_index(va); 905 l3pt = l3_start; 906 907 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 908 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 909 910 pa = pmap_early_vtophys(l1pt, l3pt); 911 pmap_store(&l2[l2_slot], 912 (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE); 913 l3pt += PAGE_SIZE; 914 } 915 916 /* Clean the L2 page table */ 917 memset((void *)l3_start, 0, l3pt - l3_start); 918 919 return l3pt; 920 } 921 922 /* 923 * Bootstrap the system enough to run with virtual memory. 924 */ 925 void 926 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, 927 vm_size_t kernlen) 928 { 929 vm_offset_t freemempos; 930 vm_offset_t dpcpu, msgbufpv; 931 vm_paddr_t start_pa, pa, min_pa; 932 uint64_t kern_delta; 933 int i; 934 935 /* Verify that the ASID is set through TTBR0. 
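	 * (TCR_EL1.A1 == 0 makes the hardware take the ASID from TTBR0_EL1;
	 * A1 == 1 would select TTBR1_EL1 instead.)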
*/ 936 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0, 937 ("pmap_bootstrap: TCR_EL1.A1 != 0")); 938 939 kern_delta = KERNBASE - kernstart; 940 941 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 942 printf("%lx\n", l1pt); 943 printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); 944 945 /* Set this early so we can use the pagetable walking functions */ 946 kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; 947 PMAP_LOCK_INIT(kernel_pmap); 948 kernel_pmap->pm_l0_paddr = l0pt - kern_delta; 949 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN); 950 kernel_pmap->pm_stage = PM_STAGE1; 951 kernel_pmap->pm_levels = 4; 952 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr; 953 kernel_pmap->pm_asid_set = &asids; 954 955 /* Assume the address we were loaded to is a valid physical address */ 956 min_pa = KERNBASE - kern_delta; 957 958 physmap_idx = physmem_avail(physmap, nitems(physmap)); 959 physmap_idx /= 2; 960 961 /* 962 * Find the minimum physical address. physmap is sorted, 963 * but may contain empty ranges. 964 */ 965 for (i = 0; i < physmap_idx * 2; i += 2) { 966 if (physmap[i] == physmap[i + 1]) 967 continue; 968 if (physmap[i] <= min_pa) 969 min_pa = physmap[i]; 970 } 971 972 freemempos = KERNBASE + kernlen; 973 freemempos = roundup2(freemempos, PAGE_SIZE); 974 975 /* Create a direct map region early so we can use it for pa -> va */ 976 freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); 977 978 start_pa = pa = KERNBASE - kern_delta; 979 980 /* 981 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the 982 * loader allocated the first and only l2 page table page used to map 983 * the kernel, preloaded files and module metadata. 984 */ 985 freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos); 986 /* And the l3 tables for the early devmap */ 987 freemempos = pmap_bootstrap_l3(l1pt, 988 VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); 989 990 cpu_tlb_flushID(); 991 992 #define alloc_pages(var, np) \ 993 (var) = freemempos; \ 994 freemempos += (np * PAGE_SIZE); \ 995 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 996 997 /* Allocate dynamic per-cpu area. */ 998 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 999 dpcpu_init((void *)dpcpu, 0); 1000 1001 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 1002 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 1003 msgbufp = (void *)msgbufpv; 1004 1005 /* Reserve some VA space for early BIOS/ACPI mapping */ 1006 preinit_map_va = roundup2(freemempos, L2_SIZE); 1007 1008 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; 1009 virtual_avail = roundup2(virtual_avail, L1_SIZE); 1010 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); 1011 kernel_vm_end = virtual_avail; 1012 1013 pa = pmap_early_vtophys(l1pt, freemempos); 1014 1015 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); 1016 1017 cpu_tlb_flushID(); 1018 } 1019 1020 /* 1021 * Initialize a vm_page's machine-dependent fields. 1022 */ 1023 void 1024 pmap_page_init(vm_page_t m) 1025 { 1026 1027 TAILQ_INIT(&m->md.pv_list); 1028 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 1029 } 1030 1031 static void 1032 pmap_init_asids(struct asid_set *set, int bits) 1033 { 1034 int i; 1035 1036 set->asid_bits = bits; 1037 1038 /* 1039 * We may be too early in the overall initialization process to use 1040 * bit_alloc(). 
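	 * Instead, size the bit string with bitstr_size() and allocate and
	 * zero it directly with kmem_malloc(), which is effectively what
	 * bit_alloc() would do.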
1041 */ 1042 set->asid_set_size = 1 << set->asid_bits; 1043 set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size), 1044 M_WAITOK | M_ZERO); 1045 for (i = 0; i < ASID_FIRST_AVAILABLE; i++) 1046 bit_set(set->asid_set, i); 1047 set->asid_next = ASID_FIRST_AVAILABLE; 1048 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN); 1049 } 1050 1051 /* 1052 * Initialize the pmap module. 1053 * Called by vm_init, to initialize any structures that the pmap 1054 * system needs to map virtual memory. 1055 */ 1056 void 1057 pmap_init(void) 1058 { 1059 struct vm_phys_seg *seg, *next_seg; 1060 struct md_page *pvh; 1061 vm_size_t s; 1062 uint64_t mmfr1; 1063 int i, pv_npg, vmid_bits; 1064 1065 /* 1066 * Are large page mappings enabled? 1067 */ 1068 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 1069 if (superpages_enabled) { 1070 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1071 ("pmap_init: can't assign to pagesizes[1]")); 1072 pagesizes[1] = L2_SIZE; 1073 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 1074 ("pmap_init: can't assign to pagesizes[2]")); 1075 pagesizes[2] = L1_SIZE; 1076 } 1077 1078 /* 1079 * Initialize the ASID allocator. 1080 */ 1081 pmap_init_asids(&asids, 1082 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8); 1083 1084 if (has_hyp()) { 1085 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); 1086 vmid_bits = 8; 1087 1088 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) == 1089 ID_AA64MMFR1_VMIDBits_16) 1090 vmid_bits = 16; 1091 pmap_init_asids(&vmids, vmid_bits); 1092 } 1093 1094 /* 1095 * Initialize the pv chunk list mutex. 1096 */ 1097 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 1098 1099 /* 1100 * Initialize the pool of pv list locks. 1101 */ 1102 for (i = 0; i < NPV_LIST_LOCKS; i++) 1103 rw_init(&pv_list_locks[i], "pmap pv list"); 1104 1105 /* 1106 * Calculate the size of the pv head table for superpages. 1107 */ 1108 pv_npg = 0; 1109 for (i = 0; i < vm_phys_nsegs; i++) { 1110 seg = &vm_phys_segs[i]; 1111 pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1112 pmap_l2_pindex(seg->start); 1113 } 1114 1115 /* 1116 * Allocate memory for the pv head table for superpages. 1117 */ 1118 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1119 s = round_page(s); 1120 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 1121 for (i = 0; i < pv_npg; i++) 1122 TAILQ_INIT(&pv_table[i].pv_list); 1123 TAILQ_INIT(&pv_dummy.pv_list); 1124 1125 /* 1126 * Set pointers from vm_phys_segs to pv_table. 1127 */ 1128 for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) { 1129 seg = &vm_phys_segs[i]; 1130 seg->md_first = pvh; 1131 pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1132 pmap_l2_pindex(seg->start); 1133 1134 /* 1135 * If there is a following segment, and the final 1136 * superpage of this segment and the initial superpage 1137 * of the next segment are the same then adjust the 1138 * pv_table entry for that next segment down by one so 1139 * that the pv_table entries will be shared. 
1140 */ 1141 if (i + 1 < vm_phys_nsegs) { 1142 next_seg = &vm_phys_segs[i + 1]; 1143 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 == 1144 pmap_l2_pindex(next_seg->start)) { 1145 pvh--; 1146 } 1147 } 1148 } 1149 1150 vm_initialized = 1; 1151 } 1152 1153 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1154 "2MB page mapping counters"); 1155 1156 static u_long pmap_l2_demotions; 1157 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 1158 &pmap_l2_demotions, 0, "2MB page demotions"); 1159 1160 static u_long pmap_l2_mappings; 1161 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 1162 &pmap_l2_mappings, 0, "2MB page mappings"); 1163 1164 static u_long pmap_l2_p_failures; 1165 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 1166 &pmap_l2_p_failures, 0, "2MB page promotion failures"); 1167 1168 static u_long pmap_l2_promotions; 1169 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 1170 &pmap_l2_promotions, 0, "2MB page promotions"); 1171 1172 /* 1173 * Invalidate a single TLB entry. 1174 */ 1175 static __inline void 1176 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1177 { 1178 uint64_t r; 1179 1180 PMAP_ASSERT_STAGE1(pmap); 1181 1182 dsb(ishst); 1183 if (pmap == kernel_pmap) { 1184 r = atop(va); 1185 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1186 } else { 1187 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | atop(va); 1188 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1189 } 1190 dsb(ish); 1191 isb(); 1192 } 1193 1194 static __inline void 1195 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1196 { 1197 uint64_t end, r, start; 1198 1199 PMAP_ASSERT_STAGE1(pmap); 1200 1201 dsb(ishst); 1202 if (pmap == kernel_pmap) { 1203 start = atop(sva); 1204 end = atop(eva); 1205 for (r = start; r < end; r++) 1206 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1207 } else { 1208 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1209 start |= atop(sva); 1210 end |= atop(eva); 1211 for (r = start; r < end; r++) 1212 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1213 } 1214 dsb(ish); 1215 isb(); 1216 } 1217 1218 static __inline void 1219 pmap_invalidate_all(pmap_t pmap) 1220 { 1221 uint64_t r; 1222 1223 PMAP_ASSERT_STAGE1(pmap); 1224 1225 dsb(ishst); 1226 if (pmap == kernel_pmap) { 1227 __asm __volatile("tlbi vmalle1is"); 1228 } else { 1229 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1230 __asm __volatile("tlbi aside1is, %0" : : "r" (r)); 1231 } 1232 dsb(ish); 1233 isb(); 1234 } 1235 1236 /* 1237 * Routine: pmap_extract 1238 * Function: 1239 * Extract the physical page address associated 1240 * with the given map/virtual_address pair. 1241 */ 1242 vm_paddr_t 1243 pmap_extract(pmap_t pmap, vm_offset_t va) 1244 { 1245 pt_entry_t *pte, tpte; 1246 vm_paddr_t pa; 1247 int lvl; 1248 1249 pa = 0; 1250 PMAP_LOCK(pmap); 1251 /* 1252 * Find the block or page map for this virtual address. pmap_pte 1253 * will return either a valid block/page entry, or NULL. 
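	 * The level that is returned selects which low-order virtual address
	 * bits are combined with the physical address below: L1_OFFSET,
	 * L2_OFFSET, or L3_OFFSET.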
1254 */ 1255 pte = pmap_pte(pmap, va, &lvl); 1256 if (pte != NULL) { 1257 tpte = pmap_load(pte); 1258 pa = tpte & ~ATTR_MASK; 1259 switch(lvl) { 1260 case 1: 1261 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, 1262 ("pmap_extract: Invalid L1 pte found: %lx", 1263 tpte & ATTR_DESCR_MASK)); 1264 pa |= (va & L1_OFFSET); 1265 break; 1266 case 2: 1267 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, 1268 ("pmap_extract: Invalid L2 pte found: %lx", 1269 tpte & ATTR_DESCR_MASK)); 1270 pa |= (va & L2_OFFSET); 1271 break; 1272 case 3: 1273 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, 1274 ("pmap_extract: Invalid L3 pte found: %lx", 1275 tpte & ATTR_DESCR_MASK)); 1276 pa |= (va & L3_OFFSET); 1277 break; 1278 } 1279 } 1280 PMAP_UNLOCK(pmap); 1281 return (pa); 1282 } 1283 1284 /* 1285 * Routine: pmap_extract_and_hold 1286 * Function: 1287 * Atomically extract and hold the physical page 1288 * with the given pmap and virtual address pair 1289 * if that mapping permits the given protection. 1290 */ 1291 vm_page_t 1292 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1293 { 1294 pt_entry_t *pte, tpte; 1295 vm_offset_t off; 1296 vm_page_t m; 1297 int lvl; 1298 bool use; 1299 1300 m = NULL; 1301 PMAP_LOCK(pmap); 1302 pte = pmap_pte(pmap, va, &lvl); 1303 if (pte != NULL) { 1304 tpte = pmap_load(pte); 1305 1306 KASSERT(lvl > 0 && lvl <= 3, 1307 ("pmap_extract_and_hold: Invalid level %d", lvl)); 1308 CTASSERT(L1_BLOCK == L2_BLOCK); 1309 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || 1310 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), 1311 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, 1312 tpte & ATTR_DESCR_MASK)); 1313 1314 use = false; 1315 if ((prot & VM_PROT_WRITE) == 0) 1316 use = true; 1317 else if (pmap->pm_stage == PM_STAGE1 && 1318 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) 1319 use = true; 1320 else if (pmap->pm_stage == PM_STAGE2 && 1321 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 1322 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) 1323 use = true; 1324 1325 if (use) { 1326 switch (lvl) { 1327 case 1: 1328 off = va & L1_OFFSET; 1329 break; 1330 case 2: 1331 off = va & L2_OFFSET; 1332 break; 1333 case 3: 1334 default: 1335 off = 0; 1336 } 1337 m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); 1338 if (m != NULL && !vm_page_wire_mapped(m)) 1339 m = NULL; 1340 } 1341 } 1342 PMAP_UNLOCK(pmap); 1343 return (m); 1344 } 1345 1346 /* 1347 * Walks the page tables to translate a kernel virtual address to a 1348 * physical address. Returns true if the kva is valid and stores the 1349 * physical address in pa if it is not NULL. 1350 */ 1351 bool 1352 pmap_klookup(vm_offset_t va, vm_paddr_t *pa) 1353 { 1354 pt_entry_t *pte, tpte; 1355 register_t intr; 1356 uint64_t par; 1357 1358 /* 1359 * Disable interrupts so we don't get interrupted between asking 1360 * for address translation, and getting the result back. 1361 */ 1362 intr = intr_disable(); 1363 par = arm64_address_translate_s1e1r(va); 1364 intr_restore(intr); 1365 1366 if (PAR_SUCCESS(par)) { 1367 if (pa != NULL) 1368 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK); 1369 return (true); 1370 } 1371 1372 /* 1373 * Fall back to walking the page table. The address translation 1374 * instruction may fail when the page is in a break-before-make 1375 * sequence. As we only clear the valid bit in said sequence we 1376 * can walk the page table to find the physical address. 
1377 */ 1378 1379 pte = pmap_l1(kernel_pmap, va); 1380 if (pte == NULL) 1381 return (false); 1382 1383 /* 1384 * A concurrent pmap_update_entry() will clear the entry's valid bit 1385 * but leave the rest of the entry unchanged. Therefore, we treat a 1386 * non-zero entry as being valid, and we ignore the valid bit when 1387 * determining whether the entry maps a block, page, or table. 1388 */ 1389 tpte = pmap_load(pte); 1390 if (tpte == 0) 1391 return (false); 1392 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 1393 if (pa != NULL) 1394 *pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET); 1395 return (true); 1396 } 1397 pte = pmap_l1_to_l2(&tpte, va); 1398 tpte = pmap_load(pte); 1399 if (tpte == 0) 1400 return (false); 1401 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 1402 if (pa != NULL) 1403 *pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET); 1404 return (true); 1405 } 1406 pte = pmap_l2_to_l3(&tpte, va); 1407 tpte = pmap_load(pte); 1408 if (tpte == 0) 1409 return (false); 1410 if (pa != NULL) 1411 *pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET); 1412 return (true); 1413 } 1414 1415 vm_paddr_t 1416 pmap_kextract(vm_offset_t va) 1417 { 1418 vm_paddr_t pa; 1419 1420 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 1421 return (DMAP_TO_PHYS(va)); 1422 1423 if (pmap_klookup(va, &pa) == false) 1424 return (0); 1425 return (pa); 1426 } 1427 1428 /*************************************************** 1429 * Low level mapping routines..... 1430 ***************************************************/ 1431 1432 void 1433 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 1434 { 1435 pd_entry_t *pde; 1436 pt_entry_t *pte, attr; 1437 vm_offset_t va; 1438 int lvl; 1439 1440 KASSERT((pa & L3_OFFSET) == 0, 1441 ("pmap_kenter: Invalid physical address")); 1442 KASSERT((sva & L3_OFFSET) == 0, 1443 ("pmap_kenter: Invalid virtual address")); 1444 KASSERT((size & PAGE_MASK) == 0, 1445 ("pmap_kenter: Mapping is not page-sized")); 1446 1447 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 1448 ATTR_S1_IDX(mode) | L3_PAGE; 1449 va = sva; 1450 while (size != 0) { 1451 pde = pmap_pde(kernel_pmap, va, &lvl); 1452 KASSERT(pde != NULL, 1453 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 1454 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 1455 1456 pte = pmap_l2_to_l3(pde, va); 1457 pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); 1458 1459 va += PAGE_SIZE; 1460 pa += PAGE_SIZE; 1461 size -= PAGE_SIZE; 1462 } 1463 pmap_invalidate_range(kernel_pmap, sva, va); 1464 } 1465 1466 void 1467 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1468 { 1469 1470 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1471 } 1472 1473 /* 1474 * Remove a page from the kernel pagetables. 
1475 */ 1476 PMAP_INLINE void 1477 pmap_kremove(vm_offset_t va) 1478 { 1479 pt_entry_t *pte; 1480 int lvl; 1481 1482 pte = pmap_pte(kernel_pmap, va, &lvl); 1483 KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); 1484 KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); 1485 1486 pmap_clear(pte); 1487 pmap_invalidate_page(kernel_pmap, va); 1488 } 1489 1490 void 1491 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1492 { 1493 pt_entry_t *pte; 1494 vm_offset_t va; 1495 int lvl; 1496 1497 KASSERT((sva & L3_OFFSET) == 0, 1498 ("pmap_kremove_device: Invalid virtual address")); 1499 KASSERT((size & PAGE_MASK) == 0, 1500 ("pmap_kremove_device: Mapping is not page-sized")); 1501 1502 va = sva; 1503 while (size != 0) { 1504 pte = pmap_pte(kernel_pmap, va, &lvl); 1505 KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); 1506 KASSERT(lvl == 3, 1507 ("Invalid device pagetable level: %d != 3", lvl)); 1508 pmap_clear(pte); 1509 1510 va += PAGE_SIZE; 1511 size -= PAGE_SIZE; 1512 } 1513 pmap_invalidate_range(kernel_pmap, sva, va); 1514 } 1515 1516 /* 1517 * Used to map a range of physical addresses into kernel 1518 * virtual address space. 1519 * 1520 * The value passed in '*virt' is a suggested virtual address for 1521 * the mapping. Architectures which can support a direct-mapped 1522 * physical to virtual region can return the appropriate address 1523 * within that region, leaving '*virt' unchanged. Other 1524 * architectures should map the pages starting at '*virt' and 1525 * update '*virt' with the first usable address after the mapped 1526 * region. 1527 */ 1528 vm_offset_t 1529 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1530 { 1531 return PHYS_TO_DMAP(start); 1532 } 1533 1534 /* 1535 * Add a list of wired pages to the kva 1536 * this routine is only used for temporary 1537 * kernel mappings that do not need to have 1538 * page modification or references recorded. 1539 * Note that old mappings are simply written 1540 * over. The page *must* be wired. 1541 * Note: SMP coherent. Uses a ranged shootdown IPI. 1542 */ 1543 void 1544 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1545 { 1546 pd_entry_t *pde; 1547 pt_entry_t *pte, pa; 1548 vm_offset_t va; 1549 vm_page_t m; 1550 int i, lvl; 1551 1552 va = sva; 1553 for (i = 0; i < count; i++) { 1554 pde = pmap_pde(kernel_pmap, va, &lvl); 1555 KASSERT(pde != NULL, 1556 ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); 1557 KASSERT(lvl == 2, 1558 ("pmap_qenter: Invalid level %d", lvl)); 1559 1560 m = ma[i]; 1561 pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | 1562 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 1563 ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; 1564 pte = pmap_l2_to_l3(pde, va); 1565 pmap_load_store(pte, pa); 1566 1567 va += L3_SIZE; 1568 } 1569 pmap_invalidate_range(kernel_pmap, sva, va); 1570 } 1571 1572 /* 1573 * This routine tears out page mappings from the 1574 * kernel -- it is meant only for temporary mappings. 
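 * The mappings are expected to have been created by pmap_qenter(); only the
 * L3 entries are cleared, the page table pages themselves are left in place.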
1575 */ 1576 void 1577 pmap_qremove(vm_offset_t sva, int count) 1578 { 1579 pt_entry_t *pte; 1580 vm_offset_t va; 1581 int lvl; 1582 1583 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1584 1585 va = sva; 1586 while (count-- > 0) { 1587 pte = pmap_pte(kernel_pmap, va, &lvl); 1588 KASSERT(lvl == 3, 1589 ("Invalid device pagetable level: %d != 3", lvl)); 1590 if (pte != NULL) { 1591 pmap_clear(pte); 1592 } 1593 1594 va += PAGE_SIZE; 1595 } 1596 pmap_invalidate_range(kernel_pmap, sva, va); 1597 } 1598 1599 /*************************************************** 1600 * Page table page management routines..... 1601 ***************************************************/ 1602 /* 1603 * Schedule the specified unused page table page to be freed. Specifically, 1604 * add the page to the specified list of pages that will be released to the 1605 * physical memory manager after the TLB has been updated. 1606 */ 1607 static __inline void 1608 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1609 boolean_t set_PG_ZERO) 1610 { 1611 1612 if (set_PG_ZERO) 1613 m->flags |= PG_ZERO; 1614 else 1615 m->flags &= ~PG_ZERO; 1616 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1617 } 1618 1619 /* 1620 * Decrements a page table page's reference count, which is used to record the 1621 * number of valid page table entries within the page. If the reference count 1622 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1623 * page table page was unmapped and FALSE otherwise. 1624 */ 1625 static inline boolean_t 1626 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1627 { 1628 1629 --m->ref_count; 1630 if (m->ref_count == 0) { 1631 _pmap_unwire_l3(pmap, va, m, free); 1632 return (TRUE); 1633 } else 1634 return (FALSE); 1635 } 1636 1637 static void 1638 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1639 { 1640 1641 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1642 /* 1643 * unmap the page table page 1644 */ 1645 if (m->pindex >= (NUL2E + NUL1E)) { 1646 /* l1 page */ 1647 pd_entry_t *l0; 1648 1649 l0 = pmap_l0(pmap, va); 1650 pmap_clear(l0); 1651 } else if (m->pindex >= NUL2E) { 1652 /* l2 page */ 1653 pd_entry_t *l1; 1654 1655 l1 = pmap_l1(pmap, va); 1656 pmap_clear(l1); 1657 } else { 1658 /* l3 page */ 1659 pd_entry_t *l2; 1660 1661 l2 = pmap_l2(pmap, va); 1662 pmap_clear(l2); 1663 } 1664 pmap_resident_count_dec(pmap, 1); 1665 if (m->pindex < NUL2E) { 1666 /* We just released an l3, unhold the matching l2 */ 1667 pd_entry_t *l1, tl1; 1668 vm_page_t l2pg; 1669 1670 l1 = pmap_l1(pmap, va); 1671 tl1 = pmap_load(l1); 1672 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 1673 pmap_unwire_l3(pmap, va, l2pg, free); 1674 } else if (m->pindex < (NUL2E + NUL1E)) { 1675 /* We just released an l2, unhold the matching l1 */ 1676 pd_entry_t *l0, tl0; 1677 vm_page_t l1pg; 1678 1679 l0 = pmap_l0(pmap, va); 1680 tl0 = pmap_load(l0); 1681 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 1682 pmap_unwire_l3(pmap, va, l1pg, free); 1683 } 1684 pmap_invalidate_page(pmap, va); 1685 1686 /* 1687 * Put page on a list so that it is released after 1688 * *ALL* TLB shootdown is done 1689 */ 1690 pmap_add_delayed_free_list(m, free, TRUE); 1691 } 1692 1693 /* 1694 * After removing a page table entry, this routine is used to 1695 * conditionally free the page, and manage the reference count. 
1696 */ 1697 static int 1698 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1699 struct spglist *free) 1700 { 1701 vm_page_t mpte; 1702 1703 if (va >= VM_MAXUSER_ADDRESS) 1704 return (0); 1705 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1706 mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); 1707 return (pmap_unwire_l3(pmap, va, mpte, free)); 1708 } 1709 1710 /* 1711 * Release a page table page reference after a failed attempt to create a 1712 * mapping. 1713 */ 1714 static void 1715 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1716 { 1717 struct spglist free; 1718 1719 SLIST_INIT(&free); 1720 if (pmap_unwire_l3(pmap, va, mpte, &free)) { 1721 /* 1722 * Although "va" was never mapped, the TLB could nonetheless 1723 * have intermediate entries that refer to the freed page 1724 * table pages. Invalidate those entries. 1725 * 1726 * XXX redundant invalidation (See _pmap_unwire_l3().) 1727 */ 1728 pmap_invalidate_page(pmap, va); 1729 vm_page_free_pages_toq(&free, true); 1730 } 1731 } 1732 1733 void 1734 pmap_pinit0(pmap_t pmap) 1735 { 1736 1737 PMAP_LOCK_INIT(pmap); 1738 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1739 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 1740 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 1741 pmap->pm_root.rt_root = 0; 1742 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 1743 pmap->pm_stage = PM_STAGE1; 1744 pmap->pm_levels = 4; 1745 pmap->pm_ttbr = pmap->pm_l0_paddr; 1746 pmap->pm_asid_set = &asids; 1747 1748 PCPU_SET(curpmap, pmap); 1749 } 1750 1751 int 1752 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 1753 { 1754 vm_page_t m; 1755 1756 /* 1757 * allocate the l0 page 1758 */ 1759 while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 1760 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1761 vm_wait(NULL); 1762 1763 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 1764 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 1765 1766 if ((m->flags & PG_ZERO) == 0) 1767 pagezero(pmap->pm_l0); 1768 1769 pmap->pm_root.rt_root = 0; 1770 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1771 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 1772 1773 MPASS(levels == 3 || levels == 4); 1774 pmap->pm_levels = levels; 1775 pmap->pm_stage = stage; 1776 switch (stage) { 1777 case PM_STAGE1: 1778 pmap->pm_asid_set = &asids; 1779 break; 1780 case PM_STAGE2: 1781 pmap->pm_asid_set = &vmids; 1782 break; 1783 default: 1784 panic("%s: Invalid pmap type %d", __func__, stage); 1785 break; 1786 } 1787 1788 /* XXX Temporarily disable deferred ASID allocation. */ 1789 pmap_alloc_asid(pmap); 1790 1791 /* 1792 * Allocate the level 1 entry to use as the root. This will increase 1793 * the refcount on the level 1 page so it won't be removed until 1794 * pmap_release() is called. 1795 */ 1796 if (pmap->pm_levels == 3) { 1797 PMAP_LOCK(pmap); 1798 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 1799 PMAP_UNLOCK(pmap); 1800 } 1801 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 1802 1803 return (1); 1804 } 1805 1806 int 1807 pmap_pinit(pmap_t pmap) 1808 { 1809 1810 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 1811 } 1812 1813 /* 1814 * This routine is called if the desired page table page does not exist. 1815 * 1816 * If page table page allocation fails, this routine may sleep before 1817 * returning NULL. It sleeps only if a lock pointer was given. 
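 * Callers that pass a lock pointer are expected to retry the allocation;
 * see the retry loops in pmap_alloc_l2() and pmap_alloc_l3().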
1818 * 1819 * Note: If a page allocation fails at page table level two or three, 1820 * one or two pages may be held during the wait, only to be released 1821 * afterwards. This conservative approach is easily argued to avoid 1822 * race conditions. 1823 */ 1824 static vm_page_t 1825 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1826 { 1827 vm_page_t m, l1pg, l2pg; 1828 1829 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1830 1831 /* 1832 * Allocate a page table page. 1833 */ 1834 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1835 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1836 if (lockp != NULL) { 1837 RELEASE_PV_LIST_LOCK(lockp); 1838 PMAP_UNLOCK(pmap); 1839 vm_wait(NULL); 1840 PMAP_LOCK(pmap); 1841 } 1842 1843 /* 1844 * Indicate the need to retry. While waiting, the page table 1845 * page may have been allocated. 1846 */ 1847 return (NULL); 1848 } 1849 if ((m->flags & PG_ZERO) == 0) 1850 pmap_zero_page(m); 1851 1852 /* 1853 * Because of AArch64's weak memory consistency model, we must have a 1854 * barrier here to ensure that the stores for zeroing "m", whether by 1855 * pmap_zero_page() or an earlier function, are visible before adding 1856 * "m" to the page table. Otherwise, a page table walk by another 1857 * processor's MMU could see the mapping to "m" and a stale, non-zero 1858 * PTE within "m". 1859 */ 1860 dmb(ishst); 1861 1862 /* 1863 * Map the pagetable page into the process address space, if 1864 * it isn't already there. 1865 */ 1866 1867 if (ptepindex >= (NUL2E + NUL1E)) { 1868 pd_entry_t *l0; 1869 vm_pindex_t l0index; 1870 1871 l0index = ptepindex - (NUL2E + NUL1E); 1872 l0 = &pmap->pm_l0[l0index]; 1873 pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); 1874 } else if (ptepindex >= NUL2E) { 1875 vm_pindex_t l0index, l1index; 1876 pd_entry_t *l0, *l1; 1877 pd_entry_t tl0; 1878 1879 l1index = ptepindex - NUL2E; 1880 l0index = l1index >> L0_ENTRIES_SHIFT; 1881 1882 l0 = &pmap->pm_l0[l0index]; 1883 tl0 = pmap_load(l0); 1884 if (tl0 == 0) { 1885 /* recurse for allocating page dir */ 1886 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 1887 lockp) == NULL) { 1888 vm_page_unwire_noq(m); 1889 vm_page_free_zero(m); 1890 return (NULL); 1891 } 1892 } else { 1893 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 1894 l1pg->ref_count++; 1895 } 1896 1897 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); 1898 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1899 pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); 1900 } else { 1901 vm_pindex_t l0index, l1index; 1902 pd_entry_t *l0, *l1, *l2; 1903 pd_entry_t tl0, tl1; 1904 1905 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 1906 l0index = l1index >> L0_ENTRIES_SHIFT; 1907 1908 l0 = &pmap->pm_l0[l0index]; 1909 tl0 = pmap_load(l0); 1910 if (tl0 == 0) { 1911 /* recurse for allocating page dir */ 1912 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1913 lockp) == NULL) { 1914 vm_page_unwire_noq(m); 1915 vm_page_free_zero(m); 1916 return (NULL); 1917 } 1918 tl0 = pmap_load(l0); 1919 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 1920 l1 = &l1[l1index & Ln_ADDR_MASK]; 1921 } else { 1922 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 1923 l1 = &l1[l1index & Ln_ADDR_MASK]; 1924 tl1 = pmap_load(l1); 1925 if (tl1 == 0) { 1926 /* recurse for allocating page dir */ 1927 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1928 lockp) == NULL) { 1929 vm_page_unwire_noq(m); 1930 vm_page_free_zero(m); 1931 return (NULL); 1932 } 1933 } else { 1934 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 1935 l2pg->ref_count++; 1936 } 1937 } 1938 1939 l2 = (pd_entry_t 
*)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); 1940 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1941 pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); 1942 } 1943 1944 pmap_resident_count_inc(pmap, 1); 1945 1946 return (m); 1947 } 1948 1949 static pd_entry_t * 1950 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 1951 struct rwlock **lockp) 1952 { 1953 pd_entry_t *l1, *l2; 1954 vm_page_t l2pg; 1955 vm_pindex_t l2pindex; 1956 1957 retry: 1958 l1 = pmap_l1(pmap, va); 1959 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 1960 l2 = pmap_l1_to_l2(l1, va); 1961 if (va < VM_MAXUSER_ADDRESS) { 1962 /* Add a reference to the L2 page. */ 1963 l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); 1964 l2pg->ref_count++; 1965 } else 1966 l2pg = NULL; 1967 } else if (va < VM_MAXUSER_ADDRESS) { 1968 /* Allocate a L2 page. */ 1969 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1970 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1971 if (l2pg == NULL) { 1972 if (lockp != NULL) 1973 goto retry; 1974 else 1975 return (NULL); 1976 } 1977 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 1978 l2 = &l2[pmap_l2_index(va)]; 1979 } else 1980 panic("pmap_alloc_l2: missing page table page for va %#lx", 1981 va); 1982 *l2pgp = l2pg; 1983 return (l2); 1984 } 1985 1986 static vm_page_t 1987 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1988 { 1989 vm_pindex_t ptepindex; 1990 pd_entry_t *pde, tpde; 1991 #ifdef INVARIANTS 1992 pt_entry_t *pte; 1993 #endif 1994 vm_page_t m; 1995 int lvl; 1996 1997 /* 1998 * Calculate pagetable page index 1999 */ 2000 ptepindex = pmap_l2_pindex(va); 2001 retry: 2002 /* 2003 * Get the page directory entry 2004 */ 2005 pde = pmap_pde(pmap, va, &lvl); 2006 2007 /* 2008 * If the page table page is mapped, we just increment the hold count, 2009 * and activate it. If we get a level 2 pde it will point to a level 3 2010 * table. 2011 */ 2012 switch (lvl) { 2013 case -1: 2014 break; 2015 case 0: 2016 #ifdef INVARIANTS 2017 pte = pmap_l0_to_l1(pde, va); 2018 KASSERT(pmap_load(pte) == 0, 2019 ("pmap_alloc_l3: TODO: l0 superpages")); 2020 #endif 2021 break; 2022 case 1: 2023 #ifdef INVARIANTS 2024 pte = pmap_l1_to_l2(pde, va); 2025 KASSERT(pmap_load(pte) == 0, 2026 ("pmap_alloc_l3: TODO: l1 superpages")); 2027 #endif 2028 break; 2029 case 2: 2030 tpde = pmap_load(pde); 2031 if (tpde != 0) { 2032 m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); 2033 m->ref_count++; 2034 return (m); 2035 } 2036 break; 2037 default: 2038 panic("pmap_alloc_l3: Invalid level %d", lvl); 2039 } 2040 2041 /* 2042 * Here if the pte page isn't mapped, or if it has been deallocated. 2043 */ 2044 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2045 if (m == NULL && lockp != NULL) 2046 goto retry; 2047 2048 return (m); 2049 } 2050 2051 /*************************************************** 2052 * Pmap allocation/deallocation routines. 2053 ***************************************************/ 2054 2055 /* 2056 * Release any resources held by the given physical map. 2057 * Called when a pmap initialized by pmap_pinit is being released. 2058 * Should only be called if the map contains no valid mappings. 
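 *
 * A rough lifecycle sketch (hypothetical caller, for illustration only):
 *
 *	pmap_pinit(pmap);	allocates the l0 table and an ASID
 *	...enter and later remove all mappings...
 *	pmap_release(pmap);	frees the l0 page and recycles the ASID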
2059 */ 2060 void 2061 pmap_release(pmap_t pmap) 2062 { 2063 boolean_t rv; 2064 struct spglist free; 2065 struct asid_set *set; 2066 vm_page_t m; 2067 int asid; 2068 2069 if (pmap->pm_levels != 4) { 2070 PMAP_ASSERT_STAGE2(pmap); 2071 KASSERT(pmap->pm_stats.resident_count == 1, 2072 ("pmap_release: pmap resident count %ld != 0", 2073 pmap->pm_stats.resident_count)); 2074 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, 2075 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); 2076 2077 SLIST_INIT(&free); 2078 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); 2079 PMAP_LOCK(pmap); 2080 rv = pmap_unwire_l3(pmap, 0, m, &free); 2081 PMAP_UNLOCK(pmap); 2082 MPASS(rv == TRUE); 2083 vm_page_free_pages_toq(&free, true); 2084 } 2085 2086 KASSERT(pmap->pm_stats.resident_count == 0, 2087 ("pmap_release: pmap resident count %ld != 0", 2088 pmap->pm_stats.resident_count)); 2089 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2090 ("pmap_release: pmap has reserved page table page(s)")); 2091 2092 set = pmap->pm_asid_set; 2093 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 2094 2095 /* 2096 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate 2097 * the entries when removing them so rely on a later tlb invalidation. 2098 * this will happen when updating the VMID generation. Because of this 2099 * we don't reuse VMIDs within a generation. 2100 */ 2101 if (pmap->pm_stage == PM_STAGE1) { 2102 mtx_lock_spin(&set->asid_set_mutex); 2103 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 2104 asid = COOKIE_TO_ASID(pmap->pm_cookie); 2105 KASSERT(asid >= ASID_FIRST_AVAILABLE && 2106 asid < set->asid_set_size, 2107 ("pmap_release: pmap cookie has out-of-range asid")); 2108 bit_clear(set->asid_set, asid); 2109 } 2110 mtx_unlock_spin(&set->asid_set_mutex); 2111 } 2112 2113 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 2114 vm_page_unwire_noq(m); 2115 vm_page_free_zero(m); 2116 } 2117 2118 static int 2119 kvm_size(SYSCTL_HANDLER_ARGS) 2120 { 2121 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2122 2123 return sysctl_handle_long(oidp, &ksize, 0, req); 2124 } 2125 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 2126 0, 0, kvm_size, "LU", 2127 "Size of KVM"); 2128 2129 static int 2130 kvm_free(SYSCTL_HANDLER_ARGS) 2131 { 2132 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2133 2134 return sysctl_handle_long(oidp, &kfree, 0, req); 2135 } 2136 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 2137 0, 0, kvm_free, "LU", 2138 "Amount of KVM free"); 2139 2140 /* 2141 * grow the number of kernel page table entries, if needed 2142 */ 2143 void 2144 pmap_growkernel(vm_offset_t addr) 2145 { 2146 vm_paddr_t paddr; 2147 vm_page_t nkpg; 2148 pd_entry_t *l0, *l1, *l2; 2149 2150 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2151 2152 addr = roundup2(addr, L2_SIZE); 2153 if (addr - 1 >= vm_map_max(kernel_map)) 2154 addr = vm_map_max(kernel_map); 2155 while (kernel_vm_end < addr) { 2156 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 2157 KASSERT(pmap_load(l0) != 0, 2158 ("pmap_growkernel: No level 0 kernel entry")); 2159 2160 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 2161 if (pmap_load(l1) == 0) { 2162 /* We need a new PDP entry */ 2163 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, 2164 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2165 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2166 if (nkpg == NULL) 2167 panic("pmap_growkernel: no memory to grow kernel"); 2168 if ((nkpg->flags & PG_ZERO) == 0) 2169 pmap_zero_page(nkpg); 2170 
/* See the dmb() in _pmap_alloc_l3(). */ 2171 dmb(ishst); 2172 paddr = VM_PAGE_TO_PHYS(nkpg); 2173 pmap_store(l1, paddr | L1_TABLE); 2174 continue; /* try again */ 2175 } 2176 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 2177 if (pmap_load(l2) != 0) { 2178 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2179 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2180 kernel_vm_end = vm_map_max(kernel_map); 2181 break; 2182 } 2183 continue; 2184 } 2185 2186 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, 2187 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2188 VM_ALLOC_ZERO); 2189 if (nkpg == NULL) 2190 panic("pmap_growkernel: no memory to grow kernel"); 2191 if ((nkpg->flags & PG_ZERO) == 0) 2192 pmap_zero_page(nkpg); 2193 /* See the dmb() in _pmap_alloc_l3(). */ 2194 dmb(ishst); 2195 paddr = VM_PAGE_TO_PHYS(nkpg); 2196 pmap_store(l2, paddr | L2_TABLE); 2197 2198 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2199 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2200 kernel_vm_end = vm_map_max(kernel_map); 2201 break; 2202 } 2203 } 2204 } 2205 2206 /*************************************************** 2207 * page management routines. 2208 ***************************************************/ 2209 2210 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2211 CTASSERT(_NPCM == 3); 2212 CTASSERT(_NPCPV == 168); 2213 2214 static __inline struct pv_chunk * 2215 pv_to_chunk(pv_entry_t pv) 2216 { 2217 2218 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2219 } 2220 2221 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2222 2223 #define PC_FREE0 0xfffffffffffffffful 2224 #define PC_FREE1 0xfffffffffffffffful 2225 #define PC_FREE2 0x000000fffffffffful 2226 2227 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2228 2229 #if 0 2230 #ifdef PV_STATS 2231 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2232 2233 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2234 "Current number of pv entry chunks"); 2235 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2236 "Current number of pv entry chunks allocated"); 2237 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2238 "Current number of pv entry chunks frees"); 2239 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2240 "Number of times tried to get a chunk page but failed."); 2241 2242 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2243 static int pv_entry_spare; 2244 2245 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2246 "Current number of pv entry frees"); 2247 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2248 "Current number of pv entry allocs"); 2249 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2250 "Current number of pv entries"); 2251 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2252 "Current number of spare pv entries"); 2253 #endif 2254 #endif /* 0 */ 2255 2256 /* 2257 * We are in a serious low memory condition. Resort to 2258 * drastic measures to free some pages so we can allocate 2259 * another pv entry chunk. 2260 * 2261 * Returns NULL if PV entries were reclaimed from the specified pmap. 
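 *
 * Otherwise, it returns a page that the caller may reuse for a new pv
 * chunk; get_pv_entry() below does, in effect:
 *
 *	m = reclaim_pv_chunk(pmap, lockp);
 *	if (m == NULL)
 *		goto retry;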
2262 * 2263 * We do not, however, unmap 2mpages because subsequent accesses will 2264 * allocate per-page pv entries until repromotion occurs, thereby 2265 * exacerbating the shortage of free pv entries. 2266 */ 2267 static vm_page_t 2268 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2269 { 2270 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2271 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2272 struct md_page *pvh; 2273 pd_entry_t *pde; 2274 pmap_t next_pmap, pmap; 2275 pt_entry_t *pte, tpte; 2276 pv_entry_t pv; 2277 vm_offset_t va; 2278 vm_page_t m, m_pc; 2279 struct spglist free; 2280 uint64_t inuse; 2281 int bit, field, freed, lvl; 2282 static int active_reclaims = 0; 2283 2284 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2285 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2286 2287 pmap = NULL; 2288 m_pc = NULL; 2289 SLIST_INIT(&free); 2290 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2291 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2292 pc_marker = (struct pv_chunk *)&pc_marker_b; 2293 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2294 2295 mtx_lock(&pv_chunks_mutex); 2296 active_reclaims++; 2297 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 2298 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 2299 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2300 SLIST_EMPTY(&free)) { 2301 next_pmap = pc->pc_pmap; 2302 if (next_pmap == NULL) { 2303 /* 2304 * The next chunk is a marker. However, it is 2305 * not our marker, so active_reclaims must be 2306 * > 1. Consequently, the next_chunk code 2307 * will not rotate the pv_chunks list. 2308 */ 2309 goto next_chunk; 2310 } 2311 mtx_unlock(&pv_chunks_mutex); 2312 2313 /* 2314 * A pv_chunk can only be removed from the pc_lru list 2315 * when both pv_chunks_mutex is owned and the 2316 * corresponding pmap is locked. 2317 */ 2318 if (pmap != next_pmap) { 2319 if (pmap != NULL && pmap != locked_pmap) 2320 PMAP_UNLOCK(pmap); 2321 pmap = next_pmap; 2322 /* Avoid deadlock and lock recursion. */ 2323 if (pmap > locked_pmap) { 2324 RELEASE_PV_LIST_LOCK(lockp); 2325 PMAP_LOCK(pmap); 2326 mtx_lock(&pv_chunks_mutex); 2327 continue; 2328 } else if (pmap != locked_pmap) { 2329 if (PMAP_TRYLOCK(pmap)) { 2330 mtx_lock(&pv_chunks_mutex); 2331 continue; 2332 } else { 2333 pmap = NULL; /* pmap is not locked */ 2334 mtx_lock(&pv_chunks_mutex); 2335 pc = TAILQ_NEXT(pc_marker, pc_lru); 2336 if (pc == NULL || 2337 pc->pc_pmap != next_pmap) 2338 continue; 2339 goto next_chunk; 2340 } 2341 } 2342 } 2343 2344 /* 2345 * Destroy every non-wired, 4 KB page mapping in the chunk. 
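 *
 * pc_map[] is a bitmap of free slots, so the live entries are found by
 * inverting it against pc_freemask[]; per 64-bit field the loop below
 * in effect computes:
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	bit = ffsl(inuse) - 1;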
2346 */ 2347 freed = 0; 2348 for (field = 0; field < _NPCM; field++) { 2349 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2350 inuse != 0; inuse &= ~(1UL << bit)) { 2351 bit = ffsl(inuse) - 1; 2352 pv = &pc->pc_pventry[field * 64 + bit]; 2353 va = pv->pv_va; 2354 pde = pmap_pde(pmap, va, &lvl); 2355 if (lvl != 2) 2356 continue; 2357 pte = pmap_l2_to_l3(pde, va); 2358 tpte = pmap_load(pte); 2359 if ((tpte & ATTR_SW_WIRED) != 0) 2360 continue; 2361 tpte = pmap_load_clear(pte); 2362 m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); 2363 if (pmap_pte_dirty(pmap, tpte)) 2364 vm_page_dirty(m); 2365 if ((tpte & ATTR_AF) != 0) { 2366 pmap_invalidate_page(pmap, va); 2367 vm_page_aflag_set(m, PGA_REFERENCED); 2368 } 2369 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2370 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2371 m->md.pv_gen++; 2372 if (TAILQ_EMPTY(&m->md.pv_list) && 2373 (m->flags & PG_FICTITIOUS) == 0) { 2374 pvh = page_to_pvh(m); 2375 if (TAILQ_EMPTY(&pvh->pv_list)) { 2376 vm_page_aflag_clear(m, 2377 PGA_WRITEABLE); 2378 } 2379 } 2380 pc->pc_map[field] |= 1UL << bit; 2381 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 2382 freed++; 2383 } 2384 } 2385 if (freed == 0) { 2386 mtx_lock(&pv_chunks_mutex); 2387 goto next_chunk; 2388 } 2389 /* Every freed mapping is for a 4 KB page. */ 2390 pmap_resident_count_dec(pmap, freed); 2391 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2392 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2393 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2394 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2395 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2396 pc->pc_map[2] == PC_FREE2) { 2397 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2398 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2399 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2400 /* Entire chunk is free; return it. */ 2401 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2402 dump_drop_page(m_pc->phys_addr); 2403 mtx_lock(&pv_chunks_mutex); 2404 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2405 break; 2406 } 2407 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2408 mtx_lock(&pv_chunks_mutex); 2409 /* One freed pv entry in locked_pmap is sufficient. */ 2410 if (pmap == locked_pmap) 2411 break; 2412 2413 next_chunk: 2414 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2415 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 2416 if (active_reclaims == 1 && pmap != NULL) { 2417 /* 2418 * Rotate the pv chunks list so that we do not 2419 * scan the same pv chunks that could not be 2420 * freed (because they contained a wired 2421 * and/or superpage mapping) on every 2422 * invocation of reclaim_pv_chunk(). 2423 */ 2424 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 2425 MPASS(pc->pc_pmap != NULL); 2426 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2427 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2428 } 2429 } 2430 } 2431 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2432 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 2433 active_reclaims--; 2434 mtx_unlock(&pv_chunks_mutex); 2435 if (pmap != NULL && pmap != locked_pmap) 2436 PMAP_UNLOCK(pmap); 2437 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2438 m_pc = SLIST_FIRST(&free); 2439 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2440 /* Recycle a freed page table page. 
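 * Its reference count was dropped to zero when it was queued for
 * freeing, so reinitialize it before reusing the page to hold pv
 * entries.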
*/ 2441 m_pc->ref_count = 1; 2442 } 2443 vm_page_free_pages_toq(&free, true); 2444 return (m_pc); 2445 } 2446 2447 /* 2448 * free the pv_entry back to the free list 2449 */ 2450 static void 2451 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2452 { 2453 struct pv_chunk *pc; 2454 int idx, field, bit; 2455 2456 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2457 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2458 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2459 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2460 pc = pv_to_chunk(pv); 2461 idx = pv - &pc->pc_pventry[0]; 2462 field = idx / 64; 2463 bit = idx % 64; 2464 pc->pc_map[field] |= 1ul << bit; 2465 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2466 pc->pc_map[2] != PC_FREE2) { 2467 /* 98% of the time, pc is already at the head of the list. */ 2468 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2469 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2470 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2471 } 2472 return; 2473 } 2474 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2475 free_pv_chunk(pc); 2476 } 2477 2478 static void 2479 free_pv_chunk(struct pv_chunk *pc) 2480 { 2481 vm_page_t m; 2482 2483 mtx_lock(&pv_chunks_mutex); 2484 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2485 mtx_unlock(&pv_chunks_mutex); 2486 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2487 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2488 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2489 /* entire chunk is free, return it */ 2490 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2491 dump_drop_page(m->phys_addr); 2492 vm_page_unwire_noq(m); 2493 vm_page_free(m); 2494 } 2495 2496 /* 2497 * Returns a new PV entry, allocating a new PV chunk from the system when 2498 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2499 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2500 * returned. 2501 * 2502 * The given PV list lock may be released. 
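 *
 * Each chunk provides _NPCPV (168) entries tracked by the three 64-bit
 * words of pc_map[], where a set bit marks a free slot.  For example,
 * handing out field 1, bit 6 yields entry index 1 * 64 + 6 = 70, the
 * inverse of the field/bit arithmetic in free_pv_entry() above.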
2503 */ 2504 static pv_entry_t 2505 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2506 { 2507 int bit, field; 2508 pv_entry_t pv; 2509 struct pv_chunk *pc; 2510 vm_page_t m; 2511 2512 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2513 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2514 retry: 2515 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2516 if (pc != NULL) { 2517 for (field = 0; field < _NPCM; field++) { 2518 if (pc->pc_map[field]) { 2519 bit = ffsl(pc->pc_map[field]) - 1; 2520 break; 2521 } 2522 } 2523 if (field < _NPCM) { 2524 pv = &pc->pc_pventry[field * 64 + bit]; 2525 pc->pc_map[field] &= ~(1ul << bit); 2526 /* If this was the last item, move it to tail */ 2527 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2528 pc->pc_map[2] == 0) { 2529 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2530 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2531 pc_list); 2532 } 2533 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2534 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2535 return (pv); 2536 } 2537 } 2538 /* No free items, allocate another chunk */ 2539 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2540 VM_ALLOC_WIRED); 2541 if (m == NULL) { 2542 if (lockp == NULL) { 2543 PV_STAT(pc_chunk_tryfail++); 2544 return (NULL); 2545 } 2546 m = reclaim_pv_chunk(pmap, lockp); 2547 if (m == NULL) 2548 goto retry; 2549 } 2550 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2551 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2552 dump_add_page(m->phys_addr); 2553 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2554 pc->pc_pmap = pmap; 2555 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2556 pc->pc_map[1] = PC_FREE1; 2557 pc->pc_map[2] = PC_FREE2; 2558 mtx_lock(&pv_chunks_mutex); 2559 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2560 mtx_unlock(&pv_chunks_mutex); 2561 pv = &pc->pc_pventry[0]; 2562 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2563 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2564 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2565 return (pv); 2566 } 2567 2568 /* 2569 * Ensure that the number of spare PV entries in the specified pmap meets or 2570 * exceeds the given count, "needed". 2571 * 2572 * The given PV list lock may be released. 2573 */ 2574 static void 2575 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2576 { 2577 struct pch new_tail; 2578 struct pv_chunk *pc; 2579 vm_page_t m; 2580 int avail, free; 2581 bool reclaimed; 2582 2583 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2584 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2585 2586 /* 2587 * Newly allocated PV chunks must be stored in a private list until 2588 * the required number of PV chunks have been allocated. Otherwise, 2589 * reclaim_pv_chunk() could recycle one of these chunks. In 2590 * contrast, these chunks must be added to the pmap upon allocation. 
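 *
 * For example, demoting a managed 2MB mapping needs Ln_ENTRIES - 1 (511)
 * spare entries so that pmap_pv_demote_l2() (below) cannot fail.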
2591 */ 2592 TAILQ_INIT(&new_tail); 2593 retry: 2594 avail = 0; 2595 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2596 bit_count((bitstr_t *)pc->pc_map, 0, 2597 sizeof(pc->pc_map) * NBBY, &free); 2598 if (free == 0) 2599 break; 2600 avail += free; 2601 if (avail >= needed) 2602 break; 2603 } 2604 for (reclaimed = false; avail < needed; avail += _NPCPV) { 2605 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2606 VM_ALLOC_WIRED); 2607 if (m == NULL) { 2608 m = reclaim_pv_chunk(pmap, lockp); 2609 if (m == NULL) 2610 goto retry; 2611 reclaimed = true; 2612 } 2613 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2614 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2615 dump_add_page(m->phys_addr); 2616 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2617 pc->pc_pmap = pmap; 2618 pc->pc_map[0] = PC_FREE0; 2619 pc->pc_map[1] = PC_FREE1; 2620 pc->pc_map[2] = PC_FREE2; 2621 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2622 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2623 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2624 2625 /* 2626 * The reclaim might have freed a chunk from the current pmap. 2627 * If that chunk contained available entries, we need to 2628 * re-count the number of available entries. 2629 */ 2630 if (reclaimed) 2631 goto retry; 2632 } 2633 if (!TAILQ_EMPTY(&new_tail)) { 2634 mtx_lock(&pv_chunks_mutex); 2635 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2636 mtx_unlock(&pv_chunks_mutex); 2637 } 2638 } 2639 2640 /* 2641 * First find and then remove the pv entry for the specified pmap and virtual 2642 * address from the specified pv list. Returns the pv entry if found and NULL 2643 * otherwise. This operation can be performed on pv lists for either 4KB or 2644 * 2MB page mappings. 2645 */ 2646 static __inline pv_entry_t 2647 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2648 { 2649 pv_entry_t pv; 2650 2651 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2652 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2653 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2654 pvh->pv_gen++; 2655 break; 2656 } 2657 } 2658 return (pv); 2659 } 2660 2661 /* 2662 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2663 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2664 * entries for each of the 4KB page mappings. 2665 */ 2666 static void 2667 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2668 struct rwlock **lockp) 2669 { 2670 struct md_page *pvh; 2671 struct pv_chunk *pc; 2672 pv_entry_t pv; 2673 vm_offset_t va_last; 2674 vm_page_t m; 2675 int bit, field; 2676 2677 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2678 KASSERT((va & L2_OFFSET) == 0, 2679 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 2680 KASSERT((pa & L2_OFFSET) == 0, 2681 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 2682 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2683 2684 /* 2685 * Transfer the 2mpage's pv entry for this mapping to the first 2686 * page's pv list. Once this transfer begins, the pv list lock 2687 * must not be released until the last pv entry is reinstantiated. 2688 */ 2689 pvh = pa_to_pvh(pa); 2690 pv = pmap_pvh_remove(pvh, pmap, va); 2691 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2692 m = PHYS_TO_VM_PAGE(pa); 2693 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2694 m->md.pv_gen++; 2695 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
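 * Each one covers the next 4KB page of the old 2MB mapping, so pv_va
 * runs from va + PAGE_SIZE through va + L2_SIZE - PAGE_SIZE.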
*/ 2696 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 2697 va_last = va + L2_SIZE - PAGE_SIZE; 2698 for (;;) { 2699 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2700 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 2701 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 2702 for (field = 0; field < _NPCM; field++) { 2703 while (pc->pc_map[field]) { 2704 bit = ffsl(pc->pc_map[field]) - 1; 2705 pc->pc_map[field] &= ~(1ul << bit); 2706 pv = &pc->pc_pventry[field * 64 + bit]; 2707 va += PAGE_SIZE; 2708 pv->pv_va = va; 2709 m++; 2710 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2711 ("pmap_pv_demote_l2: page %p is not managed", m)); 2712 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2713 m->md.pv_gen++; 2714 if (va == va_last) 2715 goto out; 2716 } 2717 } 2718 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2719 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2720 } 2721 out: 2722 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 2723 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2724 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2725 } 2726 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 2727 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 2728 } 2729 2730 /* 2731 * First find and then destroy the pv entry for the specified pmap and virtual 2732 * address. This operation can be performed on pv lists for either 4KB or 2MB 2733 * page mappings. 2734 */ 2735 static void 2736 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2737 { 2738 pv_entry_t pv; 2739 2740 pv = pmap_pvh_remove(pvh, pmap, va); 2741 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2742 free_pv_entry(pmap, pv); 2743 } 2744 2745 /* 2746 * Conditionally create the PV entry for a 4KB page mapping if the required 2747 * memory can be allocated without resorting to reclamation. 2748 */ 2749 static boolean_t 2750 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2751 struct rwlock **lockp) 2752 { 2753 pv_entry_t pv; 2754 2755 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2756 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2757 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2758 pv->pv_va = va; 2759 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2760 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2761 m->md.pv_gen++; 2762 return (TRUE); 2763 } else 2764 return (FALSE); 2765 } 2766 2767 /* 2768 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2769 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2770 * false if the PV entry cannot be allocated without resorting to reclamation. 2771 */ 2772 static bool 2773 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2774 struct rwlock **lockp) 2775 { 2776 struct md_page *pvh; 2777 pv_entry_t pv; 2778 vm_paddr_t pa; 2779 2780 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2781 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2782 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
2783 NULL : lockp)) == NULL) 2784 return (false); 2785 pv->pv_va = va; 2786 pa = l2e & ~ATTR_MASK; 2787 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2788 pvh = pa_to_pvh(pa); 2789 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2790 pvh->pv_gen++; 2791 return (true); 2792 } 2793 2794 static void 2795 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2796 { 2797 pt_entry_t newl2, oldl2; 2798 vm_page_t ml3; 2799 vm_paddr_t ml3pa; 2800 2801 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2802 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2803 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2804 2805 ml3 = pmap_remove_pt_page(pmap, va); 2806 if (ml3 == NULL) 2807 panic("pmap_remove_kernel_l2: Missing pt page"); 2808 2809 ml3pa = VM_PAGE_TO_PHYS(ml3); 2810 newl2 = ml3pa | L2_TABLE; 2811 2812 /* 2813 * If this page table page was unmapped by a promotion, then it 2814 * contains valid mappings. Zero it to invalidate those mappings. 2815 */ 2816 if (ml3->valid != 0) 2817 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2818 2819 /* 2820 * Demote the mapping. The caller must have already invalidated the 2821 * mapping (i.e., the "break" in break-before-make). 2822 */ 2823 oldl2 = pmap_load_store(l2, newl2); 2824 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2825 __func__, l2, oldl2)); 2826 } 2827 2828 /* 2829 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2830 */ 2831 static int 2832 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2833 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2834 { 2835 struct md_page *pvh; 2836 pt_entry_t old_l2; 2837 vm_offset_t eva, va; 2838 vm_page_t m, ml3; 2839 2840 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2841 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2842 old_l2 = pmap_load_clear(l2); 2843 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 2844 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 2845 2846 /* 2847 * Since a promotion must break the 4KB page mappings before making 2848 * the 2MB page mapping, a pmap_invalidate_page() suffices. 
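 * That is, any 4KB TLB entries for this range were already invalidated
 * when the block mapping was created, so only the 2MB entry itself
 * remains to be flushed here.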
2849 */ 2850 pmap_invalidate_page(pmap, sva); 2851 2852 if (old_l2 & ATTR_SW_WIRED) 2853 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2854 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2855 if (old_l2 & ATTR_SW_MANAGED) { 2856 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); 2857 pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); 2858 pmap_pvh_free(pvh, pmap, sva); 2859 eva = sva + L2_SIZE; 2860 for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 2861 va < eva; va += PAGE_SIZE, m++) { 2862 if (pmap_pte_dirty(pmap, old_l2)) 2863 vm_page_dirty(m); 2864 if (old_l2 & ATTR_AF) 2865 vm_page_aflag_set(m, PGA_REFERENCED); 2866 if (TAILQ_EMPTY(&m->md.pv_list) && 2867 TAILQ_EMPTY(&pvh->pv_list)) 2868 vm_page_aflag_clear(m, PGA_WRITEABLE); 2869 } 2870 } 2871 if (pmap == kernel_pmap) { 2872 pmap_remove_kernel_l2(pmap, l2, sva); 2873 } else { 2874 ml3 = pmap_remove_pt_page(pmap, sva); 2875 if (ml3 != NULL) { 2876 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2877 ("pmap_remove_l2: l3 page not promoted")); 2878 pmap_resident_count_dec(pmap, 1); 2879 KASSERT(ml3->ref_count == NL3PG, 2880 ("pmap_remove_l2: l3 page ref count error")); 2881 ml3->ref_count = 0; 2882 pmap_add_delayed_free_list(ml3, free, FALSE); 2883 } 2884 } 2885 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2886 } 2887 2888 /* 2889 * pmap_remove_l3: do the things to unmap a page in a process 2890 */ 2891 static int 2892 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2893 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2894 { 2895 struct md_page *pvh; 2896 pt_entry_t old_l3; 2897 vm_page_t m; 2898 2899 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2900 old_l3 = pmap_load_clear(l3); 2901 pmap_invalidate_page(pmap, va); 2902 if (old_l3 & ATTR_SW_WIRED) 2903 pmap->pm_stats.wired_count -= 1; 2904 pmap_resident_count_dec(pmap, 1); 2905 if (old_l3 & ATTR_SW_MANAGED) { 2906 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 2907 if (pmap_pte_dirty(pmap, old_l3)) 2908 vm_page_dirty(m); 2909 if (old_l3 & ATTR_AF) 2910 vm_page_aflag_set(m, PGA_REFERENCED); 2911 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2912 pmap_pvh_free(&m->md, pmap, va); 2913 if (TAILQ_EMPTY(&m->md.pv_list) && 2914 (m->flags & PG_FICTITIOUS) == 0) { 2915 pvh = page_to_pvh(m); 2916 if (TAILQ_EMPTY(&pvh->pv_list)) 2917 vm_page_aflag_clear(m, PGA_WRITEABLE); 2918 } 2919 } 2920 return (pmap_unuse_pt(pmap, va, l2e, free)); 2921 } 2922 2923 /* 2924 * Remove the specified range of addresses from the L3 page table that is 2925 * identified by the given L2 entry. 2926 */ 2927 static void 2928 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 2929 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 2930 { 2931 struct md_page *pvh; 2932 struct rwlock *new_lock; 2933 pt_entry_t *l3, old_l3; 2934 vm_offset_t va; 2935 vm_page_t l3pg, m; 2936 2937 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2938 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 2939 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 2940 l3pg = sva < VM_MAXUSER_ADDRESS ? 
PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : 2941 NULL; 2942 va = eva; 2943 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 2944 if (!pmap_l3_valid(pmap_load(l3))) { 2945 if (va != eva) { 2946 pmap_invalidate_range(pmap, va, sva); 2947 va = eva; 2948 } 2949 continue; 2950 } 2951 old_l3 = pmap_load_clear(l3); 2952 if ((old_l3 & ATTR_SW_WIRED) != 0) 2953 pmap->pm_stats.wired_count--; 2954 pmap_resident_count_dec(pmap, 1); 2955 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 2956 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 2957 if (pmap_pte_dirty(pmap, old_l3)) 2958 vm_page_dirty(m); 2959 if ((old_l3 & ATTR_AF) != 0) 2960 vm_page_aflag_set(m, PGA_REFERENCED); 2961 new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); 2962 if (new_lock != *lockp) { 2963 if (*lockp != NULL) { 2964 /* 2965 * Pending TLB invalidations must be 2966 * performed before the PV list lock is 2967 * released. Otherwise, a concurrent 2968 * pmap_remove_all() on a physical page 2969 * could return while a stale TLB entry 2970 * still provides access to that page. 2971 */ 2972 if (va != eva) { 2973 pmap_invalidate_range(pmap, va, 2974 sva); 2975 va = eva; 2976 } 2977 rw_wunlock(*lockp); 2978 } 2979 *lockp = new_lock; 2980 rw_wlock(*lockp); 2981 } 2982 pmap_pvh_free(&m->md, pmap, sva); 2983 if (TAILQ_EMPTY(&m->md.pv_list) && 2984 (m->flags & PG_FICTITIOUS) == 0) { 2985 pvh = page_to_pvh(m); 2986 if (TAILQ_EMPTY(&pvh->pv_list)) 2987 vm_page_aflag_clear(m, PGA_WRITEABLE); 2988 } 2989 } 2990 if (va == eva) 2991 va = sva; 2992 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 2993 sva += L3_SIZE; 2994 break; 2995 } 2996 } 2997 if (va != eva) 2998 pmap_invalidate_range(pmap, va, sva); 2999 } 3000 3001 /* 3002 * Remove the given range of addresses from the specified map. 3003 * 3004 * It is assumed that the start and end are properly 3005 * rounded to the page size. 3006 */ 3007 void 3008 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3009 { 3010 struct rwlock *lock; 3011 vm_offset_t va_next; 3012 pd_entry_t *l0, *l1, *l2; 3013 pt_entry_t l3_paddr; 3014 struct spglist free; 3015 3016 /* 3017 * Perform an unsynchronized read. This is, however, safe. 3018 */ 3019 if (pmap->pm_stats.resident_count == 0) 3020 return; 3021 3022 SLIST_INIT(&free); 3023 3024 PMAP_LOCK(pmap); 3025 3026 lock = NULL; 3027 for (; sva < eva; sva = va_next) { 3028 if (pmap->pm_stats.resident_count == 0) 3029 break; 3030 3031 l0 = pmap_l0(pmap, sva); 3032 if (pmap_load(l0) == 0) { 3033 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3034 if (va_next < sva) 3035 va_next = eva; 3036 continue; 3037 } 3038 3039 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3040 if (va_next < sva) 3041 va_next = eva; 3042 l1 = pmap_l0_to_l1(l0, sva); 3043 if (pmap_load(l1) == 0) 3044 continue; 3045 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3046 KASSERT(va_next <= eva, 3047 ("partial update of non-transparent 1G page " 3048 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3049 pmap_load(l1), sva, eva, va_next)); 3050 MPASS(pmap != kernel_pmap); 3051 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3052 pmap_clear(l1); 3053 pmap_invalidate_page(pmap, sva); 3054 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 3055 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 3056 continue; 3057 } 3058 3059 /* 3060 * Calculate index for next page table. 
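 * For example, with the 4KB translation granule L2_SIZE is 2MB, so an
 * sva of 0x40001000 gives va_next = 0x40200000; the "va_next < sva"
 * test below only triggers on wrap-around at the top of the address
 * space.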
3061 */ 3062 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3063 if (va_next < sva) 3064 va_next = eva; 3065 3066 l2 = pmap_l1_to_l2(l1, sva); 3067 if (l2 == NULL) 3068 continue; 3069 3070 l3_paddr = pmap_load(l2); 3071 3072 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 3073 if (sva + L2_SIZE == va_next && eva >= va_next) { 3074 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 3075 &free, &lock); 3076 continue; 3077 } else if (pmap_demote_l2_locked(pmap, l2, sva, 3078 &lock) == NULL) 3079 continue; 3080 l3_paddr = pmap_load(l2); 3081 } 3082 3083 /* 3084 * Weed out invalid mappings. 3085 */ 3086 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 3087 continue; 3088 3089 /* 3090 * Limit our scan to either the end of the va represented 3091 * by the current page table page, or to the end of the 3092 * range being removed. 3093 */ 3094 if (va_next > eva) 3095 va_next = eva; 3096 3097 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 3098 &lock); 3099 } 3100 if (lock != NULL) 3101 rw_wunlock(lock); 3102 PMAP_UNLOCK(pmap); 3103 vm_page_free_pages_toq(&free, true); 3104 } 3105 3106 /* 3107 * Routine: pmap_remove_all 3108 * Function: 3109 * Removes this physical page from 3110 * all physical maps in which it resides. 3111 * Reflects back modify bits to the pager. 3112 * 3113 * Notes: 3114 * Original versions of this routine were very 3115 * inefficient because they iteratively called 3116 * pmap_remove (slow...) 3117 */ 3118 3119 void 3120 pmap_remove_all(vm_page_t m) 3121 { 3122 struct md_page *pvh; 3123 pv_entry_t pv; 3124 pmap_t pmap; 3125 struct rwlock *lock; 3126 pd_entry_t *pde, tpde; 3127 pt_entry_t *pte, tpte; 3128 vm_offset_t va; 3129 struct spglist free; 3130 int lvl, pvh_gen, md_gen; 3131 3132 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3133 ("pmap_remove_all: page %p is not managed", m)); 3134 SLIST_INIT(&free); 3135 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3136 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : page_to_pvh(m); 3137 retry: 3138 rw_wlock(lock); 3139 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3140 pmap = PV_PMAP(pv); 3141 if (!PMAP_TRYLOCK(pmap)) { 3142 pvh_gen = pvh->pv_gen; 3143 rw_wunlock(lock); 3144 PMAP_LOCK(pmap); 3145 rw_wlock(lock); 3146 if (pvh_gen != pvh->pv_gen) { 3147 rw_wunlock(lock); 3148 PMAP_UNLOCK(pmap); 3149 goto retry; 3150 } 3151 } 3152 va = pv->pv_va; 3153 pte = pmap_pte(pmap, va, &lvl); 3154 KASSERT(pte != NULL, 3155 ("pmap_remove_all: no page table entry found")); 3156 KASSERT(lvl == 2, 3157 ("pmap_remove_all: invalid pte level %d", lvl)); 3158 3159 pmap_demote_l2_locked(pmap, pte, va, &lock); 3160 PMAP_UNLOCK(pmap); 3161 } 3162 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3163 pmap = PV_PMAP(pv); 3164 PMAP_ASSERT_STAGE1(pmap); 3165 if (!PMAP_TRYLOCK(pmap)) { 3166 pvh_gen = pvh->pv_gen; 3167 md_gen = m->md.pv_gen; 3168 rw_wunlock(lock); 3169 PMAP_LOCK(pmap); 3170 rw_wlock(lock); 3171 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3172 rw_wunlock(lock); 3173 PMAP_UNLOCK(pmap); 3174 goto retry; 3175 } 3176 } 3177 pmap_resident_count_dec(pmap, 1); 3178 3179 pde = pmap_pde(pmap, pv->pv_va, &lvl); 3180 KASSERT(pde != NULL, 3181 ("pmap_remove_all: no page directory entry found")); 3182 KASSERT(lvl == 2, 3183 ("pmap_remove_all: invalid pde level %d", lvl)); 3184 tpde = pmap_load(pde); 3185 3186 pte = pmap_l2_to_l3(pde, pv->pv_va); 3187 tpte = pmap_load_clear(pte); 3188 if (tpte & ATTR_SW_WIRED) 3189 pmap->pm_stats.wired_count--; 3190 if ((tpte & ATTR_AF) != 0) { 3191 pmap_invalidate_page(pmap, pv->pv_va); 3192 vm_page_aflag_set(m, PGA_REFERENCED); 3193 } 3194 3195 /* 3196 * Update the vm_page_t clean and reference bits. 3197 */ 3198 if (pmap_pte_dirty(pmap, tpte)) 3199 vm_page_dirty(m); 3200 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 3201 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3202 m->md.pv_gen++; 3203 free_pv_entry(pmap, pv); 3204 PMAP_UNLOCK(pmap); 3205 } 3206 vm_page_aflag_clear(m, PGA_WRITEABLE); 3207 rw_wunlock(lock); 3208 vm_page_free_pages_toq(&free, true); 3209 } 3210 3211 /* 3212 * pmap_protect_l2: do the things to protect a 2MB page in a pmap 3213 */ 3214 static void 3215 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 3216 pt_entry_t nbits) 3217 { 3218 pd_entry_t old_l2; 3219 vm_page_t m, mt; 3220 3221 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3222 PMAP_ASSERT_STAGE1(pmap); 3223 KASSERT((sva & L2_OFFSET) == 0, 3224 ("pmap_protect_l2: sva is not 2mpage aligned")); 3225 old_l2 = pmap_load(l2); 3226 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3227 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 3228 3229 /* 3230 * Return if the L2 entry already has the desired access restrictions 3231 * in place. 3232 */ 3233 retry: 3234 if ((old_l2 & mask) == nbits) 3235 return; 3236 3237 /* 3238 * When a dirty read/write superpage mapping is write protected, 3239 * update the dirty field of each of the superpage's constituent 4KB 3240 * pages. 3241 */ 3242 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 3243 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3244 pmap_pte_dirty(pmap, old_l2)) { 3245 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 3246 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3247 vm_page_dirty(mt); 3248 } 3249 3250 if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 3251 goto retry; 3252 3253 /* 3254 * Since a promotion must break the 4KB page mappings before making 3255 * the 2MB page mapping, a pmap_invalidate_page() suffices. 
3256 */ 3257 pmap_invalidate_page(pmap, sva); 3258 } 3259 3260 /* 3261 * Set the physical protection on the 3262 * specified range of this map as requested. 3263 */ 3264 void 3265 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3266 { 3267 vm_offset_t va, va_next; 3268 pd_entry_t *l0, *l1, *l2; 3269 pt_entry_t *l3p, l3, mask, nbits; 3270 3271 PMAP_ASSERT_STAGE1(pmap); 3272 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3273 if (prot == VM_PROT_NONE) { 3274 pmap_remove(pmap, sva, eva); 3275 return; 3276 } 3277 3278 mask = nbits = 0; 3279 if ((prot & VM_PROT_WRITE) == 0) { 3280 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 3281 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 3282 } 3283 if ((prot & VM_PROT_EXECUTE) == 0) { 3284 mask |= ATTR_S1_XN; 3285 nbits |= ATTR_S1_XN; 3286 } 3287 if (mask == 0) 3288 return; 3289 3290 PMAP_LOCK(pmap); 3291 for (; sva < eva; sva = va_next) { 3292 l0 = pmap_l0(pmap, sva); 3293 if (pmap_load(l0) == 0) { 3294 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3295 if (va_next < sva) 3296 va_next = eva; 3297 continue; 3298 } 3299 3300 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3301 if (va_next < sva) 3302 va_next = eva; 3303 l1 = pmap_l0_to_l1(l0, sva); 3304 if (pmap_load(l1) == 0) 3305 continue; 3306 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 3307 KASSERT(va_next <= eva, 3308 ("partial update of non-transparent 1G page " 3309 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 3310 pmap_load(l1), sva, eva, va_next)); 3311 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 3312 if ((pmap_load(l1) & mask) != nbits) { 3313 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 3314 pmap_invalidate_page(pmap, sva); 3315 } 3316 continue; 3317 } 3318 3319 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3320 if (va_next < sva) 3321 va_next = eva; 3322 3323 l2 = pmap_l1_to_l2(l1, sva); 3324 if (pmap_load(l2) == 0) 3325 continue; 3326 3327 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 3328 if (sva + L2_SIZE == va_next && eva >= va_next) { 3329 pmap_protect_l2(pmap, l2, sva, mask, nbits); 3330 continue; 3331 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 3332 continue; 3333 } 3334 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 3335 ("pmap_protect: Invalid L2 entry after demotion")); 3336 3337 if (va_next > eva) 3338 va_next = eva; 3339 3340 va = va_next; 3341 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 3342 sva += L3_SIZE) { 3343 l3 = pmap_load(l3p); 3344 retry: 3345 /* 3346 * Go to the next L3 entry if the current one is 3347 * invalid or already has the desired access 3348 * restrictions in place. (The latter case occurs 3349 * frequently. For example, in a "buildworld" 3350 * workload, almost 1 out of 4 L3 entries already 3351 * have the desired restrictions.) 3352 */ 3353 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 3354 if (va != va_next) { 3355 pmap_invalidate_range(pmap, va, sva); 3356 va = va_next; 3357 } 3358 continue; 3359 } 3360 3361 /* 3362 * When a dirty read/write mapping is write protected, 3363 * update the page's dirty field. 
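 * Here "dirty" means pmap_pte_dirty() is true, i.e. the entry is still
 * writable under the software dirty-bit (ATTR_SW_DBM) scheme, so its
 * modified state must be transferred to the vm_page before write
 * permission is revoked.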
3364 */ 3365 if ((l3 & ATTR_SW_MANAGED) != 0 && 3366 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3367 pmap_pte_dirty(pmap, l3)) 3368 vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); 3369 3370 if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) 3371 goto retry; 3372 if (va == va_next) 3373 va = sva; 3374 } 3375 if (va != va_next) 3376 pmap_invalidate_range(pmap, va, sva); 3377 } 3378 PMAP_UNLOCK(pmap); 3379 } 3380 3381 /* 3382 * Inserts the specified page table page into the specified pmap's collection 3383 * of idle page table pages. Each of a pmap's page table pages is responsible 3384 * for mapping a distinct range of virtual addresses. The pmap's collection is 3385 * ordered by this virtual address range. 3386 * 3387 * If "promoted" is false, then the page table page "mpte" must be zero filled. 3388 */ 3389 static __inline int 3390 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 3391 { 3392 3393 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3394 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 3395 return (vm_radix_insert(&pmap->pm_root, mpte)); 3396 } 3397 3398 /* 3399 * Removes the page table page mapping the specified virtual address from the 3400 * specified pmap's collection of idle page table pages, and returns it. 3401 * Otherwise, returns NULL if there is no page table page corresponding to the 3402 * specified virtual address. 3403 */ 3404 static __inline vm_page_t 3405 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 3406 { 3407 3408 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3409 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 3410 } 3411 3412 /* 3413 * Performs a break-before-make update of a pmap entry. This is needed when 3414 * either promoting or demoting pages to ensure the TLB doesn't get into an 3415 * inconsistent state. 3416 */ 3417 static void 3418 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 3419 vm_offset_t va, vm_size_t size) 3420 { 3421 register_t intr; 3422 3423 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3424 3425 /* 3426 * Ensure we don't get switched out with the page table in an 3427 * inconsistent state. We also need to ensure no interrupts fire 3428 * as they may make use of an address we are about to invalidate. 3429 */ 3430 intr = intr_disable(); 3431 3432 /* 3433 * Clear the old mapping's valid bit, but leave the rest of the entry 3434 * unchanged, so that a lockless, concurrent pmap_kextract() can still 3435 * lookup the physical address. 3436 */ 3437 pmap_clear_bits(pte, ATTR_DESCR_VALID); 3438 pmap_invalidate_range(pmap, va, va + size); 3439 3440 /* Create the new mapping */ 3441 pmap_store(pte, newpte); 3442 dsb(ishst); 3443 3444 intr_restore(intr); 3445 } 3446 3447 #if VM_NRESERVLEVEL > 0 3448 /* 3449 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3450 * replace the many pv entries for the 4KB page mappings by a single pv entry 3451 * for the 2MB page mapping. 3452 */ 3453 static void 3454 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3455 struct rwlock **lockp) 3456 { 3457 struct md_page *pvh; 3458 pv_entry_t pv; 3459 vm_offset_t va_last; 3460 vm_page_t m; 3461 3462 KASSERT((pa & L2_OFFSET) == 0, 3463 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 3464 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3465 3466 /* 3467 * Transfer the first page's pv entry for this mapping to the 2mpage's 3468 * pv list. 
Aside from avoiding the cost of a call to get_pv_entry(), 3469 * a transfer avoids the possibility that get_pv_entry() calls 3470 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3471 * mappings that is being promoted. 3472 */ 3473 m = PHYS_TO_VM_PAGE(pa); 3474 va = va & ~L2_OFFSET; 3475 pv = pmap_pvh_remove(&m->md, pmap, va); 3476 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 3477 pvh = pa_to_pvh(pa); 3478 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3479 pvh->pv_gen++; 3480 /* Free the remaining NPTEPG - 1 pv entries. */ 3481 va_last = va + L2_SIZE - PAGE_SIZE; 3482 do { 3483 m++; 3484 va += PAGE_SIZE; 3485 pmap_pvh_free(&m->md, pmap, va); 3486 } while (va < va_last); 3487 } 3488 3489 /* 3490 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3491 * single level 2 table entry to a single 2MB page mapping. For promotion 3492 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3493 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3494 * identical characteristics. 3495 */ 3496 static void 3497 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 3498 struct rwlock **lockp) 3499 { 3500 pt_entry_t *firstl3, *l3, newl2, oldl3, pa; 3501 vm_page_t mpte; 3502 vm_offset_t sva; 3503 3504 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3505 PMAP_ASSERT_STAGE1(pmap); 3506 3507 sva = va & ~L2_OFFSET; 3508 firstl3 = pmap_l2_to_l3(l2, sva); 3509 newl2 = pmap_load(firstl3); 3510 3511 setl2: 3512 if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { 3513 atomic_add_long(&pmap_l2_p_failures, 1); 3514 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3515 " in pmap %p", va, pmap); 3516 return; 3517 } 3518 3519 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3520 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3521 /* 3522 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 3523 * ATTR_SW_DBM can be cleared without a TLB invalidation. 3524 */ 3525 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 3526 goto setl2; 3527 newl2 &= ~ATTR_SW_DBM; 3528 } 3529 3530 pa = newl2 + L2_SIZE - PAGE_SIZE; 3531 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 3532 oldl3 = pmap_load(l3); 3533 setl3: 3534 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3535 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3536 /* 3537 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 3538 * set, ATTR_SW_DBM can be cleared without a TLB 3539 * invalidation. 3540 */ 3541 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 3542 ~ATTR_SW_DBM)) 3543 goto setl3; 3544 oldl3 &= ~ATTR_SW_DBM; 3545 } 3546 if (oldl3 != pa) { 3547 atomic_add_long(&pmap_l2_p_failures, 1); 3548 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3549 " in pmap %p", va, pmap); 3550 return; 3551 } 3552 pa -= PAGE_SIZE; 3553 } 3554 3555 /* 3556 * Save the page table page in its current state until the L2 3557 * mapping the superpage is demoted by pmap_demote_l2() or 3558 * destroyed by pmap_remove_l3(). 
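 * Keeping the filled L3 page (inserted with "promoted" set below) lets
 * a later demotion reinstall the 512 4KB mappings without allocating
 * and repopulating a new page table page.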
3559 */ 3560 mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 3561 KASSERT(mpte >= vm_page_array && 3562 mpte < &vm_page_array[vm_page_array_size], 3563 ("pmap_promote_l2: page table page is out of range")); 3564 KASSERT(mpte->pindex == pmap_l2_pindex(va), 3565 ("pmap_promote_l2: page table page's pindex is wrong")); 3566 if (pmap_insert_pt_page(pmap, mpte, true)) { 3567 atomic_add_long(&pmap_l2_p_failures, 1); 3568 CTR2(KTR_PMAP, 3569 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 3570 pmap); 3571 return; 3572 } 3573 3574 if ((newl2 & ATTR_SW_MANAGED) != 0) 3575 pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); 3576 3577 newl2 &= ~ATTR_DESCR_MASK; 3578 newl2 |= L2_BLOCK; 3579 3580 pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); 3581 3582 atomic_add_long(&pmap_l2_promotions, 1); 3583 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 3584 pmap); 3585 } 3586 #endif /* VM_NRESERVLEVEL > 0 */ 3587 3588 static int 3589 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 3590 int psind) 3591 { 3592 pd_entry_t *l0p, *l1p, *l2p, origpte; 3593 vm_page_t mp; 3594 3595 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3596 KASSERT(psind > 0 && psind < MAXPAGESIZES, 3597 ("psind %d unexpected", psind)); 3598 KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0, 3599 ("unaligned phys address %#lx newpte %#lx psind %d", 3600 (newpte & ~ATTR_MASK), newpte, psind)); 3601 3602 restart: 3603 if (psind == 2) { 3604 l0p = pmap_l0(pmap, va); 3605 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 3606 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 3607 if (mp == NULL) { 3608 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 3609 return (KERN_RESOURCE_SHORTAGE); 3610 PMAP_UNLOCK(pmap); 3611 vm_wait(NULL); 3612 PMAP_LOCK(pmap); 3613 goto restart; 3614 } 3615 l1p = pmap_l0_to_l1(l0p, va); 3616 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 3617 origpte = pmap_load(l1p); 3618 } else { 3619 l1p = pmap_l0_to_l1(l0p, va); 3620 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 3621 origpte = pmap_load(l1p); 3622 if ((origpte & ATTR_DESCR_VALID) == 0) { 3623 mp = PHYS_TO_VM_PAGE(pmap_load(l0p) & 3624 ~ATTR_MASK); 3625 mp->ref_count++; 3626 } 3627 } 3628 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 3629 ((origpte & ATTR_DESCR_MASK) == L1_BLOCK && 3630 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 3631 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 3632 va, origpte, newpte)); 3633 pmap_store(l1p, newpte); 3634 } else /* (psind == 1) */ { 3635 l2p = pmap_l2(pmap, va); 3636 if (l2p == NULL) { 3637 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 3638 if (mp == NULL) { 3639 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 3640 return (KERN_RESOURCE_SHORTAGE); 3641 PMAP_UNLOCK(pmap); 3642 vm_wait(NULL); 3643 PMAP_LOCK(pmap); 3644 goto restart; 3645 } 3646 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 3647 l2p = &l2p[pmap_l2_index(va)]; 3648 origpte = pmap_load(l2p); 3649 } else { 3650 l1p = pmap_l1(pmap, va); 3651 origpte = pmap_load(l2p); 3652 if ((origpte & ATTR_DESCR_VALID) == 0) { 3653 mp = PHYS_TO_VM_PAGE(pmap_load(l1p) & 3654 ~ATTR_MASK); 3655 mp->ref_count++; 3656 } 3657 } 3658 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 3659 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 3660 (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), 3661 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 3662 va, origpte, newpte)); 3663 pmap_store(l2p, newpte); 3664 } 3665 dsb(ishst); 3666 3667 if ((origpte & ATTR_DESCR_VALID) == 0) 3668 
pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 3669 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 3670 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 3671 else if ((newpte & ATTR_SW_WIRED) == 0 && 3672 (origpte & ATTR_SW_WIRED) != 0) 3673 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 3674 3675 return (KERN_SUCCESS); 3676 } 3677 3678 /* 3679 * Insert the given physical page (p) at 3680 * the specified virtual address (v) in the 3681 * target physical map with the protection requested. 3682 * 3683 * If specified, the page will be wired down, meaning 3684 * that the related pte can not be reclaimed. 3685 * 3686 * NB: This is the only routine which MAY NOT lazy-evaluate 3687 * or lose information. That is, this routine must actually 3688 * insert this page into the given map NOW. 3689 */ 3690 int 3691 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3692 u_int flags, int8_t psind) 3693 { 3694 struct rwlock *lock; 3695 pd_entry_t *pde; 3696 pt_entry_t new_l3, orig_l3; 3697 pt_entry_t *l2, *l3; 3698 pv_entry_t pv; 3699 vm_paddr_t opa, pa; 3700 vm_page_t mpte, om; 3701 boolean_t nosleep; 3702 int lvl, rv; 3703 3704 va = trunc_page(va); 3705 if ((m->oflags & VPO_UNMANAGED) == 0) 3706 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3707 pa = VM_PAGE_TO_PHYS(m); 3708 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE); 3709 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 3710 new_l3 |= pmap_pte_prot(pmap, prot); 3711 3712 if ((flags & PMAP_ENTER_WIRED) != 0) 3713 new_l3 |= ATTR_SW_WIRED; 3714 if (pmap->pm_stage == PM_STAGE1) { 3715 if (va < VM_MAXUSER_ADDRESS) 3716 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 3717 else 3718 new_l3 |= ATTR_S1_UXN; 3719 if (pmap != kernel_pmap) 3720 new_l3 |= ATTR_S1_nG; 3721 } else { 3722 /* 3723 * Clear the access flag on executable mappings, this will be 3724 * set later when the page is accessed. The fault handler is 3725 * required to invalidate the I-cache. 3726 * 3727 * TODO: Switch to the valid flag to allow hardware management 3728 * of the access flag. Much of the pmap code assumes the 3729 * valid flag is set and fails to destroy the old page tables 3730 * correctly if it is clear. 3731 */ 3732 if (prot & VM_PROT_EXECUTE) 3733 new_l3 &= ~ATTR_AF; 3734 } 3735 if ((m->oflags & VPO_UNMANAGED) == 0) { 3736 new_l3 |= ATTR_SW_MANAGED; 3737 if ((prot & VM_PROT_WRITE) != 0) { 3738 new_l3 |= ATTR_SW_DBM; 3739 if ((flags & VM_PROT_WRITE) == 0) { 3740 if (pmap->pm_stage == PM_STAGE1) 3741 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 3742 else 3743 new_l3 &= 3744 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 3745 } 3746 } 3747 } 3748 3749 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 3750 3751 lock = NULL; 3752 PMAP_LOCK(pmap); 3753 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 3754 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 3755 ("managed largepage va %#lx flags %#x", va, flags)); 3756 new_l3 &= ~L3_PAGE; 3757 if (psind == 2) 3758 new_l3 |= L1_BLOCK; 3759 else /* (psind == 1) */ 3760 new_l3 |= L2_BLOCK; 3761 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 3762 goto out; 3763 } 3764 if (psind == 1) { 3765 /* Assert the required virtual and physical alignment. 
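 * psind == 1 requests a 2MB mapping directly: "va" must be L2-aligned
 * and "m" must begin a fully populated superpage (m->psind > 0), after
 * which pmap_enter_l2() installs a single L2_BLOCK entry.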
*/ 3766 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 3767 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3768 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 3769 flags, m, &lock); 3770 goto out; 3771 } 3772 mpte = NULL; 3773 3774 /* 3775 * In the case that a page table page is not 3776 * resident, we are creating it here. 3777 */ 3778 retry: 3779 pde = pmap_pde(pmap, va, &lvl); 3780 if (pde != NULL && lvl == 2) { 3781 l3 = pmap_l2_to_l3(pde, va); 3782 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 3783 mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 3784 mpte->ref_count++; 3785 } 3786 goto havel3; 3787 } else if (pde != NULL && lvl == 1) { 3788 l2 = pmap_l1_to_l2(pde, va); 3789 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 3790 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 3791 l3 = &l3[pmap_l3_index(va)]; 3792 if (va < VM_MAXUSER_ADDRESS) { 3793 mpte = PHYS_TO_VM_PAGE( 3794 pmap_load(l2) & ~ATTR_MASK); 3795 mpte->ref_count++; 3796 } 3797 goto havel3; 3798 } 3799 /* We need to allocate an L3 table. */ 3800 } 3801 if (va < VM_MAXUSER_ADDRESS) { 3802 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 3803 3804 /* 3805 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 3806 * to handle the possibility that a superpage mapping for "va" 3807 * was created while we slept. 3808 */ 3809 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 3810 nosleep ? NULL : &lock); 3811 if (mpte == NULL && nosleep) { 3812 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 3813 rv = KERN_RESOURCE_SHORTAGE; 3814 goto out; 3815 } 3816 goto retry; 3817 } else 3818 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 3819 3820 havel3: 3821 orig_l3 = pmap_load(l3); 3822 opa = orig_l3 & ~ATTR_MASK; 3823 pv = NULL; 3824 3825 /* 3826 * Is the specified virtual address already mapped? 3827 */ 3828 if (pmap_l3_valid(orig_l3)) { 3829 /* 3830 * Only allow adding new entries on stage 2 tables for now. 3831 * This simplifies cache invalidation as we may need to call 3832 * into EL2 to perform such actions. 3833 */ 3834 PMAP_ASSERT_STAGE1(pmap); 3835 /* 3836 * Wiring change, just update stats. We don't worry about 3837 * wiring PT pages as they remain resident as long as there 3838 * are valid mappings in them. Hence, if a user page is wired, 3839 * the PT page will be also. 3840 */ 3841 if ((flags & PMAP_ENTER_WIRED) != 0 && 3842 (orig_l3 & ATTR_SW_WIRED) == 0) 3843 pmap->pm_stats.wired_count++; 3844 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3845 (orig_l3 & ATTR_SW_WIRED) != 0) 3846 pmap->pm_stats.wired_count--; 3847 3848 /* 3849 * Remove the extra PT page reference. 3850 */ 3851 if (mpte != NULL) { 3852 mpte->ref_count--; 3853 KASSERT(mpte->ref_count > 0, 3854 ("pmap_enter: missing reference to page table page," 3855 " va: 0x%lx", va)); 3856 } 3857 3858 /* 3859 * Has the physical page changed? 3860 */ 3861 if (opa == pa) { 3862 /* 3863 * No, might be a protection or wiring change. 3864 */ 3865 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 3866 (new_l3 & ATTR_SW_DBM) != 0) 3867 vm_page_aflag_set(m, PGA_WRITEABLE); 3868 goto validate; 3869 } 3870 3871 /* 3872 * The physical page has changed. Temporarily invalidate 3873 * the mapping. 
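 * The old entry is cleared first and, if it was ever accessed, its stale
 * TLB entry is flushed; the replacement entry is written below at
 * "validate".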
3874 */
3875 orig_l3 = pmap_load_clear(l3);
3876 KASSERT((orig_l3 & ~ATTR_MASK) == opa,
3877 ("pmap_enter: unexpected pa update for %#lx", va));
3878 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3879 om = PHYS_TO_VM_PAGE(opa);
3880
3881 /*
3882 * The pmap lock is sufficient to synchronize with
3883 * concurrent calls to pmap_page_test_mappings() and
3884 * pmap_ts_referenced().
3885 */
3886 if (pmap_pte_dirty(pmap, orig_l3))
3887 vm_page_dirty(om);
3888 if ((orig_l3 & ATTR_AF) != 0) {
3889 pmap_invalidate_page(pmap, va);
3890 vm_page_aflag_set(om, PGA_REFERENCED);
3891 }
3892 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3893 pv = pmap_pvh_remove(&om->md, pmap, va);
3894 if ((m->oflags & VPO_UNMANAGED) != 0)
3895 free_pv_entry(pmap, pv);
3896 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3897 TAILQ_EMPTY(&om->md.pv_list) &&
3898 ((om->flags & PG_FICTITIOUS) != 0 ||
3899 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3900 vm_page_aflag_clear(om, PGA_WRITEABLE);
3901 } else {
3902 KASSERT((orig_l3 & ATTR_AF) != 0,
3903 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
3904 pmap_invalidate_page(pmap, va);
3905 }
3906 orig_l3 = 0;
3907 } else {
3908 /*
3909 * Increment the counters.
3910 */
3911 if ((new_l3 & ATTR_SW_WIRED) != 0)
3912 pmap->pm_stats.wired_count++;
3913 pmap_resident_count_inc(pmap, 1);
3914 }
3915 /*
3916 * Enter on the PV list if part of our managed memory.
3917 */
3918 if ((m->oflags & VPO_UNMANAGED) == 0) {
3919 if (pv == NULL) {
3920 pv = get_pv_entry(pmap, &lock);
3921 pv->pv_va = va;
3922 }
3923 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3924 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3925 m->md.pv_gen++;
3926 if ((new_l3 & ATTR_SW_DBM) != 0)
3927 vm_page_aflag_set(m, PGA_WRITEABLE);
3928 }
3929
3930 validate:
3931 if (pmap->pm_stage == PM_STAGE1) {
3932 /*
3933 * Sync the icache if exec permission is requested and the
3934 * memory attribute is VM_MEMATTR_WRITE_BACK. Do it now, before
3935 * the mapping is stored and made valid for the hardware table
3936 * walk. If done later, other threads could access this page
3937 * before the caches are properly synced. Don't do it for
3938 * kernel memory which is mapped with exec permission even if
3939 * the memory isn't going to hold executable code. For kernel
3940 * memory, the only time an icache sync is needed is after a
3941 * module is loaded and its relocations are processed, in elf_cpu_load_file().
3942 */
3943 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
3944 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
3945 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
3946 PMAP_ASSERT_STAGE1(pmap);
3947 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
3948 }
3949 } else {
3950 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
3951 }
3952
3953 /*
3954 * Update the L3 entry.
3955 */
3956 if (pmap_l3_valid(orig_l3)) {
3957 PMAP_ASSERT_STAGE1(pmap);
3958 KASSERT(opa == pa, ("pmap_enter: invalid update"));
3959 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
3960 /* same PA, different attributes */
3961 orig_l3 = pmap_load_store(l3, new_l3);
3962 pmap_invalidate_page(pmap, va);
3963 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
3964 pmap_pte_dirty(pmap, orig_l3))
3965 vm_page_dirty(m);
3966 } else {
3967 /*
3968 * orig_l3 == new_l3
3969 * This can happen if multiple threads simultaneously
3970 * access a page that is not yet mapped. It is bad for
3971 * performance since it can cause a full
3972 * demotion-NOP-promotion cycle.
3973 * Other possible reasons are:
3974 * - the VM and pmap memory layouts have diverged
3975 * - a TLB flush is missing somewhere and the CPU doesn't
3976 * see the actual mapping.
3977 */
3978 CTR4(KTR_PMAP, "%s: already mapped page - "
3979 "pmap %p va 0x%#lx pte 0x%lx",
3980 __func__, pmap, va, new_l3);
3981 }
3982 } else {
3983 /* New mapping */
3984 pmap_store(l3, new_l3);
3985 dsb(ishst);
3986 }
3987
3988 #if VM_NRESERVLEVEL > 0
3989 /*
3990 * Try to promote from level 3 pages to a level 2 superpage. This
3991 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at
3992 * stage 1 specific fields and performs a break-before-make sequence
3993 * that is incorrect for a stage 2 pmap.
3994 */
3995 if ((mpte == NULL || mpte->ref_count == NL3PG) &&
3996 pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
3997 (m->flags & PG_FICTITIOUS) == 0 &&
3998 vm_reserv_level_iffullpop(m) == 0) {
3999 pmap_promote_l2(pmap, pde, va, &lock);
4000 }
4001 #endif
4002
4003 rv = KERN_SUCCESS;
4004 out:
4005 if (lock != NULL)
4006 rw_wunlock(lock);
4007 PMAP_UNLOCK(pmap);
4008 return (rv);
4009 }
4010
4011 /*
4012 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true
4013 * if successful. Returns false if (1) a page table page cannot be allocated
4014 * without sleeping, (2) a mapping already exists at the specified virtual
4015 * address, or (3) a PV entry cannot be allocated without reclaiming another
4016 * PV entry.
4017 */
4018 static bool
4019 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4020 struct rwlock **lockp)
4021 {
4022 pd_entry_t new_l2;
4023
4024 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4025 PMAP_ASSERT_STAGE1(pmap);
4026
4027 new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
4028 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4029 L2_BLOCK);
4030 if ((m->oflags & VPO_UNMANAGED) == 0) {
4031 new_l2 |= ATTR_SW_MANAGED;
4032 new_l2 &= ~ATTR_AF;
4033 }
4034 if ((prot & VM_PROT_EXECUTE) == 0 ||
4035 m->md.pv_memattr == VM_MEMATTR_DEVICE)
4036 new_l2 |= ATTR_S1_XN;
4037 if (va < VM_MAXUSER_ADDRESS)
4038 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4039 else
4040 new_l2 |= ATTR_S1_UXN;
4041 if (pmap != kernel_pmap)
4042 new_l2 |= ATTR_S1_nG;
4043 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4044 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
4045 KERN_SUCCESS);
4046 }
4047
4048 /*
4049 * Returns true if every page table entry in the specified page table is
4050 * zero.
4051 */
4052 static bool
4053 pmap_every_pte_zero(vm_paddr_t pa)
4054 {
4055 pt_entry_t *pt_end, *pte;
4056
4057 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4058 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4059 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4060 if (*pte != 0)
4061 return (false);
4062 }
4063 return (true);
4064 }
4065
4066 /*
4067 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
4068 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
4069 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
4070 * a mapping already exists at the specified virtual address. Returns
4071 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
4072 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if
4073 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
4074 *
4075 * The parameter "m" is only used when creating a managed, writeable mapping.
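 * In that case PGA_WRITEABLE is set on every 4KB page backing the 2MB
 * mapping.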
4076 */ 4077 static int 4078 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 4079 vm_page_t m, struct rwlock **lockp) 4080 { 4081 struct spglist free; 4082 pd_entry_t *l2, old_l2; 4083 vm_page_t l2pg, mt; 4084 4085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4086 4087 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 4088 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 4089 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 4090 va, pmap); 4091 return (KERN_RESOURCE_SHORTAGE); 4092 } 4093 4094 /* 4095 * If there are existing mappings, either abort or remove them. 4096 */ 4097 if ((old_l2 = pmap_load(l2)) != 0) { 4098 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 4099 ("pmap_enter_l2: l2pg's ref count is too low")); 4100 if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < 4101 VM_MAXUSER_ADDRESS || (old_l2 & ATTR_DESCR_MASK) == 4102 L2_BLOCK || !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) { 4103 if (l2pg != NULL) 4104 l2pg->ref_count--; 4105 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx" 4106 " in pmap %p", va, pmap); 4107 return (KERN_FAILURE); 4108 } 4109 SLIST_INIT(&free); 4110 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 4111 (void)pmap_remove_l2(pmap, l2, va, 4112 pmap_load(pmap_l1(pmap, va)), &free, lockp); 4113 else 4114 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 4115 &free, lockp); 4116 if (va < VM_MAXUSER_ADDRESS) { 4117 vm_page_free_pages_toq(&free, true); 4118 KASSERT(pmap_load(l2) == 0, 4119 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 4120 } else { 4121 KASSERT(SLIST_EMPTY(&free), 4122 ("pmap_enter_l2: freed kernel page table page")); 4123 4124 /* 4125 * Both pmap_remove_l2() and pmap_remove_l3_range() 4126 * will leave the kernel page table page zero filled. 4127 * Nonetheless, the TLB could have an intermediate 4128 * entry for the kernel page table page. 4129 */ 4130 mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 4131 if (pmap_insert_pt_page(pmap, mt, false)) 4132 panic("pmap_enter_l2: trie insert failed"); 4133 pmap_clear(l2); 4134 pmap_invalidate_page(pmap, va); 4135 } 4136 } 4137 4138 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 4139 /* 4140 * Abort this mapping if its PV entry could not be created. 4141 */ 4142 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 4143 if (l2pg != NULL) 4144 pmap_abort_ptp(pmap, va, l2pg); 4145 CTR2(KTR_PMAP, 4146 "pmap_enter_l2: failure for va %#lx in pmap %p", 4147 va, pmap); 4148 return (KERN_RESOURCE_SHORTAGE); 4149 } 4150 if ((new_l2 & ATTR_SW_DBM) != 0) 4151 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4152 vm_page_aflag_set(mt, PGA_WRITEABLE); 4153 } 4154 4155 /* 4156 * Increment counters. 4157 */ 4158 if ((new_l2 & ATTR_SW_WIRED) != 0) 4159 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 4160 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 4161 4162 /* 4163 * Map the superpage. 4164 */ 4165 pmap_store(l2, new_l2); 4166 dsb(ishst); 4167 4168 atomic_add_long(&pmap_l2_mappings, 1); 4169 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 4170 va, pmap); 4171 4172 return (KERN_SUCCESS); 4173 } 4174 4175 /* 4176 * Maps a sequence of resident pages belonging to the same object. 4177 * The sequence begins with the given page m_start. This page is 4178 * mapped at the given virtual address start. Each subsequent page is 4179 * mapped at a virtual address that is offset from start by the same 4180 * amount as the page is offset from m_start within the object. 
The 4181 * last page in the sequence is the page with the largest offset from 4182 * m_start that can be mapped at a virtual address less than the given 4183 * virtual address end. Not every virtual page between start and end 4184 * is mapped; only those for which a resident page exists with the 4185 * corresponding offset from m_start are mapped. 4186 */ 4187 void 4188 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4189 vm_page_t m_start, vm_prot_t prot) 4190 { 4191 struct rwlock *lock; 4192 vm_offset_t va; 4193 vm_page_t m, mpte; 4194 vm_pindex_t diff, psize; 4195 4196 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4197 4198 psize = atop(end - start); 4199 mpte = NULL; 4200 m = m_start; 4201 lock = NULL; 4202 PMAP_LOCK(pmap); 4203 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4204 va = start + ptoa(diff); 4205 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 4206 m->psind == 1 && pmap_ps_enabled(pmap) && 4207 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 4208 m = &m[L2_SIZE / PAGE_SIZE - 1]; 4209 else 4210 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 4211 &lock); 4212 m = TAILQ_NEXT(m, listq); 4213 } 4214 if (lock != NULL) 4215 rw_wunlock(lock); 4216 PMAP_UNLOCK(pmap); 4217 } 4218 4219 /* 4220 * this code makes some *MAJOR* assumptions: 4221 * 1. Current pmap & pmap exists. 4222 * 2. Not wired. 4223 * 3. Read access. 4224 * 4. No page table pages. 4225 * but is *MUCH* faster than pmap_enter... 4226 */ 4227 4228 void 4229 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4230 { 4231 struct rwlock *lock; 4232 4233 lock = NULL; 4234 PMAP_LOCK(pmap); 4235 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4236 if (lock != NULL) 4237 rw_wunlock(lock); 4238 PMAP_UNLOCK(pmap); 4239 } 4240 4241 static vm_page_t 4242 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4243 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4244 { 4245 pd_entry_t *pde; 4246 pt_entry_t *l2, *l3, l3_val; 4247 vm_paddr_t pa; 4248 int lvl; 4249 4250 KASSERT(!VA_IS_CLEANMAP(va) || 4251 (m->oflags & VPO_UNMANAGED) != 0, 4252 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4253 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4254 PMAP_ASSERT_STAGE1(pmap); 4255 4256 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 4257 /* 4258 * In the case that a page table page is not 4259 * resident, we are creating it here. 4260 */ 4261 if (va < VM_MAXUSER_ADDRESS) { 4262 vm_pindex_t l2pindex; 4263 4264 /* 4265 * Calculate pagetable page index 4266 */ 4267 l2pindex = pmap_l2_pindex(va); 4268 if (mpte && (mpte->pindex == l2pindex)) { 4269 mpte->ref_count++; 4270 } else { 4271 /* 4272 * Get the l2 entry 4273 */ 4274 pde = pmap_pde(pmap, va, &lvl); 4275 4276 /* 4277 * If the page table page is mapped, we just increment 4278 * the hold count, and activate it. Otherwise, we 4279 * attempt to allocate a page table page. If this 4280 * attempt fails, we don't retry. Instead, we give up. 4281 */ 4282 if (lvl == 1) { 4283 l2 = pmap_l1_to_l2(pde, va); 4284 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 4285 L2_BLOCK) 4286 return (NULL); 4287 } 4288 if (lvl == 2 && pmap_load(pde) != 0) { 4289 mpte = 4290 PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 4291 mpte->ref_count++; 4292 } else { 4293 /* 4294 * Pass NULL instead of the PV list lock 4295 * pointer, because we don't intend to sleep. 
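 * With a NULL lock pointer the allocation simply fails instead of sleeping
 * for a free page.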
4296 */ 4297 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 4298 if (mpte == NULL) 4299 return (mpte); 4300 } 4301 } 4302 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4303 l3 = &l3[pmap_l3_index(va)]; 4304 } else { 4305 mpte = NULL; 4306 pde = pmap_pde(kernel_pmap, va, &lvl); 4307 KASSERT(pde != NULL, 4308 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 4309 va)); 4310 KASSERT(lvl == 2, 4311 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 4312 l3 = pmap_l2_to_l3(pde, va); 4313 } 4314 4315 /* 4316 * Abort if a mapping already exists. 4317 */ 4318 if (pmap_load(l3) != 0) { 4319 if (mpte != NULL) 4320 mpte->ref_count--; 4321 return (NULL); 4322 } 4323 4324 /* 4325 * Enter on the PV list if part of our managed memory. 4326 */ 4327 if ((m->oflags & VPO_UNMANAGED) == 0 && 4328 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4329 if (mpte != NULL) 4330 pmap_abort_ptp(pmap, va, mpte); 4331 return (NULL); 4332 } 4333 4334 /* 4335 * Increment counters 4336 */ 4337 pmap_resident_count_inc(pmap, 1); 4338 4339 pa = VM_PAGE_TO_PHYS(m); 4340 l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 4341 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 4342 if ((prot & VM_PROT_EXECUTE) == 0 || 4343 m->md.pv_memattr == VM_MEMATTR_DEVICE) 4344 l3_val |= ATTR_S1_XN; 4345 if (va < VM_MAXUSER_ADDRESS) 4346 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 4347 else 4348 l3_val |= ATTR_S1_UXN; 4349 if (pmap != kernel_pmap) 4350 l3_val |= ATTR_S1_nG; 4351 4352 /* 4353 * Now validate mapping with RO protection 4354 */ 4355 if ((m->oflags & VPO_UNMANAGED) == 0) { 4356 l3_val |= ATTR_SW_MANAGED; 4357 l3_val &= ~ATTR_AF; 4358 } 4359 4360 /* Sync icache before the mapping is stored to PTE */ 4361 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4362 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 4363 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 4364 4365 pmap_store(l3, l3_val); 4366 dsb(ishst); 4367 4368 return (mpte); 4369 } 4370 4371 /* 4372 * This code maps large physical mmap regions into the 4373 * processor address space. Note that some shortcuts 4374 * are taken, but the code works. 4375 */ 4376 void 4377 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4378 vm_pindex_t pindex, vm_size_t size) 4379 { 4380 4381 VM_OBJECT_ASSERT_WLOCKED(object); 4382 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4383 ("pmap_object_init_pt: non-device object")); 4384 } 4385 4386 /* 4387 * Clear the wired attribute from the mappings for the specified range of 4388 * addresses in the given pmap. Every valid mapping within that range 4389 * must have the wired attribute set. In contrast, invalid mappings 4390 * cannot have the wired attribute set, so they are ignored. 4391 * 4392 * The wired attribute of the page table entry is not a hardware feature, 4393 * so there is no need to invalidate any TLB entries. 
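 * (ATTR_SW_WIRED lives in the software-defined bits of the descriptor and
 * is never consulted by the hardware table walker.)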
4394 */ 4395 void 4396 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4397 { 4398 vm_offset_t va_next; 4399 pd_entry_t *l0, *l1, *l2; 4400 pt_entry_t *l3; 4401 4402 PMAP_LOCK(pmap); 4403 for (; sva < eva; sva = va_next) { 4404 l0 = pmap_l0(pmap, sva); 4405 if (pmap_load(l0) == 0) { 4406 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4407 if (va_next < sva) 4408 va_next = eva; 4409 continue; 4410 } 4411 4412 l1 = pmap_l0_to_l1(l0, sva); 4413 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4414 if (va_next < sva) 4415 va_next = eva; 4416 if (pmap_load(l1) == 0) 4417 continue; 4418 4419 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4420 KASSERT(va_next <= eva, 4421 ("partial update of non-transparent 1G page " 4422 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4423 pmap_load(l1), sva, eva, va_next)); 4424 MPASS(pmap != kernel_pmap); 4425 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 4426 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 4427 pmap_clear_bits(l1, ATTR_SW_WIRED); 4428 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 4429 continue; 4430 } 4431 4432 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4433 if (va_next < sva) 4434 va_next = eva; 4435 4436 l2 = pmap_l1_to_l2(l1, sva); 4437 if (pmap_load(l2) == 0) 4438 continue; 4439 4440 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4441 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 4442 panic("pmap_unwire: l2 %#jx is missing " 4443 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 4444 4445 /* 4446 * Are we unwiring the entire large page? If not, 4447 * demote the mapping and fall through. 4448 */ 4449 if (sva + L2_SIZE == va_next && eva >= va_next) { 4450 pmap_clear_bits(l2, ATTR_SW_WIRED); 4451 pmap->pm_stats.wired_count -= L2_SIZE / 4452 PAGE_SIZE; 4453 continue; 4454 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 4455 panic("pmap_unwire: demotion failed"); 4456 } 4457 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4458 ("pmap_unwire: Invalid l2 entry after demotion")); 4459 4460 if (va_next > eva) 4461 va_next = eva; 4462 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 4463 sva += L3_SIZE) { 4464 if (pmap_load(l3) == 0) 4465 continue; 4466 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 4467 panic("pmap_unwire: l3 %#jx is missing " 4468 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 4469 4470 /* 4471 * ATTR_SW_WIRED must be cleared atomically. Although 4472 * the pmap lock synchronizes access to ATTR_SW_WIRED, 4473 * the System MMU may write to the entry concurrently. 4474 */ 4475 pmap_clear_bits(l3, ATTR_SW_WIRED); 4476 pmap->pm_stats.wired_count--; 4477 } 4478 } 4479 PMAP_UNLOCK(pmap); 4480 } 4481 4482 /* 4483 * Copy the range specified by src_addr/len 4484 * from the source map to the range dst_addr/len 4485 * in the destination map. 4486 * 4487 * This routine is only advisory and need not do anything. 4488 * 4489 * Because the executable mappings created by this routine are copied, 4490 * it should not have to flush the instruction cache. 
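 * In practice mappings are only copied when dst_addr == src_addr, typically
 * while duplicating a parent's address space during fork().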
4491 */ 4492 void 4493 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4494 vm_offset_t src_addr) 4495 { 4496 struct rwlock *lock; 4497 pd_entry_t *l0, *l1, *l2, srcptepaddr; 4498 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 4499 vm_offset_t addr, end_addr, va_next; 4500 vm_page_t dst_m, dstmpte, srcmpte; 4501 4502 PMAP_ASSERT_STAGE1(dst_pmap); 4503 PMAP_ASSERT_STAGE1(src_pmap); 4504 4505 if (dst_addr != src_addr) 4506 return; 4507 end_addr = src_addr + len; 4508 lock = NULL; 4509 if (dst_pmap < src_pmap) { 4510 PMAP_LOCK(dst_pmap); 4511 PMAP_LOCK(src_pmap); 4512 } else { 4513 PMAP_LOCK(src_pmap); 4514 PMAP_LOCK(dst_pmap); 4515 } 4516 for (addr = src_addr; addr < end_addr; addr = va_next) { 4517 l0 = pmap_l0(src_pmap, addr); 4518 if (pmap_load(l0) == 0) { 4519 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 4520 if (va_next < addr) 4521 va_next = end_addr; 4522 continue; 4523 } 4524 4525 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 4526 if (va_next < addr) 4527 va_next = end_addr; 4528 l1 = pmap_l0_to_l1(l0, addr); 4529 if (pmap_load(l1) == 0) 4530 continue; 4531 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4532 KASSERT(va_next <= end_addr, 4533 ("partial update of non-transparent 1G page " 4534 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 4535 pmap_load(l1), addr, end_addr, va_next)); 4536 srcptepaddr = pmap_load(l1); 4537 l1 = pmap_l1(dst_pmap, addr); 4538 if (l1 == NULL) { 4539 if (_pmap_alloc_l3(dst_pmap, 4540 pmap_l0_pindex(addr), NULL) == NULL) 4541 break; 4542 l1 = pmap_l1(dst_pmap, addr); 4543 } else { 4544 l0 = pmap_l0(dst_pmap, addr); 4545 dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) & 4546 ~ATTR_MASK); 4547 dst_m->ref_count++; 4548 } 4549 KASSERT(pmap_load(l1) == 0, 4550 ("1G mapping present in dst pmap " 4551 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 4552 pmap_load(l1), addr, end_addr, va_next)); 4553 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 4554 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 4555 continue; 4556 } 4557 4558 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 4559 if (va_next < addr) 4560 va_next = end_addr; 4561 l2 = pmap_l1_to_l2(l1, addr); 4562 srcptepaddr = pmap_load(l2); 4563 if (srcptepaddr == 0) 4564 continue; 4565 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4566 if ((addr & L2_OFFSET) != 0 || 4567 addr + L2_SIZE > end_addr) 4568 continue; 4569 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 4570 if (l2 == NULL) 4571 break; 4572 if (pmap_load(l2) == 0 && 4573 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 4574 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 4575 PMAP_ENTER_NORECLAIM, &lock))) { 4576 mask = ATTR_SW_WIRED; 4577 pmap_store(l2, srcptepaddr & ~mask); 4578 pmap_resident_count_inc(dst_pmap, L2_SIZE / 4579 PAGE_SIZE); 4580 atomic_add_long(&pmap_l2_mappings, 1); 4581 } else 4582 pmap_abort_ptp(dst_pmap, addr, dst_m); 4583 continue; 4584 } 4585 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 4586 ("pmap_copy: invalid L2 entry")); 4587 srcptepaddr &= ~ATTR_MASK; 4588 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4589 KASSERT(srcmpte->ref_count > 0, 4590 ("pmap_copy: source page table page is unused")); 4591 if (va_next > end_addr) 4592 va_next = end_addr; 4593 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4594 src_pte = &src_pte[pmap_l3_index(addr)]; 4595 dstmpte = NULL; 4596 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 4597 ptetemp = pmap_load(src_pte); 4598 4599 /* 4600 * We only virtual copy managed pages. 
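 * Unmanaged mappings, i.e. those without ATTR_SW_MANAGED, are skipped.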
4601 */ 4602 if ((ptetemp & ATTR_SW_MANAGED) == 0) 4603 continue; 4604 4605 if (dstmpte != NULL) { 4606 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 4607 ("dstmpte pindex/addr mismatch")); 4608 dstmpte->ref_count++; 4609 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 4610 NULL)) == NULL) 4611 goto out; 4612 dst_pte = (pt_entry_t *) 4613 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4614 dst_pte = &dst_pte[pmap_l3_index(addr)]; 4615 if (pmap_load(dst_pte) == 0 && 4616 pmap_try_insert_pv_entry(dst_pmap, addr, 4617 PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { 4618 /* 4619 * Clear the wired, modified, and accessed 4620 * (referenced) bits during the copy. 4621 */ 4622 mask = ATTR_AF | ATTR_SW_WIRED; 4623 nbits = 0; 4624 if ((ptetemp & ATTR_SW_DBM) != 0) 4625 nbits |= ATTR_S1_AP_RW_BIT; 4626 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 4627 pmap_resident_count_inc(dst_pmap, 1); 4628 } else { 4629 pmap_abort_ptp(dst_pmap, addr, dstmpte); 4630 goto out; 4631 } 4632 /* Have we copied all of the valid mappings? */ 4633 if (dstmpte->ref_count >= srcmpte->ref_count) 4634 break; 4635 } 4636 } 4637 out: 4638 /* 4639 * XXX This barrier may not be needed because the destination pmap is 4640 * not active. 4641 */ 4642 dsb(ishst); 4643 4644 if (lock != NULL) 4645 rw_wunlock(lock); 4646 PMAP_UNLOCK(src_pmap); 4647 PMAP_UNLOCK(dst_pmap); 4648 } 4649 4650 /* 4651 * pmap_zero_page zeros the specified hardware page by mapping 4652 * the page into KVM and using bzero to clear its contents. 4653 */ 4654 void 4655 pmap_zero_page(vm_page_t m) 4656 { 4657 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4658 4659 pagezero((void *)va); 4660 } 4661 4662 /* 4663 * pmap_zero_page_area zeros the specified hardware page by mapping 4664 * the page into KVM and using bzero to clear its contents. 4665 * 4666 * off and size may not cover an area beyond a single hardware page. 4667 */ 4668 void 4669 pmap_zero_page_area(vm_page_t m, int off, int size) 4670 { 4671 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4672 4673 if (off == 0 && size == PAGE_SIZE) 4674 pagezero((void *)va); 4675 else 4676 bzero((char *)va + off, size); 4677 } 4678 4679 /* 4680 * pmap_copy_page copies the specified (machine independent) 4681 * page by mapping the page into virtual memory and using 4682 * bcopy to copy the page, one machine dependent page at a 4683 * time. 
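 * On arm64 both pages are addressed through the direct map, so no temporary
 * kernel mappings are needed.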
4684 */ 4685 void 4686 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 4687 { 4688 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 4689 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 4690 4691 pagecopy((void *)src, (void *)dst); 4692 } 4693 4694 int unmapped_buf_allowed = 1; 4695 4696 void 4697 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4698 vm_offset_t b_offset, int xfersize) 4699 { 4700 void *a_cp, *b_cp; 4701 vm_page_t m_a, m_b; 4702 vm_paddr_t p_a, p_b; 4703 vm_offset_t a_pg_offset, b_pg_offset; 4704 int cnt; 4705 4706 while (xfersize > 0) { 4707 a_pg_offset = a_offset & PAGE_MASK; 4708 m_a = ma[a_offset >> PAGE_SHIFT]; 4709 p_a = m_a->phys_addr; 4710 b_pg_offset = b_offset & PAGE_MASK; 4711 m_b = mb[b_offset >> PAGE_SHIFT]; 4712 p_b = m_b->phys_addr; 4713 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4714 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4715 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 4716 panic("!DMAP a %lx", p_a); 4717 } else { 4718 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 4719 } 4720 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 4721 panic("!DMAP b %lx", p_b); 4722 } else { 4723 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 4724 } 4725 bcopy(a_cp, b_cp, cnt); 4726 a_offset += cnt; 4727 b_offset += cnt; 4728 xfersize -= cnt; 4729 } 4730 } 4731 4732 vm_offset_t 4733 pmap_quick_enter_page(vm_page_t m) 4734 { 4735 4736 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 4737 } 4738 4739 void 4740 pmap_quick_remove_page(vm_offset_t addr) 4741 { 4742 } 4743 4744 /* 4745 * Returns true if the pmap's pv is one of the first 4746 * 16 pvs linked to from this page. This count may 4747 * be changed upwards or downwards in the future; it 4748 * is only necessary that true be returned for a small 4749 * subset of pmaps for proper page aging. 4750 */ 4751 boolean_t 4752 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4753 { 4754 struct md_page *pvh; 4755 struct rwlock *lock; 4756 pv_entry_t pv; 4757 int loops = 0; 4758 boolean_t rv; 4759 4760 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4761 ("pmap_page_exists_quick: page %p is not managed", m)); 4762 rv = FALSE; 4763 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4764 rw_rlock(lock); 4765 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4766 if (PV_PMAP(pv) == pmap) { 4767 rv = TRUE; 4768 break; 4769 } 4770 loops++; 4771 if (loops >= 16) 4772 break; 4773 } 4774 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4775 pvh = page_to_pvh(m); 4776 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4777 if (PV_PMAP(pv) == pmap) { 4778 rv = TRUE; 4779 break; 4780 } 4781 loops++; 4782 if (loops >= 16) 4783 break; 4784 } 4785 } 4786 rw_runlock(lock); 4787 return (rv); 4788 } 4789 4790 /* 4791 * pmap_page_wired_mappings: 4792 * 4793 * Return the number of managed mappings to the given physical page 4794 * that are wired. 
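 * If a pmap's lock cannot be taken without blocking, the PV list lock is
 * briefly dropped; the scan restarts only if the page's PV lists changed
 * in the meantime.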
4795 */ 4796 int 4797 pmap_page_wired_mappings(vm_page_t m) 4798 { 4799 struct rwlock *lock; 4800 struct md_page *pvh; 4801 pmap_t pmap; 4802 pt_entry_t *pte; 4803 pv_entry_t pv; 4804 int count, lvl, md_gen, pvh_gen; 4805 4806 if ((m->oflags & VPO_UNMANAGED) != 0) 4807 return (0); 4808 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4809 rw_rlock(lock); 4810 restart: 4811 count = 0; 4812 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4813 pmap = PV_PMAP(pv); 4814 if (!PMAP_TRYLOCK(pmap)) { 4815 md_gen = m->md.pv_gen; 4816 rw_runlock(lock); 4817 PMAP_LOCK(pmap); 4818 rw_rlock(lock); 4819 if (md_gen != m->md.pv_gen) { 4820 PMAP_UNLOCK(pmap); 4821 goto restart; 4822 } 4823 } 4824 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4825 if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) 4826 count++; 4827 PMAP_UNLOCK(pmap); 4828 } 4829 if ((m->flags & PG_FICTITIOUS) == 0) { 4830 pvh = page_to_pvh(m); 4831 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4832 pmap = PV_PMAP(pv); 4833 if (!PMAP_TRYLOCK(pmap)) { 4834 md_gen = m->md.pv_gen; 4835 pvh_gen = pvh->pv_gen; 4836 rw_runlock(lock); 4837 PMAP_LOCK(pmap); 4838 rw_rlock(lock); 4839 if (md_gen != m->md.pv_gen || 4840 pvh_gen != pvh->pv_gen) { 4841 PMAP_UNLOCK(pmap); 4842 goto restart; 4843 } 4844 } 4845 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4846 if (pte != NULL && 4847 (pmap_load(pte) & ATTR_SW_WIRED) != 0) 4848 count++; 4849 PMAP_UNLOCK(pmap); 4850 } 4851 } 4852 rw_runlock(lock); 4853 return (count); 4854 } 4855 4856 /* 4857 * Returns true if the given page is mapped individually or as part of 4858 * a 2mpage. Otherwise, returns false. 4859 */ 4860 bool 4861 pmap_page_is_mapped(vm_page_t m) 4862 { 4863 struct rwlock *lock; 4864 bool rv; 4865 4866 if ((m->oflags & VPO_UNMANAGED) != 0) 4867 return (false); 4868 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4869 rw_rlock(lock); 4870 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4871 ((m->flags & PG_FICTITIOUS) == 0 && 4872 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 4873 rw_runlock(lock); 4874 return (rv); 4875 } 4876 4877 /* 4878 * Destroy all managed, non-wired mappings in the given user-space 4879 * pmap. This pmap cannot be active on any processor besides the 4880 * caller. 4881 * 4882 * This function cannot be applied to the kernel pmap. Moreover, it 4883 * is not intended for general use. It is only to be used during 4884 * process termination. Consequently, it can be implemented in ways 4885 * that make it faster than pmap_remove(). First, it can more quickly 4886 * destroy mappings by iterating over the pmap's collection of PV 4887 * entries, rather than searching the page table. Second, it doesn't 4888 * have to test and clear the page table entries atomically, because 4889 * no processor is currently accessing the user address space. In 4890 * particular, a page table entry's dirty bit won't change state once 4891 * this function starts. 
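 * The PV entries are visited by walking the pmap's pv_chunk list; each
 * chunk's pc_map bitmap identifies which of its entries are in use.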
4892 */ 4893 void 4894 pmap_remove_pages(pmap_t pmap) 4895 { 4896 pd_entry_t *pde; 4897 pt_entry_t *pte, tpte; 4898 struct spglist free; 4899 vm_page_t m, ml3, mt; 4900 pv_entry_t pv; 4901 struct md_page *pvh; 4902 struct pv_chunk *pc, *npc; 4903 struct rwlock *lock; 4904 int64_t bit; 4905 uint64_t inuse, bitmask; 4906 int allfree, field, freed, idx, lvl; 4907 vm_paddr_t pa; 4908 4909 lock = NULL; 4910 4911 SLIST_INIT(&free); 4912 PMAP_LOCK(pmap); 4913 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4914 allfree = 1; 4915 freed = 0; 4916 for (field = 0; field < _NPCM; field++) { 4917 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4918 while (inuse != 0) { 4919 bit = ffsl(inuse) - 1; 4920 bitmask = 1UL << bit; 4921 idx = field * 64 + bit; 4922 pv = &pc->pc_pventry[idx]; 4923 inuse &= ~bitmask; 4924 4925 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4926 KASSERT(pde != NULL, 4927 ("Attempting to remove an unmapped page")); 4928 4929 switch(lvl) { 4930 case 1: 4931 pte = pmap_l1_to_l2(pde, pv->pv_va); 4932 tpte = pmap_load(pte); 4933 KASSERT((tpte & ATTR_DESCR_MASK) == 4934 L2_BLOCK, 4935 ("Attempting to remove an invalid " 4936 "block: %lx", tpte)); 4937 break; 4938 case 2: 4939 pte = pmap_l2_to_l3(pde, pv->pv_va); 4940 tpte = pmap_load(pte); 4941 KASSERT((tpte & ATTR_DESCR_MASK) == 4942 L3_PAGE, 4943 ("Attempting to remove an invalid " 4944 "page: %lx", tpte)); 4945 break; 4946 default: 4947 panic( 4948 "Invalid page directory level: %d", 4949 lvl); 4950 } 4951 4952 /* 4953 * We cannot remove wired pages from a process' mapping at this time 4954 */ 4955 if (tpte & ATTR_SW_WIRED) { 4956 allfree = 0; 4957 continue; 4958 } 4959 4960 pa = tpte & ~ATTR_MASK; 4961 4962 m = PHYS_TO_VM_PAGE(pa); 4963 KASSERT(m->phys_addr == pa, 4964 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4965 m, (uintmax_t)m->phys_addr, 4966 (uintmax_t)tpte)); 4967 4968 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4969 m < &vm_page_array[vm_page_array_size], 4970 ("pmap_remove_pages: bad pte %#jx", 4971 (uintmax_t)tpte)); 4972 4973 /* 4974 * Because this pmap is not active on other 4975 * processors, the dirty bit cannot have 4976 * changed state since we last loaded pte. 4977 */ 4978 pmap_clear(pte); 4979 4980 /* 4981 * Update the vm_page_t clean/reference bits. 
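 * A dirty 2MB block mapping dirties all L2_SIZE / PAGE_SIZE (512 with 4KB
 * base pages) constituent pages.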
4982 */ 4983 if (pmap_pte_dirty(pmap, tpte)) { 4984 switch (lvl) { 4985 case 1: 4986 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4987 vm_page_dirty(mt); 4988 break; 4989 case 2: 4990 vm_page_dirty(m); 4991 break; 4992 } 4993 } 4994 4995 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4996 4997 /* Mark free */ 4998 pc->pc_map[field] |= bitmask; 4999 switch (lvl) { 5000 case 1: 5001 pmap_resident_count_dec(pmap, 5002 L2_SIZE / PAGE_SIZE); 5003 pvh = pa_to_pvh(tpte & ~ATTR_MASK); 5004 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 5005 pvh->pv_gen++; 5006 if (TAILQ_EMPTY(&pvh->pv_list)) { 5007 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5008 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5009 TAILQ_EMPTY(&mt->md.pv_list)) 5010 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5011 } 5012 ml3 = pmap_remove_pt_page(pmap, 5013 pv->pv_va); 5014 if (ml3 != NULL) { 5015 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 5016 ("pmap_remove_pages: l3 page not promoted")); 5017 pmap_resident_count_dec(pmap,1); 5018 KASSERT(ml3->ref_count == NL3PG, 5019 ("pmap_remove_pages: l3 page ref count error")); 5020 ml3->ref_count = 0; 5021 pmap_add_delayed_free_list(ml3, 5022 &free, FALSE); 5023 } 5024 break; 5025 case 2: 5026 pmap_resident_count_dec(pmap, 1); 5027 TAILQ_REMOVE(&m->md.pv_list, pv, 5028 pv_next); 5029 m->md.pv_gen++; 5030 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5031 TAILQ_EMPTY(&m->md.pv_list) && 5032 (m->flags & PG_FICTITIOUS) == 0) { 5033 pvh = page_to_pvh(m); 5034 if (TAILQ_EMPTY(&pvh->pv_list)) 5035 vm_page_aflag_clear(m, 5036 PGA_WRITEABLE); 5037 } 5038 break; 5039 } 5040 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 5041 &free); 5042 freed++; 5043 } 5044 } 5045 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5046 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5047 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5048 if (allfree) { 5049 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5050 free_pv_chunk(pc); 5051 } 5052 } 5053 if (lock != NULL) 5054 rw_wunlock(lock); 5055 pmap_invalidate_all(pmap); 5056 PMAP_UNLOCK(pmap); 5057 vm_page_free_pages_toq(&free, true); 5058 } 5059 5060 /* 5061 * This is used to check if a page has been accessed or modified. 
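 * It is the common helper behind pmap_is_modified() and
 * pmap_is_referenced() below.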
5062 */ 5063 static boolean_t 5064 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5065 { 5066 struct rwlock *lock; 5067 pv_entry_t pv; 5068 struct md_page *pvh; 5069 pt_entry_t *pte, mask, value; 5070 pmap_t pmap; 5071 int lvl, md_gen, pvh_gen; 5072 boolean_t rv; 5073 5074 rv = FALSE; 5075 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5076 rw_rlock(lock); 5077 restart: 5078 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5079 pmap = PV_PMAP(pv); 5080 PMAP_ASSERT_STAGE1(pmap); 5081 if (!PMAP_TRYLOCK(pmap)) { 5082 md_gen = m->md.pv_gen; 5083 rw_runlock(lock); 5084 PMAP_LOCK(pmap); 5085 rw_rlock(lock); 5086 if (md_gen != m->md.pv_gen) { 5087 PMAP_UNLOCK(pmap); 5088 goto restart; 5089 } 5090 } 5091 pte = pmap_pte(pmap, pv->pv_va, &lvl); 5092 KASSERT(lvl == 3, 5093 ("pmap_page_test_mappings: Invalid level %d", lvl)); 5094 mask = 0; 5095 value = 0; 5096 if (modified) { 5097 mask |= ATTR_S1_AP_RW_BIT; 5098 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5099 } 5100 if (accessed) { 5101 mask |= ATTR_AF | ATTR_DESCR_MASK; 5102 value |= ATTR_AF | L3_PAGE; 5103 } 5104 rv = (pmap_load(pte) & mask) == value; 5105 PMAP_UNLOCK(pmap); 5106 if (rv) 5107 goto out; 5108 } 5109 if ((m->flags & PG_FICTITIOUS) == 0) { 5110 pvh = page_to_pvh(m); 5111 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5112 pmap = PV_PMAP(pv); 5113 PMAP_ASSERT_STAGE1(pmap); 5114 if (!PMAP_TRYLOCK(pmap)) { 5115 md_gen = m->md.pv_gen; 5116 pvh_gen = pvh->pv_gen; 5117 rw_runlock(lock); 5118 PMAP_LOCK(pmap); 5119 rw_rlock(lock); 5120 if (md_gen != m->md.pv_gen || 5121 pvh_gen != pvh->pv_gen) { 5122 PMAP_UNLOCK(pmap); 5123 goto restart; 5124 } 5125 } 5126 pte = pmap_pte(pmap, pv->pv_va, &lvl); 5127 KASSERT(lvl == 2, 5128 ("pmap_page_test_mappings: Invalid level %d", lvl)); 5129 mask = 0; 5130 value = 0; 5131 if (modified) { 5132 mask |= ATTR_S1_AP_RW_BIT; 5133 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 5134 } 5135 if (accessed) { 5136 mask |= ATTR_AF | ATTR_DESCR_MASK; 5137 value |= ATTR_AF | L2_BLOCK; 5138 } 5139 rv = (pmap_load(pte) & mask) == value; 5140 PMAP_UNLOCK(pmap); 5141 if (rv) 5142 goto out; 5143 } 5144 } 5145 out: 5146 rw_runlock(lock); 5147 return (rv); 5148 } 5149 5150 /* 5151 * pmap_is_modified: 5152 * 5153 * Return whether or not the specified physical page was modified 5154 * in any physical maps. 5155 */ 5156 boolean_t 5157 pmap_is_modified(vm_page_t m) 5158 { 5159 5160 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5161 ("pmap_is_modified: page %p is not managed", m)); 5162 5163 /* 5164 * If the page is not busied then this check is racy. 5165 */ 5166 if (!pmap_page_is_write_mapped(m)) 5167 return (FALSE); 5168 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5169 } 5170 5171 /* 5172 * pmap_is_prefaultable: 5173 * 5174 * Return whether or not the specified virtual address is eligible 5175 * for prefault. 5176 */ 5177 boolean_t 5178 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5179 { 5180 pt_entry_t *pte; 5181 boolean_t rv; 5182 int lvl; 5183 5184 rv = FALSE; 5185 PMAP_LOCK(pmap); 5186 pte = pmap_pte(pmap, addr, &lvl); 5187 if (pte != NULL && pmap_load(pte) != 0) { 5188 rv = TRUE; 5189 } 5190 PMAP_UNLOCK(pmap); 5191 return (rv); 5192 } 5193 5194 /* 5195 * pmap_is_referenced: 5196 * 5197 * Return whether or not the specified physical page was referenced 5198 * in any physical maps. 
5199 */ 5200 boolean_t 5201 pmap_is_referenced(vm_page_t m) 5202 { 5203 5204 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5205 ("pmap_is_referenced: page %p is not managed", m)); 5206 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5207 } 5208 5209 /* 5210 * Clear the write and modified bits in each of the given page's mappings. 5211 */ 5212 void 5213 pmap_remove_write(vm_page_t m) 5214 { 5215 struct md_page *pvh; 5216 pmap_t pmap; 5217 struct rwlock *lock; 5218 pv_entry_t next_pv, pv; 5219 pt_entry_t oldpte, *pte; 5220 vm_offset_t va; 5221 int lvl, md_gen, pvh_gen; 5222 5223 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5224 ("pmap_remove_write: page %p is not managed", m)); 5225 vm_page_assert_busied(m); 5226 5227 if (!pmap_page_is_write_mapped(m)) 5228 return; 5229 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5230 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5231 retry_pv_loop: 5232 rw_wlock(lock); 5233 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5234 pmap = PV_PMAP(pv); 5235 PMAP_ASSERT_STAGE1(pmap); 5236 if (!PMAP_TRYLOCK(pmap)) { 5237 pvh_gen = pvh->pv_gen; 5238 rw_wunlock(lock); 5239 PMAP_LOCK(pmap); 5240 rw_wlock(lock); 5241 if (pvh_gen != pvh->pv_gen) { 5242 PMAP_UNLOCK(pmap); 5243 rw_wunlock(lock); 5244 goto retry_pv_loop; 5245 } 5246 } 5247 va = pv->pv_va; 5248 pte = pmap_pte(pmap, pv->pv_va, &lvl); 5249 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 5250 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 5251 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5252 ("inconsistent pv lock %p %p for page %p", 5253 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5254 PMAP_UNLOCK(pmap); 5255 } 5256 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5257 pmap = PV_PMAP(pv); 5258 PMAP_ASSERT_STAGE1(pmap); 5259 if (!PMAP_TRYLOCK(pmap)) { 5260 pvh_gen = pvh->pv_gen; 5261 md_gen = m->md.pv_gen; 5262 rw_wunlock(lock); 5263 PMAP_LOCK(pmap); 5264 rw_wlock(lock); 5265 if (pvh_gen != pvh->pv_gen || 5266 md_gen != m->md.pv_gen) { 5267 PMAP_UNLOCK(pmap); 5268 rw_wunlock(lock); 5269 goto retry_pv_loop; 5270 } 5271 } 5272 pte = pmap_pte(pmap, pv->pv_va, &lvl); 5273 oldpte = pmap_load(pte); 5274 retry: 5275 if ((oldpte & ATTR_SW_DBM) != 0) { 5276 if (!atomic_fcmpset_long(pte, &oldpte, 5277 (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) 5278 goto retry; 5279 if ((oldpte & ATTR_S1_AP_RW_BIT) == 5280 ATTR_S1_AP(ATTR_S1_AP_RW)) 5281 vm_page_dirty(m); 5282 pmap_invalidate_page(pmap, pv->pv_va); 5283 } 5284 PMAP_UNLOCK(pmap); 5285 } 5286 rw_wunlock(lock); 5287 vm_page_aflag_clear(m, PGA_WRITEABLE); 5288 } 5289 5290 /* 5291 * pmap_ts_referenced: 5292 * 5293 * Return a count of reference bits for a page, clearing those bits. 5294 * It is not necessary for every reference bit to be cleared, but it 5295 * is necessary that 0 only be returned when there are truly no 5296 * reference bits set. 5297 * 5298 * As an optimization, update the page's dirty field if a modified bit is 5299 * found while counting reference bits. This opportunistic update can be 5300 * performed at low cost and can eliminate the need for some future calls 5301 * to pmap_is_modified(). However, since this function stops after 5302 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5303 * dirty pages. Those dirty pages will only be detected by a future call 5304 * to pmap_is_modified(). 
5305 */ 5306 int 5307 pmap_ts_referenced(vm_page_t m) 5308 { 5309 struct md_page *pvh; 5310 pv_entry_t pv, pvf; 5311 pmap_t pmap; 5312 struct rwlock *lock; 5313 pd_entry_t *pde, tpde; 5314 pt_entry_t *pte, tpte; 5315 vm_offset_t va; 5316 vm_paddr_t pa; 5317 int cleared, lvl, md_gen, not_cleared, pvh_gen; 5318 struct spglist free; 5319 5320 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5321 ("pmap_ts_referenced: page %p is not managed", m)); 5322 SLIST_INIT(&free); 5323 cleared = 0; 5324 pa = VM_PAGE_TO_PHYS(m); 5325 lock = PHYS_TO_PV_LIST_LOCK(pa); 5326 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5327 rw_wlock(lock); 5328 retry: 5329 not_cleared = 0; 5330 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5331 goto small_mappings; 5332 pv = pvf; 5333 do { 5334 if (pvf == NULL) 5335 pvf = pv; 5336 pmap = PV_PMAP(pv); 5337 if (!PMAP_TRYLOCK(pmap)) { 5338 pvh_gen = pvh->pv_gen; 5339 rw_wunlock(lock); 5340 PMAP_LOCK(pmap); 5341 rw_wlock(lock); 5342 if (pvh_gen != pvh->pv_gen) { 5343 PMAP_UNLOCK(pmap); 5344 goto retry; 5345 } 5346 } 5347 va = pv->pv_va; 5348 pde = pmap_pde(pmap, pv->pv_va, &lvl); 5349 KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); 5350 KASSERT(lvl == 1, 5351 ("pmap_ts_referenced: invalid pde level %d", lvl)); 5352 tpde = pmap_load(pde); 5353 KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, 5354 ("pmap_ts_referenced: found an invalid l1 table")); 5355 pte = pmap_l1_to_l2(pde, pv->pv_va); 5356 tpte = pmap_load(pte); 5357 if (pmap_pte_dirty(pmap, tpte)) { 5358 /* 5359 * Although "tpte" is mapping a 2MB page, because 5360 * this function is called at a 4KB page granularity, 5361 * we only update the 4KB page under test. 5362 */ 5363 vm_page_dirty(m); 5364 } 5365 5366 if ((tpte & ATTR_AF) != 0) { 5367 /* 5368 * Since this reference bit is shared by 512 4KB pages, 5369 * it should not be cleared every time it is tested. 5370 * Apply a simple "hash" function on the physical page 5371 * number, the virtual superpage number, and the pmap 5372 * address to select one 4KB page out of the 512 on 5373 * which testing the reference bit will result in 5374 * clearing that reference bit. This function is 5375 * designed to avoid the selection of the same 4KB page 5376 * for every 2MB page mapping. 5377 * 5378 * On demotion, a mapping that hasn't been referenced 5379 * is simply destroyed. To avoid the possibility of a 5380 * subsequent page fault on a demoted wired mapping, 5381 * always leave its reference bit set. Moreover, 5382 * since the superpage is wired, the current state of 5383 * its reference bit won't affect page replacement. 5384 */ 5385 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 5386 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 5387 (tpte & ATTR_SW_WIRED) == 0) { 5388 pmap_clear_bits(pte, ATTR_AF); 5389 pmap_invalidate_page(pmap, pv->pv_va); 5390 cleared++; 5391 } else 5392 not_cleared++; 5393 } 5394 PMAP_UNLOCK(pmap); 5395 /* Rotate the PV list if it has more than one entry. 
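 * The entry just examined is moved to the tail so that a later call starts
 * with a different pmap.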
*/ 5396 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5397 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5398 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5399 pvh->pv_gen++; 5400 } 5401 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5402 goto out; 5403 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5404 small_mappings: 5405 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5406 goto out; 5407 pv = pvf; 5408 do { 5409 if (pvf == NULL) 5410 pvf = pv; 5411 pmap = PV_PMAP(pv); 5412 if (!PMAP_TRYLOCK(pmap)) { 5413 pvh_gen = pvh->pv_gen; 5414 md_gen = m->md.pv_gen; 5415 rw_wunlock(lock); 5416 PMAP_LOCK(pmap); 5417 rw_wlock(lock); 5418 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5419 PMAP_UNLOCK(pmap); 5420 goto retry; 5421 } 5422 } 5423 pde = pmap_pde(pmap, pv->pv_va, &lvl); 5424 KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); 5425 KASSERT(lvl == 2, 5426 ("pmap_ts_referenced: invalid pde level %d", lvl)); 5427 tpde = pmap_load(pde); 5428 KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, 5429 ("pmap_ts_referenced: found an invalid l2 table")); 5430 pte = pmap_l2_to_l3(pde, pv->pv_va); 5431 tpte = pmap_load(pte); 5432 if (pmap_pte_dirty(pmap, tpte)) 5433 vm_page_dirty(m); 5434 if ((tpte & ATTR_AF) != 0) { 5435 if ((tpte & ATTR_SW_WIRED) == 0) { 5436 pmap_clear_bits(pte, ATTR_AF); 5437 pmap_invalidate_page(pmap, pv->pv_va); 5438 cleared++; 5439 } else 5440 not_cleared++; 5441 } 5442 PMAP_UNLOCK(pmap); 5443 /* Rotate the PV list if it has more than one entry. */ 5444 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5445 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5446 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5447 m->md.pv_gen++; 5448 } 5449 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5450 not_cleared < PMAP_TS_REFERENCED_MAX); 5451 out: 5452 rw_wunlock(lock); 5453 vm_page_free_pages_toq(&free, true); 5454 return (cleared + not_cleared); 5455 } 5456 5457 /* 5458 * Apply the given advice to the specified range of addresses within the 5459 * given pmap. Depending on the advice, clear the referenced and/or 5460 * modified flags in each mapping and set the mapped page's dirty field. 
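 * Only MADV_DONTNEED and MADV_FREE are acted upon; all other advice values
 * are ignored.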
5461 */ 5462 void 5463 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5464 { 5465 struct rwlock *lock; 5466 vm_offset_t va, va_next; 5467 vm_page_t m; 5468 pd_entry_t *l0, *l1, *l2, oldl2; 5469 pt_entry_t *l3, oldl3; 5470 5471 PMAP_ASSERT_STAGE1(pmap); 5472 5473 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5474 return; 5475 5476 PMAP_LOCK(pmap); 5477 for (; sva < eva; sva = va_next) { 5478 l0 = pmap_l0(pmap, sva); 5479 if (pmap_load(l0) == 0) { 5480 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 5481 if (va_next < sva) 5482 va_next = eva; 5483 continue; 5484 } 5485 5486 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 5487 if (va_next < sva) 5488 va_next = eva; 5489 l1 = pmap_l0_to_l1(l0, sva); 5490 if (pmap_load(l1) == 0) 5491 continue; 5492 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 5493 KASSERT(va_next <= eva, 5494 ("partial update of non-transparent 1G page " 5495 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 5496 pmap_load(l1), sva, eva, va_next)); 5497 continue; 5498 } 5499 5500 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 5501 if (va_next < sva) 5502 va_next = eva; 5503 l2 = pmap_l1_to_l2(l1, sva); 5504 oldl2 = pmap_load(l2); 5505 if (oldl2 == 0) 5506 continue; 5507 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5508 if ((oldl2 & ATTR_SW_MANAGED) == 0) 5509 continue; 5510 lock = NULL; 5511 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 5512 if (lock != NULL) 5513 rw_wunlock(lock); 5514 5515 /* 5516 * The 2MB page mapping was destroyed. 5517 */ 5518 continue; 5519 } 5520 5521 /* 5522 * Unless the page mappings are wired, remove the 5523 * mapping to a single page so that a subsequent 5524 * access may repromote. Choosing the last page 5525 * within the address range [sva, min(va_next, eva)) 5526 * generally results in more repromotions. Since the 5527 * underlying page table page is fully populated, this 5528 * removal never frees a page table page. 5529 */ 5530 if ((oldl2 & ATTR_SW_WIRED) == 0) { 5531 va = eva; 5532 if (va > va_next) 5533 va = va_next; 5534 va -= PAGE_SIZE; 5535 KASSERT(va >= sva, 5536 ("pmap_advise: no address gap")); 5537 l3 = pmap_l2_to_l3(l2, va); 5538 KASSERT(pmap_load(l3) != 0, 5539 ("pmap_advise: invalid PTE")); 5540 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 5541 NULL, &lock); 5542 } 5543 if (lock != NULL) 5544 rw_wunlock(lock); 5545 } 5546 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 5547 ("pmap_advise: invalid L2 entry after demotion")); 5548 if (va_next > eva) 5549 va_next = eva; 5550 va = va_next; 5551 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 5552 sva += L3_SIZE) { 5553 oldl3 = pmap_load(l3); 5554 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 5555 (ATTR_SW_MANAGED | L3_PAGE)) 5556 goto maybe_invlrng; 5557 else if (pmap_pte_dirty(pmap, oldl3)) { 5558 if (advice == MADV_DONTNEED) { 5559 /* 5560 * Future calls to pmap_is_modified() 5561 * can be avoided by making the page 5562 * dirty now. 
5563 */ 5564 m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); 5565 vm_page_dirty(m); 5566 } 5567 while (!atomic_fcmpset_long(l3, &oldl3, 5568 (oldl3 & ~ATTR_AF) | 5569 ATTR_S1_AP(ATTR_S1_AP_RO))) 5570 cpu_spinwait(); 5571 } else if ((oldl3 & ATTR_AF) != 0) 5572 pmap_clear_bits(l3, ATTR_AF); 5573 else 5574 goto maybe_invlrng; 5575 if (va == va_next) 5576 va = sva; 5577 continue; 5578 maybe_invlrng: 5579 if (va != va_next) { 5580 pmap_invalidate_range(pmap, va, sva); 5581 va = va_next; 5582 } 5583 } 5584 if (va != va_next) 5585 pmap_invalidate_range(pmap, va, sva); 5586 } 5587 PMAP_UNLOCK(pmap); 5588 } 5589 5590 /* 5591 * Clear the modify bits on the specified physical page. 5592 */ 5593 void 5594 pmap_clear_modify(vm_page_t m) 5595 { 5596 struct md_page *pvh; 5597 struct rwlock *lock; 5598 pmap_t pmap; 5599 pv_entry_t next_pv, pv; 5600 pd_entry_t *l2, oldl2; 5601 pt_entry_t *l3, oldl3; 5602 vm_offset_t va; 5603 int md_gen, pvh_gen; 5604 5605 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5606 ("pmap_clear_modify: page %p is not managed", m)); 5607 vm_page_assert_busied(m); 5608 5609 if (!pmap_page_is_write_mapped(m)) 5610 return; 5611 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 5612 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5613 rw_wlock(lock); 5614 restart: 5615 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5616 pmap = PV_PMAP(pv); 5617 PMAP_ASSERT_STAGE1(pmap); 5618 if (!PMAP_TRYLOCK(pmap)) { 5619 pvh_gen = pvh->pv_gen; 5620 rw_wunlock(lock); 5621 PMAP_LOCK(pmap); 5622 rw_wlock(lock); 5623 if (pvh_gen != pvh->pv_gen) { 5624 PMAP_UNLOCK(pmap); 5625 goto restart; 5626 } 5627 } 5628 va = pv->pv_va; 5629 l2 = pmap_l2(pmap, va); 5630 oldl2 = pmap_load(l2); 5631 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 5632 if ((oldl2 & ATTR_SW_DBM) != 0 && 5633 pmap_demote_l2_locked(pmap, l2, va, &lock) && 5634 (oldl2 & ATTR_SW_WIRED) == 0) { 5635 /* 5636 * Write protect the mapping to a single page so that 5637 * a subsequent write access may repromote. 
5638 */ 5639 va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); 5640 l3 = pmap_l2_to_l3(l2, va); 5641 oldl3 = pmap_load(l3); 5642 while (!atomic_fcmpset_long(l3, &oldl3, 5643 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 5644 cpu_spinwait(); 5645 vm_page_dirty(m); 5646 pmap_invalidate_page(pmap, va); 5647 } 5648 PMAP_UNLOCK(pmap); 5649 } 5650 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5651 pmap = PV_PMAP(pv); 5652 PMAP_ASSERT_STAGE1(pmap); 5653 if (!PMAP_TRYLOCK(pmap)) { 5654 md_gen = m->md.pv_gen; 5655 pvh_gen = pvh->pv_gen; 5656 rw_wunlock(lock); 5657 PMAP_LOCK(pmap); 5658 rw_wlock(lock); 5659 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5660 PMAP_UNLOCK(pmap); 5661 goto restart; 5662 } 5663 } 5664 l2 = pmap_l2(pmap, pv->pv_va); 5665 l3 = pmap_l2_to_l3(l2, pv->pv_va); 5666 oldl3 = pmap_load(l3); 5667 if (pmap_l3_valid(oldl3) && 5668 (oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ 5669 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 5670 pmap_invalidate_page(pmap, pv->pv_va); 5671 } 5672 PMAP_UNLOCK(pmap); 5673 } 5674 rw_wunlock(lock); 5675 } 5676 5677 void * 5678 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5679 { 5680 struct pmap_preinit_mapping *ppim; 5681 vm_offset_t va, offset; 5682 pd_entry_t *pde; 5683 pt_entry_t *l2; 5684 int i, lvl, l2_blocks, free_l2_count, start_idx; 5685 5686 if (!vm_initialized) { 5687 /* 5688 * No L3 ptables so map entire L2 blocks where start VA is: 5689 * preinit_map_va + start_idx * L2_SIZE 5690 * There may be duplicate mappings (multiple VA -> same PA) but 5691 * ARM64 dcache is always PIPT so that's acceptable. 5692 */ 5693 if (size == 0) 5694 return (NULL); 5695 5696 /* Calculate how many L2 blocks are needed for the mapping */ 5697 l2_blocks = (roundup2(pa + size, L2_SIZE) - 5698 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 5699 5700 offset = pa & L2_OFFSET; 5701 5702 if (preinit_map_va == 0) 5703 return (NULL); 5704 5705 /* Map 2MiB L2 blocks from reserved VA space */ 5706 5707 free_l2_count = 0; 5708 start_idx = -1; 5709 /* Find enough free contiguous VA space */ 5710 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5711 ppim = pmap_preinit_mapping + i; 5712 if (free_l2_count > 0 && ppim->pa != 0) { 5713 /* Not enough space here */ 5714 free_l2_count = 0; 5715 start_idx = -1; 5716 continue; 5717 } 5718 5719 if (ppim->pa == 0) { 5720 /* Free L2 block */ 5721 if (start_idx == -1) 5722 start_idx = i; 5723 free_l2_count++; 5724 if (free_l2_count == l2_blocks) 5725 break; 5726 } 5727 } 5728 if (free_l2_count != l2_blocks) 5729 panic("%s: too many preinit mappings", __func__); 5730 5731 va = preinit_map_va + (start_idx * L2_SIZE); 5732 for (i = start_idx; i < start_idx + l2_blocks; i++) { 5733 /* Mark entries as allocated */ 5734 ppim = pmap_preinit_mapping + i; 5735 ppim->pa = pa; 5736 ppim->va = va + offset; 5737 ppim->size = size; 5738 } 5739 5740 /* Map L2 blocks */ 5741 pa = rounddown2(pa, L2_SIZE); 5742 for (i = 0; i < l2_blocks; i++) { 5743 pde = pmap_pde(kernel_pmap, va, &lvl); 5744 KASSERT(pde != NULL, 5745 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 5746 va)); 5747 KASSERT(lvl == 1, 5748 ("pmap_mapbios: Invalid level %d", lvl)); 5749 5750 /* Insert L2_BLOCK */ 5751 l2 = pmap_l1_to_l2(pde, va); 5752 pmap_load_store(l2, 5753 pa | ATTR_DEFAULT | ATTR_S1_XN | 5754 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 5755 5756 va += L2_SIZE; 5757 pa += L2_SIZE; 5758 } 5759 pmap_invalidate_all(kernel_pmap); 5760 5761 va = preinit_map_va + (start_idx * L2_SIZE); 5762 5763 } else { 5764 /* kva_alloc may be used to map the pages */ 
5765 offset = pa & PAGE_MASK; 5766 size = round_page(offset + size); 5767 5768 va = kva_alloc(size); 5769 if (va == 0) 5770 panic("%s: Couldn't allocate KVA", __func__); 5771 5772 pde = pmap_pde(kernel_pmap, va, &lvl); 5773 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 5774 5775 /* L3 table is linked */ 5776 va = trunc_page(va); 5777 pa = trunc_page(pa); 5778 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 5779 } 5780 5781 return ((void *)(va + offset)); 5782 } 5783 5784 void 5785 pmap_unmapbios(vm_offset_t va, vm_size_t size) 5786 { 5787 struct pmap_preinit_mapping *ppim; 5788 vm_offset_t offset, tmpsize, va_trunc; 5789 pd_entry_t *pde; 5790 pt_entry_t *l2; 5791 int i, lvl, l2_blocks, block; 5792 bool preinit_map; 5793 5794 l2_blocks = 5795 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 5796 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 5797 5798 /* Remove preinit mapping */ 5799 preinit_map = false; 5800 block = 0; 5801 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5802 ppim = pmap_preinit_mapping + i; 5803 if (ppim->va == va) { 5804 KASSERT(ppim->size == size, 5805 ("pmap_unmapbios: size mismatch")); 5806 ppim->va = 0; 5807 ppim->pa = 0; 5808 ppim->size = 0; 5809 preinit_map = true; 5810 offset = block * L2_SIZE; 5811 va_trunc = rounddown2(va, L2_SIZE) + offset; 5812 5813 /* Remove L2_BLOCK */ 5814 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 5815 KASSERT(pde != NULL, 5816 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 5817 va_trunc)); 5818 l2 = pmap_l1_to_l2(pde, va_trunc); 5819 pmap_clear(l2); 5820 5821 if (block == (l2_blocks - 1)) 5822 break; 5823 block++; 5824 } 5825 } 5826 if (preinit_map) { 5827 pmap_invalidate_all(kernel_pmap); 5828 return; 5829 } 5830 5831 /* Unmap the pages reserved with kva_alloc. */ 5832 if (vm_initialized) { 5833 offset = va & PAGE_MASK; 5834 size = round_page(offset + size); 5835 va = trunc_page(va); 5836 5837 pde = pmap_pde(kernel_pmap, va, &lvl); 5838 KASSERT(pde != NULL, 5839 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); 5840 KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); 5841 5842 /* Unmap and invalidate the pages */ 5843 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5844 pmap_kremove(va + tmpsize); 5845 5846 kva_free(va, size); 5847 } 5848 } 5849 5850 /* 5851 * Sets the memory attribute for the specified page. 5852 */ 5853 void 5854 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5855 { 5856 5857 m->md.pv_memattr = ma; 5858 5859 /* 5860 * If "m" is a normal page, update its direct mapping. This update 5861 * can be relied upon to perform any cache operations that are 5862 * required for data coherence. 5863 */ 5864 if ((m->flags & PG_FICTITIOUS) == 0 && 5865 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 5866 m->md.pv_memattr) != 0) 5867 panic("memory attribute change on the direct map failed"); 5868 } 5869 5870 /* 5871 * Changes the specified virtual address range's memory type to that given by 5872 * the parameter "mode". The specified virtual address range must be 5873 * completely contained within either the direct map or the kernel map. If 5874 * the virtual address range is contained within the kernel map, then the 5875 * memory type for each of the corresponding ranges of the direct map is also 5876 * changed. (The corresponding ranges of the direct map are those ranges that 5877 * map the same physical pages as the specified virtual address range.) 
These 5878 * changes to the direct map are necessary because Intel describes the 5879 * behavior of their processors as "undefined" if two or more mappings to the 5880 * same physical page have different memory types. 5881 * 5882 * Returns zero if the change completed successfully, and either EINVAL or 5883 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5884 * of the virtual address range was not mapped, and ENOMEM is returned if 5885 * there was insufficient memory available to complete the change. In the 5886 * latter case, the memory type may have been changed on some part of the 5887 * virtual address range or the direct map. 5888 */ 5889 int 5890 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5891 { 5892 int error; 5893 5894 PMAP_LOCK(kernel_pmap); 5895 error = pmap_change_attr_locked(va, size, mode); 5896 PMAP_UNLOCK(kernel_pmap); 5897 return (error); 5898 } 5899 5900 static int 5901 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 5902 { 5903 vm_offset_t base, offset, tmpva; 5904 pt_entry_t l3, *pte, *newpte; 5905 int lvl; 5906 5907 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 5908 base = trunc_page(va); 5909 offset = va & PAGE_MASK; 5910 size = round_page(offset + size); 5911 5912 if (!VIRT_IN_DMAP(base) && 5913 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 5914 return (EINVAL); 5915 5916 for (tmpva = base; tmpva < base + size; ) { 5917 pte = pmap_pte(kernel_pmap, tmpva, &lvl); 5918 if (pte == NULL) 5919 return (EINVAL); 5920 5921 if ((pmap_load(pte) & ATTR_S1_IDX_MASK) == ATTR_S1_IDX(mode)) { 5922 /* 5923 * We already have the correct attribute, 5924 * ignore this entry. 5925 */ 5926 switch (lvl) { 5927 default: 5928 panic("Invalid DMAP table level: %d\n", lvl); 5929 case 1: 5930 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 5931 break; 5932 case 2: 5933 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 5934 break; 5935 case 3: 5936 tmpva += PAGE_SIZE; 5937 break; 5938 } 5939 } else { 5940 /* 5941 * Split the entry to an level 3 table, then 5942 * set the new attribute. 5943 */ 5944 switch (lvl) { 5945 default: 5946 panic("Invalid DMAP table level: %d\n", lvl); 5947 case 1: 5948 newpte = pmap_demote_l1(kernel_pmap, pte, 5949 tmpva & ~L1_OFFSET); 5950 if (newpte == NULL) 5951 return (EINVAL); 5952 pte = pmap_l1_to_l2(pte, tmpva); 5953 case 2: 5954 newpte = pmap_demote_l2(kernel_pmap, pte, 5955 tmpva); 5956 if (newpte == NULL) 5957 return (EINVAL); 5958 pte = pmap_l2_to_l3(pte, tmpva); 5959 case 3: 5960 /* Update the entry */ 5961 l3 = pmap_load(pte); 5962 l3 &= ~ATTR_S1_IDX_MASK; 5963 l3 |= ATTR_S1_IDX(mode); 5964 if (mode == VM_MEMATTR_DEVICE) 5965 l3 |= ATTR_S1_XN; 5966 5967 pmap_update_entry(kernel_pmap, pte, l3, tmpva, 5968 PAGE_SIZE); 5969 5970 /* 5971 * If moving to a non-cacheable entry flush 5972 * the cache. 5973 */ 5974 if (mode == VM_MEMATTR_UNCACHEABLE) 5975 cpu_dcache_wbinv_range(tmpva, L3_SIZE); 5976 5977 break; 5978 } 5979 tmpva += PAGE_SIZE; 5980 } 5981 } 5982 5983 return (0); 5984 } 5985 5986 /* 5987 * Create an L2 table to map all addresses within an L1 mapping. 
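 *
 * Conceptually (a sketch of the copy loop below), each of the Ln_ENTRIES new
 * L2 entries inherits the old L1 block's attributes while the physical
 * address advances by L2_SIZE:
 *
 *	l2[i] = (oldl1 & ATTR_MASK) | ((oldl1 & ~ATTR_MASK) + i * L2_SIZE);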
5988 */ 5989 static pt_entry_t * 5990 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 5991 { 5992 pt_entry_t *l2, newl2, oldl1; 5993 vm_offset_t tmpl1; 5994 vm_paddr_t l2phys, phys; 5995 vm_page_t ml2; 5996 int i; 5997 5998 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5999 oldl1 = pmap_load(l1); 6000 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 6001 ("pmap_demote_l1: Demoting a non-block entry")); 6002 KASSERT((va & L1_OFFSET) == 0, 6003 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 6004 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 6005 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 6006 6007 tmpl1 = 0; 6008 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 6009 tmpl1 = kva_alloc(PAGE_SIZE); 6010 if (tmpl1 == 0) 6011 return (NULL); 6012 } 6013 6014 if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | 6015 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 6016 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 6017 " in pmap %p", va, pmap); 6018 return (NULL); 6019 } 6020 6021 l2phys = VM_PAGE_TO_PHYS(ml2); 6022 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 6023 6024 /* Address the range points at */ 6025 phys = oldl1 & ~ATTR_MASK; 6026 /* The attributed from the old l1 table to be copied */ 6027 newl2 = oldl1 & ATTR_MASK; 6028 6029 /* Create the new entries */ 6030 for (i = 0; i < Ln_ENTRIES; i++) { 6031 l2[i] = newl2 | phys; 6032 phys += L2_SIZE; 6033 } 6034 KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), 6035 ("Invalid l2 page (%lx != %lx)", l2[0], 6036 (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 6037 6038 if (tmpl1 != 0) { 6039 pmap_kenter(tmpl1, PAGE_SIZE, 6040 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, 6041 VM_MEMATTR_WRITE_BACK); 6042 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 6043 } 6044 6045 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 6046 6047 if (tmpl1 != 0) { 6048 pmap_kremove(tmpl1); 6049 kva_free(tmpl1, PAGE_SIZE); 6050 } 6051 6052 return (l2); 6053 } 6054 6055 static void 6056 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 6057 { 6058 pt_entry_t *l3; 6059 6060 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 6061 *l3 = newl3; 6062 newl3 += L3_SIZE; 6063 } 6064 } 6065 6066 static void 6067 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 6068 struct rwlock **lockp) 6069 { 6070 struct spglist free; 6071 6072 SLIST_INIT(&free); 6073 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, 6074 lockp); 6075 vm_page_free_pages_toq(&free, true); 6076 } 6077 6078 /* 6079 * Create an L3 table to map all addresses within an L2 mapping. 6080 */ 6081 static pt_entry_t * 6082 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 6083 struct rwlock **lockp) 6084 { 6085 pt_entry_t *l3, newl3, oldl2; 6086 vm_offset_t tmpl2; 6087 vm_paddr_t l3phys; 6088 vm_page_t ml3; 6089 6090 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6091 PMAP_ASSERT_STAGE1(pmap); 6092 l3 = NULL; 6093 oldl2 = pmap_load(l2); 6094 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 6095 ("pmap_demote_l2: Demoting a non-block entry")); 6096 va &= ~L2_OFFSET; 6097 6098 tmpl2 = 0; 6099 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 6100 tmpl2 = kva_alloc(PAGE_SIZE); 6101 if (tmpl2 == 0) 6102 return (NULL); 6103 } 6104 6105 /* 6106 * Invalidate the 2MB page mapping and return "failure" if the 6107 * mapping was never accessed. 
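	 * Removing an unaccessed mapping is cheaper than demoting it and,
	 * because such a mapping cannot be wired (see the KASSERT below), it
	 * can simply be recreated by a later fault if it is needed again.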
6108 */ 6109 if ((oldl2 & ATTR_AF) == 0) { 6110 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6111 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 6112 pmap_demote_l2_abort(pmap, va, l2, lockp); 6113 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 6114 va, pmap); 6115 goto fail; 6116 } 6117 6118 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 6119 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 6120 ("pmap_demote_l2: page table page for a wired mapping" 6121 " is missing")); 6122 6123 /* 6124 * If the page table page is missing and the mapping 6125 * is for a kernel address, the mapping must belong to 6126 * the direct map. Page table pages are preallocated 6127 * for every other part of the kernel address space, 6128 * so the direct map region is the only part of the 6129 * kernel address space that must be handled here. 6130 */ 6131 KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), 6132 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 6133 6134 /* 6135 * If the 2MB page mapping belongs to the direct map 6136 * region of the kernel's address space, then the page 6137 * allocation request specifies the highest possible 6138 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6139 * priority is normal. 6140 */ 6141 ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), 6142 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 6143 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 6144 6145 /* 6146 * If the allocation of the new page table page fails, 6147 * invalidate the 2MB page mapping and return "failure". 6148 */ 6149 if (ml3 == NULL) { 6150 pmap_demote_l2_abort(pmap, va, l2, lockp); 6151 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 6152 " in pmap %p", va, pmap); 6153 goto fail; 6154 } 6155 6156 if (va < VM_MAXUSER_ADDRESS) { 6157 ml3->ref_count = NL3PG; 6158 pmap_resident_count_inc(pmap, 1); 6159 } 6160 } 6161 l3phys = VM_PAGE_TO_PHYS(ml3); 6162 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 6163 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 6164 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 6165 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 6166 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 6167 6168 /* 6169 * If the page table page is not leftover from an earlier promotion, 6170 * or the mapping attributes have changed, (re)initialize the L3 table. 6171 * 6172 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 6173 * performs a dsb(). That dsb() ensures that the stores for filling 6174 * "l3" are visible before "l3" is added to the page table. 6175 */ 6176 if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) 6177 pmap_fill_l3(l3, newl3); 6178 6179 /* 6180 * Map the temporary page so we don't lose access to the l2 table. 6181 */ 6182 if (tmpl2 != 0) { 6183 pmap_kenter(tmpl2, PAGE_SIZE, 6184 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 6185 VM_MEMATTR_WRITE_BACK); 6186 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 6187 } 6188 6189 /* 6190 * The spare PV entries must be reserved prior to demoting the 6191 * mapping, that is, prior to changing the PDE. Otherwise, the state 6192 * of the L2 and the PV lists will be inconsistent, which can result 6193 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6194 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 6195 * PV entry for the 2MB page mapping that is being demoted. 
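	 *
	 * Only Ln_ENTRIES - 1 new entries are reserved because the PV entry
	 * that already describes the 2MB mapping is reused for one of the
	 * resulting 4KB mappings.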
6196 */ 6197 if ((oldl2 & ATTR_SW_MANAGED) != 0) 6198 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 6199 6200 /* 6201 * Pass PAGE_SIZE so that a single TLB invalidation is performed on 6202 * the 2MB page mapping. 6203 */ 6204 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); 6205 6206 /* 6207 * Demote the PV entry. 6208 */ 6209 if ((oldl2 & ATTR_SW_MANAGED) != 0) 6210 pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); 6211 6212 atomic_add_long(&pmap_l2_demotions, 1); 6213 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" 6214 " in pmap %p %lx", va, pmap, l3[0]); 6215 6216 fail: 6217 if (tmpl2 != 0) { 6218 pmap_kremove(tmpl2); 6219 kva_free(tmpl2, PAGE_SIZE); 6220 } 6221 6222 return (l3); 6223 6224 } 6225 6226 static pt_entry_t * 6227 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 6228 { 6229 struct rwlock *lock; 6230 pt_entry_t *l3; 6231 6232 lock = NULL; 6233 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); 6234 if (lock != NULL) 6235 rw_wunlock(lock); 6236 return (l3); 6237 } 6238 6239 /* 6240 * Perform the pmap work for mincore(2). If the page is not both referenced and 6241 * modified by this pmap, returns its physical address so that the caller can 6242 * find other mappings. 6243 */ 6244 int 6245 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 6246 { 6247 pt_entry_t *pte, tpte; 6248 vm_paddr_t mask, pa; 6249 int lvl, val; 6250 bool managed; 6251 6252 PMAP_ASSERT_STAGE1(pmap); 6253 PMAP_LOCK(pmap); 6254 pte = pmap_pte(pmap, addr, &lvl); 6255 if (pte != NULL) { 6256 tpte = pmap_load(pte); 6257 6258 switch (lvl) { 6259 case 3: 6260 mask = L3_OFFSET; 6261 break; 6262 case 2: 6263 mask = L2_OFFSET; 6264 break; 6265 case 1: 6266 mask = L1_OFFSET; 6267 break; 6268 default: 6269 panic("pmap_mincore: invalid level %d", lvl); 6270 } 6271 6272 managed = (tpte & ATTR_SW_MANAGED) != 0; 6273 val = MINCORE_INCORE; 6274 if (lvl != 3) 6275 val |= MINCORE_PSIND(3 - lvl); 6276 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 6277 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 6278 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6279 if ((tpte & ATTR_AF) == ATTR_AF) 6280 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6281 6282 pa = (tpte & ~ATTR_MASK) | (addr & mask); 6283 } else { 6284 managed = false; 6285 val = 0; 6286 } 6287 6288 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6289 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6290 *pap = pa; 6291 } 6292 PMAP_UNLOCK(pmap); 6293 return (val); 6294 } 6295 6296 /* 6297 * Garbage collect every ASID that is neither active on a processor nor 6298 * reserved. 6299 */ 6300 static void 6301 pmap_reset_asid_set(pmap_t pmap) 6302 { 6303 pmap_t curpmap; 6304 int asid, cpuid, epoch; 6305 struct asid_set *set; 6306 enum pmap_stage stage; 6307 6308 set = pmap->pm_asid_set; 6309 stage = pmap->pm_stage; 6310 6311 set = pmap->pm_asid_set; 6312 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 6313 mtx_assert(&set->asid_set_mutex, MA_OWNED); 6314 6315 /* 6316 * Ensure that the store to asid_epoch is globally visible before the 6317 * loads from pc_curpmap are performed. 
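	 * The dsb(ishst) that follows the store provides this ordering.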
6318 */ 6319 epoch = set->asid_epoch + 1; 6320 if (epoch == INT_MAX) 6321 epoch = 0; 6322 set->asid_epoch = epoch; 6323 dsb(ishst); 6324 if (stage == PM_STAGE1) { 6325 __asm __volatile("tlbi vmalle1is"); 6326 } else { 6327 KASSERT(pmap_clean_stage2_tlbi != NULL, 6328 ("%s: Unset stage 2 tlb invalidation callback\n", 6329 __func__)); 6330 pmap_clean_stage2_tlbi(); 6331 } 6332 dsb(ish); 6333 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 6334 set->asid_set_size - 1); 6335 CPU_FOREACH(cpuid) { 6336 if (cpuid == curcpu) 6337 continue; 6338 if (stage == PM_STAGE1) { 6339 curpmap = pcpu_find(cpuid)->pc_curpmap; 6340 PMAP_ASSERT_STAGE1(pmap); 6341 } else { 6342 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 6343 if (curpmap == NULL) 6344 continue; 6345 PMAP_ASSERT_STAGE2(pmap); 6346 } 6347 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 6348 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 6349 if (asid == -1) 6350 continue; 6351 bit_set(set->asid_set, asid); 6352 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 6353 } 6354 } 6355 6356 /* 6357 * Allocate a new ASID for the specified pmap. 6358 */ 6359 static void 6360 pmap_alloc_asid(pmap_t pmap) 6361 { 6362 struct asid_set *set; 6363 int new_asid; 6364 6365 set = pmap->pm_asid_set; 6366 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 6367 6368 mtx_lock_spin(&set->asid_set_mutex); 6369 6370 /* 6371 * While this processor was waiting to acquire the asid set mutex, 6372 * pmap_reset_asid_set() running on another processor might have 6373 * updated this pmap's cookie to the current epoch. In which case, we 6374 * don't need to allocate a new ASID. 6375 */ 6376 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 6377 goto out; 6378 6379 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 6380 &new_asid); 6381 if (new_asid == -1) { 6382 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 6383 set->asid_next, &new_asid); 6384 if (new_asid == -1) { 6385 pmap_reset_asid_set(pmap); 6386 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 6387 set->asid_set_size, &new_asid); 6388 KASSERT(new_asid != -1, ("ASID allocation failure")); 6389 } 6390 } 6391 bit_set(set->asid_set, new_asid); 6392 set->asid_next = new_asid + 1; 6393 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 6394 out: 6395 mtx_unlock_spin(&set->asid_set_mutex); 6396 } 6397 6398 /* 6399 * Compute the value that should be stored in ttbr0 to activate the specified 6400 * pmap. This value may change from time to time. 6401 */ 6402 uint64_t 6403 pmap_to_ttbr0(pmap_t pmap) 6404 { 6405 6406 return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | 6407 pmap->pm_ttbr); 6408 } 6409 6410 static bool 6411 pmap_activate_int(pmap_t pmap) 6412 { 6413 struct asid_set *set; 6414 int epoch; 6415 6416 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 6417 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 6418 6419 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 6420 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 6421 /* 6422 * Handle the possibility that the old thread was preempted 6423 * after an "ic" or "tlbi" instruction but before it performed 6424 * a "dsb" instruction. If the old thread migrates to a new 6425 * processor, its completion of a "dsb" instruction on that 6426 * new processor does not guarantee that the "ic" or "tlbi" 6427 * instructions performed on the old processor have completed. 
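		 * The "dsb" below is issued unconditionally to cover that
		 * case.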
6428 */ 6429 dsb(ish); 6430 return (false); 6431 } 6432 6433 set = pmap->pm_asid_set; 6434 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 6435 6436 /* 6437 * Ensure that the store to curpmap is globally visible before the 6438 * load from asid_epoch is performed. 6439 */ 6440 if (pmap->pm_stage == PM_STAGE1) 6441 PCPU_SET(curpmap, pmap); 6442 else 6443 PCPU_SET(curvmpmap, pmap); 6444 dsb(ish); 6445 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 6446 if (epoch >= 0 && epoch != set->asid_epoch) 6447 pmap_alloc_asid(pmap); 6448 6449 if (pmap->pm_stage == PM_STAGE1) { 6450 set_ttbr0(pmap_to_ttbr0(pmap)); 6451 if (PCPU_GET(bcast_tlbi_workaround) != 0) 6452 invalidate_local_icache(); 6453 } 6454 return (true); 6455 } 6456 6457 void 6458 pmap_activate_vm(pmap_t pmap) 6459 { 6460 6461 PMAP_ASSERT_STAGE2(pmap); 6462 6463 (void)pmap_activate_int(pmap); 6464 } 6465 6466 void 6467 pmap_activate(struct thread *td) 6468 { 6469 pmap_t pmap; 6470 6471 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6472 PMAP_ASSERT_STAGE1(pmap); 6473 critical_enter(); 6474 (void)pmap_activate_int(pmap); 6475 critical_exit(); 6476 } 6477 6478 /* 6479 * To eliminate the unused parameter "old", we would have to add an instruction 6480 * to cpu_switch(). 6481 */ 6482 struct pcb * 6483 pmap_switch(struct thread *old __unused, struct thread *new) 6484 { 6485 pcpu_bp_harden bp_harden; 6486 struct pcb *pcb; 6487 6488 /* Store the new curthread */ 6489 PCPU_SET(curthread, new); 6490 6491 /* And the new pcb */ 6492 pcb = new->td_pcb; 6493 PCPU_SET(curpcb, pcb); 6494 6495 /* 6496 * TODO: We may need to flush the cache here if switching 6497 * to a user process. 6498 */ 6499 6500 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { 6501 /* 6502 * Stop userspace from training the branch predictor against 6503 * other processes. This will call into a CPU specific 6504 * function that clears the branch predictor state. 6505 */ 6506 bp_harden = PCPU_GET(bp_harden); 6507 if (bp_harden != NULL) 6508 bp_harden(); 6509 } 6510 6511 return (pcb); 6512 } 6513 6514 void 6515 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 6516 { 6517 6518 PMAP_ASSERT_STAGE1(pmap); 6519 if (va >= VM_MIN_KERNEL_ADDRESS) { 6520 cpu_icache_sync_range(va, sz); 6521 } else { 6522 u_int len, offset; 6523 vm_paddr_t pa; 6524 6525 /* Find the length of data in this page to flush */ 6526 offset = va & PAGE_MASK; 6527 len = imin(PAGE_SIZE - offset, sz); 6528 6529 while (sz != 0) { 6530 /* Extract the physical address & find it in the DMAP */ 6531 pa = pmap_extract(pmap, va); 6532 if (pa != 0) 6533 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); 6534 6535 /* Move to the next page */ 6536 sz -= len; 6537 va += len; 6538 /* Set the length for the next iteration */ 6539 len = imin(PAGE_SIZE, sz); 6540 } 6541 } 6542 } 6543 6544 static int 6545 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) 6546 { 6547 pd_entry_t *pdep; 6548 pt_entry_t *ptep, pte; 6549 int rv, lvl, dfsc; 6550 6551 PMAP_ASSERT_STAGE2(pmap); 6552 rv = KERN_FAILURE; 6553 6554 /* Data and insn aborts use same encoding for FSC field. 
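	 * Only translation faults (ISS_DATA_DFSC_TF_*) and access flag faults
	 * (ISS_DATA_DFSC_AFF_*) are handled here; any other fault status is
	 * returned to the caller as KERN_FAILURE.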
	 */
	dfsc = esr & ISS_DATA_DFSC_MASK;
	switch (dfsc) {
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		PMAP_LOCK(pmap);
		pdep = pmap_pde(pmap, far, &lvl);
		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
			PMAP_UNLOCK(pmap);
			break;
		}

		switch (lvl) {
		case 0:
			ptep = pmap_l0_to_l1(pdep, far);
			break;
		case 1:
			ptep = pmap_l1_to_l2(pdep, far);
			break;
		case 2:
			ptep = pmap_l2_to_l3(pdep, far);
			break;
		default:
			panic("%s: Invalid pde level %d", __func__, lvl);
		}
		goto fault_exec;

	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
fault_exec:
		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
			if (icache_vmid) {
				pmap_invalidate_vpipt_icache();
			} else {
				/*
				 * If accessing an executable page invalidate
				 * the I-cache so it will be valid when we
				 * continue execution in the guest. The D-cache
				 * is assumed to already be clean to the Point
				 * of Coherency.
				 */
				if ((pte & ATTR_S2_XN_MASK) !=
				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
					invalidate_icache();
				}
			}
			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	}

	return (rv);
}

int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use same encoding for FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_invalidate_page(pmap, far);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation.  A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section.  Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
6688 */ 6689 if (pmap_klookup(far, NULL)) 6690 rv = KERN_SUCCESS; 6691 } else { 6692 PMAP_LOCK(pmap); 6693 /* Ask the MMU to check the address. */ 6694 intr = intr_disable(); 6695 par = arm64_address_translate_s1e0r(far); 6696 intr_restore(intr); 6697 PMAP_UNLOCK(pmap); 6698 6699 /* 6700 * If the translation was successful, then we can 6701 * return success to the trap handler. 6702 */ 6703 if (PAR_SUCCESS(par)) 6704 rv = KERN_SUCCESS; 6705 } 6706 break; 6707 } 6708 6709 return (rv); 6710 } 6711 6712 /* 6713 * Increase the starting virtual address of the given mapping if a 6714 * different alignment might result in more superpage mappings. 6715 */ 6716 void 6717 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6718 vm_offset_t *addr, vm_size_t size) 6719 { 6720 vm_offset_t superpage_offset; 6721 6722 if (size < L2_SIZE) 6723 return; 6724 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6725 offset += ptoa(object->pg_color); 6726 superpage_offset = offset & L2_OFFSET; 6727 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 6728 (*addr & L2_OFFSET) == superpage_offset) 6729 return; 6730 if ((*addr & L2_OFFSET) < superpage_offset) 6731 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 6732 else 6733 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 6734 } 6735 6736 /** 6737 * Get the kernel virtual address of a set of physical pages. If there are 6738 * physical addresses not covered by the DMAP perform a transient mapping 6739 * that will be removed when calling pmap_unmap_io_transient. 6740 * 6741 * \param page The pages the caller wishes to obtain the virtual 6742 * address on the kernel memory map. 6743 * \param vaddr On return contains the kernel virtual memory address 6744 * of the pages passed in the page parameter. 6745 * \param count Number of pages passed in. 6746 * \param can_fault TRUE if the thread using the mapped pages can take 6747 * page faults, FALSE otherwise. 6748 * 6749 * \returns TRUE if the caller must call pmap_unmap_io_transient when 6750 * finished or FALSE otherwise. 6751 * 6752 */ 6753 boolean_t 6754 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 6755 boolean_t can_fault) 6756 { 6757 vm_paddr_t paddr; 6758 boolean_t needs_mapping; 6759 int error, i; 6760 6761 /* 6762 * Allocate any KVA space that we need, this is done in a separate 6763 * loop to prevent calling vmem_alloc while pinned. 
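	 * (vmem_alloc() is called with M_WAITOK and may sleep, which we want
	 * to avoid while the thread is pinned by sched_pin() below.)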
6764 */ 6765 needs_mapping = FALSE; 6766 for (i = 0; i < count; i++) { 6767 paddr = VM_PAGE_TO_PHYS(page[i]); 6768 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 6769 error = vmem_alloc(kernel_arena, PAGE_SIZE, 6770 M_BESTFIT | M_WAITOK, &vaddr[i]); 6771 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 6772 needs_mapping = TRUE; 6773 } else { 6774 vaddr[i] = PHYS_TO_DMAP(paddr); 6775 } 6776 } 6777 6778 /* Exit early if everything is covered by the DMAP */ 6779 if (!needs_mapping) 6780 return (FALSE); 6781 6782 if (!can_fault) 6783 sched_pin(); 6784 for (i = 0; i < count; i++) { 6785 paddr = VM_PAGE_TO_PHYS(page[i]); 6786 if (!PHYS_IN_DMAP(paddr)) { 6787 panic( 6788 "pmap_map_io_transient: TODO: Map out of DMAP data"); 6789 } 6790 } 6791 6792 return (needs_mapping); 6793 } 6794 6795 void 6796 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 6797 boolean_t can_fault) 6798 { 6799 vm_paddr_t paddr; 6800 int i; 6801 6802 if (!can_fault) 6803 sched_unpin(); 6804 for (i = 0; i < count; i++) { 6805 paddr = VM_PAGE_TO_PHYS(page[i]); 6806 if (!PHYS_IN_DMAP(paddr)) { 6807 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 6808 } 6809 } 6810 } 6811 6812 boolean_t 6813 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 6814 { 6815 6816 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 6817 } 6818 6819 /* 6820 * Track a range of the kernel's virtual address space that is contiguous 6821 * in various mapping attributes. 6822 */ 6823 struct pmap_kernel_map_range { 6824 vm_offset_t sva; 6825 pt_entry_t attrs; 6826 int l3pages; 6827 int l3contig; 6828 int l2blocks; 6829 int l1blocks; 6830 }; 6831 6832 static void 6833 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 6834 vm_offset_t eva) 6835 { 6836 const char *mode; 6837 int index; 6838 6839 if (eva <= range->sva) 6840 return; 6841 6842 index = range->attrs & ATTR_S1_IDX_MASK; 6843 switch (index) { 6844 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 6845 mode = "DEV"; 6846 break; 6847 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 6848 mode = "UC"; 6849 break; 6850 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 6851 mode = "WB"; 6852 break; 6853 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 6854 mode = "WT"; 6855 break; 6856 default: 6857 printf( 6858 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 6859 __func__, index, range->sva, eva); 6860 mode = "??"; 6861 break; 6862 } 6863 6864 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c %3s %d %d %d %d\n", 6865 range->sva, eva, 6866 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 6867 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 6868 (range->attrs & ATTR_S1_AP_USER) != 0 ? 'u' : 's', 6869 mode, range->l1blocks, range->l2blocks, range->l3contig, 6870 range->l3pages); 6871 6872 /* Reset to sentinel value. */ 6873 range->sva = 0xfffffffffffffffful; 6874 } 6875 6876 /* 6877 * Determine whether the attributes specified by a page table entry match those 6878 * being tracked by the current range. 6879 */ 6880 static bool 6881 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 6882 { 6883 6884 return (range->attrs == attrs); 6885 } 6886 6887 static void 6888 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 6889 pt_entry_t attrs) 6890 { 6891 6892 memset(range, 0, sizeof(*range)); 6893 range->sva = va; 6894 range->attrs = attrs; 6895 } 6896 6897 /* 6898 * Given a leaf PTE, derive the mapping's attributes. 
If they do not match 6899 * those of the current run, dump the address range and its attributes, and 6900 * begin a new run. 6901 */ 6902 static void 6903 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 6904 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 6905 pt_entry_t l3e) 6906 { 6907 pt_entry_t attrs; 6908 6909 attrs = l0e & (ATTR_S1_AP_MASK | ATTR_S1_XN); 6910 attrs |= l1e & (ATTR_S1_AP_MASK | ATTR_S1_XN); 6911 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) 6912 attrs |= l1e & ATTR_S1_IDX_MASK; 6913 attrs |= l2e & (ATTR_S1_AP_MASK | ATTR_S1_XN); 6914 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) 6915 attrs |= l2e & ATTR_S1_IDX_MASK; 6916 attrs |= l3e & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK); 6917 6918 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 6919 sysctl_kmaps_dump(sb, range, va); 6920 sysctl_kmaps_reinit(range, va, attrs); 6921 } 6922 } 6923 6924 static int 6925 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 6926 { 6927 struct pmap_kernel_map_range range; 6928 struct sbuf sbuf, *sb; 6929 pd_entry_t l0e, *l1, l1e, *l2, l2e; 6930 pt_entry_t *l3, l3e; 6931 vm_offset_t sva; 6932 vm_paddr_t pa; 6933 int error, i, j, k, l; 6934 6935 error = sysctl_wire_old_buffer(req, 0); 6936 if (error != 0) 6937 return (error); 6938 sb = &sbuf; 6939 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 6940 6941 /* Sentinel value. */ 6942 range.sva = 0xfffffffffffffffful; 6943 6944 /* 6945 * Iterate over the kernel page tables without holding the kernel pmap 6946 * lock. Kernel page table pages are never freed, so at worst we will 6947 * observe inconsistencies in the output. 6948 */ 6949 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 6950 i++) { 6951 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 6952 sbuf_printf(sb, "\nDirect map:\n"); 6953 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 6954 sbuf_printf(sb, "\nKernel map:\n"); 6955 6956 l0e = kernel_pmap->pm_l0[i]; 6957 if ((l0e & ATTR_DESCR_VALID) == 0) { 6958 sysctl_kmaps_dump(sb, &range, sva); 6959 sva += L0_SIZE; 6960 continue; 6961 } 6962 pa = l0e & ~ATTR_MASK; 6963 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 6964 6965 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 6966 l1e = l1[j]; 6967 if ((l1e & ATTR_DESCR_VALID) == 0) { 6968 sysctl_kmaps_dump(sb, &range, sva); 6969 sva += L1_SIZE; 6970 continue; 6971 } 6972 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 6973 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 6974 0, 0); 6975 range.l1blocks++; 6976 sva += L1_SIZE; 6977 continue; 6978 } 6979 pa = l1e & ~ATTR_MASK; 6980 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 6981 6982 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 6983 l2e = l2[k]; 6984 if ((l2e & ATTR_DESCR_VALID) == 0) { 6985 sysctl_kmaps_dump(sb, &range, sva); 6986 sva += L2_SIZE; 6987 continue; 6988 } 6989 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 6990 sysctl_kmaps_check(sb, &range, sva, 6991 l0e, l1e, l2e, 0); 6992 range.l2blocks++; 6993 sva += L2_SIZE; 6994 continue; 6995 } 6996 pa = l2e & ~ATTR_MASK; 6997 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 6998 6999 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 7000 l++, sva += L3_SIZE) { 7001 l3e = l3[l]; 7002 if ((l3e & ATTR_DESCR_VALID) == 0) { 7003 sysctl_kmaps_dump(sb, &range, 7004 sva); 7005 continue; 7006 } 7007 sysctl_kmaps_check(sb, &range, sva, 7008 l0e, l1e, l2e, l3e); 7009 if ((l3e & ATTR_CONTIGUOUS) != 0) 7010 range.l3contig += l % 16 == 0 ? 
7011 1 : 0; 7012 else 7013 range.l3pages++; 7014 } 7015 } 7016 } 7017 } 7018 7019 error = sbuf_finish(sb); 7020 sbuf_delete(sb); 7021 return (error); 7022 } 7023 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 7024 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 7025 NULL, 0, sysctl_kmaps, "A", 7026 "Dump kernel address layout"); 7027
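/*
 * Example (illustrative only): the map can be dumped from userland with
 *
 *	# sysctl -n vm.pmap.kernel_maps
 *
 * which prints one line per contiguous range, giving its bounds, access
 * permissions, memory type, and the number of L1 blocks, L2 blocks,
 * contiguous L3 runs, and L3 pages backing it.  Because the node is marked
 * CTLFLAG_SKIP, it is not shown by a plain "sysctl vm.pmap" listing but can
 * still be queried by name.
 */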