1 /*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * Copyright (c) 2014 Andrew Turner 13 * All rights reserved. 14 * Copyright (c) 2014-2016 The FreeBSD Foundation 15 * All rights reserved. 16 * 17 * This code is derived from software contributed to Berkeley by 18 * the Systems Programming Group of the University of Utah Computer 19 * Science Department and William Jolitz of UUNET Technologies Inc. 20 * 21 * This software was developed by Andrew Turner under sponsorship from 22 * the FreeBSD Foundation. 23 * 24 * Redistribution and use in source and binary forms, with or without 25 * modification, are permitted provided that the following conditions 26 * are met: 27 * 1. Redistributions of source code must retain the above copyright 28 * notice, this list of conditions and the following disclaimer. 29 * 2. Redistributions in binary form must reproduce the above copyright 30 * notice, this list of conditions and the following disclaimer in the 31 * documentation and/or other materials provided with the distribution. 32 * 3. All advertising materials mentioning features or use of this software 33 * must display the following acknowledgement: 34 * This product includes software developed by the University of 35 * California, Berkeley and its contributors. 36 * 4. Neither the name of the University nor the names of its contributors 37 * may be used to endorse or promote products derived from this software 38 * without specific prior written permission. 39 * 40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 * 52 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 53 */ 54 /*- 55 * Copyright (c) 2003 Networks Associates Technology, Inc. 56 * All rights reserved. 57 * 58 * This software was developed for the FreeBSD Project by Jake Burkholder, 59 * Safeport Network Services, and Network Associates Laboratories, the 60 * Security Research Division of Network Associates, Inc. under 61 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 62 * CHATS research program. 63 * 64 * Redistribution and use in source and binary forms, with or without 65 * modification, are permitted provided that the following conditions 66 * are met: 67 * 1. Redistributions of source code must retain the above copyright 68 * notice, this list of conditions and the following disclaimer. 69 * 2. Redistributions in binary form must reproduce the above copyright 70 * notice, this list of conditions and the following disclaimer in the 71 * documentation and/or other materials provided with the distribution. 72 * 73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 83 * SUCH DAMAGE. 84 */ 85 86 #include <sys/cdefs.h> 87 __FBSDID("$FreeBSD$"); 88 89 /* 90 * Manages physical address maps. 91 * 92 * Since the information managed by this module is 93 * also stored by the logical address mapping module, 94 * this module may throw away valid virtual-to-physical 95 * mappings at almost any time. However, invalidations 96 * of virtual-to-physical mappings must be done as 97 * requested. 98 * 99 * In order to cope with hardware architectures which 100 * make virtual-to-physical map invalidates expensive, 101 * this module may delay invalidate or reduced protection 102 * operations until such time as they are actually 103 * necessary. This module is given full information as 104 * to which processors are currently using which maps, 105 * and to when physical maps must be made correct. 106 */ 107 108 #include "opt_vm.h" 109 110 #include <sys/param.h> 111 #include <sys/bitstring.h> 112 #include <sys/bus.h> 113 #include <sys/systm.h> 114 #include <sys/kernel.h> 115 #include <sys/ktr.h> 116 #include <sys/limits.h> 117 #include <sys/lock.h> 118 #include <sys/malloc.h> 119 #include <sys/mman.h> 120 #include <sys/msgbuf.h> 121 #include <sys/mutex.h> 122 #include <sys/proc.h> 123 #include <sys/rwlock.h> 124 #include <sys/sbuf.h> 125 #include <sys/sx.h> 126 #include <sys/vmem.h> 127 #include <sys/vmmeter.h> 128 #include <sys/sched.h> 129 #include <sys/sysctl.h> 130 #include <sys/_unrhdr.h> 131 #include <sys/smp.h> 132 133 #include <vm/vm.h> 134 #include <vm/vm_param.h> 135 #include <vm/vm_kern.h> 136 #include <vm/vm_page.h> 137 #include <vm/vm_map.h> 138 #include <vm/vm_object.h> 139 #include <vm/vm_extern.h> 140 #include <vm/vm_pageout.h> 141 #include <vm/vm_pager.h> 142 #include <vm/vm_phys.h> 143 #include <vm/vm_radix.h> 144 #include <vm/vm_reserv.h> 145 #include <vm/uma.h> 146 147 #include <machine/machdep.h> 148 #include <machine/md_var.h> 149 #include <machine/pcb.h> 150 151 #include <arm/include/physmem.h> 152 153 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) 154 155 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) 156 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) 157 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) 158 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) 159 160 #define NUL0E L0_ENTRIES 161 #define NUL1E (NUL0E * NL1PG) 162 #define NUL2E (NUL1E * NL2PG) 163 164 #if !defined(DIAGNOSTIC) 165 #ifdef __GNUC_GNU_INLINE__ 166 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 167 #else 168 #define PMAP_INLINE extern inline 169 #endif 170 #else 171 #define PMAP_INLINE 172 #endif 173 174 #ifdef PV_STATS 175 #define PV_STAT(x) do { x ; } while (0) 176 #else 177 #define PV_STAT(x) do { } while (0) 178 #endif 179 180 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 181 #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) 182 183 #define NPV_LIST_LOCKS MAXCPU 184 185 #define PHYS_TO_PV_LIST_LOCK(pa) \ 186 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 187 188 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 189 struct rwlock **_lockp = (lockp); \ 190 struct rwlock *_new_lock; \ 191 \ 192 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 193 if (_new_lock != *_lockp) { \ 194 if (*_lockp != NULL) \ 195 rw_wunlock(*_lockp); \ 196 *_lockp = _new_lock; \ 197 rw_wlock(*_lockp); \ 198 } \ 199 } while (0) 200 201 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 202 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 203 204 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 205 struct rwlock **_lockp = (lockp); \ 206 \ 207 if (*_lockp != NULL) { \ 208 rw_wunlock(*_lockp); \ 209 *_lockp = NULL; \ 210 } \ 211 } while (0) 212 213 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 214 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 215 216 /* 217 * The presence of this flag indicates that the mapping is writeable. 218 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise 219 * it is dirty. This flag may only be set on managed mappings. 220 * 221 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it 222 * as a software managed bit. 223 */ 224 #define ATTR_SW_DBM ATTR_DBM 225 226 struct pmap kernel_pmap_store; 227 228 /* Used for mapping ACPI memory before VM is initialized */ 229 #define PMAP_PREINIT_MAPPING_COUNT 32 230 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) 231 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ 232 static int vm_initialized = 0; /* No need to use pre-init maps when set */ 233 234 /* 235 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. 236 * Always map entire L2 block for simplicity. 237 * VA of L2 block = preinit_map_va + i * L2_SIZE 238 */ 239 static struct pmap_preinit_mapping { 240 vm_paddr_t pa; 241 vm_offset_t va; 242 vm_size_t size; 243 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 244 245 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 246 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 247 vm_offset_t kernel_vm_end = 0; 248 249 /* 250 * Data for the pv entry allocation mechanism. 251 */ 252 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 253 static struct mtx pv_chunks_mutex; 254 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 255 static struct md_page *pv_table; 256 static struct md_page pv_dummy; 257 258 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 259 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 260 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 261 262 /* This code assumes all L1 DMAP entries will be used */ 263 CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); 264 CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); 265 266 #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) 267 extern pt_entry_t pagetable_dmap[]; 268 269 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 270 static vm_paddr_t physmap[PHYSMAP_SIZE]; 271 static u_int physmap_idx; 272 273 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 274 "VM/pmap parameters"); 275 276 /* 277 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs 278 * that it has currently allocated to a pmap, a cursor ("asid_next") to 279 * optimize its search for a free ASID in the bit vector, and an epoch number 280 * ("asid_epoch") to indicate when it has reclaimed all previously allocated 281 * ASIDs that are not currently active on a processor. 282 * 283 * The current epoch number is always in the range [0, INT_MAX). Negative 284 * numbers and INT_MAX are reserved for special cases that are described 285 * below. 286 */ 287 struct asid_set { 288 int asid_bits; 289 bitstr_t *asid_set; 290 int asid_set_size; 291 int asid_next; 292 int asid_epoch; 293 struct mtx asid_set_mutex; 294 }; 295 296 static struct asid_set asids; 297 298 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 299 "ASID allocator"); 300 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0, 301 "The number of bits in an ASID"); 302 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0, 303 "The last allocated ASID plus one"); 304 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0, 305 "The current epoch number"); 306 307 /* 308 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved 309 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for 310 * dynamically allocated ASIDs have a non-negative epoch number. 311 * 312 * An invalid ASID is represented by -1. 313 * 314 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN), 315 * which indicates that an ASID should never be allocated to the pmap, and 316 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be 317 * allocated when the pmap is next activated. 318 */ 319 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \ 320 ((u_long)(epoch) << 32))) 321 #define COOKIE_TO_ASID(cookie) ((int)(cookie)) 322 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32)) 323 324 static int superpages_enabled = 1; 325 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 326 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, 327 "Are large page mappings enabled?"); 328 329 /* 330 * Internal flags for pmap_enter()'s helper functions. 331 */ 332 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 333 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 334 335 static void free_pv_chunk(struct pv_chunk *pc); 336 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 337 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 338 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 339 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 340 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 341 vm_offset_t va); 342 343 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 344 static bool pmap_activate_int(pmap_t pmap); 345 static void pmap_alloc_asid(pmap_t pmap); 346 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 347 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); 348 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, 349 vm_offset_t va, struct rwlock **lockp); 350 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 351 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 352 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 353 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 354 u_int flags, vm_page_t m, struct rwlock **lockp); 355 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 356 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); 357 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 358 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); 359 static void pmap_reset_asid_set(pmap_t pmap); 360 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 361 vm_page_t m, struct rwlock **lockp); 362 363 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 364 struct rwlock **lockp); 365 366 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, 367 struct spglist *free); 368 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 369 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 370 371 /* 372 * These load the old table data and store the new value. 373 * They need to be atomic as the System MMU may write to the table at 374 * the same time as the CPU. 375 */ 376 #define pmap_clear(table) atomic_store_64(table, 0) 377 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) 378 #define pmap_load(table) (*table) 379 #define pmap_load_clear(table) atomic_swap_64(table, 0) 380 #define pmap_load_store(table, entry) atomic_swap_64(table, entry) 381 #define pmap_set_bits(table, bits) atomic_set_64(table, bits) 382 #define pmap_store(table, entry) atomic_store_64(table, entry) 383 384 /********************/ 385 /* Inline functions */ 386 /********************/ 387 388 static __inline void 389 pagecopy(void *s, void *d) 390 { 391 392 memcpy(d, s, PAGE_SIZE); 393 } 394 395 static __inline pd_entry_t * 396 pmap_l0(pmap_t pmap, vm_offset_t va) 397 { 398 399 return (&pmap->pm_l0[pmap_l0_index(va)]); 400 } 401 402 static __inline pd_entry_t * 403 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 404 { 405 pd_entry_t *l1; 406 407 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); 408 return (&l1[pmap_l1_index(va)]); 409 } 410 411 static __inline pd_entry_t * 412 pmap_l1(pmap_t pmap, vm_offset_t va) 413 { 414 pd_entry_t *l0; 415 416 l0 = pmap_l0(pmap, va); 417 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) 418 return (NULL); 419 420 return (pmap_l0_to_l1(l0, va)); 421 } 422 423 static __inline pd_entry_t * 424 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 425 { 426 pd_entry_t *l2; 427 428 l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); 429 return (&l2[pmap_l2_index(va)]); 430 } 431 432 static __inline pd_entry_t * 433 pmap_l2(pmap_t pmap, vm_offset_t va) 434 { 435 pd_entry_t *l1; 436 437 l1 = pmap_l1(pmap, va); 438 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) 439 return (NULL); 440 441 return (pmap_l1_to_l2(l1, va)); 442 } 443 444 static __inline pt_entry_t * 445 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 446 { 447 pt_entry_t *l3; 448 449 l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); 450 return (&l3[pmap_l3_index(va)]); 451 } 452 453 /* 454 * Returns the lowest valid pde for a given virtual address. 455 * The next level may or may not point to a valid page or block. 456 */ 457 static __inline pd_entry_t * 458 pmap_pde(pmap_t pmap, vm_offset_t va, int *level) 459 { 460 pd_entry_t *l0, *l1, *l2, desc; 461 462 l0 = pmap_l0(pmap, va); 463 desc = pmap_load(l0) & ATTR_DESCR_MASK; 464 if (desc != L0_TABLE) { 465 *level = -1; 466 return (NULL); 467 } 468 469 l1 = pmap_l0_to_l1(l0, va); 470 desc = pmap_load(l1) & ATTR_DESCR_MASK; 471 if (desc != L1_TABLE) { 472 *level = 0; 473 return (l0); 474 } 475 476 l2 = pmap_l1_to_l2(l1, va); 477 desc = pmap_load(l2) & ATTR_DESCR_MASK; 478 if (desc != L2_TABLE) { 479 *level = 1; 480 return (l1); 481 } 482 483 *level = 2; 484 return (l2); 485 } 486 487 /* 488 * Returns the lowest valid pte block or table entry for a given virtual 489 * address. If there are no valid entries return NULL and set the level to 490 * the first invalid level. 491 */ 492 static __inline pt_entry_t * 493 pmap_pte(pmap_t pmap, vm_offset_t va, int *level) 494 { 495 pd_entry_t *l1, *l2, desc; 496 pt_entry_t *l3; 497 498 l1 = pmap_l1(pmap, va); 499 if (l1 == NULL) { 500 *level = 0; 501 return (NULL); 502 } 503 desc = pmap_load(l1) & ATTR_DESCR_MASK; 504 if (desc == L1_BLOCK) { 505 *level = 1; 506 return (l1); 507 } 508 509 if (desc != L1_TABLE) { 510 *level = 1; 511 return (NULL); 512 } 513 514 l2 = pmap_l1_to_l2(l1, va); 515 desc = pmap_load(l2) & ATTR_DESCR_MASK; 516 if (desc == L2_BLOCK) { 517 *level = 2; 518 return (l2); 519 } 520 521 if (desc != L2_TABLE) { 522 *level = 2; 523 return (NULL); 524 } 525 526 *level = 3; 527 l3 = pmap_l2_to_l3(l2, va); 528 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) 529 return (NULL); 530 531 return (l3); 532 } 533 534 bool 535 pmap_ps_enabled(pmap_t pmap __unused) 536 { 537 538 return (superpages_enabled != 0); 539 } 540 541 bool 542 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, 543 pd_entry_t **l2, pt_entry_t **l3) 544 { 545 pd_entry_t *l0p, *l1p, *l2p; 546 547 if (pmap->pm_l0 == NULL) 548 return (false); 549 550 l0p = pmap_l0(pmap, va); 551 *l0 = l0p; 552 553 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) 554 return (false); 555 556 l1p = pmap_l0_to_l1(l0p, va); 557 *l1 = l1p; 558 559 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { 560 *l2 = NULL; 561 *l3 = NULL; 562 return (true); 563 } 564 565 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) 566 return (false); 567 568 l2p = pmap_l1_to_l2(l1p, va); 569 *l2 = l2p; 570 571 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { 572 *l3 = NULL; 573 return (true); 574 } 575 576 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) 577 return (false); 578 579 *l3 = pmap_l2_to_l3(l2p, va); 580 581 return (true); 582 } 583 584 static __inline int 585 pmap_l3_valid(pt_entry_t l3) 586 { 587 588 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); 589 } 590 591 592 CTASSERT(L1_BLOCK == L2_BLOCK); 593 594 /* 595 * Checks if the PTE is dirty. 596 */ 597 static inline int 598 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) 599 { 600 601 PMAP_ASSERT_STAGE1(pmap); 602 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); 603 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, 604 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); 605 606 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 607 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); 608 } 609 610 static __inline void 611 pmap_resident_count_inc(pmap_t pmap, int count) 612 { 613 614 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 615 pmap->pm_stats.resident_count += count; 616 } 617 618 static __inline void 619 pmap_resident_count_dec(pmap_t pmap, int count) 620 { 621 622 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 623 KASSERT(pmap->pm_stats.resident_count >= count, 624 ("pmap %p resident count underflow %ld %d", pmap, 625 pmap->pm_stats.resident_count, count)); 626 pmap->pm_stats.resident_count -= count; 627 } 628 629 static pt_entry_t * 630 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, 631 u_int *l2_slot) 632 { 633 pt_entry_t *l2; 634 pd_entry_t *l1; 635 636 l1 = (pd_entry_t *)l1pt; 637 *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; 638 639 /* Check locore has used a table L1 map */ 640 KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, 641 ("Invalid bootstrap L1 table")); 642 /* Find the address of the L2 table */ 643 l2 = (pt_entry_t *)init_pt_va; 644 *l2_slot = pmap_l2_index(va); 645 646 return (l2); 647 } 648 649 static vm_paddr_t 650 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 651 { 652 u_int l1_slot, l2_slot; 653 pt_entry_t *l2; 654 655 l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); 656 657 return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); 658 } 659 660 static vm_offset_t 661 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, 662 vm_offset_t freemempos) 663 { 664 pt_entry_t *l2; 665 vm_offset_t va; 666 vm_paddr_t l2_pa, pa; 667 u_int l1_slot, l2_slot, prev_l1_slot; 668 int i; 669 670 dmap_phys_base = min_pa & ~L1_OFFSET; 671 dmap_phys_max = 0; 672 dmap_max_addr = 0; 673 l2 = NULL; 674 prev_l1_slot = -1; 675 676 #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) 677 memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); 678 679 for (i = 0; i < (physmap_idx * 2); i += 2) { 680 pa = physmap[i] & ~L2_OFFSET; 681 va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; 682 683 /* Create L2 mappings at the start of the region */ 684 if ((pa & L1_OFFSET) != 0) { 685 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); 686 if (l1_slot != prev_l1_slot) { 687 prev_l1_slot = l1_slot; 688 l2 = (pt_entry_t *)freemempos; 689 l2_pa = pmap_early_vtophys(kern_l1, 690 (vm_offset_t)l2); 691 freemempos += PAGE_SIZE; 692 693 pmap_store(&pagetable_dmap[l1_slot], 694 (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); 695 696 memset(l2, 0, PAGE_SIZE); 697 } 698 KASSERT(l2 != NULL, 699 ("pmap_bootstrap_dmap: NULL l2 map")); 700 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; 701 pa += L2_SIZE, va += L2_SIZE) { 702 /* 703 * We are on a boundary, stop to 704 * create a level 1 block 705 */ 706 if ((pa & L1_OFFSET) == 0) 707 break; 708 709 l2_slot = pmap_l2_index(va); 710 KASSERT(l2_slot != 0, ("...")); 711 pmap_store(&l2[l2_slot], 712 (pa & ~L2_OFFSET) | ATTR_DEFAULT | 713 ATTR_S1_XN | 714 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 715 L2_BLOCK); 716 } 717 KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), 718 ("...")); 719 } 720 721 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && 722 (physmap[i + 1] - pa) >= L1_SIZE; 723 pa += L1_SIZE, va += L1_SIZE) { 724 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); 725 pmap_store(&pagetable_dmap[l1_slot], 726 (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | 727 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK); 728 } 729 730 /* Create L2 mappings at the end of the region */ 731 if (pa < physmap[i + 1]) { 732 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); 733 if (l1_slot != prev_l1_slot) { 734 prev_l1_slot = l1_slot; 735 l2 = (pt_entry_t *)freemempos; 736 l2_pa = pmap_early_vtophys(kern_l1, 737 (vm_offset_t)l2); 738 freemempos += PAGE_SIZE; 739 740 pmap_store(&pagetable_dmap[l1_slot], 741 (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); 742 743 memset(l2, 0, PAGE_SIZE); 744 } 745 KASSERT(l2 != NULL, 746 ("pmap_bootstrap_dmap: NULL l2 map")); 747 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; 748 pa += L2_SIZE, va += L2_SIZE) { 749 l2_slot = pmap_l2_index(va); 750 pmap_store(&l2[l2_slot], 751 (pa & ~L2_OFFSET) | ATTR_DEFAULT | 752 ATTR_S1_XN | 753 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 754 L2_BLOCK); 755 } 756 } 757 758 if (pa > dmap_phys_max) { 759 dmap_phys_max = pa; 760 dmap_max_addr = va; 761 } 762 } 763 764 cpu_tlb_flushID(); 765 766 return (freemempos); 767 } 768 769 static vm_offset_t 770 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) 771 { 772 vm_offset_t l2pt; 773 vm_paddr_t pa; 774 pd_entry_t *l1; 775 u_int l1_slot; 776 777 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); 778 779 l1 = (pd_entry_t *)l1pt; 780 l1_slot = pmap_l1_index(va); 781 l2pt = l2_start; 782 783 for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { 784 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 785 786 pa = pmap_early_vtophys(l1pt, l2pt); 787 pmap_store(&l1[l1_slot], 788 (pa & ~Ln_TABLE_MASK) | L1_TABLE); 789 l2pt += PAGE_SIZE; 790 } 791 792 /* Clean the L2 page table */ 793 memset((void *)l2_start, 0, l2pt - l2_start); 794 795 return l2pt; 796 } 797 798 static vm_offset_t 799 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 800 { 801 vm_offset_t l3pt; 802 vm_paddr_t pa; 803 pd_entry_t *l2; 804 u_int l2_slot; 805 806 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 807 808 l2 = pmap_l2(kernel_pmap, va); 809 l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); 810 l2_slot = pmap_l2_index(va); 811 l3pt = l3_start; 812 813 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 814 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 815 816 pa = pmap_early_vtophys(l1pt, l3pt); 817 pmap_store(&l2[l2_slot], 818 (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE); 819 l3pt += PAGE_SIZE; 820 } 821 822 /* Clean the L2 page table */ 823 memset((void *)l3_start, 0, l3pt - l3_start); 824 825 return l3pt; 826 } 827 828 /* 829 * Bootstrap the system enough to run with virtual memory. 830 */ 831 void 832 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, 833 vm_size_t kernlen) 834 { 835 u_int l1_slot, l2_slot; 836 pt_entry_t *l2; 837 vm_offset_t va, freemempos; 838 vm_offset_t dpcpu, msgbufpv; 839 vm_paddr_t start_pa, pa, min_pa; 840 uint64_t kern_delta; 841 int i; 842 843 /* Verify that the ASID is set through TTBR0. */ 844 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0, 845 ("pmap_bootstrap: TCR_EL1.A1 != 0")); 846 847 kern_delta = KERNBASE - kernstart; 848 849 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 850 printf("%lx\n", l1pt); 851 printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); 852 853 /* Set this early so we can use the pagetable walking functions */ 854 kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; 855 PMAP_LOCK_INIT(kernel_pmap); 856 kernel_pmap->pm_l0_paddr = l0pt - kern_delta; 857 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN); 858 kernel_pmap->pm_stage = PM_STAGE1; 859 kernel_pmap->pm_asid_set = &asids; 860 861 /* Assume the address we were loaded to is a valid physical address */ 862 min_pa = KERNBASE - kern_delta; 863 864 physmap_idx = arm_physmem_avail(physmap, nitems(physmap)); 865 physmap_idx /= 2; 866 867 /* 868 * Find the minimum physical address. physmap is sorted, 869 * but may contain empty ranges. 870 */ 871 for (i = 0; i < (physmap_idx * 2); i += 2) { 872 if (physmap[i] == physmap[i + 1]) 873 continue; 874 if (physmap[i] <= min_pa) 875 min_pa = physmap[i]; 876 } 877 878 freemempos = KERNBASE + kernlen; 879 freemempos = roundup2(freemempos, PAGE_SIZE); 880 881 /* Create a direct map region early so we can use it for pa -> va */ 882 freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); 883 884 va = KERNBASE; 885 start_pa = pa = KERNBASE - kern_delta; 886 887 /* 888 * Read the page table to find out what is already mapped. 889 * This assumes we have mapped a block of memory from KERNBASE 890 * using a single L1 entry. 891 */ 892 l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 893 894 /* Sanity check the index, KERNBASE should be the first VA */ 895 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 896 897 /* Find how many pages we have mapped */ 898 for (; l2_slot < Ln_ENTRIES; l2_slot++) { 899 if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) 900 break; 901 902 /* Check locore used L2 blocks */ 903 KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, 904 ("Invalid bootstrap L2 table")); 905 KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, 906 ("Incorrect PA in L2 table")); 907 908 va += L2_SIZE; 909 pa += L2_SIZE; 910 } 911 912 va = roundup2(va, L1_SIZE); 913 914 /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ 915 freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); 916 /* And the l3 tables for the early devmap */ 917 freemempos = pmap_bootstrap_l3(l1pt, 918 VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); 919 920 cpu_tlb_flushID(); 921 922 #define alloc_pages(var, np) \ 923 (var) = freemempos; \ 924 freemempos += (np * PAGE_SIZE); \ 925 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 926 927 /* Allocate dynamic per-cpu area. */ 928 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 929 dpcpu_init((void *)dpcpu, 0); 930 931 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 932 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 933 msgbufp = (void *)msgbufpv; 934 935 /* Reserve some VA space for early BIOS/ACPI mapping */ 936 preinit_map_va = roundup2(freemempos, L2_SIZE); 937 938 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; 939 virtual_avail = roundup2(virtual_avail, L1_SIZE); 940 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); 941 kernel_vm_end = virtual_avail; 942 943 pa = pmap_early_vtophys(l1pt, freemempos); 944 945 arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); 946 947 cpu_tlb_flushID(); 948 } 949 950 /* 951 * Initialize a vm_page's machine-dependent fields. 952 */ 953 void 954 pmap_page_init(vm_page_t m) 955 { 956 957 TAILQ_INIT(&m->md.pv_list); 958 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 959 } 960 961 static void 962 pmap_init_asids(struct asid_set *set, int bits) 963 { 964 int i; 965 966 set->asid_bits = bits; 967 968 /* 969 * We may be too early in the overall initialization process to use 970 * bit_alloc(). 971 */ 972 set->asid_set_size = 1 << set->asid_bits; 973 set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size), 974 M_WAITOK | M_ZERO); 975 for (i = 0; i < ASID_FIRST_AVAILABLE; i++) 976 bit_set(set->asid_set, i); 977 set->asid_next = ASID_FIRST_AVAILABLE; 978 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN); 979 } 980 981 /* 982 * Initialize the pmap module. 983 * Called by vm_init, to initialize any structures that the pmap 984 * system needs to map virtual memory. 985 */ 986 void 987 pmap_init(void) 988 { 989 vm_size_t s; 990 int i, pv_npg; 991 992 /* 993 * Are large page mappings enabled? 994 */ 995 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 996 if (superpages_enabled) { 997 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 998 ("pmap_init: can't assign to pagesizes[1]")); 999 pagesizes[1] = L2_SIZE; 1000 } 1001 1002 /* 1003 * Initialize the ASID allocator. 1004 */ 1005 pmap_init_asids(&asids, 1006 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8); 1007 1008 /* 1009 * Initialize the pv chunk list mutex. 1010 */ 1011 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 1012 1013 /* 1014 * Initialize the pool of pv list locks. 1015 */ 1016 for (i = 0; i < NPV_LIST_LOCKS; i++) 1017 rw_init(&pv_list_locks[i], "pmap pv list"); 1018 1019 /* 1020 * Calculate the size of the pv head table for superpages. 1021 */ 1022 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 1023 1024 /* 1025 * Allocate memory for the pv head table for superpages. 1026 */ 1027 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1028 s = round_page(s); 1029 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 1030 for (i = 0; i < pv_npg; i++) 1031 TAILQ_INIT(&pv_table[i].pv_list); 1032 TAILQ_INIT(&pv_dummy.pv_list); 1033 1034 vm_initialized = 1; 1035 } 1036 1037 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1038 "2MB page mapping counters"); 1039 1040 static u_long pmap_l2_demotions; 1041 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 1042 &pmap_l2_demotions, 0, "2MB page demotions"); 1043 1044 static u_long pmap_l2_mappings; 1045 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 1046 &pmap_l2_mappings, 0, "2MB page mappings"); 1047 1048 static u_long pmap_l2_p_failures; 1049 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 1050 &pmap_l2_p_failures, 0, "2MB page promotion failures"); 1051 1052 static u_long pmap_l2_promotions; 1053 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 1054 &pmap_l2_promotions, 0, "2MB page promotions"); 1055 1056 /* 1057 * Invalidate a single TLB entry. 1058 */ 1059 static __inline void 1060 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1061 { 1062 uint64_t r; 1063 1064 PMAP_ASSERT_STAGE1(pmap); 1065 1066 dsb(ishst); 1067 if (pmap == kernel_pmap) { 1068 r = atop(va); 1069 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1070 } else { 1071 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | atop(va); 1072 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1073 } 1074 dsb(ish); 1075 isb(); 1076 } 1077 1078 static __inline void 1079 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1080 { 1081 uint64_t end, r, start; 1082 1083 PMAP_ASSERT_STAGE1(pmap); 1084 1085 dsb(ishst); 1086 if (pmap == kernel_pmap) { 1087 start = atop(sva); 1088 end = atop(eva); 1089 for (r = start; r < end; r++) 1090 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1091 } else { 1092 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1093 start |= atop(sva); 1094 end |= atop(eva); 1095 for (r = start; r < end; r++) 1096 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1097 } 1098 dsb(ish); 1099 isb(); 1100 } 1101 1102 static __inline void 1103 pmap_invalidate_all(pmap_t pmap) 1104 { 1105 uint64_t r; 1106 1107 PMAP_ASSERT_STAGE1(pmap); 1108 1109 dsb(ishst); 1110 if (pmap == kernel_pmap) { 1111 __asm __volatile("tlbi vmalle1is"); 1112 } else { 1113 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1114 __asm __volatile("tlbi aside1is, %0" : : "r" (r)); 1115 } 1116 dsb(ish); 1117 isb(); 1118 } 1119 1120 /* 1121 * Routine: pmap_extract 1122 * Function: 1123 * Extract the physical page address associated 1124 * with the given map/virtual_address pair. 1125 */ 1126 vm_paddr_t 1127 pmap_extract(pmap_t pmap, vm_offset_t va) 1128 { 1129 pt_entry_t *pte, tpte; 1130 vm_paddr_t pa; 1131 int lvl; 1132 1133 pa = 0; 1134 PMAP_LOCK(pmap); 1135 /* 1136 * Find the block or page map for this virtual address. pmap_pte 1137 * will return either a valid block/page entry, or NULL. 1138 */ 1139 pte = pmap_pte(pmap, va, &lvl); 1140 if (pte != NULL) { 1141 tpte = pmap_load(pte); 1142 pa = tpte & ~ATTR_MASK; 1143 switch(lvl) { 1144 case 1: 1145 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, 1146 ("pmap_extract: Invalid L1 pte found: %lx", 1147 tpte & ATTR_DESCR_MASK)); 1148 pa |= (va & L1_OFFSET); 1149 break; 1150 case 2: 1151 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, 1152 ("pmap_extract: Invalid L2 pte found: %lx", 1153 tpte & ATTR_DESCR_MASK)); 1154 pa |= (va & L2_OFFSET); 1155 break; 1156 case 3: 1157 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, 1158 ("pmap_extract: Invalid L3 pte found: %lx", 1159 tpte & ATTR_DESCR_MASK)); 1160 pa |= (va & L3_OFFSET); 1161 break; 1162 } 1163 } 1164 PMAP_UNLOCK(pmap); 1165 return (pa); 1166 } 1167 1168 /* 1169 * Routine: pmap_extract_and_hold 1170 * Function: 1171 * Atomically extract and hold the physical page 1172 * with the given pmap and virtual address pair 1173 * if that mapping permits the given protection. 1174 */ 1175 vm_page_t 1176 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1177 { 1178 pt_entry_t *pte, tpte; 1179 vm_offset_t off; 1180 vm_page_t m; 1181 int lvl; 1182 1183 PMAP_ASSERT_STAGE1(pmap); 1184 1185 m = NULL; 1186 PMAP_LOCK(pmap); 1187 pte = pmap_pte(pmap, va, &lvl); 1188 if (pte != NULL) { 1189 tpte = pmap_load(pte); 1190 1191 KASSERT(lvl > 0 && lvl <= 3, 1192 ("pmap_extract_and_hold: Invalid level %d", lvl)); 1193 CTASSERT(L1_BLOCK == L2_BLOCK); 1194 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || 1195 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), 1196 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, 1197 tpte & ATTR_DESCR_MASK)); 1198 if (((tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) || 1199 ((prot & VM_PROT_WRITE) == 0)) { 1200 switch(lvl) { 1201 case 1: 1202 off = va & L1_OFFSET; 1203 break; 1204 case 2: 1205 off = va & L2_OFFSET; 1206 break; 1207 case 3: 1208 default: 1209 off = 0; 1210 } 1211 m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); 1212 if (!vm_page_wire_mapped(m)) 1213 m = NULL; 1214 } 1215 } 1216 PMAP_UNLOCK(pmap); 1217 return (m); 1218 } 1219 1220 vm_paddr_t 1221 pmap_kextract(vm_offset_t va) 1222 { 1223 pt_entry_t *pte, tpte; 1224 1225 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 1226 return (DMAP_TO_PHYS(va)); 1227 pte = pmap_l1(kernel_pmap, va); 1228 if (pte == NULL) 1229 return (0); 1230 1231 /* 1232 * A concurrent pmap_update_entry() will clear the entry's valid bit 1233 * but leave the rest of the entry unchanged. Therefore, we treat a 1234 * non-zero entry as being valid, and we ignore the valid bit when 1235 * determining whether the entry maps a block, page, or table. 1236 */ 1237 tpte = pmap_load(pte); 1238 if (tpte == 0) 1239 return (0); 1240 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) 1241 return ((tpte & ~ATTR_MASK) | (va & L1_OFFSET)); 1242 pte = pmap_l1_to_l2(&tpte, va); 1243 tpte = pmap_load(pte); 1244 if (tpte == 0) 1245 return (0); 1246 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) 1247 return ((tpte & ~ATTR_MASK) | (va & L2_OFFSET)); 1248 pte = pmap_l2_to_l3(&tpte, va); 1249 tpte = pmap_load(pte); 1250 if (tpte == 0) 1251 return (0); 1252 return ((tpte & ~ATTR_MASK) | (va & L3_OFFSET)); 1253 } 1254 1255 /*************************************************** 1256 * Low level mapping routines..... 1257 ***************************************************/ 1258 1259 void 1260 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 1261 { 1262 pd_entry_t *pde; 1263 pt_entry_t *pte, attr; 1264 vm_offset_t va; 1265 int lvl; 1266 1267 KASSERT((pa & L3_OFFSET) == 0, 1268 ("pmap_kenter: Invalid physical address")); 1269 KASSERT((sva & L3_OFFSET) == 0, 1270 ("pmap_kenter: Invalid virtual address")); 1271 KASSERT((size & PAGE_MASK) == 0, 1272 ("pmap_kenter: Mapping is not page-sized")); 1273 1274 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 1275 ATTR_S1_IDX(mode) | L3_PAGE; 1276 va = sva; 1277 while (size != 0) { 1278 pde = pmap_pde(kernel_pmap, va, &lvl); 1279 KASSERT(pde != NULL, 1280 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 1281 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 1282 1283 pte = pmap_l2_to_l3(pde, va); 1284 pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); 1285 1286 va += PAGE_SIZE; 1287 pa += PAGE_SIZE; 1288 size -= PAGE_SIZE; 1289 } 1290 pmap_invalidate_range(kernel_pmap, sva, va); 1291 } 1292 1293 void 1294 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1295 { 1296 1297 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1298 } 1299 1300 /* 1301 * Remove a page from the kernel pagetables. 1302 */ 1303 PMAP_INLINE void 1304 pmap_kremove(vm_offset_t va) 1305 { 1306 pt_entry_t *pte; 1307 int lvl; 1308 1309 pte = pmap_pte(kernel_pmap, va, &lvl); 1310 KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); 1311 KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); 1312 1313 pmap_clear(pte); 1314 pmap_invalidate_page(kernel_pmap, va); 1315 } 1316 1317 void 1318 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1319 { 1320 pt_entry_t *pte; 1321 vm_offset_t va; 1322 int lvl; 1323 1324 KASSERT((sva & L3_OFFSET) == 0, 1325 ("pmap_kremove_device: Invalid virtual address")); 1326 KASSERT((size & PAGE_MASK) == 0, 1327 ("pmap_kremove_device: Mapping is not page-sized")); 1328 1329 va = sva; 1330 while (size != 0) { 1331 pte = pmap_pte(kernel_pmap, va, &lvl); 1332 KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); 1333 KASSERT(lvl == 3, 1334 ("Invalid device pagetable level: %d != 3", lvl)); 1335 pmap_clear(pte); 1336 1337 va += PAGE_SIZE; 1338 size -= PAGE_SIZE; 1339 } 1340 pmap_invalidate_range(kernel_pmap, sva, va); 1341 } 1342 1343 /* 1344 * Used to map a range of physical addresses into kernel 1345 * virtual address space. 1346 * 1347 * The value passed in '*virt' is a suggested virtual address for 1348 * the mapping. Architectures which can support a direct-mapped 1349 * physical to virtual region can return the appropriate address 1350 * within that region, leaving '*virt' unchanged. Other 1351 * architectures should map the pages starting at '*virt' and 1352 * update '*virt' with the first usable address after the mapped 1353 * region. 1354 */ 1355 vm_offset_t 1356 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1357 { 1358 return PHYS_TO_DMAP(start); 1359 } 1360 1361 1362 /* 1363 * Add a list of wired pages to the kva 1364 * this routine is only used for temporary 1365 * kernel mappings that do not need to have 1366 * page modification or references recorded. 1367 * Note that old mappings are simply written 1368 * over. The page *must* be wired. 1369 * Note: SMP coherent. Uses a ranged shootdown IPI. 1370 */ 1371 void 1372 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1373 { 1374 pd_entry_t *pde; 1375 pt_entry_t *pte, pa; 1376 vm_offset_t va; 1377 vm_page_t m; 1378 int i, lvl; 1379 1380 va = sva; 1381 for (i = 0; i < count; i++) { 1382 pde = pmap_pde(kernel_pmap, va, &lvl); 1383 KASSERT(pde != NULL, 1384 ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); 1385 KASSERT(lvl == 2, 1386 ("pmap_qenter: Invalid level %d", lvl)); 1387 1388 m = ma[i]; 1389 pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | 1390 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 1391 ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; 1392 pte = pmap_l2_to_l3(pde, va); 1393 pmap_load_store(pte, pa); 1394 1395 va += L3_SIZE; 1396 } 1397 pmap_invalidate_range(kernel_pmap, sva, va); 1398 } 1399 1400 /* 1401 * This routine tears out page mappings from the 1402 * kernel -- it is meant only for temporary mappings. 1403 */ 1404 void 1405 pmap_qremove(vm_offset_t sva, int count) 1406 { 1407 pt_entry_t *pte; 1408 vm_offset_t va; 1409 int lvl; 1410 1411 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1412 1413 va = sva; 1414 while (count-- > 0) { 1415 pte = pmap_pte(kernel_pmap, va, &lvl); 1416 KASSERT(lvl == 3, 1417 ("Invalid device pagetable level: %d != 3", lvl)); 1418 if (pte != NULL) { 1419 pmap_clear(pte); 1420 } 1421 1422 va += PAGE_SIZE; 1423 } 1424 pmap_invalidate_range(kernel_pmap, sva, va); 1425 } 1426 1427 /*************************************************** 1428 * Page table page management routines..... 1429 ***************************************************/ 1430 /* 1431 * Schedule the specified unused page table page to be freed. Specifically, 1432 * add the page to the specified list of pages that will be released to the 1433 * physical memory manager after the TLB has been updated. 1434 */ 1435 static __inline void 1436 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1437 boolean_t set_PG_ZERO) 1438 { 1439 1440 if (set_PG_ZERO) 1441 m->flags |= PG_ZERO; 1442 else 1443 m->flags &= ~PG_ZERO; 1444 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1445 } 1446 1447 /* 1448 * Decrements a page table page's reference count, which is used to record the 1449 * number of valid page table entries within the page. If the reference count 1450 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1451 * page table page was unmapped and FALSE otherwise. 1452 */ 1453 static inline boolean_t 1454 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1455 { 1456 1457 --m->ref_count; 1458 if (m->ref_count == 0) { 1459 _pmap_unwire_l3(pmap, va, m, free); 1460 return (TRUE); 1461 } else 1462 return (FALSE); 1463 } 1464 1465 static void 1466 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1467 { 1468 1469 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1470 /* 1471 * unmap the page table page 1472 */ 1473 if (m->pindex >= (NUL2E + NUL1E)) { 1474 /* l1 page */ 1475 pd_entry_t *l0; 1476 1477 l0 = pmap_l0(pmap, va); 1478 pmap_clear(l0); 1479 } else if (m->pindex >= NUL2E) { 1480 /* l2 page */ 1481 pd_entry_t *l1; 1482 1483 l1 = pmap_l1(pmap, va); 1484 pmap_clear(l1); 1485 } else { 1486 /* l3 page */ 1487 pd_entry_t *l2; 1488 1489 l2 = pmap_l2(pmap, va); 1490 pmap_clear(l2); 1491 } 1492 pmap_resident_count_dec(pmap, 1); 1493 if (m->pindex < NUL2E) { 1494 /* We just released an l3, unhold the matching l2 */ 1495 pd_entry_t *l1, tl1; 1496 vm_page_t l2pg; 1497 1498 l1 = pmap_l1(pmap, va); 1499 tl1 = pmap_load(l1); 1500 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 1501 pmap_unwire_l3(pmap, va, l2pg, free); 1502 } else if (m->pindex < (NUL2E + NUL1E)) { 1503 /* We just released an l2, unhold the matching l1 */ 1504 pd_entry_t *l0, tl0; 1505 vm_page_t l1pg; 1506 1507 l0 = pmap_l0(pmap, va); 1508 tl0 = pmap_load(l0); 1509 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 1510 pmap_unwire_l3(pmap, va, l1pg, free); 1511 } 1512 pmap_invalidate_page(pmap, va); 1513 1514 /* 1515 * Put page on a list so that it is released after 1516 * *ALL* TLB shootdown is done 1517 */ 1518 pmap_add_delayed_free_list(m, free, TRUE); 1519 } 1520 1521 /* 1522 * After removing a page table entry, this routine is used to 1523 * conditionally free the page, and manage the reference count. 1524 */ 1525 static int 1526 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1527 struct spglist *free) 1528 { 1529 vm_page_t mpte; 1530 1531 if (va >= VM_MAXUSER_ADDRESS) 1532 return (0); 1533 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1534 mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); 1535 return (pmap_unwire_l3(pmap, va, mpte, free)); 1536 } 1537 1538 /* 1539 * Release a page table page reference after a failed attempt to create a 1540 * mapping. 1541 */ 1542 static void 1543 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1544 { 1545 struct spglist free; 1546 1547 SLIST_INIT(&free); 1548 if (pmap_unwire_l3(pmap, va, mpte, &free)) { 1549 /* 1550 * Although "va" was never mapped, the TLB could nonetheless 1551 * have intermediate entries that refer to the freed page 1552 * table pages. Invalidate those entries. 1553 * 1554 * XXX redundant invalidation (See _pmap_unwire_l3().) 1555 */ 1556 pmap_invalidate_page(pmap, va); 1557 vm_page_free_pages_toq(&free, true); 1558 } 1559 } 1560 1561 void 1562 pmap_pinit0(pmap_t pmap) 1563 { 1564 1565 PMAP_LOCK_INIT(pmap); 1566 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1567 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 1568 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 1569 pmap->pm_root.rt_root = 0; 1570 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 1571 pmap->pm_stage = PM_STAGE1; 1572 pmap->pm_asid_set = &asids; 1573 1574 PCPU_SET(curpmap, pmap); 1575 } 1576 1577 int 1578 pmap_pinit(pmap_t pmap) 1579 { 1580 vm_page_t l0pt; 1581 1582 /* 1583 * allocate the l0 page 1584 */ 1585 while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 1586 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1587 vm_wait(NULL); 1588 1589 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(l0pt); 1590 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 1591 1592 if ((l0pt->flags & PG_ZERO) == 0) 1593 pagezero(pmap->pm_l0); 1594 1595 pmap->pm_root.rt_root = 0; 1596 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1597 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 1598 pmap->pm_stage = PM_STAGE1; 1599 pmap->pm_asid_set = &asids; 1600 /* XXX Temporarily disable deferred ASID allocation. */ 1601 pmap_alloc_asid(pmap); 1602 1603 return (1); 1604 } 1605 1606 /* 1607 * This routine is called if the desired page table page does not exist. 1608 * 1609 * If page table page allocation fails, this routine may sleep before 1610 * returning NULL. It sleeps only if a lock pointer was given. 1611 * 1612 * Note: If a page allocation fails at page table level two or three, 1613 * one or two pages may be held during the wait, only to be released 1614 * afterwards. This conservative approach is easily argued to avoid 1615 * race conditions. 1616 */ 1617 static vm_page_t 1618 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1619 { 1620 vm_page_t m, l1pg, l2pg; 1621 1622 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1623 1624 /* 1625 * Allocate a page table page. 1626 */ 1627 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1628 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1629 if (lockp != NULL) { 1630 RELEASE_PV_LIST_LOCK(lockp); 1631 PMAP_UNLOCK(pmap); 1632 vm_wait(NULL); 1633 PMAP_LOCK(pmap); 1634 } 1635 1636 /* 1637 * Indicate the need to retry. While waiting, the page table 1638 * page may have been allocated. 1639 */ 1640 return (NULL); 1641 } 1642 if ((m->flags & PG_ZERO) == 0) 1643 pmap_zero_page(m); 1644 1645 /* 1646 * Because of AArch64's weak memory consistency model, we must have a 1647 * barrier here to ensure that the stores for zeroing "m", whether by 1648 * pmap_zero_page() or an earlier function, are visible before adding 1649 * "m" to the page table. Otherwise, a page table walk by another 1650 * processor's MMU could see the mapping to "m" and a stale, non-zero 1651 * PTE within "m". 1652 */ 1653 dmb(ishst); 1654 1655 /* 1656 * Map the pagetable page into the process address space, if 1657 * it isn't already there. 1658 */ 1659 1660 if (ptepindex >= (NUL2E + NUL1E)) { 1661 pd_entry_t *l0; 1662 vm_pindex_t l0index; 1663 1664 l0index = ptepindex - (NUL2E + NUL1E); 1665 l0 = &pmap->pm_l0[l0index]; 1666 pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); 1667 } else if (ptepindex >= NUL2E) { 1668 vm_pindex_t l0index, l1index; 1669 pd_entry_t *l0, *l1; 1670 pd_entry_t tl0; 1671 1672 l1index = ptepindex - NUL2E; 1673 l0index = l1index >> L0_ENTRIES_SHIFT; 1674 1675 l0 = &pmap->pm_l0[l0index]; 1676 tl0 = pmap_load(l0); 1677 if (tl0 == 0) { 1678 /* recurse for allocating page dir */ 1679 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 1680 lockp) == NULL) { 1681 vm_page_unwire_noq(m); 1682 vm_page_free_zero(m); 1683 return (NULL); 1684 } 1685 } else { 1686 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); 1687 l1pg->ref_count++; 1688 } 1689 1690 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); 1691 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1692 pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); 1693 } else { 1694 vm_pindex_t l0index, l1index; 1695 pd_entry_t *l0, *l1, *l2; 1696 pd_entry_t tl0, tl1; 1697 1698 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 1699 l0index = l1index >> L0_ENTRIES_SHIFT; 1700 1701 l0 = &pmap->pm_l0[l0index]; 1702 tl0 = pmap_load(l0); 1703 if (tl0 == 0) { 1704 /* recurse for allocating page dir */ 1705 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1706 lockp) == NULL) { 1707 vm_page_unwire_noq(m); 1708 vm_page_free_zero(m); 1709 return (NULL); 1710 } 1711 tl0 = pmap_load(l0); 1712 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 1713 l1 = &l1[l1index & Ln_ADDR_MASK]; 1714 } else { 1715 l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); 1716 l1 = &l1[l1index & Ln_ADDR_MASK]; 1717 tl1 = pmap_load(l1); 1718 if (tl1 == 0) { 1719 /* recurse for allocating page dir */ 1720 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1721 lockp) == NULL) { 1722 vm_page_unwire_noq(m); 1723 vm_page_free_zero(m); 1724 return (NULL); 1725 } 1726 } else { 1727 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); 1728 l2pg->ref_count++; 1729 } 1730 } 1731 1732 l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); 1733 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1734 pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); 1735 } 1736 1737 pmap_resident_count_inc(pmap, 1); 1738 1739 return (m); 1740 } 1741 1742 static pd_entry_t * 1743 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 1744 struct rwlock **lockp) 1745 { 1746 pd_entry_t *l1, *l2; 1747 vm_page_t l2pg; 1748 vm_pindex_t l2pindex; 1749 1750 retry: 1751 l1 = pmap_l1(pmap, va); 1752 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 1753 l2 = pmap_l1_to_l2(l1, va); 1754 if (va < VM_MAXUSER_ADDRESS) { 1755 /* Add a reference to the L2 page. */ 1756 l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); 1757 l2pg->ref_count++; 1758 } else 1759 l2pg = NULL; 1760 } else if (va < VM_MAXUSER_ADDRESS) { 1761 /* Allocate a L2 page. */ 1762 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1763 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1764 if (l2pg == NULL) { 1765 if (lockp != NULL) 1766 goto retry; 1767 else 1768 return (NULL); 1769 } 1770 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 1771 l2 = &l2[pmap_l2_index(va)]; 1772 } else 1773 panic("pmap_alloc_l2: missing page table page for va %#lx", 1774 va); 1775 *l2pgp = l2pg; 1776 return (l2); 1777 } 1778 1779 static vm_page_t 1780 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1781 { 1782 vm_pindex_t ptepindex; 1783 pd_entry_t *pde, tpde; 1784 #ifdef INVARIANTS 1785 pt_entry_t *pte; 1786 #endif 1787 vm_page_t m; 1788 int lvl; 1789 1790 /* 1791 * Calculate pagetable page index 1792 */ 1793 ptepindex = pmap_l2_pindex(va); 1794 retry: 1795 /* 1796 * Get the page directory entry 1797 */ 1798 pde = pmap_pde(pmap, va, &lvl); 1799 1800 /* 1801 * If the page table page is mapped, we just increment the hold count, 1802 * and activate it. If we get a level 2 pde it will point to a level 3 1803 * table. 1804 */ 1805 switch (lvl) { 1806 case -1: 1807 break; 1808 case 0: 1809 #ifdef INVARIANTS 1810 pte = pmap_l0_to_l1(pde, va); 1811 KASSERT(pmap_load(pte) == 0, 1812 ("pmap_alloc_l3: TODO: l0 superpages")); 1813 #endif 1814 break; 1815 case 1: 1816 #ifdef INVARIANTS 1817 pte = pmap_l1_to_l2(pde, va); 1818 KASSERT(pmap_load(pte) == 0, 1819 ("pmap_alloc_l3: TODO: l1 superpages")); 1820 #endif 1821 break; 1822 case 2: 1823 tpde = pmap_load(pde); 1824 if (tpde != 0) { 1825 m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); 1826 m->ref_count++; 1827 return (m); 1828 } 1829 break; 1830 default: 1831 panic("pmap_alloc_l3: Invalid level %d", lvl); 1832 } 1833 1834 /* 1835 * Here if the pte page isn't mapped, or if it has been deallocated. 1836 */ 1837 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1838 if (m == NULL && lockp != NULL) 1839 goto retry; 1840 1841 return (m); 1842 } 1843 1844 /*************************************************** 1845 * Pmap allocation/deallocation routines. 1846 ***************************************************/ 1847 1848 /* 1849 * Release any resources held by the given physical map. 1850 * Called when a pmap initialized by pmap_pinit is being released. 1851 * Should only be called if the map contains no valid mappings. 1852 */ 1853 void 1854 pmap_release(pmap_t pmap) 1855 { 1856 struct asid_set *set; 1857 vm_page_t m; 1858 int asid; 1859 1860 KASSERT(pmap->pm_stats.resident_count == 0, 1861 ("pmap_release: pmap resident count %ld != 0", 1862 pmap->pm_stats.resident_count)); 1863 KASSERT(vm_radix_is_empty(&pmap->pm_root), 1864 ("pmap_release: pmap has reserved page table page(s)")); 1865 PMAP_ASSERT_STAGE1(pmap); 1866 1867 set = pmap->pm_asid_set; 1868 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 1869 1870 mtx_lock_spin(&set->asid_set_mutex); 1871 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 1872 asid = COOKIE_TO_ASID(pmap->pm_cookie); 1873 KASSERT(asid >= ASID_FIRST_AVAILABLE && 1874 asid < set->asid_set_size, 1875 ("pmap_release: pmap cookie has out-of-range asid")); 1876 bit_clear(set->asid_set, asid); 1877 } 1878 mtx_unlock_spin(&set->asid_set_mutex); 1879 1880 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 1881 vm_page_unwire_noq(m); 1882 vm_page_free_zero(m); 1883 } 1884 1885 static int 1886 kvm_size(SYSCTL_HANDLER_ARGS) 1887 { 1888 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1889 1890 return sysctl_handle_long(oidp, &ksize, 0, req); 1891 } 1892 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1893 0, 0, kvm_size, "LU", 1894 "Size of KVM"); 1895 1896 static int 1897 kvm_free(SYSCTL_HANDLER_ARGS) 1898 { 1899 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1900 1901 return sysctl_handle_long(oidp, &kfree, 0, req); 1902 } 1903 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1904 0, 0, kvm_free, "LU", 1905 "Amount of KVM free"); 1906 1907 /* 1908 * grow the number of kernel page table entries, if needed 1909 */ 1910 void 1911 pmap_growkernel(vm_offset_t addr) 1912 { 1913 vm_paddr_t paddr; 1914 vm_page_t nkpg; 1915 pd_entry_t *l0, *l1, *l2; 1916 1917 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1918 1919 addr = roundup2(addr, L2_SIZE); 1920 if (addr - 1 >= vm_map_max(kernel_map)) 1921 addr = vm_map_max(kernel_map); 1922 while (kernel_vm_end < addr) { 1923 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 1924 KASSERT(pmap_load(l0) != 0, 1925 ("pmap_growkernel: No level 0 kernel entry")); 1926 1927 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 1928 if (pmap_load(l1) == 0) { 1929 /* We need a new PDP entry */ 1930 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, 1931 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 1932 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1933 if (nkpg == NULL) 1934 panic("pmap_growkernel: no memory to grow kernel"); 1935 if ((nkpg->flags & PG_ZERO) == 0) 1936 pmap_zero_page(nkpg); 1937 /* See the dmb() in _pmap_alloc_l3(). */ 1938 dmb(ishst); 1939 paddr = VM_PAGE_TO_PHYS(nkpg); 1940 pmap_store(l1, paddr | L1_TABLE); 1941 continue; /* try again */ 1942 } 1943 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1944 if (pmap_load(l2) != 0) { 1945 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1946 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1947 kernel_vm_end = vm_map_max(kernel_map); 1948 break; 1949 } 1950 continue; 1951 } 1952 1953 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, 1954 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1955 VM_ALLOC_ZERO); 1956 if (nkpg == NULL) 1957 panic("pmap_growkernel: no memory to grow kernel"); 1958 if ((nkpg->flags & PG_ZERO) == 0) 1959 pmap_zero_page(nkpg); 1960 /* See the dmb() in _pmap_alloc_l3(). */ 1961 dmb(ishst); 1962 paddr = VM_PAGE_TO_PHYS(nkpg); 1963 pmap_store(l2, paddr | L2_TABLE); 1964 1965 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1966 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1967 kernel_vm_end = vm_map_max(kernel_map); 1968 break; 1969 } 1970 } 1971 } 1972 1973 1974 /*************************************************** 1975 * page management routines. 1976 ***************************************************/ 1977 1978 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1979 CTASSERT(_NPCM == 3); 1980 CTASSERT(_NPCPV == 168); 1981 1982 static __inline struct pv_chunk * 1983 pv_to_chunk(pv_entry_t pv) 1984 { 1985 1986 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 1987 } 1988 1989 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1990 1991 #define PC_FREE0 0xfffffffffffffffful 1992 #define PC_FREE1 0xfffffffffffffffful 1993 #define PC_FREE2 0x000000fffffffffful 1994 1995 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 1996 1997 #if 0 1998 #ifdef PV_STATS 1999 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2000 2001 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2002 "Current number of pv entry chunks"); 2003 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2004 "Current number of pv entry chunks allocated"); 2005 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2006 "Current number of pv entry chunks frees"); 2007 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2008 "Number of times tried to get a chunk page but failed."); 2009 2010 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2011 static int pv_entry_spare; 2012 2013 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2014 "Current number of pv entry frees"); 2015 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2016 "Current number of pv entry allocs"); 2017 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2018 "Current number of pv entries"); 2019 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2020 "Current number of spare pv entries"); 2021 #endif 2022 #endif /* 0 */ 2023 2024 /* 2025 * We are in a serious low memory condition. Resort to 2026 * drastic measures to free some pages so we can allocate 2027 * another pv entry chunk. 2028 * 2029 * Returns NULL if PV entries were reclaimed from the specified pmap. 2030 * 2031 * We do not, however, unmap 2mpages because subsequent accesses will 2032 * allocate per-page pv entries until repromotion occurs, thereby 2033 * exacerbating the shortage of free pv entries. 2034 */ 2035 static vm_page_t 2036 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2037 { 2038 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 2039 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 2040 struct md_page *pvh; 2041 pd_entry_t *pde; 2042 pmap_t next_pmap, pmap; 2043 pt_entry_t *pte, tpte; 2044 pv_entry_t pv; 2045 vm_offset_t va; 2046 vm_page_t m, m_pc; 2047 struct spglist free; 2048 uint64_t inuse; 2049 int bit, field, freed, lvl; 2050 static int active_reclaims = 0; 2051 2052 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2053 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2054 2055 pmap = NULL; 2056 m_pc = NULL; 2057 SLIST_INIT(&free); 2058 bzero(&pc_marker_b, sizeof(pc_marker_b)); 2059 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 2060 pc_marker = (struct pv_chunk *)&pc_marker_b; 2061 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 2062 2063 mtx_lock(&pv_chunks_mutex); 2064 active_reclaims++; 2065 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 2066 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 2067 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 2068 SLIST_EMPTY(&free)) { 2069 next_pmap = pc->pc_pmap; 2070 if (next_pmap == NULL) { 2071 /* 2072 * The next chunk is a marker. However, it is 2073 * not our marker, so active_reclaims must be 2074 * > 1. Consequently, the next_chunk code 2075 * will not rotate the pv_chunks list. 2076 */ 2077 goto next_chunk; 2078 } 2079 mtx_unlock(&pv_chunks_mutex); 2080 2081 /* 2082 * A pv_chunk can only be removed from the pc_lru list 2083 * when both pv_chunks_mutex is owned and the 2084 * corresponding pmap is locked. 2085 */ 2086 if (pmap != next_pmap) { 2087 if (pmap != NULL && pmap != locked_pmap) 2088 PMAP_UNLOCK(pmap); 2089 pmap = next_pmap; 2090 /* Avoid deadlock and lock recursion. */ 2091 if (pmap > locked_pmap) { 2092 RELEASE_PV_LIST_LOCK(lockp); 2093 PMAP_LOCK(pmap); 2094 mtx_lock(&pv_chunks_mutex); 2095 continue; 2096 } else if (pmap != locked_pmap) { 2097 if (PMAP_TRYLOCK(pmap)) { 2098 mtx_lock(&pv_chunks_mutex); 2099 continue; 2100 } else { 2101 pmap = NULL; /* pmap is not locked */ 2102 mtx_lock(&pv_chunks_mutex); 2103 pc = TAILQ_NEXT(pc_marker, pc_lru); 2104 if (pc == NULL || 2105 pc->pc_pmap != next_pmap) 2106 continue; 2107 goto next_chunk; 2108 } 2109 } 2110 } 2111 2112 /* 2113 * Destroy every non-wired, 4 KB page mapping in the chunk. 2114 */ 2115 freed = 0; 2116 for (field = 0; field < _NPCM; field++) { 2117 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2118 inuse != 0; inuse &= ~(1UL << bit)) { 2119 bit = ffsl(inuse) - 1; 2120 pv = &pc->pc_pventry[field * 64 + bit]; 2121 va = pv->pv_va; 2122 pde = pmap_pde(pmap, va, &lvl); 2123 if (lvl != 2) 2124 continue; 2125 pte = pmap_l2_to_l3(pde, va); 2126 tpte = pmap_load(pte); 2127 if ((tpte & ATTR_SW_WIRED) != 0) 2128 continue; 2129 tpte = pmap_load_clear(pte); 2130 m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); 2131 if (pmap_pte_dirty(pmap, tpte)) 2132 vm_page_dirty(m); 2133 if ((tpte & ATTR_AF) != 0) { 2134 pmap_invalidate_page(pmap, va); 2135 vm_page_aflag_set(m, PGA_REFERENCED); 2136 } 2137 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2138 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2139 m->md.pv_gen++; 2140 if (TAILQ_EMPTY(&m->md.pv_list) && 2141 (m->flags & PG_FICTITIOUS) == 0) { 2142 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2143 if (TAILQ_EMPTY(&pvh->pv_list)) { 2144 vm_page_aflag_clear(m, 2145 PGA_WRITEABLE); 2146 } 2147 } 2148 pc->pc_map[field] |= 1UL << bit; 2149 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 2150 freed++; 2151 } 2152 } 2153 if (freed == 0) { 2154 mtx_lock(&pv_chunks_mutex); 2155 goto next_chunk; 2156 } 2157 /* Every freed mapping is for a 4 KB page. */ 2158 pmap_resident_count_dec(pmap, freed); 2159 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2160 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2161 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2162 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2163 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2164 pc->pc_map[2] == PC_FREE2) { 2165 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2166 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2167 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2168 /* Entire chunk is free; return it. */ 2169 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2170 dump_drop_page(m_pc->phys_addr); 2171 mtx_lock(&pv_chunks_mutex); 2172 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2173 break; 2174 } 2175 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2176 mtx_lock(&pv_chunks_mutex); 2177 /* One freed pv entry in locked_pmap is sufficient. */ 2178 if (pmap == locked_pmap) 2179 break; 2180 2181 next_chunk: 2182 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2183 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 2184 if (active_reclaims == 1 && pmap != NULL) { 2185 /* 2186 * Rotate the pv chunks list so that we do not 2187 * scan the same pv chunks that could not be 2188 * freed (because they contained a wired 2189 * and/or superpage mapping) on every 2190 * invocation of reclaim_pv_chunk(). 2191 */ 2192 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 2193 MPASS(pc->pc_pmap != NULL); 2194 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2195 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2196 } 2197 } 2198 } 2199 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 2200 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 2201 active_reclaims--; 2202 mtx_unlock(&pv_chunks_mutex); 2203 if (pmap != NULL && pmap != locked_pmap) 2204 PMAP_UNLOCK(pmap); 2205 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2206 m_pc = SLIST_FIRST(&free); 2207 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2208 /* Recycle a freed page table page. */ 2209 m_pc->ref_count = 1; 2210 } 2211 vm_page_free_pages_toq(&free, true); 2212 return (m_pc); 2213 } 2214 2215 /* 2216 * free the pv_entry back to the free list 2217 */ 2218 static void 2219 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2220 { 2221 struct pv_chunk *pc; 2222 int idx, field, bit; 2223 2224 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2225 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2226 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2227 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2228 pc = pv_to_chunk(pv); 2229 idx = pv - &pc->pc_pventry[0]; 2230 field = idx / 64; 2231 bit = idx % 64; 2232 pc->pc_map[field] |= 1ul << bit; 2233 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2234 pc->pc_map[2] != PC_FREE2) { 2235 /* 98% of the time, pc is already at the head of the list. */ 2236 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2237 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2238 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2239 } 2240 return; 2241 } 2242 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2243 free_pv_chunk(pc); 2244 } 2245 2246 static void 2247 free_pv_chunk(struct pv_chunk *pc) 2248 { 2249 vm_page_t m; 2250 2251 mtx_lock(&pv_chunks_mutex); 2252 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2253 mtx_unlock(&pv_chunks_mutex); 2254 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2255 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2256 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2257 /* entire chunk is free, return it */ 2258 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2259 dump_drop_page(m->phys_addr); 2260 vm_page_unwire_noq(m); 2261 vm_page_free(m); 2262 } 2263 2264 /* 2265 * Returns a new PV entry, allocating a new PV chunk from the system when 2266 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2267 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2268 * returned. 2269 * 2270 * The given PV list lock may be released. 2271 */ 2272 static pv_entry_t 2273 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2274 { 2275 int bit, field; 2276 pv_entry_t pv; 2277 struct pv_chunk *pc; 2278 vm_page_t m; 2279 2280 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2281 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2282 retry: 2283 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2284 if (pc != NULL) { 2285 for (field = 0; field < _NPCM; field++) { 2286 if (pc->pc_map[field]) { 2287 bit = ffsl(pc->pc_map[field]) - 1; 2288 break; 2289 } 2290 } 2291 if (field < _NPCM) { 2292 pv = &pc->pc_pventry[field * 64 + bit]; 2293 pc->pc_map[field] &= ~(1ul << bit); 2294 /* If this was the last item, move it to tail */ 2295 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2296 pc->pc_map[2] == 0) { 2297 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2298 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2299 pc_list); 2300 } 2301 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2302 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2303 return (pv); 2304 } 2305 } 2306 /* No free items, allocate another chunk */ 2307 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2308 VM_ALLOC_WIRED); 2309 if (m == NULL) { 2310 if (lockp == NULL) { 2311 PV_STAT(pc_chunk_tryfail++); 2312 return (NULL); 2313 } 2314 m = reclaim_pv_chunk(pmap, lockp); 2315 if (m == NULL) 2316 goto retry; 2317 } 2318 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2319 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2320 dump_add_page(m->phys_addr); 2321 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2322 pc->pc_pmap = pmap; 2323 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2324 pc->pc_map[1] = PC_FREE1; 2325 pc->pc_map[2] = PC_FREE2; 2326 mtx_lock(&pv_chunks_mutex); 2327 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2328 mtx_unlock(&pv_chunks_mutex); 2329 pv = &pc->pc_pventry[0]; 2330 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2331 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2332 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2333 return (pv); 2334 } 2335 2336 /* 2337 * Ensure that the number of spare PV entries in the specified pmap meets or 2338 * exceeds the given count, "needed". 2339 * 2340 * The given PV list lock may be released. 2341 */ 2342 static void 2343 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2344 { 2345 struct pch new_tail; 2346 struct pv_chunk *pc; 2347 vm_page_t m; 2348 int avail, free; 2349 bool reclaimed; 2350 2351 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2352 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2353 2354 /* 2355 * Newly allocated PV chunks must be stored in a private list until 2356 * the required number of PV chunks have been allocated. Otherwise, 2357 * reclaim_pv_chunk() could recycle one of these chunks. In 2358 * contrast, these chunks must be added to the pmap upon allocation. 2359 */ 2360 TAILQ_INIT(&new_tail); 2361 retry: 2362 avail = 0; 2363 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2364 bit_count((bitstr_t *)pc->pc_map, 0, 2365 sizeof(pc->pc_map) * NBBY, &free); 2366 if (free == 0) 2367 break; 2368 avail += free; 2369 if (avail >= needed) 2370 break; 2371 } 2372 for (reclaimed = false; avail < needed; avail += _NPCPV) { 2373 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2374 VM_ALLOC_WIRED); 2375 if (m == NULL) { 2376 m = reclaim_pv_chunk(pmap, lockp); 2377 if (m == NULL) 2378 goto retry; 2379 reclaimed = true; 2380 } 2381 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2382 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2383 dump_add_page(m->phys_addr); 2384 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2385 pc->pc_pmap = pmap; 2386 pc->pc_map[0] = PC_FREE0; 2387 pc->pc_map[1] = PC_FREE1; 2388 pc->pc_map[2] = PC_FREE2; 2389 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2390 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2391 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2392 2393 /* 2394 * The reclaim might have freed a chunk from the current pmap. 2395 * If that chunk contained available entries, we need to 2396 * re-count the number of available entries. 2397 */ 2398 if (reclaimed) 2399 goto retry; 2400 } 2401 if (!TAILQ_EMPTY(&new_tail)) { 2402 mtx_lock(&pv_chunks_mutex); 2403 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2404 mtx_unlock(&pv_chunks_mutex); 2405 } 2406 } 2407 2408 /* 2409 * First find and then remove the pv entry for the specified pmap and virtual 2410 * address from the specified pv list. Returns the pv entry if found and NULL 2411 * otherwise. This operation can be performed on pv lists for either 4KB or 2412 * 2MB page mappings. 2413 */ 2414 static __inline pv_entry_t 2415 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2416 { 2417 pv_entry_t pv; 2418 2419 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2420 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2421 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2422 pvh->pv_gen++; 2423 break; 2424 } 2425 } 2426 return (pv); 2427 } 2428 2429 /* 2430 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2431 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2432 * entries for each of the 4KB page mappings. 2433 */ 2434 static void 2435 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2436 struct rwlock **lockp) 2437 { 2438 struct md_page *pvh; 2439 struct pv_chunk *pc; 2440 pv_entry_t pv; 2441 vm_offset_t va_last; 2442 vm_page_t m; 2443 int bit, field; 2444 2445 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2446 KASSERT((va & L2_OFFSET) == 0, 2447 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 2448 KASSERT((pa & L2_OFFSET) == 0, 2449 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 2450 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2451 2452 /* 2453 * Transfer the 2mpage's pv entry for this mapping to the first 2454 * page's pv list. Once this transfer begins, the pv list lock 2455 * must not be released until the last pv entry is reinstantiated. 2456 */ 2457 pvh = pa_to_pvh(pa); 2458 pv = pmap_pvh_remove(pvh, pmap, va); 2459 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2460 m = PHYS_TO_VM_PAGE(pa); 2461 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2462 m->md.pv_gen++; 2463 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ 2464 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 2465 va_last = va + L2_SIZE - PAGE_SIZE; 2466 for (;;) { 2467 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2468 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 2469 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 2470 for (field = 0; field < _NPCM; field++) { 2471 while (pc->pc_map[field]) { 2472 bit = ffsl(pc->pc_map[field]) - 1; 2473 pc->pc_map[field] &= ~(1ul << bit); 2474 pv = &pc->pc_pventry[field * 64 + bit]; 2475 va += PAGE_SIZE; 2476 pv->pv_va = va; 2477 m++; 2478 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2479 ("pmap_pv_demote_l2: page %p is not managed", m)); 2480 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2481 m->md.pv_gen++; 2482 if (va == va_last) 2483 goto out; 2484 } 2485 } 2486 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2487 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2488 } 2489 out: 2490 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 2491 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2492 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2493 } 2494 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 2495 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 2496 } 2497 2498 /* 2499 * First find and then destroy the pv entry for the specified pmap and virtual 2500 * address. This operation can be performed on pv lists for either 4KB or 2MB 2501 * page mappings. 2502 */ 2503 static void 2504 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2505 { 2506 pv_entry_t pv; 2507 2508 pv = pmap_pvh_remove(pvh, pmap, va); 2509 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2510 free_pv_entry(pmap, pv); 2511 } 2512 2513 /* 2514 * Conditionally create the PV entry for a 4KB page mapping if the required 2515 * memory can be allocated without resorting to reclamation. 2516 */ 2517 static boolean_t 2518 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2519 struct rwlock **lockp) 2520 { 2521 pv_entry_t pv; 2522 2523 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2524 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2525 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2526 pv->pv_va = va; 2527 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2528 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2529 m->md.pv_gen++; 2530 return (TRUE); 2531 } else 2532 return (FALSE); 2533 } 2534 2535 /* 2536 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2537 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2538 * false if the PV entry cannot be allocated without resorting to reclamation. 2539 */ 2540 static bool 2541 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2542 struct rwlock **lockp) 2543 { 2544 struct md_page *pvh; 2545 pv_entry_t pv; 2546 vm_paddr_t pa; 2547 2548 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2549 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2550 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 2551 NULL : lockp)) == NULL) 2552 return (false); 2553 pv->pv_va = va; 2554 pa = l2e & ~ATTR_MASK; 2555 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2556 pvh = pa_to_pvh(pa); 2557 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2558 pvh->pv_gen++; 2559 return (true); 2560 } 2561 2562 static void 2563 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2564 { 2565 pt_entry_t newl2, oldl2; 2566 vm_page_t ml3; 2567 vm_paddr_t ml3pa; 2568 2569 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2570 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2571 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2572 2573 ml3 = pmap_remove_pt_page(pmap, va); 2574 if (ml3 == NULL) 2575 panic("pmap_remove_kernel_l2: Missing pt page"); 2576 2577 ml3pa = VM_PAGE_TO_PHYS(ml3); 2578 newl2 = ml3pa | L2_TABLE; 2579 2580 /* 2581 * If this page table page was unmapped by a promotion, then it 2582 * contains valid mappings. Zero it to invalidate those mappings. 2583 */ 2584 if (ml3->valid != 0) 2585 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2586 2587 /* 2588 * Demote the mapping. The caller must have already invalidated the 2589 * mapping (i.e., the "break" in break-before-make). 2590 */ 2591 oldl2 = pmap_load_store(l2, newl2); 2592 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2593 __func__, l2, oldl2)); 2594 } 2595 2596 /* 2597 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2598 */ 2599 static int 2600 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2601 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2602 { 2603 struct md_page *pvh; 2604 pt_entry_t old_l2; 2605 vm_offset_t eva, va; 2606 vm_page_t m, ml3; 2607 2608 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2609 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2610 old_l2 = pmap_load_clear(l2); 2611 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 2612 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 2613 2614 /* 2615 * Since a promotion must break the 4KB page mappings before making 2616 * the 2MB page mapping, a pmap_invalidate_page() suffices. 2617 */ 2618 pmap_invalidate_page(pmap, sva); 2619 2620 if (old_l2 & ATTR_SW_WIRED) 2621 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2622 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2623 if (old_l2 & ATTR_SW_MANAGED) { 2624 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); 2625 pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); 2626 pmap_pvh_free(pvh, pmap, sva); 2627 eva = sva + L2_SIZE; 2628 for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 2629 va < eva; va += PAGE_SIZE, m++) { 2630 if (pmap_pte_dirty(pmap, old_l2)) 2631 vm_page_dirty(m); 2632 if (old_l2 & ATTR_AF) 2633 vm_page_aflag_set(m, PGA_REFERENCED); 2634 if (TAILQ_EMPTY(&m->md.pv_list) && 2635 TAILQ_EMPTY(&pvh->pv_list)) 2636 vm_page_aflag_clear(m, PGA_WRITEABLE); 2637 } 2638 } 2639 if (pmap == kernel_pmap) { 2640 pmap_remove_kernel_l2(pmap, l2, sva); 2641 } else { 2642 ml3 = pmap_remove_pt_page(pmap, sva); 2643 if (ml3 != NULL) { 2644 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2645 ("pmap_remove_l2: l3 page not promoted")); 2646 pmap_resident_count_dec(pmap, 1); 2647 KASSERT(ml3->ref_count == NL3PG, 2648 ("pmap_remove_l2: l3 page ref count error")); 2649 ml3->ref_count = 0; 2650 pmap_add_delayed_free_list(ml3, free, FALSE); 2651 } 2652 } 2653 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2654 } 2655 2656 /* 2657 * pmap_remove_l3: do the things to unmap a page in a process 2658 */ 2659 static int 2660 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2661 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2662 { 2663 struct md_page *pvh; 2664 pt_entry_t old_l3; 2665 vm_page_t m; 2666 2667 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2668 old_l3 = pmap_load_clear(l3); 2669 pmap_invalidate_page(pmap, va); 2670 if (old_l3 & ATTR_SW_WIRED) 2671 pmap->pm_stats.wired_count -= 1; 2672 pmap_resident_count_dec(pmap, 1); 2673 if (old_l3 & ATTR_SW_MANAGED) { 2674 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 2675 if (pmap_pte_dirty(pmap, old_l3)) 2676 vm_page_dirty(m); 2677 if (old_l3 & ATTR_AF) 2678 vm_page_aflag_set(m, PGA_REFERENCED); 2679 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2680 pmap_pvh_free(&m->md, pmap, va); 2681 if (TAILQ_EMPTY(&m->md.pv_list) && 2682 (m->flags & PG_FICTITIOUS) == 0) { 2683 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2684 if (TAILQ_EMPTY(&pvh->pv_list)) 2685 vm_page_aflag_clear(m, PGA_WRITEABLE); 2686 } 2687 } 2688 return (pmap_unuse_pt(pmap, va, l2e, free)); 2689 } 2690 2691 /* 2692 * Remove the specified range of addresses from the L3 page table that is 2693 * identified by the given L2 entry. 2694 */ 2695 static void 2696 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 2697 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 2698 { 2699 struct md_page *pvh; 2700 struct rwlock *new_lock; 2701 pt_entry_t *l3, old_l3; 2702 vm_offset_t va; 2703 vm_page_t l3pg, m; 2704 2705 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2706 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 2707 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 2708 l3pg = sva < VM_MAXUSER_ADDRESS ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : 2709 NULL; 2710 va = eva; 2711 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 2712 if (!pmap_l3_valid(pmap_load(l3))) { 2713 if (va != eva) { 2714 pmap_invalidate_range(pmap, va, sva); 2715 va = eva; 2716 } 2717 continue; 2718 } 2719 old_l3 = pmap_load_clear(l3); 2720 if ((old_l3 & ATTR_SW_WIRED) != 0) 2721 pmap->pm_stats.wired_count--; 2722 pmap_resident_count_dec(pmap, 1); 2723 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 2724 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); 2725 if (pmap_pte_dirty(pmap, old_l3)) 2726 vm_page_dirty(m); 2727 if ((old_l3 & ATTR_AF) != 0) 2728 vm_page_aflag_set(m, PGA_REFERENCED); 2729 new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); 2730 if (new_lock != *lockp) { 2731 if (*lockp != NULL) { 2732 /* 2733 * Pending TLB invalidations must be 2734 * performed before the PV list lock is 2735 * released. Otherwise, a concurrent 2736 * pmap_remove_all() on a physical page 2737 * could return while a stale TLB entry 2738 * still provides access to that page. 2739 */ 2740 if (va != eva) { 2741 pmap_invalidate_range(pmap, va, 2742 sva); 2743 va = eva; 2744 } 2745 rw_wunlock(*lockp); 2746 } 2747 *lockp = new_lock; 2748 rw_wlock(*lockp); 2749 } 2750 pmap_pvh_free(&m->md, pmap, sva); 2751 if (TAILQ_EMPTY(&m->md.pv_list) && 2752 (m->flags & PG_FICTITIOUS) == 0) { 2753 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2754 if (TAILQ_EMPTY(&pvh->pv_list)) 2755 vm_page_aflag_clear(m, PGA_WRITEABLE); 2756 } 2757 } 2758 if (va == eva) 2759 va = sva; 2760 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 2761 sva += L3_SIZE; 2762 break; 2763 } 2764 } 2765 if (va != eva) 2766 pmap_invalidate_range(pmap, va, sva); 2767 } 2768 2769 /* 2770 * Remove the given range of addresses from the specified map. 2771 * 2772 * It is assumed that the start and end are properly 2773 * rounded to the page size. 2774 */ 2775 void 2776 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2777 { 2778 struct rwlock *lock; 2779 vm_offset_t va_next; 2780 pd_entry_t *l0, *l1, *l2; 2781 pt_entry_t l3_paddr; 2782 struct spglist free; 2783 2784 /* 2785 * Perform an unsynchronized read. This is, however, safe. 2786 */ 2787 if (pmap->pm_stats.resident_count == 0) 2788 return; 2789 2790 SLIST_INIT(&free); 2791 2792 PMAP_LOCK(pmap); 2793 2794 lock = NULL; 2795 for (; sva < eva; sva = va_next) { 2796 2797 if (pmap->pm_stats.resident_count == 0) 2798 break; 2799 2800 l0 = pmap_l0(pmap, sva); 2801 if (pmap_load(l0) == 0) { 2802 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2803 if (va_next < sva) 2804 va_next = eva; 2805 continue; 2806 } 2807 2808 l1 = pmap_l0_to_l1(l0, sva); 2809 if (pmap_load(l1) == 0) { 2810 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2811 if (va_next < sva) 2812 va_next = eva; 2813 continue; 2814 } 2815 2816 /* 2817 * Calculate index for next page table. 2818 */ 2819 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2820 if (va_next < sva) 2821 va_next = eva; 2822 2823 l2 = pmap_l1_to_l2(l1, sva); 2824 if (l2 == NULL) 2825 continue; 2826 2827 l3_paddr = pmap_load(l2); 2828 2829 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 2830 if (sva + L2_SIZE == va_next && eva >= va_next) { 2831 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 2832 &free, &lock); 2833 continue; 2834 } else if (pmap_demote_l2_locked(pmap, l2, sva, 2835 &lock) == NULL) 2836 continue; 2837 l3_paddr = pmap_load(l2); 2838 } 2839 2840 /* 2841 * Weed out invalid mappings. 2842 */ 2843 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 2844 continue; 2845 2846 /* 2847 * Limit our scan to either the end of the va represented 2848 * by the current page table page, or to the end of the 2849 * range being removed. 2850 */ 2851 if (va_next > eva) 2852 va_next = eva; 2853 2854 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 2855 &lock); 2856 } 2857 if (lock != NULL) 2858 rw_wunlock(lock); 2859 PMAP_UNLOCK(pmap); 2860 vm_page_free_pages_toq(&free, true); 2861 } 2862 2863 /* 2864 * Routine: pmap_remove_all 2865 * Function: 2866 * Removes this physical page from 2867 * all physical maps in which it resides. 2868 * Reflects back modify bits to the pager. 2869 * 2870 * Notes: 2871 * Original versions of this routine were very 2872 * inefficient because they iteratively called 2873 * pmap_remove (slow...) 2874 */ 2875 2876 void 2877 pmap_remove_all(vm_page_t m) 2878 { 2879 struct md_page *pvh; 2880 pv_entry_t pv; 2881 pmap_t pmap; 2882 struct rwlock *lock; 2883 pd_entry_t *pde, tpde; 2884 pt_entry_t *pte, tpte; 2885 vm_offset_t va; 2886 struct spglist free; 2887 int lvl, pvh_gen, md_gen; 2888 2889 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2890 ("pmap_remove_all: page %p is not managed", m)); 2891 SLIST_INIT(&free); 2892 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 2893 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2894 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2895 retry: 2896 rw_wlock(lock); 2897 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2898 pmap = PV_PMAP(pv); 2899 if (!PMAP_TRYLOCK(pmap)) { 2900 pvh_gen = pvh->pv_gen; 2901 rw_wunlock(lock); 2902 PMAP_LOCK(pmap); 2903 rw_wlock(lock); 2904 if (pvh_gen != pvh->pv_gen) { 2905 rw_wunlock(lock); 2906 PMAP_UNLOCK(pmap); 2907 goto retry; 2908 } 2909 } 2910 va = pv->pv_va; 2911 pte = pmap_pte(pmap, va, &lvl); 2912 KASSERT(pte != NULL, 2913 ("pmap_remove_all: no page table entry found")); 2914 KASSERT(lvl == 2, 2915 ("pmap_remove_all: invalid pte level %d", lvl)); 2916 2917 pmap_demote_l2_locked(pmap, pte, va, &lock); 2918 PMAP_UNLOCK(pmap); 2919 } 2920 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2921 pmap = PV_PMAP(pv); 2922 PMAP_ASSERT_STAGE1(pmap); 2923 if (!PMAP_TRYLOCK(pmap)) { 2924 pvh_gen = pvh->pv_gen; 2925 md_gen = m->md.pv_gen; 2926 rw_wunlock(lock); 2927 PMAP_LOCK(pmap); 2928 rw_wlock(lock); 2929 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 2930 rw_wunlock(lock); 2931 PMAP_UNLOCK(pmap); 2932 goto retry; 2933 } 2934 } 2935 pmap_resident_count_dec(pmap, 1); 2936 2937 pde = pmap_pde(pmap, pv->pv_va, &lvl); 2938 KASSERT(pde != NULL, 2939 ("pmap_remove_all: no page directory entry found")); 2940 KASSERT(lvl == 2, 2941 ("pmap_remove_all: invalid pde level %d", lvl)); 2942 tpde = pmap_load(pde); 2943 2944 pte = pmap_l2_to_l3(pde, pv->pv_va); 2945 tpte = pmap_load_clear(pte); 2946 if (tpte & ATTR_SW_WIRED) 2947 pmap->pm_stats.wired_count--; 2948 if ((tpte & ATTR_AF) != 0) { 2949 pmap_invalidate_page(pmap, pv->pv_va); 2950 vm_page_aflag_set(m, PGA_REFERENCED); 2951 } 2952 2953 /* 2954 * Update the vm_page_t clean and reference bits. 2955 */ 2956 if (pmap_pte_dirty(pmap, tpte)) 2957 vm_page_dirty(m); 2958 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 2959 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2960 m->md.pv_gen++; 2961 free_pv_entry(pmap, pv); 2962 PMAP_UNLOCK(pmap); 2963 } 2964 vm_page_aflag_clear(m, PGA_WRITEABLE); 2965 rw_wunlock(lock); 2966 vm_page_free_pages_toq(&free, true); 2967 } 2968 2969 /* 2970 * pmap_protect_l2: do the things to protect a 2MB page in a pmap 2971 */ 2972 static void 2973 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 2974 pt_entry_t nbits) 2975 { 2976 pd_entry_t old_l2; 2977 vm_page_t m, mt; 2978 2979 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2980 PMAP_ASSERT_STAGE1(pmap); 2981 KASSERT((sva & L2_OFFSET) == 0, 2982 ("pmap_protect_l2: sva is not 2mpage aligned")); 2983 old_l2 = pmap_load(l2); 2984 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 2985 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 2986 2987 /* 2988 * Return if the L2 entry already has the desired access restrictions 2989 * in place. 2990 */ 2991 retry: 2992 if ((old_l2 & mask) == nbits) 2993 return; 2994 2995 /* 2996 * When a dirty read/write superpage mapping is write protected, 2997 * update the dirty field of each of the superpage's constituent 4KB 2998 * pages. 2999 */ 3000 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 3001 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3002 pmap_pte_dirty(pmap, old_l2)) { 3003 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); 3004 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3005 vm_page_dirty(mt); 3006 } 3007 3008 if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 3009 goto retry; 3010 3011 /* 3012 * Since a promotion must break the 4KB page mappings before making 3013 * the 2MB page mapping, a pmap_invalidate_page() suffices. 3014 */ 3015 pmap_invalidate_page(pmap, sva); 3016 } 3017 3018 /* 3019 * Set the physical protection on the 3020 * specified range of this map as requested. 3021 */ 3022 void 3023 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3024 { 3025 vm_offset_t va, va_next; 3026 pd_entry_t *l0, *l1, *l2; 3027 pt_entry_t *l3p, l3, mask, nbits; 3028 3029 PMAP_ASSERT_STAGE1(pmap); 3030 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3031 if (prot == VM_PROT_NONE) { 3032 pmap_remove(pmap, sva, eva); 3033 return; 3034 } 3035 3036 mask = nbits = 0; 3037 if ((prot & VM_PROT_WRITE) == 0) { 3038 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 3039 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 3040 } 3041 if ((prot & VM_PROT_EXECUTE) == 0) { 3042 mask |= ATTR_S1_XN; 3043 nbits |= ATTR_S1_XN; 3044 } 3045 if (mask == 0) 3046 return; 3047 3048 PMAP_LOCK(pmap); 3049 for (; sva < eva; sva = va_next) { 3050 3051 l0 = pmap_l0(pmap, sva); 3052 if (pmap_load(l0) == 0) { 3053 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3054 if (va_next < sva) 3055 va_next = eva; 3056 continue; 3057 } 3058 3059 l1 = pmap_l0_to_l1(l0, sva); 3060 if (pmap_load(l1) == 0) { 3061 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3062 if (va_next < sva) 3063 va_next = eva; 3064 continue; 3065 } 3066 3067 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3068 if (va_next < sva) 3069 va_next = eva; 3070 3071 l2 = pmap_l1_to_l2(l1, sva); 3072 if (pmap_load(l2) == 0) 3073 continue; 3074 3075 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 3076 if (sva + L2_SIZE == va_next && eva >= va_next) { 3077 pmap_protect_l2(pmap, l2, sva, mask, nbits); 3078 continue; 3079 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 3080 continue; 3081 } 3082 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 3083 ("pmap_protect: Invalid L2 entry after demotion")); 3084 3085 if (va_next > eva) 3086 va_next = eva; 3087 3088 va = va_next; 3089 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 3090 sva += L3_SIZE) { 3091 l3 = pmap_load(l3p); 3092 retry: 3093 /* 3094 * Go to the next L3 entry if the current one is 3095 * invalid or already has the desired access 3096 * restrictions in place. (The latter case occurs 3097 * frequently. For example, in a "buildworld" 3098 * workload, almost 1 out of 4 L3 entries already 3099 * have the desired restrictions.) 3100 */ 3101 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 3102 if (va != va_next) { 3103 pmap_invalidate_range(pmap, va, sva); 3104 va = va_next; 3105 } 3106 continue; 3107 } 3108 3109 /* 3110 * When a dirty read/write mapping is write protected, 3111 * update the page's dirty field. 3112 */ 3113 if ((l3 & ATTR_SW_MANAGED) != 0 && 3114 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 3115 pmap_pte_dirty(pmap, l3)) 3116 vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); 3117 3118 if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) 3119 goto retry; 3120 if (va == va_next) 3121 va = sva; 3122 } 3123 if (va != va_next) 3124 pmap_invalidate_range(pmap, va, sva); 3125 } 3126 PMAP_UNLOCK(pmap); 3127 } 3128 3129 /* 3130 * Inserts the specified page table page into the specified pmap's collection 3131 * of idle page table pages. Each of a pmap's page table pages is responsible 3132 * for mapping a distinct range of virtual addresses. The pmap's collection is 3133 * ordered by this virtual address range. 3134 * 3135 * If "promoted" is false, then the page table page "mpte" must be zero filled. 3136 */ 3137 static __inline int 3138 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 3139 { 3140 3141 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3142 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 3143 return (vm_radix_insert(&pmap->pm_root, mpte)); 3144 } 3145 3146 /* 3147 * Removes the page table page mapping the specified virtual address from the 3148 * specified pmap's collection of idle page table pages, and returns it. 3149 * Otherwise, returns NULL if there is no page table page corresponding to the 3150 * specified virtual address. 3151 */ 3152 static __inline vm_page_t 3153 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 3154 { 3155 3156 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3157 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 3158 } 3159 3160 /* 3161 * Performs a break-before-make update of a pmap entry. This is needed when 3162 * either promoting or demoting pages to ensure the TLB doesn't get into an 3163 * inconsistent state. 3164 */ 3165 static void 3166 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 3167 vm_offset_t va, vm_size_t size) 3168 { 3169 register_t intr; 3170 3171 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3172 3173 /* 3174 * Ensure we don't get switched out with the page table in an 3175 * inconsistent state. We also need to ensure no interrupts fire 3176 * as they may make use of an address we are about to invalidate. 3177 */ 3178 intr = intr_disable(); 3179 3180 /* 3181 * Clear the old mapping's valid bit, but leave the rest of the entry 3182 * unchanged, so that a lockless, concurrent pmap_kextract() can still 3183 * lookup the physical address. 3184 */ 3185 pmap_clear_bits(pte, ATTR_DESCR_VALID); 3186 pmap_invalidate_range(pmap, va, va + size); 3187 3188 /* Create the new mapping */ 3189 pmap_store(pte, newpte); 3190 dsb(ishst); 3191 3192 intr_restore(intr); 3193 } 3194 3195 #if VM_NRESERVLEVEL > 0 3196 /* 3197 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3198 * replace the many pv entries for the 4KB page mappings by a single pv entry 3199 * for the 2MB page mapping. 3200 */ 3201 static void 3202 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3203 struct rwlock **lockp) 3204 { 3205 struct md_page *pvh; 3206 pv_entry_t pv; 3207 vm_offset_t va_last; 3208 vm_page_t m; 3209 3210 KASSERT((pa & L2_OFFSET) == 0, 3211 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 3212 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3213 3214 /* 3215 * Transfer the first page's pv entry for this mapping to the 2mpage's 3216 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3217 * a transfer avoids the possibility that get_pv_entry() calls 3218 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3219 * mappings that is being promoted. 3220 */ 3221 m = PHYS_TO_VM_PAGE(pa); 3222 va = va & ~L2_OFFSET; 3223 pv = pmap_pvh_remove(&m->md, pmap, va); 3224 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 3225 pvh = pa_to_pvh(pa); 3226 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3227 pvh->pv_gen++; 3228 /* Free the remaining NPTEPG - 1 pv entries. */ 3229 va_last = va + L2_SIZE - PAGE_SIZE; 3230 do { 3231 m++; 3232 va += PAGE_SIZE; 3233 pmap_pvh_free(&m->md, pmap, va); 3234 } while (va < va_last); 3235 } 3236 3237 /* 3238 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3239 * single level 2 table entry to a single 2MB page mapping. For promotion 3240 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3241 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3242 * identical characteristics. 3243 */ 3244 static void 3245 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 3246 struct rwlock **lockp) 3247 { 3248 pt_entry_t *firstl3, *l3, newl2, oldl3, pa; 3249 vm_page_t mpte; 3250 vm_offset_t sva; 3251 3252 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3253 PMAP_ASSERT_STAGE1(pmap); 3254 3255 sva = va & ~L2_OFFSET; 3256 firstl3 = pmap_l2_to_l3(l2, sva); 3257 newl2 = pmap_load(firstl3); 3258 3259 setl2: 3260 if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { 3261 atomic_add_long(&pmap_l2_p_failures, 1); 3262 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3263 " in pmap %p", va, pmap); 3264 return; 3265 } 3266 3267 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3268 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3269 if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) 3270 goto setl2; 3271 newl2 &= ~ATTR_SW_DBM; 3272 } 3273 3274 pa = newl2 + L2_SIZE - PAGE_SIZE; 3275 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 3276 oldl3 = pmap_load(l3); 3277 setl3: 3278 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 3279 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 3280 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 3281 ~ATTR_SW_DBM)) 3282 goto setl3; 3283 oldl3 &= ~ATTR_SW_DBM; 3284 } 3285 if (oldl3 != pa) { 3286 atomic_add_long(&pmap_l2_p_failures, 1); 3287 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 3288 " in pmap %p", va, pmap); 3289 return; 3290 } 3291 pa -= PAGE_SIZE; 3292 } 3293 3294 /* 3295 * Save the page table page in its current state until the L2 3296 * mapping the superpage is demoted by pmap_demote_l2() or 3297 * destroyed by pmap_remove_l3(). 3298 */ 3299 mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 3300 KASSERT(mpte >= vm_page_array && 3301 mpte < &vm_page_array[vm_page_array_size], 3302 ("pmap_promote_l2: page table page is out of range")); 3303 KASSERT(mpte->pindex == pmap_l2_pindex(va), 3304 ("pmap_promote_l2: page table page's pindex is wrong")); 3305 if (pmap_insert_pt_page(pmap, mpte, true)) { 3306 atomic_add_long(&pmap_l2_p_failures, 1); 3307 CTR2(KTR_PMAP, 3308 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 3309 pmap); 3310 return; 3311 } 3312 3313 if ((newl2 & ATTR_SW_MANAGED) != 0) 3314 pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); 3315 3316 newl2 &= ~ATTR_DESCR_MASK; 3317 newl2 |= L2_BLOCK; 3318 3319 pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); 3320 3321 atomic_add_long(&pmap_l2_promotions, 1); 3322 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 3323 pmap); 3324 } 3325 #endif /* VM_NRESERVLEVEL > 0 */ 3326 3327 /* 3328 * Insert the given physical page (p) at 3329 * the specified virtual address (v) in the 3330 * target physical map with the protection requested. 3331 * 3332 * If specified, the page will be wired down, meaning 3333 * that the related pte can not be reclaimed. 3334 * 3335 * NB: This is the only routine which MAY NOT lazy-evaluate 3336 * or lose information. That is, this routine must actually 3337 * insert this page into the given map NOW. 3338 */ 3339 int 3340 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3341 u_int flags, int8_t psind) 3342 { 3343 struct rwlock *lock; 3344 pd_entry_t *pde; 3345 pt_entry_t new_l3, orig_l3; 3346 pt_entry_t *l2, *l3; 3347 pv_entry_t pv; 3348 vm_paddr_t opa, pa; 3349 vm_page_t mpte, om; 3350 boolean_t nosleep; 3351 int lvl, rv; 3352 3353 PMAP_ASSERT_STAGE1(pmap); 3354 3355 va = trunc_page(va); 3356 if ((m->oflags & VPO_UNMANAGED) == 0) 3357 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3358 pa = VM_PAGE_TO_PHYS(m); 3359 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 3360 L3_PAGE); 3361 if ((prot & VM_PROT_WRITE) == 0) 3362 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 3363 if ((prot & VM_PROT_EXECUTE) == 0 || 3364 m->md.pv_memattr == VM_MEMATTR_DEVICE) 3365 new_l3 |= ATTR_S1_XN; 3366 if ((flags & PMAP_ENTER_WIRED) != 0) 3367 new_l3 |= ATTR_SW_WIRED; 3368 if (va < VM_MAXUSER_ADDRESS) 3369 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 3370 else 3371 new_l3 |= ATTR_S1_UXN; 3372 if (pmap != kernel_pmap) 3373 new_l3 |= ATTR_S1_nG; 3374 if ((m->oflags & VPO_UNMANAGED) == 0) { 3375 new_l3 |= ATTR_SW_MANAGED; 3376 if ((prot & VM_PROT_WRITE) != 0) { 3377 new_l3 |= ATTR_SW_DBM; 3378 if ((flags & VM_PROT_WRITE) == 0) 3379 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 3380 } 3381 } 3382 3383 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 3384 3385 lock = NULL; 3386 PMAP_LOCK(pmap); 3387 if (psind == 1) { 3388 /* Assert the required virtual and physical alignment. */ 3389 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 3390 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3391 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 3392 flags, m, &lock); 3393 goto out; 3394 } 3395 mpte = NULL; 3396 3397 /* 3398 * In the case that a page table page is not 3399 * resident, we are creating it here. 3400 */ 3401 retry: 3402 pde = pmap_pde(pmap, va, &lvl); 3403 if (pde != NULL && lvl == 2) { 3404 l3 = pmap_l2_to_l3(pde, va); 3405 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 3406 mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 3407 mpte->ref_count++; 3408 } 3409 goto havel3; 3410 } else if (pde != NULL && lvl == 1) { 3411 l2 = pmap_l1_to_l2(pde, va); 3412 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 3413 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 3414 l3 = &l3[pmap_l3_index(va)]; 3415 if (va < VM_MAXUSER_ADDRESS) { 3416 mpte = PHYS_TO_VM_PAGE( 3417 pmap_load(l2) & ~ATTR_MASK); 3418 mpte->ref_count++; 3419 } 3420 goto havel3; 3421 } 3422 /* We need to allocate an L3 table. */ 3423 } 3424 if (va < VM_MAXUSER_ADDRESS) { 3425 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 3426 3427 /* 3428 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 3429 * to handle the possibility that a superpage mapping for "va" 3430 * was created while we slept. 3431 */ 3432 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 3433 nosleep ? NULL : &lock); 3434 if (mpte == NULL && nosleep) { 3435 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 3436 rv = KERN_RESOURCE_SHORTAGE; 3437 goto out; 3438 } 3439 goto retry; 3440 } else 3441 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 3442 3443 havel3: 3444 orig_l3 = pmap_load(l3); 3445 opa = orig_l3 & ~ATTR_MASK; 3446 pv = NULL; 3447 3448 /* 3449 * Is the specified virtual address already mapped? 3450 */ 3451 if (pmap_l3_valid(orig_l3)) { 3452 /* 3453 * Wiring change, just update stats. We don't worry about 3454 * wiring PT pages as they remain resident as long as there 3455 * are valid mappings in them. Hence, if a user page is wired, 3456 * the PT page will be also. 3457 */ 3458 if ((flags & PMAP_ENTER_WIRED) != 0 && 3459 (orig_l3 & ATTR_SW_WIRED) == 0) 3460 pmap->pm_stats.wired_count++; 3461 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3462 (orig_l3 & ATTR_SW_WIRED) != 0) 3463 pmap->pm_stats.wired_count--; 3464 3465 /* 3466 * Remove the extra PT page reference. 3467 */ 3468 if (mpte != NULL) { 3469 mpte->ref_count--; 3470 KASSERT(mpte->ref_count > 0, 3471 ("pmap_enter: missing reference to page table page," 3472 " va: 0x%lx", va)); 3473 } 3474 3475 /* 3476 * Has the physical page changed? 3477 */ 3478 if (opa == pa) { 3479 /* 3480 * No, might be a protection or wiring change. 3481 */ 3482 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 3483 (new_l3 & ATTR_SW_DBM) != 0) 3484 vm_page_aflag_set(m, PGA_WRITEABLE); 3485 goto validate; 3486 } 3487 3488 /* 3489 * The physical page has changed. Temporarily invalidate 3490 * the mapping. 3491 */ 3492 orig_l3 = pmap_load_clear(l3); 3493 KASSERT((orig_l3 & ~ATTR_MASK) == opa, 3494 ("pmap_enter: unexpected pa update for %#lx", va)); 3495 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 3496 om = PHYS_TO_VM_PAGE(opa); 3497 3498 /* 3499 * The pmap lock is sufficient to synchronize with 3500 * concurrent calls to pmap_page_test_mappings() and 3501 * pmap_ts_referenced(). 3502 */ 3503 if (pmap_pte_dirty(pmap, orig_l3)) 3504 vm_page_dirty(om); 3505 if ((orig_l3 & ATTR_AF) != 0) { 3506 pmap_invalidate_page(pmap, va); 3507 vm_page_aflag_set(om, PGA_REFERENCED); 3508 } 3509 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3510 pv = pmap_pvh_remove(&om->md, pmap, va); 3511 if ((m->oflags & VPO_UNMANAGED) != 0) 3512 free_pv_entry(pmap, pv); 3513 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3514 TAILQ_EMPTY(&om->md.pv_list) && 3515 ((om->flags & PG_FICTITIOUS) != 0 || 3516 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3517 vm_page_aflag_clear(om, PGA_WRITEABLE); 3518 } else { 3519 KASSERT((orig_l3 & ATTR_AF) != 0, 3520 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 3521 pmap_invalidate_page(pmap, va); 3522 } 3523 orig_l3 = 0; 3524 } else { 3525 /* 3526 * Increment the counters. 3527 */ 3528 if ((new_l3 & ATTR_SW_WIRED) != 0) 3529 pmap->pm_stats.wired_count++; 3530 pmap_resident_count_inc(pmap, 1); 3531 } 3532 /* 3533 * Enter on the PV list if part of our managed memory. 3534 */ 3535 if ((m->oflags & VPO_UNMANAGED) == 0) { 3536 if (pv == NULL) { 3537 pv = get_pv_entry(pmap, &lock); 3538 pv->pv_va = va; 3539 } 3540 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3541 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3542 m->md.pv_gen++; 3543 if ((new_l3 & ATTR_SW_DBM) != 0) 3544 vm_page_aflag_set(m, PGA_WRITEABLE); 3545 } 3546 3547 validate: 3548 /* 3549 * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK 3550 * is set. Do it now, before the mapping is stored and made 3551 * valid for hardware table walk. If done later, then other can 3552 * access this page before caches are properly synced. 3553 * Don't do it for kernel memory which is mapped with exec 3554 * permission even if the memory isn't going to hold executable 3555 * code. The only time when icache sync is needed is after 3556 * kernel module is loaded and the relocation info is processed. 3557 * And it's done in elf_cpu_load_file(). 3558 */ 3559 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 3560 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && 3561 (opa != pa || (orig_l3 & ATTR_S1_XN))) 3562 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 3563 3564 /* 3565 * Update the L3 entry 3566 */ 3567 if (pmap_l3_valid(orig_l3)) { 3568 KASSERT(opa == pa, ("pmap_enter: invalid update")); 3569 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { 3570 /* same PA, different attributes */ 3571 orig_l3 = pmap_load_store(l3, new_l3); 3572 pmap_invalidate_page(pmap, va); 3573 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 3574 pmap_pte_dirty(pmap, orig_l3)) 3575 vm_page_dirty(m); 3576 } else { 3577 /* 3578 * orig_l3 == new_l3 3579 * This can happens if multiple threads simultaneously 3580 * access not yet mapped page. This bad for performance 3581 * since this can cause full demotion-NOP-promotion 3582 * cycle. 3583 * Another possible reasons are: 3584 * - VM and pmap memory layout are diverged 3585 * - tlb flush is missing somewhere and CPU doesn't see 3586 * actual mapping. 3587 */ 3588 CTR4(KTR_PMAP, "%s: already mapped page - " 3589 "pmap %p va 0x%#lx pte 0x%lx", 3590 __func__, pmap, va, new_l3); 3591 } 3592 } else { 3593 /* New mapping */ 3594 pmap_store(l3, new_l3); 3595 dsb(ishst); 3596 } 3597 3598 #if VM_NRESERVLEVEL > 0 3599 if ((mpte == NULL || mpte->ref_count == NL3PG) && 3600 pmap_ps_enabled(pmap) && 3601 (m->flags & PG_FICTITIOUS) == 0 && 3602 vm_reserv_level_iffullpop(m) == 0) { 3603 pmap_promote_l2(pmap, pde, va, &lock); 3604 } 3605 #endif 3606 3607 rv = KERN_SUCCESS; 3608 out: 3609 if (lock != NULL) 3610 rw_wunlock(lock); 3611 PMAP_UNLOCK(pmap); 3612 return (rv); 3613 } 3614 3615 /* 3616 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 3617 * if successful. Returns false if (1) a page table page cannot be allocated 3618 * without sleeping, (2) a mapping already exists at the specified virtual 3619 * address, or (3) a PV entry cannot be allocated without reclaiming another 3620 * PV entry. 3621 */ 3622 static bool 3623 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3624 struct rwlock **lockp) 3625 { 3626 pd_entry_t new_l2; 3627 3628 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3629 PMAP_ASSERT_STAGE1(pmap); 3630 3631 new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | 3632 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 3633 L2_BLOCK); 3634 if ((m->oflags & VPO_UNMANAGED) == 0) { 3635 new_l2 |= ATTR_SW_MANAGED; 3636 new_l2 &= ~ATTR_AF; 3637 } 3638 if ((prot & VM_PROT_EXECUTE) == 0 || 3639 m->md.pv_memattr == VM_MEMATTR_DEVICE) 3640 new_l2 |= ATTR_S1_XN; 3641 if (va < VM_MAXUSER_ADDRESS) 3642 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 3643 else 3644 new_l2 |= ATTR_S1_UXN; 3645 if (pmap != kernel_pmap) 3646 new_l2 |= ATTR_S1_nG; 3647 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3648 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 3649 KERN_SUCCESS); 3650 } 3651 3652 /* 3653 * Returns true if every page table entry in the specified page table is 3654 * zero. 3655 */ 3656 static bool 3657 pmap_every_pte_zero(vm_paddr_t pa) 3658 { 3659 pt_entry_t *pt_end, *pte; 3660 3661 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 3662 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 3663 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 3664 if (*pte != 0) 3665 return (false); 3666 } 3667 return (true); 3668 } 3669 3670 /* 3671 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3672 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 3673 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 3674 * a mapping already exists at the specified virtual address. Returns 3675 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 3676 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 3677 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3678 * 3679 * The parameter "m" is only used when creating a managed, writeable mapping. 3680 */ 3681 static int 3682 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3683 vm_page_t m, struct rwlock **lockp) 3684 { 3685 struct spglist free; 3686 pd_entry_t *l2, old_l2; 3687 vm_page_t l2pg, mt; 3688 3689 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3690 3691 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 3692 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 3693 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 3694 va, pmap); 3695 return (KERN_RESOURCE_SHORTAGE); 3696 } 3697 3698 /* 3699 * If there are existing mappings, either abort or remove them. 3700 */ 3701 if ((old_l2 = pmap_load(l2)) != 0) { 3702 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 3703 ("pmap_enter_l2: l2pg's ref count is too low")); 3704 if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < 3705 VM_MAXUSER_ADDRESS || (old_l2 & ATTR_DESCR_MASK) == 3706 L2_BLOCK || !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) { 3707 if (l2pg != NULL) 3708 l2pg->ref_count--; 3709 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx" 3710 " in pmap %p", va, pmap); 3711 return (KERN_FAILURE); 3712 } 3713 SLIST_INIT(&free); 3714 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 3715 (void)pmap_remove_l2(pmap, l2, va, 3716 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3717 else 3718 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 3719 &free, lockp); 3720 if (va < VM_MAXUSER_ADDRESS) { 3721 vm_page_free_pages_toq(&free, true); 3722 KASSERT(pmap_load(l2) == 0, 3723 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3724 } else { 3725 KASSERT(SLIST_EMPTY(&free), 3726 ("pmap_enter_l2: freed kernel page table page")); 3727 3728 /* 3729 * Both pmap_remove_l2() and pmap_remove_l3_range() 3730 * will leave the kernel page table page zero filled. 3731 * Nonetheless, the TLB could have an intermediate 3732 * entry for the kernel page table page. 3733 */ 3734 mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); 3735 if (pmap_insert_pt_page(pmap, mt, false)) 3736 panic("pmap_enter_l2: trie insert failed"); 3737 pmap_clear(l2); 3738 pmap_invalidate_page(pmap, va); 3739 } 3740 } 3741 3742 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 3743 /* 3744 * Abort this mapping if its PV entry could not be created. 3745 */ 3746 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3747 if (l2pg != NULL) 3748 pmap_abort_ptp(pmap, va, l2pg); 3749 CTR2(KTR_PMAP, 3750 "pmap_enter_l2: failure for va %#lx in pmap %p", 3751 va, pmap); 3752 return (KERN_RESOURCE_SHORTAGE); 3753 } 3754 if ((new_l2 & ATTR_SW_DBM) != 0) 3755 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3756 vm_page_aflag_set(mt, PGA_WRITEABLE); 3757 } 3758 3759 /* 3760 * Increment counters. 3761 */ 3762 if ((new_l2 & ATTR_SW_WIRED) != 0) 3763 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3764 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3765 3766 /* 3767 * Map the superpage. 3768 */ 3769 pmap_store(l2, new_l2); 3770 dsb(ishst); 3771 3772 atomic_add_long(&pmap_l2_mappings, 1); 3773 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3774 va, pmap); 3775 3776 return (KERN_SUCCESS); 3777 } 3778 3779 /* 3780 * Maps a sequence of resident pages belonging to the same object. 3781 * The sequence begins with the given page m_start. This page is 3782 * mapped at the given virtual address start. Each subsequent page is 3783 * mapped at a virtual address that is offset from start by the same 3784 * amount as the page is offset from m_start within the object. The 3785 * last page in the sequence is the page with the largest offset from 3786 * m_start that can be mapped at a virtual address less than the given 3787 * virtual address end. Not every virtual page between start and end 3788 * is mapped; only those for which a resident page exists with the 3789 * corresponding offset from m_start are mapped. 3790 */ 3791 void 3792 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3793 vm_page_t m_start, vm_prot_t prot) 3794 { 3795 struct rwlock *lock; 3796 vm_offset_t va; 3797 vm_page_t m, mpte; 3798 vm_pindex_t diff, psize; 3799 3800 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3801 3802 psize = atop(end - start); 3803 mpte = NULL; 3804 m = m_start; 3805 lock = NULL; 3806 PMAP_LOCK(pmap); 3807 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3808 va = start + ptoa(diff); 3809 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3810 m->psind == 1 && pmap_ps_enabled(pmap) && 3811 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3812 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3813 else 3814 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3815 &lock); 3816 m = TAILQ_NEXT(m, listq); 3817 } 3818 if (lock != NULL) 3819 rw_wunlock(lock); 3820 PMAP_UNLOCK(pmap); 3821 } 3822 3823 /* 3824 * this code makes some *MAJOR* assumptions: 3825 * 1. Current pmap & pmap exists. 3826 * 2. Not wired. 3827 * 3. Read access. 3828 * 4. No page table pages. 3829 * but is *MUCH* faster than pmap_enter... 3830 */ 3831 3832 void 3833 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3834 { 3835 struct rwlock *lock; 3836 3837 lock = NULL; 3838 PMAP_LOCK(pmap); 3839 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3840 if (lock != NULL) 3841 rw_wunlock(lock); 3842 PMAP_UNLOCK(pmap); 3843 } 3844 3845 static vm_page_t 3846 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3847 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3848 { 3849 pd_entry_t *pde; 3850 pt_entry_t *l2, *l3, l3_val; 3851 vm_paddr_t pa; 3852 int lvl; 3853 3854 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3855 (m->oflags & VPO_UNMANAGED) != 0, 3856 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3857 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3858 PMAP_ASSERT_STAGE1(pmap); 3859 3860 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3861 /* 3862 * In the case that a page table page is not 3863 * resident, we are creating it here. 3864 */ 3865 if (va < VM_MAXUSER_ADDRESS) { 3866 vm_pindex_t l2pindex; 3867 3868 /* 3869 * Calculate pagetable page index 3870 */ 3871 l2pindex = pmap_l2_pindex(va); 3872 if (mpte && (mpte->pindex == l2pindex)) { 3873 mpte->ref_count++; 3874 } else { 3875 /* 3876 * Get the l2 entry 3877 */ 3878 pde = pmap_pde(pmap, va, &lvl); 3879 3880 /* 3881 * If the page table page is mapped, we just increment 3882 * the hold count, and activate it. Otherwise, we 3883 * attempt to allocate a page table page. If this 3884 * attempt fails, we don't retry. Instead, we give up. 3885 */ 3886 if (lvl == 1) { 3887 l2 = pmap_l1_to_l2(pde, va); 3888 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 3889 L2_BLOCK) 3890 return (NULL); 3891 } 3892 if (lvl == 2 && pmap_load(pde) != 0) { 3893 mpte = 3894 PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); 3895 mpte->ref_count++; 3896 } else { 3897 /* 3898 * Pass NULL instead of the PV list lock 3899 * pointer, because we don't intend to sleep. 3900 */ 3901 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3902 if (mpte == NULL) 3903 return (mpte); 3904 } 3905 } 3906 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3907 l3 = &l3[pmap_l3_index(va)]; 3908 } else { 3909 mpte = NULL; 3910 pde = pmap_pde(kernel_pmap, va, &lvl); 3911 KASSERT(pde != NULL, 3912 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 3913 va)); 3914 KASSERT(lvl == 2, 3915 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 3916 l3 = pmap_l2_to_l3(pde, va); 3917 } 3918 3919 /* 3920 * Abort if a mapping already exists. 3921 */ 3922 if (pmap_load(l3) != 0) { 3923 if (mpte != NULL) 3924 mpte->ref_count--; 3925 return (NULL); 3926 } 3927 3928 /* 3929 * Enter on the PV list if part of our managed memory. 3930 */ 3931 if ((m->oflags & VPO_UNMANAGED) == 0 && 3932 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3933 if (mpte != NULL) 3934 pmap_abort_ptp(pmap, va, mpte); 3935 return (NULL); 3936 } 3937 3938 /* 3939 * Increment counters 3940 */ 3941 pmap_resident_count_inc(pmap, 1); 3942 3943 pa = VM_PAGE_TO_PHYS(m); 3944 l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 3945 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 3946 if ((prot & VM_PROT_EXECUTE) == 0 || 3947 m->md.pv_memattr == VM_MEMATTR_DEVICE) 3948 l3_val |= ATTR_S1_XN; 3949 if (va < VM_MAXUSER_ADDRESS) 3950 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 3951 else 3952 l3_val |= ATTR_S1_UXN; 3953 if (pmap != kernel_pmap) 3954 l3_val |= ATTR_S1_nG; 3955 3956 /* 3957 * Now validate mapping with RO protection 3958 */ 3959 if ((m->oflags & VPO_UNMANAGED) == 0) { 3960 l3_val |= ATTR_SW_MANAGED; 3961 l3_val &= ~ATTR_AF; 3962 } 3963 3964 /* Sync icache before the mapping is stored to PTE */ 3965 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 3966 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 3967 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 3968 3969 pmap_store(l3, l3_val); 3970 dsb(ishst); 3971 3972 return (mpte); 3973 } 3974 3975 /* 3976 * This code maps large physical mmap regions into the 3977 * processor address space. Note that some shortcuts 3978 * are taken, but the code works. 3979 */ 3980 void 3981 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3982 vm_pindex_t pindex, vm_size_t size) 3983 { 3984 3985 VM_OBJECT_ASSERT_WLOCKED(object); 3986 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3987 ("pmap_object_init_pt: non-device object")); 3988 } 3989 3990 /* 3991 * Clear the wired attribute from the mappings for the specified range of 3992 * addresses in the given pmap. Every valid mapping within that range 3993 * must have the wired attribute set. In contrast, invalid mappings 3994 * cannot have the wired attribute set, so they are ignored. 3995 * 3996 * The wired attribute of the page table entry is not a hardware feature, 3997 * so there is no need to invalidate any TLB entries. 3998 */ 3999 void 4000 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4001 { 4002 vm_offset_t va_next; 4003 pd_entry_t *l0, *l1, *l2; 4004 pt_entry_t *l3; 4005 4006 PMAP_LOCK(pmap); 4007 for (; sva < eva; sva = va_next) { 4008 l0 = pmap_l0(pmap, sva); 4009 if (pmap_load(l0) == 0) { 4010 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4011 if (va_next < sva) 4012 va_next = eva; 4013 continue; 4014 } 4015 4016 l1 = pmap_l0_to_l1(l0, sva); 4017 if (pmap_load(l1) == 0) { 4018 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4019 if (va_next < sva) 4020 va_next = eva; 4021 continue; 4022 } 4023 4024 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4025 if (va_next < sva) 4026 va_next = eva; 4027 4028 l2 = pmap_l1_to_l2(l1, sva); 4029 if (pmap_load(l2) == 0) 4030 continue; 4031 4032 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4033 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 4034 panic("pmap_unwire: l2 %#jx is missing " 4035 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 4036 4037 /* 4038 * Are we unwiring the entire large page? If not, 4039 * demote the mapping and fall through. 4040 */ 4041 if (sva + L2_SIZE == va_next && eva >= va_next) { 4042 pmap_clear_bits(l2, ATTR_SW_WIRED); 4043 pmap->pm_stats.wired_count -= L2_SIZE / 4044 PAGE_SIZE; 4045 continue; 4046 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 4047 panic("pmap_unwire: demotion failed"); 4048 } 4049 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4050 ("pmap_unwire: Invalid l2 entry after demotion")); 4051 4052 if (va_next > eva) 4053 va_next = eva; 4054 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 4055 sva += L3_SIZE) { 4056 if (pmap_load(l3) == 0) 4057 continue; 4058 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 4059 panic("pmap_unwire: l3 %#jx is missing " 4060 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 4061 4062 /* 4063 * ATTR_SW_WIRED must be cleared atomically. Although 4064 * the pmap lock synchronizes access to ATTR_SW_WIRED, 4065 * the System MMU may write to the entry concurrently. 4066 */ 4067 pmap_clear_bits(l3, ATTR_SW_WIRED); 4068 pmap->pm_stats.wired_count--; 4069 } 4070 } 4071 PMAP_UNLOCK(pmap); 4072 } 4073 4074 /* 4075 * Copy the range specified by src_addr/len 4076 * from the source map to the range dst_addr/len 4077 * in the destination map. 4078 * 4079 * This routine is only advisory and need not do anything. 4080 * 4081 * Because the executable mappings created by this routine are copied, 4082 * it should not have to flush the instruction cache. 4083 */ 4084 void 4085 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4086 vm_offset_t src_addr) 4087 { 4088 struct rwlock *lock; 4089 pd_entry_t *l0, *l1, *l2, srcptepaddr; 4090 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 4091 vm_offset_t addr, end_addr, va_next; 4092 vm_page_t dst_l2pg, dstmpte, srcmpte; 4093 4094 PMAP_ASSERT_STAGE1(dst_pmap); 4095 PMAP_ASSERT_STAGE1(src_pmap); 4096 4097 if (dst_addr != src_addr) 4098 return; 4099 end_addr = src_addr + len; 4100 lock = NULL; 4101 if (dst_pmap < src_pmap) { 4102 PMAP_LOCK(dst_pmap); 4103 PMAP_LOCK(src_pmap); 4104 } else { 4105 PMAP_LOCK(src_pmap); 4106 PMAP_LOCK(dst_pmap); 4107 } 4108 for (addr = src_addr; addr < end_addr; addr = va_next) { 4109 l0 = pmap_l0(src_pmap, addr); 4110 if (pmap_load(l0) == 0) { 4111 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 4112 if (va_next < addr) 4113 va_next = end_addr; 4114 continue; 4115 } 4116 l1 = pmap_l0_to_l1(l0, addr); 4117 if (pmap_load(l1) == 0) { 4118 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 4119 if (va_next < addr) 4120 va_next = end_addr; 4121 continue; 4122 } 4123 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 4124 if (va_next < addr) 4125 va_next = end_addr; 4126 l2 = pmap_l1_to_l2(l1, addr); 4127 srcptepaddr = pmap_load(l2); 4128 if (srcptepaddr == 0) 4129 continue; 4130 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4131 if ((addr & L2_OFFSET) != 0 || 4132 addr + L2_SIZE > end_addr) 4133 continue; 4134 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_l2pg, NULL); 4135 if (l2 == NULL) 4136 break; 4137 if (pmap_load(l2) == 0 && 4138 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 4139 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 4140 PMAP_ENTER_NORECLAIM, &lock))) { 4141 mask = ATTR_AF | ATTR_SW_WIRED; 4142 nbits = 0; 4143 if ((srcptepaddr & ATTR_SW_DBM) != 0) 4144 nbits |= ATTR_S1_AP_RW_BIT; 4145 pmap_store(l2, (srcptepaddr & ~mask) | nbits); 4146 pmap_resident_count_inc(dst_pmap, L2_SIZE / 4147 PAGE_SIZE); 4148 atomic_add_long(&pmap_l2_mappings, 1); 4149 } else 4150 pmap_abort_ptp(dst_pmap, addr, dst_l2pg); 4151 continue; 4152 } 4153 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 4154 ("pmap_copy: invalid L2 entry")); 4155 srcptepaddr &= ~ATTR_MASK; 4156 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4157 KASSERT(srcmpte->ref_count > 0, 4158 ("pmap_copy: source page table page is unused")); 4159 if (va_next > end_addr) 4160 va_next = end_addr; 4161 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4162 src_pte = &src_pte[pmap_l3_index(addr)]; 4163 dstmpte = NULL; 4164 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 4165 ptetemp = pmap_load(src_pte); 4166 4167 /* 4168 * We only virtual copy managed pages. 4169 */ 4170 if ((ptetemp & ATTR_SW_MANAGED) == 0) 4171 continue; 4172 4173 if (dstmpte != NULL) { 4174 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 4175 ("dstmpte pindex/addr mismatch")); 4176 dstmpte->ref_count++; 4177 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 4178 NULL)) == NULL) 4179 goto out; 4180 dst_pte = (pt_entry_t *) 4181 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4182 dst_pte = &dst_pte[pmap_l3_index(addr)]; 4183 if (pmap_load(dst_pte) == 0 && 4184 pmap_try_insert_pv_entry(dst_pmap, addr, 4185 PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { 4186 /* 4187 * Clear the wired, modified, and accessed 4188 * (referenced) bits during the copy. 4189 */ 4190 mask = ATTR_AF | ATTR_SW_WIRED; 4191 nbits = 0; 4192 if ((ptetemp & ATTR_SW_DBM) != 0) 4193 nbits |= ATTR_S1_AP_RW_BIT; 4194 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 4195 pmap_resident_count_inc(dst_pmap, 1); 4196 } else { 4197 pmap_abort_ptp(dst_pmap, addr, dstmpte); 4198 goto out; 4199 } 4200 /* Have we copied all of the valid mappings? */ 4201 if (dstmpte->ref_count >= srcmpte->ref_count) 4202 break; 4203 } 4204 } 4205 out: 4206 /* 4207 * XXX This barrier may not be needed because the destination pmap is 4208 * not active. 4209 */ 4210 dsb(ishst); 4211 4212 if (lock != NULL) 4213 rw_wunlock(lock); 4214 PMAP_UNLOCK(src_pmap); 4215 PMAP_UNLOCK(dst_pmap); 4216 } 4217 4218 /* 4219 * pmap_zero_page zeros the specified hardware page by mapping 4220 * the page into KVM and using bzero to clear its contents. 4221 */ 4222 void 4223 pmap_zero_page(vm_page_t m) 4224 { 4225 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4226 4227 pagezero((void *)va); 4228 } 4229 4230 /* 4231 * pmap_zero_page_area zeros the specified hardware page by mapping 4232 * the page into KVM and using bzero to clear its contents. 4233 * 4234 * off and size may not cover an area beyond a single hardware page. 4235 */ 4236 void 4237 pmap_zero_page_area(vm_page_t m, int off, int size) 4238 { 4239 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4240 4241 if (off == 0 && size == PAGE_SIZE) 4242 pagezero((void *)va); 4243 else 4244 bzero((char *)va + off, size); 4245 } 4246 4247 /* 4248 * pmap_copy_page copies the specified (machine independent) 4249 * page by mapping the page into virtual memory and using 4250 * bcopy to copy the page, one machine dependent page at a 4251 * time. 4252 */ 4253 void 4254 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 4255 { 4256 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 4257 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 4258 4259 pagecopy((void *)src, (void *)dst); 4260 } 4261 4262 int unmapped_buf_allowed = 1; 4263 4264 void 4265 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4266 vm_offset_t b_offset, int xfersize) 4267 { 4268 void *a_cp, *b_cp; 4269 vm_page_t m_a, m_b; 4270 vm_paddr_t p_a, p_b; 4271 vm_offset_t a_pg_offset, b_pg_offset; 4272 int cnt; 4273 4274 while (xfersize > 0) { 4275 a_pg_offset = a_offset & PAGE_MASK; 4276 m_a = ma[a_offset >> PAGE_SHIFT]; 4277 p_a = m_a->phys_addr; 4278 b_pg_offset = b_offset & PAGE_MASK; 4279 m_b = mb[b_offset >> PAGE_SHIFT]; 4280 p_b = m_b->phys_addr; 4281 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4282 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4283 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 4284 panic("!DMAP a %lx", p_a); 4285 } else { 4286 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 4287 } 4288 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 4289 panic("!DMAP b %lx", p_b); 4290 } else { 4291 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 4292 } 4293 bcopy(a_cp, b_cp, cnt); 4294 a_offset += cnt; 4295 b_offset += cnt; 4296 xfersize -= cnt; 4297 } 4298 } 4299 4300 vm_offset_t 4301 pmap_quick_enter_page(vm_page_t m) 4302 { 4303 4304 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 4305 } 4306 4307 void 4308 pmap_quick_remove_page(vm_offset_t addr) 4309 { 4310 } 4311 4312 /* 4313 * Returns true if the pmap's pv is one of the first 4314 * 16 pvs linked to from this page. This count may 4315 * be changed upwards or downwards in the future; it 4316 * is only necessary that true be returned for a small 4317 * subset of pmaps for proper page aging. 4318 */ 4319 boolean_t 4320 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4321 { 4322 struct md_page *pvh; 4323 struct rwlock *lock; 4324 pv_entry_t pv; 4325 int loops = 0; 4326 boolean_t rv; 4327 4328 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4329 ("pmap_page_exists_quick: page %p is not managed", m)); 4330 rv = FALSE; 4331 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4332 rw_rlock(lock); 4333 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4334 if (PV_PMAP(pv) == pmap) { 4335 rv = TRUE; 4336 break; 4337 } 4338 loops++; 4339 if (loops >= 16) 4340 break; 4341 } 4342 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4343 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4344 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4345 if (PV_PMAP(pv) == pmap) { 4346 rv = TRUE; 4347 break; 4348 } 4349 loops++; 4350 if (loops >= 16) 4351 break; 4352 } 4353 } 4354 rw_runlock(lock); 4355 return (rv); 4356 } 4357 4358 /* 4359 * pmap_page_wired_mappings: 4360 * 4361 * Return the number of managed mappings to the given physical page 4362 * that are wired. 4363 */ 4364 int 4365 pmap_page_wired_mappings(vm_page_t m) 4366 { 4367 struct rwlock *lock; 4368 struct md_page *pvh; 4369 pmap_t pmap; 4370 pt_entry_t *pte; 4371 pv_entry_t pv; 4372 int count, lvl, md_gen, pvh_gen; 4373 4374 if ((m->oflags & VPO_UNMANAGED) != 0) 4375 return (0); 4376 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4377 rw_rlock(lock); 4378 restart: 4379 count = 0; 4380 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4381 pmap = PV_PMAP(pv); 4382 if (!PMAP_TRYLOCK(pmap)) { 4383 md_gen = m->md.pv_gen; 4384 rw_runlock(lock); 4385 PMAP_LOCK(pmap); 4386 rw_rlock(lock); 4387 if (md_gen != m->md.pv_gen) { 4388 PMAP_UNLOCK(pmap); 4389 goto restart; 4390 } 4391 } 4392 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4393 if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) 4394 count++; 4395 PMAP_UNLOCK(pmap); 4396 } 4397 if ((m->flags & PG_FICTITIOUS) == 0) { 4398 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4399 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4400 pmap = PV_PMAP(pv); 4401 if (!PMAP_TRYLOCK(pmap)) { 4402 md_gen = m->md.pv_gen; 4403 pvh_gen = pvh->pv_gen; 4404 rw_runlock(lock); 4405 PMAP_LOCK(pmap); 4406 rw_rlock(lock); 4407 if (md_gen != m->md.pv_gen || 4408 pvh_gen != pvh->pv_gen) { 4409 PMAP_UNLOCK(pmap); 4410 goto restart; 4411 } 4412 } 4413 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4414 if (pte != NULL && 4415 (pmap_load(pte) & ATTR_SW_WIRED) != 0) 4416 count++; 4417 PMAP_UNLOCK(pmap); 4418 } 4419 } 4420 rw_runlock(lock); 4421 return (count); 4422 } 4423 4424 /* 4425 * Returns true if the given page is mapped individually or as part of 4426 * a 2mpage. Otherwise, returns false. 4427 */ 4428 bool 4429 pmap_page_is_mapped(vm_page_t m) 4430 { 4431 struct rwlock *lock; 4432 bool rv; 4433 4434 if ((m->oflags & VPO_UNMANAGED) != 0) 4435 return (false); 4436 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4437 rw_rlock(lock); 4438 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4439 ((m->flags & PG_FICTITIOUS) == 0 && 4440 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4441 rw_runlock(lock); 4442 return (rv); 4443 } 4444 4445 /* 4446 * Destroy all managed, non-wired mappings in the given user-space 4447 * pmap. This pmap cannot be active on any processor besides the 4448 * caller. 4449 * 4450 * This function cannot be applied to the kernel pmap. Moreover, it 4451 * is not intended for general use. It is only to be used during 4452 * process termination. Consequently, it can be implemented in ways 4453 * that make it faster than pmap_remove(). First, it can more quickly 4454 * destroy mappings by iterating over the pmap's collection of PV 4455 * entries, rather than searching the page table. Second, it doesn't 4456 * have to test and clear the page table entries atomically, because 4457 * no processor is currently accessing the user address space. In 4458 * particular, a page table entry's dirty bit won't change state once 4459 * this function starts. 4460 */ 4461 void 4462 pmap_remove_pages(pmap_t pmap) 4463 { 4464 pd_entry_t *pde; 4465 pt_entry_t *pte, tpte; 4466 struct spglist free; 4467 vm_page_t m, ml3, mt; 4468 pv_entry_t pv; 4469 struct md_page *pvh; 4470 struct pv_chunk *pc, *npc; 4471 struct rwlock *lock; 4472 int64_t bit; 4473 uint64_t inuse, bitmask; 4474 int allfree, field, freed, idx, lvl; 4475 vm_paddr_t pa; 4476 4477 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 4478 4479 lock = NULL; 4480 4481 SLIST_INIT(&free); 4482 PMAP_LOCK(pmap); 4483 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4484 allfree = 1; 4485 freed = 0; 4486 for (field = 0; field < _NPCM; field++) { 4487 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4488 while (inuse != 0) { 4489 bit = ffsl(inuse) - 1; 4490 bitmask = 1UL << bit; 4491 idx = field * 64 + bit; 4492 pv = &pc->pc_pventry[idx]; 4493 inuse &= ~bitmask; 4494 4495 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4496 KASSERT(pde != NULL, 4497 ("Attempting to remove an unmapped page")); 4498 4499 switch(lvl) { 4500 case 1: 4501 pte = pmap_l1_to_l2(pde, pv->pv_va); 4502 tpte = pmap_load(pte); 4503 KASSERT((tpte & ATTR_DESCR_MASK) == 4504 L2_BLOCK, 4505 ("Attempting to remove an invalid " 4506 "block: %lx", tpte)); 4507 break; 4508 case 2: 4509 pte = pmap_l2_to_l3(pde, pv->pv_va); 4510 tpte = pmap_load(pte); 4511 KASSERT((tpte & ATTR_DESCR_MASK) == 4512 L3_PAGE, 4513 ("Attempting to remove an invalid " 4514 "page: %lx", tpte)); 4515 break; 4516 default: 4517 panic( 4518 "Invalid page directory level: %d", 4519 lvl); 4520 } 4521 4522 /* 4523 * We cannot remove wired pages from a process' mapping at this time 4524 */ 4525 if (tpte & ATTR_SW_WIRED) { 4526 allfree = 0; 4527 continue; 4528 } 4529 4530 pa = tpte & ~ATTR_MASK; 4531 4532 m = PHYS_TO_VM_PAGE(pa); 4533 KASSERT(m->phys_addr == pa, 4534 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4535 m, (uintmax_t)m->phys_addr, 4536 (uintmax_t)tpte)); 4537 4538 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4539 m < &vm_page_array[vm_page_array_size], 4540 ("pmap_remove_pages: bad pte %#jx", 4541 (uintmax_t)tpte)); 4542 4543 /* 4544 * Because this pmap is not active on other 4545 * processors, the dirty bit cannot have 4546 * changed state since we last loaded pte. 4547 */ 4548 pmap_clear(pte); 4549 4550 /* 4551 * Update the vm_page_t clean/reference bits. 4552 */ 4553 if (pmap_pte_dirty(pmap, tpte)) { 4554 switch (lvl) { 4555 case 1: 4556 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4557 vm_page_dirty(mt); 4558 break; 4559 case 2: 4560 vm_page_dirty(m); 4561 break; 4562 } 4563 } 4564 4565 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4566 4567 /* Mark free */ 4568 pc->pc_map[field] |= bitmask; 4569 switch (lvl) { 4570 case 1: 4571 pmap_resident_count_dec(pmap, 4572 L2_SIZE / PAGE_SIZE); 4573 pvh = pa_to_pvh(tpte & ~ATTR_MASK); 4574 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 4575 pvh->pv_gen++; 4576 if (TAILQ_EMPTY(&pvh->pv_list)) { 4577 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4578 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 4579 TAILQ_EMPTY(&mt->md.pv_list)) 4580 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4581 } 4582 ml3 = pmap_remove_pt_page(pmap, 4583 pv->pv_va); 4584 if (ml3 != NULL) { 4585 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 4586 ("pmap_remove_pages: l3 page not promoted")); 4587 pmap_resident_count_dec(pmap,1); 4588 KASSERT(ml3->ref_count == NL3PG, 4589 ("pmap_remove_pages: l3 page ref count error")); 4590 ml3->ref_count = 0; 4591 pmap_add_delayed_free_list(ml3, 4592 &free, FALSE); 4593 } 4594 break; 4595 case 2: 4596 pmap_resident_count_dec(pmap, 1); 4597 TAILQ_REMOVE(&m->md.pv_list, pv, 4598 pv_next); 4599 m->md.pv_gen++; 4600 if ((m->a.flags & PGA_WRITEABLE) != 0 && 4601 TAILQ_EMPTY(&m->md.pv_list) && 4602 (m->flags & PG_FICTITIOUS) == 0) { 4603 pvh = pa_to_pvh( 4604 VM_PAGE_TO_PHYS(m)); 4605 if (TAILQ_EMPTY(&pvh->pv_list)) 4606 vm_page_aflag_clear(m, 4607 PGA_WRITEABLE); 4608 } 4609 break; 4610 } 4611 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 4612 &free); 4613 freed++; 4614 } 4615 } 4616 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4617 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4618 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4619 if (allfree) { 4620 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4621 free_pv_chunk(pc); 4622 } 4623 } 4624 if (lock != NULL) 4625 rw_wunlock(lock); 4626 pmap_invalidate_all(pmap); 4627 PMAP_UNLOCK(pmap); 4628 vm_page_free_pages_toq(&free, true); 4629 } 4630 4631 /* 4632 * This is used to check if a page has been accessed or modified. 4633 */ 4634 static boolean_t 4635 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 4636 { 4637 struct rwlock *lock; 4638 pv_entry_t pv; 4639 struct md_page *pvh; 4640 pt_entry_t *pte, mask, value; 4641 pmap_t pmap; 4642 int lvl, md_gen, pvh_gen; 4643 boolean_t rv; 4644 4645 rv = FALSE; 4646 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4647 rw_rlock(lock); 4648 restart: 4649 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4650 pmap = PV_PMAP(pv); 4651 PMAP_ASSERT_STAGE1(pmap); 4652 if (!PMAP_TRYLOCK(pmap)) { 4653 md_gen = m->md.pv_gen; 4654 rw_runlock(lock); 4655 PMAP_LOCK(pmap); 4656 rw_rlock(lock); 4657 if (md_gen != m->md.pv_gen) { 4658 PMAP_UNLOCK(pmap); 4659 goto restart; 4660 } 4661 } 4662 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4663 KASSERT(lvl == 3, 4664 ("pmap_page_test_mappings: Invalid level %d", lvl)); 4665 mask = 0; 4666 value = 0; 4667 if (modified) { 4668 mask |= ATTR_S1_AP_RW_BIT; 4669 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 4670 } 4671 if (accessed) { 4672 mask |= ATTR_AF | ATTR_DESCR_MASK; 4673 value |= ATTR_AF | L3_PAGE; 4674 } 4675 rv = (pmap_load(pte) & mask) == value; 4676 PMAP_UNLOCK(pmap); 4677 if (rv) 4678 goto out; 4679 } 4680 if ((m->flags & PG_FICTITIOUS) == 0) { 4681 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4682 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4683 pmap = PV_PMAP(pv); 4684 PMAP_ASSERT_STAGE1(pmap); 4685 if (!PMAP_TRYLOCK(pmap)) { 4686 md_gen = m->md.pv_gen; 4687 pvh_gen = pvh->pv_gen; 4688 rw_runlock(lock); 4689 PMAP_LOCK(pmap); 4690 rw_rlock(lock); 4691 if (md_gen != m->md.pv_gen || 4692 pvh_gen != pvh->pv_gen) { 4693 PMAP_UNLOCK(pmap); 4694 goto restart; 4695 } 4696 } 4697 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4698 KASSERT(lvl == 2, 4699 ("pmap_page_test_mappings: Invalid level %d", lvl)); 4700 mask = 0; 4701 value = 0; 4702 if (modified) { 4703 mask |= ATTR_S1_AP_RW_BIT; 4704 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 4705 } 4706 if (accessed) { 4707 mask |= ATTR_AF | ATTR_DESCR_MASK; 4708 value |= ATTR_AF | L2_BLOCK; 4709 } 4710 rv = (pmap_load(pte) & mask) == value; 4711 PMAP_UNLOCK(pmap); 4712 if (rv) 4713 goto out; 4714 } 4715 } 4716 out: 4717 rw_runlock(lock); 4718 return (rv); 4719 } 4720 4721 /* 4722 * pmap_is_modified: 4723 * 4724 * Return whether or not the specified physical page was modified 4725 * in any physical maps. 4726 */ 4727 boolean_t 4728 pmap_is_modified(vm_page_t m) 4729 { 4730 4731 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4732 ("pmap_is_modified: page %p is not managed", m)); 4733 4734 /* 4735 * If the page is not busied then this check is racy. 4736 */ 4737 if (!pmap_page_is_write_mapped(m)) 4738 return (FALSE); 4739 return (pmap_page_test_mappings(m, FALSE, TRUE)); 4740 } 4741 4742 /* 4743 * pmap_is_prefaultable: 4744 * 4745 * Return whether or not the specified virtual address is eligible 4746 * for prefault. 4747 */ 4748 boolean_t 4749 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4750 { 4751 pt_entry_t *pte; 4752 boolean_t rv; 4753 int lvl; 4754 4755 rv = FALSE; 4756 PMAP_LOCK(pmap); 4757 pte = pmap_pte(pmap, addr, &lvl); 4758 if (pte != NULL && pmap_load(pte) != 0) { 4759 rv = TRUE; 4760 } 4761 PMAP_UNLOCK(pmap); 4762 return (rv); 4763 } 4764 4765 /* 4766 * pmap_is_referenced: 4767 * 4768 * Return whether or not the specified physical page was referenced 4769 * in any physical maps. 4770 */ 4771 boolean_t 4772 pmap_is_referenced(vm_page_t m) 4773 { 4774 4775 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4776 ("pmap_is_referenced: page %p is not managed", m)); 4777 return (pmap_page_test_mappings(m, TRUE, FALSE)); 4778 } 4779 4780 /* 4781 * Clear the write and modified bits in each of the given page's mappings. 4782 */ 4783 void 4784 pmap_remove_write(vm_page_t m) 4785 { 4786 struct md_page *pvh; 4787 pmap_t pmap; 4788 struct rwlock *lock; 4789 pv_entry_t next_pv, pv; 4790 pt_entry_t oldpte, *pte; 4791 vm_offset_t va; 4792 int lvl, md_gen, pvh_gen; 4793 4794 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4795 ("pmap_remove_write: page %p is not managed", m)); 4796 vm_page_assert_busied(m); 4797 4798 if (!pmap_page_is_write_mapped(m)) 4799 return; 4800 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4801 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4802 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4803 retry_pv_loop: 4804 rw_wlock(lock); 4805 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4806 pmap = PV_PMAP(pv); 4807 PMAP_ASSERT_STAGE1(pmap); 4808 if (!PMAP_TRYLOCK(pmap)) { 4809 pvh_gen = pvh->pv_gen; 4810 rw_wunlock(lock); 4811 PMAP_LOCK(pmap); 4812 rw_wlock(lock); 4813 if (pvh_gen != pvh->pv_gen) { 4814 PMAP_UNLOCK(pmap); 4815 rw_wunlock(lock); 4816 goto retry_pv_loop; 4817 } 4818 } 4819 va = pv->pv_va; 4820 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4821 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 4822 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 4823 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4824 ("inconsistent pv lock %p %p for page %p", 4825 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4826 PMAP_UNLOCK(pmap); 4827 } 4828 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4829 pmap = PV_PMAP(pv); 4830 PMAP_ASSERT_STAGE1(pmap); 4831 if (!PMAP_TRYLOCK(pmap)) { 4832 pvh_gen = pvh->pv_gen; 4833 md_gen = m->md.pv_gen; 4834 rw_wunlock(lock); 4835 PMAP_LOCK(pmap); 4836 rw_wlock(lock); 4837 if (pvh_gen != pvh->pv_gen || 4838 md_gen != m->md.pv_gen) { 4839 PMAP_UNLOCK(pmap); 4840 rw_wunlock(lock); 4841 goto retry_pv_loop; 4842 } 4843 } 4844 pte = pmap_pte(pmap, pv->pv_va, &lvl); 4845 oldpte = pmap_load(pte); 4846 retry: 4847 if ((oldpte & ATTR_SW_DBM) != 0) { 4848 if (!atomic_fcmpset_long(pte, &oldpte, 4849 (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) 4850 goto retry; 4851 if ((oldpte & ATTR_S1_AP_RW_BIT) == 4852 ATTR_S1_AP(ATTR_S1_AP_RW)) 4853 vm_page_dirty(m); 4854 pmap_invalidate_page(pmap, pv->pv_va); 4855 } 4856 PMAP_UNLOCK(pmap); 4857 } 4858 rw_wunlock(lock); 4859 vm_page_aflag_clear(m, PGA_WRITEABLE); 4860 } 4861 4862 /* 4863 * pmap_ts_referenced: 4864 * 4865 * Return a count of reference bits for a page, clearing those bits. 4866 * It is not necessary for every reference bit to be cleared, but it 4867 * is necessary that 0 only be returned when there are truly no 4868 * reference bits set. 4869 * 4870 * As an optimization, update the page's dirty field if a modified bit is 4871 * found while counting reference bits. This opportunistic update can be 4872 * performed at low cost and can eliminate the need for some future calls 4873 * to pmap_is_modified(). However, since this function stops after 4874 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4875 * dirty pages. Those dirty pages will only be detected by a future call 4876 * to pmap_is_modified(). 4877 */ 4878 int 4879 pmap_ts_referenced(vm_page_t m) 4880 { 4881 struct md_page *pvh; 4882 pv_entry_t pv, pvf; 4883 pmap_t pmap; 4884 struct rwlock *lock; 4885 pd_entry_t *pde, tpde; 4886 pt_entry_t *pte, tpte; 4887 vm_offset_t va; 4888 vm_paddr_t pa; 4889 int cleared, lvl, md_gen, not_cleared, pvh_gen; 4890 struct spglist free; 4891 4892 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4893 ("pmap_ts_referenced: page %p is not managed", m)); 4894 SLIST_INIT(&free); 4895 cleared = 0; 4896 pa = VM_PAGE_TO_PHYS(m); 4897 lock = PHYS_TO_PV_LIST_LOCK(pa); 4898 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 4899 rw_wlock(lock); 4900 retry: 4901 not_cleared = 0; 4902 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4903 goto small_mappings; 4904 pv = pvf; 4905 do { 4906 if (pvf == NULL) 4907 pvf = pv; 4908 pmap = PV_PMAP(pv); 4909 if (!PMAP_TRYLOCK(pmap)) { 4910 pvh_gen = pvh->pv_gen; 4911 rw_wunlock(lock); 4912 PMAP_LOCK(pmap); 4913 rw_wlock(lock); 4914 if (pvh_gen != pvh->pv_gen) { 4915 PMAP_UNLOCK(pmap); 4916 goto retry; 4917 } 4918 } 4919 va = pv->pv_va; 4920 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4921 KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); 4922 KASSERT(lvl == 1, 4923 ("pmap_ts_referenced: invalid pde level %d", lvl)); 4924 tpde = pmap_load(pde); 4925 KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, 4926 ("pmap_ts_referenced: found an invalid l1 table")); 4927 pte = pmap_l1_to_l2(pde, pv->pv_va); 4928 tpte = pmap_load(pte); 4929 if (pmap_pte_dirty(pmap, tpte)) { 4930 /* 4931 * Although "tpte" is mapping a 2MB page, because 4932 * this function is called at a 4KB page granularity, 4933 * we only update the 4KB page under test. 4934 */ 4935 vm_page_dirty(m); 4936 } 4937 4938 if ((tpte & ATTR_AF) != 0) { 4939 /* 4940 * Since this reference bit is shared by 512 4KB pages, 4941 * it should not be cleared every time it is tested. 4942 * Apply a simple "hash" function on the physical page 4943 * number, the virtual superpage number, and the pmap 4944 * address to select one 4KB page out of the 512 on 4945 * which testing the reference bit will result in 4946 * clearing that reference bit. This function is 4947 * designed to avoid the selection of the same 4KB page 4948 * for every 2MB page mapping. 4949 * 4950 * On demotion, a mapping that hasn't been referenced 4951 * is simply destroyed. To avoid the possibility of a 4952 * subsequent page fault on a demoted wired mapping, 4953 * always leave its reference bit set. Moreover, 4954 * since the superpage is wired, the current state of 4955 * its reference bit won't affect page replacement. 4956 */ 4957 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4958 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4959 (tpte & ATTR_SW_WIRED) == 0) { 4960 pmap_clear_bits(pte, ATTR_AF); 4961 pmap_invalidate_page(pmap, pv->pv_va); 4962 cleared++; 4963 } else 4964 not_cleared++; 4965 } 4966 PMAP_UNLOCK(pmap); 4967 /* Rotate the PV list if it has more than one entry. */ 4968 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4969 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4970 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4971 pvh->pv_gen++; 4972 } 4973 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4974 goto out; 4975 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4976 small_mappings: 4977 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4978 goto out; 4979 pv = pvf; 4980 do { 4981 if (pvf == NULL) 4982 pvf = pv; 4983 pmap = PV_PMAP(pv); 4984 if (!PMAP_TRYLOCK(pmap)) { 4985 pvh_gen = pvh->pv_gen; 4986 md_gen = m->md.pv_gen; 4987 rw_wunlock(lock); 4988 PMAP_LOCK(pmap); 4989 rw_wlock(lock); 4990 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4991 PMAP_UNLOCK(pmap); 4992 goto retry; 4993 } 4994 } 4995 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4996 KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); 4997 KASSERT(lvl == 2, 4998 ("pmap_ts_referenced: invalid pde level %d", lvl)); 4999 tpde = pmap_load(pde); 5000 KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, 5001 ("pmap_ts_referenced: found an invalid l2 table")); 5002 pte = pmap_l2_to_l3(pde, pv->pv_va); 5003 tpte = pmap_load(pte); 5004 if (pmap_pte_dirty(pmap, tpte)) 5005 vm_page_dirty(m); 5006 if ((tpte & ATTR_AF) != 0) { 5007 if ((tpte & ATTR_SW_WIRED) == 0) { 5008 pmap_clear_bits(pte, ATTR_AF); 5009 pmap_invalidate_page(pmap, pv->pv_va); 5010 cleared++; 5011 } else 5012 not_cleared++; 5013 } 5014 PMAP_UNLOCK(pmap); 5015 /* Rotate the PV list if it has more than one entry. */ 5016 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5017 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5018 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5019 m->md.pv_gen++; 5020 } 5021 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5022 not_cleared < PMAP_TS_REFERENCED_MAX); 5023 out: 5024 rw_wunlock(lock); 5025 vm_page_free_pages_toq(&free, true); 5026 return (cleared + not_cleared); 5027 } 5028 5029 /* 5030 * Apply the given advice to the specified range of addresses within the 5031 * given pmap. Depending on the advice, clear the referenced and/or 5032 * modified flags in each mapping and set the mapped page's dirty field. 5033 */ 5034 void 5035 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5036 { 5037 struct rwlock *lock; 5038 vm_offset_t va, va_next; 5039 vm_page_t m; 5040 pd_entry_t *l0, *l1, *l2, oldl2; 5041 pt_entry_t *l3, oldl3; 5042 5043 PMAP_ASSERT_STAGE1(pmap); 5044 5045 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5046 return; 5047 5048 PMAP_LOCK(pmap); 5049 for (; sva < eva; sva = va_next) { 5050 l0 = pmap_l0(pmap, sva); 5051 if (pmap_load(l0) == 0) { 5052 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 5053 if (va_next < sva) 5054 va_next = eva; 5055 continue; 5056 } 5057 l1 = pmap_l0_to_l1(l0, sva); 5058 if (pmap_load(l1) == 0) { 5059 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 5060 if (va_next < sva) 5061 va_next = eva; 5062 continue; 5063 } 5064 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 5065 if (va_next < sva) 5066 va_next = eva; 5067 l2 = pmap_l1_to_l2(l1, sva); 5068 oldl2 = pmap_load(l2); 5069 if (oldl2 == 0) 5070 continue; 5071 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5072 if ((oldl2 & ATTR_SW_MANAGED) == 0) 5073 continue; 5074 lock = NULL; 5075 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 5076 if (lock != NULL) 5077 rw_wunlock(lock); 5078 5079 /* 5080 * The 2MB page mapping was destroyed. 5081 */ 5082 continue; 5083 } 5084 5085 /* 5086 * Unless the page mappings are wired, remove the 5087 * mapping to a single page so that a subsequent 5088 * access may repromote. Choosing the last page 5089 * within the address range [sva, min(va_next, eva)) 5090 * generally results in more repromotions. Since the 5091 * underlying page table page is fully populated, this 5092 * removal never frees a page table page. 5093 */ 5094 if ((oldl2 & ATTR_SW_WIRED) == 0) { 5095 va = eva; 5096 if (va > va_next) 5097 va = va_next; 5098 va -= PAGE_SIZE; 5099 KASSERT(va >= sva, 5100 ("pmap_advise: no address gap")); 5101 l3 = pmap_l2_to_l3(l2, va); 5102 KASSERT(pmap_load(l3) != 0, 5103 ("pmap_advise: invalid PTE")); 5104 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 5105 NULL, &lock); 5106 } 5107 if (lock != NULL) 5108 rw_wunlock(lock); 5109 } 5110 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 5111 ("pmap_advise: invalid L2 entry after demotion")); 5112 if (va_next > eva) 5113 va_next = eva; 5114 va = va_next; 5115 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 5116 sva += L3_SIZE) { 5117 oldl3 = pmap_load(l3); 5118 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 5119 (ATTR_SW_MANAGED | L3_PAGE)) 5120 goto maybe_invlrng; 5121 else if (pmap_pte_dirty(pmap, oldl3)) { 5122 if (advice == MADV_DONTNEED) { 5123 /* 5124 * Future calls to pmap_is_modified() 5125 * can be avoided by making the page 5126 * dirty now. 5127 */ 5128 m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); 5129 vm_page_dirty(m); 5130 } 5131 while (!atomic_fcmpset_long(l3, &oldl3, 5132 (oldl3 & ~ATTR_AF) | 5133 ATTR_S1_AP(ATTR_S1_AP_RO))) 5134 cpu_spinwait(); 5135 } else if ((oldl3 & ATTR_AF) != 0) 5136 pmap_clear_bits(l3, ATTR_AF); 5137 else 5138 goto maybe_invlrng; 5139 if (va == va_next) 5140 va = sva; 5141 continue; 5142 maybe_invlrng: 5143 if (va != va_next) { 5144 pmap_invalidate_range(pmap, va, sva); 5145 va = va_next; 5146 } 5147 } 5148 if (va != va_next) 5149 pmap_invalidate_range(pmap, va, sva); 5150 } 5151 PMAP_UNLOCK(pmap); 5152 } 5153 5154 /* 5155 * Clear the modify bits on the specified physical page. 5156 */ 5157 void 5158 pmap_clear_modify(vm_page_t m) 5159 { 5160 struct md_page *pvh; 5161 struct rwlock *lock; 5162 pmap_t pmap; 5163 pv_entry_t next_pv, pv; 5164 pd_entry_t *l2, oldl2; 5165 pt_entry_t *l3, oldl3; 5166 vm_offset_t va; 5167 int md_gen, pvh_gen; 5168 5169 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5170 ("pmap_clear_modify: page %p is not managed", m)); 5171 vm_page_assert_busied(m); 5172 5173 if (!pmap_page_is_write_mapped(m)) 5174 return; 5175 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 5176 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5177 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5178 rw_wlock(lock); 5179 restart: 5180 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5181 pmap = PV_PMAP(pv); 5182 PMAP_ASSERT_STAGE1(pmap); 5183 if (!PMAP_TRYLOCK(pmap)) { 5184 pvh_gen = pvh->pv_gen; 5185 rw_wunlock(lock); 5186 PMAP_LOCK(pmap); 5187 rw_wlock(lock); 5188 if (pvh_gen != pvh->pv_gen) { 5189 PMAP_UNLOCK(pmap); 5190 goto restart; 5191 } 5192 } 5193 va = pv->pv_va; 5194 l2 = pmap_l2(pmap, va); 5195 oldl2 = pmap_load(l2); 5196 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 5197 if ((oldl2 & ATTR_SW_DBM) != 0 && 5198 pmap_demote_l2_locked(pmap, l2, va, &lock) && 5199 (oldl2 & ATTR_SW_WIRED) == 0) { 5200 /* 5201 * Write protect the mapping to a single page so that 5202 * a subsequent write access may repromote. 5203 */ 5204 va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); 5205 l3 = pmap_l2_to_l3(l2, va); 5206 oldl3 = pmap_load(l3); 5207 while (!atomic_fcmpset_long(l3, &oldl3, 5208 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 5209 cpu_spinwait(); 5210 vm_page_dirty(m); 5211 pmap_invalidate_page(pmap, va); 5212 } 5213 PMAP_UNLOCK(pmap); 5214 } 5215 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5216 pmap = PV_PMAP(pv); 5217 PMAP_ASSERT_STAGE1(pmap); 5218 if (!PMAP_TRYLOCK(pmap)) { 5219 md_gen = m->md.pv_gen; 5220 pvh_gen = pvh->pv_gen; 5221 rw_wunlock(lock); 5222 PMAP_LOCK(pmap); 5223 rw_wlock(lock); 5224 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5225 PMAP_UNLOCK(pmap); 5226 goto restart; 5227 } 5228 } 5229 l2 = pmap_l2(pmap, pv->pv_va); 5230 l3 = pmap_l2_to_l3(l2, pv->pv_va); 5231 oldl3 = pmap_load(l3); 5232 if (pmap_l3_valid(oldl3) && 5233 (oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ 5234 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 5235 pmap_invalidate_page(pmap, pv->pv_va); 5236 } 5237 PMAP_UNLOCK(pmap); 5238 } 5239 rw_wunlock(lock); 5240 } 5241 5242 void * 5243 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5244 { 5245 struct pmap_preinit_mapping *ppim; 5246 vm_offset_t va, offset; 5247 pd_entry_t *pde; 5248 pt_entry_t *l2; 5249 int i, lvl, l2_blocks, free_l2_count, start_idx; 5250 5251 if (!vm_initialized) { 5252 /* 5253 * No L3 ptables so map entire L2 blocks where start VA is: 5254 * preinit_map_va + start_idx * L2_SIZE 5255 * There may be duplicate mappings (multiple VA -> same PA) but 5256 * ARM64 dcache is always PIPT so that's acceptable. 5257 */ 5258 if (size == 0) 5259 return (NULL); 5260 5261 /* Calculate how many L2 blocks are needed for the mapping */ 5262 l2_blocks = (roundup2(pa + size, L2_SIZE) - 5263 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 5264 5265 offset = pa & L2_OFFSET; 5266 5267 if (preinit_map_va == 0) 5268 return (NULL); 5269 5270 /* Map 2MiB L2 blocks from reserved VA space */ 5271 5272 free_l2_count = 0; 5273 start_idx = -1; 5274 /* Find enough free contiguous VA space */ 5275 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5276 ppim = pmap_preinit_mapping + i; 5277 if (free_l2_count > 0 && ppim->pa != 0) { 5278 /* Not enough space here */ 5279 free_l2_count = 0; 5280 start_idx = -1; 5281 continue; 5282 } 5283 5284 if (ppim->pa == 0) { 5285 /* Free L2 block */ 5286 if (start_idx == -1) 5287 start_idx = i; 5288 free_l2_count++; 5289 if (free_l2_count == l2_blocks) 5290 break; 5291 } 5292 } 5293 if (free_l2_count != l2_blocks) 5294 panic("%s: too many preinit mappings", __func__); 5295 5296 va = preinit_map_va + (start_idx * L2_SIZE); 5297 for (i = start_idx; i < start_idx + l2_blocks; i++) { 5298 /* Mark entries as allocated */ 5299 ppim = pmap_preinit_mapping + i; 5300 ppim->pa = pa; 5301 ppim->va = va + offset; 5302 ppim->size = size; 5303 } 5304 5305 /* Map L2 blocks */ 5306 pa = rounddown2(pa, L2_SIZE); 5307 for (i = 0; i < l2_blocks; i++) { 5308 pde = pmap_pde(kernel_pmap, va, &lvl); 5309 KASSERT(pde != NULL, 5310 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 5311 va)); 5312 KASSERT(lvl == 1, 5313 ("pmap_mapbios: Invalid level %d", lvl)); 5314 5315 /* Insert L2_BLOCK */ 5316 l2 = pmap_l1_to_l2(pde, va); 5317 pmap_load_store(l2, 5318 pa | ATTR_DEFAULT | ATTR_S1_XN | 5319 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 5320 5321 va += L2_SIZE; 5322 pa += L2_SIZE; 5323 } 5324 pmap_invalidate_all(kernel_pmap); 5325 5326 va = preinit_map_va + (start_idx * L2_SIZE); 5327 5328 } else { 5329 /* kva_alloc may be used to map the pages */ 5330 offset = pa & PAGE_MASK; 5331 size = round_page(offset + size); 5332 5333 va = kva_alloc(size); 5334 if (va == 0) 5335 panic("%s: Couldn't allocate KVA", __func__); 5336 5337 pde = pmap_pde(kernel_pmap, va, &lvl); 5338 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 5339 5340 /* L3 table is linked */ 5341 va = trunc_page(va); 5342 pa = trunc_page(pa); 5343 pmap_kenter(va, size, pa, VM_MEMATTR_WRITE_BACK); 5344 } 5345 5346 return ((void *)(va + offset)); 5347 } 5348 5349 void 5350 pmap_unmapbios(vm_offset_t va, vm_size_t size) 5351 { 5352 struct pmap_preinit_mapping *ppim; 5353 vm_offset_t offset, tmpsize, va_trunc; 5354 pd_entry_t *pde; 5355 pt_entry_t *l2; 5356 int i, lvl, l2_blocks, block; 5357 bool preinit_map; 5358 5359 l2_blocks = 5360 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 5361 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 5362 5363 /* Remove preinit mapping */ 5364 preinit_map = false; 5365 block = 0; 5366 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5367 ppim = pmap_preinit_mapping + i; 5368 if (ppim->va == va) { 5369 KASSERT(ppim->size == size, 5370 ("pmap_unmapbios: size mismatch")); 5371 ppim->va = 0; 5372 ppim->pa = 0; 5373 ppim->size = 0; 5374 preinit_map = true; 5375 offset = block * L2_SIZE; 5376 va_trunc = rounddown2(va, L2_SIZE) + offset; 5377 5378 /* Remove L2_BLOCK */ 5379 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 5380 KASSERT(pde != NULL, 5381 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 5382 va_trunc)); 5383 l2 = pmap_l1_to_l2(pde, va_trunc); 5384 pmap_clear(l2); 5385 5386 if (block == (l2_blocks - 1)) 5387 break; 5388 block++; 5389 } 5390 } 5391 if (preinit_map) { 5392 pmap_invalidate_all(kernel_pmap); 5393 return; 5394 } 5395 5396 /* Unmap the pages reserved with kva_alloc. */ 5397 if (vm_initialized) { 5398 offset = va & PAGE_MASK; 5399 size = round_page(offset + size); 5400 va = trunc_page(va); 5401 5402 pde = pmap_pde(kernel_pmap, va, &lvl); 5403 KASSERT(pde != NULL, 5404 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); 5405 KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); 5406 5407 /* Unmap and invalidate the pages */ 5408 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5409 pmap_kremove(va + tmpsize); 5410 5411 kva_free(va, size); 5412 } 5413 } 5414 5415 /* 5416 * Sets the memory attribute for the specified page. 5417 */ 5418 void 5419 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5420 { 5421 5422 m->md.pv_memattr = ma; 5423 5424 /* 5425 * If "m" is a normal page, update its direct mapping. This update 5426 * can be relied upon to perform any cache operations that are 5427 * required for data coherence. 5428 */ 5429 if ((m->flags & PG_FICTITIOUS) == 0 && 5430 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 5431 m->md.pv_memattr) != 0) 5432 panic("memory attribute change on the direct map failed"); 5433 } 5434 5435 /* 5436 * Changes the specified virtual address range's memory type to that given by 5437 * the parameter "mode". The specified virtual address range must be 5438 * completely contained within either the direct map or the kernel map. If 5439 * the virtual address range is contained within the kernel map, then the 5440 * memory type for each of the corresponding ranges of the direct map is also 5441 * changed. (The corresponding ranges of the direct map are those ranges that 5442 * map the same physical pages as the specified virtual address range.) These 5443 * changes to the direct map are necessary because Intel describes the 5444 * behavior of their processors as "undefined" if two or more mappings to the 5445 * same physical page have different memory types. 5446 * 5447 * Returns zero if the change completed successfully, and either EINVAL or 5448 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5449 * of the virtual address range was not mapped, and ENOMEM is returned if 5450 * there was insufficient memory available to complete the change. In the 5451 * latter case, the memory type may have been changed on some part of the 5452 * virtual address range or the direct map. 5453 */ 5454 int 5455 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5456 { 5457 int error; 5458 5459 PMAP_LOCK(kernel_pmap); 5460 error = pmap_change_attr_locked(va, size, mode); 5461 PMAP_UNLOCK(kernel_pmap); 5462 return (error); 5463 } 5464 5465 static int 5466 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 5467 { 5468 vm_offset_t base, offset, tmpva; 5469 pt_entry_t l3, *pte, *newpte; 5470 int lvl; 5471 5472 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 5473 base = trunc_page(va); 5474 offset = va & PAGE_MASK; 5475 size = round_page(offset + size); 5476 5477 if (!VIRT_IN_DMAP(base) && 5478 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 5479 return (EINVAL); 5480 5481 for (tmpva = base; tmpva < base + size; ) { 5482 pte = pmap_pte(kernel_pmap, tmpva, &lvl); 5483 if (pte == NULL) 5484 return (EINVAL); 5485 5486 if ((pmap_load(pte) & ATTR_S1_IDX_MASK) == ATTR_S1_IDX(mode)) { 5487 /* 5488 * We already have the correct attribute, 5489 * ignore this entry. 5490 */ 5491 switch (lvl) { 5492 default: 5493 panic("Invalid DMAP table level: %d\n", lvl); 5494 case 1: 5495 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 5496 break; 5497 case 2: 5498 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 5499 break; 5500 case 3: 5501 tmpva += PAGE_SIZE; 5502 break; 5503 } 5504 } else { 5505 /* 5506 * Split the entry to an level 3 table, then 5507 * set the new attribute. 5508 */ 5509 switch (lvl) { 5510 default: 5511 panic("Invalid DMAP table level: %d\n", lvl); 5512 case 1: 5513 newpte = pmap_demote_l1(kernel_pmap, pte, 5514 tmpva & ~L1_OFFSET); 5515 if (newpte == NULL) 5516 return (EINVAL); 5517 pte = pmap_l1_to_l2(pte, tmpva); 5518 case 2: 5519 newpte = pmap_demote_l2(kernel_pmap, pte, 5520 tmpva); 5521 if (newpte == NULL) 5522 return (EINVAL); 5523 pte = pmap_l2_to_l3(pte, tmpva); 5524 case 3: 5525 /* Update the entry */ 5526 l3 = pmap_load(pte); 5527 l3 &= ~ATTR_S1_IDX_MASK; 5528 l3 |= ATTR_S1_IDX(mode); 5529 if (mode == VM_MEMATTR_DEVICE) 5530 l3 |= ATTR_S1_XN; 5531 5532 pmap_update_entry(kernel_pmap, pte, l3, tmpva, 5533 PAGE_SIZE); 5534 5535 /* 5536 * If moving to a non-cacheable entry flush 5537 * the cache. 5538 */ 5539 if (mode == VM_MEMATTR_UNCACHEABLE) 5540 cpu_dcache_wbinv_range(tmpva, L3_SIZE); 5541 5542 break; 5543 } 5544 tmpva += PAGE_SIZE; 5545 } 5546 } 5547 5548 return (0); 5549 } 5550 5551 /* 5552 * Create an L2 table to map all addresses within an L1 mapping. 5553 */ 5554 static pt_entry_t * 5555 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 5556 { 5557 pt_entry_t *l2, newl2, oldl1; 5558 vm_offset_t tmpl1; 5559 vm_paddr_t l2phys, phys; 5560 vm_page_t ml2; 5561 int i; 5562 5563 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5564 oldl1 = pmap_load(l1); 5565 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 5566 ("pmap_demote_l1: Demoting a non-block entry")); 5567 KASSERT((va & L1_OFFSET) == 0, 5568 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 5569 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 5570 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 5571 5572 tmpl1 = 0; 5573 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 5574 tmpl1 = kva_alloc(PAGE_SIZE); 5575 if (tmpl1 == 0) 5576 return (NULL); 5577 } 5578 5579 if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | 5580 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 5581 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 5582 " in pmap %p", va, pmap); 5583 return (NULL); 5584 } 5585 5586 l2phys = VM_PAGE_TO_PHYS(ml2); 5587 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 5588 5589 /* Address the range points at */ 5590 phys = oldl1 & ~ATTR_MASK; 5591 /* The attributed from the old l1 table to be copied */ 5592 newl2 = oldl1 & ATTR_MASK; 5593 5594 /* Create the new entries */ 5595 for (i = 0; i < Ln_ENTRIES; i++) { 5596 l2[i] = newl2 | phys; 5597 phys += L2_SIZE; 5598 } 5599 KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), 5600 ("Invalid l2 page (%lx != %lx)", l2[0], 5601 (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 5602 5603 if (tmpl1 != 0) { 5604 pmap_kenter(tmpl1, PAGE_SIZE, 5605 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, 5606 VM_MEMATTR_WRITE_BACK); 5607 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 5608 } 5609 5610 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 5611 5612 if (tmpl1 != 0) { 5613 pmap_kremove(tmpl1); 5614 kva_free(tmpl1, PAGE_SIZE); 5615 } 5616 5617 return (l2); 5618 } 5619 5620 static void 5621 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 5622 { 5623 pt_entry_t *l3; 5624 5625 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 5626 *l3 = newl3; 5627 newl3 += L3_SIZE; 5628 } 5629 } 5630 5631 static void 5632 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 5633 struct rwlock **lockp) 5634 { 5635 struct spglist free; 5636 5637 SLIST_INIT(&free); 5638 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, 5639 lockp); 5640 vm_page_free_pages_toq(&free, true); 5641 } 5642 5643 /* 5644 * Create an L3 table to map all addresses within an L2 mapping. 5645 */ 5646 static pt_entry_t * 5647 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 5648 struct rwlock **lockp) 5649 { 5650 pt_entry_t *l3, newl3, oldl2; 5651 vm_offset_t tmpl2; 5652 vm_paddr_t l3phys; 5653 vm_page_t ml3; 5654 5655 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5656 PMAP_ASSERT_STAGE1(pmap); 5657 l3 = NULL; 5658 oldl2 = pmap_load(l2); 5659 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 5660 ("pmap_demote_l2: Demoting a non-block entry")); 5661 va &= ~L2_OFFSET; 5662 5663 tmpl2 = 0; 5664 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 5665 tmpl2 = kva_alloc(PAGE_SIZE); 5666 if (tmpl2 == 0) 5667 return (NULL); 5668 } 5669 5670 /* 5671 * Invalidate the 2MB page mapping and return "failure" if the 5672 * mapping was never accessed. 5673 */ 5674 if ((oldl2 & ATTR_AF) == 0) { 5675 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 5676 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 5677 pmap_demote_l2_abort(pmap, va, l2, lockp); 5678 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 5679 va, pmap); 5680 goto fail; 5681 } 5682 5683 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 5684 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 5685 ("pmap_demote_l2: page table page for a wired mapping" 5686 " is missing")); 5687 5688 /* 5689 * If the page table page is missing and the mapping 5690 * is for a kernel address, the mapping must belong to 5691 * the direct map. Page table pages are preallocated 5692 * for every other part of the kernel address space, 5693 * so the direct map region is the only part of the 5694 * kernel address space that must be handled here. 5695 */ 5696 KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), 5697 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 5698 5699 /* 5700 * If the 2MB page mapping belongs to the direct map 5701 * region of the kernel's address space, then the page 5702 * allocation request specifies the highest possible 5703 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 5704 * priority is normal. 5705 */ 5706 ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), 5707 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 5708 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 5709 5710 /* 5711 * If the allocation of the new page table page fails, 5712 * invalidate the 2MB page mapping and return "failure". 5713 */ 5714 if (ml3 == NULL) { 5715 pmap_demote_l2_abort(pmap, va, l2, lockp); 5716 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 5717 " in pmap %p", va, pmap); 5718 goto fail; 5719 } 5720 5721 if (va < VM_MAXUSER_ADDRESS) { 5722 ml3->ref_count = NL3PG; 5723 pmap_resident_count_inc(pmap, 1); 5724 } 5725 } 5726 l3phys = VM_PAGE_TO_PHYS(ml3); 5727 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 5728 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 5729 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 5730 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 5731 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 5732 5733 /* 5734 * If the page table page is not leftover from an earlier promotion, 5735 * or the mapping attributes have changed, (re)initialize the L3 table. 5736 * 5737 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 5738 * performs a dsb(). That dsb() ensures that the stores for filling 5739 * "l3" are visible before "l3" is added to the page table. 5740 */ 5741 if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) 5742 pmap_fill_l3(l3, newl3); 5743 5744 /* 5745 * Map the temporary page so we don't lose access to the l2 table. 5746 */ 5747 if (tmpl2 != 0) { 5748 pmap_kenter(tmpl2, PAGE_SIZE, 5749 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 5750 VM_MEMATTR_WRITE_BACK); 5751 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 5752 } 5753 5754 /* 5755 * The spare PV entries must be reserved prior to demoting the 5756 * mapping, that is, prior to changing the PDE. Otherwise, the state 5757 * of the L2 and the PV lists will be inconsistent, which can result 5758 * in reclaim_pv_chunk() attempting to remove a PV entry from the 5759 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 5760 * PV entry for the 2MB page mapping that is being demoted. 5761 */ 5762 if ((oldl2 & ATTR_SW_MANAGED) != 0) 5763 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 5764 5765 /* 5766 * Pass PAGE_SIZE so that a single TLB invalidation is performed on 5767 * the 2MB page mapping. 5768 */ 5769 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); 5770 5771 /* 5772 * Demote the PV entry. 5773 */ 5774 if ((oldl2 & ATTR_SW_MANAGED) != 0) 5775 pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); 5776 5777 atomic_add_long(&pmap_l2_demotions, 1); 5778 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" 5779 " in pmap %p %lx", va, pmap, l3[0]); 5780 5781 fail: 5782 if (tmpl2 != 0) { 5783 pmap_kremove(tmpl2); 5784 kva_free(tmpl2, PAGE_SIZE); 5785 } 5786 5787 return (l3); 5788 5789 } 5790 5791 static pt_entry_t * 5792 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 5793 { 5794 struct rwlock *lock; 5795 pt_entry_t *l3; 5796 5797 lock = NULL; 5798 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); 5799 if (lock != NULL) 5800 rw_wunlock(lock); 5801 return (l3); 5802 } 5803 5804 /* 5805 * Perform the pmap work for mincore(2). If the page is not both referenced and 5806 * modified by this pmap, returns its physical address so that the caller can 5807 * find other mappings. 5808 */ 5809 int 5810 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 5811 { 5812 pt_entry_t *pte, tpte; 5813 vm_paddr_t mask, pa; 5814 int lvl, val; 5815 bool managed; 5816 5817 PMAP_ASSERT_STAGE1(pmap); 5818 PMAP_LOCK(pmap); 5819 pte = pmap_pte(pmap, addr, &lvl); 5820 if (pte != NULL) { 5821 tpte = pmap_load(pte); 5822 5823 switch (lvl) { 5824 case 3: 5825 mask = L3_OFFSET; 5826 break; 5827 case 2: 5828 mask = L2_OFFSET; 5829 break; 5830 case 1: 5831 mask = L1_OFFSET; 5832 break; 5833 default: 5834 panic("pmap_mincore: invalid level %d", lvl); 5835 } 5836 5837 managed = (tpte & ATTR_SW_MANAGED) != 0; 5838 val = MINCORE_INCORE; 5839 if (lvl != 3) 5840 val |= MINCORE_SUPER; 5841 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 5842 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 5843 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5844 if ((tpte & ATTR_AF) == ATTR_AF) 5845 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5846 5847 pa = (tpte & ~ATTR_MASK) | (addr & mask); 5848 } else { 5849 managed = false; 5850 val = 0; 5851 } 5852 5853 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5854 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 5855 *pap = pa; 5856 } 5857 PMAP_UNLOCK(pmap); 5858 return (val); 5859 } 5860 5861 /* 5862 * Garbage collect every ASID that is neither active on a processor nor 5863 * reserved. 5864 */ 5865 static void 5866 pmap_reset_asid_set(pmap_t pmap) 5867 { 5868 pmap_t curpmap; 5869 int asid, cpuid, epoch; 5870 struct asid_set *set; 5871 5872 PMAP_ASSERT_STAGE1(pmap); 5873 5874 set = pmap->pm_asid_set; 5875 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 5876 mtx_assert(&set->asid_set_mutex, MA_OWNED); 5877 5878 /* 5879 * Ensure that the store to asid_epoch is globally visible before the 5880 * loads from pc_curpmap are performed. 5881 */ 5882 epoch = set->asid_epoch + 1; 5883 if (epoch == INT_MAX) 5884 epoch = 0; 5885 set->asid_epoch = epoch; 5886 dsb(ishst); 5887 __asm __volatile("tlbi vmalle1is"); 5888 dsb(ish); 5889 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 5890 set->asid_set_size - 1); 5891 CPU_FOREACH(cpuid) { 5892 if (cpuid == curcpu) 5893 continue; 5894 curpmap = pcpu_find(cpuid)->pc_curpmap; 5895 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 5896 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 5897 if (asid == -1) 5898 continue; 5899 bit_set(set->asid_set, asid); 5900 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 5901 } 5902 } 5903 5904 /* 5905 * Allocate a new ASID for the specified pmap. 5906 */ 5907 static void 5908 pmap_alloc_asid(pmap_t pmap) 5909 { 5910 struct asid_set *set; 5911 int new_asid; 5912 5913 PMAP_ASSERT_STAGE1(pmap); 5914 set = pmap->pm_asid_set; 5915 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 5916 5917 mtx_lock_spin(&set->asid_set_mutex); 5918 5919 /* 5920 * While this processor was waiting to acquire the asid set mutex, 5921 * pmap_reset_asid_set() running on another processor might have 5922 * updated this pmap's cookie to the current epoch. In which case, we 5923 * don't need to allocate a new ASID. 5924 */ 5925 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 5926 goto out; 5927 5928 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 5929 &new_asid); 5930 if (new_asid == -1) { 5931 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 5932 set->asid_next, &new_asid); 5933 if (new_asid == -1) { 5934 pmap_reset_asid_set(pmap); 5935 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 5936 set->asid_set_size, &new_asid); 5937 KASSERT(new_asid != -1, ("ASID allocation failure")); 5938 } 5939 } 5940 bit_set(set->asid_set, new_asid); 5941 set->asid_next = new_asid + 1; 5942 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 5943 out: 5944 mtx_unlock_spin(&set->asid_set_mutex); 5945 } 5946 5947 /* 5948 * Compute the value that should be stored in ttbr0 to activate the specified 5949 * pmap. This value may change from time to time. 5950 */ 5951 uint64_t 5952 pmap_to_ttbr0(pmap_t pmap) 5953 { 5954 5955 PMAP_ASSERT_STAGE1(pmap); 5956 return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | 5957 pmap->pm_l0_paddr); 5958 } 5959 5960 static bool 5961 pmap_activate_int(pmap_t pmap) 5962 { 5963 struct asid_set *set; 5964 int epoch; 5965 5966 PMAP_ASSERT_STAGE1(pmap); 5967 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 5968 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 5969 if (pmap == PCPU_GET(curpmap)) { 5970 /* 5971 * Handle the possibility that the old thread was preempted 5972 * after an "ic" or "tlbi" instruction but before it performed 5973 * a "dsb" instruction. If the old thread migrates to a new 5974 * processor, its completion of a "dsb" instruction on that 5975 * new processor does not guarantee that the "ic" or "tlbi" 5976 * instructions performed on the old processor have completed. 5977 */ 5978 dsb(ish); 5979 return (false); 5980 } 5981 5982 set = pmap->pm_asid_set; 5983 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 5984 5985 /* 5986 * Ensure that the store to curpmap is globally visible before the 5987 * load from asid_epoch is performed. 5988 */ 5989 PCPU_SET(curpmap, pmap); 5990 dsb(ish); 5991 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 5992 if (epoch >= 0 && epoch != set->asid_epoch) 5993 pmap_alloc_asid(pmap); 5994 5995 set_ttbr0(pmap_to_ttbr0(pmap)); 5996 if (PCPU_GET(bcast_tlbi_workaround) != 0) 5997 invalidate_local_icache(); 5998 return (true); 5999 } 6000 6001 void 6002 pmap_activate(struct thread *td) 6003 { 6004 pmap_t pmap; 6005 6006 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6007 PMAP_ASSERT_STAGE1(pmap); 6008 critical_enter(); 6009 (void)pmap_activate_int(pmap); 6010 critical_exit(); 6011 } 6012 6013 /* 6014 * To eliminate the unused parameter "old", we would have to add an instruction 6015 * to cpu_switch(). 6016 */ 6017 struct pcb * 6018 pmap_switch(struct thread *old __unused, struct thread *new) 6019 { 6020 pcpu_bp_harden bp_harden; 6021 struct pcb *pcb; 6022 6023 /* Store the new curthread */ 6024 PCPU_SET(curthread, new); 6025 6026 /* And the new pcb */ 6027 pcb = new->td_pcb; 6028 PCPU_SET(curpcb, pcb); 6029 6030 /* 6031 * TODO: We may need to flush the cache here if switching 6032 * to a user process. 6033 */ 6034 6035 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { 6036 /* 6037 * Stop userspace from training the branch predictor against 6038 * other processes. This will call into a CPU specific 6039 * function that clears the branch predictor state. 6040 */ 6041 bp_harden = PCPU_GET(bp_harden); 6042 if (bp_harden != NULL) 6043 bp_harden(); 6044 } 6045 6046 return (pcb); 6047 } 6048 6049 void 6050 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 6051 { 6052 6053 PMAP_ASSERT_STAGE1(pmap); 6054 if (va >= VM_MIN_KERNEL_ADDRESS) { 6055 cpu_icache_sync_range(va, sz); 6056 } else { 6057 u_int len, offset; 6058 vm_paddr_t pa; 6059 6060 /* Find the length of data in this page to flush */ 6061 offset = va & PAGE_MASK; 6062 len = imin(PAGE_SIZE - offset, sz); 6063 6064 while (sz != 0) { 6065 /* Extract the physical address & find it in the DMAP */ 6066 pa = pmap_extract(pmap, va); 6067 if (pa != 0) 6068 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); 6069 6070 /* Move to the next page */ 6071 sz -= len; 6072 va += len; 6073 /* Set the length for the next iteration */ 6074 len = imin(PAGE_SIZE, sz); 6075 } 6076 } 6077 } 6078 6079 int 6080 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) 6081 { 6082 pt_entry_t pte, *ptep; 6083 register_t intr; 6084 uint64_t ec, par; 6085 int lvl, rv; 6086 6087 PMAP_ASSERT_STAGE1(pmap); 6088 rv = KERN_FAILURE; 6089 6090 ec = ESR_ELx_EXCEPTION(esr); 6091 switch (ec) { 6092 case EXCP_INSN_ABORT_L: 6093 case EXCP_INSN_ABORT: 6094 case EXCP_DATA_ABORT_L: 6095 case EXCP_DATA_ABORT: 6096 break; 6097 default: 6098 return (rv); 6099 } 6100 6101 /* Data and insn aborts use same encoding for FSC field. */ 6102 switch (esr & ISS_DATA_DFSC_MASK) { 6103 case ISS_DATA_DFSC_AFF_L1: 6104 case ISS_DATA_DFSC_AFF_L2: 6105 case ISS_DATA_DFSC_AFF_L3: 6106 PMAP_LOCK(pmap); 6107 ptep = pmap_pte(pmap, far, &lvl); 6108 if (ptep != NULL) { 6109 pmap_set_bits(ptep, ATTR_AF); 6110 rv = KERN_SUCCESS; 6111 /* 6112 * XXXMJ as an optimization we could mark the entry 6113 * dirty if this is a write fault. 6114 */ 6115 } 6116 PMAP_UNLOCK(pmap); 6117 break; 6118 case ISS_DATA_DFSC_PF_L1: 6119 case ISS_DATA_DFSC_PF_L2: 6120 case ISS_DATA_DFSC_PF_L3: 6121 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || 6122 (esr & ISS_DATA_WnR) == 0) 6123 return (rv); 6124 PMAP_LOCK(pmap); 6125 ptep = pmap_pte(pmap, far, &lvl); 6126 if (ptep != NULL && 6127 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { 6128 if ((pte & ATTR_S1_AP_RW_BIT) == 6129 ATTR_S1_AP(ATTR_S1_AP_RO)) { 6130 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); 6131 pmap_invalidate_page(pmap, far); 6132 } 6133 rv = KERN_SUCCESS; 6134 } 6135 PMAP_UNLOCK(pmap); 6136 break; 6137 case ISS_DATA_DFSC_TF_L0: 6138 case ISS_DATA_DFSC_TF_L1: 6139 case ISS_DATA_DFSC_TF_L2: 6140 case ISS_DATA_DFSC_TF_L3: 6141 /* 6142 * Retry the translation. A break-before-make sequence can 6143 * produce a transient fault. 6144 */ 6145 if (pmap == kernel_pmap) { 6146 /* 6147 * The translation fault may have occurred within a 6148 * critical section. Therefore, we must check the 6149 * address without acquiring the kernel pmap's lock. 6150 */ 6151 if (pmap_kextract(far) != 0) 6152 rv = KERN_SUCCESS; 6153 } else { 6154 PMAP_LOCK(pmap); 6155 /* Ask the MMU to check the address. */ 6156 intr = intr_disable(); 6157 par = arm64_address_translate_s1e0r(far); 6158 intr_restore(intr); 6159 PMAP_UNLOCK(pmap); 6160 6161 /* 6162 * If the translation was successful, then we can 6163 * return success to the trap handler. 6164 */ 6165 if (PAR_SUCCESS(par)) 6166 rv = KERN_SUCCESS; 6167 } 6168 break; 6169 } 6170 6171 return (rv); 6172 } 6173 6174 /* 6175 * Increase the starting virtual address of the given mapping if a 6176 * different alignment might result in more superpage mappings. 6177 */ 6178 void 6179 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6180 vm_offset_t *addr, vm_size_t size) 6181 { 6182 vm_offset_t superpage_offset; 6183 6184 if (size < L2_SIZE) 6185 return; 6186 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6187 offset += ptoa(object->pg_color); 6188 superpage_offset = offset & L2_OFFSET; 6189 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 6190 (*addr & L2_OFFSET) == superpage_offset) 6191 return; 6192 if ((*addr & L2_OFFSET) < superpage_offset) 6193 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 6194 else 6195 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 6196 } 6197 6198 /** 6199 * Get the kernel virtual address of a set of physical pages. If there are 6200 * physical addresses not covered by the DMAP perform a transient mapping 6201 * that will be removed when calling pmap_unmap_io_transient. 6202 * 6203 * \param page The pages the caller wishes to obtain the virtual 6204 * address on the kernel memory map. 6205 * \param vaddr On return contains the kernel virtual memory address 6206 * of the pages passed in the page parameter. 6207 * \param count Number of pages passed in. 6208 * \param can_fault TRUE if the thread using the mapped pages can take 6209 * page faults, FALSE otherwise. 6210 * 6211 * \returns TRUE if the caller must call pmap_unmap_io_transient when 6212 * finished or FALSE otherwise. 6213 * 6214 */ 6215 boolean_t 6216 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 6217 boolean_t can_fault) 6218 { 6219 vm_paddr_t paddr; 6220 boolean_t needs_mapping; 6221 int error, i; 6222 6223 /* 6224 * Allocate any KVA space that we need, this is done in a separate 6225 * loop to prevent calling vmem_alloc while pinned. 6226 */ 6227 needs_mapping = FALSE; 6228 for (i = 0; i < count; i++) { 6229 paddr = VM_PAGE_TO_PHYS(page[i]); 6230 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 6231 error = vmem_alloc(kernel_arena, PAGE_SIZE, 6232 M_BESTFIT | M_WAITOK, &vaddr[i]); 6233 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 6234 needs_mapping = TRUE; 6235 } else { 6236 vaddr[i] = PHYS_TO_DMAP(paddr); 6237 } 6238 } 6239 6240 /* Exit early if everything is covered by the DMAP */ 6241 if (!needs_mapping) 6242 return (FALSE); 6243 6244 if (!can_fault) 6245 sched_pin(); 6246 for (i = 0; i < count; i++) { 6247 paddr = VM_PAGE_TO_PHYS(page[i]); 6248 if (!PHYS_IN_DMAP(paddr)) { 6249 panic( 6250 "pmap_map_io_transient: TODO: Map out of DMAP data"); 6251 } 6252 } 6253 6254 return (needs_mapping); 6255 } 6256 6257 void 6258 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 6259 boolean_t can_fault) 6260 { 6261 vm_paddr_t paddr; 6262 int i; 6263 6264 if (!can_fault) 6265 sched_unpin(); 6266 for (i = 0; i < count; i++) { 6267 paddr = VM_PAGE_TO_PHYS(page[i]); 6268 if (!PHYS_IN_DMAP(paddr)) { 6269 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 6270 } 6271 } 6272 } 6273 6274 boolean_t 6275 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 6276 { 6277 6278 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 6279 } 6280 6281 /* 6282 * Track a range of the kernel's virtual address space that is contiguous 6283 * in various mapping attributes. 6284 */ 6285 struct pmap_kernel_map_range { 6286 vm_offset_t sva; 6287 pt_entry_t attrs; 6288 int l3pages; 6289 int l3contig; 6290 int l2blocks; 6291 int l1blocks; 6292 }; 6293 6294 static void 6295 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 6296 vm_offset_t eva) 6297 { 6298 const char *mode; 6299 int index; 6300 6301 if (eva <= range->sva) 6302 return; 6303 6304 index = range->attrs & ATTR_S1_IDX_MASK; 6305 switch (index) { 6306 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 6307 mode = "DEV"; 6308 break; 6309 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 6310 mode = "UC"; 6311 break; 6312 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 6313 mode = "WB"; 6314 break; 6315 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 6316 mode = "WT"; 6317 break; 6318 default: 6319 printf( 6320 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 6321 __func__, index, range->sva, eva); 6322 mode = "??"; 6323 break; 6324 } 6325 6326 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c %3s %d %d %d %d\n", 6327 range->sva, eva, 6328 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 6329 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 6330 (range->attrs & ATTR_S1_AP_USER) != 0 ? 'u' : 's', 6331 mode, range->l1blocks, range->l2blocks, range->l3contig, 6332 range->l3pages); 6333 6334 /* Reset to sentinel value. */ 6335 range->sva = 0xfffffffffffffffful; 6336 } 6337 6338 /* 6339 * Determine whether the attributes specified by a page table entry match those 6340 * being tracked by the current range. 6341 */ 6342 static bool 6343 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 6344 { 6345 6346 return (range->attrs == attrs); 6347 } 6348 6349 static void 6350 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 6351 pt_entry_t attrs) 6352 { 6353 6354 memset(range, 0, sizeof(*range)); 6355 range->sva = va; 6356 range->attrs = attrs; 6357 } 6358 6359 /* 6360 * Given a leaf PTE, derive the mapping's attributes. If they do not match 6361 * those of the current run, dump the address range and its attributes, and 6362 * begin a new run. 6363 */ 6364 static void 6365 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 6366 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 6367 pt_entry_t l3e) 6368 { 6369 pt_entry_t attrs; 6370 6371 attrs = l0e & (ATTR_S1_AP_MASK | ATTR_S1_XN); 6372 attrs |= l1e & (ATTR_S1_AP_MASK | ATTR_S1_XN); 6373 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) 6374 attrs |= l1e & ATTR_S1_IDX_MASK; 6375 attrs |= l2e & (ATTR_S1_AP_MASK | ATTR_S1_XN); 6376 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) 6377 attrs |= l2e & ATTR_S1_IDX_MASK; 6378 attrs |= l3e & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK); 6379 6380 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 6381 sysctl_kmaps_dump(sb, range, va); 6382 sysctl_kmaps_reinit(range, va, attrs); 6383 } 6384 } 6385 6386 static int 6387 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 6388 { 6389 struct pmap_kernel_map_range range; 6390 struct sbuf sbuf, *sb; 6391 pd_entry_t l0e, *l1, l1e, *l2, l2e; 6392 pt_entry_t *l3, l3e; 6393 vm_offset_t sva; 6394 vm_paddr_t pa; 6395 int error, i, j, k, l; 6396 6397 error = sysctl_wire_old_buffer(req, 0); 6398 if (error != 0) 6399 return (error); 6400 sb = &sbuf; 6401 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 6402 6403 /* Sentinel value. */ 6404 range.sva = 0xfffffffffffffffful; 6405 6406 /* 6407 * Iterate over the kernel page tables without holding the kernel pmap 6408 * lock. Kernel page table pages are never freed, so at worst we will 6409 * observe inconsistencies in the output. 6410 */ 6411 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 6412 i++) { 6413 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 6414 sbuf_printf(sb, "\nDirect map:\n"); 6415 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 6416 sbuf_printf(sb, "\nKernel map:\n"); 6417 6418 l0e = kernel_pmap->pm_l0[i]; 6419 if ((l0e & ATTR_DESCR_VALID) == 0) { 6420 sysctl_kmaps_dump(sb, &range, sva); 6421 sva += L0_SIZE; 6422 continue; 6423 } 6424 pa = l0e & ~ATTR_MASK; 6425 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 6426 6427 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 6428 l1e = l1[j]; 6429 if ((l1e & ATTR_DESCR_VALID) == 0) { 6430 sysctl_kmaps_dump(sb, &range, sva); 6431 sva += L1_SIZE; 6432 continue; 6433 } 6434 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 6435 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 6436 0, 0); 6437 range.l1blocks++; 6438 sva += L1_SIZE; 6439 continue; 6440 } 6441 pa = l1e & ~ATTR_MASK; 6442 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 6443 6444 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 6445 l2e = l2[k]; 6446 if ((l2e & ATTR_DESCR_VALID) == 0) { 6447 sysctl_kmaps_dump(sb, &range, sva); 6448 sva += L2_SIZE; 6449 continue; 6450 } 6451 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 6452 sysctl_kmaps_check(sb, &range, sva, 6453 l0e, l1e, l2e, 0); 6454 range.l2blocks++; 6455 sva += L2_SIZE; 6456 continue; 6457 } 6458 pa = l2e & ~ATTR_MASK; 6459 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 6460 6461 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 6462 l++, sva += L3_SIZE) { 6463 l3e = l3[l]; 6464 if ((l3e & ATTR_DESCR_VALID) == 0) { 6465 sysctl_kmaps_dump(sb, &range, 6466 sva); 6467 continue; 6468 } 6469 sysctl_kmaps_check(sb, &range, sva, 6470 l0e, l1e, l2e, l3e); 6471 if ((l3e & ATTR_CONTIGUOUS) != 0) 6472 range.l3contig += l % 16 == 0 ? 6473 1 : 0; 6474 else 6475 range.l3pages++; 6476 } 6477 } 6478 } 6479 } 6480 6481 error = sbuf_finish(sb); 6482 sbuf_delete(sb); 6483 return (error); 6484 } 6485 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 6486 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 6487 NULL, 0, sysctl_kmaps, "A", 6488 "Dump kernel address layout"); 6489