1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 */ 64 /*- 65 * Copyright (c) 2003 Networks Associates Technology, Inc. 66 * All rights reserved. 67 * 68 * This software was developed for the FreeBSD Project by Jake Burkholder, 69 * Safeport Network Services, and Network Associates Laboratories, the 70 * Security Research Division of Network Associates, Inc. 
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidations expensive,
 * this module may delay invalidation or reduced-protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
114 */ 115 116 #include <sys/param.h> 117 #include <sys/systm.h> 118 #include <sys/bitstring.h> 119 #include <sys/bus.h> 120 #include <sys/cpuset.h> 121 #include <sys/kernel.h> 122 #include <sys/ktr.h> 123 #include <sys/lock.h> 124 #include <sys/malloc.h> 125 #include <sys/mman.h> 126 #include <sys/msgbuf.h> 127 #include <sys/mutex.h> 128 #include <sys/physmem.h> 129 #include <sys/proc.h> 130 #include <sys/rwlock.h> 131 #include <sys/sbuf.h> 132 #include <sys/sx.h> 133 #include <sys/vmem.h> 134 #include <sys/vmmeter.h> 135 #include <sys/sched.h> 136 #include <sys/sysctl.h> 137 #include <sys/smp.h> 138 139 #include <vm/vm.h> 140 #include <vm/vm_param.h> 141 #include <vm/vm_kern.h> 142 #include <vm/vm_page.h> 143 #include <vm/vm_map.h> 144 #include <vm/vm_object.h> 145 #include <vm/vm_extern.h> 146 #include <vm/vm_pageout.h> 147 #include <vm/vm_pager.h> 148 #include <vm/vm_phys.h> 149 #include <vm/vm_radix.h> 150 #include <vm/vm_reserv.h> 151 #include <vm/vm_dumpset.h> 152 #include <vm/uma.h> 153 154 #include <machine/machdep.h> 155 #include <machine/md_var.h> 156 #include <machine/pcb.h> 157 #include <machine/sbi.h> 158 159 /* 160 * Boundary values for the page table page index space: 161 * 162 * L3 pages: [0, NUL2E) 163 * L2 pages: [NUL2E, NUL2E + NUL1E) 164 * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) 165 * 166 * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the 167 * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages 168 * in a set of page tables. 169 */ 170 #define NUL0E Ln_ENTRIES 171 #define NUL1E (Ln_ENTRIES * NUL0E) 172 #define NUL2E (Ln_ENTRIES * NUL1E) 173 174 #ifdef PV_STATS 175 #define PV_STAT(x) do { x ; } while (0) 176 #define __pv_stat_used 177 #else 178 #define PV_STAT(x) do { } while (0) 179 #define __pv_stat_used __unused 180 #endif 181 182 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 183 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 184 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 185 186 #define NPV_LIST_LOCKS MAXCPU 187 188 #define PHYS_TO_PV_LIST_LOCK(pa) \ 189 (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) 190 191 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 192 struct rwlock **_lockp = (lockp); \ 193 struct rwlock *_new_lock; \ 194 \ 195 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 196 if (_new_lock != *_lockp) { \ 197 if (*_lockp != NULL) \ 198 rw_wunlock(*_lockp); \ 199 *_lockp = _new_lock; \ 200 rw_wlock(*_lockp); \ 201 } \ 202 } while (0) 203 204 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 205 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 206 207 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 208 struct rwlock **_lockp = (lockp); \ 209 \ 210 if (*_lockp != NULL) { \ 211 rw_wunlock(*_lockp); \ 212 *_lockp = NULL; \ 213 } \ 214 } while (0) 215 216 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 217 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 218 219 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 220 "VM/pmap parameters"); 221 222 /* The list of all the user pmaps */ 223 LIST_HEAD(pmaplist, pmap); 224 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); 225 226 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39; 227 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 228 &pmap_mode, 0, 229 "translation mode, 0 = SV39, 1 = SV48"); 230 231 struct pmap kernel_pmap_store; 232 233 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 234 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 235 
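/*
 * Illustrative sketch (added commentary, not part of the original source):
 * how a virtual address maps into the page table page index space described
 * above, assuming the usual RISC-V constants L2_SHIFT = 21 and L1_SHIFT = 30;
 * the example address is made up.
 *
 *	vm_offset_t va = 0x40200000ul;
 *	vm_pindex_t leaf_ptp = pmap_l2_pindex(va);	// va >> 21, in [0, NUL2E):
 *							// the page holding va's L3 PTEs
 *	vm_pindex_t l2_ptp = pmap_l1_pindex(va);	// NUL2E + (va >> 30):
 *							// the page holding va's L2 entries
 *
 * _pmap_alloc_l3() later uses these same index ranges to decide which level
 * of the hierarchy a newly allocated page table page belongs to.
 */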
vm_offset_t kernel_vm_end = 0; 236 237 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 238 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 239 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 240 241 /* This code assumes all L1 DMAP entries will be used */ 242 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); 243 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); 244 245 /* 246 * This code assumes that the early DEVMAP is L2_SIZE aligned and is fully 247 * contained within a single L2 entry. The early DTB is mapped immediately 248 * before the devmap L2 entry. 249 */ 250 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0); 251 CTASSERT((VM_EARLY_DTB_ADDRESS & L2_OFFSET) == 0); 252 CTASSERT(VM_EARLY_DTB_ADDRESS < (VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE)); 253 254 static struct rwlock_padalign pvh_global_lock; 255 static struct mtx_padalign allpmaps_lock; 256 257 static int __read_frequently superpages_enabled = 1; 258 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 259 CTLFLAG_RDTUN, &superpages_enabled, 0, 260 "Enable support for transparent superpages"); 261 262 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 263 "2MB page mapping counters"); 264 265 static u_long pmap_l2_demotions; 266 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 267 &pmap_l2_demotions, 0, 268 "2MB page demotions"); 269 270 static u_long pmap_l2_mappings; 271 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 272 &pmap_l2_mappings, 0, 273 "2MB page mappings"); 274 275 static u_long pmap_l2_p_failures; 276 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 277 &pmap_l2_p_failures, 0, 278 "2MB page promotion failures"); 279 280 static u_long pmap_l2_promotions; 281 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 282 &pmap_l2_promotions, 0, 283 "2MB page promotions"); 284 285 /* 286 * Data for the pv entry allocation mechanism 287 */ 288 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 289 static struct mtx pv_chunks_mutex; 290 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 291 static struct md_page *pv_table; 292 static struct md_page pv_dummy; 293 294 extern cpuset_t all_harts; 295 296 /* 297 * Internal flags for pmap_enter()'s helper functions. 298 */ 299 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 300 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. 
*/ 301 302 static void free_pv_chunk(struct pv_chunk *pc); 303 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 304 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 305 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 306 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 307 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 308 vm_offset_t va); 309 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); 310 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, 311 vm_offset_t va, struct rwlock **lockp); 312 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 313 u_int flags, vm_page_t m, struct rwlock **lockp); 314 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 315 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 316 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 317 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 318 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 319 vm_page_t m, struct rwlock **lockp); 320 321 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 322 struct rwlock **lockp); 323 324 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 325 struct spglist *free); 326 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 327 328 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 329 330 #define pmap_clear(pte) pmap_store(pte, 0) 331 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) 332 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) 333 #define pmap_load_clear(pte) pmap_load_store(pte, 0) 334 #define pmap_load(pte) atomic_load_64(pte) 335 #define pmap_store(pte, entry) atomic_store_64(pte, entry) 336 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) 337 338 /********************/ 339 /* Inline functions */ 340 /********************/ 341 342 static __inline void 343 pagecopy(void *s, void *d) 344 { 345 346 memcpy(d, s, PAGE_SIZE); 347 } 348 349 static __inline void 350 pagezero(void *p) 351 { 352 353 bzero(p, PAGE_SIZE); 354 } 355 356 #define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK) 357 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) 358 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) 359 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) 360 361 #define PTE_TO_PHYS(pte) \ 362 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) 363 #define L2PTE_TO_PHYS(l2) \ 364 ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT) 365 366 static __inline pd_entry_t * 367 pmap_l0(pmap_t pmap, vm_offset_t va) 368 { 369 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__)); 370 KASSERT(VIRT_IS_VALID(va), 371 ("%s: malformed virtual address %#lx", __func__, va)); 372 return (&pmap->pm_top[pmap_l0_index(va)]); 373 } 374 375 static __inline pd_entry_t * 376 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 377 { 378 vm_paddr_t phys; 379 pd_entry_t *l1; 380 381 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__)); 382 phys = PTE_TO_PHYS(pmap_load(l0)); 383 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 384 385 return (&l1[pmap_l1_index(va)]); 386 } 387 388 static __inline pd_entry_t * 389 pmap_l1(pmap_t pmap, vm_offset_t va) 390 { 391 pd_entry_t *l0; 392 393 KASSERT(VIRT_IS_VALID(va), 394 ("%s: malformed virtual address %#lx", __func__, va)); 395 if (pmap_mode == 
PMAP_MODE_SV39) { 396 return (&pmap->pm_top[pmap_l1_index(va)]); 397 } else { 398 l0 = pmap_l0(pmap, va); 399 if ((pmap_load(l0) & PTE_V) == 0) 400 return (NULL); 401 if ((pmap_load(l0) & PTE_RX) != 0) 402 return (NULL); 403 return (pmap_l0_to_l1(l0, va)); 404 } 405 } 406 407 static __inline pd_entry_t * 408 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 409 { 410 vm_paddr_t phys; 411 pd_entry_t *l2; 412 413 phys = PTE_TO_PHYS(pmap_load(l1)); 414 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 415 416 return (&l2[pmap_l2_index(va)]); 417 } 418 419 static __inline pd_entry_t * 420 pmap_l2(pmap_t pmap, vm_offset_t va) 421 { 422 pd_entry_t *l1; 423 424 l1 = pmap_l1(pmap, va); 425 if (l1 == NULL) 426 return (NULL); 427 if ((pmap_load(l1) & PTE_V) == 0) 428 return (NULL); 429 if ((pmap_load(l1) & PTE_RX) != 0) 430 return (NULL); 431 432 return (pmap_l1_to_l2(l1, va)); 433 } 434 435 static __inline pt_entry_t * 436 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 437 { 438 vm_paddr_t phys; 439 pt_entry_t *l3; 440 441 phys = PTE_TO_PHYS(pmap_load(l2)); 442 l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); 443 444 return (&l3[pmap_l3_index(va)]); 445 } 446 447 static __inline pt_entry_t * 448 pmap_l3(pmap_t pmap, vm_offset_t va) 449 { 450 pd_entry_t *l2; 451 452 l2 = pmap_l2(pmap, va); 453 if (l2 == NULL) 454 return (NULL); 455 if ((pmap_load(l2) & PTE_V) == 0) 456 return (NULL); 457 if ((pmap_load(l2) & PTE_RX) != 0) 458 return (NULL); 459 460 return (pmap_l2_to_l3(l2, va)); 461 } 462 463 static __inline void 464 pmap_resident_count_inc(pmap_t pmap, int count) 465 { 466 467 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 468 pmap->pm_stats.resident_count += count; 469 } 470 471 static __inline void 472 pmap_resident_count_dec(pmap_t pmap, int count) 473 { 474 475 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 476 KASSERT(pmap->pm_stats.resident_count >= count, 477 ("pmap %p resident count underflow %ld %d", pmap, 478 pmap->pm_stats.resident_count, count)); 479 pmap->pm_stats.resident_count -= count; 480 } 481 482 static void 483 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, 484 pt_entry_t entry) 485 { 486 struct pmap *user_pmap; 487 pd_entry_t *l1; 488 489 /* 490 * Distribute new kernel L1 entry to all the user pmaps. This is only 491 * necessary with three-level paging configured: with four-level paging 492 * the kernel's half of the top-level page table page is static and can 493 * simply be copied at pmap initialization time. 
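 *
 * Added sketch (commentary only, not in the original): under SV39 each
 * pmap's top-level page is itself an L1 table, so a kernel L1 entry
 * installed after a user pmap was created, e.g. by pmap_growkernel(),
 * must be copied into every user top-level page:
 *
 *	entry = PTE_V | (pn << PTE_PPN0_S);
 *	pmap_store(l1, entry);				// kernel_pmap's L1
 *	pmap_distribute_l1(kernel_pmap, l1index, entry);// every user pmap
 *
 * Under SV48 the kernel is reached through a top-level slot copied once in
 * pmap_pinit(), so no distribution is required.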
494 */ 495 if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39) 496 return; 497 498 mtx_lock(&allpmaps_lock); 499 LIST_FOREACH(user_pmap, &allpmaps, pm_list) { 500 l1 = &user_pmap->pm_top[l1index]; 501 pmap_store(l1, entry); 502 } 503 mtx_unlock(&allpmaps_lock); 504 } 505 506 static pt_entry_t * 507 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, 508 u_int *l2_slot) 509 { 510 pt_entry_t *l2; 511 pd_entry_t *l1 __diagused; 512 513 l1 = (pd_entry_t *)l1pt; 514 *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; 515 516 /* Check locore has used a table L1 map */ 517 KASSERT((l1[*l1_slot] & PTE_RX) == 0, 518 ("Invalid bootstrap L1 table")); 519 520 /* Find the address of the L2 table */ 521 l2 = (pt_entry_t *)init_pt_va; 522 *l2_slot = pmap_l2_index(va); 523 524 return (l2); 525 } 526 527 static vm_paddr_t 528 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 529 { 530 u_int l1_slot, l2_slot; 531 pt_entry_t *l2; 532 vm_paddr_t ret; 533 534 l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); 535 536 /* Check locore has used L2 superpages */ 537 KASSERT((l2[l2_slot] & PTE_RX) != 0, 538 ("Invalid bootstrap L2 table")); 539 540 /* L2 is superpages */ 541 ret = L2PTE_TO_PHYS(l2[l2_slot]); 542 ret += (va & L2_OFFSET); 543 544 return (ret); 545 } 546 547 static void 548 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) 549 { 550 vm_offset_t va; 551 vm_paddr_t pa; 552 pd_entry_t *l1; 553 u_int l1_slot; 554 pt_entry_t entry; 555 pn_t pn; 556 557 pa = dmap_phys_base = min_pa & ~L1_OFFSET; 558 va = DMAP_MIN_ADDRESS; 559 l1 = (pd_entry_t *)kern_l1; 560 l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); 561 562 for (; va < DMAP_MAX_ADDRESS && pa < max_pa; 563 pa += L1_SIZE, va += L1_SIZE, l1_slot++) { 564 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 565 566 /* superpages */ 567 pn = (pa / PAGE_SIZE); 568 entry = PTE_KERN; 569 entry |= (pn << PTE_PPN0_S); 570 pmap_store(&l1[l1_slot], entry); 571 } 572 573 /* Set the upper limit of the DMAP region */ 574 dmap_phys_max = pa; 575 dmap_max_addr = va; 576 577 sfence_vma(); 578 } 579 580 static vm_offset_t 581 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 582 { 583 vm_offset_t l3pt; 584 pt_entry_t entry; 585 pd_entry_t *l2; 586 vm_paddr_t pa; 587 u_int l2_slot; 588 pn_t pn; 589 590 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 591 592 l2 = pmap_l2(kernel_pmap, va); 593 l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); 594 l2_slot = pmap_l2_index(va); 595 l3pt = l3_start; 596 597 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 598 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 599 600 pa = pmap_early_vtophys(l1pt, l3pt); 601 pn = (pa / PAGE_SIZE); 602 entry = (PTE_V); 603 entry |= (pn << PTE_PPN0_S); 604 pmap_store(&l2[l2_slot], entry); 605 l3pt += PAGE_SIZE; 606 } 607 608 /* Clean the L2 page table */ 609 memset((void *)l3_start, 0, l3pt - l3_start); 610 611 return (l3pt); 612 } 613 614 /* 615 * Bootstrap the system enough to run with virtual memory. 
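 *
 * Added note (illustrative, not in the original): pmap_bootstrap_dmap()
 * above builds the direct map out of L1 superpage leaves, each composed the
 * same way as other kernel mappings in this file:
 *
 *	pn = pa >> PAGE_SHIFT;			// physical page number
 *	entry = PTE_KERN | (pn << PTE_PPN0_S);	// writable kernel leaf
 *	pmap_store(&l1[l1_slot], entry);
 *
 * which is what makes PHYS_TO_DMAP()/DMAP_TO_PHYS() simple offset
 * conversions for any pa in [dmap_phys_base, dmap_phys_max).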
616 */ 617 void 618 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) 619 { 620 vm_paddr_t physmap[PHYS_AVAIL_ENTRIES]; 621 uint64_t satp; 622 vm_offset_t dpcpu, freemempos, l0pv, msgbufpv; 623 vm_paddr_t l0pa, l1pa, max_pa, min_pa, pa; 624 pd_entry_t *l0p; 625 pt_entry_t *l2p; 626 u_int l1_slot, l2_slot; 627 u_int physmap_idx; 628 int i, mode; 629 630 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 631 632 /* Set this early so we can use the pagetable walking functions */ 633 kernel_pmap_store.pm_top = (pd_entry_t *)l1pt; 634 PMAP_LOCK_INIT(kernel_pmap); 635 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 636 vm_radix_init(&kernel_pmap->pm_root); 637 638 rw_init(&pvh_global_lock, "pmap pv global"); 639 640 /* 641 * Set the current CPU as active in the kernel pmap. Secondary cores 642 * will add themselves later in init_secondary(). The SBI firmware 643 * may rely on this mask being precise, so CPU_FILL() is not used. 644 */ 645 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); 646 647 /* Assume the address we were loaded to is a valid physical address. */ 648 min_pa = max_pa = kernstart; 649 650 physmap_idx = physmem_avail(physmap, nitems(physmap)); 651 physmap_idx /= 2; 652 653 /* 654 * Find the minimum physical address. physmap is sorted, 655 * but may contain empty ranges. 656 */ 657 for (i = 0; i < physmap_idx * 2; i += 2) { 658 if (physmap[i] == physmap[i + 1]) 659 continue; 660 if (physmap[i] <= min_pa) 661 min_pa = physmap[i]; 662 if (physmap[i + 1] > max_pa) 663 max_pa = physmap[i + 1]; 664 } 665 printf("physmap_idx %u\n", physmap_idx); 666 printf("min_pa %lx\n", min_pa); 667 printf("max_pa %lx\n", max_pa); 668 669 /* Create a direct map region early so we can use it for pa -> va */ 670 pmap_bootstrap_dmap(l1pt, min_pa, max_pa); 671 672 /* 673 * Read the page table to find out what is already mapped. 674 * This assumes we have mapped a block of memory from KERNBASE 675 * using a single L1 entry. 676 */ 677 (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 678 679 /* Sanity check the index, KERNBASE should be the first VA */ 680 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 681 682 freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); 683 684 /* Create the l3 tables for the early devmap */ 685 freemempos = pmap_bootstrap_l3(l1pt, 686 VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE, freemempos); 687 688 /* 689 * Invalidate the mapping we created for the DTB. At this point a copy 690 * has been created, and we no longer need it. We want to avoid the 691 * possibility of an aliased mapping in the future. 692 */ 693 l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); 694 if ((pmap_load(l2p) & PTE_V) != 0) 695 pmap_clear(l2p); 696 697 sfence_vma(); 698 699 #define alloc_pages(var, np) \ 700 (var) = freemempos; \ 701 freemempos += (np * PAGE_SIZE); \ 702 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 703 704 mode = 0; 705 TUNABLE_INT_FETCH("vm.pmap.mode", &mode); 706 if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) { 707 /* 708 * Enable SV48 mode: allocate an L0 page and set SV48 mode in 709 * SATP. If the implementation does not provide SV48 mode, 710 * the mode read back from the (WARL) SATP register will be 711 * unchanged, and we continue in SV39 mode. 
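 *
 * A condensed sketch of the WARL probe performed below (added commentary,
 * not in the original):
 *
 *	csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48);
 *	if ((csr_read(satp) & SATP_MODE_M) == SATP_MODE_SV48)
 *		pmap_mode = PMAP_MODE_SV48;	// the write took effect
 *	else
 *		;				// still SV39, give the page back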
712 */ 713 alloc_pages(l0pv, 1); 714 l0p = (void *)l0pv; 715 l1pa = pmap_early_vtophys(l1pt, l1pt); 716 l0p[pmap_l0_index(KERNBASE)] = PTE_V | PTE_A | PTE_D | 717 ((l1pa >> PAGE_SHIFT) << PTE_PPN0_S); 718 719 l0pa = pmap_early_vtophys(l1pt, l0pv); 720 csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48); 721 satp = csr_read(satp); 722 if ((satp & SATP_MODE_M) == SATP_MODE_SV48) { 723 pmap_mode = PMAP_MODE_SV48; 724 kernel_pmap_store.pm_top = l0p; 725 } else { 726 /* Mode didn't change, give the page back. */ 727 freemempos -= PAGE_SIZE; 728 } 729 } 730 731 /* Allocate dynamic per-cpu area. */ 732 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 733 dpcpu_init((void *)dpcpu, 0); 734 735 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 736 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 737 msgbufp = (void *)msgbufpv; 738 739 virtual_avail = roundup2(freemempos, L2_SIZE); 740 virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE; 741 kernel_vm_end = virtual_avail; 742 743 pa = pmap_early_vtophys(l1pt, freemempos); 744 745 physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); 746 } 747 748 /* 749 * Initialize a vm_page's machine-dependent fields. 750 */ 751 void 752 pmap_page_init(vm_page_t m) 753 { 754 755 TAILQ_INIT(&m->md.pv_list); 756 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 757 } 758 759 /* 760 * Initialize the pmap module. 761 * Called by vm_init, to initialize any structures that the pmap 762 * system needs to map virtual memory. 763 */ 764 void 765 pmap_init(void) 766 { 767 vm_size_t s; 768 int i, pv_npg; 769 770 /* 771 * Initialize the pv chunk and pmap list mutexes. 772 */ 773 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 774 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 775 776 /* 777 * Initialize the pool of pv list locks. 778 */ 779 for (i = 0; i < NPV_LIST_LOCKS; i++) 780 rw_init(&pv_list_locks[i], "pmap pv list"); 781 782 /* 783 * Calculate the size of the pv head table for superpages. 784 */ 785 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 786 787 /* 788 * Allocate memory for the pv head table for superpages. 789 */ 790 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 791 s = round_page(s); 792 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 793 for (i = 0; i < pv_npg; i++) 794 TAILQ_INIT(&pv_table[i].pv_list); 795 TAILQ_INIT(&pv_dummy.pv_list); 796 797 if (superpages_enabled) 798 pagesizes[1] = L2_SIZE; 799 } 800 801 #ifdef SMP 802 /* 803 * For SMP, these functions have to use IPIs for coherence. 804 * 805 * In general, the calling thread uses a plain fence to order the 806 * writes to the page tables before invoking an SBI callback to invoke 807 * sfence_vma() on remote CPUs. 808 */ 809 static void 810 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 811 { 812 cpuset_t mask; 813 814 sched_pin(); 815 mask = pmap->pm_active; 816 CPU_CLR(PCPU_GET(hart), &mask); 817 fence(); 818 if (!CPU_EMPTY(&mask) && smp_started) 819 sbi_remote_sfence_vma(mask.__bits, va, 1); 820 sfence_vma_page(va); 821 sched_unpin(); 822 } 823 824 static void 825 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 826 { 827 cpuset_t mask; 828 829 sched_pin(); 830 mask = pmap->pm_active; 831 CPU_CLR(PCPU_GET(hart), &mask); 832 fence(); 833 if (!CPU_EMPTY(&mask) && smp_started) 834 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); 835 836 /* 837 * Might consider a loop of sfence_vma_page() for a small 838 * number of pages in the future. 
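 *
 * A minimal sketch of that possible refinement (added commentary; the
 * threshold is invented for illustration):
 *
 *	if (eva - sva <= 16 * PAGE_SIZE) {
 *		for (va = sva; va < eva; va += PAGE_SIZE)
 *			sfence_vma_page(va);
 *	} else
 *		sfence_vma();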
839 */ 840 sfence_vma(); 841 sched_unpin(); 842 } 843 844 static void 845 pmap_invalidate_all(pmap_t pmap) 846 { 847 cpuset_t mask; 848 849 sched_pin(); 850 mask = pmap->pm_active; 851 CPU_CLR(PCPU_GET(hart), &mask); 852 853 /* 854 * XXX: The SBI doc doesn't detail how to specify x0 as the 855 * address to perform a global fence. BBL currently treats 856 * all sfence_vma requests as global however. 857 */ 858 fence(); 859 if (!CPU_EMPTY(&mask) && smp_started) 860 sbi_remote_sfence_vma(mask.__bits, 0, 0); 861 sfence_vma(); 862 sched_unpin(); 863 } 864 #else 865 /* 866 * Normal, non-SMP, invalidation functions. 867 * We inline these within pmap.c for speed. 868 */ 869 static __inline void 870 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 871 { 872 873 sfence_vma_page(va); 874 } 875 876 static __inline void 877 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 878 { 879 880 /* 881 * Might consider a loop of sfence_vma_page() for a small 882 * number of pages in the future. 883 */ 884 sfence_vma(); 885 } 886 887 static __inline void 888 pmap_invalidate_all(pmap_t pmap) 889 { 890 891 sfence_vma(); 892 } 893 #endif 894 895 /* 896 * Routine: pmap_extract 897 * Function: 898 * Extract the physical page address associated 899 * with the given map/virtual_address pair. 900 */ 901 vm_paddr_t 902 pmap_extract(pmap_t pmap, vm_offset_t va) 903 { 904 pd_entry_t *l2p, l2; 905 pt_entry_t *l3p; 906 vm_paddr_t pa; 907 908 pa = 0; 909 910 /* 911 * Start with an L2 lookup, L1 superpages are currently not implemented. 912 */ 913 PMAP_LOCK(pmap); 914 l2p = pmap_l2(pmap, va); 915 if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) { 916 if ((l2 & PTE_RWX) == 0) { 917 l3p = pmap_l2_to_l3(l2p, va); 918 if (l3p != NULL) { 919 pa = PTE_TO_PHYS(pmap_load(l3p)); 920 pa |= (va & L3_OFFSET); 921 } 922 } else { 923 /* L2 is a superpage mapping. */ 924 pa = L2PTE_TO_PHYS(l2); 925 pa |= (va & L2_OFFSET); 926 } 927 } 928 PMAP_UNLOCK(pmap); 929 return (pa); 930 } 931 932 /* 933 * Routine: pmap_extract_and_hold 934 * Function: 935 * Atomically extract and hold the physical page 936 * with the given pmap and virtual address pair 937 * if that mapping permits the given protection. 938 */ 939 vm_page_t 940 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 941 { 942 pt_entry_t *l3p, l3; 943 vm_paddr_t phys; 944 vm_page_t m; 945 946 m = NULL; 947 PMAP_LOCK(pmap); 948 l3p = pmap_l3(pmap, va); 949 if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { 950 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { 951 phys = PTE_TO_PHYS(l3); 952 m = PHYS_TO_VM_PAGE(phys); 953 if (!vm_page_wire_mapped(m)) 954 m = NULL; 955 } 956 } 957 PMAP_UNLOCK(pmap); 958 return (m); 959 } 960 961 /* 962 * Routine: pmap_kextract 963 * Function: 964 * Extract the physical page address associated with the given kernel 965 * virtual address. 966 */ 967 vm_paddr_t 968 pmap_kextract(vm_offset_t va) 969 { 970 pd_entry_t *l2, l2e; 971 pt_entry_t *l3; 972 vm_paddr_t pa; 973 974 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 975 pa = DMAP_TO_PHYS(va); 976 } else { 977 l2 = pmap_l2(kernel_pmap, va); 978 if (l2 == NULL) 979 panic("pmap_kextract: No l2"); 980 l2e = pmap_load(l2); 981 /* 982 * Beware of concurrent promotion and demotion! We must 983 * use l2e rather than loading from l2 multiple times to 984 * ensure we see a consistent state, including the 985 * implicit load in pmap_l2_to_l3. It is, however, safe 986 * to use an old l2e because the L3 page is preserved by 987 * promotion. 
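 *
 * Added illustration (not in the original): the anti-pattern would be to
 * reload the entry, which may observe two different states across a
 * concurrent promotion or demotion:
 *
 *	if ((pmap_load(l2) & PTE_RX) != 0)		// first load
 *		pa = L2PTE_TO_PHYS(pmap_load(l2));	// second load may differ
 *
 * whereas the code below performs a single load into l2e and hands &l2e to
 * pmap_l2_to_l3().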
988 */ 989 if ((l2e & PTE_RX) != 0) { 990 /* superpages */ 991 pa = L2PTE_TO_PHYS(l2e); 992 pa |= (va & L2_OFFSET); 993 return (pa); 994 } 995 996 l3 = pmap_l2_to_l3(&l2e, va); 997 if (l3 == NULL) 998 panic("pmap_kextract: No l3..."); 999 pa = PTE_TO_PHYS(pmap_load(l3)); 1000 pa |= (va & PAGE_MASK); 1001 } 1002 return (pa); 1003 } 1004 1005 /*************************************************** 1006 * Low level mapping routines..... 1007 ***************************************************/ 1008 1009 void 1010 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused) 1011 { 1012 pt_entry_t entry; 1013 pt_entry_t *l3; 1014 vm_offset_t va; 1015 pn_t pn; 1016 1017 KASSERT((pa & L3_OFFSET) == 0, 1018 ("pmap_kenter_device: Invalid physical address")); 1019 KASSERT((sva & L3_OFFSET) == 0, 1020 ("pmap_kenter_device: Invalid virtual address")); 1021 KASSERT((size & PAGE_MASK) == 0, 1022 ("pmap_kenter_device: Mapping is not page-sized")); 1023 1024 va = sva; 1025 while (size != 0) { 1026 l3 = pmap_l3(kernel_pmap, va); 1027 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1028 1029 pn = (pa / PAGE_SIZE); 1030 entry = PTE_KERN; 1031 entry |= (pn << PTE_PPN0_S); 1032 pmap_store(l3, entry); 1033 1034 va += PAGE_SIZE; 1035 pa += PAGE_SIZE; 1036 size -= PAGE_SIZE; 1037 } 1038 pmap_invalidate_range(kernel_pmap, sva, va); 1039 } 1040 1041 void 1042 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1043 { 1044 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1045 } 1046 1047 /* 1048 * Remove a page from the kernel pagetables. 1049 * Note: not SMP coherent. 1050 */ 1051 void 1052 pmap_kremove(vm_offset_t va) 1053 { 1054 pt_entry_t *l3; 1055 1056 l3 = pmap_l3(kernel_pmap, va); 1057 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1058 1059 pmap_clear(l3); 1060 sfence_vma(); 1061 } 1062 1063 void 1064 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1065 { 1066 pt_entry_t *l3; 1067 vm_offset_t va; 1068 1069 KASSERT((sva & L3_OFFSET) == 0, 1070 ("pmap_kremove_device: Invalid virtual address")); 1071 KASSERT((size & PAGE_MASK) == 0, 1072 ("pmap_kremove_device: Mapping is not page-sized")); 1073 1074 va = sva; 1075 while (size != 0) { 1076 l3 = pmap_l3(kernel_pmap, va); 1077 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1078 pmap_clear(l3); 1079 1080 va += PAGE_SIZE; 1081 size -= PAGE_SIZE; 1082 } 1083 1084 pmap_invalidate_range(kernel_pmap, sva, va); 1085 } 1086 1087 /* 1088 * Used to map a range of physical addresses into kernel 1089 * virtual address space. 1090 * 1091 * The value passed in '*virt' is a suggested virtual address for 1092 * the mapping. Architectures which can support a direct-mapped 1093 * physical to virtual region can return the appropriate address 1094 * within that region, leaving '*virt' unchanged. Other 1095 * architectures should map the pages starting at '*virt' and 1096 * update '*virt' with the first usable address after the mapped 1097 * region. 1098 */ 1099 vm_offset_t 1100 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1101 { 1102 1103 return PHYS_TO_DMAP(start); 1104 } 1105 1106 /* 1107 * Add a list of wired pages to the kva 1108 * this routine is only used for temporary 1109 * kernel mappings that do not need to have 1110 * page modification or references recorded. 1111 * Note that old mappings are simply written 1112 * over. The page *must* be wired. 1113 * Note: SMP coherent. Uses a ranged shootdown IPI. 
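 *
 * Hypothetical usage sketch (added commentary; kva_alloc() is from the VM
 * subsystem and is not otherwise used in this file):
 *
 *	vm_offset_t kva = kva_alloc(4 * PAGE_SIZE);
 *	vm_page_t ma[4];	// four wired pages gathered by the caller
 *
 *	pmap_qenter(kva, ma, 4);	// map them back to back
 *	// ... temporary use of the mapping ...
 *	pmap_qremove(kva, 4);		// tear the mappings down again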
1114 */ 1115 void 1116 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1117 { 1118 pt_entry_t *l3, pa; 1119 vm_offset_t va; 1120 vm_page_t m; 1121 pt_entry_t entry; 1122 pn_t pn; 1123 int i; 1124 1125 va = sva; 1126 for (i = 0; i < count; i++) { 1127 m = ma[i]; 1128 pa = VM_PAGE_TO_PHYS(m); 1129 pn = (pa / PAGE_SIZE); 1130 l3 = pmap_l3(kernel_pmap, va); 1131 1132 entry = PTE_KERN; 1133 entry |= (pn << PTE_PPN0_S); 1134 pmap_store(l3, entry); 1135 1136 va += L3_SIZE; 1137 } 1138 pmap_invalidate_range(kernel_pmap, sva, va); 1139 } 1140 1141 /* 1142 * This routine tears out page mappings from the 1143 * kernel -- it is meant only for temporary mappings. 1144 * Note: SMP coherent. Uses a ranged shootdown IPI. 1145 */ 1146 void 1147 pmap_qremove(vm_offset_t sva, int count) 1148 { 1149 pt_entry_t *l3; 1150 vm_offset_t va; 1151 1152 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1153 1154 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1155 l3 = pmap_l3(kernel_pmap, va); 1156 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1157 pmap_clear(l3); 1158 } 1159 pmap_invalidate_range(kernel_pmap, sva, va); 1160 } 1161 1162 bool 1163 pmap_ps_enabled(pmap_t pmap __unused) 1164 { 1165 1166 return (superpages_enabled); 1167 } 1168 1169 /*************************************************** 1170 * Page table page management routines..... 1171 ***************************************************/ 1172 /* 1173 * Schedule the specified unused page table page to be freed. Specifically, 1174 * add the page to the specified list of pages that will be released to the 1175 * physical memory manager after the TLB has been updated. 1176 */ 1177 static __inline void 1178 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 1179 { 1180 1181 if (set_PG_ZERO) 1182 m->flags |= PG_ZERO; 1183 else 1184 m->flags &= ~PG_ZERO; 1185 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1186 } 1187 1188 /* 1189 * Inserts the specified page table page into the specified pmap's collection 1190 * of idle page table pages. Each of a pmap's page table pages is responsible 1191 * for mapping a distinct range of virtual addresses. The pmap's collection is 1192 * ordered by this virtual address range. 1193 * 1194 * If "promoted" is false, then the page table page "mpte" must be zero filled; 1195 * "mpte"'s valid field will be set to 0. 1196 * 1197 * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must 1198 * contain valid mappings with identical attributes except for PTE_A; 1199 * "mpte"'s valid field will be set to 1. 1200 * 1201 * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain 1202 * valid mappings with identical attributes including PTE_A; "mpte"'s valid 1203 * field will be set to VM_PAGE_BITS_ALL. 1204 */ 1205 static __inline int 1206 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1207 bool all_l3e_PTE_A_set) 1208 { 1209 1210 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1211 KASSERT(promoted || !all_l3e_PTE_A_set, 1212 ("a zero-filled PTP can't have PTE_A set in every PTE")); 1213 mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 1214 return (vm_radix_insert(&pmap->pm_root, mpte)); 1215 } 1216 1217 /* 1218 * Removes the page table page mapping the specified virtual address from the 1219 * specified pmap's collection of idle page table pages, and returns it. 1220 * Otherwise, returns NULL if there is no page table page corresponding to the 1221 * specified virtual address. 
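 *
 * Added note (illustrative): the collection is the pmap's radix tree,
 * keyed by pmap_l2_pindex(va), so the removal below is simply:
 *
 *	vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va));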
1222 */ 1223 static __inline vm_page_t 1224 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1225 { 1226 1227 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1228 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1229 } 1230 1231 /* 1232 * Decrements a page table page's reference count, which is used to record the 1233 * number of valid page table entries within the page. If the reference count 1234 * drops to zero, then the page table page is unmapped. Returns true if the 1235 * page table page was unmapped and false otherwise. 1236 */ 1237 static inline bool 1238 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1239 { 1240 KASSERT(m->ref_count > 0, 1241 ("%s: page %p ref count underflow", __func__, m)); 1242 1243 --m->ref_count; 1244 if (m->ref_count == 0) { 1245 _pmap_unwire_ptp(pmap, va, m, free); 1246 return (true); 1247 } else { 1248 return (false); 1249 } 1250 } 1251 1252 static void 1253 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1254 { 1255 vm_paddr_t phys; 1256 1257 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1258 if (m->pindex >= NUL2E + NUL1E) { 1259 pd_entry_t *l0; 1260 l0 = pmap_l0(pmap, va); 1261 pmap_clear(l0); 1262 } else if (m->pindex >= NUL2E) { 1263 pd_entry_t *l1; 1264 l1 = pmap_l1(pmap, va); 1265 pmap_clear(l1); 1266 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1267 } else { 1268 pd_entry_t *l2; 1269 l2 = pmap_l2(pmap, va); 1270 pmap_clear(l2); 1271 } 1272 pmap_resident_count_dec(pmap, 1); 1273 if (m->pindex < NUL2E) { 1274 pd_entry_t *l1; 1275 vm_page_t pdpg; 1276 1277 l1 = pmap_l1(pmap, va); 1278 phys = PTE_TO_PHYS(pmap_load(l1)); 1279 pdpg = PHYS_TO_VM_PAGE(phys); 1280 pmap_unwire_ptp(pmap, va, pdpg, free); 1281 } else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) { 1282 pd_entry_t *l0; 1283 vm_page_t pdpg; 1284 1285 MPASS(pmap_mode != PMAP_MODE_SV39); 1286 l0 = pmap_l0(pmap, va); 1287 phys = PTE_TO_PHYS(pmap_load(l0)); 1288 pdpg = PHYS_TO_VM_PAGE(phys); 1289 pmap_unwire_ptp(pmap, va, pdpg, free); 1290 } 1291 pmap_invalidate_page(pmap, va); 1292 1293 vm_wire_sub(1); 1294 1295 /* 1296 * Put page on a list so that it is released after 1297 * *ALL* TLB shootdown is done 1298 */ 1299 pmap_add_delayed_free_list(m, free, true); 1300 } 1301 1302 /* 1303 * After removing a page table entry, this routine is used to 1304 * conditionally free the page, and manage the reference count. 1305 */ 1306 static int 1307 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1308 struct spglist *free) 1309 { 1310 vm_page_t mpte; 1311 1312 if (va >= VM_MAXUSER_ADDRESS) 1313 return (0); 1314 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1315 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); 1316 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1317 } 1318 1319 static uint64_t 1320 pmap_satp_mode(void) 1321 { 1322 return (pmap_mode == PMAP_MODE_SV39 ? 
SATP_MODE_SV39 : SATP_MODE_SV48); 1323 } 1324 1325 void 1326 pmap_pinit0(pmap_t pmap) 1327 { 1328 PMAP_LOCK_INIT(pmap); 1329 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1330 pmap->pm_top = kernel_pmap->pm_top; 1331 pmap->pm_satp = pmap_satp_mode() | 1332 (vtophys(pmap->pm_top) >> PAGE_SHIFT); 1333 CPU_ZERO(&pmap->pm_active); 1334 TAILQ_INIT(&pmap->pm_pvchunk); 1335 vm_radix_init(&pmap->pm_root); 1336 pmap_activate_boot(pmap); 1337 } 1338 1339 int 1340 pmap_pinit(pmap_t pmap) 1341 { 1342 vm_paddr_t topphys; 1343 vm_page_t mtop; 1344 size_t i; 1345 1346 mtop = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | 1347 VM_ALLOC_WAITOK); 1348 1349 topphys = VM_PAGE_TO_PHYS(mtop); 1350 pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys); 1351 pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT); 1352 1353 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1354 1355 CPU_ZERO(&pmap->pm_active); 1356 1357 if (pmap_mode == PMAP_MODE_SV39) { 1358 /* 1359 * Copy L1 entries from the kernel pmap. This must be done with 1360 * the allpmaps lock held to avoid races with 1361 * pmap_distribute_l1(). 1362 */ 1363 mtx_lock(&allpmaps_lock); 1364 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1365 for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS); 1366 i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++) 1367 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1368 for (i = pmap_l1_index(DMAP_MIN_ADDRESS); 1369 i < pmap_l1_index(DMAP_MAX_ADDRESS); i++) 1370 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1371 mtx_unlock(&allpmaps_lock); 1372 } else { 1373 i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS); 1374 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1375 } 1376 1377 TAILQ_INIT(&pmap->pm_pvchunk); 1378 vm_radix_init(&pmap->pm_root); 1379 1380 return (1); 1381 } 1382 1383 /* 1384 * This routine is called if the desired page table page does not exist. 1385 * 1386 * If page table page allocation fails, this routine may sleep before 1387 * returning NULL. It sleeps only if a lock pointer was given. 1388 * 1389 * Note: If a page allocation fails at page table level two or three, 1390 * one or two pages may be held during the wait, only to be released 1391 * afterwards. This conservative approach is easily argued to avoid 1392 * race conditions. 1393 */ 1394 static vm_page_t 1395 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1396 { 1397 vm_page_t m, pdpg; 1398 pt_entry_t entry; 1399 vm_paddr_t phys; 1400 pn_t pn; 1401 1402 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1403 1404 /* 1405 * Allocate a page table page. 1406 */ 1407 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1408 if (m == NULL) { 1409 if (lockp != NULL) { 1410 RELEASE_PV_LIST_LOCK(lockp); 1411 PMAP_UNLOCK(pmap); 1412 rw_runlock(&pvh_global_lock); 1413 vm_wait(NULL); 1414 rw_rlock(&pvh_global_lock); 1415 PMAP_LOCK(pmap); 1416 } 1417 1418 /* 1419 * Indicate the need to retry. While waiting, the page table 1420 * page may have been allocated. 1421 */ 1422 return (NULL); 1423 } 1424 m->pindex = ptepindex; 1425 1426 /* 1427 * Map the pagetable page into the process address space, if 1428 * it isn't already there. 
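 *
 * Added commentary (not in the original): the branch taken below is
 * selected by where ptepindex falls in the index space described next to
 * the NUL*E definitions:
 *
 *	if (ptepindex >= NUL2E + NUL1E)		// an L1 page, linked into L0
 *	else if (ptepindex >= NUL2E)		// an L2 page, linked into L1
 *	else					// an L3 page, linked into L2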
1429 */ 1430 pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; 1431 if (ptepindex >= NUL2E + NUL1E) { 1432 pd_entry_t *l0; 1433 vm_pindex_t l0index; 1434 1435 KASSERT(pmap_mode != PMAP_MODE_SV39, 1436 ("%s: pindex %#lx in SV39 mode", __func__, ptepindex)); 1437 KASSERT(ptepindex < NUL2E + NUL1E + NUL0E, 1438 ("%s: pindex %#lx out of range", __func__, ptepindex)); 1439 1440 l0index = ptepindex - (NUL2E + NUL1E); 1441 l0 = &pmap->pm_top[l0index]; 1442 KASSERT((pmap_load(l0) & PTE_V) == 0, 1443 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0))); 1444 1445 entry = PTE_V | (pn << PTE_PPN0_S); 1446 pmap_store(l0, entry); 1447 } else if (ptepindex >= NUL2E) { 1448 pd_entry_t *l0, *l1; 1449 vm_pindex_t l0index, l1index; 1450 1451 l1index = ptepindex - NUL2E; 1452 if (pmap_mode == PMAP_MODE_SV39) { 1453 l1 = &pmap->pm_top[l1index]; 1454 } else { 1455 l0index = l1index >> Ln_ENTRIES_SHIFT; 1456 l0 = &pmap->pm_top[l0index]; 1457 if (pmap_load(l0) == 0) { 1458 /* Recurse to allocate the L1 page. */ 1459 if (_pmap_alloc_l3(pmap, 1460 NUL2E + NUL1E + l0index, lockp) == NULL) 1461 goto fail; 1462 phys = PTE_TO_PHYS(pmap_load(l0)); 1463 } else { 1464 phys = PTE_TO_PHYS(pmap_load(l0)); 1465 pdpg = PHYS_TO_VM_PAGE(phys); 1466 pdpg->ref_count++; 1467 } 1468 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1469 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1470 } 1471 KASSERT((pmap_load(l1) & PTE_V) == 0, 1472 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1473 1474 entry = PTE_V | (pn << PTE_PPN0_S); 1475 pmap_store(l1, entry); 1476 pmap_distribute_l1(pmap, l1index, entry); 1477 } else { 1478 vm_pindex_t l0index, l1index; 1479 pd_entry_t *l0, *l1, *l2; 1480 1481 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1482 if (pmap_mode == PMAP_MODE_SV39) { 1483 l1 = &pmap->pm_top[l1index]; 1484 if (pmap_load(l1) == 0) { 1485 /* recurse for allocating page dir */ 1486 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1487 lockp) == NULL) 1488 goto fail; 1489 } else { 1490 phys = PTE_TO_PHYS(pmap_load(l1)); 1491 pdpg = PHYS_TO_VM_PAGE(phys); 1492 pdpg->ref_count++; 1493 } 1494 } else { 1495 l0index = l1index >> Ln_ENTRIES_SHIFT; 1496 l0 = &pmap->pm_top[l0index]; 1497 if (pmap_load(l0) == 0) { 1498 /* Recurse to allocate the L1 entry. */ 1499 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1500 lockp) == NULL) 1501 goto fail; 1502 phys = PTE_TO_PHYS(pmap_load(l0)); 1503 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1504 l1 = &l1[l1index & Ln_ADDR_MASK]; 1505 } else { 1506 phys = PTE_TO_PHYS(pmap_load(l0)); 1507 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1508 l1 = &l1[l1index & Ln_ADDR_MASK]; 1509 if (pmap_load(l1) == 0) { 1510 /* Recurse to allocate the L2 page. 
*/ 1511 if (_pmap_alloc_l3(pmap, 1512 NUL2E + l1index, lockp) == NULL) 1513 goto fail; 1514 } else { 1515 phys = PTE_TO_PHYS(pmap_load(l1)); 1516 pdpg = PHYS_TO_VM_PAGE(phys); 1517 pdpg->ref_count++; 1518 } 1519 } 1520 } 1521 1522 phys = PTE_TO_PHYS(pmap_load(l1)); 1523 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1524 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1525 KASSERT((pmap_load(l2) & PTE_V) == 0, 1526 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 1527 1528 entry = PTE_V | (pn << PTE_PPN0_S); 1529 pmap_store(l2, entry); 1530 } 1531 1532 pmap_resident_count_inc(pmap, 1); 1533 1534 return (m); 1535 1536 fail: 1537 vm_page_unwire_noq(m); 1538 vm_page_free_zero(m); 1539 return (NULL); 1540 } 1541 1542 static vm_page_t 1543 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1544 { 1545 pd_entry_t *l1; 1546 vm_page_t l2pg; 1547 vm_pindex_t l2pindex; 1548 1549 retry: 1550 l1 = pmap_l1(pmap, va); 1551 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { 1552 KASSERT((pmap_load(l1) & PTE_RWX) == 0, 1553 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, 1554 pmap_load(l1), va)); 1555 /* Add a reference to the L2 page. */ 1556 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); 1557 l2pg->ref_count++; 1558 } else { 1559 /* Allocate a L2 page. */ 1560 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1561 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1562 if (l2pg == NULL && lockp != NULL) 1563 goto retry; 1564 } 1565 return (l2pg); 1566 } 1567 1568 static vm_page_t 1569 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1570 { 1571 vm_pindex_t ptepindex; 1572 pd_entry_t *l2; 1573 vm_paddr_t phys; 1574 vm_page_t m; 1575 1576 /* 1577 * Calculate pagetable page index 1578 */ 1579 ptepindex = pmap_l2_pindex(va); 1580 retry: 1581 /* 1582 * Get the page directory entry 1583 */ 1584 l2 = pmap_l2(pmap, va); 1585 1586 /* 1587 * If the page table page is mapped, we just increment the 1588 * hold count, and activate it. 1589 */ 1590 if (l2 != NULL && pmap_load(l2) != 0) { 1591 phys = PTE_TO_PHYS(pmap_load(l2)); 1592 m = PHYS_TO_VM_PAGE(phys); 1593 m->ref_count++; 1594 } else { 1595 /* 1596 * Here if the pte page isn't mapped, or if it has been 1597 * deallocated. 1598 */ 1599 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1600 if (m == NULL && lockp != NULL) 1601 goto retry; 1602 } 1603 return (m); 1604 } 1605 1606 /*************************************************** 1607 * Pmap allocation/deallocation routines. 1608 ***************************************************/ 1609 1610 /* 1611 * Release any resources held by the given physical map. 1612 * Called when a pmap initialized by pmap_pinit is being released. 1613 * Should only be called if the map contains no valid mappings. 
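 *
 * Added note (illustrative): besides freeing the wired top-level page, the
 * SV39 case below also unlinks the pmap from allpmaps so that a concurrent
 * pmap_growkernel() -> pmap_distribute_l1() cannot store into the page
 * after it has been released:
 *
 *	mtx_lock(&allpmaps_lock);
 *	LIST_REMOVE(pmap, pm_list);
 *	mtx_unlock(&allpmaps_lock);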
1614 */ 1615 void 1616 pmap_release(pmap_t pmap) 1617 { 1618 vm_page_t m; 1619 1620 KASSERT(pmap->pm_stats.resident_count == 0, 1621 ("pmap_release: pmap resident count %ld != 0", 1622 pmap->pm_stats.resident_count)); 1623 KASSERT(CPU_EMPTY(&pmap->pm_active), 1624 ("releasing active pmap %p", pmap)); 1625 1626 if (pmap_mode == PMAP_MODE_SV39) { 1627 mtx_lock(&allpmaps_lock); 1628 LIST_REMOVE(pmap, pm_list); 1629 mtx_unlock(&allpmaps_lock); 1630 } 1631 1632 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top)); 1633 vm_page_unwire_noq(m); 1634 vm_page_free(m); 1635 } 1636 1637 static int 1638 kvm_size(SYSCTL_HANDLER_ARGS) 1639 { 1640 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1641 1642 return sysctl_handle_long(oidp, &ksize, 0, req); 1643 } 1644 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1645 0, 0, kvm_size, "LU", 1646 "Size of KVM"); 1647 1648 static int 1649 kvm_free(SYSCTL_HANDLER_ARGS) 1650 { 1651 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1652 1653 return sysctl_handle_long(oidp, &kfree, 0, req); 1654 } 1655 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1656 0, 0, kvm_free, "LU", 1657 "Amount of KVM free"); 1658 1659 /* 1660 * grow the number of kernel page table entries, if needed 1661 */ 1662 void 1663 pmap_growkernel(vm_offset_t addr) 1664 { 1665 vm_paddr_t paddr; 1666 vm_page_t nkpg; 1667 pd_entry_t *l1, *l2; 1668 pt_entry_t entry; 1669 pn_t pn; 1670 1671 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1672 1673 addr = roundup2(addr, L2_SIZE); 1674 if (addr - 1 >= vm_map_max(kernel_map)) 1675 addr = vm_map_max(kernel_map); 1676 while (kernel_vm_end < addr) { 1677 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1678 if (pmap_load(l1) == 0) { 1679 /* We need a new PDP entry */ 1680 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 1681 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1682 if (nkpg == NULL) 1683 panic("pmap_growkernel: no memory to grow kernel"); 1684 nkpg->pindex = kernel_vm_end >> L1_SHIFT; 1685 paddr = VM_PAGE_TO_PHYS(nkpg); 1686 1687 pn = (paddr / PAGE_SIZE); 1688 entry = (PTE_V); 1689 entry |= (pn << PTE_PPN0_S); 1690 pmap_store(l1, entry); 1691 pmap_distribute_l1(kernel_pmap, 1692 pmap_l1_index(kernel_vm_end), entry); 1693 continue; /* try again */ 1694 } 1695 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1696 if ((pmap_load(l2) & PTE_V) != 0 && 1697 (pmap_load(l2) & PTE_RWX) == 0) { 1698 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1699 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1700 kernel_vm_end = vm_map_max(kernel_map); 1701 break; 1702 } 1703 continue; 1704 } 1705 1706 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 1707 VM_ALLOC_ZERO); 1708 if (nkpg == NULL) 1709 panic("pmap_growkernel: no memory to grow kernel"); 1710 nkpg->pindex = kernel_vm_end >> L2_SHIFT; 1711 paddr = VM_PAGE_TO_PHYS(nkpg); 1712 1713 pn = (paddr / PAGE_SIZE); 1714 entry = (PTE_V); 1715 entry |= (pn << PTE_PPN0_S); 1716 pmap_store(l2, entry); 1717 1718 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1719 1720 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1721 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1722 kernel_vm_end = vm_map_max(kernel_map); 1723 break; 1724 } 1725 } 1726 } 1727 1728 /*************************************************** 1729 * page management routines. 1730 ***************************************************/ 1731 1732 static const uint64_t pc_freemask[_NPCM] = { 1733 [0 ... 
_NPCM - 2] = PC_FREEN, 1734 [_NPCM - 1] = PC_FREEL 1735 }; 1736 1737 #if 0 1738 #ifdef PV_STATS 1739 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1740 1741 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1742 "Current number of pv entry chunks"); 1743 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1744 "Current number of pv entry chunks allocated"); 1745 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1746 "Current number of pv entry chunks frees"); 1747 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1748 "Number of times tried to get a chunk page but failed."); 1749 1750 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1751 static int pv_entry_spare; 1752 1753 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1754 "Current number of pv entry frees"); 1755 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1756 "Current number of pv entry allocs"); 1757 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1758 "Current number of pv entries"); 1759 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1760 "Current number of spare pv entries"); 1761 #endif 1762 #endif /* 0 */ 1763 1764 /* 1765 * We are in a serious low memory condition. Resort to 1766 * drastic measures to free some pages so we can allocate 1767 * another pv entry chunk. 1768 * 1769 * Returns NULL if PV entries were reclaimed from the specified pmap. 1770 * 1771 * We do not, however, unmap 2mpages because subsequent accesses will 1772 * allocate per-page pv entries until repromotion occurs, thereby 1773 * exacerbating the shortage of free pv entries. 1774 */ 1775 static vm_page_t 1776 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1777 { 1778 1779 panic("RISCVTODO: reclaim_pv_chunk"); 1780 } 1781 1782 /* 1783 * free the pv_entry back to the free list 1784 */ 1785 static void 1786 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1787 { 1788 struct pv_chunk *pc; 1789 int idx, field, bit; 1790 1791 rw_assert(&pvh_global_lock, RA_LOCKED); 1792 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1793 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1794 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1795 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1796 pc = pv_to_chunk(pv); 1797 idx = pv - &pc->pc_pventry[0]; 1798 field = idx / 64; 1799 bit = idx % 64; 1800 pc->pc_map[field] |= 1ul << bit; 1801 if (!pc_is_free(pc)) { 1802 /* 98% of the time, pc is already at the head of the list. 
*/ 1803 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1804 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1805 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1806 } 1807 return; 1808 } 1809 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1810 free_pv_chunk(pc); 1811 } 1812 1813 static void 1814 free_pv_chunk(struct pv_chunk *pc) 1815 { 1816 vm_page_t m; 1817 1818 mtx_lock(&pv_chunks_mutex); 1819 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1820 mtx_unlock(&pv_chunks_mutex); 1821 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1822 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1823 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1824 /* entire chunk is free, return it */ 1825 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1826 dump_drop_page(m->phys_addr); 1827 vm_page_unwire_noq(m); 1828 vm_page_free(m); 1829 } 1830 1831 /* 1832 * Returns a new PV entry, allocating a new PV chunk from the system when 1833 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1834 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1835 * returned. 1836 * 1837 * The given PV list lock may be released. 1838 */ 1839 static pv_entry_t 1840 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1841 { 1842 int bit, field; 1843 pv_entry_t pv; 1844 struct pv_chunk *pc; 1845 vm_page_t m; 1846 1847 rw_assert(&pvh_global_lock, RA_LOCKED); 1848 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1849 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1850 retry: 1851 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1852 if (pc != NULL) { 1853 for (field = 0; field < _NPCM; field++) { 1854 if (pc->pc_map[field]) { 1855 bit = ffsl(pc->pc_map[field]) - 1; 1856 break; 1857 } 1858 } 1859 if (field < _NPCM) { 1860 pv = &pc->pc_pventry[field * 64 + bit]; 1861 pc->pc_map[field] &= ~(1ul << bit); 1862 /* If this was the last item, move it to tail */ 1863 if (pc_is_full(pc)) { 1864 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1865 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1866 pc_list); 1867 } 1868 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1869 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1870 return (pv); 1871 } 1872 } 1873 /* No free items, allocate another chunk */ 1874 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1875 if (m == NULL) { 1876 if (lockp == NULL) { 1877 PV_STAT(pc_chunk_tryfail++); 1878 return (NULL); 1879 } 1880 m = reclaim_pv_chunk(pmap, lockp); 1881 if (m == NULL) 1882 goto retry; 1883 } 1884 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1885 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1886 dump_add_page(m->phys_addr); 1887 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1888 pc->pc_pmap = pmap; 1889 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 1890 pc->pc_map[1] = PC_FREEN; 1891 pc->pc_map[2] = PC_FREEL; 1892 mtx_lock(&pv_chunks_mutex); 1893 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1894 mtx_unlock(&pv_chunks_mutex); 1895 pv = &pc->pc_pventry[0]; 1896 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1897 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1898 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1899 return (pv); 1900 } 1901 1902 /* 1903 * Ensure that the number of spare PV entries in the specified pmap meets or 1904 * exceeds the given count, "needed". 1905 * 1906 * The given PV list lock may be released. 
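 *
 * Added illustration (not in the original): spare entries live in PV
 * chunks, where each set bit in pc_map[] marks a free pv_entry slot, so
 * counting spares is a population count over pc_map[] and allocating one
 * (see get_pv_entry() above) reduces to:
 *
 *	bit = ffsl(pc->pc_map[field]) - 1;	// first free slot
 *	pc->pc_map[field] &= ~(1ul << bit);	// mark it in use
 *	pv = &pc->pc_pventry[field * 64 + bit];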
1907 */ 1908 static void 1909 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1910 { 1911 struct pch new_tail; 1912 struct pv_chunk *pc; 1913 vm_page_t m; 1914 int avail, free; 1915 bool reclaimed; 1916 1917 rw_assert(&pvh_global_lock, RA_LOCKED); 1918 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1919 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1920 1921 /* 1922 * Newly allocated PV chunks must be stored in a private list until 1923 * the required number of PV chunks have been allocated. Otherwise, 1924 * reclaim_pv_chunk() could recycle one of these chunks. In 1925 * contrast, these chunks must be added to the pmap upon allocation. 1926 */ 1927 TAILQ_INIT(&new_tail); 1928 retry: 1929 avail = 0; 1930 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1931 bit_count((bitstr_t *)pc->pc_map, 0, 1932 sizeof(pc->pc_map) * NBBY, &free); 1933 if (free == 0) 1934 break; 1935 avail += free; 1936 if (avail >= needed) 1937 break; 1938 } 1939 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1940 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1941 if (m == NULL) { 1942 m = reclaim_pv_chunk(pmap, lockp); 1943 if (m == NULL) 1944 goto retry; 1945 reclaimed = true; 1946 } 1947 /* XXX PV STATS */ 1948 #if 0 1949 dump_add_page(m->phys_addr); 1950 #endif 1951 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1952 pc->pc_pmap = pmap; 1953 pc->pc_map[0] = PC_FREEN; 1954 pc->pc_map[1] = PC_FREEN; 1955 pc->pc_map[2] = PC_FREEL; 1956 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1957 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1958 1959 /* 1960 * The reclaim might have freed a chunk from the current pmap. 1961 * If that chunk contained available entries, we need to 1962 * re-count the number of available entries. 1963 */ 1964 if (reclaimed) 1965 goto retry; 1966 } 1967 if (!TAILQ_EMPTY(&new_tail)) { 1968 mtx_lock(&pv_chunks_mutex); 1969 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1970 mtx_unlock(&pv_chunks_mutex); 1971 } 1972 } 1973 1974 /* 1975 * First find and then remove the pv entry for the specified pmap and virtual 1976 * address from the specified pv list. Returns the pv entry if found and NULL 1977 * otherwise. This operation can be performed on pv lists for either 4KB or 1978 * 2MB page mappings. 1979 */ 1980 static __inline pv_entry_t 1981 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1982 { 1983 pv_entry_t pv; 1984 1985 rw_assert(&pvh_global_lock, RA_LOCKED); 1986 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1987 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1988 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1989 pvh->pv_gen++; 1990 break; 1991 } 1992 } 1993 return (pv); 1994 } 1995 1996 /* 1997 * First find and then destroy the pv entry for the specified pmap and virtual 1998 * address. This operation can be performed on pv lists for either 4KB or 2MB 1999 * page mappings. 2000 */ 2001 static void 2002 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2003 { 2004 pv_entry_t pv; 2005 2006 pv = pmap_pvh_remove(pvh, pmap, va); 2007 2008 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 2009 free_pv_entry(pmap, pv); 2010 } 2011 2012 /* 2013 * Conditionally create the PV entry for a 4KB page mapping if the required 2014 * memory can be allocated without resorting to reclamation. 
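 *
 * Reclamation is disabled by passing a NULL lock pointer to
 * get_pv_entry(); under memory pressure the allocation simply fails and
 * false is returned, so a caller such as pmap_enter_quick_locked() can
 * skip the speculative mapping rather than tear down PV entries
 * belonging to other mappings.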
2015 */ 2016 static bool 2017 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2018 struct rwlock **lockp) 2019 { 2020 pv_entry_t pv; 2021 2022 rw_assert(&pvh_global_lock, RA_LOCKED); 2023 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2024 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2025 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2026 pv->pv_va = va; 2027 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2028 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2029 m->md.pv_gen++; 2030 return (true); 2031 } else 2032 return (false); 2033 } 2034 2035 /* 2036 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2037 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2038 * entries for each of the 4KB page mappings. 2039 */ 2040 static void __unused 2041 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2042 struct rwlock **lockp) 2043 { 2044 struct md_page *pvh; 2045 struct pv_chunk *pc; 2046 pv_entry_t pv; 2047 vm_page_t m; 2048 vm_offset_t va_last; 2049 int bit, field; 2050 2051 rw_assert(&pvh_global_lock, RA_LOCKED); 2052 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2053 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2054 2055 /* 2056 * Transfer the 2mpage's pv entry for this mapping to the first 2057 * page's pv list. Once this transfer begins, the pv list lock 2058 * must not be released until the last pv entry is reinstantiated. 2059 */ 2060 pvh = pa_to_pvh(pa); 2061 va &= ~L2_OFFSET; 2062 pv = pmap_pvh_remove(pvh, pmap, va); 2063 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2064 m = PHYS_TO_VM_PAGE(pa); 2065 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2066 m->md.pv_gen++; 2067 /* Instantiate the remaining 511 pv entries. */ 2068 va_last = va + L2_SIZE - PAGE_SIZE; 2069 for (;;) { 2070 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2071 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 2072 for (field = 0; field < _NPCM; field++) { 2073 while (pc->pc_map[field] != 0) { 2074 bit = ffsl(pc->pc_map[field]) - 1; 2075 pc->pc_map[field] &= ~(1ul << bit); 2076 pv = &pc->pc_pventry[field * 64 + bit]; 2077 va += PAGE_SIZE; 2078 pv->pv_va = va; 2079 m++; 2080 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2081 ("pmap_pv_demote_l2: page %p is not managed", m)); 2082 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2083 m->md.pv_gen++; 2084 if (va == va_last) 2085 goto out; 2086 } 2087 } 2088 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2089 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2090 } 2091 out: 2092 if (pc_is_free(pc)) { 2093 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2094 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2095 } 2096 /* XXX PV stats */ 2097 } 2098 2099 #if VM_NRESERVLEVEL > 0 2100 static void 2101 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2102 struct rwlock **lockp) 2103 { 2104 struct md_page *pvh; 2105 pv_entry_t pv; 2106 vm_page_t m; 2107 vm_offset_t va_last; 2108 2109 rw_assert(&pvh_global_lock, RA_LOCKED); 2110 KASSERT((pa & L2_OFFSET) == 0, 2111 ("pmap_pv_promote_l2: misaligned pa %#lx", pa)); 2112 2113 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2114 2115 m = PHYS_TO_VM_PAGE(pa); 2116 va = va & ~L2_OFFSET; 2117 pv = pmap_pvh_remove(&m->md, pmap, va); 2118 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 2119 pvh = pa_to_pvh(pa); 2120 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2121 pvh->pv_gen++; 2122 2123 va_last = va + L2_SIZE - PAGE_SIZE; 2124 do { 2125 m++; 2126 va += PAGE_SIZE; 2127 pmap_pvh_free(&m->md, pmap, va); 2128 } while (va < va_last); 
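	/*
	 * Note that the loop above only frees the 511 per-4KB PV entries;
	 * the entry for the first 4KB page was transferred to the 2MB
	 * page's pv list, so promotion never needs to allocate a PV entry
	 * and cannot fail here.
	 */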
2129 } 2130 #endif /* VM_NRESERVLEVEL > 0 */ 2131 2132 /* 2133 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2134 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2135 * false if the PV entry cannot be allocated without resorting to reclamation. 2136 */ 2137 static bool 2138 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2139 struct rwlock **lockp) 2140 { 2141 struct md_page *pvh; 2142 pv_entry_t pv; 2143 vm_paddr_t pa; 2144 2145 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2146 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2147 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 2148 NULL : lockp)) == NULL) 2149 return (false); 2150 pv->pv_va = va; 2151 pa = PTE_TO_PHYS(l2e); 2152 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2153 pvh = pa_to_pvh(pa); 2154 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2155 pvh->pv_gen++; 2156 return (true); 2157 } 2158 2159 static void 2160 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2161 { 2162 pt_entry_t newl2, oldl2 __diagused; 2163 vm_page_t ml3; 2164 vm_paddr_t ml3pa; 2165 2166 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2167 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2168 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2169 2170 ml3 = pmap_remove_pt_page(pmap, va); 2171 if (ml3 == NULL) 2172 panic("pmap_remove_kernel_l2: Missing pt page"); 2173 2174 ml3pa = VM_PAGE_TO_PHYS(ml3); 2175 newl2 = ml3pa | PTE_V; 2176 2177 /* 2178 * If this page table page was unmapped by a promotion, then it 2179 * contains valid mappings. Zero it to invalidate those mappings. 2180 */ 2181 if (vm_page_any_valid(ml3)) 2182 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2183 2184 /* 2185 * Demote the mapping. 2186 */ 2187 oldl2 = pmap_load_store(l2, newl2); 2188 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2189 __func__, l2, oldl2)); 2190 } 2191 2192 /* 2193 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2194 */ 2195 static int 2196 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2197 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2198 { 2199 struct md_page *pvh; 2200 pt_entry_t oldl2; 2201 vm_offset_t eva, va; 2202 vm_page_t m, ml3; 2203 2204 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2205 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2206 oldl2 = pmap_load_clear(l2); 2207 KASSERT((oldl2 & PTE_RWX) != 0, 2208 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2209 2210 /* 2211 * The sfence.vma documentation states that it is sufficient to specify 2212 * a single address within a superpage mapping. However, since we do 2213 * not perform any invalidation upon promotion, TLBs may still be 2214 * caching 4KB mappings within the superpage, so we must invalidate the 2215 * entire range. 
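 *
 * In other words, a single sfence.vma naming only "sva" would be safe
 * only if the TLB held one 2MB entry for this mapping; because 4KB
 * entries created before promotion may still be cached, the call below
 * covers the whole range [sva, sva + L2_SIZE).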
2216 */ 2217 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2218 if ((oldl2 & PTE_SW_WIRED) != 0) 2219 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2220 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2221 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2222 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2223 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2224 pmap_pvh_free(pvh, pmap, sva); 2225 eva = sva + L2_SIZE; 2226 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2227 va < eva; va += PAGE_SIZE, m++) { 2228 if ((oldl2 & PTE_D) != 0) 2229 vm_page_dirty(m); 2230 if ((oldl2 & PTE_A) != 0) 2231 vm_page_aflag_set(m, PGA_REFERENCED); 2232 if (TAILQ_EMPTY(&m->md.pv_list) && 2233 TAILQ_EMPTY(&pvh->pv_list)) 2234 vm_page_aflag_clear(m, PGA_WRITEABLE); 2235 } 2236 } 2237 if (pmap == kernel_pmap) { 2238 pmap_remove_kernel_l2(pmap, l2, sva); 2239 } else { 2240 ml3 = pmap_remove_pt_page(pmap, sva); 2241 if (ml3 != NULL) { 2242 KASSERT(vm_page_any_valid(ml3), 2243 ("pmap_remove_l2: l3 page not promoted")); 2244 pmap_resident_count_dec(pmap, 1); 2245 KASSERT(ml3->ref_count == Ln_ENTRIES, 2246 ("pmap_remove_l2: l3 page ref count error")); 2247 ml3->ref_count = 1; 2248 vm_page_unwire_noq(ml3); 2249 pmap_add_delayed_free_list(ml3, free, false); 2250 } 2251 } 2252 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2253 } 2254 2255 /* 2256 * pmap_remove_l3: do the things to unmap a page in a process 2257 */ 2258 static int 2259 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2260 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2261 { 2262 struct md_page *pvh; 2263 pt_entry_t old_l3; 2264 vm_paddr_t phys; 2265 vm_page_t m; 2266 2267 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2268 old_l3 = pmap_load_clear(l3); 2269 pmap_invalidate_page(pmap, va); 2270 if (old_l3 & PTE_SW_WIRED) 2271 pmap->pm_stats.wired_count -= 1; 2272 pmap_resident_count_dec(pmap, 1); 2273 if (old_l3 & PTE_SW_MANAGED) { 2274 phys = PTE_TO_PHYS(old_l3); 2275 m = PHYS_TO_VM_PAGE(phys); 2276 if ((old_l3 & PTE_D) != 0) 2277 vm_page_dirty(m); 2278 if (old_l3 & PTE_A) 2279 vm_page_aflag_set(m, PGA_REFERENCED); 2280 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2281 pmap_pvh_free(&m->md, pmap, va); 2282 if (TAILQ_EMPTY(&m->md.pv_list) && 2283 (m->flags & PG_FICTITIOUS) == 0) { 2284 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2285 if (TAILQ_EMPTY(&pvh->pv_list)) 2286 vm_page_aflag_clear(m, PGA_WRITEABLE); 2287 } 2288 } 2289 2290 return (pmap_unuse_pt(pmap, va, l2e, free)); 2291 } 2292 2293 /* 2294 * Remove the given range of addresses from the specified map. 2295 * 2296 * It is assumed that the start and end are properly 2297 * rounded to the page size. 2298 */ 2299 void 2300 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2301 { 2302 struct spglist free; 2303 struct rwlock *lock; 2304 vm_offset_t va, va_next; 2305 pd_entry_t *l0, *l1, *l2, l2e; 2306 pt_entry_t *l3; 2307 2308 /* 2309 * Perform an unsynchronized read. This is, however, safe. 
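 * Roughly: a stale nonzero value only costs an unnecessary walk of the
 * page tables, while a zero value means the pmap had no resident
 * mappings at that instant and callers already prevent concurrent entry
 * of mappings into the range being removed, so an early return cannot
 * leave a mapping behind.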
2310 */ 2311 if (pmap->pm_stats.resident_count == 0) 2312 return; 2313 2314 SLIST_INIT(&free); 2315 2316 rw_rlock(&pvh_global_lock); 2317 PMAP_LOCK(pmap); 2318 2319 lock = NULL; 2320 for (; sva < eva; sva = va_next) { 2321 if (pmap->pm_stats.resident_count == 0) 2322 break; 2323 2324 if (pmap_mode == PMAP_MODE_SV48) { 2325 l0 = pmap_l0(pmap, sva); 2326 if (pmap_load(l0) == 0) { 2327 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2328 if (va_next < sva) 2329 va_next = eva; 2330 continue; 2331 } 2332 l1 = pmap_l0_to_l1(l0, sva); 2333 } else { 2334 l1 = pmap_l1(pmap, sva); 2335 } 2336 2337 if (pmap_load(l1) == 0) { 2338 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2339 if (va_next < sva) 2340 va_next = eva; 2341 continue; 2342 } 2343 2344 /* 2345 * Calculate index for next page table. 2346 */ 2347 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2348 if (va_next < sva) 2349 va_next = eva; 2350 2351 l2 = pmap_l1_to_l2(l1, sva); 2352 if (l2 == NULL) 2353 continue; 2354 if ((l2e = pmap_load(l2)) == 0) 2355 continue; 2356 if ((l2e & PTE_RWX) != 0) { 2357 if (sva + L2_SIZE == va_next && eva >= va_next) { 2358 (void)pmap_remove_l2(pmap, l2, sva, 2359 pmap_load(l1), &free, &lock); 2360 continue; 2361 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2362 &lock)) { 2363 /* 2364 * The large page mapping was destroyed. 2365 */ 2366 continue; 2367 } 2368 l2e = pmap_load(l2); 2369 } 2370 2371 /* 2372 * Limit our scan to either the end of the va represented 2373 * by the current page table page, or to the end of the 2374 * range being removed. 2375 */ 2376 if (va_next > eva) 2377 va_next = eva; 2378 2379 va = va_next; 2380 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2381 sva += L3_SIZE) { 2382 if (pmap_load(l3) == 0) { 2383 if (va != va_next) { 2384 pmap_invalidate_range(pmap, va, sva); 2385 va = va_next; 2386 } 2387 continue; 2388 } 2389 if (va == va_next) 2390 va = sva; 2391 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2392 sva += L3_SIZE; 2393 break; 2394 } 2395 } 2396 if (va != va_next) 2397 pmap_invalidate_range(pmap, va, sva); 2398 } 2399 if (lock != NULL) 2400 rw_wunlock(lock); 2401 rw_runlock(&pvh_global_lock); 2402 PMAP_UNLOCK(pmap); 2403 vm_page_free_pages_toq(&free, false); 2404 } 2405 2406 /* 2407 * Routine: pmap_remove_all 2408 * Function: 2409 * Removes this physical page from 2410 * all physical maps in which it resides. 2411 * Reflects back modify bits to the pager. 2412 * 2413 * Notes: 2414 * Original versions of this routine were very 2415 * inefficient because they iteratively called 2416 * pmap_remove (slow...) 2417 */ 2418 2419 void 2420 pmap_remove_all(vm_page_t m) 2421 { 2422 struct spglist free; 2423 struct md_page *pvh; 2424 pmap_t pmap; 2425 pt_entry_t *l3, l3e; 2426 pd_entry_t *l2, l2e __diagused; 2427 pv_entry_t pv; 2428 vm_offset_t va; 2429 2430 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2431 ("pmap_remove_all: page %p is not managed", m)); 2432 SLIST_INIT(&free); 2433 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 2434 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2435 2436 rw_wlock(&pvh_global_lock); 2437 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2438 pmap = PV_PMAP(pv); 2439 PMAP_LOCK(pmap); 2440 va = pv->pv_va; 2441 l2 = pmap_l2(pmap, va); 2442 (void)pmap_demote_l2(pmap, l2, va); 2443 PMAP_UNLOCK(pmap); 2444 } 2445 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2446 pmap = PV_PMAP(pv); 2447 PMAP_LOCK(pmap); 2448 pmap_resident_count_dec(pmap, 1); 2449 l2 = pmap_l2(pmap, pv->pv_va); 2450 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2451 l2e = pmap_load(l2); 2452 2453 KASSERT((l2e & PTE_RX) == 0, 2454 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2455 2456 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2457 l3e = pmap_load_clear(l3); 2458 pmap_invalidate_page(pmap, pv->pv_va); 2459 if (l3e & PTE_SW_WIRED) 2460 pmap->pm_stats.wired_count--; 2461 if ((l3e & PTE_A) != 0) 2462 vm_page_aflag_set(m, PGA_REFERENCED); 2463 2464 /* 2465 * Update the vm_page_t clean and reference bits. 2466 */ 2467 if ((l3e & PTE_D) != 0) 2468 vm_page_dirty(m); 2469 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2470 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2471 m->md.pv_gen++; 2472 free_pv_entry(pmap, pv); 2473 PMAP_UNLOCK(pmap); 2474 } 2475 vm_page_aflag_clear(m, PGA_WRITEABLE); 2476 rw_wunlock(&pvh_global_lock); 2477 vm_page_free_pages_toq(&free, false); 2478 } 2479 2480 /* 2481 * Set the physical protection on the 2482 * specified range of this map as requested. 2483 */ 2484 void 2485 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2486 { 2487 pd_entry_t *l0, *l1, *l2, l2e; 2488 pt_entry_t *l3, l3e, mask; 2489 vm_page_t m, mt; 2490 vm_paddr_t pa; 2491 vm_offset_t va_next; 2492 bool anychanged, pv_lists_locked; 2493 2494 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2495 pmap_remove(pmap, sva, eva); 2496 return; 2497 } 2498 2499 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2500 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2501 return; 2502 2503 anychanged = false; 2504 pv_lists_locked = false; 2505 mask = 0; 2506 if ((prot & VM_PROT_WRITE) == 0) 2507 mask |= PTE_W | PTE_D; 2508 if ((prot & VM_PROT_EXECUTE) == 0) 2509 mask |= PTE_X; 2510 resume: 2511 PMAP_LOCK(pmap); 2512 for (; sva < eva; sva = va_next) { 2513 if (pmap_mode == PMAP_MODE_SV48) { 2514 l0 = pmap_l0(pmap, sva); 2515 if (pmap_load(l0) == 0) { 2516 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2517 if (va_next < sva) 2518 va_next = eva; 2519 continue; 2520 } 2521 l1 = pmap_l0_to_l1(l0, sva); 2522 } else { 2523 l1 = pmap_l1(pmap, sva); 2524 } 2525 2526 if (pmap_load(l1) == 0) { 2527 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2528 if (va_next < sva) 2529 va_next = eva; 2530 continue; 2531 } 2532 2533 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2534 if (va_next < sva) 2535 va_next = eva; 2536 2537 l2 = pmap_l1_to_l2(l1, sva); 2538 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2539 continue; 2540 if ((l2e & PTE_RWX) != 0) { 2541 if (sva + L2_SIZE == va_next && eva >= va_next) { 2542 retryl2: 2543 if ((prot & VM_PROT_WRITE) == 0 && 2544 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2545 (PTE_SW_MANAGED | PTE_D)) { 2546 pa = PTE_TO_PHYS(l2e); 2547 m = PHYS_TO_VM_PAGE(pa); 2548 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2549 vm_page_dirty(mt); 2550 } 2551 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2552 goto retryl2; 2553 anychanged = true; 2554 continue; 2555 } else { 2556 if (!pv_lists_locked) { 2557 pv_lists_locked = true; 2558 if (!rw_try_rlock(&pvh_global_lock)) { 2559 if (anychanged) 2560 pmap_invalidate_all( 2561 pmap); 
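					/*
					 * Could not opportunistically take
					 * the pv list lock: drop the pmap
					 * lock, block on the pv list lock,
					 * and resume the scan at the current
					 * "sva".  Any PTE bits already
					 * cleared were flushed above.
					 */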
2562 PMAP_UNLOCK(pmap); 2563 rw_rlock(&pvh_global_lock); 2564 goto resume; 2565 } 2566 } 2567 if (!pmap_demote_l2(pmap, l2, sva)) { 2568 /* 2569 * The large page mapping was destroyed. 2570 */ 2571 continue; 2572 } 2573 } 2574 } 2575 2576 if (va_next > eva) 2577 va_next = eva; 2578 2579 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2580 sva += L3_SIZE) { 2581 l3e = pmap_load(l3); 2582 retryl3: 2583 if ((l3e & PTE_V) == 0) 2584 continue; 2585 if ((prot & VM_PROT_WRITE) == 0 && 2586 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2587 (PTE_SW_MANAGED | PTE_D)) { 2588 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2589 vm_page_dirty(m); 2590 } 2591 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2592 goto retryl3; 2593 anychanged = true; 2594 } 2595 } 2596 if (anychanged) 2597 pmap_invalidate_all(pmap); 2598 if (pv_lists_locked) 2599 rw_runlock(&pvh_global_lock); 2600 PMAP_UNLOCK(pmap); 2601 } 2602 2603 int 2604 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2605 { 2606 pd_entry_t *l2, l2e; 2607 pt_entry_t bits, *pte, oldpte; 2608 int rv; 2609 2610 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va)); 2611 2612 rv = 0; 2613 PMAP_LOCK(pmap); 2614 l2 = pmap_l2(pmap, va); 2615 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2616 goto done; 2617 if ((l2e & PTE_RWX) == 0) { 2618 pte = pmap_l2_to_l3(l2, va); 2619 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2620 goto done; 2621 } else { 2622 pte = l2; 2623 oldpte = l2e; 2624 } 2625 2626 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2627 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2628 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2629 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2630 goto done; 2631 2632 bits = PTE_A; 2633 if (ftype == VM_PROT_WRITE) 2634 bits |= PTE_D; 2635 2636 /* 2637 * Spurious faults can occur if the implementation caches invalid 2638 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2639 * race with each other. 2640 */ 2641 if ((oldpte & bits) != bits) 2642 pmap_store_bits(pte, bits); 2643 sfence_vma(); 2644 rv = 1; 2645 done: 2646 PMAP_UNLOCK(pmap); 2647 return (rv); 2648 } 2649 2650 static bool 2651 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2652 { 2653 struct rwlock *lock; 2654 bool rv; 2655 2656 lock = NULL; 2657 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2658 if (lock != NULL) 2659 rw_wunlock(lock); 2660 return (rv); 2661 } 2662 2663 /* 2664 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2665 * mapping is invalidated. 2666 */ 2667 static bool 2668 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2669 struct rwlock **lockp) 2670 { 2671 struct spglist free; 2672 vm_page_t mpte; 2673 pd_entry_t newl2, oldl2; 2674 pt_entry_t *firstl3, newl3; 2675 vm_paddr_t mptepa; 2676 int i; 2677 2678 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2679 2680 oldl2 = pmap_load(l2); 2681 KASSERT((oldl2 & PTE_RWX) != 0, 2682 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2683 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2684 NULL) { 2685 KASSERT((oldl2 & PTE_SW_WIRED) == 0, 2686 ("pmap_demote_l2_locked: page table page for a wired mapping is missing")); 2687 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2688 (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 0) | 2689 VM_ALLOC_WIRED)) == NULL) { 2690 SLIST_INIT(&free); 2691 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2692 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2693 vm_page_free_pages_toq(&free, true); 2694 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2695 "failure for va %#lx in pmap %p", va, pmap); 2696 return (false); 2697 } 2698 mpte->pindex = pmap_l2_pindex(va); 2699 if (va < VM_MAXUSER_ADDRESS) { 2700 mpte->ref_count = Ln_ENTRIES; 2701 pmap_resident_count_inc(pmap, 1); 2702 } 2703 } 2704 mptepa = VM_PAGE_TO_PHYS(mpte); 2705 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2706 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2707 KASSERT((oldl2 & PTE_A) != 0, 2708 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2709 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2710 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2711 newl3 = oldl2; 2712 2713 /* 2714 * If the page table page is not leftover from an earlier promotion, 2715 * initialize it. 2716 */ 2717 if (!vm_page_all_valid(mpte)) { 2718 for (i = 0; i < Ln_ENTRIES; i++) 2719 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2720 } 2721 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2722 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2723 "addresses")); 2724 2725 /* 2726 * If the mapping has changed attributes, update the PTEs. 2727 */ 2728 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2729 for (i = 0; i < Ln_ENTRIES; i++) 2730 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2731 2732 /* 2733 * The spare PV entries must be reserved prior to demoting the 2734 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2735 * state of the L2 entry and the PV lists will be inconsistent, which 2736 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2737 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2738 * expected PV entry for the 2MB page mapping that is being demoted. 2739 */ 2740 if ((oldl2 & PTE_SW_MANAGED) != 0) 2741 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2742 2743 /* 2744 * Demote the mapping. 2745 */ 2746 pmap_store(l2, newl2); 2747 2748 /* 2749 * Demote the PV entry. 2750 */ 2751 if ((oldl2 & PTE_SW_MANAGED) != 0) 2752 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2753 2754 atomic_add_long(&pmap_l2_demotions, 1); 2755 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2756 va, pmap); 2757 return (true); 2758 } 2759 2760 #if VM_NRESERVLEVEL > 0 2761 static bool 2762 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3, 2763 struct rwlock **lockp) 2764 { 2765 pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e; 2766 vm_paddr_t pa; 2767 2768 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2769 if (!pmap_ps_enabled(pmap)) 2770 return (false); 2771 2772 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2773 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2774 2775 /* 2776 * Examine the first L3E in the specified PTP. Abort if this L3E is 2777 * ineligible for promotion or does not map the first 4KB physical page 2778 * within a 2MB page. 
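 *
 * Concretely, the check below requires (PTE_TO_PHYS(firstl3e) &
 * L2_OFFSET) == 0, i.e. the first PTE must map a 2MB-aligned physical
 * address; the remaining 511 PTEs are then required to map the
 * physically contiguous pages that follow, with attributes identical
 * under PTE_PROMOTE.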
2779 */ 2780 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2781 firstl3e = pmap_load(firstl3); 2782 pa = PTE_TO_PHYS(firstl3e); 2783 if ((pa & L2_OFFSET) != 0) { 2784 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2785 va, pmap); 2786 atomic_add_long(&pmap_l2_p_failures, 1); 2787 return (false); 2788 } 2789 2790 /* 2791 * Downgrade a clean, writable mapping to read-only to ensure that the 2792 * hardware does not set PTE_D while we are comparing PTEs. 2793 * 2794 * Upon a write access to a clean mapping, the implementation will 2795 * either atomically check protections and set PTE_D, or raise a page 2796 * fault. In the latter case, the pmap lock provides atomicity. Thus, 2797 * we do not issue an sfence.vma here and instead rely on pmap_fault() 2798 * to do so lazily. 2799 */ 2800 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 2801 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 2802 firstl3e &= ~PTE_W; 2803 break; 2804 } 2805 } 2806 2807 /* 2808 * Examine each of the other PTEs in the specified PTP. Abort if this 2809 * PTE maps an unexpected 4KB physical page or does not have identical 2810 * characteristics to the first PTE. 2811 */ 2812 all_l3e_PTE_A = firstl3e & PTE_A; 2813 pa += L2_SIZE - PAGE_SIZE; 2814 for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) { 2815 l3e = pmap_load(l3); 2816 if (PTE_TO_PHYS(l3e) != pa) { 2817 CTR2(KTR_PMAP, 2818 "pmap_promote_l2: failure for va %#lx pmap %p", 2819 va, pmap); 2820 atomic_add_long(&pmap_l2_p_failures, 1); 2821 return (false); 2822 } 2823 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 2824 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 2825 l3e &= ~PTE_W; 2826 break; 2827 } 2828 } 2829 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 2830 CTR2(KTR_PMAP, 2831 "pmap_promote_l2: failure for va %#lx pmap %p", 2832 va, pmap); 2833 atomic_add_long(&pmap_l2_p_failures, 1); 2834 return (false); 2835 } 2836 all_l3e_PTE_A &= l3e; 2837 pa -= PAGE_SIZE; 2838 } 2839 2840 /* 2841 * Unless all PTEs have PTE_A set, clear it from the superpage 2842 * mapping, so that promotions triggered by speculative mappings, 2843 * such as pmap_enter_quick(), don't automatically mark the 2844 * underlying pages as referenced. 2845 */ 2846 firstl3e &= ~PTE_A | all_l3e_PTE_A; 2847 2848 /* 2849 * Save the page table page in its current state until the L2 2850 * mapping the superpage is demoted by pmap_demote_l2() or 2851 * destroyed by pmap_remove_l3(). 2852 */ 2853 if (ml3 == NULL) 2854 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2855 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2856 ("pmap_promote_l2: page table page's pindex is wrong")); 2857 if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) { 2858 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2859 va, pmap); 2860 atomic_add_long(&pmap_l2_p_failures, 1); 2861 return (false); 2862 } 2863 2864 if ((firstl3e & PTE_SW_MANAGED) != 0) 2865 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 2866 2867 pmap_store(l2, firstl3e); 2868 2869 atomic_add_long(&pmap_l2_promotions, 1); 2870 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2871 pmap); 2872 return (true); 2873 } 2874 #endif 2875 2876 /* 2877 * Insert the given physical page (p) at 2878 * the specified virtual address (v) in the 2879 * target physical map with the protection requested. 2880 * 2881 * If specified, the page will be wired down, meaning 2882 * that the related pte can not be reclaimed. 
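 *
 * As a sketch of the 4KB path below (the superpage path differs in
 * detail): the new leaf PTE starts as PTE_V | PTE_R | PTE_A, gains
 * PTE_X and/or PTE_W from "prot", PTE_U for user addresses,
 * PTE_SW_WIRED for wired requests, and the page frame number shifted
 * into place via PTE_PPN0_S.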
2883 * 2884 * NB: This is the only routine which MAY NOT lazy-evaluate 2885 * or lose information. That is, this routine must actually 2886 * insert this page into the given map NOW. 2887 */ 2888 int 2889 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2890 u_int flags, int8_t psind) 2891 { 2892 struct rwlock *lock; 2893 pd_entry_t *l1, *l2, l2e; 2894 pt_entry_t new_l3, orig_l3; 2895 pt_entry_t *l3; 2896 pv_entry_t pv; 2897 vm_paddr_t opa, pa, l2_pa, l3_pa; 2898 vm_page_t mpte, om, l2_m, l3_m; 2899 pt_entry_t entry; 2900 pn_t l2_pn, l3_pn, pn; 2901 int rv; 2902 bool nosleep; 2903 2904 va = trunc_page(va); 2905 if ((m->oflags & VPO_UNMANAGED) == 0) 2906 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2907 pa = VM_PAGE_TO_PHYS(m); 2908 pn = (pa / PAGE_SIZE); 2909 2910 new_l3 = PTE_V | PTE_R | PTE_A; 2911 if (prot & VM_PROT_EXECUTE) 2912 new_l3 |= PTE_X; 2913 if (flags & VM_PROT_WRITE) 2914 new_l3 |= PTE_D; 2915 if (prot & VM_PROT_WRITE) 2916 new_l3 |= PTE_W; 2917 if (va < VM_MAX_USER_ADDRESS) 2918 new_l3 |= PTE_U; 2919 2920 new_l3 |= (pn << PTE_PPN0_S); 2921 if ((flags & PMAP_ENTER_WIRED) != 0) 2922 new_l3 |= PTE_SW_WIRED; 2923 2924 /* 2925 * Set modified bit gratuitously for writeable mappings if 2926 * the page is unmanaged. We do not want to take a fault 2927 * to do the dirty bit accounting for these mappings. 2928 */ 2929 if ((m->oflags & VPO_UNMANAGED) != 0) { 2930 if (prot & VM_PROT_WRITE) 2931 new_l3 |= PTE_D; 2932 } else 2933 new_l3 |= PTE_SW_MANAGED; 2934 2935 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2936 2937 lock = NULL; 2938 mpte = NULL; 2939 rw_rlock(&pvh_global_lock); 2940 PMAP_LOCK(pmap); 2941 if (psind == 1) { 2942 /* Assert the required virtual and physical alignment. */ 2943 KASSERT((va & L2_OFFSET) == 0, 2944 ("pmap_enter: va %#lx unaligned", va)); 2945 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2946 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2947 goto out; 2948 } 2949 2950 l2 = pmap_l2(pmap, va); 2951 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2952 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2953 va, &lock))) { 2954 l3 = pmap_l2_to_l3(l2, va); 2955 if (va < VM_MAXUSER_ADDRESS) { 2956 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2957 mpte->ref_count++; 2958 } 2959 } else if (va < VM_MAXUSER_ADDRESS) { 2960 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2961 mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); 2962 if (mpte == NULL && nosleep) { 2963 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2964 if (lock != NULL) 2965 rw_wunlock(lock); 2966 rw_runlock(&pvh_global_lock); 2967 PMAP_UNLOCK(pmap); 2968 return (KERN_RESOURCE_SHORTAGE); 2969 } 2970 l3 = pmap_l3(pmap, va); 2971 } else { 2972 l3 = pmap_l3(pmap, va); 2973 /* TODO: This is not optimal, but should mostly work */ 2974 if (l3 == NULL) { 2975 if (l2 == NULL) { 2976 l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2977 VM_ALLOC_ZERO); 2978 if (l2_m == NULL) 2979 panic("pmap_enter: l2 pte_m == NULL"); 2980 2981 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2982 l2_pn = (l2_pa / PAGE_SIZE); 2983 2984 l1 = pmap_l1(pmap, va); 2985 entry = (PTE_V); 2986 entry |= (l2_pn << PTE_PPN0_S); 2987 pmap_store(l1, entry); 2988 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2989 l2 = pmap_l1_to_l2(l1, va); 2990 } 2991 2992 l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2993 VM_ALLOC_ZERO); 2994 if (l3_m == NULL) 2995 panic("pmap_enter: l3 pte_m == NULL"); 2996 2997 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2998 l3_pn = (l3_pa / PAGE_SIZE); 2999 entry = (PTE_V); 3000 entry |= (l3_pn << PTE_PPN0_S); 3001 pmap_store(l2, entry); 3002 l3 = pmap_l2_to_l3(l2, va); 3003 } 3004 pmap_invalidate_page(pmap, va); 3005 } 3006 3007 orig_l3 = pmap_load(l3); 3008 opa = PTE_TO_PHYS(orig_l3); 3009 pv = NULL; 3010 3011 /* 3012 * Is the specified virtual address already mapped? 3013 */ 3014 if ((orig_l3 & PTE_V) != 0) { 3015 /* 3016 * Wiring change, just update stats. We don't worry about 3017 * wiring PT pages as they remain resident as long as there 3018 * are valid mappings in them. Hence, if a user page is wired, 3019 * the PT page will be also. 3020 */ 3021 if ((flags & PMAP_ENTER_WIRED) != 0 && 3022 (orig_l3 & PTE_SW_WIRED) == 0) 3023 pmap->pm_stats.wired_count++; 3024 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3025 (orig_l3 & PTE_SW_WIRED) != 0) 3026 pmap->pm_stats.wired_count--; 3027 3028 /* 3029 * Remove the extra PT page reference. 3030 */ 3031 if (mpte != NULL) { 3032 mpte->ref_count--; 3033 KASSERT(mpte->ref_count > 0, 3034 ("pmap_enter: missing reference to page table page," 3035 " va: 0x%lx", va)); 3036 } 3037 3038 /* 3039 * Has the physical page changed? 3040 */ 3041 if (opa == pa) { 3042 /* 3043 * No, might be a protection or wiring change. 3044 */ 3045 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 3046 (new_l3 & PTE_W) != 0) 3047 vm_page_aflag_set(m, PGA_WRITEABLE); 3048 goto validate; 3049 } 3050 3051 /* 3052 * The physical page has changed. Temporarily invalidate 3053 * the mapping. This ensures that all threads sharing the 3054 * pmap keep a consistent view of the mapping, which is 3055 * necessary for the correct handling of COW faults. It 3056 * also permits reuse of the old mapping's PV entry, 3057 * avoiding an allocation. 3058 * 3059 * For consistency, handle unmanaged mappings the same way. 3060 */ 3061 orig_l3 = pmap_load_clear(l3); 3062 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 3063 ("pmap_enter: unexpected pa update for %#lx", va)); 3064 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 3065 om = PHYS_TO_VM_PAGE(opa); 3066 3067 /* 3068 * The pmap lock is sufficient to synchronize with 3069 * concurrent calls to pmap_page_test_mappings() and 3070 * pmap_ts_referenced(). 
3071 */ 3072 if ((orig_l3 & PTE_D) != 0) 3073 vm_page_dirty(om); 3074 if ((orig_l3 & PTE_A) != 0) 3075 vm_page_aflag_set(om, PGA_REFERENCED); 3076 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3077 pv = pmap_pvh_remove(&om->md, pmap, va); 3078 KASSERT(pv != NULL, 3079 ("pmap_enter: no PV entry for %#lx", va)); 3080 if ((new_l3 & PTE_SW_MANAGED) == 0) 3081 free_pv_entry(pmap, pv); 3082 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3083 TAILQ_EMPTY(&om->md.pv_list) && 3084 ((om->flags & PG_FICTITIOUS) != 0 || 3085 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3086 vm_page_aflag_clear(om, PGA_WRITEABLE); 3087 } 3088 pmap_invalidate_page(pmap, va); 3089 orig_l3 = 0; 3090 } else { 3091 /* 3092 * Increment the counters. 3093 */ 3094 if ((new_l3 & PTE_SW_WIRED) != 0) 3095 pmap->pm_stats.wired_count++; 3096 pmap_resident_count_inc(pmap, 1); 3097 } 3098 /* 3099 * Enter on the PV list if part of our managed memory. 3100 */ 3101 if ((new_l3 & PTE_SW_MANAGED) != 0) { 3102 if (pv == NULL) { 3103 pv = get_pv_entry(pmap, &lock); 3104 pv->pv_va = va; 3105 } 3106 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3107 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3108 m->md.pv_gen++; 3109 if ((new_l3 & PTE_W) != 0) 3110 vm_page_aflag_set(m, PGA_WRITEABLE); 3111 } 3112 3113 validate: 3114 /* 3115 * Sync the i-cache on all harts before updating the PTE 3116 * if the new PTE is executable. 3117 */ 3118 if (prot & VM_PROT_EXECUTE) 3119 pmap_sync_icache(pmap, va, PAGE_SIZE); 3120 3121 /* 3122 * Update the L3 entry. 3123 */ 3124 if (orig_l3 != 0) { 3125 orig_l3 = pmap_load_store(l3, new_l3); 3126 pmap_invalidate_page(pmap, va); 3127 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 3128 ("pmap_enter: invalid update")); 3129 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 3130 (PTE_D | PTE_SW_MANAGED)) 3131 vm_page_dirty(m); 3132 } else { 3133 pmap_store(l3, new_l3); 3134 } 3135 3136 #if VM_NRESERVLEVEL > 0 3137 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 3138 (m->flags & PG_FICTITIOUS) == 0 && 3139 vm_reserv_level_iffullpop(m) == 0) 3140 (void)pmap_promote_l2(pmap, l2, va, mpte, &lock); 3141 #endif 3142 3143 rv = KERN_SUCCESS; 3144 out: 3145 if (lock != NULL) 3146 rw_wunlock(lock); 3147 rw_runlock(&pvh_global_lock); 3148 PMAP_UNLOCK(pmap); 3149 return (rv); 3150 } 3151 3152 /* 3153 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 3154 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 3155 * value. See pmap_enter_l2() for the possible error values when "no sleep", 3156 * "no replace", and "no reclaim" are specified. 3157 */ 3158 static int 3159 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3160 struct rwlock **lockp) 3161 { 3162 pd_entry_t new_l2; 3163 pn_t pn; 3164 3165 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3166 3167 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 3168 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 3169 if ((m->oflags & VPO_UNMANAGED) == 0) 3170 new_l2 |= PTE_SW_MANAGED; 3171 if ((prot & VM_PROT_EXECUTE) != 0) 3172 new_l2 |= PTE_X; 3173 if (va < VM_MAXUSER_ADDRESS) 3174 new_l2 |= PTE_U; 3175 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3176 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 3177 } 3178 3179 /* 3180 * Returns true if every page table entry in the specified page table is 3181 * zero. 
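 *
 * This is used by pmap_enter_l2() below when PMAP_ENTER_NOREPLACE is
 * given for a kernel address: an existing but completely empty kernel
 * L3 page table may be replaced by a superpage mapping, whereas any
 * populated page table causes the request to fail rather than silently
 * destroy mappings.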
3182 */ 3183 static bool 3184 pmap_every_pte_zero(vm_paddr_t pa) 3185 { 3186 pt_entry_t *pt_end, *pte; 3187 3188 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 3189 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 3190 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 3191 if (*pte != 0) 3192 return (false); 3193 } 3194 return (true); 3195 } 3196 3197 /* 3198 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3199 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 3200 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3201 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 3202 * within the 2MB virtual address range starting at the specified virtual 3203 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 3204 * 2MB page mapping already exists at the specified virtual address. Returns 3205 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 3206 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 3207 * and a PV entry allocation failed. 3208 * 3209 * The parameter "m" is only used when creating a managed, writeable mapping. 3210 */ 3211 static int 3212 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3213 vm_page_t m, struct rwlock **lockp) 3214 { 3215 struct spglist free; 3216 pd_entry_t *l2, *l3, oldl2; 3217 vm_offset_t sva; 3218 vm_page_t l2pg, mt; 3219 vm_page_t uwptpg; 3220 3221 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3222 3223 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3224 NULL : lockp)) == NULL) { 3225 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page" 3226 " for va %#lx in pmap %p", va, pmap); 3227 return (KERN_RESOURCE_SHORTAGE); 3228 } 3229 3230 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 3231 l2 = &l2[pmap_l2_index(va)]; 3232 if ((oldl2 = pmap_load(l2)) != 0) { 3233 KASSERT(l2pg->ref_count > 1, 3234 ("pmap_enter_l2: l2pg's ref count is too low")); 3235 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3236 if ((oldl2 & PTE_RWX) != 0) { 3237 l2pg->ref_count--; 3238 CTR2(KTR_PMAP, 3239 "pmap_enter_l2: no space for va %#lx" 3240 " in pmap %p", va, pmap); 3241 return (KERN_NO_SPACE); 3242 } else if (va < VM_MAXUSER_ADDRESS || 3243 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) { 3244 l2pg->ref_count--; 3245 CTR2(KTR_PMAP, "pmap_enter_l2:" 3246 " failed to replace existing mapping" 3247 " for va %#lx in pmap %p", va, pmap); 3248 return (KERN_FAILURE); 3249 } 3250 } 3251 SLIST_INIT(&free); 3252 if ((oldl2 & PTE_RWX) != 0) 3253 (void)pmap_remove_l2(pmap, l2, va, 3254 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3255 else 3256 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3257 l3 = pmap_l2_to_l3(l2, sva); 3258 if ((pmap_load(l3) & PTE_V) != 0 && 3259 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3260 lockp) != 0) 3261 break; 3262 } 3263 vm_page_free_pages_toq(&free, true); 3264 if (va >= VM_MAXUSER_ADDRESS) { 3265 /* 3266 * Both pmap_remove_l2() and pmap_remove_l3() will 3267 * leave the kernel page table page zero filled. 3268 */ 3269 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 3270 if (pmap_insert_pt_page(pmap, mt, false, false)) 3271 panic("pmap_enter_l2: trie insert failed"); 3272 } else 3273 KASSERT(pmap_load(l2) == 0, 3274 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3275 } 3276 3277 /* 3278 * Allocate leaf ptpage for wired userspace pages. 
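 * Preallocating the L3 page here means a later demotion of this wired
 * 2MB mapping cannot fail for lack of memory; pmap_demote_l2_locked()
 * relies on a wired superpage always having its page table page on
 * hand.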
3279 */ 3280 uwptpg = NULL; 3281 if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) { 3282 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3283 if (uwptpg == NULL) { 3284 return (KERN_RESOURCE_SHORTAGE); 3285 } 3286 uwptpg->pindex = pmap_l2_pindex(va); 3287 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 3288 vm_page_unwire_noq(uwptpg); 3289 vm_page_free(uwptpg); 3290 return (KERN_RESOURCE_SHORTAGE); 3291 } 3292 pmap_resident_count_inc(pmap, 1); 3293 uwptpg->ref_count = Ln_ENTRIES; 3294 } 3295 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3296 /* 3297 * Abort this mapping if its PV entry could not be created. 3298 */ 3299 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3300 SLIST_INIT(&free); 3301 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3302 /* 3303 * Although "va" is not mapped, paging-structure 3304 * caches could nonetheless have entries that 3305 * refer to the freed page table pages. 3306 * Invalidate those entries. 3307 */ 3308 pmap_invalidate_page(pmap, va); 3309 vm_page_free_pages_toq(&free, true); 3310 } 3311 if (uwptpg != NULL) { 3312 mt = pmap_remove_pt_page(pmap, va); 3313 KASSERT(mt == uwptpg, 3314 ("removed pt page %p, expected %p", mt, 3315 uwptpg)); 3316 pmap_resident_count_dec(pmap, 1); 3317 uwptpg->ref_count = 1; 3318 vm_page_unwire_noq(uwptpg); 3319 vm_page_free(uwptpg); 3320 } 3321 CTR2(KTR_PMAP, 3322 "pmap_enter_l2: failed to create PV entry" 3323 " for va %#lx in pmap %p", va, pmap); 3324 return (KERN_RESOURCE_SHORTAGE); 3325 } 3326 if ((new_l2 & PTE_W) != 0) 3327 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3328 vm_page_aflag_set(mt, PGA_WRITEABLE); 3329 } 3330 3331 /* 3332 * Increment counters. 3333 */ 3334 if ((new_l2 & PTE_SW_WIRED) != 0) 3335 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3336 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3337 3338 /* 3339 * Map the superpage. 3340 */ 3341 pmap_store(l2, new_l2); 3342 3343 atomic_add_long(&pmap_l2_mappings, 1); 3344 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3345 va, pmap); 3346 3347 return (KERN_SUCCESS); 3348 } 3349 3350 /* 3351 * Maps a sequence of resident pages belonging to the same object. 3352 * The sequence begins with the given page m_start. This page is 3353 * mapped at the given virtual address start. Each subsequent page is 3354 * mapped at a virtual address that is offset from start by the same 3355 * amount as the page is offset from m_start within the object. The 3356 * last page in the sequence is the page with the largest offset from 3357 * m_start that can be mapped at a virtual address less than the given 3358 * virtual address end. Not every virtual page between start and end 3359 * is mapped; only those for which a resident page exists with the 3360 * corresponding offset from m_start are mapped. 
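 *
 * As implemented below, when a candidate address is 2MB-aligned, at
 * least L2_SIZE remains before "end", the page has psind == 1, and
 * superpages are enabled, the 2MB path pmap_enter_2mpage() is tried
 * first; all other pages go through pmap_enter_quick_locked().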
3361 */ 3362 void 3363 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3364 vm_page_t m_start, vm_prot_t prot) 3365 { 3366 struct rwlock *lock; 3367 vm_offset_t va; 3368 vm_page_t m, mpte; 3369 vm_pindex_t diff, psize; 3370 int rv; 3371 3372 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3373 3374 psize = atop(end - start); 3375 mpte = NULL; 3376 m = m_start; 3377 lock = NULL; 3378 rw_rlock(&pvh_global_lock); 3379 PMAP_LOCK(pmap); 3380 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3381 va = start + ptoa(diff); 3382 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3383 m->psind == 1 && pmap_ps_enabled(pmap) && 3384 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 3385 KERN_SUCCESS || rv == KERN_NO_SPACE)) 3386 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3387 else 3388 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3389 &lock); 3390 m = TAILQ_NEXT(m, listq); 3391 } 3392 if (lock != NULL) 3393 rw_wunlock(lock); 3394 rw_runlock(&pvh_global_lock); 3395 PMAP_UNLOCK(pmap); 3396 } 3397 3398 /* 3399 * this code makes some *MAJOR* assumptions: 3400 * 1. Current pmap & pmap exists. 3401 * 2. Not wired. 3402 * 3. Read access. 3403 * 4. No page table pages. 3404 * but is *MUCH* faster than pmap_enter... 3405 */ 3406 3407 void 3408 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3409 { 3410 struct rwlock *lock; 3411 3412 lock = NULL; 3413 rw_rlock(&pvh_global_lock); 3414 PMAP_LOCK(pmap); 3415 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3416 if (lock != NULL) 3417 rw_wunlock(lock); 3418 rw_runlock(&pvh_global_lock); 3419 PMAP_UNLOCK(pmap); 3420 } 3421 3422 static vm_page_t 3423 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3424 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3425 { 3426 struct spglist free; 3427 vm_paddr_t phys; 3428 pd_entry_t *l2; 3429 pt_entry_t *l3, newl3; 3430 3431 KASSERT(!VA_IS_CLEANMAP(va) || 3432 (m->oflags & VPO_UNMANAGED) != 0, 3433 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3434 rw_assert(&pvh_global_lock, RA_LOCKED); 3435 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3436 l2 = NULL; 3437 3438 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3439 /* 3440 * In the case that a page table page is not 3441 * resident, we are creating it here. 3442 */ 3443 if (va < VM_MAXUSER_ADDRESS) { 3444 vm_pindex_t l2pindex; 3445 3446 /* 3447 * Calculate pagetable page index 3448 */ 3449 l2pindex = pmap_l2_pindex(va); 3450 if (mpte && (mpte->pindex == l2pindex)) { 3451 mpte->ref_count++; 3452 } else { 3453 /* 3454 * Get the l2 entry 3455 */ 3456 l2 = pmap_l2(pmap, va); 3457 3458 /* 3459 * If the page table page is mapped, we just increment 3460 * the hold count, and activate it. Otherwise, we 3461 * attempt to allocate a page table page. If this 3462 * attempt fails, we don't retry. Instead, we give up. 3463 */ 3464 if (l2 != NULL && pmap_load(l2) != 0) { 3465 if ((pmap_load(l2) & PTE_RWX) != 0) 3466 return (NULL); 3467 phys = PTE_TO_PHYS(pmap_load(l2)); 3468 mpte = PHYS_TO_VM_PAGE(phys); 3469 mpte->ref_count++; 3470 } else { 3471 /* 3472 * Pass NULL instead of the PV list lock 3473 * pointer, because we don't intend to sleep. 
3474 */ 3475 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3476 if (mpte == NULL) 3477 return (mpte); 3478 } 3479 } 3480 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3481 l3 = &l3[pmap_l3_index(va)]; 3482 } else { 3483 mpte = NULL; 3484 l3 = pmap_l3(kernel_pmap, va); 3485 } 3486 if (l3 == NULL) 3487 panic("pmap_enter_quick_locked: No l3"); 3488 if (pmap_load(l3) != 0) { 3489 if (mpte != NULL) 3490 mpte->ref_count--; 3491 return (NULL); 3492 } 3493 3494 /* 3495 * Enter on the PV list if part of our managed memory. 3496 */ 3497 if ((m->oflags & VPO_UNMANAGED) == 0 && 3498 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3499 if (mpte != NULL) { 3500 SLIST_INIT(&free); 3501 if (pmap_unwire_ptp(pmap, va, mpte, &free)) 3502 vm_page_free_pages_toq(&free, false); 3503 } 3504 return (NULL); 3505 } 3506 3507 /* 3508 * Increment counters 3509 */ 3510 pmap_resident_count_inc(pmap, 1); 3511 3512 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3513 PTE_V | PTE_R; 3514 if ((prot & VM_PROT_EXECUTE) != 0) 3515 newl3 |= PTE_X; 3516 if ((m->oflags & VPO_UNMANAGED) == 0) 3517 newl3 |= PTE_SW_MANAGED; 3518 if (va < VM_MAX_USER_ADDRESS) 3519 newl3 |= PTE_U; 3520 3521 /* 3522 * Sync the i-cache on all harts before updating the PTE 3523 * if the new PTE is executable. 3524 */ 3525 if (prot & VM_PROT_EXECUTE) 3526 pmap_sync_icache(pmap, va, PAGE_SIZE); 3527 3528 pmap_store(l3, newl3); 3529 3530 #if VM_NRESERVLEVEL > 0 3531 /* 3532 * If both the PTP and the reservation are fully populated, then attempt 3533 * promotion. 3534 */ 3535 if ((mpte == NULL || mpte->ref_count == Ln_ENTRIES) && 3536 (m->flags & PG_FICTITIOUS) == 0 && 3537 vm_reserv_level_iffullpop(m) == 0) { 3538 if (l2 == NULL) 3539 l2 = pmap_l2(pmap, va); 3540 3541 /* 3542 * If promotion succeeds, then the next call to this function 3543 * should not be given the unmapped PTP as a hint. 3544 */ 3545 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 3546 mpte = NULL; 3547 } 3548 #endif 3549 3550 return (mpte); 3551 } 3552 3553 /* 3554 * This code maps large physical mmap regions into the 3555 * processor address space. Note that some shortcuts 3556 * are taken, but the code works. 3557 */ 3558 void 3559 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3560 vm_pindex_t pindex, vm_size_t size) 3561 { 3562 3563 VM_OBJECT_ASSERT_WLOCKED(object); 3564 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3565 ("pmap_object_init_pt: non-device object")); 3566 } 3567 3568 /* 3569 * Clear the wired attribute from the mappings for the specified range of 3570 * addresses in the given pmap. Every valid mapping within that range 3571 * must have the wired attribute set. In contrast, invalid mappings 3572 * cannot have the wired attribute set, so they are ignored. 3573 * 3574 * The wired attribute of the page table entry is not a hardware feature, 3575 * so there is no need to invalidate any TLB entries. 
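 *
 * PTE_SW_WIRED lives in the software-reserved (RSW) bits of the PTE,
 * which the hardware page-table walker ignores; clearing it changes
 * only the pmap's bookkeeping (pm_stats.wired_count), never the
 * translation itself.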
3576 */ 3577 void 3578 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3579 { 3580 vm_offset_t va_next; 3581 pd_entry_t *l0, *l1, *l2, l2e; 3582 pt_entry_t *l3, l3e; 3583 bool pv_lists_locked; 3584 3585 pv_lists_locked = false; 3586 retry: 3587 PMAP_LOCK(pmap); 3588 for (; sva < eva; sva = va_next) { 3589 if (pmap_mode == PMAP_MODE_SV48) { 3590 l0 = pmap_l0(pmap, sva); 3591 if (pmap_load(l0) == 0) { 3592 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3593 if (va_next < sva) 3594 va_next = eva; 3595 continue; 3596 } 3597 l1 = pmap_l0_to_l1(l0, sva); 3598 } else { 3599 l1 = pmap_l1(pmap, sva); 3600 } 3601 3602 if (pmap_load(l1) == 0) { 3603 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3604 if (va_next < sva) 3605 va_next = eva; 3606 continue; 3607 } 3608 3609 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3610 if (va_next < sva) 3611 va_next = eva; 3612 3613 l2 = pmap_l1_to_l2(l1, sva); 3614 if ((l2e = pmap_load(l2)) == 0) 3615 continue; 3616 if ((l2e & PTE_RWX) != 0) { 3617 if (sva + L2_SIZE == va_next && eva >= va_next) { 3618 if ((l2e & PTE_SW_WIRED) == 0) 3619 panic("pmap_unwire: l2 %#jx is missing " 3620 "PTE_SW_WIRED", (uintmax_t)l2e); 3621 pmap_clear_bits(l2, PTE_SW_WIRED); 3622 continue; 3623 } else { 3624 if (!pv_lists_locked) { 3625 pv_lists_locked = true; 3626 if (!rw_try_rlock(&pvh_global_lock)) { 3627 PMAP_UNLOCK(pmap); 3628 rw_rlock(&pvh_global_lock); 3629 /* Repeat sva. */ 3630 goto retry; 3631 } 3632 } 3633 if (!pmap_demote_l2(pmap, l2, sva)) 3634 panic("pmap_unwire: demotion failed"); 3635 } 3636 } 3637 3638 if (va_next > eva) 3639 va_next = eva; 3640 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3641 sva += L3_SIZE) { 3642 if ((l3e = pmap_load(l3)) == 0) 3643 continue; 3644 if ((l3e & PTE_SW_WIRED) == 0) 3645 panic("pmap_unwire: l3 %#jx is missing " 3646 "PTE_SW_WIRED", (uintmax_t)l3e); 3647 3648 /* 3649 * PG_W must be cleared atomically. Although the pmap 3650 * lock synchronizes access to PG_W, another processor 3651 * could be setting PG_M and/or PG_A concurrently. 3652 */ 3653 pmap_clear_bits(l3, PTE_SW_WIRED); 3654 pmap->pm_stats.wired_count--; 3655 } 3656 } 3657 if (pv_lists_locked) 3658 rw_runlock(&pvh_global_lock); 3659 PMAP_UNLOCK(pmap); 3660 } 3661 3662 /* 3663 * Copy the range specified by src_addr/len 3664 * from the source map to the range dst_addr/len 3665 * in the destination map. 3666 * 3667 * This routine is only advisory and need not do anything. 3668 */ 3669 3670 void 3671 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3672 vm_offset_t src_addr) 3673 { 3674 3675 } 3676 3677 /* 3678 * pmap_zero_page zeros the specified hardware page by mapping 3679 * the page into KVM and using bzero to clear its contents. 3680 */ 3681 void 3682 pmap_zero_page(vm_page_t m) 3683 { 3684 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3685 3686 pagezero((void *)va); 3687 } 3688 3689 /* 3690 * pmap_zero_page_area zeros the specified hardware page by mapping 3691 * the page into KVM and using bzero to clear its contents. 3692 * 3693 * off and size may not cover an area beyond a single hardware page. 
3694 */ 3695 void 3696 pmap_zero_page_area(vm_page_t m, int off, int size) 3697 { 3698 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3699 3700 if (off == 0 && size == PAGE_SIZE) 3701 pagezero((void *)va); 3702 else 3703 bzero((char *)va + off, size); 3704 } 3705 3706 /* 3707 * pmap_copy_page copies the specified (machine independent) 3708 * page by mapping the page into virtual memory and using 3709 * bcopy to copy the page, one machine dependent page at a 3710 * time. 3711 */ 3712 void 3713 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3714 { 3715 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3716 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3717 3718 pagecopy((void *)src, (void *)dst); 3719 } 3720 3721 int unmapped_buf_allowed = 1; 3722 3723 void 3724 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3725 vm_offset_t b_offset, int xfersize) 3726 { 3727 void *a_cp, *b_cp; 3728 vm_page_t m_a, m_b; 3729 vm_paddr_t p_a, p_b; 3730 vm_offset_t a_pg_offset, b_pg_offset; 3731 int cnt; 3732 3733 while (xfersize > 0) { 3734 a_pg_offset = a_offset & PAGE_MASK; 3735 m_a = ma[a_offset >> PAGE_SHIFT]; 3736 p_a = m_a->phys_addr; 3737 b_pg_offset = b_offset & PAGE_MASK; 3738 m_b = mb[b_offset >> PAGE_SHIFT]; 3739 p_b = m_b->phys_addr; 3740 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3741 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3742 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3743 panic("!DMAP a %lx", p_a); 3744 } else { 3745 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3746 } 3747 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3748 panic("!DMAP b %lx", p_b); 3749 } else { 3750 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3751 } 3752 bcopy(a_cp, b_cp, cnt); 3753 a_offset += cnt; 3754 b_offset += cnt; 3755 xfersize -= cnt; 3756 } 3757 } 3758 3759 vm_offset_t 3760 pmap_quick_enter_page(vm_page_t m) 3761 { 3762 3763 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3764 } 3765 3766 void 3767 pmap_quick_remove_page(vm_offset_t addr) 3768 { 3769 } 3770 3771 /* 3772 * Returns true if the pmap's pv is one of the first 3773 * 16 pvs linked to from this page. This count may 3774 * be changed upwards or downwards in the future; it 3775 * is only necessary that true be returned for a small 3776 * subset of pmaps for proper page aging. 3777 */ 3778 bool 3779 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3780 { 3781 struct md_page *pvh; 3782 struct rwlock *lock; 3783 pv_entry_t pv; 3784 int loops = 0; 3785 bool rv; 3786 3787 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3788 ("pmap_page_exists_quick: page %p is not managed", m)); 3789 rv = false; 3790 rw_rlock(&pvh_global_lock); 3791 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3792 rw_rlock(lock); 3793 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3794 if (PV_PMAP(pv) == pmap) { 3795 rv = true; 3796 break; 3797 } 3798 loops++; 3799 if (loops >= 16) 3800 break; 3801 } 3802 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3803 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3804 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3805 if (PV_PMAP(pv) == pmap) { 3806 rv = true; 3807 break; 3808 } 3809 loops++; 3810 if (loops >= 16) 3811 break; 3812 } 3813 } 3814 rw_runlock(lock); 3815 rw_runlock(&pvh_global_lock); 3816 return (rv); 3817 } 3818 3819 /* 3820 * pmap_page_wired_mappings: 3821 * 3822 * Return the number of managed mappings to the given physical page 3823 * that are wired. 
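 *
 * The loops below use a try-lock protocol: if a pmap lock cannot be
 * taken without sleeping while the pv list lock is held, the pv list
 * lock is dropped, the pmap lock is acquired, and the generation
 * counts (pv_gen) are compared to decide whether the scan must
 * restart.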
3824 */ 3825 int 3826 pmap_page_wired_mappings(vm_page_t m) 3827 { 3828 struct md_page *pvh; 3829 struct rwlock *lock; 3830 pmap_t pmap; 3831 pd_entry_t *l2; 3832 pt_entry_t *l3; 3833 pv_entry_t pv; 3834 int count, md_gen, pvh_gen; 3835 3836 if ((m->oflags & VPO_UNMANAGED) != 0) 3837 return (0); 3838 rw_rlock(&pvh_global_lock); 3839 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3840 rw_rlock(lock); 3841 restart: 3842 count = 0; 3843 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3844 pmap = PV_PMAP(pv); 3845 if (!PMAP_TRYLOCK(pmap)) { 3846 md_gen = m->md.pv_gen; 3847 rw_runlock(lock); 3848 PMAP_LOCK(pmap); 3849 rw_rlock(lock); 3850 if (md_gen != m->md.pv_gen) { 3851 PMAP_UNLOCK(pmap); 3852 goto restart; 3853 } 3854 } 3855 l2 = pmap_l2(pmap, pv->pv_va); 3856 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3857 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3858 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3859 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3860 count++; 3861 PMAP_UNLOCK(pmap); 3862 } 3863 if ((m->flags & PG_FICTITIOUS) == 0) { 3864 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3865 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3866 pmap = PV_PMAP(pv); 3867 if (!PMAP_TRYLOCK(pmap)) { 3868 md_gen = m->md.pv_gen; 3869 pvh_gen = pvh->pv_gen; 3870 rw_runlock(lock); 3871 PMAP_LOCK(pmap); 3872 rw_rlock(lock); 3873 if (md_gen != m->md.pv_gen || 3874 pvh_gen != pvh->pv_gen) { 3875 PMAP_UNLOCK(pmap); 3876 goto restart; 3877 } 3878 } 3879 l2 = pmap_l2(pmap, pv->pv_va); 3880 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3881 count++; 3882 PMAP_UNLOCK(pmap); 3883 } 3884 } 3885 rw_runlock(lock); 3886 rw_runlock(&pvh_global_lock); 3887 return (count); 3888 } 3889 3890 /* 3891 * Returns true if the given page is mapped individually or as part of 3892 * a 2mpage. Otherwise, returns false. 3893 */ 3894 bool 3895 pmap_page_is_mapped(vm_page_t m) 3896 { 3897 struct rwlock *lock; 3898 bool rv; 3899 3900 if ((m->oflags & VPO_UNMANAGED) != 0) 3901 return (false); 3902 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3903 rw_rlock(lock); 3904 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3905 ((m->flags & PG_FICTITIOUS) == 0 && 3906 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3907 rw_runlock(lock); 3908 return (rv); 3909 } 3910 3911 static void 3912 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3913 struct spglist *free, bool superpage) 3914 { 3915 struct md_page *pvh; 3916 vm_page_t mpte, mt; 3917 3918 if (superpage) { 3919 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3920 pvh = pa_to_pvh(m->phys_addr); 3921 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3922 pvh->pv_gen++; 3923 if (TAILQ_EMPTY(&pvh->pv_list)) { 3924 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3925 if (TAILQ_EMPTY(&mt->md.pv_list) && 3926 (mt->a.flags & PGA_WRITEABLE) != 0) 3927 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3928 } 3929 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3930 if (mpte != NULL) { 3931 KASSERT(vm_page_any_valid(mpte), 3932 ("pmap_remove_pages: pte page not promoted")); 3933 pmap_resident_count_dec(pmap, 1); 3934 KASSERT(mpte->ref_count == Ln_ENTRIES, 3935 ("pmap_remove_pages: pte page ref count error")); 3936 mpte->ref_count = 0; 3937 pmap_add_delayed_free_list(mpte, free, false); 3938 } 3939 } else { 3940 pmap_resident_count_dec(pmap, 1); 3941 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3942 m->md.pv_gen++; 3943 if (TAILQ_EMPTY(&m->md.pv_list) && 3944 (m->a.flags & PGA_WRITEABLE) != 0) { 3945 pvh = pa_to_pvh(m->phys_addr); 3946 if (TAILQ_EMPTY(&pvh->pv_list)) 3947 vm_page_aflag_clear(m, PGA_WRITEABLE); 3948 } 3949 } 3950 } 3951 3952 /* 3953 * Destroy all 
managed, non-wired mappings in the given user-space 3954 * pmap. This pmap cannot be active on any processor besides the 3955 * caller. 3956 * 3957 * This function cannot be applied to the kernel pmap. Moreover, it 3958 * is not intended for general use. It is only to be used during 3959 * process termination. Consequently, it can be implemented in ways 3960 * that make it faster than pmap_remove(). First, it can more quickly 3961 * destroy mappings by iterating over the pmap's collection of PV 3962 * entries, rather than searching the page table. Second, it doesn't 3963 * have to test and clear the page table entries atomically, because 3964 * no processor is currently accessing the user address space. In 3965 * particular, a page table entry's dirty bit won't change state once 3966 * this function starts. 3967 */ 3968 void 3969 pmap_remove_pages(pmap_t pmap) 3970 { 3971 struct spglist free; 3972 pd_entry_t ptepde; 3973 pt_entry_t *pte, tpte; 3974 vm_page_t m, mt; 3975 pv_entry_t pv; 3976 struct pv_chunk *pc, *npc; 3977 struct rwlock *lock; 3978 int64_t bit; 3979 uint64_t inuse, bitmask; 3980 int allfree, field, freed __pv_stat_used, idx; 3981 bool superpage; 3982 3983 lock = NULL; 3984 3985 SLIST_INIT(&free); 3986 rw_rlock(&pvh_global_lock); 3987 PMAP_LOCK(pmap); 3988 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3989 allfree = 1; 3990 freed = 0; 3991 for (field = 0; field < _NPCM; field++) { 3992 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3993 while (inuse != 0) { 3994 bit = ffsl(inuse) - 1; 3995 bitmask = 1UL << bit; 3996 idx = field * 64 + bit; 3997 pv = &pc->pc_pventry[idx]; 3998 inuse &= ~bitmask; 3999 4000 pte = pmap_l1(pmap, pv->pv_va); 4001 ptepde = pmap_load(pte); 4002 pte = pmap_l1_to_l2(pte, pv->pv_va); 4003 tpte = pmap_load(pte); 4004 4005 KASSERT((tpte & PTE_V) != 0, 4006 ("L2 PTE is invalid... bogus PV entry? " 4007 "va=%#lx, pte=%#lx", pv->pv_va, tpte)); 4008 if ((tpte & PTE_RWX) != 0) { 4009 superpage = true; 4010 } else { 4011 ptepde = tpte; 4012 pte = pmap_l2_to_l3(pte, pv->pv_va); 4013 tpte = pmap_load(pte); 4014 superpage = false; 4015 } 4016 4017 /* 4018 * We cannot remove wired pages from a 4019 * process' mapping at this time. 4020 */ 4021 if (tpte & PTE_SW_WIRED) { 4022 allfree = 0; 4023 continue; 4024 } 4025 4026 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 4027 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4028 m < &vm_page_array[vm_page_array_size], 4029 ("pmap_remove_pages: bad pte %#jx", 4030 (uintmax_t)tpte)); 4031 4032 pmap_clear(pte); 4033 4034 /* 4035 * Update the vm_page_t clean/reference bits. 
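 * A mapping is considered dirty only when both PTE_D and PTE_W are
 * set; for a superpage mapping, every constituent 4KB page is dirtied.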
4036 */ 4037 if ((tpte & (PTE_D | PTE_W)) == 4038 (PTE_D | PTE_W)) { 4039 if (superpage) 4040 for (mt = m; 4041 mt < &m[Ln_ENTRIES]; mt++) 4042 vm_page_dirty(mt); 4043 else 4044 vm_page_dirty(m); 4045 } 4046 4047 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4048 4049 /* Mark free */ 4050 pc->pc_map[field] |= bitmask; 4051 4052 pmap_remove_pages_pv(pmap, m, pv, &free, 4053 superpage); 4054 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 4055 freed++; 4056 } 4057 } 4058 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4059 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4060 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4061 if (allfree) { 4062 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4063 free_pv_chunk(pc); 4064 } 4065 } 4066 if (lock != NULL) 4067 rw_wunlock(lock); 4068 pmap_invalidate_all(pmap); 4069 rw_runlock(&pvh_global_lock); 4070 PMAP_UNLOCK(pmap); 4071 vm_page_free_pages_toq(&free, false); 4072 } 4073 4074 static bool 4075 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 4076 { 4077 struct md_page *pvh; 4078 struct rwlock *lock; 4079 pd_entry_t *l2; 4080 pt_entry_t *l3, mask; 4081 pv_entry_t pv; 4082 pmap_t pmap; 4083 int md_gen, pvh_gen; 4084 bool rv; 4085 4086 mask = 0; 4087 if (modified) 4088 mask |= PTE_D; 4089 if (accessed) 4090 mask |= PTE_A; 4091 4092 rv = false; 4093 rw_rlock(&pvh_global_lock); 4094 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4095 rw_rlock(lock); 4096 restart: 4097 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4098 pmap = PV_PMAP(pv); 4099 if (!PMAP_TRYLOCK(pmap)) { 4100 md_gen = m->md.pv_gen; 4101 rw_runlock(lock); 4102 PMAP_LOCK(pmap); 4103 rw_rlock(lock); 4104 if (md_gen != m->md.pv_gen) { 4105 PMAP_UNLOCK(pmap); 4106 goto restart; 4107 } 4108 } 4109 l2 = pmap_l2(pmap, pv->pv_va); 4110 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4111 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4112 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4113 rv = (pmap_load(l3) & mask) == mask; 4114 PMAP_UNLOCK(pmap); 4115 if (rv) 4116 goto out; 4117 } 4118 if ((m->flags & PG_FICTITIOUS) == 0) { 4119 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4120 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4121 pmap = PV_PMAP(pv); 4122 if (!PMAP_TRYLOCK(pmap)) { 4123 md_gen = m->md.pv_gen; 4124 pvh_gen = pvh->pv_gen; 4125 rw_runlock(lock); 4126 PMAP_LOCK(pmap); 4127 rw_rlock(lock); 4128 if (md_gen != m->md.pv_gen || 4129 pvh_gen != pvh->pv_gen) { 4130 PMAP_UNLOCK(pmap); 4131 goto restart; 4132 } 4133 } 4134 l2 = pmap_l2(pmap, pv->pv_va); 4135 rv = (pmap_load(l2) & mask) == mask; 4136 PMAP_UNLOCK(pmap); 4137 if (rv) 4138 goto out; 4139 } 4140 } 4141 out: 4142 rw_runlock(lock); 4143 rw_runlock(&pvh_global_lock); 4144 return (rv); 4145 } 4146 4147 /* 4148 * pmap_is_modified: 4149 * 4150 * Return whether or not the specified physical page was modified 4151 * in any physical maps. 4152 */ 4153 bool 4154 pmap_is_modified(vm_page_t m) 4155 { 4156 4157 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4158 ("pmap_is_modified: page %p is not managed", m)); 4159 4160 /* 4161 * If the page is not busied then this check is racy. 4162 */ 4163 if (!pmap_page_is_write_mapped(m)) 4164 return (false); 4165 return (pmap_page_test_mappings(m, false, true)); 4166 } 4167 4168 /* 4169 * pmap_is_prefaultable: 4170 * 4171 * Return whether or not the specified virtual address is eligible 4172 * for prefault. 
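 *
 * An address is eligible when its L3 page table page is already
 * allocated but the L3 entry itself is still invalid, so a mapping can
 * be installed without allocating additional page table pages.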
4173 */ 4174 bool 4175 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4176 { 4177 pt_entry_t *l3; 4178 bool rv; 4179 4180 /* 4181 * Return true if and only if the L3 entry for the specified virtual 4182 * address is allocated but invalid. 4183 */ 4184 rv = false; 4185 PMAP_LOCK(pmap); 4186 l3 = pmap_l3(pmap, addr); 4187 if (l3 != NULL && pmap_load(l3) == 0) { 4188 rv = true; 4189 } 4190 PMAP_UNLOCK(pmap); 4191 return (rv); 4192 } 4193 4194 /* 4195 * pmap_is_referenced: 4196 * 4197 * Return whether or not the specified physical page was referenced 4198 * in any physical maps. 4199 */ 4200 bool 4201 pmap_is_referenced(vm_page_t m) 4202 { 4203 4204 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4205 ("pmap_is_referenced: page %p is not managed", m)); 4206 return (pmap_page_test_mappings(m, true, false)); 4207 } 4208 4209 /* 4210 * Clear the write and modified bits in each of the given page's mappings. 4211 */ 4212 void 4213 pmap_remove_write(vm_page_t m) 4214 { 4215 struct md_page *pvh; 4216 struct rwlock *lock; 4217 pmap_t pmap; 4218 pd_entry_t *l2; 4219 pt_entry_t *l3, oldl3, newl3; 4220 pv_entry_t next_pv, pv; 4221 vm_offset_t va; 4222 int md_gen, pvh_gen; 4223 4224 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4225 ("pmap_remove_write: page %p is not managed", m)); 4226 vm_page_assert_busied(m); 4227 4228 if (!pmap_page_is_write_mapped(m)) 4229 return; 4230 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4231 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4232 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4233 rw_rlock(&pvh_global_lock); 4234 retry_pv_loop: 4235 rw_wlock(lock); 4236 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4237 pmap = PV_PMAP(pv); 4238 if (!PMAP_TRYLOCK(pmap)) { 4239 pvh_gen = pvh->pv_gen; 4240 rw_wunlock(lock); 4241 PMAP_LOCK(pmap); 4242 rw_wlock(lock); 4243 if (pvh_gen != pvh->pv_gen) { 4244 PMAP_UNLOCK(pmap); 4245 rw_wunlock(lock); 4246 goto retry_pv_loop; 4247 } 4248 } 4249 va = pv->pv_va; 4250 l2 = pmap_l2(pmap, va); 4251 if ((pmap_load(l2) & PTE_W) != 0) 4252 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 4253 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4254 ("inconsistent pv lock %p %p for page %p", 4255 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4256 PMAP_UNLOCK(pmap); 4257 } 4258 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4259 pmap = PV_PMAP(pv); 4260 if (!PMAP_TRYLOCK(pmap)) { 4261 pvh_gen = pvh->pv_gen; 4262 md_gen = m->md.pv_gen; 4263 rw_wunlock(lock); 4264 PMAP_LOCK(pmap); 4265 rw_wlock(lock); 4266 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4267 PMAP_UNLOCK(pmap); 4268 rw_wunlock(lock); 4269 goto retry_pv_loop; 4270 } 4271 } 4272 l2 = pmap_l2(pmap, pv->pv_va); 4273 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4274 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4275 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4276 oldl3 = pmap_load(l3); 4277 retry: 4278 if ((oldl3 & PTE_W) != 0) { 4279 newl3 = oldl3 & ~(PTE_D | PTE_W); 4280 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 4281 goto retry; 4282 if ((oldl3 & PTE_D) != 0) 4283 vm_page_dirty(m); 4284 pmap_invalidate_page(pmap, pv->pv_va); 4285 } 4286 PMAP_UNLOCK(pmap); 4287 } 4288 rw_wunlock(lock); 4289 vm_page_aflag_clear(m, PGA_WRITEABLE); 4290 rw_runlock(&pvh_global_lock); 4291 } 4292 4293 /* 4294 * pmap_ts_referenced: 4295 * 4296 * Return a count of reference bits for a page, clearing those bits. 4297 * It is not necessary for every reference bit to be cleared, but it 4298 * is necessary that 0 only be returned when there are truly no 4299 * reference bits set. 
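 *
 * Both 2MB and 4KB mappings of the page are examined, and each pv list
 * is rotated as it is scanned so that successive calls sample a
 * different subset of the page's mappings.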
4300 * 4301 * As an optimization, update the page's dirty field if a modified bit is 4302 * found while counting reference bits. This opportunistic update can be 4303 * performed at low cost and can eliminate the need for some future calls 4304 * to pmap_is_modified(). However, since this function stops after 4305 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4306 * dirty pages. Those dirty pages will only be detected by a future call 4307 * to pmap_is_modified(). 4308 */ 4309 int 4310 pmap_ts_referenced(vm_page_t m) 4311 { 4312 struct spglist free; 4313 struct md_page *pvh; 4314 struct rwlock *lock; 4315 pv_entry_t pv, pvf; 4316 pmap_t pmap; 4317 pd_entry_t *l2, l2e; 4318 pt_entry_t *l3, l3e; 4319 vm_paddr_t pa; 4320 vm_offset_t va; 4321 int cleared, md_gen, not_cleared, pvh_gen; 4322 4323 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4324 ("pmap_ts_referenced: page %p is not managed", m)); 4325 SLIST_INIT(&free); 4326 cleared = 0; 4327 pa = VM_PAGE_TO_PHYS(m); 4328 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 4329 4330 lock = PHYS_TO_PV_LIST_LOCK(pa); 4331 rw_rlock(&pvh_global_lock); 4332 rw_wlock(lock); 4333 retry: 4334 not_cleared = 0; 4335 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4336 goto small_mappings; 4337 pv = pvf; 4338 do { 4339 pmap = PV_PMAP(pv); 4340 if (!PMAP_TRYLOCK(pmap)) { 4341 pvh_gen = pvh->pv_gen; 4342 rw_wunlock(lock); 4343 PMAP_LOCK(pmap); 4344 rw_wlock(lock); 4345 if (pvh_gen != pvh->pv_gen) { 4346 PMAP_UNLOCK(pmap); 4347 goto retry; 4348 } 4349 } 4350 va = pv->pv_va; 4351 l2 = pmap_l2(pmap, va); 4352 l2e = pmap_load(l2); 4353 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4354 /* 4355 * Although l2e is mapping a 2MB page, because 4356 * this function is called at a 4KB page granularity, 4357 * we only update the 4KB page under test. 4358 */ 4359 vm_page_dirty(m); 4360 } 4361 if ((l2e & PTE_A) != 0) { 4362 /* 4363 * Since this reference bit is shared by 512 4KB 4364 * pages, it should not be cleared every time it is 4365 * tested. Apply a simple "hash" function on the 4366 * physical page number, the virtual superpage number, 4367 * and the pmap address to select one 4KB page out of 4368 * the 512 on which testing the reference bit will 4369 * result in clearing that reference bit. This 4370 * function is designed to avoid the selection of the 4371 * same 4KB page for every 2MB page mapping. 4372 * 4373 * On demotion, a mapping that hasn't been referenced 4374 * is simply destroyed. To avoid the possibility of a 4375 * subsequent page fault on a demoted wired mapping, 4376 * always leave its reference bit set. Moreover, 4377 * since the superpage is wired, the current state of 4378 * its reference bit won't affect page replacement. 4379 */ 4380 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4381 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4382 (l2e & PTE_SW_WIRED) == 0) { 4383 pmap_clear_bits(l2, PTE_A); 4384 pmap_invalidate_page(pmap, va); 4385 cleared++; 4386 } else 4387 not_cleared++; 4388 } 4389 PMAP_UNLOCK(pmap); 4390 /* Rotate the PV list if it has more than one entry. 
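 * This way a later call, which may stop early once it has found
 * PMAP_TS_REFERENCED_MAX reference bits, begins with mappings that
 * were not examined by this call.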
*/ 4391 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4392 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4393 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4394 pvh->pv_gen++; 4395 } 4396 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4397 goto out; 4398 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4399 small_mappings: 4400 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4401 goto out; 4402 pv = pvf; 4403 do { 4404 pmap = PV_PMAP(pv); 4405 if (!PMAP_TRYLOCK(pmap)) { 4406 pvh_gen = pvh->pv_gen; 4407 md_gen = m->md.pv_gen; 4408 rw_wunlock(lock); 4409 PMAP_LOCK(pmap); 4410 rw_wlock(lock); 4411 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4412 PMAP_UNLOCK(pmap); 4413 goto retry; 4414 } 4415 } 4416 l2 = pmap_l2(pmap, pv->pv_va); 4417 4418 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4419 ("pmap_ts_referenced: found an invalid l2 table")); 4420 4421 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4422 l3e = pmap_load(l3); 4423 if ((l3e & PTE_D) != 0) 4424 vm_page_dirty(m); 4425 if ((l3e & PTE_A) != 0) { 4426 if ((l3e & PTE_SW_WIRED) == 0) { 4427 /* 4428 * Wired pages cannot be paged out so 4429 * doing accessed bit emulation for 4430 * them is wasted effort. We do the 4431 * hard work for unwired pages only. 4432 */ 4433 pmap_clear_bits(l3, PTE_A); 4434 pmap_invalidate_page(pmap, pv->pv_va); 4435 cleared++; 4436 } else 4437 not_cleared++; 4438 } 4439 PMAP_UNLOCK(pmap); 4440 /* Rotate the PV list if it has more than one entry. */ 4441 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4442 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4443 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4444 m->md.pv_gen++; 4445 } 4446 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4447 not_cleared < PMAP_TS_REFERENCED_MAX); 4448 out: 4449 rw_wunlock(lock); 4450 rw_runlock(&pvh_global_lock); 4451 vm_page_free_pages_toq(&free, false); 4452 return (cleared + not_cleared); 4453 } 4454 4455 /* 4456 * Apply the given advice to the specified range of addresses within the 4457 * given pmap. Depending on the advice, clear the referenced and/or 4458 * modified flags in each mapping and set the mapped page's dirty field. 4459 */ 4460 void 4461 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4462 { 4463 } 4464 4465 /* 4466 * Clear the modify bits on the specified physical page. 4467 */ 4468 void 4469 pmap_clear_modify(vm_page_t m) 4470 { 4471 struct md_page *pvh; 4472 struct rwlock *lock; 4473 pmap_t pmap; 4474 pv_entry_t next_pv, pv; 4475 pd_entry_t *l2, oldl2; 4476 pt_entry_t *l3; 4477 vm_offset_t va; 4478 int md_gen, pvh_gen; 4479 4480 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4481 ("pmap_clear_modify: page %p is not managed", m)); 4482 vm_page_assert_busied(m); 4483 4484 if (!pmap_page_is_write_mapped(m)) 4485 return; 4486 4487 /* 4488 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4489 * If the object containing the page is locked and the page is not 4490 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4491 */ 4492 if ((m->a.flags & PGA_WRITEABLE) == 0) 4493 return; 4494 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 4495 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4496 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4497 rw_rlock(&pvh_global_lock); 4498 rw_wlock(lock); 4499 restart: 4500 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4501 pmap = PV_PMAP(pv); 4502 if (!PMAP_TRYLOCK(pmap)) { 4503 pvh_gen = pvh->pv_gen; 4504 rw_wunlock(lock); 4505 PMAP_LOCK(pmap); 4506 rw_wlock(lock); 4507 if (pvh_gen != pvh->pv_gen) { 4508 PMAP_UNLOCK(pmap); 4509 goto restart; 4510 } 4511 } 4512 va = pv->pv_va; 4513 l2 = pmap_l2(pmap, va); 4514 oldl2 = pmap_load(l2); 4515 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4516 if ((oldl2 & PTE_W) != 0 && 4517 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4518 (oldl2 & PTE_SW_WIRED) == 0) { 4519 /* 4520 * Write protect the mapping to a single page so that 4521 * a subsequent write access may repromote. 4522 */ 4523 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4524 l3 = pmap_l2_to_l3(l2, va); 4525 pmap_clear_bits(l3, PTE_D | PTE_W); 4526 vm_page_dirty(m); 4527 pmap_invalidate_page(pmap, va); 4528 } 4529 PMAP_UNLOCK(pmap); 4530 } 4531 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4532 pmap = PV_PMAP(pv); 4533 if (!PMAP_TRYLOCK(pmap)) { 4534 md_gen = m->md.pv_gen; 4535 pvh_gen = pvh->pv_gen; 4536 rw_wunlock(lock); 4537 PMAP_LOCK(pmap); 4538 rw_wlock(lock); 4539 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4540 PMAP_UNLOCK(pmap); 4541 goto restart; 4542 } 4543 } 4544 l2 = pmap_l2(pmap, pv->pv_va); 4545 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4546 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4547 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4548 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4549 pmap_clear_bits(l3, PTE_D | PTE_W); 4550 pmap_invalidate_page(pmap, pv->pv_va); 4551 } 4552 PMAP_UNLOCK(pmap); 4553 } 4554 rw_wunlock(lock); 4555 rw_runlock(&pvh_global_lock); 4556 } 4557 4558 void * 4559 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4560 { 4561 4562 return ((void *)PHYS_TO_DMAP(pa)); 4563 } 4564 4565 void 4566 pmap_unmapbios(void *p, vm_size_t size) 4567 { 4568 } 4569 4570 /* 4571 * Sets the memory attribute for the specified page. 4572 */ 4573 void 4574 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4575 { 4576 4577 m->md.pv_memattr = ma; 4578 4579 /* 4580 * If "m" is a normal page, update its direct mapping. This update 4581 * can be relied upon to perform any cache operations that are 4582 * required for data coherence. 4583 */ 4584 if ((m->flags & PG_FICTITIOUS) == 0 && 4585 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4586 m->md.pv_memattr) != 0) 4587 panic("memory attribute change on the direct map failed"); 4588 } 4589 4590 /* 4591 * Changes the specified virtual address range's memory type to that given by 4592 * the parameter "mode". The specified virtual address range must be 4593 * completely contained within either the direct map or the kernel map. 4594 * 4595 * Returns zero if the change completed successfully, and either EINVAL or 4596 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4597 * of the virtual address range was not mapped, and ENOMEM is returned if 4598 * there was insufficient memory available to complete the change. In the 4599 * latter case, the memory type may have been changed on some part of the 4600 * virtual address range. 
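 *
 * At present this implementation only verifies that the entire range is
 * mapped; the memory type itself is left unchanged until support for the
 * Svpbmt extension is added (see the TODO comments in
 * pmap_change_attr_locked() below).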
4601 */ 4602 int 4603 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4604 { 4605 int error; 4606 4607 PMAP_LOCK(kernel_pmap); 4608 error = pmap_change_attr_locked(va, size, mode); 4609 PMAP_UNLOCK(kernel_pmap); 4610 return (error); 4611 } 4612 4613 static int 4614 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4615 { 4616 vm_offset_t base, offset, tmpva; 4617 pd_entry_t *l1, l1e; 4618 pd_entry_t *l2, l2e; 4619 pt_entry_t *l3, l3e; 4620 4621 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4622 base = trunc_page(va); 4623 offset = va & PAGE_MASK; 4624 size = round_page(offset + size); 4625 4626 if (!VIRT_IN_DMAP(base) && 4627 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4628 return (EINVAL); 4629 4630 for (tmpva = base; tmpva < base + size; ) { 4631 l1 = pmap_l1(kernel_pmap, tmpva); 4632 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4633 return (EINVAL); 4634 if ((l1e & PTE_RWX) != 0) { 4635 /* 4636 * TODO: Demote if attributes don't match and there 4637 * isn't an L1 page left in the range, and update the 4638 * L1 entry if the attributes don't match but there is 4639 * an L1 page left in the range, once we support the 4640 * upcoming Svpbmt extension. 4641 */ 4642 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4643 continue; 4644 } 4645 l2 = pmap_l1_to_l2(l1, tmpva); 4646 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 4647 return (EINVAL); 4648 if ((l2e & PTE_RWX) != 0) { 4649 /* 4650 * TODO: Demote if attributes don't match and there 4651 * isn't an L2 page left in the range, and update the 4652 * L2 entry if the attributes don't match but there is 4653 * an L2 page left in the range, once we support the 4654 * upcoming Svpbmt extension. 4655 */ 4656 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4657 continue; 4658 } 4659 l3 = pmap_l2_to_l3(l2, tmpva); 4660 if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0) 4661 return (EINVAL); 4662 /* 4663 * TODO: Update the L3 entry if the attributes don't match once 4664 * we support the upcoming Svpbmt extension. 4665 */ 4666 tmpva += PAGE_SIZE; 4667 } 4668 4669 return (0); 4670 } 4671 4672 /* 4673 * Perform the pmap work for mincore(2). If the page is not both referenced and 4674 * modified by this pmap, returns its physical address so that the caller can 4675 * find other mappings. 
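 *
 * The returned value encodes residency, the superpage level, and the
 * referenced/modified state of the mapping; the physical address is
 * reported only for managed mappings.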
4676 */ 4677 int 4678 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4679 { 4680 pt_entry_t *l2, *l3, tpte; 4681 vm_paddr_t pa; 4682 int val; 4683 bool managed; 4684 4685 PMAP_LOCK(pmap); 4686 l2 = pmap_l2(pmap, addr); 4687 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4688 if ((tpte & PTE_RWX) != 0) { 4689 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4690 val = MINCORE_INCORE | MINCORE_PSIND(1); 4691 } else { 4692 l3 = pmap_l2_to_l3(l2, addr); 4693 tpte = pmap_load(l3); 4694 if ((tpte & PTE_V) == 0) { 4695 PMAP_UNLOCK(pmap); 4696 return (0); 4697 } 4698 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4699 val = MINCORE_INCORE; 4700 } 4701 4702 if ((tpte & PTE_D) != 0) 4703 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4704 if ((tpte & PTE_A) != 0) 4705 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4706 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4707 } else { 4708 managed = false; 4709 val = 0; 4710 } 4711 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4712 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4713 *pap = pa; 4714 } 4715 PMAP_UNLOCK(pmap); 4716 return (val); 4717 } 4718 4719 void 4720 pmap_activate_sw(struct thread *td) 4721 { 4722 pmap_t oldpmap, pmap; 4723 u_int hart; 4724 4725 oldpmap = PCPU_GET(curpmap); 4726 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4727 if (pmap == oldpmap) 4728 return; 4729 csr_write(satp, pmap->pm_satp); 4730 4731 hart = PCPU_GET(hart); 4732 #ifdef SMP 4733 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4734 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4735 #else 4736 CPU_SET(hart, &pmap->pm_active); 4737 CPU_CLR(hart, &oldpmap->pm_active); 4738 #endif 4739 PCPU_SET(curpmap, pmap); 4740 4741 sfence_vma(); 4742 } 4743 4744 void 4745 pmap_activate(struct thread *td) 4746 { 4747 4748 critical_enter(); 4749 pmap_activate_sw(td); 4750 critical_exit(); 4751 } 4752 4753 void 4754 pmap_activate_boot(pmap_t pmap) 4755 { 4756 u_int hart; 4757 4758 hart = PCPU_GET(hart); 4759 #ifdef SMP 4760 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4761 #else 4762 CPU_SET(hart, &pmap->pm_active); 4763 #endif 4764 PCPU_SET(curpmap, pmap); 4765 } 4766 4767 void 4768 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 4769 { 4770 *res = pmap->pm_active; 4771 } 4772 4773 void 4774 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4775 { 4776 cpuset_t mask; 4777 4778 /* 4779 * From the RISC-V User-Level ISA V2.2: 4780 * 4781 * "To make a store to instruction memory visible to all 4782 * RISC-V harts, the writing hart has to execute a data FENCE 4783 * before requesting that all remote RISC-V harts execute a 4784 * FENCE.I." 4785 * 4786 * However, this is slightly misleading; we still need to 4787 * perform a FENCE.I for the local hart, as FENCE does nothing 4788 * for its icache. FENCE.I alone is also sufficient for the 4789 * local hart. 4790 */ 4791 sched_pin(); 4792 mask = all_harts; 4793 CPU_CLR(PCPU_GET(hart), &mask); 4794 fence_i(); 4795 if (!CPU_EMPTY(&mask) && smp_started) { 4796 fence(); 4797 sbi_remote_fence_i(mask.__bits); 4798 } 4799 sched_unpin(); 4800 } 4801 4802 /* 4803 * Increase the starting virtual address of the given mapping if a 4804 * different alignment might result in more superpage mappings. 
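 *
 * For requests of at least L2_SIZE, *addr is advanced just far enough
 * that the virtual address and the (color-adjusted) object offset share
 * the same alignment within a 2MB superpage, which allows the interior
 * of the mapping to be promoted.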
4805 */ 4806 void 4807 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4808 vm_offset_t *addr, vm_size_t size) 4809 { 4810 vm_offset_t superpage_offset; 4811 4812 if (size < L2_SIZE) 4813 return; 4814 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4815 offset += ptoa(object->pg_color); 4816 superpage_offset = offset & L2_OFFSET; 4817 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4818 (*addr & L2_OFFSET) == superpage_offset) 4819 return; 4820 if ((*addr & L2_OFFSET) < superpage_offset) 4821 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4822 else 4823 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4824 } 4825 4826 /** 4827 * Get the kernel virtual address of a set of physical pages. If there are 4828 * physical addresses not covered by the DMAP perform a transient mapping 4829 * that will be removed when calling pmap_unmap_io_transient. 4830 * 4831 * \param page The pages the caller wishes to obtain the virtual 4832 * address on the kernel memory map. 4833 * \param vaddr On return contains the kernel virtual memory address 4834 * of the pages passed in the page parameter. 4835 * \param count Number of pages passed in. 4836 * \param can_fault true if the thread using the mapped pages can take 4837 * page faults, false otherwise. 4838 * 4839 * \returns true if the caller must call pmap_unmap_io_transient when 4840 * finished or false otherwise. 4841 * 4842 */ 4843 bool 4844 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4845 bool can_fault) 4846 { 4847 vm_paddr_t paddr; 4848 bool needs_mapping; 4849 int error __diagused, i; 4850 4851 /* 4852 * Allocate any KVA space that we need, this is done in a separate 4853 * loop to prevent calling vmem_alloc while pinned. 4854 */ 4855 needs_mapping = false; 4856 for (i = 0; i < count; i++) { 4857 paddr = VM_PAGE_TO_PHYS(page[i]); 4858 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4859 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4860 M_BESTFIT | M_WAITOK, &vaddr[i]); 4861 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4862 needs_mapping = true; 4863 } else { 4864 vaddr[i] = PHYS_TO_DMAP(paddr); 4865 } 4866 } 4867 4868 /* Exit early if everything is covered by the DMAP */ 4869 if (!needs_mapping) 4870 return (false); 4871 4872 if (!can_fault) 4873 sched_pin(); 4874 for (i = 0; i < count; i++) { 4875 paddr = VM_PAGE_TO_PHYS(page[i]); 4876 if (paddr >= DMAP_MAX_PHYSADDR) { 4877 panic( 4878 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4879 } 4880 } 4881 4882 return (needs_mapping); 4883 } 4884 4885 void 4886 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4887 bool can_fault) 4888 { 4889 vm_paddr_t paddr; 4890 int i; 4891 4892 if (!can_fault) 4893 sched_unpin(); 4894 for (i = 0; i < count; i++) { 4895 paddr = VM_PAGE_TO_PHYS(page[i]); 4896 if (paddr >= DMAP_MAX_PHYSADDR) { 4897 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4898 } 4899 } 4900 } 4901 4902 bool 4903 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4904 { 4905 4906 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4907 } 4908 4909 bool 4910 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4911 pt_entry_t **l3) 4912 { 4913 pd_entry_t *l1p, *l2p; 4914 4915 /* Get l1 directory entry. 
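 * The walk stops early at a leaf entry: for an L1 or L2 superpage
 * mapping the lower-level pointers are set to NULL and true is
 * returned.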
*/ 4916 l1p = pmap_l1(pmap, va); 4917 *l1 = l1p; 4918 4919 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4920 return (false); 4921 4922 if ((pmap_load(l1p) & PTE_RX) != 0) { 4923 *l2 = NULL; 4924 *l3 = NULL; 4925 return (true); 4926 } 4927 4928 /* Get l2 directory entry. */ 4929 l2p = pmap_l1_to_l2(l1p, va); 4930 *l2 = l2p; 4931 4932 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4933 return (false); 4934 4935 if ((pmap_load(l2p) & PTE_RX) != 0) { 4936 *l3 = NULL; 4937 return (true); 4938 } 4939 4940 /* Get l3 page table entry. */ 4941 *l3 = pmap_l2_to_l3(l2p, va); 4942 4943 return (true); 4944 } 4945 4946 /* 4947 * Track a range of the kernel's virtual address space that is contiguous 4948 * in various mapping attributes. 4949 */ 4950 struct pmap_kernel_map_range { 4951 vm_offset_t sva; 4952 pt_entry_t attrs; 4953 int l3pages; 4954 int l2pages; 4955 int l1pages; 4956 }; 4957 4958 static void 4959 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4960 vm_offset_t eva) 4961 { 4962 4963 if (eva <= range->sva) 4964 return; 4965 4966 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4967 range->sva, eva, 4968 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4969 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4970 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4971 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4972 range->l1pages, range->l2pages, range->l3pages); 4973 4974 /* Reset to sentinel value. */ 4975 range->sva = 0xfffffffffffffffful; 4976 } 4977 4978 /* 4979 * Determine whether the attributes specified by a page table entry match those 4980 * being tracked by the current range. 4981 */ 4982 static bool 4983 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4984 { 4985 4986 return (range->attrs == attrs); 4987 } 4988 4989 static void 4990 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4991 pt_entry_t attrs) 4992 { 4993 4994 memset(range, 0, sizeof(*range)); 4995 range->sva = va; 4996 range->attrs = attrs; 4997 } 4998 4999 /* 5000 * Given a leaf PTE, derive the mapping's attributes. If they do not match 5001 * those of the current run, dump the address range and its attributes, and 5002 * begin a new run. 5003 */ 5004 static void 5005 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 5006 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 5007 { 5008 pt_entry_t attrs; 5009 5010 /* The PTE global bit is inherited by lower levels. */ 5011 attrs = l1e & PTE_G; 5012 if ((l1e & PTE_RWX) != 0) 5013 attrs |= l1e & (PTE_RWX | PTE_U); 5014 else if (l2e != 0) 5015 attrs |= l2e & PTE_G; 5016 if ((l2e & PTE_RWX) != 0) 5017 attrs |= l2e & (PTE_RWX | PTE_U); 5018 else if (l3e != 0) 5019 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 5020 5021 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 5022 sysctl_kmaps_dump(sb, range, va); 5023 sysctl_kmaps_reinit(range, va, attrs); 5024 } 5025 } 5026 5027 static int 5028 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 5029 { 5030 struct pmap_kernel_map_range range; 5031 struct sbuf sbuf, *sb; 5032 pd_entry_t l1e, *l2, l2e; 5033 pt_entry_t *l3, l3e; 5034 vm_offset_t sva; 5035 vm_paddr_t pa; 5036 int error, i, j, k; 5037 5038 error = sysctl_wire_old_buffer(req, 0); 5039 if (error != 0) 5040 return (error); 5041 sb = &sbuf; 5042 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 5043 5044 /* Sentinel value. */ 5045 range.sva = 0xfffffffffffffffful; 5046 5047 /* 5048 * Iterate over the kernel page tables without holding the kernel pmap 5049 * lock. 
Kernel page table pages are never freed, so at worst we will 5050 * observe inconsistencies in the output. 5051 */ 5052 sva = VM_MIN_KERNEL_ADDRESS; 5053 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 5054 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 5055 sbuf_printf(sb, "\nDirect map:\n"); 5056 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 5057 sbuf_printf(sb, "\nKernel map:\n"); 5058 5059 l1e = kernel_pmap->pm_top[i]; 5060 if ((l1e & PTE_V) == 0) { 5061 sysctl_kmaps_dump(sb, &range, sva); 5062 sva += L1_SIZE; 5063 continue; 5064 } 5065 if ((l1e & PTE_RWX) != 0) { 5066 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 5067 range.l1pages++; 5068 sva += L1_SIZE; 5069 continue; 5070 } 5071 pa = PTE_TO_PHYS(l1e); 5072 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5073 5074 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 5075 l2e = l2[j]; 5076 if ((l2e & PTE_V) == 0) { 5077 sysctl_kmaps_dump(sb, &range, sva); 5078 sva += L2_SIZE; 5079 continue; 5080 } 5081 if ((l2e & PTE_RWX) != 0) { 5082 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 5083 range.l2pages++; 5084 sva += L2_SIZE; 5085 continue; 5086 } 5087 pa = PTE_TO_PHYS(l2e); 5088 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5089 5090 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 5091 sva += L3_SIZE) { 5092 l3e = l3[k]; 5093 if ((l3e & PTE_V) == 0) { 5094 sysctl_kmaps_dump(sb, &range, sva); 5095 continue; 5096 } 5097 sysctl_kmaps_check(sb, &range, sva, 5098 l1e, l2e, l3e); 5099 range.l3pages++; 5100 } 5101 } 5102 } 5103 5104 error = sbuf_finish(sb); 5105 sbuf_delete(sb); 5106 return (error); 5107 } 5108 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 5109 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 5110 NULL, 0, sysctl_kmaps, "A", 5111 "Dump kernel address layout"); 5112
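
/*
 * The kernel address layout dump produced by the sysctl above can be
 * retrieved from userspace by name, for example:
 *
 *	# sysctl vm.pmap.kernel_maps
 *
 * CTLFLAG_SKIP keeps the node out of the default sysctl listing, so it
 * must be requested explicitly.
 */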