1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 * 64 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 65 */ 66 /*- 67 * Copyright (c) 2003 Networks Associates Technology, Inc. 68 * All rights reserved. 69 * 70 * This software was developed for the FreeBSD Project by Jake Burkholder, 71 * Safeport Network Services, and Network Associates Laboratories, the 72 * Security Research Division of Network Associates, Inc. 
under 73 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 74 * CHATS research program. 75 * 76 * Redistribution and use in source and binary forms, with or without 77 * modification, are permitted provided that the following conditions 78 * are met: 79 * 1. Redistributions of source code must retain the above copyright 80 * notice, this list of conditions and the following disclaimer. 81 * 2. Redistributions in binary form must reproduce the above copyright 82 * notice, this list of conditions and the following disclaimer in the 83 * documentation and/or other materials provided with the distribution. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 * SUCH DAMAGE. 96 */ 97 98 #include <sys/cdefs.h> 99 __FBSDID("$FreeBSD$"); 100 101 /* 102 * Manages physical address maps. 103 * 104 * Since the information managed by this module is 105 * also stored by the logical address mapping module, 106 * this module may throw away valid virtual-to-physical 107 * mappings at almost any time. However, invalidations 108 * of virtual-to-physical mappings must be done as 109 * requested. 110 * 111 * In order to cope with hardware architectures that 112 * make virtual-to-physical map invalidation expensive, 113 * this module may delay invalidation or reduced-protection 114 * operations until they are actually 115 * necessary. This module is given full information as 116 * to which processors are currently using which maps, 117 * and as to when physical maps must be made correct.
118 */ 119 120 #include <sys/param.h> 121 #include <sys/systm.h> 122 #include <sys/bitstring.h> 123 #include <sys/bus.h> 124 #include <sys/cpuset.h> 125 #include <sys/kernel.h> 126 #include <sys/ktr.h> 127 #include <sys/lock.h> 128 #include <sys/malloc.h> 129 #include <sys/mman.h> 130 #include <sys/msgbuf.h> 131 #include <sys/mutex.h> 132 #include <sys/physmem.h> 133 #include <sys/proc.h> 134 #include <sys/rwlock.h> 135 #include <sys/sbuf.h> 136 #include <sys/sx.h> 137 #include <sys/vmem.h> 138 #include <sys/vmmeter.h> 139 #include <sys/sched.h> 140 #include <sys/sysctl.h> 141 #include <sys/smp.h> 142 143 #include <vm/vm.h> 144 #include <vm/vm_param.h> 145 #include <vm/vm_kern.h> 146 #include <vm/vm_page.h> 147 #include <vm/vm_map.h> 148 #include <vm/vm_object.h> 149 #include <vm/vm_extern.h> 150 #include <vm/vm_pageout.h> 151 #include <vm/vm_pager.h> 152 #include <vm/vm_phys.h> 153 #include <vm/vm_radix.h> 154 #include <vm/vm_reserv.h> 155 #include <vm/vm_dumpset.h> 156 #include <vm/uma.h> 157 158 #include <machine/machdep.h> 159 #include <machine/md_var.h> 160 #include <machine/pcb.h> 161 #include <machine/sbi.h> 162 163 #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) 164 #define NUL2E (Ln_ENTRIES * NUL1E) 165 166 #if !defined(DIAGNOSTIC) 167 #ifdef __GNUC_GNU_INLINE__ 168 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 169 #else 170 #define PMAP_INLINE extern inline 171 #endif 172 #else 173 #define PMAP_INLINE 174 #endif 175 176 #ifdef PV_STATS 177 #define PV_STAT(x) do { x ; } while (0) 178 #else 179 #define PV_STAT(x) do { } while (0) 180 #endif 181 182 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 183 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 184 185 #define NPV_LIST_LOCKS MAXCPU 186 187 #define PHYS_TO_PV_LIST_LOCK(pa) \ 188 (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) 189 190 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 191 struct rwlock **_lockp = (lockp); \ 192 struct rwlock *_new_lock; \ 193 \ 194 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 195 if (_new_lock != *_lockp) { \ 196 if (*_lockp != NULL) \ 197 rw_wunlock(*_lockp); \ 198 *_lockp = _new_lock; \ 199 rw_wlock(*_lockp); \ 200 } \ 201 } while (0) 202 203 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 204 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 205 206 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 207 struct rwlock **_lockp = (lockp); \ 208 \ 209 if (*_lockp != NULL) { \ 210 rw_wunlock(*_lockp); \ 211 *_lockp = NULL; \ 212 } \ 213 } while (0) 214 215 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 216 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 217 218 /* The list of all the user pmaps */ 219 LIST_HEAD(pmaplist, pmap); 220 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); 221 222 struct pmap kernel_pmap_store; 223 224 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 225 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 226 vm_offset_t kernel_vm_end = 0; 227 228 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 229 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 230 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 231 232 /* This code assumes all L1 DMAP entries will be used */ 233 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); 234 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); 235 236 static struct rwlock_padalign pvh_global_lock; 237 static struct mtx_padalign allpmaps_lock; 238 239 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 240 "VM/pmap 
parameters"); 241 242 static int superpages_enabled = 1; 243 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 244 CTLFLAG_RDTUN, &superpages_enabled, 0, 245 "Enable support for transparent superpages"); 246 247 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 248 "2MB page mapping counters"); 249 250 static u_long pmap_l2_demotions; 251 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 252 &pmap_l2_demotions, 0, 253 "2MB page demotions"); 254 255 static u_long pmap_l2_mappings; 256 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 257 &pmap_l2_mappings, 0, 258 "2MB page mappings"); 259 260 static u_long pmap_l2_p_failures; 261 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 262 &pmap_l2_p_failures, 0, 263 "2MB page promotion failures"); 264 265 static u_long pmap_l2_promotions; 266 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 267 &pmap_l2_promotions, 0, 268 "2MB page promotions"); 269 270 /* 271 * Data for the pv entry allocation mechanism 272 */ 273 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 274 static struct mtx pv_chunks_mutex; 275 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 276 static struct md_page *pv_table; 277 static struct md_page pv_dummy; 278 279 extern cpuset_t all_harts; 280 281 /* 282 * Internal flags for pmap_enter()'s helper functions. 283 */ 284 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 285 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 286 287 static void free_pv_chunk(struct pv_chunk *pc); 288 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 289 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 290 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 291 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 292 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 293 vm_offset_t va); 294 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); 295 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, 296 vm_offset_t va, struct rwlock **lockp); 297 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 298 u_int flags, vm_page_t m, struct rwlock **lockp); 299 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 300 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 301 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 302 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 303 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 304 vm_page_t m, struct rwlock **lockp); 305 306 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 307 struct rwlock **lockp); 308 309 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 310 struct spglist *free); 311 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 312 313 #define pmap_clear(pte) pmap_store(pte, 0) 314 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) 315 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) 316 #define pmap_load_clear(pte) pmap_load_store(pte, 0) 317 #define pmap_load(pte) atomic_load_64(pte) 318 #define pmap_store(pte, entry) atomic_store_64(pte, entry) 319 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) 320 321 /********************/ 322 /* Inline functions */ 323 /********************/ 324 325 static __inline void 326 pagecopy(void 
*s, void *d) 327 { 328 329 memcpy(d, s, PAGE_SIZE); 330 } 331 332 static __inline void 333 pagezero(void *p) 334 { 335 336 bzero(p, PAGE_SIZE); 337 } 338 339 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) 340 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) 341 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) 342 343 #define PTE_TO_PHYS(pte) \ 344 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) 345 #define L2PTE_TO_PHYS(l2) \ 346 ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT) 347 348 static __inline pd_entry_t * 349 pmap_l1(pmap_t pmap, vm_offset_t va) 350 { 351 352 return (&pmap->pm_l1[pmap_l1_index(va)]); 353 } 354 355 static __inline pd_entry_t * 356 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 357 { 358 vm_paddr_t phys; 359 pd_entry_t *l2; 360 361 phys = PTE_TO_PHYS(pmap_load(l1)); 362 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 363 364 return (&l2[pmap_l2_index(va)]); 365 } 366 367 static __inline pd_entry_t * 368 pmap_l2(pmap_t pmap, vm_offset_t va) 369 { 370 pd_entry_t *l1; 371 372 l1 = pmap_l1(pmap, va); 373 if ((pmap_load(l1) & PTE_V) == 0) 374 return (NULL); 375 if ((pmap_load(l1) & PTE_RX) != 0) 376 return (NULL); 377 378 return (pmap_l1_to_l2(l1, va)); 379 } 380 381 static __inline pt_entry_t * 382 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 383 { 384 vm_paddr_t phys; 385 pt_entry_t *l3; 386 387 phys = PTE_TO_PHYS(pmap_load(l2)); 388 l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); 389 390 return (&l3[pmap_l3_index(va)]); 391 } 392 393 static __inline pt_entry_t * 394 pmap_l3(pmap_t pmap, vm_offset_t va) 395 { 396 pd_entry_t *l2; 397 398 l2 = pmap_l2(pmap, va); 399 if (l2 == NULL) 400 return (NULL); 401 if ((pmap_load(l2) & PTE_V) == 0) 402 return (NULL); 403 if ((pmap_load(l2) & PTE_RX) != 0) 404 return (NULL); 405 406 return (pmap_l2_to_l3(l2, va)); 407 } 408 409 static __inline void 410 pmap_resident_count_inc(pmap_t pmap, int count) 411 { 412 413 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 414 pmap->pm_stats.resident_count += count; 415 } 416 417 static __inline void 418 pmap_resident_count_dec(pmap_t pmap, int count) 419 { 420 421 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 422 KASSERT(pmap->pm_stats.resident_count >= count, 423 ("pmap %p resident count underflow %ld %d", pmap, 424 pmap->pm_stats.resident_count, count)); 425 pmap->pm_stats.resident_count -= count; 426 } 427 428 static void 429 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, 430 pt_entry_t entry) 431 { 432 struct pmap *user_pmap; 433 pd_entry_t *l1; 434 435 /* Distribute new kernel L1 entry to all the user pmaps */ 436 if (pmap != kernel_pmap) 437 return; 438 439 mtx_lock(&allpmaps_lock); 440 LIST_FOREACH(user_pmap, &allpmaps, pm_list) { 441 l1 = &user_pmap->pm_l1[l1index]; 442 pmap_store(l1, entry); 443 } 444 mtx_unlock(&allpmaps_lock); 445 } 446 447 static pt_entry_t * 448 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, 449 u_int *l2_slot) 450 { 451 pt_entry_t *l2; 452 pd_entry_t *l1; 453 454 l1 = (pd_entry_t *)l1pt; 455 *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; 456 457 /* Check locore has used a table L1 map */ 458 KASSERT((l1[*l1_slot] & PTE_RX) == 0, 459 ("Invalid bootstrap L1 table")); 460 461 /* Find the address of the L2 table */ 462 l2 = (pt_entry_t *)init_pt_va; 463 *l2_slot = pmap_l2_index(va); 464 465 return (l2); 466 } 467 468 static vm_paddr_t 469 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 470 { 471 u_int l1_slot, l2_slot; 472 pt_entry_t *l2; 473 vm_paddr_t ret; 474 475 l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); 476 
477 /* Check locore has used L2 superpages */ 478 KASSERT((l2[l2_slot] & PTE_RX) != 0, 479 ("Invalid bootstrap L2 table")); 480 481 /* L2 is superpages */ 482 ret = L2PTE_TO_PHYS(l2[l2_slot]); 483 ret += (va & L2_OFFSET); 484 485 return (ret); 486 } 487 488 static void 489 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) 490 { 491 vm_offset_t va; 492 vm_paddr_t pa; 493 pd_entry_t *l1; 494 u_int l1_slot; 495 pt_entry_t entry; 496 pn_t pn; 497 498 pa = dmap_phys_base = min_pa & ~L1_OFFSET; 499 va = DMAP_MIN_ADDRESS; 500 l1 = (pd_entry_t *)kern_l1; 501 l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); 502 503 for (; va < DMAP_MAX_ADDRESS && pa < max_pa; 504 pa += L1_SIZE, va += L1_SIZE, l1_slot++) { 505 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 506 507 /* superpages */ 508 pn = (pa / PAGE_SIZE); 509 entry = PTE_KERN; 510 entry |= (pn << PTE_PPN0_S); 511 pmap_store(&l1[l1_slot], entry); 512 } 513 514 /* Set the upper limit of the DMAP region */ 515 dmap_phys_max = pa; 516 dmap_max_addr = va; 517 518 sfence_vma(); 519 } 520 521 static vm_offset_t 522 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 523 { 524 vm_offset_t l3pt; 525 pt_entry_t entry; 526 pd_entry_t *l2; 527 vm_paddr_t pa; 528 u_int l2_slot; 529 pn_t pn; 530 531 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 532 533 l2 = pmap_l2(kernel_pmap, va); 534 l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); 535 l2_slot = pmap_l2_index(va); 536 l3pt = l3_start; 537 538 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 539 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 540 541 pa = pmap_early_vtophys(l1pt, l3pt); 542 pn = (pa / PAGE_SIZE); 543 entry = (PTE_V); 544 entry |= (pn << PTE_PPN0_S); 545 pmap_store(&l2[l2_slot], entry); 546 l3pt += PAGE_SIZE; 547 } 548 549 /* Clean the L2 page table */ 550 memset((void *)l3_start, 0, l3pt - l3_start); 551 552 return (l3pt); 553 } 554 555 /* 556 * Bootstrap the system enough to run with virtual memory. 557 */ 558 void 559 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) 560 { 561 u_int l1_slot, l2_slot; 562 vm_offset_t freemempos; 563 vm_offset_t dpcpu, msgbufpv; 564 vm_paddr_t max_pa, min_pa, pa; 565 pt_entry_t *l2p; 566 int i; 567 568 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 569 570 /* Set this early so we can use the pagetable walking functions */ 571 kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; 572 PMAP_LOCK_INIT(kernel_pmap); 573 574 rw_init(&pvh_global_lock, "pmap pv global"); 575 576 /* 577 * Set the current CPU as active in the kernel pmap. Secondary cores 578 * will add themselves later in init_secondary(). The SBI firmware 579 * may rely on this mask being precise, so CPU_FILL() is not used. 580 */ 581 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); 582 583 /* Assume the address we were loaded to is a valid physical address. */ 584 min_pa = max_pa = kernstart; 585 586 physmap_idx = physmem_avail(physmap, nitems(physmap)); 587 physmap_idx /= 2; 588 589 /* 590 * Find the minimum physical address. physmap is sorted, 591 * but may contain empty ranges. 
592 */ 593 for (i = 0; i < physmap_idx * 2; i += 2) { 594 if (physmap[i] == physmap[i + 1]) 595 continue; 596 if (physmap[i] <= min_pa) 597 min_pa = physmap[i]; 598 if (physmap[i + 1] > max_pa) 599 max_pa = physmap[i + 1]; 600 } 601 printf("physmap_idx %u\n", physmap_idx); 602 printf("min_pa %lx\n", min_pa); 603 printf("max_pa %lx\n", max_pa); 604 605 /* Create a direct map region early so we can use it for pa -> va */ 606 pmap_bootstrap_dmap(l1pt, min_pa, max_pa); 607 608 /* 609 * Read the page table to find out what is already mapped. 610 * This assumes we have mapped a block of memory from KERNBASE 611 * using a single L1 entry. 612 */ 613 (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 614 615 /* Sanity check the index, KERNBASE should be the first VA */ 616 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 617 618 freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); 619 620 /* Create the l3 tables for the early devmap */ 621 freemempos = pmap_bootstrap_l3(l1pt, 622 VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); 623 624 /* 625 * Invalidate the mapping we created for the DTB. At this point a copy 626 * has been created, and we no longer need it. We want to avoid the 627 * possibility of an aliased mapping in the future. 628 */ 629 l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); 630 if ((pmap_load(l2p) & PTE_V) != 0) 631 pmap_clear(l2p); 632 633 sfence_vma(); 634 635 #define alloc_pages(var, np) \ 636 (var) = freemempos; \ 637 freemempos += (np * PAGE_SIZE); \ 638 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 639 640 /* Allocate dynamic per-cpu area. */ 641 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 642 dpcpu_init((void *)dpcpu, 0); 643 644 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 645 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 646 msgbufp = (void *)msgbufpv; 647 648 virtual_avail = roundup2(freemempos, L2_SIZE); 649 virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; 650 kernel_vm_end = virtual_avail; 651 652 pa = pmap_early_vtophys(l1pt, freemempos); 653 654 physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); 655 } 656 657 /* 658 * Initialize a vm_page's machine-dependent fields. 659 */ 660 void 661 pmap_page_init(vm_page_t m) 662 { 663 664 TAILQ_INIT(&m->md.pv_list); 665 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 666 } 667 668 /* 669 * Initialize the pmap module. 670 * Called by vm_init, to initialize any structures that the pmap 671 * system needs to map virtual memory. 672 */ 673 void 674 pmap_init(void) 675 { 676 vm_size_t s; 677 int i, pv_npg; 678 679 /* 680 * Initialize the pv chunk and pmap list mutexes. 681 */ 682 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 683 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 684 685 /* 686 * Initialize the pool of pv list locks. 687 */ 688 for (i = 0; i < NPV_LIST_LOCKS; i++) 689 rw_init(&pv_list_locks[i], "pmap pv list"); 690 691 /* 692 * Calculate the size of the pv head table for superpages. 693 */ 694 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 695 696 /* 697 * Allocate memory for the pv head table for superpages. 698 */ 699 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 700 s = round_page(s); 701 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 702 for (i = 0; i < pv_npg; i++) 703 TAILQ_INIT(&pv_table[i].pv_list); 704 TAILQ_INIT(&pv_dummy.pv_list); 705 706 if (superpages_enabled) 707 pagesizes[1] = L2_SIZE; 708 } 709 710 #ifdef SMP 711 /* 712 * For SMP, these functions have to use IPIs for coherence. 
713 * 714 * In general, the calling thread uses a plain fence to order the 715 * writes to the page tables before invoking an SBI callback to invoke 716 * sfence_vma() on remote CPUs. 717 */ 718 static void 719 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 720 { 721 cpuset_t mask; 722 723 sched_pin(); 724 mask = pmap->pm_active; 725 CPU_CLR(PCPU_GET(hart), &mask); 726 fence(); 727 if (!CPU_EMPTY(&mask) && smp_started) 728 sbi_remote_sfence_vma(mask.__bits, va, 1); 729 sfence_vma_page(va); 730 sched_unpin(); 731 } 732 733 static void 734 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 735 { 736 cpuset_t mask; 737 738 sched_pin(); 739 mask = pmap->pm_active; 740 CPU_CLR(PCPU_GET(hart), &mask); 741 fence(); 742 if (!CPU_EMPTY(&mask) && smp_started) 743 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); 744 745 /* 746 * Might consider a loop of sfence_vma_page() for a small 747 * number of pages in the future. 748 */ 749 sfence_vma(); 750 sched_unpin(); 751 } 752 753 static void 754 pmap_invalidate_all(pmap_t pmap) 755 { 756 cpuset_t mask; 757 758 sched_pin(); 759 mask = pmap->pm_active; 760 CPU_CLR(PCPU_GET(hart), &mask); 761 762 /* 763 * XXX: The SBI doc doesn't detail how to specify x0 as the 764 * address to perform a global fence. BBL currently treats 765 * all sfence_vma requests as global however. 766 */ 767 fence(); 768 if (!CPU_EMPTY(&mask) && smp_started) 769 sbi_remote_sfence_vma(mask.__bits, 0, 0); 770 sfence_vma(); 771 sched_unpin(); 772 } 773 #else 774 /* 775 * Normal, non-SMP, invalidation functions. 776 * We inline these within pmap.c for speed. 777 */ 778 static __inline void 779 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 780 { 781 782 sfence_vma_page(va); 783 } 784 785 static __inline void 786 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 787 { 788 789 /* 790 * Might consider a loop of sfence_vma_page() for a small 791 * number of pages in the future. 792 */ 793 sfence_vma(); 794 } 795 796 static __inline void 797 pmap_invalidate_all(pmap_t pmap) 798 { 799 800 sfence_vma(); 801 } 802 #endif 803 804 /* 805 * Routine: pmap_extract 806 * Function: 807 * Extract the physical page address associated 808 * with the given map/virtual_address pair. 809 */ 810 vm_paddr_t 811 pmap_extract(pmap_t pmap, vm_offset_t va) 812 { 813 pd_entry_t *l2p, l2; 814 pt_entry_t *l3p, l3; 815 vm_paddr_t pa; 816 817 pa = 0; 818 PMAP_LOCK(pmap); 819 /* 820 * Start with the l2 table. We are unable to allocate 821 * pages in the l1 table. 822 */ 823 l2p = pmap_l2(pmap, va); 824 if (l2p != NULL) { 825 l2 = pmap_load(l2p); 826 if ((l2 & PTE_RX) == 0) { 827 l3p = pmap_l2_to_l3(l2p, va); 828 if (l3p != NULL) { 829 l3 = pmap_load(l3p); 830 pa = PTE_TO_PHYS(l3); 831 pa |= (va & L3_OFFSET); 832 } 833 } else { 834 /* L2 is superpages */ 835 pa = L2PTE_TO_PHYS(l2); 836 pa |= (va & L2_OFFSET); 837 } 838 } 839 PMAP_UNLOCK(pmap); 840 return (pa); 841 } 842 843 /* 844 * Routine: pmap_extract_and_hold 845 * Function: 846 * Atomically extract and hold the physical page 847 * with the given pmap and virtual address pair 848 * if that mapping permits the given protection.
849 */ 850 vm_page_t 851 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 852 { 853 pt_entry_t *l3p, l3; 854 vm_paddr_t phys; 855 vm_page_t m; 856 857 m = NULL; 858 PMAP_LOCK(pmap); 859 l3p = pmap_l3(pmap, va); 860 if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { 861 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { 862 phys = PTE_TO_PHYS(l3); 863 m = PHYS_TO_VM_PAGE(phys); 864 if (!vm_page_wire_mapped(m)) 865 m = NULL; 866 } 867 } 868 PMAP_UNLOCK(pmap); 869 return (m); 870 } 871 872 vm_paddr_t 873 pmap_kextract(vm_offset_t va) 874 { 875 pd_entry_t *l2, l2e; 876 pt_entry_t *l3; 877 vm_paddr_t pa; 878 879 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 880 pa = DMAP_TO_PHYS(va); 881 } else { 882 l2 = pmap_l2(kernel_pmap, va); 883 if (l2 == NULL) 884 panic("pmap_kextract: No l2"); 885 l2e = pmap_load(l2); 886 /* 887 * Beware of concurrent promotion and demotion! We must 888 * use l2e rather than loading from l2 multiple times to 889 * ensure we see a consistent state, including the 890 * implicit load in pmap_l2_to_l3. It is, however, safe 891 * to use an old l2e because the L3 page is preserved by 892 * promotion. 893 */ 894 if ((l2e & PTE_RX) != 0) { 895 /* superpages */ 896 pa = L2PTE_TO_PHYS(l2e); 897 pa |= (va & L2_OFFSET); 898 return (pa); 899 } 900 901 l3 = pmap_l2_to_l3(&l2e, va); 902 if (l3 == NULL) 903 panic("pmap_kextract: No l3..."); 904 pa = PTE_TO_PHYS(pmap_load(l3)); 905 pa |= (va & PAGE_MASK); 906 } 907 return (pa); 908 } 909 910 /*************************************************** 911 * Low level mapping routines..... 912 ***************************************************/ 913 914 void 915 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 916 { 917 pt_entry_t entry; 918 pt_entry_t *l3; 919 vm_offset_t va; 920 pn_t pn; 921 922 KASSERT((pa & L3_OFFSET) == 0, 923 ("pmap_kenter_device: Invalid physical address")); 924 KASSERT((sva & L3_OFFSET) == 0, 925 ("pmap_kenter_device: Invalid virtual address")); 926 KASSERT((size & PAGE_MASK) == 0, 927 ("pmap_kenter_device: Mapping is not page-sized")); 928 929 va = sva; 930 while (size != 0) { 931 l3 = pmap_l3(kernel_pmap, va); 932 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 933 934 pn = (pa / PAGE_SIZE); 935 entry = PTE_KERN; 936 entry |= (pn << PTE_PPN0_S); 937 pmap_store(l3, entry); 938 939 va += PAGE_SIZE; 940 pa += PAGE_SIZE; 941 size -= PAGE_SIZE; 942 } 943 pmap_invalidate_range(kernel_pmap, sva, va); 944 } 945 946 /* 947 * Remove a page from the kernel pagetables. 948 * Note: not SMP coherent. 949 */ 950 PMAP_INLINE void 951 pmap_kremove(vm_offset_t va) 952 { 953 pt_entry_t *l3; 954 955 l3 = pmap_l3(kernel_pmap, va); 956 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 957 958 pmap_clear(l3); 959 sfence_vma(); 960 } 961 962 void 963 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 964 { 965 pt_entry_t *l3; 966 vm_offset_t va; 967 968 KASSERT((sva & L3_OFFSET) == 0, 969 ("pmap_kremove_device: Invalid virtual address")); 970 KASSERT((size & PAGE_MASK) == 0, 971 ("pmap_kremove_device: Mapping is not page-sized")); 972 973 va = sva; 974 while (size != 0) { 975 l3 = pmap_l3(kernel_pmap, va); 976 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 977 pmap_clear(l3); 978 979 va += PAGE_SIZE; 980 size -= PAGE_SIZE; 981 } 982 983 pmap_invalidate_range(kernel_pmap, sva, va); 984 } 985 986 /* 987 * Used to map a range of physical addresses into kernel 988 * virtual address space. 
989 * 990 * The value passed in '*virt' is a suggested virtual address for 991 * the mapping. Architectures which can support a direct-mapped 992 * physical to virtual region can return the appropriate address 993 * within that region, leaving '*virt' unchanged. Other 994 * architectures should map the pages starting at '*virt' and 995 * update '*virt' with the first usable address after the mapped 996 * region. 997 */ 998 vm_offset_t 999 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1000 { 1001 1002 return (PHYS_TO_DMAP(start)); 1003 } 1004 1005 /* 1006 * Add a list of wired pages to the kva. 1007 * This routine is only used for temporary 1008 * kernel mappings that do not need to have 1009 * page modification or references recorded. 1010 * Note that old mappings are simply written 1011 * over. The page *must* be wired. 1012 * Note: SMP coherent. Uses a ranged shootdown IPI. 1013 */ 1014 void 1015 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1016 { 1017 pt_entry_t *l3, pa; 1018 vm_offset_t va; 1019 vm_page_t m; 1020 pt_entry_t entry; 1021 pn_t pn; 1022 int i; 1023 1024 va = sva; 1025 for (i = 0; i < count; i++) { 1026 m = ma[i]; 1027 pa = VM_PAGE_TO_PHYS(m); 1028 pn = (pa / PAGE_SIZE); 1029 l3 = pmap_l3(kernel_pmap, va); 1030 1031 entry = PTE_KERN; 1032 entry |= (pn << PTE_PPN0_S); 1033 pmap_store(l3, entry); 1034 1035 va += L3_SIZE; 1036 } 1037 pmap_invalidate_range(kernel_pmap, sva, va); 1038 } 1039 1040 /* 1041 * This routine tears out page mappings from the 1042 * kernel -- it is meant only for temporary mappings. 1043 * Note: SMP coherent. Uses a ranged shootdown IPI. 1044 */ 1045 void 1046 pmap_qremove(vm_offset_t sva, int count) 1047 { 1048 pt_entry_t *l3; 1049 vm_offset_t va; 1050 1051 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1052 1053 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1054 l3 = pmap_l3(kernel_pmap, va); 1055 KASSERT(l3 != NULL, ("pmap_qremove: Invalid address")); 1056 pmap_clear(l3); 1057 } 1058 pmap_invalidate_range(kernel_pmap, sva, va); 1059 } 1060 1061 bool 1062 pmap_ps_enabled(pmap_t pmap __unused) 1063 { 1064 1065 return (superpages_enabled); 1066 } 1067 1068 /*************************************************** 1069 * Page table page management routines..... 1070 ***************************************************/ 1071 /* 1072 * Schedule the specified unused page table page to be freed. Specifically, 1073 * add the page to the specified list of pages that will be released to the 1074 * physical memory manager after the TLB has been updated. 1075 */ 1076 static __inline void 1077 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1078 boolean_t set_PG_ZERO) 1079 { 1080 1081 if (set_PG_ZERO) 1082 m->flags |= PG_ZERO; 1083 else 1084 m->flags &= ~PG_ZERO; 1085 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1086 } 1087 1088 /* 1089 * Inserts the specified page table page into the specified pmap's collection 1090 * of idle page table pages. Each of a pmap's page table pages is responsible 1091 * for mapping a distinct range of virtual addresses. The pmap's collection is 1092 * ordered by this virtual address range. 1093 * 1094 * If "promoted" is false, then the page table page "ml3" must be zero filled. 1095 */ 1096 static __inline int 1097 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) 1098 { 1099 1100 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1101 ml3->valid = promoted ?
VM_PAGE_BITS_ALL : 0; 1102 return (vm_radix_insert(&pmap->pm_root, ml3)); 1103 } 1104 1105 /* 1106 * Removes the page table page mapping the specified virtual address from the 1107 * specified pmap's collection of idle page table pages, and returns it. 1108 * Otherwise, returns NULL if there is no page table page corresponding to the 1109 * specified virtual address. 1110 */ 1111 static __inline vm_page_t 1112 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1113 { 1114 1115 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1116 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1117 } 1118 1119 /* 1120 * Decrements a page table page's reference count, which is used to record the 1121 * number of valid page table entries within the page. If the reference count 1122 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1123 * page table page was unmapped and FALSE otherwise. 1124 */ 1125 static inline boolean_t 1126 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1127 { 1128 1129 --m->ref_count; 1130 if (m->ref_count == 0) { 1131 _pmap_unwire_ptp(pmap, va, m, free); 1132 return (TRUE); 1133 } else { 1134 return (FALSE); 1135 } 1136 } 1137 1138 static void 1139 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1140 { 1141 vm_paddr_t phys; 1142 1143 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1144 if (m->pindex >= NUL2E) { 1145 pd_entry_t *l1; 1146 l1 = pmap_l1(pmap, va); 1147 pmap_clear(l1); 1148 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1149 } else { 1150 pd_entry_t *l2; 1151 l2 = pmap_l2(pmap, va); 1152 pmap_clear(l2); 1153 } 1154 pmap_resident_count_dec(pmap, 1); 1155 if (m->pindex < NUL2E) { 1156 pd_entry_t *l1; 1157 vm_page_t pdpg; 1158 1159 l1 = pmap_l1(pmap, va); 1160 phys = PTE_TO_PHYS(pmap_load(l1)); 1161 pdpg = PHYS_TO_VM_PAGE(phys); 1162 pmap_unwire_ptp(pmap, va, pdpg, free); 1163 } 1164 pmap_invalidate_page(pmap, va); 1165 1166 vm_wire_sub(1); 1167 1168 /* 1169 * Put page on a list so that it is released after 1170 * *ALL* TLB shootdown is done 1171 */ 1172 pmap_add_delayed_free_list(m, free, TRUE); 1173 } 1174 1175 /* 1176 * After removing a page table entry, this routine is used to 1177 * conditionally free the page, and manage the reference count. 
1178 */ 1179 static int 1180 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1181 struct spglist *free) 1182 { 1183 vm_page_t mpte; 1184 1185 if (va >= VM_MAXUSER_ADDRESS) 1186 return (0); 1187 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1188 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); 1189 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1190 } 1191 1192 void 1193 pmap_pinit0(pmap_t pmap) 1194 { 1195 1196 PMAP_LOCK_INIT(pmap); 1197 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1198 pmap->pm_l1 = kernel_pmap->pm_l1; 1199 pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); 1200 CPU_ZERO(&pmap->pm_active); 1201 pmap_activate_boot(pmap); 1202 } 1203 1204 int 1205 pmap_pinit(pmap_t pmap) 1206 { 1207 vm_paddr_t l1phys; 1208 vm_page_t l1pt; 1209 1210 /* 1211 * allocate the l1 page 1212 */ 1213 while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | 1214 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1215 vm_wait(NULL); 1216 1217 l1phys = VM_PAGE_TO_PHYS(l1pt); 1218 pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); 1219 pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); 1220 1221 if ((l1pt->flags & PG_ZERO) == 0) 1222 pagezero(pmap->pm_l1); 1223 1224 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1225 1226 CPU_ZERO(&pmap->pm_active); 1227 1228 /* Install kernel pagetables */ 1229 memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); 1230 1231 /* Add to the list of all user pmaps */ 1232 mtx_lock(&allpmaps_lock); 1233 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1234 mtx_unlock(&allpmaps_lock); 1235 1236 vm_radix_init(&pmap->pm_root); 1237 1238 return (1); 1239 } 1240 1241 /* 1242 * This routine is called if the desired page table page does not exist. 1243 * 1244 * If page table page allocation fails, this routine may sleep before 1245 * returning NULL. It sleeps only if a lock pointer was given. 1246 * 1247 * Note: If a page allocation fails at page table level two or three, 1248 * one or two pages may be held during the wait, only to be released 1249 * afterwards. This conservative approach is easily argued to avoid 1250 * race conditions. 1251 */ 1252 static vm_page_t 1253 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1254 { 1255 vm_page_t m, /*pdppg, */pdpg; 1256 pt_entry_t entry; 1257 vm_paddr_t phys; 1258 pn_t pn; 1259 1260 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1261 1262 /* 1263 * Allocate a page table page. 1264 */ 1265 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1266 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1267 if (lockp != NULL) { 1268 RELEASE_PV_LIST_LOCK(lockp); 1269 PMAP_UNLOCK(pmap); 1270 rw_runlock(&pvh_global_lock); 1271 vm_wait(NULL); 1272 rw_rlock(&pvh_global_lock); 1273 PMAP_LOCK(pmap); 1274 } 1275 1276 /* 1277 * Indicate the need to retry. While waiting, the page table 1278 * page may have been allocated. 1279 */ 1280 return (NULL); 1281 } 1282 1283 if ((m->flags & PG_ZERO) == 0) 1284 pmap_zero_page(m); 1285 1286 /* 1287 * Map the pagetable page into the process address space, if 1288 * it isn't already there. 
1289 */ 1290 1291 if (ptepindex >= NUL2E) { 1292 pd_entry_t *l1; 1293 vm_pindex_t l1index; 1294 1295 l1index = ptepindex - NUL2E; 1296 l1 = &pmap->pm_l1[l1index]; 1297 KASSERT((pmap_load(l1) & PTE_V) == 0, 1298 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1299 1300 pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); 1301 entry = (PTE_V); 1302 entry |= (pn << PTE_PPN0_S); 1303 pmap_store(l1, entry); 1304 pmap_distribute_l1(pmap, l1index, entry); 1305 } else { 1306 vm_pindex_t l1index; 1307 pd_entry_t *l1, *l2; 1308 1309 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1310 l1 = &pmap->pm_l1[l1index]; 1311 if (pmap_load(l1) == 0) { 1312 /* recurse for allocating page dir */ 1313 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1314 lockp) == NULL) { 1315 vm_page_unwire_noq(m); 1316 vm_page_free_zero(m); 1317 return (NULL); 1318 } 1319 } else { 1320 phys = PTE_TO_PHYS(pmap_load(l1)); 1321 pdpg = PHYS_TO_VM_PAGE(phys); 1322 pdpg->ref_count++; 1323 } 1324 1325 phys = PTE_TO_PHYS(pmap_load(l1)); 1326 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1327 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1328 KASSERT((pmap_load(l2) & PTE_V) == 0, 1329 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 1330 1331 pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); 1332 entry = (PTE_V); 1333 entry |= (pn << PTE_PPN0_S); 1334 pmap_store(l2, entry); 1335 } 1336 1337 pmap_resident_count_inc(pmap, 1); 1338 1339 return (m); 1340 } 1341 1342 static vm_page_t 1343 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1344 { 1345 pd_entry_t *l1; 1346 vm_page_t l2pg; 1347 vm_pindex_t l2pindex; 1348 1349 retry: 1350 l1 = pmap_l1(pmap, va); 1351 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { 1352 KASSERT((pmap_load(l1) & PTE_RWX) == 0, 1353 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, 1354 pmap_load(l1), va)); 1355 /* Add a reference to the L2 page. */ 1356 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); 1357 l2pg->ref_count++; 1358 } else { 1359 /* Allocate a L2 page. */ 1360 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1361 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1362 if (l2pg == NULL && lockp != NULL) 1363 goto retry; 1364 } 1365 return (l2pg); 1366 } 1367 1368 static vm_page_t 1369 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1370 { 1371 vm_pindex_t ptepindex; 1372 pd_entry_t *l2; 1373 vm_paddr_t phys; 1374 vm_page_t m; 1375 1376 /* 1377 * Calculate pagetable page index 1378 */ 1379 ptepindex = pmap_l2_pindex(va); 1380 retry: 1381 /* 1382 * Get the page directory entry 1383 */ 1384 l2 = pmap_l2(pmap, va); 1385 1386 /* 1387 * If the page table page is mapped, we just increment the 1388 * hold count, and activate it. 1389 */ 1390 if (l2 != NULL && pmap_load(l2) != 0) { 1391 phys = PTE_TO_PHYS(pmap_load(l2)); 1392 m = PHYS_TO_VM_PAGE(phys); 1393 m->ref_count++; 1394 } else { 1395 /* 1396 * Here if the pte page isn't mapped, or if it has been 1397 * deallocated. 1398 */ 1399 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1400 if (m == NULL && lockp != NULL) 1401 goto retry; 1402 } 1403 return (m); 1404 } 1405 1406 /*************************************************** 1407 * Pmap allocation/deallocation routines. 1408 ***************************************************/ 1409 1410 /* 1411 * Release any resources held by the given physical map. 1412 * Called when a pmap initialized by pmap_pinit is being released. 1413 * Should only be called if the map contains no valid mappings. 
1414 */ 1415 void 1416 pmap_release(pmap_t pmap) 1417 { 1418 vm_page_t m; 1419 1420 KASSERT(pmap->pm_stats.resident_count == 0, 1421 ("pmap_release: pmap resident count %ld != 0", 1422 pmap->pm_stats.resident_count)); 1423 KASSERT(CPU_EMPTY(&pmap->pm_active), 1424 ("releasing active pmap %p", pmap)); 1425 1426 mtx_lock(&allpmaps_lock); 1427 LIST_REMOVE(pmap, pm_list); 1428 mtx_unlock(&allpmaps_lock); 1429 1430 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); 1431 vm_page_unwire_noq(m); 1432 vm_page_free(m); 1433 } 1434 1435 static int 1436 kvm_size(SYSCTL_HANDLER_ARGS) 1437 { 1438 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1439 1440 return sysctl_handle_long(oidp, &ksize, 0, req); 1441 } 1442 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1443 0, 0, kvm_size, "LU", 1444 "Size of KVM"); 1445 1446 static int 1447 kvm_free(SYSCTL_HANDLER_ARGS) 1448 { 1449 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1450 1451 return sysctl_handle_long(oidp, &kfree, 0, req); 1452 } 1453 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1454 0, 0, kvm_free, "LU", 1455 "Amount of KVM free"); 1456 1457 /* 1458 * grow the number of kernel page table entries, if needed 1459 */ 1460 void 1461 pmap_growkernel(vm_offset_t addr) 1462 { 1463 vm_paddr_t paddr; 1464 vm_page_t nkpg; 1465 pd_entry_t *l1, *l2; 1466 pt_entry_t entry; 1467 pn_t pn; 1468 1469 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1470 1471 addr = roundup2(addr, L2_SIZE); 1472 if (addr - 1 >= vm_map_max(kernel_map)) 1473 addr = vm_map_max(kernel_map); 1474 while (kernel_vm_end < addr) { 1475 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1476 if (pmap_load(l1) == 0) { 1477 /* We need a new PDP entry */ 1478 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, 1479 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 1480 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1481 if (nkpg == NULL) 1482 panic("pmap_growkernel: no memory to grow kernel"); 1483 if ((nkpg->flags & PG_ZERO) == 0) 1484 pmap_zero_page(nkpg); 1485 paddr = VM_PAGE_TO_PHYS(nkpg); 1486 1487 pn = (paddr / PAGE_SIZE); 1488 entry = (PTE_V); 1489 entry |= (pn << PTE_PPN0_S); 1490 pmap_store(l1, entry); 1491 pmap_distribute_l1(kernel_pmap, 1492 pmap_l1_index(kernel_vm_end), entry); 1493 continue; /* try again */ 1494 } 1495 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1496 if ((pmap_load(l2) & PTE_V) != 0 && 1497 (pmap_load(l2) & PTE_RWX) == 0) { 1498 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1499 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1500 kernel_vm_end = vm_map_max(kernel_map); 1501 break; 1502 } 1503 continue; 1504 } 1505 1506 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, 1507 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1508 VM_ALLOC_ZERO); 1509 if (nkpg == NULL) 1510 panic("pmap_growkernel: no memory to grow kernel"); 1511 if ((nkpg->flags & PG_ZERO) == 0) { 1512 pmap_zero_page(nkpg); 1513 } 1514 paddr = VM_PAGE_TO_PHYS(nkpg); 1515 1516 pn = (paddr / PAGE_SIZE); 1517 entry = (PTE_V); 1518 entry |= (pn << PTE_PPN0_S); 1519 pmap_store(l2, entry); 1520 1521 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1522 1523 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1524 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1525 kernel_vm_end = vm_map_max(kernel_map); 1526 break; 1527 } 1528 } 1529 } 1530 1531 /*************************************************** 1532 * page management routines. 
1533 ***************************************************/ 1534 1535 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1536 CTASSERT(_NPCM == 3); 1537 CTASSERT(_NPCPV == 168); 1538 1539 static __inline struct pv_chunk * 1540 pv_to_chunk(pv_entry_t pv) 1541 { 1542 1543 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 1544 } 1545 1546 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1547 1548 #define PC_FREE0 0xfffffffffffffffful 1549 #define PC_FREE1 0xfffffffffffffffful 1550 #define PC_FREE2 0x000000fffffffffful 1551 1552 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 1553 1554 #if 0 1555 #ifdef PV_STATS 1556 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1557 1558 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1559 "Current number of pv entry chunks"); 1560 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1561 "Current number of pv entry chunks allocated"); 1562 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1563 "Current number of pv entry chunks frees"); 1564 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1565 "Number of times tried to get a chunk page but failed."); 1566 1567 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1568 static int pv_entry_spare; 1569 1570 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1571 "Current number of pv entry frees"); 1572 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1573 "Current number of pv entry allocs"); 1574 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1575 "Current number of pv entries"); 1576 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1577 "Current number of spare pv entries"); 1578 #endif 1579 #endif /* 0 */ 1580 1581 /* 1582 * We are in a serious low memory condition. Resort to 1583 * drastic measures to free some pages so we can allocate 1584 * another pv entry chunk. 1585 * 1586 * Returns NULL if PV entries were reclaimed from the specified pmap. 1587 * 1588 * We do not, however, unmap 2mpages because subsequent accesses will 1589 * allocate per-page pv entries until repromotion occurs, thereby 1590 * exacerbating the shortage of free pv entries. 1591 */ 1592 static vm_page_t 1593 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1594 { 1595 1596 panic("RISCVTODO: reclaim_pv_chunk"); 1597 } 1598 1599 /* 1600 * free the pv_entry back to the free list 1601 */ 1602 static void 1603 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1604 { 1605 struct pv_chunk *pc; 1606 int idx, field, bit; 1607 1608 rw_assert(&pvh_global_lock, RA_LOCKED); 1609 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1610 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1611 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1612 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1613 pc = pv_to_chunk(pv); 1614 idx = pv - &pc->pc_pventry[0]; 1615 field = idx / 64; 1616 bit = idx % 64; 1617 pc->pc_map[field] |= 1ul << bit; 1618 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 1619 pc->pc_map[2] != PC_FREE2) { 1620 /* 98% of the time, pc is already at the head of the list. 
*/ 1621 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1622 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1623 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1624 } 1625 return; 1626 } 1627 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1628 free_pv_chunk(pc); 1629 } 1630 1631 static void 1632 free_pv_chunk(struct pv_chunk *pc) 1633 { 1634 vm_page_t m; 1635 1636 mtx_lock(&pv_chunks_mutex); 1637 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1638 mtx_unlock(&pv_chunks_mutex); 1639 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1640 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1641 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1642 /* entire chunk is free, return it */ 1643 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1644 dump_drop_page(m->phys_addr); 1645 vm_page_unwire_noq(m); 1646 vm_page_free(m); 1647 } 1648 1649 /* 1650 * Returns a new PV entry, allocating a new PV chunk from the system when 1651 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1652 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1653 * returned. 1654 * 1655 * The given PV list lock may be released. 1656 */ 1657 static pv_entry_t 1658 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1659 { 1660 int bit, field; 1661 pv_entry_t pv; 1662 struct pv_chunk *pc; 1663 vm_page_t m; 1664 1665 rw_assert(&pvh_global_lock, RA_LOCKED); 1666 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1667 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1668 retry: 1669 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1670 if (pc != NULL) { 1671 for (field = 0; field < _NPCM; field++) { 1672 if (pc->pc_map[field]) { 1673 bit = ffsl(pc->pc_map[field]) - 1; 1674 break; 1675 } 1676 } 1677 if (field < _NPCM) { 1678 pv = &pc->pc_pventry[field * 64 + bit]; 1679 pc->pc_map[field] &= ~(1ul << bit); 1680 /* If this was the last item, move it to tail */ 1681 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 1682 pc->pc_map[2] == 0) { 1683 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1684 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1685 pc_list); 1686 } 1687 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1688 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1689 return (pv); 1690 } 1691 } 1692 /* No free items, allocate another chunk */ 1693 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1694 VM_ALLOC_WIRED); 1695 if (m == NULL) { 1696 if (lockp == NULL) { 1697 PV_STAT(pc_chunk_tryfail++); 1698 return (NULL); 1699 } 1700 m = reclaim_pv_chunk(pmap, lockp); 1701 if (m == NULL) 1702 goto retry; 1703 } 1704 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1705 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1706 dump_add_page(m->phys_addr); 1707 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1708 pc->pc_pmap = pmap; 1709 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1710 pc->pc_map[1] = PC_FREE1; 1711 pc->pc_map[2] = PC_FREE2; 1712 mtx_lock(&pv_chunks_mutex); 1713 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1714 mtx_unlock(&pv_chunks_mutex); 1715 pv = &pc->pc_pventry[0]; 1716 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1717 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1718 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1719 return (pv); 1720 } 1721 1722 /* 1723 * Ensure that the number of spare PV entries in the specified pmap meets or 1724 * exceeds the given count, "needed". 1725 * 1726 * The given PV list lock may be released. 
1727 */ 1728 static void 1729 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1730 { 1731 struct pch new_tail; 1732 struct pv_chunk *pc; 1733 vm_page_t m; 1734 int avail, free; 1735 bool reclaimed; 1736 1737 rw_assert(&pvh_global_lock, RA_LOCKED); 1738 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1739 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1740 1741 /* 1742 * Newly allocated PV chunks must be stored in a private list until 1743 * the required number of PV chunks have been allocated. Otherwise, 1744 * reclaim_pv_chunk() could recycle one of these chunks. In 1745 * contrast, these chunks must be added to the pmap upon allocation. 1746 */ 1747 TAILQ_INIT(&new_tail); 1748 retry: 1749 avail = 0; 1750 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1751 bit_count((bitstr_t *)pc->pc_map, 0, 1752 sizeof(pc->pc_map) * NBBY, &free); 1753 if (free == 0) 1754 break; 1755 avail += free; 1756 if (avail >= needed) 1757 break; 1758 } 1759 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1760 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1761 VM_ALLOC_WIRED); 1762 if (m == NULL) { 1763 m = reclaim_pv_chunk(pmap, lockp); 1764 if (m == NULL) 1765 goto retry; 1766 reclaimed = true; 1767 } 1768 /* XXX PV STATS */ 1769 #if 0 1770 dump_add_page(m->phys_addr); 1771 #endif 1772 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1773 pc->pc_pmap = pmap; 1774 pc->pc_map[0] = PC_FREE0; 1775 pc->pc_map[1] = PC_FREE1; 1776 pc->pc_map[2] = PC_FREE2; 1777 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1778 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1779 1780 /* 1781 * The reclaim might have freed a chunk from the current pmap. 1782 * If that chunk contained available entries, we need to 1783 * re-count the number of available entries. 1784 */ 1785 if (reclaimed) 1786 goto retry; 1787 } 1788 if (!TAILQ_EMPTY(&new_tail)) { 1789 mtx_lock(&pv_chunks_mutex); 1790 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1791 mtx_unlock(&pv_chunks_mutex); 1792 } 1793 } 1794 1795 /* 1796 * First find and then remove the pv entry for the specified pmap and virtual 1797 * address from the specified pv list. Returns the pv entry if found and NULL 1798 * otherwise. This operation can be performed on pv lists for either 4KB or 1799 * 2MB page mappings. 1800 */ 1801 static __inline pv_entry_t 1802 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1803 { 1804 pv_entry_t pv; 1805 1806 rw_assert(&pvh_global_lock, RA_LOCKED); 1807 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1808 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1809 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1810 pvh->pv_gen++; 1811 break; 1812 } 1813 } 1814 return (pv); 1815 } 1816 1817 /* 1818 * First find and then destroy the pv entry for the specified pmap and virtual 1819 * address. This operation can be performed on pv lists for either 4KB or 2MB 1820 * page mappings. 1821 */ 1822 static void 1823 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1824 { 1825 pv_entry_t pv; 1826 1827 pv = pmap_pvh_remove(pvh, pmap, va); 1828 1829 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 1830 free_pv_entry(pmap, pv); 1831 } 1832 1833 /* 1834 * Conditionally create the PV entry for a 4KB page mapping if the required 1835 * memory can be allocated without resorting to reclamation. 
1836 */ 1837 static boolean_t 1838 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1839 struct rwlock **lockp) 1840 { 1841 pv_entry_t pv; 1842 1843 rw_assert(&pvh_global_lock, RA_LOCKED); 1844 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1845 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1846 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 1847 pv->pv_va = va; 1848 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1849 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1850 m->md.pv_gen++; 1851 return (TRUE); 1852 } else 1853 return (FALSE); 1854 } 1855 1856 /* 1857 * After demotion from a 2MB page mapping to 512 4KB page mappings, 1858 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 1859 * entries for each of the 4KB page mappings. 1860 */ 1861 static void __unused 1862 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1863 struct rwlock **lockp) 1864 { 1865 struct md_page *pvh; 1866 struct pv_chunk *pc; 1867 pv_entry_t pv; 1868 vm_page_t m; 1869 vm_offset_t va_last; 1870 int bit, field; 1871 1872 rw_assert(&pvh_global_lock, RA_LOCKED); 1873 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1874 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1875 1876 /* 1877 * Transfer the 2mpage's pv entry for this mapping to the first 1878 * page's pv list. Once this transfer begins, the pv list lock 1879 * must not be released until the last pv entry is reinstantiated. 1880 */ 1881 pvh = pa_to_pvh(pa); 1882 va &= ~L2_OFFSET; 1883 pv = pmap_pvh_remove(pvh, pmap, va); 1884 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 1885 m = PHYS_TO_VM_PAGE(pa); 1886 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1887 m->md.pv_gen++; 1888 /* Instantiate the remaining 511 pv entries. */ 1889 va_last = va + L2_SIZE - PAGE_SIZE; 1890 for (;;) { 1891 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1892 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 1893 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 1894 for (field = 0; field < _NPCM; field++) { 1895 while (pc->pc_map[field] != 0) { 1896 bit = ffsl(pc->pc_map[field]) - 1; 1897 pc->pc_map[field] &= ~(1ul << bit); 1898 pv = &pc->pc_pventry[field * 64 + bit]; 1899 va += PAGE_SIZE; 1900 pv->pv_va = va; 1901 m++; 1902 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1903 ("pmap_pv_demote_l2: page %p is not managed", m)); 1904 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1905 m->md.pv_gen++; 1906 if (va == va_last) 1907 goto out; 1908 } 1909 } 1910 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1911 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1912 } 1913 out: 1914 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 1915 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1916 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1917 } 1918 /* XXX PV stats */ 1919 } 1920 1921 #if VM_NRESERVLEVEL > 0 1922 static void 1923 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1924 struct rwlock **lockp) 1925 { 1926 struct md_page *pvh; 1927 pv_entry_t pv; 1928 vm_page_t m; 1929 vm_offset_t va_last; 1930 1931 rw_assert(&pvh_global_lock, RA_LOCKED); 1932 KASSERT((va & L2_OFFSET) == 0, 1933 ("pmap_pv_promote_l2: misaligned va %#lx", va)); 1934 1935 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1936 1937 m = PHYS_TO_VM_PAGE(pa); 1938 pv = pmap_pvh_remove(&m->md, pmap, va); 1939 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 1940 pvh = pa_to_pvh(pa); 1941 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1942 pvh->pv_gen++; 1943 1944 va_last = va + L2_SIZE - PAGE_SIZE; 1945 do { 1946 m++; 1947 va += 
PAGE_SIZE; 1948 pmap_pvh_free(&m->md, pmap, va); 1949 } while (va < va_last); 1950 } 1951 #endif /* VM_NRESERVLEVEL > 0 */ 1952 1953 /* 1954 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1955 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1956 * false if the PV entry cannot be allocated without resorting to reclamation. 1957 */ 1958 static bool 1959 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1960 struct rwlock **lockp) 1961 { 1962 struct md_page *pvh; 1963 pv_entry_t pv; 1964 vm_paddr_t pa; 1965 1966 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1967 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1968 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1969 NULL : lockp)) == NULL) 1970 return (false); 1971 pv->pv_va = va; 1972 pa = PTE_TO_PHYS(l2e); 1973 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1974 pvh = pa_to_pvh(pa); 1975 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1976 pvh->pv_gen++; 1977 return (true); 1978 } 1979 1980 static void 1981 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1982 { 1983 pt_entry_t newl2, oldl2; 1984 vm_page_t ml3; 1985 vm_paddr_t ml3pa; 1986 1987 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1988 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1989 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1990 1991 ml3 = pmap_remove_pt_page(pmap, va); 1992 if (ml3 == NULL) 1993 panic("pmap_remove_kernel_l2: Missing pt page"); 1994 1995 ml3pa = VM_PAGE_TO_PHYS(ml3); 1996 newl2 = ml3pa | PTE_V; 1997 1998 /* 1999 * If this page table page was unmapped by a promotion, then it 2000 * contains valid mappings. Zero it to invalidate those mappings. 2001 */ 2002 if (ml3->valid != 0) 2003 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2004 2005 /* 2006 * Demote the mapping. 2007 */ 2008 oldl2 = pmap_load_store(l2, newl2); 2009 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2010 __func__, l2, oldl2)); 2011 } 2012 2013 /* 2014 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2015 */ 2016 static int 2017 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2018 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2019 { 2020 struct md_page *pvh; 2021 pt_entry_t oldl2; 2022 vm_offset_t eva, va; 2023 vm_page_t m, ml3; 2024 2025 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2026 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2027 oldl2 = pmap_load_clear(l2); 2028 KASSERT((oldl2 & PTE_RWX) != 0, 2029 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2030 2031 /* 2032 * The sfence.vma documentation states that it is sufficient to specify 2033 * a single address within a superpage mapping. However, since we do 2034 * not perform any invalidation upon promotion, TLBs may still be 2035 * caching 4KB mappings within the superpage, so we must invalidate the 2036 * entire range. 
2037 */ 2038 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2039 if ((oldl2 & PTE_SW_WIRED) != 0) 2040 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2041 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2042 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2043 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2044 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2045 pmap_pvh_free(pvh, pmap, sva); 2046 eva = sva + L2_SIZE; 2047 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2048 va < eva; va += PAGE_SIZE, m++) { 2049 if ((oldl2 & PTE_D) != 0) 2050 vm_page_dirty(m); 2051 if ((oldl2 & PTE_A) != 0) 2052 vm_page_aflag_set(m, PGA_REFERENCED); 2053 if (TAILQ_EMPTY(&m->md.pv_list) && 2054 TAILQ_EMPTY(&pvh->pv_list)) 2055 vm_page_aflag_clear(m, PGA_WRITEABLE); 2056 } 2057 } 2058 if (pmap == kernel_pmap) { 2059 pmap_remove_kernel_l2(pmap, l2, sva); 2060 } else { 2061 ml3 = pmap_remove_pt_page(pmap, sva); 2062 if (ml3 != NULL) { 2063 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2064 ("pmap_remove_l2: l3 page not promoted")); 2065 pmap_resident_count_dec(pmap, 1); 2066 KASSERT(ml3->ref_count == Ln_ENTRIES, 2067 ("pmap_remove_l2: l3 page ref count error")); 2068 ml3->ref_count = 1; 2069 vm_page_unwire_noq(ml3); 2070 pmap_add_delayed_free_list(ml3, free, FALSE); 2071 } 2072 } 2073 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2074 } 2075 2076 /* 2077 * pmap_remove_l3: do the things to unmap a page in a process 2078 */ 2079 static int 2080 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2081 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2082 { 2083 struct md_page *pvh; 2084 pt_entry_t old_l3; 2085 vm_paddr_t phys; 2086 vm_page_t m; 2087 2088 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2089 old_l3 = pmap_load_clear(l3); 2090 pmap_invalidate_page(pmap, va); 2091 if (old_l3 & PTE_SW_WIRED) 2092 pmap->pm_stats.wired_count -= 1; 2093 pmap_resident_count_dec(pmap, 1); 2094 if (old_l3 & PTE_SW_MANAGED) { 2095 phys = PTE_TO_PHYS(old_l3); 2096 m = PHYS_TO_VM_PAGE(phys); 2097 if ((old_l3 & PTE_D) != 0) 2098 vm_page_dirty(m); 2099 if (old_l3 & PTE_A) 2100 vm_page_aflag_set(m, PGA_REFERENCED); 2101 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2102 pmap_pvh_free(&m->md, pmap, va); 2103 if (TAILQ_EMPTY(&m->md.pv_list) && 2104 (m->flags & PG_FICTITIOUS) == 0) { 2105 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2106 if (TAILQ_EMPTY(&pvh->pv_list)) 2107 vm_page_aflag_clear(m, PGA_WRITEABLE); 2108 } 2109 } 2110 2111 return (pmap_unuse_pt(pmap, va, l2e, free)); 2112 } 2113 2114 /* 2115 * Remove the given range of addresses from the specified map. 2116 * 2117 * It is assumed that the start and end are properly 2118 * rounded to the page size. 2119 */ 2120 void 2121 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2122 { 2123 struct spglist free; 2124 struct rwlock *lock; 2125 vm_offset_t va, va_next; 2126 pd_entry_t *l1, *l2, l2e; 2127 pt_entry_t *l3; 2128 2129 /* 2130 * Perform an unsynchronized read. This is, however, safe. 2131 */ 2132 if (pmap->pm_stats.resident_count == 0) 2133 return; 2134 2135 SLIST_INIT(&free); 2136 2137 rw_rlock(&pvh_global_lock); 2138 PMAP_LOCK(pmap); 2139 2140 lock = NULL; 2141 for (; sva < eva; sva = va_next) { 2142 if (pmap->pm_stats.resident_count == 0) 2143 break; 2144 2145 l1 = pmap_l1(pmap, sva); 2146 if (pmap_load(l1) == 0) { 2147 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2148 if (va_next < sva) 2149 va_next = eva; 2150 continue; 2151 } 2152 2153 /* 2154 * Calculate index for next page table. 
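* That is, va_next becomes the start of the next 2MB region; if the addition wraps past the end of the address space, it is clamped to eva.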
2155 */ 2156 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2157 if (va_next < sva) 2158 va_next = eva; 2159 2160 l2 = pmap_l1_to_l2(l1, sva); 2161 if (l2 == NULL) 2162 continue; 2163 if ((l2e = pmap_load(l2)) == 0) 2164 continue; 2165 if ((l2e & PTE_RWX) != 0) { 2166 if (sva + L2_SIZE == va_next && eva >= va_next) { 2167 (void)pmap_remove_l2(pmap, l2, sva, 2168 pmap_load(l1), &free, &lock); 2169 continue; 2170 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2171 &lock)) { 2172 /* 2173 * The large page mapping was destroyed. 2174 */ 2175 continue; 2176 } 2177 l2e = pmap_load(l2); 2178 } 2179 2180 /* 2181 * Limit our scan to either the end of the va represented 2182 * by the current page table page, or to the end of the 2183 * range being removed. 2184 */ 2185 if (va_next > eva) 2186 va_next = eva; 2187 2188 va = va_next; 2189 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2190 sva += L3_SIZE) { 2191 if (pmap_load(l3) == 0) { 2192 if (va != va_next) { 2193 pmap_invalidate_range(pmap, va, sva); 2194 va = va_next; 2195 } 2196 continue; 2197 } 2198 if (va == va_next) 2199 va = sva; 2200 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2201 sva += L3_SIZE; 2202 break; 2203 } 2204 } 2205 if (va != va_next) 2206 pmap_invalidate_range(pmap, va, sva); 2207 } 2208 if (lock != NULL) 2209 rw_wunlock(lock); 2210 rw_runlock(&pvh_global_lock); 2211 PMAP_UNLOCK(pmap); 2212 vm_page_free_pages_toq(&free, false); 2213 } 2214 2215 /* 2216 * Routine: pmap_remove_all 2217 * Function: 2218 * Removes this physical page from 2219 * all physical maps in which it resides. 2220 * Reflects back modify bits to the pager. 2221 * 2222 * Notes: 2223 * Original versions of this routine were very 2224 * inefficient because they iteratively called 2225 * pmap_remove (slow...) 2226 */ 2227 2228 void 2229 pmap_remove_all(vm_page_t m) 2230 { 2231 struct spglist free; 2232 struct md_page *pvh; 2233 pmap_t pmap; 2234 pt_entry_t *l3, l3e; 2235 pd_entry_t *l2, l2e; 2236 pv_entry_t pv; 2237 vm_offset_t va; 2238 2239 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2240 ("pmap_remove_all: page %p is not managed", m)); 2241 SLIST_INIT(&free); 2242 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2243 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2244 2245 rw_wlock(&pvh_global_lock); 2246 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2247 pmap = PV_PMAP(pv); 2248 PMAP_LOCK(pmap); 2249 va = pv->pv_va; 2250 l2 = pmap_l2(pmap, va); 2251 (void)pmap_demote_l2(pmap, l2, va); 2252 PMAP_UNLOCK(pmap); 2253 } 2254 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2255 pmap = PV_PMAP(pv); 2256 PMAP_LOCK(pmap); 2257 pmap_resident_count_dec(pmap, 1); 2258 l2 = pmap_l2(pmap, pv->pv_va); 2259 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2260 l2e = pmap_load(l2); 2261 2262 KASSERT((l2e & PTE_RX) == 0, 2263 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2264 2265 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2266 l3e = pmap_load_clear(l3); 2267 pmap_invalidate_page(pmap, pv->pv_va); 2268 if (l3e & PTE_SW_WIRED) 2269 pmap->pm_stats.wired_count--; 2270 if ((l3e & PTE_A) != 0) 2271 vm_page_aflag_set(m, PGA_REFERENCED); 2272 2273 /* 2274 * Update the vm_page_t clean and reference bits. 
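* A set PTE_D means the page was written through this mapping, so record that in the machine-independent page before the mapping is destroyed.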
2275 */ 2276 if ((l3e & PTE_D) != 0) 2277 vm_page_dirty(m); 2278 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2279 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2280 m->md.pv_gen++; 2281 free_pv_entry(pmap, pv); 2282 PMAP_UNLOCK(pmap); 2283 } 2284 vm_page_aflag_clear(m, PGA_WRITEABLE); 2285 rw_wunlock(&pvh_global_lock); 2286 vm_page_free_pages_toq(&free, false); 2287 } 2288 2289 /* 2290 * Set the physical protection on the 2291 * specified range of this map as requested. 2292 */ 2293 void 2294 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2295 { 2296 pd_entry_t *l1, *l2, l2e; 2297 pt_entry_t *l3, l3e, mask; 2298 vm_page_t m, mt; 2299 vm_paddr_t pa; 2300 vm_offset_t va_next; 2301 bool anychanged, pv_lists_locked; 2302 2303 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2304 pmap_remove(pmap, sva, eva); 2305 return; 2306 } 2307 2308 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2309 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2310 return; 2311 2312 anychanged = false; 2313 pv_lists_locked = false; 2314 mask = 0; 2315 if ((prot & VM_PROT_WRITE) == 0) 2316 mask |= PTE_W | PTE_D; 2317 if ((prot & VM_PROT_EXECUTE) == 0) 2318 mask |= PTE_X; 2319 resume: 2320 PMAP_LOCK(pmap); 2321 for (; sva < eva; sva = va_next) { 2322 l1 = pmap_l1(pmap, sva); 2323 if (pmap_load(l1) == 0) { 2324 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2325 if (va_next < sva) 2326 va_next = eva; 2327 continue; 2328 } 2329 2330 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2331 if (va_next < sva) 2332 va_next = eva; 2333 2334 l2 = pmap_l1_to_l2(l1, sva); 2335 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2336 continue; 2337 if ((l2e & PTE_RWX) != 0) { 2338 if (sva + L2_SIZE == va_next && eva >= va_next) { 2339 retryl2: 2340 if ((prot & VM_PROT_WRITE) == 0 && 2341 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2342 (PTE_SW_MANAGED | PTE_D)) { 2343 pa = PTE_TO_PHYS(l2e); 2344 m = PHYS_TO_VM_PAGE(pa); 2345 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2346 vm_page_dirty(mt); 2347 } 2348 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2349 goto retryl2; 2350 anychanged = true; 2351 continue; 2352 } else { 2353 if (!pv_lists_locked) { 2354 pv_lists_locked = true; 2355 if (!rw_try_rlock(&pvh_global_lock)) { 2356 if (anychanged) 2357 pmap_invalidate_all( 2358 pmap); 2359 PMAP_UNLOCK(pmap); 2360 rw_rlock(&pvh_global_lock); 2361 goto resume; 2362 } 2363 } 2364 if (!pmap_demote_l2(pmap, l2, sva)) { 2365 /* 2366 * The large page mapping was destroyed. 
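* pmap_demote_l2() only fails after invalidating the 2MB mapping entirely, so there is nothing left at this range to protect.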
2367 */ 2368 continue; 2369 } 2370 } 2371 } 2372 2373 if (va_next > eva) 2374 va_next = eva; 2375 2376 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2377 sva += L3_SIZE) { 2378 l3e = pmap_load(l3); 2379 retryl3: 2380 if ((l3e & PTE_V) == 0) 2381 continue; 2382 if ((prot & VM_PROT_WRITE) == 0 && 2383 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2384 (PTE_SW_MANAGED | PTE_D)) { 2385 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2386 vm_page_dirty(m); 2387 } 2388 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2389 goto retryl3; 2390 anychanged = true; 2391 } 2392 } 2393 if (anychanged) 2394 pmap_invalidate_all(pmap); 2395 if (pv_lists_locked) 2396 rw_runlock(&pvh_global_lock); 2397 PMAP_UNLOCK(pmap); 2398 } 2399 2400 int 2401 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2402 { 2403 pd_entry_t *l2, l2e; 2404 pt_entry_t bits, *pte, oldpte; 2405 int rv; 2406 2407 rv = 0; 2408 PMAP_LOCK(pmap); 2409 l2 = pmap_l2(pmap, va); 2410 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2411 goto done; 2412 if ((l2e & PTE_RWX) == 0) { 2413 pte = pmap_l2_to_l3(l2, va); 2414 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2415 goto done; 2416 } else { 2417 pte = l2; 2418 oldpte = l2e; 2419 } 2420 2421 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2422 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2423 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2424 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2425 goto done; 2426 2427 bits = PTE_A; 2428 if (ftype == VM_PROT_WRITE) 2429 bits |= PTE_D; 2430 2431 /* 2432 * Spurious faults can occur if the implementation caches invalid 2433 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2434 * race with each other. 2435 */ 2436 if ((oldpte & bits) != bits) 2437 pmap_store_bits(pte, bits); 2438 sfence_vma(); 2439 rv = 1; 2440 done: 2441 PMAP_UNLOCK(pmap); 2442 return (rv); 2443 } 2444 2445 static bool 2446 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2447 { 2448 struct rwlock *lock; 2449 bool rv; 2450 2451 lock = NULL; 2452 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2453 if (lock != NULL) 2454 rw_wunlock(lock); 2455 return (rv); 2456 } 2457 2458 /* 2459 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2460 * mapping is invalidated. 2461 */ 2462 static bool 2463 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2464 struct rwlock **lockp) 2465 { 2466 struct spglist free; 2467 vm_page_t mpte; 2468 pd_entry_t newl2, oldl2; 2469 pt_entry_t *firstl3, newl3; 2470 vm_paddr_t mptepa; 2471 int i; 2472 2473 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2474 2475 oldl2 = pmap_load(l2); 2476 KASSERT((oldl2 & PTE_RWX) != 0, 2477 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2478 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2479 NULL) { 2480 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, 2481 pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 2482 VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == 2483 NULL) { 2484 SLIST_INIT(&free); 2485 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2486 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2487 vm_page_free_pages_toq(&free, true); 2488 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2489 "failure for va %#lx in pmap %p", va, pmap); 2490 return (false); 2491 } 2492 if (va < VM_MAXUSER_ADDRESS) { 2493 mpte->ref_count = Ln_ENTRIES; 2494 pmap_resident_count_inc(pmap, 1); 2495 } 2496 } 2497 mptepa = VM_PAGE_TO_PHYS(mpte); 2498 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2499 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2500 KASSERT((oldl2 & PTE_A) != 0, 2501 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2502 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2503 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2504 newl3 = oldl2; 2505 2506 /* 2507 * If the page table page is not leftover from an earlier promotion, 2508 * initialize it. 2509 */ 2510 if (mpte->valid == 0) { 2511 for (i = 0; i < Ln_ENTRIES; i++) 2512 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2513 } 2514 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2515 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2516 "addresses")); 2517 2518 /* 2519 * If the mapping has changed attributes, update the page table 2520 * entries. 2521 */ 2522 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2523 for (i = 0; i < Ln_ENTRIES; i++) 2524 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2525 2526 /* 2527 * The spare PV entries must be reserved prior to demoting the 2528 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2529 * state of the L2 entry and the PV lists will be inconsistent, which 2530 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2531 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2532 * expected PV entry for the 2MB page mapping that is being demoted. 2533 */ 2534 if ((oldl2 & PTE_SW_MANAGED) != 0) 2535 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2536 2537 /* 2538 * Demote the mapping. 2539 */ 2540 pmap_store(l2, newl2); 2541 2542 /* 2543 * Demote the PV entry. 2544 */ 2545 if ((oldl2 & PTE_SW_MANAGED) != 0) 2546 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2547 2548 atomic_add_long(&pmap_l2_demotions, 1); 2549 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2550 va, pmap); 2551 return (true); 2552 } 2553 2554 #if VM_NRESERVLEVEL > 0 2555 static void 2556 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2557 struct rwlock **lockp) 2558 { 2559 pt_entry_t *firstl3, firstl3e, *l3, l3e; 2560 vm_paddr_t pa; 2561 vm_page_t ml3; 2562 2563 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2564 2565 va &= ~L2_OFFSET; 2566 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2567 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2568 2569 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2570 firstl3e = pmap_load(firstl3); 2571 pa = PTE_TO_PHYS(firstl3e); 2572 if ((pa & L2_OFFSET) != 0) { 2573 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2574 va, pmap); 2575 atomic_add_long(&pmap_l2_p_failures, 1); 2576 return; 2577 } 2578 2579 /* 2580 * Downgrade a clean, writable mapping to read-only to ensure that the 2581 * hardware does not set PTE_D while we are comparing PTEs. 2582 * 2583 * Upon a write access to a clean mapping, the implementation will 2584 * either atomically check protections and set PTE_D, or raise a page 2585 * fault. 
In the latter case, the pmap lock provides atomicity. Thus, 2586 * we do not issue an sfence.vma here and instead rely on pmap_fault() 2587 * to do so lazily. 2588 */ 2589 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 2590 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 2591 firstl3e &= ~PTE_W; 2592 break; 2593 } 2594 } 2595 2596 pa += PAGE_SIZE; 2597 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2598 l3e = pmap_load(l3); 2599 if (PTE_TO_PHYS(l3e) != pa) { 2600 CTR2(KTR_PMAP, 2601 "pmap_promote_l2: failure for va %#lx pmap %p", 2602 va, pmap); 2603 atomic_add_long(&pmap_l2_p_failures, 1); 2604 return; 2605 } 2606 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 2607 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 2608 l3e &= ~PTE_W; 2609 break; 2610 } 2611 } 2612 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 2613 CTR2(KTR_PMAP, 2614 "pmap_promote_l2: failure for va %#lx pmap %p", 2615 va, pmap); 2616 atomic_add_long(&pmap_l2_p_failures, 1); 2617 return; 2618 } 2619 pa += PAGE_SIZE; 2620 } 2621 2622 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2623 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2624 ("pmap_promote_l2: page table page's pindex is wrong")); 2625 if (pmap_insert_pt_page(pmap, ml3, true)) { 2626 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2627 va, pmap); 2628 atomic_add_long(&pmap_l2_p_failures, 1); 2629 return; 2630 } 2631 2632 if ((firstl3e & PTE_SW_MANAGED) != 0) 2633 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 2634 2635 pmap_store(l2, firstl3e); 2636 2637 atomic_add_long(&pmap_l2_promotions, 1); 2638 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2639 pmap); 2640 } 2641 #endif 2642 2643 /* 2644 * Insert the given physical page (p) at 2645 * the specified virtual address (v) in the 2646 * target physical map with the protection requested. 2647 * 2648 * If specified, the page will be wired down, meaning 2649 * that the related pte can not be reclaimed. 2650 * 2651 * NB: This is the only routine which MAY NOT lazy-evaluate 2652 * or lose information. That is, this routine must actually 2653 * insert this page into the given map NOW. 2654 */ 2655 int 2656 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2657 u_int flags, int8_t psind) 2658 { 2659 struct rwlock *lock; 2660 pd_entry_t *l1, *l2, l2e; 2661 pt_entry_t new_l3, orig_l3; 2662 pt_entry_t *l3; 2663 pv_entry_t pv; 2664 vm_paddr_t opa, pa, l2_pa, l3_pa; 2665 vm_page_t mpte, om, l2_m, l3_m; 2666 pt_entry_t entry; 2667 pn_t l2_pn, l3_pn, pn; 2668 int rv; 2669 bool nosleep; 2670 2671 va = trunc_page(va); 2672 if ((m->oflags & VPO_UNMANAGED) == 0) 2673 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2674 pa = VM_PAGE_TO_PHYS(m); 2675 pn = (pa / PAGE_SIZE); 2676 2677 new_l3 = PTE_V | PTE_R | PTE_A; 2678 if (prot & VM_PROT_EXECUTE) 2679 new_l3 |= PTE_X; 2680 if (flags & VM_PROT_WRITE) 2681 new_l3 |= PTE_D; 2682 if (prot & VM_PROT_WRITE) 2683 new_l3 |= PTE_W; 2684 if (va < VM_MAX_USER_ADDRESS) 2685 new_l3 |= PTE_U; 2686 2687 new_l3 |= (pn << PTE_PPN0_S); 2688 if ((flags & PMAP_ENTER_WIRED) != 0) 2689 new_l3 |= PTE_SW_WIRED; 2690 2691 /* 2692 * Set modified bit gratuitously for writeable mappings if 2693 * the page is unmanaged. We do not want to take a fault 2694 * to do the dirty bit accounting for these mappings. 
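* Unmanaged pages have no PV entries, so there is no dirty accounting to defer, and taking a fault just to set PTE_D would be pure overhead.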
2695 */ 2696 if ((m->oflags & VPO_UNMANAGED) != 0) { 2697 if (prot & VM_PROT_WRITE) 2698 new_l3 |= PTE_D; 2699 } else 2700 new_l3 |= PTE_SW_MANAGED; 2701 2702 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2703 2704 lock = NULL; 2705 mpte = NULL; 2706 rw_rlock(&pvh_global_lock); 2707 PMAP_LOCK(pmap); 2708 if (psind == 1) { 2709 /* Assert the required virtual and physical alignment. */ 2710 KASSERT((va & L2_OFFSET) == 0, 2711 ("pmap_enter: va %#lx unaligned", va)); 2712 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2713 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2714 goto out; 2715 } 2716 2717 l2 = pmap_l2(pmap, va); 2718 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2719 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2720 va, &lock))) { 2721 l3 = pmap_l2_to_l3(l2, va); 2722 if (va < VM_MAXUSER_ADDRESS) { 2723 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2724 mpte->ref_count++; 2725 } 2726 } else if (va < VM_MAXUSER_ADDRESS) { 2727 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2728 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 2729 if (mpte == NULL && nosleep) { 2730 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2731 if (lock != NULL) 2732 rw_wunlock(lock); 2733 rw_runlock(&pvh_global_lock); 2734 PMAP_UNLOCK(pmap); 2735 return (KERN_RESOURCE_SHORTAGE); 2736 } 2737 l3 = pmap_l3(pmap, va); 2738 } else { 2739 l3 = pmap_l3(pmap, va); 2740 /* TODO: This is not optimal, but should mostly work */ 2741 if (l3 == NULL) { 2742 if (l2 == NULL) { 2743 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2744 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2745 VM_ALLOC_ZERO); 2746 if (l2_m == NULL) 2747 panic("pmap_enter: l2 pte_m == NULL"); 2748 if ((l2_m->flags & PG_ZERO) == 0) 2749 pmap_zero_page(l2_m); 2750 2751 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2752 l2_pn = (l2_pa / PAGE_SIZE); 2753 2754 l1 = pmap_l1(pmap, va); 2755 entry = (PTE_V); 2756 entry |= (l2_pn << PTE_PPN0_S); 2757 pmap_store(l1, entry); 2758 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2759 l2 = pmap_l1_to_l2(l1, va); 2760 } 2761 2762 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2763 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2764 if (l3_m == NULL) 2765 panic("pmap_enter: l3 pte_m == NULL"); 2766 if ((l3_m->flags & PG_ZERO) == 0) 2767 pmap_zero_page(l3_m); 2768 2769 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2770 l3_pn = (l3_pa / PAGE_SIZE); 2771 entry = (PTE_V); 2772 entry |= (l3_pn << PTE_PPN0_S); 2773 pmap_store(l2, entry); 2774 l3 = pmap_l2_to_l3(l2, va); 2775 } 2776 pmap_invalidate_page(pmap, va); 2777 } 2778 2779 orig_l3 = pmap_load(l3); 2780 opa = PTE_TO_PHYS(orig_l3); 2781 pv = NULL; 2782 2783 /* 2784 * Is the specified virtual address already mapped? 2785 */ 2786 if ((orig_l3 & PTE_V) != 0) { 2787 /* 2788 * Wiring change, just update stats. We don't worry about 2789 * wiring PT pages as they remain resident as long as there 2790 * are valid mappings in them. Hence, if a user page is wired, 2791 * the PT page will be also. 2792 */ 2793 if ((flags & PMAP_ENTER_WIRED) != 0 && 2794 (orig_l3 & PTE_SW_WIRED) == 0) 2795 pmap->pm_stats.wired_count++; 2796 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2797 (orig_l3 & PTE_SW_WIRED) != 0) 2798 pmap->pm_stats.wired_count--; 2799 2800 /* 2801 * Remove the extra PT page reference. 2802 */ 2803 if (mpte != NULL) { 2804 mpte->ref_count--; 2805 KASSERT(mpte->ref_count > 0, 2806 ("pmap_enter: missing reference to page table page," 2807 " va: 0x%lx", va)); 2808 } 2809 2810 /* 2811 * Has the physical page changed? 
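* If it has not, this is only a protection or wiring change and the existing PV entry, if any, remains valid.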
2812 */ 2813 if (opa == pa) { 2814 /* 2815 * No, might be a protection or wiring change. 2816 */ 2817 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2818 (new_l3 & PTE_W) != 0) 2819 vm_page_aflag_set(m, PGA_WRITEABLE); 2820 goto validate; 2821 } 2822 2823 /* 2824 * The physical page has changed. Temporarily invalidate 2825 * the mapping. This ensures that all threads sharing the 2826 * pmap keep a consistent view of the mapping, which is 2827 * necessary for the correct handling of COW faults. It 2828 * also permits reuse of the old mapping's PV entry, 2829 * avoiding an allocation. 2830 * 2831 * For consistency, handle unmanaged mappings the same way. 2832 */ 2833 orig_l3 = pmap_load_clear(l3); 2834 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2835 ("pmap_enter: unexpected pa update for %#lx", va)); 2836 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2837 om = PHYS_TO_VM_PAGE(opa); 2838 2839 /* 2840 * The pmap lock is sufficient to synchronize with 2841 * concurrent calls to pmap_page_test_mappings() and 2842 * pmap_ts_referenced(). 2843 */ 2844 if ((orig_l3 & PTE_D) != 0) 2845 vm_page_dirty(om); 2846 if ((orig_l3 & PTE_A) != 0) 2847 vm_page_aflag_set(om, PGA_REFERENCED); 2848 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2849 pv = pmap_pvh_remove(&om->md, pmap, va); 2850 KASSERT(pv != NULL, 2851 ("pmap_enter: no PV entry for %#lx", va)); 2852 if ((new_l3 & PTE_SW_MANAGED) == 0) 2853 free_pv_entry(pmap, pv); 2854 if ((om->a.flags & PGA_WRITEABLE) != 0 && 2855 TAILQ_EMPTY(&om->md.pv_list) && 2856 ((om->flags & PG_FICTITIOUS) != 0 || 2857 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 2858 vm_page_aflag_clear(om, PGA_WRITEABLE); 2859 } 2860 pmap_invalidate_page(pmap, va); 2861 orig_l3 = 0; 2862 } else { 2863 /* 2864 * Increment the counters. 2865 */ 2866 if ((new_l3 & PTE_SW_WIRED) != 0) 2867 pmap->pm_stats.wired_count++; 2868 pmap_resident_count_inc(pmap, 1); 2869 } 2870 /* 2871 * Enter on the PV list if part of our managed memory. 2872 */ 2873 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2874 if (pv == NULL) { 2875 pv = get_pv_entry(pmap, &lock); 2876 pv->pv_va = va; 2877 } 2878 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2879 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2880 m->md.pv_gen++; 2881 if ((new_l3 & PTE_W) != 0) 2882 vm_page_aflag_set(m, PGA_WRITEABLE); 2883 } 2884 2885 validate: 2886 /* 2887 * Sync the i-cache on all harts before updating the PTE 2888 * if the new PTE is executable. 2889 */ 2890 if (prot & VM_PROT_EXECUTE) 2891 pmap_sync_icache(pmap, va, PAGE_SIZE); 2892 2893 /* 2894 * Update the L3 entry. 2895 */ 2896 if (orig_l3 != 0) { 2897 orig_l3 = pmap_load_store(l3, new_l3); 2898 pmap_invalidate_page(pmap, va); 2899 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2900 ("pmap_enter: invalid update")); 2901 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2902 (PTE_D | PTE_SW_MANAGED)) 2903 vm_page_dirty(m); 2904 } else { 2905 pmap_store(l3, new_l3); 2906 } 2907 2908 #if VM_NRESERVLEVEL > 0 2909 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 2910 pmap_ps_enabled(pmap) && 2911 (m->flags & PG_FICTITIOUS) == 0 && 2912 vm_reserv_level_iffullpop(m) == 0) 2913 pmap_promote_l2(pmap, l2, va, &lock); 2914 #endif 2915 2916 rv = KERN_SUCCESS; 2917 out: 2918 if (lock != NULL) 2919 rw_wunlock(lock); 2920 rw_runlock(&pvh_global_lock); 2921 PMAP_UNLOCK(pmap); 2922 return (rv); 2923 } 2924 2925 /* 2926 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2927 * if successful. 
Returns false if (1) a page table page cannot be allocated 2928 * without sleeping, (2) a mapping already exists at the specified virtual 2929 * address, or (3) a PV entry cannot be allocated without reclaiming another 2930 * PV entry. 2931 */ 2932 static bool 2933 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2934 struct rwlock **lockp) 2935 { 2936 pd_entry_t new_l2; 2937 pn_t pn; 2938 2939 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2940 2941 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2942 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2943 if ((m->oflags & VPO_UNMANAGED) == 0) 2944 new_l2 |= PTE_SW_MANAGED; 2945 if ((prot & VM_PROT_EXECUTE) != 0) 2946 new_l2 |= PTE_X; 2947 if (va < VM_MAXUSER_ADDRESS) 2948 new_l2 |= PTE_U; 2949 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2950 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2951 KERN_SUCCESS); 2952 } 2953 2954 /* 2955 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2956 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2957 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2958 * a mapping already exists at the specified virtual address. Returns 2959 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2960 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2961 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2962 * 2963 * The parameter "m" is only used when creating a managed, writeable mapping. 2964 */ 2965 static int 2966 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2967 vm_page_t m, struct rwlock **lockp) 2968 { 2969 struct spglist free; 2970 pd_entry_t *l2, *l3, oldl2; 2971 vm_offset_t sva; 2972 vm_page_t l2pg, mt; 2973 2974 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2975 2976 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2977 NULL : lockp)) == NULL) { 2978 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2979 va, pmap); 2980 return (KERN_RESOURCE_SHORTAGE); 2981 } 2982 2983 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2984 l2 = &l2[pmap_l2_index(va)]; 2985 if ((oldl2 = pmap_load(l2)) != 0) { 2986 KASSERT(l2pg->ref_count > 1, 2987 ("pmap_enter_l2: l2pg's ref count is too low")); 2988 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2989 l2pg->ref_count--; 2990 CTR2(KTR_PMAP, 2991 "pmap_enter_l2: failure for va %#lx in pmap %p", 2992 va, pmap); 2993 return (KERN_FAILURE); 2994 } 2995 SLIST_INIT(&free); 2996 if ((oldl2 & PTE_RWX) != 0) 2997 (void)pmap_remove_l2(pmap, l2, va, 2998 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2999 else 3000 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3001 l3 = pmap_l2_to_l3(l2, sva); 3002 if ((pmap_load(l3) & PTE_V) != 0 && 3003 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3004 lockp) != 0) 3005 break; 3006 } 3007 vm_page_free_pages_toq(&free, true); 3008 if (va >= VM_MAXUSER_ADDRESS) { 3009 /* 3010 * Both pmap_remove_l2() and pmap_remove_l3() will 3011 * leave the kernel page table page zero filled. 3012 */ 3013 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 3014 if (pmap_insert_pt_page(pmap, mt, false)) 3015 panic("pmap_enter_l2: trie insert failed"); 3016 } else 3017 KASSERT(pmap_load(l2) == 0, 3018 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3019 } 3020 3021 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3022 /* 3023 * Abort this mapping if its PV entry could not be created. 
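* This can only happen when PMAP_ENTER_NORECLAIM is specified, since pmap_pv_insert_l2() otherwise reclaims a PV entry rather than failing.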
3024 */ 3025 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3026 SLIST_INIT(&free); 3027 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3028 /* 3029 * Although "va" is not mapped, paging-structure 3030 * caches could nonetheless have entries that 3031 * refer to the freed page table pages. 3032 * Invalidate those entries. 3033 */ 3034 pmap_invalidate_page(pmap, va); 3035 vm_page_free_pages_toq(&free, true); 3036 } 3037 CTR2(KTR_PMAP, 3038 "pmap_enter_l2: failure for va %#lx in pmap %p", 3039 va, pmap); 3040 return (KERN_RESOURCE_SHORTAGE); 3041 } 3042 if ((new_l2 & PTE_W) != 0) 3043 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3044 vm_page_aflag_set(mt, PGA_WRITEABLE); 3045 } 3046 3047 /* 3048 * Increment counters. 3049 */ 3050 if ((new_l2 & PTE_SW_WIRED) != 0) 3051 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3052 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3053 3054 /* 3055 * Map the superpage. 3056 */ 3057 pmap_store(l2, new_l2); 3058 3059 atomic_add_long(&pmap_l2_mappings, 1); 3060 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3061 va, pmap); 3062 3063 return (KERN_SUCCESS); 3064 } 3065 3066 /* 3067 * Maps a sequence of resident pages belonging to the same object. 3068 * The sequence begins with the given page m_start. This page is 3069 * mapped at the given virtual address start. Each subsequent page is 3070 * mapped at a virtual address that is offset from start by the same 3071 * amount as the page is offset from m_start within the object. The 3072 * last page in the sequence is the page with the largest offset from 3073 * m_start that can be mapped at a virtual address less than the given 3074 * virtual address end. Not every virtual page between start and end 3075 * is mapped; only those for which a resident page exists with the 3076 * corresponding offset from m_start are mapped. 3077 */ 3078 void 3079 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3080 vm_page_t m_start, vm_prot_t prot) 3081 { 3082 struct rwlock *lock; 3083 vm_offset_t va; 3084 vm_page_t m, mpte; 3085 vm_pindex_t diff, psize; 3086 3087 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3088 3089 psize = atop(end - start); 3090 mpte = NULL; 3091 m = m_start; 3092 lock = NULL; 3093 rw_rlock(&pvh_global_lock); 3094 PMAP_LOCK(pmap); 3095 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3096 va = start + ptoa(diff); 3097 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3098 m->psind == 1 && pmap_ps_enabled(pmap) && 3099 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3100 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3101 else 3102 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3103 &lock); 3104 m = TAILQ_NEXT(m, listq); 3105 } 3106 if (lock != NULL) 3107 rw_wunlock(lock); 3108 rw_runlock(&pvh_global_lock); 3109 PMAP_UNLOCK(pmap); 3110 } 3111 3112 /* 3113 * this code makes some *MAJOR* assumptions: 3114 * 1. Current pmap & pmap exists. 3115 * 2. Not wired. 3116 * 3. Read access. 3117 * 4. No page table pages. 3118 * but is *MUCH* faster than pmap_enter... 
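* Mappings created here are unwired and read-only (plus execute when requested), and a failure to allocate a page table page or PV entry is silently ignored.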
3119 */ 3120 3121 void 3122 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3123 { 3124 struct rwlock *lock; 3125 3126 lock = NULL; 3127 rw_rlock(&pvh_global_lock); 3128 PMAP_LOCK(pmap); 3129 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3130 if (lock != NULL) 3131 rw_wunlock(lock); 3132 rw_runlock(&pvh_global_lock); 3133 PMAP_UNLOCK(pmap); 3134 } 3135 3136 static vm_page_t 3137 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3138 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3139 { 3140 struct spglist free; 3141 vm_paddr_t phys; 3142 pd_entry_t *l2; 3143 pt_entry_t *l3, newl3; 3144 3145 KASSERT(!VA_IS_CLEANMAP(va) || 3146 (m->oflags & VPO_UNMANAGED) != 0, 3147 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3148 rw_assert(&pvh_global_lock, RA_LOCKED); 3149 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3150 3151 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3152 /* 3153 * In the case that a page table page is not 3154 * resident, we are creating it here. 3155 */ 3156 if (va < VM_MAXUSER_ADDRESS) { 3157 vm_pindex_t l2pindex; 3158 3159 /* 3160 * Calculate pagetable page index 3161 */ 3162 l2pindex = pmap_l2_pindex(va); 3163 if (mpte && (mpte->pindex == l2pindex)) { 3164 mpte->ref_count++; 3165 } else { 3166 /* 3167 * Get the l2 entry 3168 */ 3169 l2 = pmap_l2(pmap, va); 3170 3171 /* 3172 * If the page table page is mapped, we just increment 3173 * the hold count, and activate it. Otherwise, we 3174 * attempt to allocate a page table page. If this 3175 * attempt fails, we don't retry. Instead, we give up. 3176 */ 3177 if (l2 != NULL && pmap_load(l2) != 0) { 3178 phys = PTE_TO_PHYS(pmap_load(l2)); 3179 mpte = PHYS_TO_VM_PAGE(phys); 3180 mpte->ref_count++; 3181 } else { 3182 /* 3183 * Pass NULL instead of the PV list lock 3184 * pointer, because we don't intend to sleep. 3185 */ 3186 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3187 if (mpte == NULL) 3188 return (mpte); 3189 } 3190 } 3191 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3192 l3 = &l3[pmap_l3_index(va)]; 3193 } else { 3194 mpte = NULL; 3195 l3 = pmap_l3(kernel_pmap, va); 3196 } 3197 if (l3 == NULL) 3198 panic("pmap_enter_quick_locked: No l3"); 3199 if (pmap_load(l3) != 0) { 3200 if (mpte != NULL) { 3201 mpte->ref_count--; 3202 mpte = NULL; 3203 } 3204 return (mpte); 3205 } 3206 3207 /* 3208 * Enter on the PV list if part of our managed memory. 3209 */ 3210 if ((m->oflags & VPO_UNMANAGED) == 0 && 3211 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3212 if (mpte != NULL) { 3213 SLIST_INIT(&free); 3214 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3215 pmap_invalidate_page(pmap, va); 3216 vm_page_free_pages_toq(&free, false); 3217 } 3218 mpte = NULL; 3219 } 3220 return (mpte); 3221 } 3222 3223 /* 3224 * Increment counters 3225 */ 3226 pmap_resident_count_inc(pmap, 1); 3227 3228 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3229 PTE_V | PTE_R; 3230 if ((prot & VM_PROT_EXECUTE) != 0) 3231 newl3 |= PTE_X; 3232 if ((m->oflags & VPO_UNMANAGED) == 0) 3233 newl3 |= PTE_SW_MANAGED; 3234 if (va < VM_MAX_USER_ADDRESS) 3235 newl3 |= PTE_U; 3236 3237 /* 3238 * Sync the i-cache on all harts before updating the PTE 3239 * if the new PTE is executable. 
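* Otherwise a hart could fetch stale instructions through the newly visible executable mapping.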
3240 */ 3241 if (prot & VM_PROT_EXECUTE) 3242 pmap_sync_icache(pmap, va, PAGE_SIZE); 3243 3244 pmap_store(l3, newl3); 3245 3246 pmap_invalidate_page(pmap, va); 3247 return (mpte); 3248 } 3249 3250 /* 3251 * This code maps large physical mmap regions into the 3252 * processor address space. Note that some shortcuts 3253 * are taken, but the code works. 3254 */ 3255 void 3256 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3257 vm_pindex_t pindex, vm_size_t size) 3258 { 3259 3260 VM_OBJECT_ASSERT_WLOCKED(object); 3261 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3262 ("pmap_object_init_pt: non-device object")); 3263 } 3264 3265 /* 3266 * Clear the wired attribute from the mappings for the specified range of 3267 * addresses in the given pmap. Every valid mapping within that range 3268 * must have the wired attribute set. In contrast, invalid mappings 3269 * cannot have the wired attribute set, so they are ignored. 3270 * 3271 * The wired attribute of the page table entry is not a hardware feature, 3272 * so there is no need to invalidate any TLB entries. 3273 */ 3274 void 3275 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3276 { 3277 vm_offset_t va_next; 3278 pd_entry_t *l1, *l2, l2e; 3279 pt_entry_t *l3, l3e; 3280 bool pv_lists_locked; 3281 3282 pv_lists_locked = false; 3283 retry: 3284 PMAP_LOCK(pmap); 3285 for (; sva < eva; sva = va_next) { 3286 l1 = pmap_l1(pmap, sva); 3287 if (pmap_load(l1) == 0) { 3288 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3289 if (va_next < sva) 3290 va_next = eva; 3291 continue; 3292 } 3293 3294 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3295 if (va_next < sva) 3296 va_next = eva; 3297 3298 l2 = pmap_l1_to_l2(l1, sva); 3299 if ((l2e = pmap_load(l2)) == 0) 3300 continue; 3301 if ((l2e & PTE_RWX) != 0) { 3302 if (sva + L2_SIZE == va_next && eva >= va_next) { 3303 if ((l2e & PTE_SW_WIRED) == 0) 3304 panic("pmap_unwire: l2 %#jx is missing " 3305 "PTE_SW_WIRED", (uintmax_t)l2e); 3306 pmap_clear_bits(l2, PTE_SW_WIRED); 3307 continue; 3308 } else { 3309 if (!pv_lists_locked) { 3310 pv_lists_locked = true; 3311 if (!rw_try_rlock(&pvh_global_lock)) { 3312 PMAP_UNLOCK(pmap); 3313 rw_rlock(&pvh_global_lock); 3314 /* Repeat sva. */ 3315 goto retry; 3316 } 3317 } 3318 if (!pmap_demote_l2(pmap, l2, sva)) 3319 panic("pmap_unwire: demotion failed"); 3320 } 3321 } 3322 3323 if (va_next > eva) 3324 va_next = eva; 3325 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3326 sva += L3_SIZE) { 3327 if ((l3e = pmap_load(l3)) == 0) 3328 continue; 3329 if ((l3e & PTE_SW_WIRED) == 0) 3330 panic("pmap_unwire: l3 %#jx is missing " 3331 "PTE_SW_WIRED", (uintmax_t)l3e); 3332 3333 /* 3334 * PG_W must be cleared atomically. Although the pmap 3335 * lock synchronizes access to PG_W, another processor 3336 * could be setting PG_M and/or PG_A concurrently. 3337 */ 3338 pmap_clear_bits(l3, PTE_SW_WIRED); 3339 pmap->pm_stats.wired_count--; 3340 } 3341 } 3342 if (pv_lists_locked) 3343 rw_runlock(&pvh_global_lock); 3344 PMAP_UNLOCK(pmap); 3345 } 3346 3347 /* 3348 * Copy the range specified by src_addr/len 3349 * from the source map to the range dst_addr/len 3350 * in the destination map. 3351 * 3352 * This routine is only advisory and need not do anything. 
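* It is left empty here; the destination pmap will instead take page faults and build its mappings on demand.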
3353 */ 3354 3355 void 3356 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3357 vm_offset_t src_addr) 3358 { 3359 3360 } 3361 3362 /* 3363 * pmap_zero_page zeros the specified hardware page by mapping 3364 * the page into KVM and using bzero to clear its contents. 3365 */ 3366 void 3367 pmap_zero_page(vm_page_t m) 3368 { 3369 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3370 3371 pagezero((void *)va); 3372 } 3373 3374 /* 3375 * pmap_zero_page_area zeros the specified hardware page by mapping 3376 * the page into KVM and using bzero to clear its contents. 3377 * 3378 * off and size may not cover an area beyond a single hardware page. 3379 */ 3380 void 3381 pmap_zero_page_area(vm_page_t m, int off, int size) 3382 { 3383 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3384 3385 if (off == 0 && size == PAGE_SIZE) 3386 pagezero((void *)va); 3387 else 3388 bzero((char *)va + off, size); 3389 } 3390 3391 /* 3392 * pmap_copy_page copies the specified (machine independent) 3393 * page by mapping the page into virtual memory and using 3394 * bcopy to copy the page, one machine dependent page at a 3395 * time. 3396 */ 3397 void 3398 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3399 { 3400 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3401 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3402 3403 pagecopy((void *)src, (void *)dst); 3404 } 3405 3406 int unmapped_buf_allowed = 1; 3407 3408 void 3409 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3410 vm_offset_t b_offset, int xfersize) 3411 { 3412 void *a_cp, *b_cp; 3413 vm_page_t m_a, m_b; 3414 vm_paddr_t p_a, p_b; 3415 vm_offset_t a_pg_offset, b_pg_offset; 3416 int cnt; 3417 3418 while (xfersize > 0) { 3419 a_pg_offset = a_offset & PAGE_MASK; 3420 m_a = ma[a_offset >> PAGE_SHIFT]; 3421 p_a = m_a->phys_addr; 3422 b_pg_offset = b_offset & PAGE_MASK; 3423 m_b = mb[b_offset >> PAGE_SHIFT]; 3424 p_b = m_b->phys_addr; 3425 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3426 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3427 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3428 panic("!DMAP a %lx", p_a); 3429 } else { 3430 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3431 } 3432 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3433 panic("!DMAP b %lx", p_b); 3434 } else { 3435 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3436 } 3437 bcopy(a_cp, b_cp, cnt); 3438 a_offset += cnt; 3439 b_offset += cnt; 3440 xfersize -= cnt; 3441 } 3442 } 3443 3444 vm_offset_t 3445 pmap_quick_enter_page(vm_page_t m) 3446 { 3447 3448 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3449 } 3450 3451 void 3452 pmap_quick_remove_page(vm_offset_t addr) 3453 { 3454 } 3455 3456 /* 3457 * Returns true if the pmap's pv is one of the first 3458 * 16 pvs linked to from this page. This count may 3459 * be changed upwards or downwards in the future; it 3460 * is only necessary that true be returned for a small 3461 * subset of pmaps for proper page aging. 
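* Both the page's own PV list and, for a non-fictitious page, the PV list of its containing 2MB page are consulted, subject to the same 16-entry limit.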
3462 */ 3463 boolean_t 3464 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3465 { 3466 struct md_page *pvh; 3467 struct rwlock *lock; 3468 pv_entry_t pv; 3469 int loops = 0; 3470 boolean_t rv; 3471 3472 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3473 ("pmap_page_exists_quick: page %p is not managed", m)); 3474 rv = FALSE; 3475 rw_rlock(&pvh_global_lock); 3476 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3477 rw_rlock(lock); 3478 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3479 if (PV_PMAP(pv) == pmap) { 3480 rv = TRUE; 3481 break; 3482 } 3483 loops++; 3484 if (loops >= 16) 3485 break; 3486 } 3487 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3488 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3489 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3490 if (PV_PMAP(pv) == pmap) { 3491 rv = TRUE; 3492 break; 3493 } 3494 loops++; 3495 if (loops >= 16) 3496 break; 3497 } 3498 } 3499 rw_runlock(lock); 3500 rw_runlock(&pvh_global_lock); 3501 return (rv); 3502 } 3503 3504 /* 3505 * pmap_page_wired_mappings: 3506 * 3507 * Return the number of managed mappings to the given physical page 3508 * that are wired. 3509 */ 3510 int 3511 pmap_page_wired_mappings(vm_page_t m) 3512 { 3513 struct md_page *pvh; 3514 struct rwlock *lock; 3515 pmap_t pmap; 3516 pd_entry_t *l2; 3517 pt_entry_t *l3; 3518 pv_entry_t pv; 3519 int count, md_gen, pvh_gen; 3520 3521 if ((m->oflags & VPO_UNMANAGED) != 0) 3522 return (0); 3523 rw_rlock(&pvh_global_lock); 3524 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3525 rw_rlock(lock); 3526 restart: 3527 count = 0; 3528 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3529 pmap = PV_PMAP(pv); 3530 if (!PMAP_TRYLOCK(pmap)) { 3531 md_gen = m->md.pv_gen; 3532 rw_runlock(lock); 3533 PMAP_LOCK(pmap); 3534 rw_rlock(lock); 3535 if (md_gen != m->md.pv_gen) { 3536 PMAP_UNLOCK(pmap); 3537 goto restart; 3538 } 3539 } 3540 l2 = pmap_l2(pmap, pv->pv_va); 3541 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3542 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3543 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3544 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3545 count++; 3546 PMAP_UNLOCK(pmap); 3547 } 3548 if ((m->flags & PG_FICTITIOUS) == 0) { 3549 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3550 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3551 pmap = PV_PMAP(pv); 3552 if (!PMAP_TRYLOCK(pmap)) { 3553 md_gen = m->md.pv_gen; 3554 pvh_gen = pvh->pv_gen; 3555 rw_runlock(lock); 3556 PMAP_LOCK(pmap); 3557 rw_rlock(lock); 3558 if (md_gen != m->md.pv_gen || 3559 pvh_gen != pvh->pv_gen) { 3560 PMAP_UNLOCK(pmap); 3561 goto restart; 3562 } 3563 } 3564 l2 = pmap_l2(pmap, pv->pv_va); 3565 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3566 count++; 3567 PMAP_UNLOCK(pmap); 3568 } 3569 } 3570 rw_runlock(lock); 3571 rw_runlock(&pvh_global_lock); 3572 return (count); 3573 } 3574 3575 /* 3576 * Returns true if the given page is mapped individually or as part of 3577 * a 2mpage. Otherwise, returns false. 
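* Fictitious pages have no pvh entry, so only their own PV list is checked.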
3578 */ 3579 bool 3580 pmap_page_is_mapped(vm_page_t m) 3581 { 3582 struct rwlock *lock; 3583 bool rv; 3584 3585 if ((m->oflags & VPO_UNMANAGED) != 0) 3586 return (false); 3587 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3588 rw_rlock(lock); 3589 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3590 ((m->flags & PG_FICTITIOUS) == 0 && 3591 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3592 rw_runlock(lock); 3593 return (rv); 3594 } 3595 3596 static void 3597 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3598 struct spglist *free, bool superpage) 3599 { 3600 struct md_page *pvh; 3601 vm_page_t mpte, mt; 3602 3603 if (superpage) { 3604 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3605 pvh = pa_to_pvh(m->phys_addr); 3606 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3607 pvh->pv_gen++; 3608 if (TAILQ_EMPTY(&pvh->pv_list)) { 3609 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3610 if (TAILQ_EMPTY(&mt->md.pv_list) && 3611 (mt->a.flags & PGA_WRITEABLE) != 0) 3612 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3613 } 3614 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3615 if (mpte != NULL) { 3616 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3617 ("pmap_remove_pages: pte page not promoted")); 3618 pmap_resident_count_dec(pmap, 1); 3619 KASSERT(mpte->ref_count == Ln_ENTRIES, 3620 ("pmap_remove_pages: pte page ref count error")); 3621 mpte->ref_count = 0; 3622 pmap_add_delayed_free_list(mpte, free, FALSE); 3623 } 3624 } else { 3625 pmap_resident_count_dec(pmap, 1); 3626 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3627 m->md.pv_gen++; 3628 if (TAILQ_EMPTY(&m->md.pv_list) && 3629 (m->a.flags & PGA_WRITEABLE) != 0) { 3630 pvh = pa_to_pvh(m->phys_addr); 3631 if (TAILQ_EMPTY(&pvh->pv_list)) 3632 vm_page_aflag_clear(m, PGA_WRITEABLE); 3633 } 3634 } 3635 } 3636 3637 /* 3638 * Destroy all managed, non-wired mappings in the given user-space 3639 * pmap. This pmap cannot be active on any processor besides the 3640 * caller. 3641 * 3642 * This function cannot be applied to the kernel pmap. Moreover, it 3643 * is not intended for general use. It is only to be used during 3644 * process termination. Consequently, it can be implemented in ways 3645 * that make it faster than pmap_remove(). First, it can more quickly 3646 * destroy mappings by iterating over the pmap's collection of PV 3647 * entries, rather than searching the page table. Second, it doesn't 3648 * have to test and clear the page table entries atomically, because 3649 * no processor is currently accessing the user address space. In 3650 * particular, a page table entry's dirty bit won't change state once 3651 * this function starts. 
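* Wired mappings are left untouched by this function; any PV chunk that still holds one is kept rather than freed.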
3652 */ 3653 void 3654 pmap_remove_pages(pmap_t pmap) 3655 { 3656 struct spglist free; 3657 pd_entry_t ptepde; 3658 pt_entry_t *pte, tpte; 3659 vm_page_t m, mt; 3660 pv_entry_t pv; 3661 struct pv_chunk *pc, *npc; 3662 struct rwlock *lock; 3663 int64_t bit; 3664 uint64_t inuse, bitmask; 3665 int allfree, field, freed, idx; 3666 bool superpage; 3667 3668 lock = NULL; 3669 3670 SLIST_INIT(&free); 3671 rw_rlock(&pvh_global_lock); 3672 PMAP_LOCK(pmap); 3673 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3674 allfree = 1; 3675 freed = 0; 3676 for (field = 0; field < _NPCM; field++) { 3677 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3678 while (inuse != 0) { 3679 bit = ffsl(inuse) - 1; 3680 bitmask = 1UL << bit; 3681 idx = field * 64 + bit; 3682 pv = &pc->pc_pventry[idx]; 3683 inuse &= ~bitmask; 3684 3685 pte = pmap_l1(pmap, pv->pv_va); 3686 ptepde = pmap_load(pte); 3687 pte = pmap_l1_to_l2(pte, pv->pv_va); 3688 tpte = pmap_load(pte); 3689 if ((tpte & PTE_RWX) != 0) { 3690 superpage = true; 3691 } else { 3692 ptepde = tpte; 3693 pte = pmap_l2_to_l3(pte, pv->pv_va); 3694 tpte = pmap_load(pte); 3695 superpage = false; 3696 } 3697 3698 /* 3699 * We cannot remove wired pages from a 3700 * process' mapping at this time. 3701 */ 3702 if (tpte & PTE_SW_WIRED) { 3703 allfree = 0; 3704 continue; 3705 } 3706 3707 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3708 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3709 m < &vm_page_array[vm_page_array_size], 3710 ("pmap_remove_pages: bad pte %#jx", 3711 (uintmax_t)tpte)); 3712 3713 pmap_clear(pte); 3714 3715 /* 3716 * Update the vm_page_t clean/reference bits. 3717 */ 3718 if ((tpte & (PTE_D | PTE_W)) == 3719 (PTE_D | PTE_W)) { 3720 if (superpage) 3721 for (mt = m; 3722 mt < &m[Ln_ENTRIES]; mt++) 3723 vm_page_dirty(mt); 3724 else 3725 vm_page_dirty(m); 3726 } 3727 3728 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3729 3730 /* Mark free */ 3731 pc->pc_map[field] |= bitmask; 3732 3733 pmap_remove_pages_pv(pmap, m, pv, &free, 3734 superpage); 3735 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3736 freed++; 3737 } 3738 } 3739 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3740 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3741 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3742 if (allfree) { 3743 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3744 free_pv_chunk(pc); 3745 } 3746 } 3747 if (lock != NULL) 3748 rw_wunlock(lock); 3749 pmap_invalidate_all(pmap); 3750 rw_runlock(&pvh_global_lock); 3751 PMAP_UNLOCK(pmap); 3752 vm_page_free_pages_toq(&free, false); 3753 } 3754 3755 static bool 3756 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3757 { 3758 struct md_page *pvh; 3759 struct rwlock *lock; 3760 pd_entry_t *l2; 3761 pt_entry_t *l3, mask; 3762 pv_entry_t pv; 3763 pmap_t pmap; 3764 int md_gen, pvh_gen; 3765 bool rv; 3766 3767 mask = 0; 3768 if (modified) 3769 mask |= PTE_D; 3770 if (accessed) 3771 mask |= PTE_A; 3772 3773 rv = FALSE; 3774 rw_rlock(&pvh_global_lock); 3775 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3776 rw_rlock(lock); 3777 restart: 3778 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3779 pmap = PV_PMAP(pv); 3780 if (!PMAP_TRYLOCK(pmap)) { 3781 md_gen = m->md.pv_gen; 3782 rw_runlock(lock); 3783 PMAP_LOCK(pmap); 3784 rw_rlock(lock); 3785 if (md_gen != m->md.pv_gen) { 3786 PMAP_UNLOCK(pmap); 3787 goto restart; 3788 } 3789 } 3790 l2 = pmap_l2(pmap, pv->pv_va); 3791 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3792 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3793 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3794 rv 
= (pmap_load(l3) & mask) == mask; 3795 PMAP_UNLOCK(pmap); 3796 if (rv) 3797 goto out; 3798 } 3799 if ((m->flags & PG_FICTITIOUS) == 0) { 3800 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3801 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3802 pmap = PV_PMAP(pv); 3803 if (!PMAP_TRYLOCK(pmap)) { 3804 md_gen = m->md.pv_gen; 3805 pvh_gen = pvh->pv_gen; 3806 rw_runlock(lock); 3807 PMAP_LOCK(pmap); 3808 rw_rlock(lock); 3809 if (md_gen != m->md.pv_gen || 3810 pvh_gen != pvh->pv_gen) { 3811 PMAP_UNLOCK(pmap); 3812 goto restart; 3813 } 3814 } 3815 l2 = pmap_l2(pmap, pv->pv_va); 3816 rv = (pmap_load(l2) & mask) == mask; 3817 PMAP_UNLOCK(pmap); 3818 if (rv) 3819 goto out; 3820 } 3821 } 3822 out: 3823 rw_runlock(lock); 3824 rw_runlock(&pvh_global_lock); 3825 return (rv); 3826 } 3827 3828 /* 3829 * pmap_is_modified: 3830 * 3831 * Return whether or not the specified physical page was modified 3832 * in any physical maps. 3833 */ 3834 boolean_t 3835 pmap_is_modified(vm_page_t m) 3836 { 3837 3838 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3839 ("pmap_is_modified: page %p is not managed", m)); 3840 3841 /* 3842 * If the page is not busied then this check is racy. 3843 */ 3844 if (!pmap_page_is_write_mapped(m)) 3845 return (FALSE); 3846 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3847 } 3848 3849 /* 3850 * pmap_is_prefaultable: 3851 * 3852 * Return whether or not the specified virtual address is eligible 3853 * for prefault. 3854 */ 3855 boolean_t 3856 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3857 { 3858 pt_entry_t *l3; 3859 boolean_t rv; 3860 3861 rv = FALSE; 3862 PMAP_LOCK(pmap); 3863 l3 = pmap_l3(pmap, addr); 3864 if (l3 != NULL && pmap_load(l3) != 0) { 3865 rv = TRUE; 3866 } 3867 PMAP_UNLOCK(pmap); 3868 return (rv); 3869 } 3870 3871 /* 3872 * pmap_is_referenced: 3873 * 3874 * Return whether or not the specified physical page was referenced 3875 * in any physical maps. 3876 */ 3877 boolean_t 3878 pmap_is_referenced(vm_page_t m) 3879 { 3880 3881 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3882 ("pmap_is_referenced: page %p is not managed", m)); 3883 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3884 } 3885 3886 /* 3887 * Clear the write and modified bits in each of the given page's mappings. 3888 */ 3889 void 3890 pmap_remove_write(vm_page_t m) 3891 { 3892 struct md_page *pvh; 3893 struct rwlock *lock; 3894 pmap_t pmap; 3895 pd_entry_t *l2; 3896 pt_entry_t *l3, oldl3, newl3; 3897 pv_entry_t next_pv, pv; 3898 vm_offset_t va; 3899 int md_gen, pvh_gen; 3900 3901 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3902 ("pmap_remove_write: page %p is not managed", m)); 3903 vm_page_assert_busied(m); 3904 3905 if (!pmap_page_is_write_mapped(m)) 3906 return; 3907 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3908 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 3909 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3910 rw_rlock(&pvh_global_lock); 3911 retry_pv_loop: 3912 rw_wlock(lock); 3913 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3914 pmap = PV_PMAP(pv); 3915 if (!PMAP_TRYLOCK(pmap)) { 3916 pvh_gen = pvh->pv_gen; 3917 rw_wunlock(lock); 3918 PMAP_LOCK(pmap); 3919 rw_wlock(lock); 3920 if (pvh_gen != pvh->pv_gen) { 3921 PMAP_UNLOCK(pmap); 3922 rw_wunlock(lock); 3923 goto retry_pv_loop; 3924 } 3925 } 3926 va = pv->pv_va; 3927 l2 = pmap_l2(pmap, va); 3928 if ((pmap_load(l2) & PTE_W) != 0) 3929 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3930 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3931 ("inconsistent pv lock %p %p for page %p", 3932 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3933 PMAP_UNLOCK(pmap); 3934 } 3935 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3936 pmap = PV_PMAP(pv); 3937 if (!PMAP_TRYLOCK(pmap)) { 3938 pvh_gen = pvh->pv_gen; 3939 md_gen = m->md.pv_gen; 3940 rw_wunlock(lock); 3941 PMAP_LOCK(pmap); 3942 rw_wlock(lock); 3943 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3944 PMAP_UNLOCK(pmap); 3945 rw_wunlock(lock); 3946 goto retry_pv_loop; 3947 } 3948 } 3949 l2 = pmap_l2(pmap, pv->pv_va); 3950 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3951 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3952 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3953 oldl3 = pmap_load(l3); 3954 retry: 3955 if ((oldl3 & PTE_W) != 0) { 3956 newl3 = oldl3 & ~(PTE_D | PTE_W); 3957 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3958 goto retry; 3959 if ((oldl3 & PTE_D) != 0) 3960 vm_page_dirty(m); 3961 pmap_invalidate_page(pmap, pv->pv_va); 3962 } 3963 PMAP_UNLOCK(pmap); 3964 } 3965 rw_wunlock(lock); 3966 vm_page_aflag_clear(m, PGA_WRITEABLE); 3967 rw_runlock(&pvh_global_lock); 3968 } 3969 3970 /* 3971 * pmap_ts_referenced: 3972 * 3973 * Return a count of reference bits for a page, clearing those bits. 3974 * It is not necessary for every reference bit to be cleared, but it 3975 * is necessary that 0 only be returned when there are truly no 3976 * reference bits set. 3977 * 3978 * As an optimization, update the page's dirty field if a modified bit is 3979 * found while counting reference bits. This opportunistic update can be 3980 * performed at low cost and can eliminate the need for some future calls 3981 * to pmap_is_modified(). However, since this function stops after 3982 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3983 * dirty pages. Those dirty pages will only be detected by a future call 3984 * to pmap_is_modified(). 3985 */ 3986 int 3987 pmap_ts_referenced(vm_page_t m) 3988 { 3989 struct spglist free; 3990 struct md_page *pvh; 3991 struct rwlock *lock; 3992 pv_entry_t pv, pvf; 3993 pmap_t pmap; 3994 pd_entry_t *l2, l2e; 3995 pt_entry_t *l3, l3e; 3996 vm_paddr_t pa; 3997 vm_offset_t va; 3998 int cleared, md_gen, not_cleared, pvh_gen; 3999 4000 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4001 ("pmap_ts_referenced: page %p is not managed", m)); 4002 SLIST_INIT(&free); 4003 cleared = 0; 4004 pa = VM_PAGE_TO_PHYS(m); 4005 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 4006 4007 lock = PHYS_TO_PV_LIST_LOCK(pa); 4008 rw_rlock(&pvh_global_lock); 4009 rw_wlock(lock); 4010 retry: 4011 not_cleared = 0; 4012 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4013 goto small_mappings; 4014 pv = pvf; 4015 do { 4016 pmap = PV_PMAP(pv); 4017 if (!PMAP_TRYLOCK(pmap)) { 4018 pvh_gen = pvh->pv_gen; 4019 rw_wunlock(lock); 4020 PMAP_LOCK(pmap); 4021 rw_wlock(lock); 4022 if (pvh_gen != pvh->pv_gen) { 4023 PMAP_UNLOCK(pmap); 4024 goto retry; 4025 } 4026 } 4027 va = pv->pv_va; 4028 l2 = pmap_l2(pmap, va); 4029 l2e = pmap_load(l2); 4030 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4031 /* 4032 * Although l2e is mapping a 2MB page, because 4033 * this function is called at a 4KB page granularity, 4034 * we only update the 4KB page under test. 4035 */ 4036 vm_page_dirty(m); 4037 } 4038 if ((l2e & PTE_A) != 0) { 4039 /* 4040 * Since this reference bit is shared by 512 4KB 4041 * pages, it should not be cleared every time it is 4042 * tested. Apply a simple "hash" function on the 4043 * physical page number, the virtual superpage number, 4044 * and the pmap address to select one 4KB page out of 4045 * the 512 on which testing the reference bit will 4046 * result in clearing that reference bit. This 4047 * function is designed to avoid the selection of the 4048 * same 4KB page for every 2MB page mapping. 4049 * 4050 * On demotion, a mapping that hasn't been referenced 4051 * is simply destroyed. To avoid the possibility of a 4052 * subsequent page fault on a demoted wired mapping, 4053 * always leave its reference bit set. Moreover, 4054 * since the superpage is wired, the current state of 4055 * its reference bit won't affect page replacement. 4056 */ 4057 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4058 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4059 (l2e & PTE_SW_WIRED) == 0) { 4060 pmap_clear_bits(l2, PTE_A); 4061 pmap_invalidate_page(pmap, va); 4062 cleared++; 4063 } else 4064 not_cleared++; 4065 } 4066 PMAP_UNLOCK(pmap); 4067 /* Rotate the PV list if it has more than one entry. */ 4068 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4069 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4070 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4071 pvh->pv_gen++; 4072 } 4073 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4074 goto out; 4075 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4076 small_mappings: 4077 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4078 goto out; 4079 pv = pvf; 4080 do { 4081 pmap = PV_PMAP(pv); 4082 if (!PMAP_TRYLOCK(pmap)) { 4083 pvh_gen = pvh->pv_gen; 4084 md_gen = m->md.pv_gen; 4085 rw_wunlock(lock); 4086 PMAP_LOCK(pmap); 4087 rw_wlock(lock); 4088 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4089 PMAP_UNLOCK(pmap); 4090 goto retry; 4091 } 4092 } 4093 l2 = pmap_l2(pmap, pv->pv_va); 4094 4095 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4096 ("pmap_ts_referenced: found an invalid l2 table")); 4097 4098 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4099 l3e = pmap_load(l3); 4100 if ((l3e & PTE_D) != 0) 4101 vm_page_dirty(m); 4102 if ((l3e & PTE_A) != 0) { 4103 if ((l3e & PTE_SW_WIRED) == 0) { 4104 /* 4105 * Wired pages cannot be paged out so 4106 * doing accessed bit emulation for 4107 * them is wasted effort. We do the 4108 * hard work for unwired pages only. 4109 */ 4110 pmap_clear_bits(l3, PTE_A); 4111 pmap_invalidate_page(pmap, pv->pv_va); 4112 cleared++; 4113 } else 4114 not_cleared++; 4115 } 4116 PMAP_UNLOCK(pmap); 4117 /* Rotate the PV list if it has more than one entry. 
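 * Moving the just-examined entry to the tail makes the scan roughly
 * round-robin: if the loop is cut short by PMAP_TS_REFERENCED_MAX, a
 * later call will start with the mappings that were not examined here.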
*/ 4118 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4119 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4120 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4121 m->md.pv_gen++; 4122 } 4123 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4124 not_cleared < PMAP_TS_REFERENCED_MAX); 4125 out: 4126 rw_wunlock(lock); 4127 rw_runlock(&pvh_global_lock); 4128 vm_page_free_pages_toq(&free, false); 4129 return (cleared + not_cleared); 4130 } 4131 4132 /* 4133 * Apply the given advice to the specified range of addresses within the 4134 * given pmap. Depending on the advice, clear the referenced and/or 4135 * modified flags in each mapping and set the mapped page's dirty field. 4136 */ 4137 void 4138 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4139 { 4140 } 4141 4142 /* 4143 * Clear the modify bits on the specified physical page. 4144 */ 4145 void 4146 pmap_clear_modify(vm_page_t m) 4147 { 4148 struct md_page *pvh; 4149 struct rwlock *lock; 4150 pmap_t pmap; 4151 pv_entry_t next_pv, pv; 4152 pd_entry_t *l2, oldl2; 4153 pt_entry_t *l3; 4154 vm_offset_t va; 4155 int md_gen, pvh_gen; 4156 4157 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4158 ("pmap_clear_modify: page %p is not managed", m)); 4159 vm_page_assert_busied(m); 4160 4161 if (!pmap_page_is_write_mapped(m)) 4162 return; 4163 4164 /* 4165 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4166 * If the object containing the page is locked and the page is not 4167 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4168 */ 4169 if ((m->a.flags & PGA_WRITEABLE) == 0) 4170 return; 4171 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4172 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4173 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4174 rw_rlock(&pvh_global_lock); 4175 rw_wlock(lock); 4176 restart: 4177 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4178 pmap = PV_PMAP(pv); 4179 if (!PMAP_TRYLOCK(pmap)) { 4180 pvh_gen = pvh->pv_gen; 4181 rw_wunlock(lock); 4182 PMAP_LOCK(pmap); 4183 rw_wlock(lock); 4184 if (pvh_gen != pvh->pv_gen) { 4185 PMAP_UNLOCK(pmap); 4186 goto restart; 4187 } 4188 } 4189 va = pv->pv_va; 4190 l2 = pmap_l2(pmap, va); 4191 oldl2 = pmap_load(l2); 4192 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4193 if ((oldl2 & PTE_W) != 0 && 4194 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4195 (oldl2 & PTE_SW_WIRED) == 0) { 4196 /* 4197 * Write protect the mapping to a single page so that 4198 * a subsequent write access may repromote. 
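 * At this point "va" is still the base of the former 2MB mapping;
 * adding the offset of page "m" within that superpage below yields the
 * address of the single 4KB mapping of "m", the only L3 entry changed.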
4199 */ 4200 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4201 l3 = pmap_l2_to_l3(l2, va); 4202 pmap_clear_bits(l3, PTE_D | PTE_W); 4203 vm_page_dirty(m); 4204 pmap_invalidate_page(pmap, va); 4205 } 4206 PMAP_UNLOCK(pmap); 4207 } 4208 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4209 pmap = PV_PMAP(pv); 4210 if (!PMAP_TRYLOCK(pmap)) { 4211 md_gen = m->md.pv_gen; 4212 pvh_gen = pvh->pv_gen; 4213 rw_wunlock(lock); 4214 PMAP_LOCK(pmap); 4215 rw_wlock(lock); 4216 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4217 PMAP_UNLOCK(pmap); 4218 goto restart; 4219 } 4220 } 4221 l2 = pmap_l2(pmap, pv->pv_va); 4222 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4223 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4224 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4225 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4226 pmap_clear_bits(l3, PTE_D | PTE_W); 4227 pmap_invalidate_page(pmap, pv->pv_va); 4228 } 4229 PMAP_UNLOCK(pmap); 4230 } 4231 rw_wunlock(lock); 4232 rw_runlock(&pvh_global_lock); 4233 } 4234 4235 void * 4236 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4237 { 4238 4239 return ((void *)PHYS_TO_DMAP(pa)); 4240 } 4241 4242 void 4243 pmap_unmapbios(vm_paddr_t pa, vm_size_t size) 4244 { 4245 } 4246 4247 /* 4248 * Sets the memory attribute for the specified page. 4249 */ 4250 void 4251 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4252 { 4253 4254 m->md.pv_memattr = ma; 4255 } 4256 4257 /* 4258 * Perform the pmap work for mincore(2). If the page is not both referenced and 4259 * modified by this pmap, returns its physical address so that the caller can 4260 * find other mappings. 4261 */ 4262 int 4263 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4264 { 4265 pt_entry_t *l2, *l3, tpte; 4266 vm_paddr_t pa; 4267 int val; 4268 bool managed; 4269 4270 PMAP_LOCK(pmap); 4271 l2 = pmap_l2(pmap, addr); 4272 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4273 if ((tpte & PTE_RWX) != 0) { 4274 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4275 val = MINCORE_INCORE | MINCORE_PSIND(1); 4276 } else { 4277 l3 = pmap_l2_to_l3(l2, addr); 4278 tpte = pmap_load(l3); 4279 if ((tpte & PTE_V) == 0) { 4280 PMAP_UNLOCK(pmap); 4281 return (0); 4282 } 4283 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4284 val = MINCORE_INCORE; 4285 } 4286 4287 if ((tpte & PTE_D) != 0) 4288 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4289 if ((tpte & PTE_A) != 0) 4290 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4291 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4292 } else { 4293 managed = false; 4294 val = 0; 4295 } 4296 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4297 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4298 *pap = pa; 4299 } 4300 PMAP_UNLOCK(pmap); 4301 return (val); 4302 } 4303 4304 void 4305 pmap_activate_sw(struct thread *td) 4306 { 4307 pmap_t oldpmap, pmap; 4308 u_int hart; 4309 4310 oldpmap = PCPU_GET(curpmap); 4311 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4312 if (pmap == oldpmap) 4313 return; 4314 load_satp(pmap->pm_satp); 4315 4316 hart = PCPU_GET(hart); 4317 #ifdef SMP 4318 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4319 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4320 #else 4321 CPU_SET(hart, &pmap->pm_active); 4322 CPU_CLR(hart, &oldpmap->pm_active); 4323 #endif 4324 PCPU_SET(curpmap, pmap); 4325 4326 sfence_vma(); 4327 } 4328 4329 void 4330 pmap_activate(struct thread *td) 4331 { 4332 4333 critical_enter(); 4334 pmap_activate_sw(td); 4335 critical_exit(); 4336 } 4337 4338 void 4339 
pmap_activate_boot(pmap_t pmap) 4340 { 4341 u_int hart; 4342 4343 hart = PCPU_GET(hart); 4344 #ifdef SMP 4345 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4346 #else 4347 CPU_SET(hart, &pmap->pm_active); 4348 #endif 4349 PCPU_SET(curpmap, pmap); 4350 } 4351 4352 void 4353 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4354 { 4355 cpuset_t mask; 4356 4357 /* 4358 * From the RISC-V User-Level ISA V2.2: 4359 * 4360 * "To make a store to instruction memory visible to all 4361 * RISC-V harts, the writing hart has to execute a data FENCE 4362 * before requesting that all remote RISC-V harts execute a 4363 * FENCE.I." 4364 * 4365 * However, this is slightly misleading; we still need to 4366 * perform a FENCE.I for the local hart, as FENCE does nothing 4367 * for its icache. FENCE.I alone is also sufficient for the 4368 * local hart. 4369 */ 4370 sched_pin(); 4371 mask = all_harts; 4372 CPU_CLR(PCPU_GET(hart), &mask); 4373 fence_i(); 4374 if (!CPU_EMPTY(&mask) && smp_started) { 4375 fence(); 4376 sbi_remote_fence_i(mask.__bits); 4377 } 4378 sched_unpin(); 4379 } 4380 4381 /* 4382 * Increase the starting virtual address of the given mapping if a 4383 * different alignment might result in more superpage mappings. 4384 */ 4385 void 4386 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4387 vm_offset_t *addr, vm_size_t size) 4388 { 4389 vm_offset_t superpage_offset; 4390 4391 if (size < L2_SIZE) 4392 return; 4393 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4394 offset += ptoa(object->pg_color); 4395 superpage_offset = offset & L2_OFFSET; 4396 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4397 (*addr & L2_OFFSET) == superpage_offset) 4398 return; 4399 if ((*addr & L2_OFFSET) < superpage_offset) 4400 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4401 else 4402 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4403 } 4404 4405 /** 4406 * Get the kernel virtual address of a set of physical pages. If there are 4407 * physical addresses not covered by the DMAP perform a transient mapping 4408 * that will be removed when calling pmap_unmap_io_transient. 4409 * 4410 * \param page The pages the caller wishes to obtain the virtual 4411 * address on the kernel memory map. 4412 * \param vaddr On return contains the kernel virtual memory address 4413 * of the pages passed in the page parameter. 4414 * \param count Number of pages passed in. 4415 * \param can_fault TRUE if the thread using the mapped pages can take 4416 * page faults, FALSE otherwise. 4417 * 4418 * \returns TRUE if the caller must call pmap_unmap_io_transient when 4419 * finished or FALSE otherwise. 4420 * 4421 */ 4422 boolean_t 4423 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4424 boolean_t can_fault) 4425 { 4426 vm_paddr_t paddr; 4427 boolean_t needs_mapping; 4428 int error, i; 4429 4430 /* 4431 * Allocate any KVA space that we need, this is done in a separate 4432 * loop to prevent calling vmem_alloc while pinned. 
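 * (vmem_alloc() is called with M_WAITOK and may sleep, something we
 * would rather avoid once sched_pin() has been called below.)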
4433 */ 4434 needs_mapping = FALSE; 4435 for (i = 0; i < count; i++) { 4436 paddr = VM_PAGE_TO_PHYS(page[i]); 4437 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4438 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4439 M_BESTFIT | M_WAITOK, &vaddr[i]); 4440 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4441 needs_mapping = TRUE; 4442 } else { 4443 vaddr[i] = PHYS_TO_DMAP(paddr); 4444 } 4445 } 4446 4447 /* Exit early if everything is covered by the DMAP */ 4448 if (!needs_mapping) 4449 return (FALSE); 4450 4451 if (!can_fault) 4452 sched_pin(); 4453 for (i = 0; i < count; i++) { 4454 paddr = VM_PAGE_TO_PHYS(page[i]); 4455 if (paddr >= DMAP_MAX_PHYSADDR) { 4456 panic( 4457 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4458 } 4459 } 4460 4461 return (needs_mapping); 4462 } 4463 4464 void 4465 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4466 boolean_t can_fault) 4467 { 4468 vm_paddr_t paddr; 4469 int i; 4470 4471 if (!can_fault) 4472 sched_unpin(); 4473 for (i = 0; i < count; i++) { 4474 paddr = VM_PAGE_TO_PHYS(page[i]); 4475 if (paddr >= DMAP_MAX_PHYSADDR) { 4476 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4477 } 4478 } 4479 } 4480 4481 boolean_t 4482 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4483 { 4484 4485 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4486 } 4487 4488 bool 4489 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4490 pt_entry_t **l3) 4491 { 4492 pd_entry_t *l1p, *l2p; 4493 4494 /* Get l1 directory entry. */ 4495 l1p = pmap_l1(pmap, va); 4496 *l1 = l1p; 4497 4498 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4499 return (false); 4500 4501 if ((pmap_load(l1p) & PTE_RX) != 0) { 4502 *l2 = NULL; 4503 *l3 = NULL; 4504 return (true); 4505 } 4506 4507 /* Get l2 directory entry. */ 4508 l2p = pmap_l1_to_l2(l1p, va); 4509 *l2 = l2p; 4510 4511 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4512 return (false); 4513 4514 if ((pmap_load(l2p) & PTE_RX) != 0) { 4515 *l3 = NULL; 4516 return (true); 4517 } 4518 4519 /* Get l3 page table entry. */ 4520 *l3 = pmap_l2_to_l3(l2p, va); 4521 4522 return (true); 4523 } 4524 4525 /* 4526 * Track a range of the kernel's virtual address space that is contiguous 4527 * in various mapping attributes. 4528 */ 4529 struct pmap_kernel_map_range { 4530 vm_offset_t sva; 4531 pt_entry_t attrs; 4532 int l3pages; 4533 int l2pages; 4534 int l1pages; 4535 }; 4536 4537 static void 4538 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4539 vm_offset_t eva) 4540 { 4541 4542 if (eva <= range->sva) 4543 return; 4544 4545 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4546 range->sva, eva, 4547 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4548 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4549 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4550 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4551 range->l1pages, range->l2pages, range->l3pages); 4552 4553 /* Reset to sentinel value. */ 4554 range->sva = 0xfffffffffffffffful; 4555 } 4556 4557 /* 4558 * Determine whether the attributes specified by a page table entry match those 4559 * being tracked by the current range. 
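 * A match extends the current run of pages; a mismatch causes the run
 * to be dumped and a new one started (see sysctl_kmaps_check() below).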
4560 */ 4561 static bool 4562 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4563 { 4564 4565 return (range->attrs == attrs); 4566 } 4567 4568 static void 4569 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4570 pt_entry_t attrs) 4571 { 4572 4573 memset(range, 0, sizeof(*range)); 4574 range->sva = va; 4575 range->attrs = attrs; 4576 } 4577 4578 /* 4579 * Given a leaf PTE, derive the mapping's attributes. If they do not match 4580 * those of the current run, dump the address range and its attributes, and 4581 * begin a new run. 4582 */ 4583 static void 4584 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 4585 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 4586 { 4587 pt_entry_t attrs; 4588 4589 /* The PTE global bit is inherited by lower levels. */ 4590 attrs = l1e & PTE_G; 4591 if ((l1e & PTE_RWX) != 0) 4592 attrs |= l1e & (PTE_RWX | PTE_U); 4593 else if (l2e != 0) 4594 attrs |= l2e & PTE_G; 4595 if ((l2e & PTE_RWX) != 0) 4596 attrs |= l2e & (PTE_RWX | PTE_U); 4597 else if (l3e != 0) 4598 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 4599 4600 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 4601 sysctl_kmaps_dump(sb, range, va); 4602 sysctl_kmaps_reinit(range, va, attrs); 4603 } 4604 } 4605 4606 static int 4607 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 4608 { 4609 struct pmap_kernel_map_range range; 4610 struct sbuf sbuf, *sb; 4611 pd_entry_t l1e, *l2, l2e; 4612 pt_entry_t *l3, l3e; 4613 vm_offset_t sva; 4614 vm_paddr_t pa; 4615 int error, i, j, k; 4616 4617 error = sysctl_wire_old_buffer(req, 0); 4618 if (error != 0) 4619 return (error); 4620 sb = &sbuf; 4621 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 4622 4623 /* Sentinel value. */ 4624 range.sva = 0xfffffffffffffffful; 4625 4626 /* 4627 * Iterate over the kernel page tables without holding the kernel pmap 4628 * lock. Kernel page table pages are never freed, so at worst we will 4629 * observe inconsistencies in the output. 
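 * With Sv39 each level holds Ln_ENTRIES (512) entries: an L1 leaf maps
 * 1GB (L1_SIZE), an L2 leaf 2MB (L2_SIZE) and an L3 entry 4KB (L3_SIZE),
 * which is why the loops below advance "sva" by those amounts.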
4630 */ 4631 sva = VM_MIN_KERNEL_ADDRESS; 4632 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 4633 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 4634 sbuf_printf(sb, "\nDirect map:\n"); 4635 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 4636 sbuf_printf(sb, "\nKernel map:\n"); 4637 4638 l1e = kernel_pmap->pm_l1[i]; 4639 if ((l1e & PTE_V) == 0) { 4640 sysctl_kmaps_dump(sb, &range, sva); 4641 sva += L1_SIZE; 4642 continue; 4643 } 4644 if ((l1e & PTE_RWX) != 0) { 4645 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 4646 range.l1pages++; 4647 sva += L1_SIZE; 4648 continue; 4649 } 4650 pa = PTE_TO_PHYS(l1e); 4651 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4652 4653 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 4654 l2e = l2[j]; 4655 if ((l2e & PTE_V) == 0) { 4656 sysctl_kmaps_dump(sb, &range, sva); 4657 sva += L2_SIZE; 4658 continue; 4659 } 4660 if ((l2e & PTE_RWX) != 0) { 4661 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 4662 range.l2pages++; 4663 sva += L2_SIZE; 4664 continue; 4665 } 4666 pa = PTE_TO_PHYS(l2e); 4667 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4668 4669 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 4670 sva += L3_SIZE) { 4671 l3e = l3[k]; 4672 if ((l3e & PTE_V) == 0) { 4673 sysctl_kmaps_dump(sb, &range, sva); 4674 continue; 4675 } 4676 sysctl_kmaps_check(sb, &range, sva, 4677 l1e, l2e, l3e); 4678 range.l3pages++; 4679 } 4680 } 4681 } 4682 4683 error = sbuf_finish(sb); 4684 sbuf_delete(sb); 4685 return (error); 4686 } 4687 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 4688 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 4689 NULL, 0, sysctl_kmaps, "A", 4690 "Dump kernel address layout"); 4691
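/*
 * For illustration only: output of the handler above, as read with
 * "sysctl vm.pmap.kernel_maps", looks roughly like the following
 * (the addresses and page counts here are made up):
 *
 * Kernel map:
 * 0xffffffc000000000-0xffffffc000e00000 rwxsg 0 7 0
 * 0xffffffc000e00000-0xffffffc001200000 rw-sg 0 2 0
 *
 * The direct map region is reported in the same format under a
 * "Direct map:" heading.
 */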