1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 * 64 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 65 */ 66 /*- 67 * Copyright (c) 2003 Networks Associates Technology, Inc. 68 * All rights reserved. 69 * 70 * This software was developed for the FreeBSD Project by Jake Burkholder, 71 * Safeport Network Services, and Network Associates Laboratories, the 72 * Security Research Division of Network Associates, Inc. under 73 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 74 * CHATS research program. 75 * 76 * Redistribution and use in source and binary forms, with or without 77 * modification, are permitted provided that the following conditions 78 * are met: 79 * 1. Redistributions of source code must retain the above copyright 80 * notice, this list of conditions and the following disclaimer. 81 * 2. Redistributions in binary form must reproduce the above copyright 82 * notice, this list of conditions and the following disclaimer in the 83 * documentation and/or other materials provided with the distribution. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 * SUCH DAMAGE. 96 */ 97 98 #include <sys/cdefs.h> 99 __FBSDID("$FreeBSD$"); 100 101 /* 102 * Manages physical address maps. 103 * 104 * Since the information managed by this module is 105 * also stored by the logical address mapping module, 106 * this module may throw away valid virtual-to-physical 107 * mappings at almost any time. However, invalidations 108 * of virtual-to-physical mappings must be done as 109 * requested. 110 * 111 * In order to cope with hardware architectures which 112 * make virtual-to-physical map invalidates expensive, 113 * this module may delay invalidate or reduced protection 114 * operations until such time as they are actually 115 * necessary. This module is given full information as 116 * to which processors are currently using which maps, 117 * and to when physical maps must be made correct. 118 */ 119 120 #include <sys/param.h> 121 #include <sys/systm.h> 122 #include <sys/bitstring.h> 123 #include <sys/bus.h> 124 #include <sys/cpuset.h> 125 #include <sys/kernel.h> 126 #include <sys/ktr.h> 127 #include <sys/lock.h> 128 #include <sys/malloc.h> 129 #include <sys/mman.h> 130 #include <sys/msgbuf.h> 131 #include <sys/mutex.h> 132 #include <sys/physmem.h> 133 #include <sys/proc.h> 134 #include <sys/rwlock.h> 135 #include <sys/sbuf.h> 136 #include <sys/sx.h> 137 #include <sys/vmem.h> 138 #include <sys/vmmeter.h> 139 #include <sys/sched.h> 140 #include <sys/sysctl.h> 141 #include <sys/smp.h> 142 143 #include <vm/vm.h> 144 #include <vm/vm_param.h> 145 #include <vm/vm_kern.h> 146 #include <vm/vm_page.h> 147 #include <vm/vm_map.h> 148 #include <vm/vm_object.h> 149 #include <vm/vm_extern.h> 150 #include <vm/vm_pageout.h> 151 #include <vm/vm_pager.h> 152 #include <vm/vm_phys.h> 153 #include <vm/vm_radix.h> 154 #include <vm/vm_reserv.h> 155 #include <vm/uma.h> 156 157 #include <machine/machdep.h> 158 #include <machine/md_var.h> 159 #include <machine/pcb.h> 160 #include <machine/sbi.h> 161 162 #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) 163 #define NUL2E (Ln_ENTRIES * NUL1E) 164 165 #if !defined(DIAGNOSTIC) 166 #ifdef __GNUC_GNU_INLINE__ 167 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 168 #else 169 #define PMAP_INLINE extern inline 170 #endif 171 #else 172 #define PMAP_INLINE 173 #endif 174 175 #ifdef PV_STATS 176 #define PV_STAT(x) do { x ; } while (0) 177 #else 178 #define PV_STAT(x) do { } while (0) 179 #endif 180 181 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 182 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 183 184 #define NPV_LIST_LOCKS MAXCPU 185 186 #define PHYS_TO_PV_LIST_LOCK(pa) \ 187 (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) 188 189 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 190 struct rwlock **_lockp = (lockp); \ 191 struct rwlock *_new_lock; \ 192 \ 193 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 194 if (_new_lock != *_lockp) { \ 195 if (*_lockp != NULL) \ 196 rw_wunlock(*_lockp); \ 197 *_lockp = _new_lock; \ 198 rw_wlock(*_lockp); \ 199 } \ 200 } while (0) 201 202 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 203 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 204 205 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 206 struct rwlock **_lockp = (lockp); \ 207 \ 208 if (*_lockp != NULL) { \ 209 rw_wunlock(*_lockp); \ 210 *_lockp = NULL; \ 211 } \ 212 } while (0) 213 214 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 215 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 216 217 /* The list of all the user pmaps */ 218 LIST_HEAD(pmaplist, pmap); 219 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); 220 221 struct pmap kernel_pmap_store; 222 223 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 224 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 225 vm_offset_t kernel_vm_end = 0; 226 227 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 228 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 229 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 230 231 /* This code assumes all L1 DMAP entries will be used */ 232 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); 233 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); 234 235 static struct rwlock_padalign pvh_global_lock; 236 static struct mtx_padalign allpmaps_lock; 237 238 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 239 "VM/pmap parameters"); 240 241 static int superpages_enabled = 1; 242 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 243 CTLFLAG_RDTUN, &superpages_enabled, 0, 244 "Enable support for transparent superpages"); 245 246 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 247 "2MB page mapping counters"); 248 249 static u_long pmap_l2_demotions; 250 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 251 &pmap_l2_demotions, 0, 252 "2MB page demotions"); 253 254 static u_long pmap_l2_mappings; 255 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 256 &pmap_l2_mappings, 0, 257 "2MB page mappings"); 258 259 static u_long pmap_l2_p_failures; 260 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 261 &pmap_l2_p_failures, 0, 262 "2MB page promotion failures"); 263 264 static u_long pmap_l2_promotions; 265 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 266 &pmap_l2_promotions, 0, 267 "2MB page promotions"); 268 269 /* 270 * Data for the pv entry allocation mechanism 271 */ 272 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 273 static struct mtx pv_chunks_mutex; 274 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 275 static struct md_page *pv_table; 276 static struct md_page pv_dummy; 277 278 extern cpuset_t all_harts; 279 280 /* 281 * Internal flags for pmap_enter()'s helper functions. 282 */ 283 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 284 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 285 286 static void free_pv_chunk(struct pv_chunk *pc); 287 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 288 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 289 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 290 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 291 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 292 vm_offset_t va); 293 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); 294 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, 295 vm_offset_t va, struct rwlock **lockp); 296 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 297 u_int flags, vm_page_t m, struct rwlock **lockp); 298 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 299 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 300 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 301 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 302 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 303 vm_page_t m, struct rwlock **lockp); 304 305 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 306 struct rwlock **lockp); 307 308 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 309 struct spglist *free); 310 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 311 312 #define pmap_clear(pte) pmap_store(pte, 0) 313 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) 314 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) 315 #define pmap_load_clear(pte) pmap_load_store(pte, 0) 316 #define pmap_load(pte) atomic_load_64(pte) 317 #define pmap_store(pte, entry) atomic_store_64(pte, entry) 318 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) 319 320 /********************/ 321 /* Inline functions */ 322 /********************/ 323 324 static __inline void 325 pagecopy(void *s, void *d) 326 { 327 328 memcpy(d, s, PAGE_SIZE); 329 } 330 331 static __inline void 332 pagezero(void *p) 333 { 334 335 bzero(p, PAGE_SIZE); 336 } 337 338 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) 339 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) 340 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) 341 342 #define PTE_TO_PHYS(pte) \ 343 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) 344 345 static __inline pd_entry_t * 346 pmap_l1(pmap_t pmap, vm_offset_t va) 347 { 348 349 return (&pmap->pm_l1[pmap_l1_index(va)]); 350 } 351 352 static __inline pd_entry_t * 353 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 354 { 355 vm_paddr_t phys; 356 pd_entry_t *l2; 357 358 phys = PTE_TO_PHYS(pmap_load(l1)); 359 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 360 361 return (&l2[pmap_l2_index(va)]); 362 } 363 364 static __inline pd_entry_t * 365 pmap_l2(pmap_t pmap, vm_offset_t va) 366 { 367 pd_entry_t *l1; 368 369 l1 = pmap_l1(pmap, va); 370 if ((pmap_load(l1) & PTE_V) == 0) 371 return (NULL); 372 if ((pmap_load(l1) & PTE_RX) != 0) 373 return (NULL); 374 375 return (pmap_l1_to_l2(l1, va)); 376 } 377 378 static __inline pt_entry_t * 379 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 380 { 381 vm_paddr_t phys; 382 pt_entry_t *l3; 383 384 phys = PTE_TO_PHYS(pmap_load(l2)); 385 l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); 386 387 return (&l3[pmap_l3_index(va)]); 388 } 389 390 static __inline pt_entry_t * 391 pmap_l3(pmap_t pmap, vm_offset_t va) 392 { 393 pd_entry_t *l2; 394 395 l2 = pmap_l2(pmap, va); 396 if (l2 == NULL) 397 return (NULL); 398 if ((pmap_load(l2) & PTE_V) == 0) 399 return (NULL); 400 if ((pmap_load(l2) & PTE_RX) != 0) 401 return (NULL); 402 403 return (pmap_l2_to_l3(l2, va)); 404 } 405 406 static __inline void 407 pmap_resident_count_inc(pmap_t pmap, int count) 408 { 409 410 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 411 pmap->pm_stats.resident_count += count; 412 } 413 414 static __inline void 415 pmap_resident_count_dec(pmap_t pmap, int count) 416 { 417 418 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 419 KASSERT(pmap->pm_stats.resident_count >= count, 420 ("pmap %p resident count underflow %ld %d", pmap, 421 pmap->pm_stats.resident_count, count)); 422 pmap->pm_stats.resident_count -= count; 423 } 424 425 static void 426 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, 427 pt_entry_t entry) 428 { 429 struct pmap *user_pmap; 430 pd_entry_t *l1; 431 432 /* Distribute new kernel L1 entry to all the user pmaps */ 433 if (pmap != kernel_pmap) 434 return; 435 436 mtx_lock(&allpmaps_lock); 437 LIST_FOREACH(user_pmap, &allpmaps, pm_list) { 438 l1 = &user_pmap->pm_l1[l1index]; 439 pmap_store(l1, entry); 440 } 441 mtx_unlock(&allpmaps_lock); 442 } 443 444 static pt_entry_t * 445 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, 446 u_int *l2_slot) 447 { 448 pt_entry_t *l2; 449 pd_entry_t *l1; 450 451 l1 = (pd_entry_t *)l1pt; 452 *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; 453 454 /* Check locore has used a table L1 map */ 455 KASSERT((l1[*l1_slot] & PTE_RX) == 0, 456 ("Invalid bootstrap L1 table")); 457 458 /* Find the address of the L2 table */ 459 l2 = (pt_entry_t *)init_pt_va; 460 *l2_slot = pmap_l2_index(va); 461 462 return (l2); 463 } 464 465 static vm_paddr_t 466 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 467 { 468 u_int l1_slot, l2_slot; 469 pt_entry_t *l2; 470 vm_paddr_t ret; 471 472 l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); 473 474 /* Check locore has used L2 superpages */ 475 KASSERT((l2[l2_slot] & PTE_RX) != 0, 476 ("Invalid bootstrap L2 table")); 477 478 /* L2 is superpages */ 479 ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT; 480 ret += (va & L2_OFFSET); 481 482 return (ret); 483 } 484 485 static void 486 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) 487 { 488 vm_offset_t va; 489 vm_paddr_t pa; 490 pd_entry_t *l1; 491 u_int l1_slot; 492 pt_entry_t entry; 493 pn_t pn; 494 495 pa = dmap_phys_base = min_pa & ~L1_OFFSET; 496 va = DMAP_MIN_ADDRESS; 497 l1 = (pd_entry_t *)kern_l1; 498 l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); 499 500 for (; va < DMAP_MAX_ADDRESS && pa < max_pa; 501 pa += L1_SIZE, va += L1_SIZE, l1_slot++) { 502 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 503 504 /* superpages */ 505 pn = (pa / PAGE_SIZE); 506 entry = PTE_KERN; 507 entry |= (pn << PTE_PPN0_S); 508 pmap_store(&l1[l1_slot], entry); 509 } 510 511 /* Set the upper limit of the DMAP region */ 512 dmap_phys_max = pa; 513 dmap_max_addr = va; 514 515 sfence_vma(); 516 } 517 518 static vm_offset_t 519 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 520 { 521 vm_offset_t l3pt; 522 pt_entry_t entry; 523 pd_entry_t *l2; 524 vm_paddr_t pa; 525 u_int l2_slot; 526 pn_t pn; 527 528 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 529 530 l2 = pmap_l2(kernel_pmap, va); 531 l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); 532 l2_slot = pmap_l2_index(va); 533 l3pt = l3_start; 534 535 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 536 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 537 538 pa = pmap_early_vtophys(l1pt, l3pt); 539 pn = (pa / PAGE_SIZE); 540 entry = (PTE_V); 541 entry |= (pn << PTE_PPN0_S); 542 pmap_store(&l2[l2_slot], entry); 543 l3pt += PAGE_SIZE; 544 } 545 546 547 /* Clean the L2 page table */ 548 memset((void *)l3_start, 0, l3pt - l3_start); 549 550 return (l3pt); 551 } 552 553 /* 554 * Bootstrap the system enough to run with virtual memory. 555 */ 556 void 557 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) 558 { 559 u_int l1_slot, l2_slot; 560 vm_offset_t freemempos; 561 vm_offset_t dpcpu, msgbufpv; 562 vm_paddr_t max_pa, min_pa, pa; 563 pt_entry_t *l2p; 564 int i; 565 566 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 567 568 /* Set this early so we can use the pagetable walking functions */ 569 kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; 570 PMAP_LOCK_INIT(kernel_pmap); 571 572 rw_init(&pvh_global_lock, "pmap pv global"); 573 574 CPU_FILL(&kernel_pmap->pm_active); 575 576 /* Assume the address we were loaded to is a valid physical address. */ 577 min_pa = max_pa = kernstart; 578 579 physmap_idx = physmem_avail(physmap, nitems(physmap)); 580 physmap_idx /= 2; 581 582 /* 583 * Find the minimum physical address. physmap is sorted, 584 * but may contain empty ranges. 585 */ 586 for (i = 0; i < physmap_idx * 2; i += 2) { 587 if (physmap[i] == physmap[i + 1]) 588 continue; 589 if (physmap[i] <= min_pa) 590 min_pa = physmap[i]; 591 if (physmap[i + 1] > max_pa) 592 max_pa = physmap[i + 1]; 593 } 594 printf("physmap_idx %lx\n", physmap_idx); 595 printf("min_pa %lx\n", min_pa); 596 printf("max_pa %lx\n", max_pa); 597 598 /* Create a direct map region early so we can use it for pa -> va */ 599 pmap_bootstrap_dmap(l1pt, min_pa, max_pa); 600 601 /* 602 * Read the page table to find out what is already mapped. 603 * This assumes we have mapped a block of memory from KERNBASE 604 * using a single L1 entry. 605 */ 606 (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 607 608 /* Sanity check the index, KERNBASE should be the first VA */ 609 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 610 611 freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); 612 613 /* Create the l3 tables for the early devmap */ 614 freemempos = pmap_bootstrap_l3(l1pt, 615 VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); 616 617 /* 618 * Invalidate the mapping we created for the DTB. At this point a copy 619 * has been created, and we no longer need it. We want to avoid the 620 * possibility of an aliased mapping in the future. 621 */ 622 l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); 623 if ((pmap_load(l2p) & PTE_V) != 0) 624 pmap_clear(l2p); 625 626 sfence_vma(); 627 628 #define alloc_pages(var, np) \ 629 (var) = freemempos; \ 630 freemempos += (np * PAGE_SIZE); \ 631 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 632 633 /* Allocate dynamic per-cpu area. */ 634 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 635 dpcpu_init((void *)dpcpu, 0); 636 637 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 638 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 639 msgbufp = (void *)msgbufpv; 640 641 virtual_avail = roundup2(freemempos, L2_SIZE); 642 virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; 643 kernel_vm_end = virtual_avail; 644 645 pa = pmap_early_vtophys(l1pt, freemempos); 646 647 physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); 648 } 649 650 /* 651 * Initialize a vm_page's machine-dependent fields. 652 */ 653 void 654 pmap_page_init(vm_page_t m) 655 { 656 657 TAILQ_INIT(&m->md.pv_list); 658 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 659 } 660 661 /* 662 * Initialize the pmap module. 663 * Called by vm_init, to initialize any structures that the pmap 664 * system needs to map virtual memory. 665 */ 666 void 667 pmap_init(void) 668 { 669 vm_size_t s; 670 int i, pv_npg; 671 672 /* 673 * Initialize the pv chunk and pmap list mutexes. 674 */ 675 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 676 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 677 678 /* 679 * Initialize the pool of pv list locks. 680 */ 681 for (i = 0; i < NPV_LIST_LOCKS; i++) 682 rw_init(&pv_list_locks[i], "pmap pv list"); 683 684 /* 685 * Calculate the size of the pv head table for superpages. 686 */ 687 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 688 689 /* 690 * Allocate memory for the pv head table for superpages. 691 */ 692 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 693 s = round_page(s); 694 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 695 for (i = 0; i < pv_npg; i++) 696 TAILQ_INIT(&pv_table[i].pv_list); 697 TAILQ_INIT(&pv_dummy.pv_list); 698 699 if (superpages_enabled) 700 pagesizes[1] = L2_SIZE; 701 } 702 703 #ifdef SMP 704 /* 705 * For SMP, these functions have to use IPIs for coherence. 706 * 707 * In general, the calling thread uses a plain fence to order the 708 * writes to the page tables before invoking an SBI callback to invoke 709 * sfence_vma() on remote CPUs. 710 */ 711 static void 712 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 713 { 714 cpuset_t mask; 715 716 sched_pin(); 717 mask = pmap->pm_active; 718 CPU_CLR(PCPU_GET(hart), &mask); 719 fence(); 720 if (!CPU_EMPTY(&mask) && smp_started) 721 sbi_remote_sfence_vma(mask.__bits, va, 1); 722 sfence_vma_page(va); 723 sched_unpin(); 724 } 725 726 static void 727 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 728 { 729 cpuset_t mask; 730 731 sched_pin(); 732 mask = pmap->pm_active; 733 CPU_CLR(PCPU_GET(hart), &mask); 734 fence(); 735 if (!CPU_EMPTY(&mask) && smp_started) 736 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); 737 738 /* 739 * Might consider a loop of sfence_vma_page() for a small 740 * number of pages in the future. 741 */ 742 sfence_vma(); 743 sched_unpin(); 744 } 745 746 static void 747 pmap_invalidate_all(pmap_t pmap) 748 { 749 cpuset_t mask; 750 751 sched_pin(); 752 mask = pmap->pm_active; 753 CPU_CLR(PCPU_GET(hart), &mask); 754 755 /* 756 * XXX: The SBI doc doesn't detail how to specify x0 as the 757 * address to perform a global fence. BBL currently treats 758 * all sfence_vma requests as global however. 759 */ 760 fence(); 761 if (!CPU_EMPTY(&mask) && smp_started) 762 sbi_remote_sfence_vma(mask.__bits, 0, 0); 763 sfence_vma(); 764 sched_unpin(); 765 } 766 #else 767 /* 768 * Normal, non-SMP, invalidation functions. 769 * We inline these within pmap.c for speed. 770 */ 771 static __inline void 772 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 773 { 774 775 sfence_vma_page(va); 776 } 777 778 static __inline void 779 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 780 { 781 782 /* 783 * Might consider a loop of sfence_vma_page() for a small 784 * number of pages in the future. 785 */ 786 sfence_vma(); 787 } 788 789 static __inline void 790 pmap_invalidate_all(pmap_t pmap) 791 { 792 793 sfence_vma(); 794 } 795 #endif 796 797 /* 798 * Routine: pmap_extract 799 * Function: 800 * Extract the physical page address associated 801 * with the given map/virtual_address pair. 802 */ 803 vm_paddr_t 804 pmap_extract(pmap_t pmap, vm_offset_t va) 805 { 806 pd_entry_t *l2p, l2; 807 pt_entry_t *l3p, l3; 808 vm_paddr_t pa; 809 810 pa = 0; 811 PMAP_LOCK(pmap); 812 /* 813 * Start with the l2 tabel. We are unable to allocate 814 * pages in the l1 table. 815 */ 816 l2p = pmap_l2(pmap, va); 817 if (l2p != NULL) { 818 l2 = pmap_load(l2p); 819 if ((l2 & PTE_RX) == 0) { 820 l3p = pmap_l2_to_l3(l2p, va); 821 if (l3p != NULL) { 822 l3 = pmap_load(l3p); 823 pa = PTE_TO_PHYS(l3); 824 pa |= (va & L3_OFFSET); 825 } 826 } else { 827 /* L2 is superpages */ 828 pa = (l2 >> PTE_PPN1_S) << L2_SHIFT; 829 pa |= (va & L2_OFFSET); 830 } 831 } 832 PMAP_UNLOCK(pmap); 833 return (pa); 834 } 835 836 /* 837 * Routine: pmap_extract_and_hold 838 * Function: 839 * Atomically extract and hold the physical page 840 * with the given pmap and virtual address pair 841 * if that mapping permits the given protection. 842 */ 843 vm_page_t 844 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 845 { 846 pt_entry_t *l3p, l3; 847 vm_paddr_t phys; 848 vm_page_t m; 849 850 m = NULL; 851 PMAP_LOCK(pmap); 852 l3p = pmap_l3(pmap, va); 853 if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { 854 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { 855 phys = PTE_TO_PHYS(l3); 856 m = PHYS_TO_VM_PAGE(phys); 857 if (!vm_page_wire_mapped(m)) 858 m = NULL; 859 } 860 } 861 PMAP_UNLOCK(pmap); 862 return (m); 863 } 864 865 vm_paddr_t 866 pmap_kextract(vm_offset_t va) 867 { 868 pd_entry_t *l2; 869 pt_entry_t *l3; 870 vm_paddr_t pa; 871 872 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 873 pa = DMAP_TO_PHYS(va); 874 } else { 875 l2 = pmap_l2(kernel_pmap, va); 876 if (l2 == NULL) 877 panic("pmap_kextract: No l2"); 878 if ((pmap_load(l2) & PTE_RX) != 0) { 879 /* superpages */ 880 pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT; 881 pa |= (va & L2_OFFSET); 882 return (pa); 883 } 884 885 l3 = pmap_l2_to_l3(l2, va); 886 if (l3 == NULL) 887 panic("pmap_kextract: No l3..."); 888 pa = PTE_TO_PHYS(pmap_load(l3)); 889 pa |= (va & PAGE_MASK); 890 } 891 return (pa); 892 } 893 894 /*************************************************** 895 * Low level mapping routines..... 896 ***************************************************/ 897 898 void 899 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 900 { 901 pt_entry_t entry; 902 pt_entry_t *l3; 903 vm_offset_t va; 904 pn_t pn; 905 906 KASSERT((pa & L3_OFFSET) == 0, 907 ("pmap_kenter_device: Invalid physical address")); 908 KASSERT((sva & L3_OFFSET) == 0, 909 ("pmap_kenter_device: Invalid virtual address")); 910 KASSERT((size & PAGE_MASK) == 0, 911 ("pmap_kenter_device: Mapping is not page-sized")); 912 913 va = sva; 914 while (size != 0) { 915 l3 = pmap_l3(kernel_pmap, va); 916 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 917 918 pn = (pa / PAGE_SIZE); 919 entry = PTE_KERN; 920 entry |= (pn << PTE_PPN0_S); 921 pmap_store(l3, entry); 922 923 va += PAGE_SIZE; 924 pa += PAGE_SIZE; 925 size -= PAGE_SIZE; 926 } 927 pmap_invalidate_range(kernel_pmap, sva, va); 928 } 929 930 /* 931 * Remove a page from the kernel pagetables. 932 * Note: not SMP coherent. 933 */ 934 PMAP_INLINE void 935 pmap_kremove(vm_offset_t va) 936 { 937 pt_entry_t *l3; 938 939 l3 = pmap_l3(kernel_pmap, va); 940 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 941 942 pmap_clear(l3); 943 sfence_vma(); 944 } 945 946 void 947 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 948 { 949 pt_entry_t *l3; 950 vm_offset_t va; 951 952 KASSERT((sva & L3_OFFSET) == 0, 953 ("pmap_kremove_device: Invalid virtual address")); 954 KASSERT((size & PAGE_MASK) == 0, 955 ("pmap_kremove_device: Mapping is not page-sized")); 956 957 va = sva; 958 while (size != 0) { 959 l3 = pmap_l3(kernel_pmap, va); 960 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 961 pmap_clear(l3); 962 963 va += PAGE_SIZE; 964 size -= PAGE_SIZE; 965 } 966 967 pmap_invalidate_range(kernel_pmap, sva, va); 968 } 969 970 /* 971 * Used to map a range of physical addresses into kernel 972 * virtual address space. 973 * 974 * The value passed in '*virt' is a suggested virtual address for 975 * the mapping. Architectures which can support a direct-mapped 976 * physical to virtual region can return the appropriate address 977 * within that region, leaving '*virt' unchanged. Other 978 * architectures should map the pages starting at '*virt' and 979 * update '*virt' with the first usable address after the mapped 980 * region. 981 */ 982 vm_offset_t 983 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 984 { 985 986 return PHYS_TO_DMAP(start); 987 } 988 989 990 /* 991 * Add a list of wired pages to the kva 992 * this routine is only used for temporary 993 * kernel mappings that do not need to have 994 * page modification or references recorded. 995 * Note that old mappings are simply written 996 * over. The page *must* be wired. 997 * Note: SMP coherent. Uses a ranged shootdown IPI. 998 */ 999 void 1000 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1001 { 1002 pt_entry_t *l3, pa; 1003 vm_offset_t va; 1004 vm_page_t m; 1005 pt_entry_t entry; 1006 pn_t pn; 1007 int i; 1008 1009 va = sva; 1010 for (i = 0; i < count; i++) { 1011 m = ma[i]; 1012 pa = VM_PAGE_TO_PHYS(m); 1013 pn = (pa / PAGE_SIZE); 1014 l3 = pmap_l3(kernel_pmap, va); 1015 1016 entry = PTE_KERN; 1017 entry |= (pn << PTE_PPN0_S); 1018 pmap_store(l3, entry); 1019 1020 va += L3_SIZE; 1021 } 1022 pmap_invalidate_range(kernel_pmap, sva, va); 1023 } 1024 1025 /* 1026 * This routine tears out page mappings from the 1027 * kernel -- it is meant only for temporary mappings. 1028 * Note: SMP coherent. Uses a ranged shootdown IPI. 1029 */ 1030 void 1031 pmap_qremove(vm_offset_t sva, int count) 1032 { 1033 pt_entry_t *l3; 1034 vm_offset_t va; 1035 1036 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1037 1038 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1039 l3 = pmap_l3(kernel_pmap, va); 1040 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1041 pmap_clear(l3); 1042 } 1043 pmap_invalidate_range(kernel_pmap, sva, va); 1044 } 1045 1046 bool 1047 pmap_ps_enabled(pmap_t pmap __unused) 1048 { 1049 1050 return (superpages_enabled); 1051 } 1052 1053 /*************************************************** 1054 * Page table page management routines..... 1055 ***************************************************/ 1056 /* 1057 * Schedule the specified unused page table page to be freed. Specifically, 1058 * add the page to the specified list of pages that will be released to the 1059 * physical memory manager after the TLB has been updated. 1060 */ 1061 static __inline void 1062 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1063 boolean_t set_PG_ZERO) 1064 { 1065 1066 if (set_PG_ZERO) 1067 m->flags |= PG_ZERO; 1068 else 1069 m->flags &= ~PG_ZERO; 1070 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1071 } 1072 1073 /* 1074 * Inserts the specified page table page into the specified pmap's collection 1075 * of idle page table pages. Each of a pmap's page table pages is responsible 1076 * for mapping a distinct range of virtual addresses. The pmap's collection is 1077 * ordered by this virtual address range. 1078 * 1079 * If "promoted" is false, then the page table page "ml3" must be zero filled. 1080 */ 1081 static __inline int 1082 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) 1083 { 1084 1085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1086 ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; 1087 return (vm_radix_insert(&pmap->pm_root, ml3)); 1088 } 1089 1090 /* 1091 * Removes the page table page mapping the specified virtual address from the 1092 * specified pmap's collection of idle page table pages, and returns it. 1093 * Otherwise, returns NULL if there is no page table page corresponding to the 1094 * specified virtual address. 1095 */ 1096 static __inline vm_page_t 1097 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1098 { 1099 1100 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1101 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1102 } 1103 1104 /* 1105 * Decrements a page table page's reference count, which is used to record the 1106 * number of valid page table entries within the page. If the reference count 1107 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1108 * page table page was unmapped and FALSE otherwise. 1109 */ 1110 static inline boolean_t 1111 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1112 { 1113 1114 --m->ref_count; 1115 if (m->ref_count == 0) { 1116 _pmap_unwire_ptp(pmap, va, m, free); 1117 return (TRUE); 1118 } else { 1119 return (FALSE); 1120 } 1121 } 1122 1123 static void 1124 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1125 { 1126 vm_paddr_t phys; 1127 1128 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1129 if (m->pindex >= NUL1E) { 1130 pd_entry_t *l1; 1131 l1 = pmap_l1(pmap, va); 1132 pmap_clear(l1); 1133 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1134 } else { 1135 pd_entry_t *l2; 1136 l2 = pmap_l2(pmap, va); 1137 pmap_clear(l2); 1138 } 1139 pmap_resident_count_dec(pmap, 1); 1140 if (m->pindex < NUL1E) { 1141 pd_entry_t *l1; 1142 vm_page_t pdpg; 1143 1144 l1 = pmap_l1(pmap, va); 1145 phys = PTE_TO_PHYS(pmap_load(l1)); 1146 pdpg = PHYS_TO_VM_PAGE(phys); 1147 pmap_unwire_ptp(pmap, va, pdpg, free); 1148 } 1149 pmap_invalidate_page(pmap, va); 1150 1151 vm_wire_sub(1); 1152 1153 /* 1154 * Put page on a list so that it is released after 1155 * *ALL* TLB shootdown is done 1156 */ 1157 pmap_add_delayed_free_list(m, free, TRUE); 1158 } 1159 1160 /* 1161 * After removing a page table entry, this routine is used to 1162 * conditionally free the page, and manage the reference count. 1163 */ 1164 static int 1165 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1166 struct spglist *free) 1167 { 1168 vm_page_t mpte; 1169 1170 if (va >= VM_MAXUSER_ADDRESS) 1171 return (0); 1172 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1173 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); 1174 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1175 } 1176 1177 void 1178 pmap_pinit0(pmap_t pmap) 1179 { 1180 1181 PMAP_LOCK_INIT(pmap); 1182 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1183 pmap->pm_l1 = kernel_pmap->pm_l1; 1184 pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); 1185 CPU_ZERO(&pmap->pm_active); 1186 pmap_activate_boot(pmap); 1187 } 1188 1189 int 1190 pmap_pinit(pmap_t pmap) 1191 { 1192 vm_paddr_t l1phys; 1193 vm_page_t l1pt; 1194 1195 /* 1196 * allocate the l1 page 1197 */ 1198 while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | 1199 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1200 vm_wait(NULL); 1201 1202 l1phys = VM_PAGE_TO_PHYS(l1pt); 1203 pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); 1204 pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); 1205 1206 if ((l1pt->flags & PG_ZERO) == 0) 1207 pagezero(pmap->pm_l1); 1208 1209 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1210 1211 CPU_ZERO(&pmap->pm_active); 1212 1213 /* Install kernel pagetables */ 1214 memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); 1215 1216 /* Add to the list of all user pmaps */ 1217 mtx_lock(&allpmaps_lock); 1218 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1219 mtx_unlock(&allpmaps_lock); 1220 1221 vm_radix_init(&pmap->pm_root); 1222 1223 return (1); 1224 } 1225 1226 /* 1227 * This routine is called if the desired page table page does not exist. 1228 * 1229 * If page table page allocation fails, this routine may sleep before 1230 * returning NULL. It sleeps only if a lock pointer was given. 1231 * 1232 * Note: If a page allocation fails at page table level two or three, 1233 * one or two pages may be held during the wait, only to be released 1234 * afterwards. This conservative approach is easily argued to avoid 1235 * race conditions. 1236 */ 1237 static vm_page_t 1238 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1239 { 1240 vm_page_t m, /*pdppg, */pdpg; 1241 pt_entry_t entry; 1242 vm_paddr_t phys; 1243 pn_t pn; 1244 1245 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1246 1247 /* 1248 * Allocate a page table page. 1249 */ 1250 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1251 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1252 if (lockp != NULL) { 1253 RELEASE_PV_LIST_LOCK(lockp); 1254 PMAP_UNLOCK(pmap); 1255 rw_runlock(&pvh_global_lock); 1256 vm_wait(NULL); 1257 rw_rlock(&pvh_global_lock); 1258 PMAP_LOCK(pmap); 1259 } 1260 1261 /* 1262 * Indicate the need to retry. While waiting, the page table 1263 * page may have been allocated. 1264 */ 1265 return (NULL); 1266 } 1267 1268 if ((m->flags & PG_ZERO) == 0) 1269 pmap_zero_page(m); 1270 1271 /* 1272 * Map the pagetable page into the process address space, if 1273 * it isn't already there. 1274 */ 1275 1276 if (ptepindex >= NUL1E) { 1277 pd_entry_t *l1; 1278 vm_pindex_t l1index; 1279 1280 l1index = ptepindex - NUL1E; 1281 l1 = &pmap->pm_l1[l1index]; 1282 1283 pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); 1284 entry = (PTE_V); 1285 entry |= (pn << PTE_PPN0_S); 1286 pmap_store(l1, entry); 1287 pmap_distribute_l1(pmap, l1index, entry); 1288 } else { 1289 vm_pindex_t l1index; 1290 pd_entry_t *l1, *l2; 1291 1292 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1293 l1 = &pmap->pm_l1[l1index]; 1294 if (pmap_load(l1) == 0) { 1295 /* recurse for allocating page dir */ 1296 if (_pmap_alloc_l3(pmap, NUL1E + l1index, 1297 lockp) == NULL) { 1298 vm_page_unwire_noq(m); 1299 vm_page_free_zero(m); 1300 return (NULL); 1301 } 1302 } else { 1303 phys = PTE_TO_PHYS(pmap_load(l1)); 1304 pdpg = PHYS_TO_VM_PAGE(phys); 1305 pdpg->ref_count++; 1306 } 1307 1308 phys = PTE_TO_PHYS(pmap_load(l1)); 1309 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1310 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1311 1312 pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); 1313 entry = (PTE_V); 1314 entry |= (pn << PTE_PPN0_S); 1315 pmap_store(l2, entry); 1316 } 1317 1318 pmap_resident_count_inc(pmap, 1); 1319 1320 return (m); 1321 } 1322 1323 static vm_page_t 1324 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1325 { 1326 pd_entry_t *l1; 1327 vm_page_t l2pg; 1328 vm_pindex_t l2pindex; 1329 1330 retry: 1331 l1 = pmap_l1(pmap, va); 1332 if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) { 1333 /* Add a reference to the L2 page. */ 1334 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); 1335 l2pg->ref_count++; 1336 } else { 1337 /* Allocate a L2 page. */ 1338 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1339 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1340 if (l2pg == NULL && lockp != NULL) 1341 goto retry; 1342 } 1343 return (l2pg); 1344 } 1345 1346 static vm_page_t 1347 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1348 { 1349 vm_pindex_t ptepindex; 1350 pd_entry_t *l2; 1351 vm_paddr_t phys; 1352 vm_page_t m; 1353 1354 /* 1355 * Calculate pagetable page index 1356 */ 1357 ptepindex = pmap_l2_pindex(va); 1358 retry: 1359 /* 1360 * Get the page directory entry 1361 */ 1362 l2 = pmap_l2(pmap, va); 1363 1364 /* 1365 * If the page table page is mapped, we just increment the 1366 * hold count, and activate it. 1367 */ 1368 if (l2 != NULL && pmap_load(l2) != 0) { 1369 phys = PTE_TO_PHYS(pmap_load(l2)); 1370 m = PHYS_TO_VM_PAGE(phys); 1371 m->ref_count++; 1372 } else { 1373 /* 1374 * Here if the pte page isn't mapped, or if it has been 1375 * deallocated. 1376 */ 1377 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1378 if (m == NULL && lockp != NULL) 1379 goto retry; 1380 } 1381 return (m); 1382 } 1383 1384 1385 /*************************************************** 1386 * Pmap allocation/deallocation routines. 1387 ***************************************************/ 1388 1389 /* 1390 * Release any resources held by the given physical map. 1391 * Called when a pmap initialized by pmap_pinit is being released. 1392 * Should only be called if the map contains no valid mappings. 1393 */ 1394 void 1395 pmap_release(pmap_t pmap) 1396 { 1397 vm_page_t m; 1398 1399 KASSERT(pmap->pm_stats.resident_count == 0, 1400 ("pmap_release: pmap resident count %ld != 0", 1401 pmap->pm_stats.resident_count)); 1402 KASSERT(CPU_EMPTY(&pmap->pm_active), 1403 ("releasing active pmap %p", pmap)); 1404 1405 mtx_lock(&allpmaps_lock); 1406 LIST_REMOVE(pmap, pm_list); 1407 mtx_unlock(&allpmaps_lock); 1408 1409 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); 1410 vm_page_unwire_noq(m); 1411 vm_page_free(m); 1412 } 1413 1414 static int 1415 kvm_size(SYSCTL_HANDLER_ARGS) 1416 { 1417 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1418 1419 return sysctl_handle_long(oidp, &ksize, 0, req); 1420 } 1421 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1422 0, 0, kvm_size, "LU", 1423 "Size of KVM"); 1424 1425 static int 1426 kvm_free(SYSCTL_HANDLER_ARGS) 1427 { 1428 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1429 1430 return sysctl_handle_long(oidp, &kfree, 0, req); 1431 } 1432 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1433 0, 0, kvm_free, "LU", 1434 "Amount of KVM free"); 1435 1436 /* 1437 * grow the number of kernel page table entries, if needed 1438 */ 1439 void 1440 pmap_growkernel(vm_offset_t addr) 1441 { 1442 vm_paddr_t paddr; 1443 vm_page_t nkpg; 1444 pd_entry_t *l1, *l2; 1445 pt_entry_t entry; 1446 pn_t pn; 1447 1448 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1449 1450 addr = roundup2(addr, L2_SIZE); 1451 if (addr - 1 >= vm_map_max(kernel_map)) 1452 addr = vm_map_max(kernel_map); 1453 while (kernel_vm_end < addr) { 1454 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1455 if (pmap_load(l1) == 0) { 1456 /* We need a new PDP entry */ 1457 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, 1458 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 1459 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1460 if (nkpg == NULL) 1461 panic("pmap_growkernel: no memory to grow kernel"); 1462 if ((nkpg->flags & PG_ZERO) == 0) 1463 pmap_zero_page(nkpg); 1464 paddr = VM_PAGE_TO_PHYS(nkpg); 1465 1466 pn = (paddr / PAGE_SIZE); 1467 entry = (PTE_V); 1468 entry |= (pn << PTE_PPN0_S); 1469 pmap_store(l1, entry); 1470 pmap_distribute_l1(kernel_pmap, 1471 pmap_l1_index(kernel_vm_end), entry); 1472 continue; /* try again */ 1473 } 1474 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1475 if ((pmap_load(l2) & PTE_V) != 0 && 1476 (pmap_load(l2) & PTE_RWX) == 0) { 1477 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1478 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1479 kernel_vm_end = vm_map_max(kernel_map); 1480 break; 1481 } 1482 continue; 1483 } 1484 1485 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, 1486 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1487 VM_ALLOC_ZERO); 1488 if (nkpg == NULL) 1489 panic("pmap_growkernel: no memory to grow kernel"); 1490 if ((nkpg->flags & PG_ZERO) == 0) { 1491 pmap_zero_page(nkpg); 1492 } 1493 paddr = VM_PAGE_TO_PHYS(nkpg); 1494 1495 pn = (paddr / PAGE_SIZE); 1496 entry = (PTE_V); 1497 entry |= (pn << PTE_PPN0_S); 1498 pmap_store(l2, entry); 1499 1500 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1501 1502 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1503 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1504 kernel_vm_end = vm_map_max(kernel_map); 1505 break; 1506 } 1507 } 1508 } 1509 1510 1511 /*************************************************** 1512 * page management routines. 1513 ***************************************************/ 1514 1515 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1516 CTASSERT(_NPCM == 3); 1517 CTASSERT(_NPCPV == 168); 1518 1519 static __inline struct pv_chunk * 1520 pv_to_chunk(pv_entry_t pv) 1521 { 1522 1523 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 1524 } 1525 1526 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1527 1528 #define PC_FREE0 0xfffffffffffffffful 1529 #define PC_FREE1 0xfffffffffffffffful 1530 #define PC_FREE2 0x000000fffffffffful 1531 1532 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 1533 1534 #if 0 1535 #ifdef PV_STATS 1536 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1537 1538 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1539 "Current number of pv entry chunks"); 1540 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1541 "Current number of pv entry chunks allocated"); 1542 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1543 "Current number of pv entry chunks frees"); 1544 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1545 "Number of times tried to get a chunk page but failed."); 1546 1547 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1548 static int pv_entry_spare; 1549 1550 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1551 "Current number of pv entry frees"); 1552 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1553 "Current number of pv entry allocs"); 1554 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1555 "Current number of pv entries"); 1556 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1557 "Current number of spare pv entries"); 1558 #endif 1559 #endif /* 0 */ 1560 1561 /* 1562 * We are in a serious low memory condition. Resort to 1563 * drastic measures to free some pages so we can allocate 1564 * another pv entry chunk. 1565 * 1566 * Returns NULL if PV entries were reclaimed from the specified pmap. 1567 * 1568 * We do not, however, unmap 2mpages because subsequent accesses will 1569 * allocate per-page pv entries until repromotion occurs, thereby 1570 * exacerbating the shortage of free pv entries. 1571 */ 1572 static vm_page_t 1573 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1574 { 1575 1576 panic("RISCVTODO: reclaim_pv_chunk"); 1577 } 1578 1579 /* 1580 * free the pv_entry back to the free list 1581 */ 1582 static void 1583 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1584 { 1585 struct pv_chunk *pc; 1586 int idx, field, bit; 1587 1588 rw_assert(&pvh_global_lock, RA_LOCKED); 1589 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1590 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1591 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1592 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1593 pc = pv_to_chunk(pv); 1594 idx = pv - &pc->pc_pventry[0]; 1595 field = idx / 64; 1596 bit = idx % 64; 1597 pc->pc_map[field] |= 1ul << bit; 1598 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 1599 pc->pc_map[2] != PC_FREE2) { 1600 /* 98% of the time, pc is already at the head of the list. */ 1601 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1602 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1603 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1604 } 1605 return; 1606 } 1607 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1608 free_pv_chunk(pc); 1609 } 1610 1611 static void 1612 free_pv_chunk(struct pv_chunk *pc) 1613 { 1614 vm_page_t m; 1615 1616 mtx_lock(&pv_chunks_mutex); 1617 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1618 mtx_unlock(&pv_chunks_mutex); 1619 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1620 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1621 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1622 /* entire chunk is free, return it */ 1623 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1624 dump_drop_page(m->phys_addr); 1625 vm_page_unwire_noq(m); 1626 vm_page_free(m); 1627 } 1628 1629 /* 1630 * Returns a new PV entry, allocating a new PV chunk from the system when 1631 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1632 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1633 * returned. 1634 * 1635 * The given PV list lock may be released. 1636 */ 1637 static pv_entry_t 1638 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1639 { 1640 int bit, field; 1641 pv_entry_t pv; 1642 struct pv_chunk *pc; 1643 vm_page_t m; 1644 1645 rw_assert(&pvh_global_lock, RA_LOCKED); 1646 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1647 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1648 retry: 1649 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1650 if (pc != NULL) { 1651 for (field = 0; field < _NPCM; field++) { 1652 if (pc->pc_map[field]) { 1653 bit = ffsl(pc->pc_map[field]) - 1; 1654 break; 1655 } 1656 } 1657 if (field < _NPCM) { 1658 pv = &pc->pc_pventry[field * 64 + bit]; 1659 pc->pc_map[field] &= ~(1ul << bit); 1660 /* If this was the last item, move it to tail */ 1661 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 1662 pc->pc_map[2] == 0) { 1663 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1664 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1665 pc_list); 1666 } 1667 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1668 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1669 return (pv); 1670 } 1671 } 1672 /* No free items, allocate another chunk */ 1673 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1674 VM_ALLOC_WIRED); 1675 if (m == NULL) { 1676 if (lockp == NULL) { 1677 PV_STAT(pc_chunk_tryfail++); 1678 return (NULL); 1679 } 1680 m = reclaim_pv_chunk(pmap, lockp); 1681 if (m == NULL) 1682 goto retry; 1683 } 1684 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1685 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1686 dump_add_page(m->phys_addr); 1687 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1688 pc->pc_pmap = pmap; 1689 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1690 pc->pc_map[1] = PC_FREE1; 1691 pc->pc_map[2] = PC_FREE2; 1692 mtx_lock(&pv_chunks_mutex); 1693 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1694 mtx_unlock(&pv_chunks_mutex); 1695 pv = &pc->pc_pventry[0]; 1696 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1697 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1698 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1699 return (pv); 1700 } 1701 1702 /* 1703 * Ensure that the number of spare PV entries in the specified pmap meets or 1704 * exceeds the given count, "needed". 1705 * 1706 * The given PV list lock may be released. 1707 */ 1708 static void 1709 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1710 { 1711 struct pch new_tail; 1712 struct pv_chunk *pc; 1713 vm_page_t m; 1714 int avail, free; 1715 bool reclaimed; 1716 1717 rw_assert(&pvh_global_lock, RA_LOCKED); 1718 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1719 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1720 1721 /* 1722 * Newly allocated PV chunks must be stored in a private list until 1723 * the required number of PV chunks have been allocated. Otherwise, 1724 * reclaim_pv_chunk() could recycle one of these chunks. In 1725 * contrast, these chunks must be added to the pmap upon allocation. 1726 */ 1727 TAILQ_INIT(&new_tail); 1728 retry: 1729 avail = 0; 1730 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1731 bit_count((bitstr_t *)pc->pc_map, 0, 1732 sizeof(pc->pc_map) * NBBY, &free); 1733 if (free == 0) 1734 break; 1735 avail += free; 1736 if (avail >= needed) 1737 break; 1738 } 1739 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1740 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1741 VM_ALLOC_WIRED); 1742 if (m == NULL) { 1743 m = reclaim_pv_chunk(pmap, lockp); 1744 if (m == NULL) 1745 goto retry; 1746 reclaimed = true; 1747 } 1748 /* XXX PV STATS */ 1749 #if 0 1750 dump_add_page(m->phys_addr); 1751 #endif 1752 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1753 pc->pc_pmap = pmap; 1754 pc->pc_map[0] = PC_FREE0; 1755 pc->pc_map[1] = PC_FREE1; 1756 pc->pc_map[2] = PC_FREE2; 1757 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1758 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1759 1760 /* 1761 * The reclaim might have freed a chunk from the current pmap. 1762 * If that chunk contained available entries, we need to 1763 * re-count the number of available entries. 1764 */ 1765 if (reclaimed) 1766 goto retry; 1767 } 1768 if (!TAILQ_EMPTY(&new_tail)) { 1769 mtx_lock(&pv_chunks_mutex); 1770 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1771 mtx_unlock(&pv_chunks_mutex); 1772 } 1773 } 1774 1775 /* 1776 * First find and then remove the pv entry for the specified pmap and virtual 1777 * address from the specified pv list. Returns the pv entry if found and NULL 1778 * otherwise. This operation can be performed on pv lists for either 4KB or 1779 * 2MB page mappings. 1780 */ 1781 static __inline pv_entry_t 1782 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1783 { 1784 pv_entry_t pv; 1785 1786 rw_assert(&pvh_global_lock, RA_LOCKED); 1787 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1788 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1789 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1790 pvh->pv_gen++; 1791 break; 1792 } 1793 } 1794 return (pv); 1795 } 1796 1797 /* 1798 * First find and then destroy the pv entry for the specified pmap and virtual 1799 * address. This operation can be performed on pv lists for either 4KB or 2MB 1800 * page mappings. 1801 */ 1802 static void 1803 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1804 { 1805 pv_entry_t pv; 1806 1807 pv = pmap_pvh_remove(pvh, pmap, va); 1808 1809 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 1810 free_pv_entry(pmap, pv); 1811 } 1812 1813 /* 1814 * Conditionally create the PV entry for a 4KB page mapping if the required 1815 * memory can be allocated without resorting to reclamation. 1816 */ 1817 static boolean_t 1818 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1819 struct rwlock **lockp) 1820 { 1821 pv_entry_t pv; 1822 1823 rw_assert(&pvh_global_lock, RA_LOCKED); 1824 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1825 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1826 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 1827 pv->pv_va = va; 1828 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1829 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1830 m->md.pv_gen++; 1831 return (TRUE); 1832 } else 1833 return (FALSE); 1834 } 1835 1836 /* 1837 * After demotion from a 2MB page mapping to 512 4KB page mappings, 1838 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 1839 * entries for each of the 4KB page mappings. 1840 */ 1841 static void __unused 1842 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1843 struct rwlock **lockp) 1844 { 1845 struct md_page *pvh; 1846 struct pv_chunk *pc; 1847 pv_entry_t pv; 1848 vm_page_t m; 1849 vm_offset_t va_last; 1850 int bit, field; 1851 1852 rw_assert(&pvh_global_lock, RA_LOCKED); 1853 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1854 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1855 1856 /* 1857 * Transfer the 2mpage's pv entry for this mapping to the first 1858 * page's pv list. Once this transfer begins, the pv list lock 1859 * must not be released until the last pv entry is reinstantiated. 1860 */ 1861 pvh = pa_to_pvh(pa); 1862 va &= ~L2_OFFSET; 1863 pv = pmap_pvh_remove(pvh, pmap, va); 1864 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 1865 m = PHYS_TO_VM_PAGE(pa); 1866 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1867 m->md.pv_gen++; 1868 /* Instantiate the remaining 511 pv entries. */ 1869 va_last = va + L2_SIZE - PAGE_SIZE; 1870 for (;;) { 1871 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1872 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 1873 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 1874 for (field = 0; field < _NPCM; field++) { 1875 while (pc->pc_map[field] != 0) { 1876 bit = ffsl(pc->pc_map[field]) - 1; 1877 pc->pc_map[field] &= ~(1ul << bit); 1878 pv = &pc->pc_pventry[field * 64 + bit]; 1879 va += PAGE_SIZE; 1880 pv->pv_va = va; 1881 m++; 1882 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1883 ("pmap_pv_demote_l2: page %p is not managed", m)); 1884 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1885 m->md.pv_gen++; 1886 if (va == va_last) 1887 goto out; 1888 } 1889 } 1890 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1891 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1892 } 1893 out: 1894 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 1895 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1896 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1897 } 1898 /* XXX PV stats */ 1899 } 1900 1901 #if VM_NRESERVLEVEL > 0 1902 static void 1903 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1904 struct rwlock **lockp) 1905 { 1906 struct md_page *pvh; 1907 pv_entry_t pv; 1908 vm_page_t m; 1909 vm_offset_t va_last; 1910 1911 rw_assert(&pvh_global_lock, RA_LOCKED); 1912 KASSERT((va & L2_OFFSET) == 0, 1913 ("pmap_pv_promote_l2: misaligned va %#lx", va)); 1914 1915 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1916 1917 m = PHYS_TO_VM_PAGE(pa); 1918 pv = pmap_pvh_remove(&m->md, pmap, va); 1919 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 1920 pvh = pa_to_pvh(pa); 1921 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1922 pvh->pv_gen++; 1923 1924 va_last = va + L2_SIZE - PAGE_SIZE; 1925 do { 1926 m++; 1927 va += PAGE_SIZE; 1928 pmap_pvh_free(&m->md, pmap, va); 1929 } while (va < va_last); 1930 } 1931 #endif /* VM_NRESERVLEVEL > 0 */ 1932 1933 /* 1934 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1935 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1936 * false if the PV entry cannot be allocated without resorting to reclamation. 1937 */ 1938 static bool 1939 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1940 struct rwlock **lockp) 1941 { 1942 struct md_page *pvh; 1943 pv_entry_t pv; 1944 vm_paddr_t pa; 1945 1946 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1947 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1948 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1949 NULL : lockp)) == NULL) 1950 return (false); 1951 pv->pv_va = va; 1952 pa = PTE_TO_PHYS(l2e); 1953 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1954 pvh = pa_to_pvh(pa); 1955 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1956 pvh->pv_gen++; 1957 return (true); 1958 } 1959 1960 static void 1961 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1962 { 1963 pt_entry_t newl2, oldl2; 1964 vm_page_t ml3; 1965 vm_paddr_t ml3pa; 1966 1967 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1968 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1969 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1970 1971 ml3 = pmap_remove_pt_page(pmap, va); 1972 if (ml3 == NULL) 1973 panic("pmap_remove_kernel_l2: Missing pt page"); 1974 1975 ml3pa = VM_PAGE_TO_PHYS(ml3); 1976 newl2 = ml3pa | PTE_V; 1977 1978 /* 1979 * If this page table page was unmapped by a promotion, then it 1980 * contains valid mappings. Zero it to invalidate those mappings. 1981 */ 1982 if (ml3->valid != 0) 1983 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 1984 1985 /* 1986 * Demote the mapping. 1987 */ 1988 oldl2 = pmap_load_store(l2, newl2); 1989 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 1990 __func__, l2, oldl2)); 1991 } 1992 1993 /* 1994 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 1995 */ 1996 static int 1997 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 1998 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 1999 { 2000 struct md_page *pvh; 2001 pt_entry_t oldl2; 2002 vm_offset_t eva, va; 2003 vm_page_t m, ml3; 2004 2005 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2006 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2007 oldl2 = pmap_load_clear(l2); 2008 KASSERT((oldl2 & PTE_RWX) != 0, 2009 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2010 2011 /* 2012 * The sfence.vma documentation states that it is sufficient to specify 2013 * a single address within a superpage mapping. However, since we do 2014 * not perform any invalidation upon promotion, TLBs may still be 2015 * caching 4KB mappings within the superpage, so we must invalidate the 2016 * entire range. 2017 */ 2018 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2019 if ((oldl2 & PTE_SW_WIRED) != 0) 2020 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2021 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2022 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2023 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2024 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2025 pmap_pvh_free(pvh, pmap, sva); 2026 eva = sva + L2_SIZE; 2027 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2028 va < eva; va += PAGE_SIZE, m++) { 2029 if ((oldl2 & PTE_D) != 0) 2030 vm_page_dirty(m); 2031 if ((oldl2 & PTE_A) != 0) 2032 vm_page_aflag_set(m, PGA_REFERENCED); 2033 if (TAILQ_EMPTY(&m->md.pv_list) && 2034 TAILQ_EMPTY(&pvh->pv_list)) 2035 vm_page_aflag_clear(m, PGA_WRITEABLE); 2036 } 2037 } 2038 if (pmap == kernel_pmap) { 2039 pmap_remove_kernel_l2(pmap, l2, sva); 2040 } else { 2041 ml3 = pmap_remove_pt_page(pmap, sva); 2042 if (ml3 != NULL) { 2043 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2044 ("pmap_remove_l2: l3 page not promoted")); 2045 pmap_resident_count_dec(pmap, 1); 2046 KASSERT(ml3->ref_count == Ln_ENTRIES, 2047 ("pmap_remove_l2: l3 page ref count error")); 2048 ml3->ref_count = 1; 2049 vm_page_unwire_noq(ml3); 2050 pmap_add_delayed_free_list(ml3, free, FALSE); 2051 } 2052 } 2053 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2054 } 2055 2056 /* 2057 * pmap_remove_l3: do the things to unmap a page in a process 2058 */ 2059 static int 2060 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2061 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2062 { 2063 struct md_page *pvh; 2064 pt_entry_t old_l3; 2065 vm_paddr_t phys; 2066 vm_page_t m; 2067 2068 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2069 old_l3 = pmap_load_clear(l3); 2070 pmap_invalidate_page(pmap, va); 2071 if (old_l3 & PTE_SW_WIRED) 2072 pmap->pm_stats.wired_count -= 1; 2073 pmap_resident_count_dec(pmap, 1); 2074 if (old_l3 & PTE_SW_MANAGED) { 2075 phys = PTE_TO_PHYS(old_l3); 2076 m = PHYS_TO_VM_PAGE(phys); 2077 if ((old_l3 & PTE_D) != 0) 2078 vm_page_dirty(m); 2079 if (old_l3 & PTE_A) 2080 vm_page_aflag_set(m, PGA_REFERENCED); 2081 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2082 pmap_pvh_free(&m->md, pmap, va); 2083 if (TAILQ_EMPTY(&m->md.pv_list) && 2084 (m->flags & PG_FICTITIOUS) == 0) { 2085 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2086 if (TAILQ_EMPTY(&pvh->pv_list)) 2087 vm_page_aflag_clear(m, PGA_WRITEABLE); 2088 } 2089 } 2090 2091 return (pmap_unuse_pt(pmap, va, l2e, free)); 2092 } 2093 2094 /* 2095 * Remove the given range of addresses from the specified map. 2096 * 2097 * It is assumed that the start and end are properly 2098 * rounded to the page size. 2099 */ 2100 void 2101 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2102 { 2103 struct spglist free; 2104 struct rwlock *lock; 2105 vm_offset_t va, va_next; 2106 pd_entry_t *l1, *l2, l2e; 2107 pt_entry_t *l3; 2108 2109 /* 2110 * Perform an unsynchronized read. This is, however, safe. 2111 */ 2112 if (pmap->pm_stats.resident_count == 0) 2113 return; 2114 2115 SLIST_INIT(&free); 2116 2117 rw_rlock(&pvh_global_lock); 2118 PMAP_LOCK(pmap); 2119 2120 lock = NULL; 2121 for (; sva < eva; sva = va_next) { 2122 if (pmap->pm_stats.resident_count == 0) 2123 break; 2124 2125 l1 = pmap_l1(pmap, sva); 2126 if (pmap_load(l1) == 0) { 2127 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2128 if (va_next < sva) 2129 va_next = eva; 2130 continue; 2131 } 2132 2133 /* 2134 * Calculate index for next page table. 2135 */ 2136 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2137 if (va_next < sva) 2138 va_next = eva; 2139 2140 l2 = pmap_l1_to_l2(l1, sva); 2141 if (l2 == NULL) 2142 continue; 2143 if ((l2e = pmap_load(l2)) == 0) 2144 continue; 2145 if ((l2e & PTE_RWX) != 0) { 2146 if (sva + L2_SIZE == va_next && eva >= va_next) { 2147 (void)pmap_remove_l2(pmap, l2, sva, 2148 pmap_load(l1), &free, &lock); 2149 continue; 2150 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2151 &lock)) { 2152 /* 2153 * The large page mapping was destroyed. 2154 */ 2155 continue; 2156 } 2157 l2e = pmap_load(l2); 2158 } 2159 2160 /* 2161 * Limit our scan to either the end of the va represented 2162 * by the current page table page, or to the end of the 2163 * range being removed. 2164 */ 2165 if (va_next > eva) 2166 va_next = eva; 2167 2168 va = va_next; 2169 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2170 sva += L3_SIZE) { 2171 if (pmap_load(l3) == 0) { 2172 if (va != va_next) { 2173 pmap_invalidate_range(pmap, va, sva); 2174 va = va_next; 2175 } 2176 continue; 2177 } 2178 if (va == va_next) 2179 va = sva; 2180 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2181 sva += L3_SIZE; 2182 break; 2183 } 2184 } 2185 if (va != va_next) 2186 pmap_invalidate_range(pmap, va, sva); 2187 } 2188 if (lock != NULL) 2189 rw_wunlock(lock); 2190 rw_runlock(&pvh_global_lock); 2191 PMAP_UNLOCK(pmap); 2192 vm_page_free_pages_toq(&free, false); 2193 } 2194 2195 /* 2196 * Routine: pmap_remove_all 2197 * Function: 2198 * Removes this physical page from 2199 * all physical maps in which it resides. 2200 * Reflects back modify bits to the pager. 2201 * 2202 * Notes: 2203 * Original versions of this routine were very 2204 * inefficient because they iteratively called 2205 * pmap_remove (slow...) 2206 */ 2207 2208 void 2209 pmap_remove_all(vm_page_t m) 2210 { 2211 struct spglist free; 2212 struct md_page *pvh; 2213 pmap_t pmap; 2214 pt_entry_t *l3, l3e; 2215 pd_entry_t *l2, l2e; 2216 pv_entry_t pv; 2217 vm_offset_t va; 2218 2219 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2220 ("pmap_remove_all: page %p is not managed", m)); 2221 SLIST_INIT(&free); 2222 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2223 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2224 2225 rw_wlock(&pvh_global_lock); 2226 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2227 pmap = PV_PMAP(pv); 2228 PMAP_LOCK(pmap); 2229 va = pv->pv_va; 2230 l2 = pmap_l2(pmap, va); 2231 (void)pmap_demote_l2(pmap, l2, va); 2232 PMAP_UNLOCK(pmap); 2233 } 2234 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2235 pmap = PV_PMAP(pv); 2236 PMAP_LOCK(pmap); 2237 pmap_resident_count_dec(pmap, 1); 2238 l2 = pmap_l2(pmap, pv->pv_va); 2239 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2240 l2e = pmap_load(l2); 2241 2242 KASSERT((l2e & PTE_RX) == 0, 2243 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2244 2245 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2246 l3e = pmap_load_clear(l3); 2247 pmap_invalidate_page(pmap, pv->pv_va); 2248 if (l3e & PTE_SW_WIRED) 2249 pmap->pm_stats.wired_count--; 2250 if ((l3e & PTE_A) != 0) 2251 vm_page_aflag_set(m, PGA_REFERENCED); 2252 2253 /* 2254 * Update the vm_page_t clean and reference bits. 2255 */ 2256 if ((l3e & PTE_D) != 0) 2257 vm_page_dirty(m); 2258 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2259 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2260 m->md.pv_gen++; 2261 free_pv_entry(pmap, pv); 2262 PMAP_UNLOCK(pmap); 2263 } 2264 vm_page_aflag_clear(m, PGA_WRITEABLE); 2265 rw_wunlock(&pvh_global_lock); 2266 vm_page_free_pages_toq(&free, false); 2267 } 2268 2269 /* 2270 * Set the physical protection on the 2271 * specified range of this map as requested. 2272 */ 2273 void 2274 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2275 { 2276 pd_entry_t *l1, *l2, l2e; 2277 pt_entry_t *l3, l3e, mask; 2278 vm_page_t m, mt; 2279 vm_paddr_t pa; 2280 vm_offset_t va_next; 2281 bool anychanged, pv_lists_locked; 2282 2283 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2284 pmap_remove(pmap, sva, eva); 2285 return; 2286 } 2287 2288 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2289 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2290 return; 2291 2292 anychanged = false; 2293 pv_lists_locked = false; 2294 mask = 0; 2295 if ((prot & VM_PROT_WRITE) == 0) 2296 mask |= PTE_W | PTE_D; 2297 if ((prot & VM_PROT_EXECUTE) == 0) 2298 mask |= PTE_X; 2299 resume: 2300 PMAP_LOCK(pmap); 2301 for (; sva < eva; sva = va_next) { 2302 l1 = pmap_l1(pmap, sva); 2303 if (pmap_load(l1) == 0) { 2304 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2305 if (va_next < sva) 2306 va_next = eva; 2307 continue; 2308 } 2309 2310 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2311 if (va_next < sva) 2312 va_next = eva; 2313 2314 l2 = pmap_l1_to_l2(l1, sva); 2315 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2316 continue; 2317 if ((l2e & PTE_RWX) != 0) { 2318 if (sva + L2_SIZE == va_next && eva >= va_next) { 2319 retryl2: 2320 if ((prot & VM_PROT_WRITE) == 0 && 2321 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2322 (PTE_SW_MANAGED | PTE_D)) { 2323 pa = PTE_TO_PHYS(l2e); 2324 m = PHYS_TO_VM_PAGE(pa); 2325 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2326 vm_page_dirty(mt); 2327 } 2328 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2329 goto retryl2; 2330 anychanged = true; 2331 continue; 2332 } else { 2333 if (!pv_lists_locked) { 2334 pv_lists_locked = true; 2335 if (!rw_try_rlock(&pvh_global_lock)) { 2336 if (anychanged) 2337 pmap_invalidate_all( 2338 pmap); 2339 PMAP_UNLOCK(pmap); 2340 rw_rlock(&pvh_global_lock); 2341 goto resume; 2342 } 2343 } 2344 if (!pmap_demote_l2(pmap, l2, sva)) { 2345 /* 2346 * The large page mapping was destroyed. 2347 */ 2348 continue; 2349 } 2350 } 2351 } 2352 2353 if (va_next > eva) 2354 va_next = eva; 2355 2356 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2357 sva += L3_SIZE) { 2358 l3e = pmap_load(l3); 2359 retryl3: 2360 if ((l3e & PTE_V) == 0) 2361 continue; 2362 if ((prot & VM_PROT_WRITE) == 0 && 2363 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2364 (PTE_SW_MANAGED | PTE_D)) { 2365 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2366 vm_page_dirty(m); 2367 } 2368 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2369 goto retryl3; 2370 anychanged = true; 2371 } 2372 } 2373 if (anychanged) 2374 pmap_invalidate_all(pmap); 2375 if (pv_lists_locked) 2376 rw_runlock(&pvh_global_lock); 2377 PMAP_UNLOCK(pmap); 2378 } 2379 2380 int 2381 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2382 { 2383 pd_entry_t *l2, l2e; 2384 pt_entry_t bits, *pte, oldpte; 2385 int rv; 2386 2387 rv = 0; 2388 PMAP_LOCK(pmap); 2389 l2 = pmap_l2(pmap, va); 2390 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2391 goto done; 2392 if ((l2e & PTE_RWX) == 0) { 2393 pte = pmap_l2_to_l3(l2, va); 2394 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2395 goto done; 2396 } else { 2397 pte = l2; 2398 oldpte = l2e; 2399 } 2400 2401 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2402 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2403 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2404 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2405 goto done; 2406 2407 bits = PTE_A; 2408 if (ftype == VM_PROT_WRITE) 2409 bits |= PTE_D; 2410 2411 /* 2412 * Spurious faults can occur if the implementation caches invalid 2413 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2414 * race with each other. 2415 */ 2416 if ((oldpte & bits) != bits) 2417 pmap_store_bits(pte, bits); 2418 sfence_vma(); 2419 rv = 1; 2420 done: 2421 PMAP_UNLOCK(pmap); 2422 return (rv); 2423 } 2424 2425 static bool 2426 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2427 { 2428 struct rwlock *lock; 2429 bool rv; 2430 2431 lock = NULL; 2432 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2433 if (lock != NULL) 2434 rw_wunlock(lock); 2435 return (rv); 2436 } 2437 2438 /* 2439 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2440 * mapping is invalidated. 2441 */ 2442 static bool 2443 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2444 struct rwlock **lockp) 2445 { 2446 struct spglist free; 2447 vm_page_t mpte; 2448 pd_entry_t newl2, oldl2; 2449 pt_entry_t *firstl3, newl3; 2450 vm_paddr_t mptepa; 2451 int i; 2452 2453 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2454 2455 oldl2 = pmap_load(l2); 2456 KASSERT((oldl2 & PTE_RWX) != 0, 2457 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2458 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2459 NULL) { 2460 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, 2461 pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 2462 VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == 2463 NULL) { 2464 SLIST_INIT(&free); 2465 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2466 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2467 vm_page_free_pages_toq(&free, true); 2468 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2469 "failure for va %#lx in pmap %p", va, pmap); 2470 return (false); 2471 } 2472 if (va < VM_MAXUSER_ADDRESS) { 2473 mpte->ref_count = Ln_ENTRIES; 2474 pmap_resident_count_inc(pmap, 1); 2475 } 2476 } 2477 mptepa = VM_PAGE_TO_PHYS(mpte); 2478 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2479 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2480 KASSERT((oldl2 & PTE_A) != 0, 2481 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2482 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2483 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2484 newl3 = oldl2; 2485 2486 /* 2487 * If the page table page is not leftover from an earlier promotion, 2488 * initialize it. 2489 */ 2490 if (mpte->valid == 0) { 2491 for (i = 0; i < Ln_ENTRIES; i++) 2492 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2493 } 2494 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2495 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2496 "addresses")); 2497 2498 /* 2499 * If the mapping has changed attributes, update the page table 2500 * entries. 2501 */ 2502 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2503 for (i = 0; i < Ln_ENTRIES; i++) 2504 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2505 2506 /* 2507 * The spare PV entries must be reserved prior to demoting the 2508 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2509 * state of the L2 entry and the PV lists will be inconsistent, which 2510 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2511 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2512 * expected PV entry for the 2MB page mapping that is being demoted. 2513 */ 2514 if ((oldl2 & PTE_SW_MANAGED) != 0) 2515 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2516 2517 /* 2518 * Demote the mapping. 2519 */ 2520 pmap_store(l2, newl2); 2521 2522 /* 2523 * Demote the PV entry. 2524 */ 2525 if ((oldl2 & PTE_SW_MANAGED) != 0) 2526 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2527 2528 atomic_add_long(&pmap_l2_demotions, 1); 2529 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2530 va, pmap); 2531 return (true); 2532 } 2533 2534 #if VM_NRESERVLEVEL > 0 2535 static void 2536 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2537 struct rwlock **lockp) 2538 { 2539 pt_entry_t *firstl3, *l3; 2540 vm_paddr_t pa; 2541 vm_page_t ml3; 2542 2543 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2544 2545 va &= ~L2_OFFSET; 2546 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2547 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2548 2549 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2550 pa = PTE_TO_PHYS(pmap_load(firstl3)); 2551 if ((pa & L2_OFFSET) != 0) { 2552 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2553 va, pmap); 2554 atomic_add_long(&pmap_l2_p_failures, 1); 2555 return; 2556 } 2557 2558 pa += PAGE_SIZE; 2559 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2560 if (PTE_TO_PHYS(pmap_load(l3)) != pa) { 2561 CTR2(KTR_PMAP, 2562 "pmap_promote_l2: failure for va %#lx pmap %p", 2563 va, pmap); 2564 atomic_add_long(&pmap_l2_p_failures, 1); 2565 return; 2566 } 2567 if ((pmap_load(l3) & PTE_PROMOTE) != 2568 (pmap_load(firstl3) & PTE_PROMOTE)) { 2569 CTR2(KTR_PMAP, 2570 "pmap_promote_l2: failure for va %#lx pmap %p", 2571 va, pmap); 2572 atomic_add_long(&pmap_l2_p_failures, 1); 2573 return; 2574 } 2575 pa += PAGE_SIZE; 2576 } 2577 2578 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2579 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2580 ("pmap_promote_l2: page table page's pindex is wrong")); 2581 if (pmap_insert_pt_page(pmap, ml3, true)) { 2582 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2583 va, pmap); 2584 atomic_add_long(&pmap_l2_p_failures, 1); 2585 return; 2586 } 2587 2588 if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) 2589 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), 2590 lockp); 2591 2592 pmap_store(l2, pmap_load(firstl3)); 2593 2594 atomic_add_long(&pmap_l2_promotions, 1); 2595 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2596 pmap); 2597 } 2598 #endif 2599 2600 /* 2601 * Insert the given physical page (p) at 2602 * the specified virtual address (v) in the 2603 * target physical map with the protection requested. 2604 * 2605 * If specified, the page will be wired down, meaning 2606 * that the related pte can not be reclaimed. 2607 * 2608 * NB: This is the only routine which MAY NOT lazy-evaluate 2609 * or lose information. That is, this routine must actually 2610 * insert this page into the given map NOW. 2611 */ 2612 int 2613 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2614 u_int flags, int8_t psind) 2615 { 2616 struct rwlock *lock; 2617 pd_entry_t *l1, *l2, l2e; 2618 pt_entry_t new_l3, orig_l3; 2619 pt_entry_t *l3; 2620 pv_entry_t pv; 2621 vm_paddr_t opa, pa, l2_pa, l3_pa; 2622 vm_page_t mpte, om, l2_m, l3_m; 2623 pt_entry_t entry; 2624 pn_t l2_pn, l3_pn, pn; 2625 int rv; 2626 bool nosleep; 2627 2628 va = trunc_page(va); 2629 if ((m->oflags & VPO_UNMANAGED) == 0) 2630 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2631 pa = VM_PAGE_TO_PHYS(m); 2632 pn = (pa / PAGE_SIZE); 2633 2634 new_l3 = PTE_V | PTE_R | PTE_A; 2635 if (prot & VM_PROT_EXECUTE) 2636 new_l3 |= PTE_X; 2637 if (flags & VM_PROT_WRITE) 2638 new_l3 |= PTE_D; 2639 if (prot & VM_PROT_WRITE) 2640 new_l3 |= PTE_W; 2641 if (va < VM_MAX_USER_ADDRESS) 2642 new_l3 |= PTE_U; 2643 2644 new_l3 |= (pn << PTE_PPN0_S); 2645 if ((flags & PMAP_ENTER_WIRED) != 0) 2646 new_l3 |= PTE_SW_WIRED; 2647 2648 /* 2649 * Set modified bit gratuitously for writeable mappings if 2650 * the page is unmanaged. We do not want to take a fault 2651 * to do the dirty bit accounting for these mappings. 2652 */ 2653 if ((m->oflags & VPO_UNMANAGED) != 0) { 2654 if (prot & VM_PROT_WRITE) 2655 new_l3 |= PTE_D; 2656 } else 2657 new_l3 |= PTE_SW_MANAGED; 2658 2659 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2660 2661 lock = NULL; 2662 mpte = NULL; 2663 rw_rlock(&pvh_global_lock); 2664 PMAP_LOCK(pmap); 2665 if (psind == 1) { 2666 /* Assert the required virtual and physical alignment. */ 2667 KASSERT((va & L2_OFFSET) == 0, 2668 ("pmap_enter: va %#lx unaligned", va)); 2669 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2670 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2671 goto out; 2672 } 2673 2674 l2 = pmap_l2(pmap, va); 2675 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2676 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2677 va, &lock))) { 2678 l3 = pmap_l2_to_l3(l2, va); 2679 if (va < VM_MAXUSER_ADDRESS) { 2680 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2681 mpte->ref_count++; 2682 } 2683 } else if (va < VM_MAXUSER_ADDRESS) { 2684 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2685 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 2686 if (mpte == NULL && nosleep) { 2687 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2688 if (lock != NULL) 2689 rw_wunlock(lock); 2690 rw_runlock(&pvh_global_lock); 2691 PMAP_UNLOCK(pmap); 2692 return (KERN_RESOURCE_SHORTAGE); 2693 } 2694 l3 = pmap_l3(pmap, va); 2695 } else { 2696 l3 = pmap_l3(pmap, va); 2697 /* TODO: This is not optimal, but should mostly work */ 2698 if (l3 == NULL) { 2699 if (l2 == NULL) { 2700 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2701 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2702 VM_ALLOC_ZERO); 2703 if (l2_m == NULL) 2704 panic("pmap_enter: l2 pte_m == NULL"); 2705 if ((l2_m->flags & PG_ZERO) == 0) 2706 pmap_zero_page(l2_m); 2707 2708 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2709 l2_pn = (l2_pa / PAGE_SIZE); 2710 2711 l1 = pmap_l1(pmap, va); 2712 entry = (PTE_V); 2713 entry |= (l2_pn << PTE_PPN0_S); 2714 pmap_store(l1, entry); 2715 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2716 l2 = pmap_l1_to_l2(l1, va); 2717 } 2718 2719 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2720 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2721 if (l3_m == NULL) 2722 panic("pmap_enter: l3 pte_m == NULL"); 2723 if ((l3_m->flags & PG_ZERO) == 0) 2724 pmap_zero_page(l3_m); 2725 2726 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2727 l3_pn = (l3_pa / PAGE_SIZE); 2728 entry = (PTE_V); 2729 entry |= (l3_pn << PTE_PPN0_S); 2730 pmap_store(l2, entry); 2731 l3 = pmap_l2_to_l3(l2, va); 2732 } 2733 pmap_invalidate_page(pmap, va); 2734 } 2735 2736 orig_l3 = pmap_load(l3); 2737 opa = PTE_TO_PHYS(orig_l3); 2738 pv = NULL; 2739 2740 /* 2741 * Is the specified virtual address already mapped? 2742 */ 2743 if ((orig_l3 & PTE_V) != 0) { 2744 /* 2745 * Wiring change, just update stats. We don't worry about 2746 * wiring PT pages as they remain resident as long as there 2747 * are valid mappings in them. Hence, if a user page is wired, 2748 * the PT page will be also. 2749 */ 2750 if ((flags & PMAP_ENTER_WIRED) != 0 && 2751 (orig_l3 & PTE_SW_WIRED) == 0) 2752 pmap->pm_stats.wired_count++; 2753 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2754 (orig_l3 & PTE_SW_WIRED) != 0) 2755 pmap->pm_stats.wired_count--; 2756 2757 /* 2758 * Remove the extra PT page reference. 2759 */ 2760 if (mpte != NULL) { 2761 mpte->ref_count--; 2762 KASSERT(mpte->ref_count > 0, 2763 ("pmap_enter: missing reference to page table page," 2764 " va: 0x%lx", va)); 2765 } 2766 2767 /* 2768 * Has the physical page changed? 2769 */ 2770 if (opa == pa) { 2771 /* 2772 * No, might be a protection or wiring change. 2773 */ 2774 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2775 (new_l3 & PTE_W) != 0) 2776 vm_page_aflag_set(m, PGA_WRITEABLE); 2777 goto validate; 2778 } 2779 2780 /* 2781 * The physical page has changed. Temporarily invalidate 2782 * the mapping. This ensures that all threads sharing the 2783 * pmap keep a consistent view of the mapping, which is 2784 * necessary for the correct handling of COW faults. It 2785 * also permits reuse of the old mapping's PV entry, 2786 * avoiding an allocation. 2787 * 2788 * For consistency, handle unmanaged mappings the same way. 2789 */ 2790 orig_l3 = pmap_load_clear(l3); 2791 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2792 ("pmap_enter: unexpected pa update for %#lx", va)); 2793 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2794 om = PHYS_TO_VM_PAGE(opa); 2795 2796 /* 2797 * The pmap lock is sufficient to synchronize with 2798 * concurrent calls to pmap_page_test_mappings() and 2799 * pmap_ts_referenced(). 2800 */ 2801 if ((orig_l3 & PTE_D) != 0) 2802 vm_page_dirty(om); 2803 if ((orig_l3 & PTE_A) != 0) 2804 vm_page_aflag_set(om, PGA_REFERENCED); 2805 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2806 pv = pmap_pvh_remove(&om->md, pmap, va); 2807 KASSERT(pv != NULL, 2808 ("pmap_enter: no PV entry for %#lx", va)); 2809 if ((new_l3 & PTE_SW_MANAGED) == 0) 2810 free_pv_entry(pmap, pv); 2811 if ((om->a.flags & PGA_WRITEABLE) != 0 && 2812 TAILQ_EMPTY(&om->md.pv_list) && 2813 ((om->flags & PG_FICTITIOUS) != 0 || 2814 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 2815 vm_page_aflag_clear(om, PGA_WRITEABLE); 2816 } 2817 pmap_invalidate_page(pmap, va); 2818 orig_l3 = 0; 2819 } else { 2820 /* 2821 * Increment the counters. 2822 */ 2823 if ((new_l3 & PTE_SW_WIRED) != 0) 2824 pmap->pm_stats.wired_count++; 2825 pmap_resident_count_inc(pmap, 1); 2826 } 2827 /* 2828 * Enter on the PV list if part of our managed memory. 2829 */ 2830 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2831 if (pv == NULL) { 2832 pv = get_pv_entry(pmap, &lock); 2833 pv->pv_va = va; 2834 } 2835 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2836 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2837 m->md.pv_gen++; 2838 if ((new_l3 & PTE_W) != 0) 2839 vm_page_aflag_set(m, PGA_WRITEABLE); 2840 } 2841 2842 validate: 2843 /* 2844 * Sync the i-cache on all harts before updating the PTE 2845 * if the new PTE is executable. 2846 */ 2847 if (prot & VM_PROT_EXECUTE) 2848 pmap_sync_icache(pmap, va, PAGE_SIZE); 2849 2850 /* 2851 * Update the L3 entry. 2852 */ 2853 if (orig_l3 != 0) { 2854 orig_l3 = pmap_load_store(l3, new_l3); 2855 pmap_invalidate_page(pmap, va); 2856 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2857 ("pmap_enter: invalid update")); 2858 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2859 (PTE_D | PTE_SW_MANAGED)) 2860 vm_page_dirty(m); 2861 } else { 2862 pmap_store(l3, new_l3); 2863 } 2864 2865 #if VM_NRESERVLEVEL > 0 2866 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 2867 pmap_ps_enabled(pmap) && 2868 (m->flags & PG_FICTITIOUS) == 0 && 2869 vm_reserv_level_iffullpop(m) == 0) 2870 pmap_promote_l2(pmap, l2, va, &lock); 2871 #endif 2872 2873 rv = KERN_SUCCESS; 2874 out: 2875 if (lock != NULL) 2876 rw_wunlock(lock); 2877 rw_runlock(&pvh_global_lock); 2878 PMAP_UNLOCK(pmap); 2879 return (rv); 2880 } 2881 2882 /* 2883 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2884 * if successful. Returns false if (1) a page table page cannot be allocated 2885 * without sleeping, (2) a mapping already exists at the specified virtual 2886 * address, or (3) a PV entry cannot be allocated without reclaiming another 2887 * PV entry. 2888 */ 2889 static bool 2890 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2891 struct rwlock **lockp) 2892 { 2893 pd_entry_t new_l2; 2894 pn_t pn; 2895 2896 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2897 2898 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2899 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2900 if ((m->oflags & VPO_UNMANAGED) == 0) 2901 new_l2 |= PTE_SW_MANAGED; 2902 if ((prot & VM_PROT_EXECUTE) != 0) 2903 new_l2 |= PTE_X; 2904 if (va < VM_MAXUSER_ADDRESS) 2905 new_l2 |= PTE_U; 2906 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2907 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2908 KERN_SUCCESS); 2909 } 2910 2911 /* 2912 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2913 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2914 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2915 * a mapping already exists at the specified virtual address. Returns 2916 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2917 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2918 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2919 * 2920 * The parameter "m" is only used when creating a managed, writeable mapping. 2921 */ 2922 static int 2923 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2924 vm_page_t m, struct rwlock **lockp) 2925 { 2926 struct spglist free; 2927 pd_entry_t *l2, *l3, oldl2; 2928 vm_offset_t sva; 2929 vm_page_t l2pg, mt; 2930 2931 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2932 2933 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2934 NULL : lockp)) == NULL) { 2935 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2936 va, pmap); 2937 return (KERN_RESOURCE_SHORTAGE); 2938 } 2939 2940 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2941 l2 = &l2[pmap_l2_index(va)]; 2942 if ((oldl2 = pmap_load(l2)) != 0) { 2943 KASSERT(l2pg->ref_count > 1, 2944 ("pmap_enter_l2: l2pg's ref count is too low")); 2945 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2946 l2pg->ref_count--; 2947 CTR2(KTR_PMAP, 2948 "pmap_enter_l2: failure for va %#lx in pmap %p", 2949 va, pmap); 2950 return (KERN_FAILURE); 2951 } 2952 SLIST_INIT(&free); 2953 if ((oldl2 & PTE_RWX) != 0) 2954 (void)pmap_remove_l2(pmap, l2, va, 2955 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2956 else 2957 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 2958 l3 = pmap_l2_to_l3(l2, sva); 2959 if ((pmap_load(l3) & PTE_V) != 0 && 2960 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 2961 lockp) != 0) 2962 break; 2963 } 2964 vm_page_free_pages_toq(&free, true); 2965 if (va >= VM_MAXUSER_ADDRESS) { 2966 /* 2967 * Both pmap_remove_l2() and pmap_remove_l3() will 2968 * leave the kernel page table page zero filled. 2969 */ 2970 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2971 if (pmap_insert_pt_page(pmap, mt, false)) 2972 panic("pmap_enter_l2: trie insert failed"); 2973 } else 2974 KASSERT(pmap_load(l2) == 0, 2975 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 2976 } 2977 2978 if ((new_l2 & PTE_SW_MANAGED) != 0) { 2979 /* 2980 * Abort this mapping if its PV entry could not be created. 2981 */ 2982 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 2983 SLIST_INIT(&free); 2984 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 2985 /* 2986 * Although "va" is not mapped, paging-structure 2987 * caches could nonetheless have entries that 2988 * refer to the freed page table pages. 2989 * Invalidate those entries. 2990 */ 2991 pmap_invalidate_page(pmap, va); 2992 vm_page_free_pages_toq(&free, true); 2993 } 2994 CTR2(KTR_PMAP, 2995 "pmap_enter_l2: failure for va %#lx in pmap %p", 2996 va, pmap); 2997 return (KERN_RESOURCE_SHORTAGE); 2998 } 2999 if ((new_l2 & PTE_W) != 0) 3000 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3001 vm_page_aflag_set(mt, PGA_WRITEABLE); 3002 } 3003 3004 /* 3005 * Increment counters. 3006 */ 3007 if ((new_l2 & PTE_SW_WIRED) != 0) 3008 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3009 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3010 3011 /* 3012 * Map the superpage. 3013 */ 3014 pmap_store(l2, new_l2); 3015 3016 atomic_add_long(&pmap_l2_mappings, 1); 3017 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3018 va, pmap); 3019 3020 return (KERN_SUCCESS); 3021 } 3022 3023 /* 3024 * Maps a sequence of resident pages belonging to the same object. 3025 * The sequence begins with the given page m_start. This page is 3026 * mapped at the given virtual address start. Each subsequent page is 3027 * mapped at a virtual address that is offset from start by the same 3028 * amount as the page is offset from m_start within the object. The 3029 * last page in the sequence is the page with the largest offset from 3030 * m_start that can be mapped at a virtual address less than the given 3031 * virtual address end. Not every virtual page between start and end 3032 * is mapped; only those for which a resident page exists with the 3033 * corresponding offset from m_start are mapped. 3034 */ 3035 void 3036 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3037 vm_page_t m_start, vm_prot_t prot) 3038 { 3039 struct rwlock *lock; 3040 vm_offset_t va; 3041 vm_page_t m, mpte; 3042 vm_pindex_t diff, psize; 3043 3044 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3045 3046 psize = atop(end - start); 3047 mpte = NULL; 3048 m = m_start; 3049 lock = NULL; 3050 rw_rlock(&pvh_global_lock); 3051 PMAP_LOCK(pmap); 3052 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3053 va = start + ptoa(diff); 3054 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3055 m->psind == 1 && pmap_ps_enabled(pmap) && 3056 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3057 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3058 else 3059 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3060 &lock); 3061 m = TAILQ_NEXT(m, listq); 3062 } 3063 if (lock != NULL) 3064 rw_wunlock(lock); 3065 rw_runlock(&pvh_global_lock); 3066 PMAP_UNLOCK(pmap); 3067 } 3068 3069 /* 3070 * this code makes some *MAJOR* assumptions: 3071 * 1. Current pmap & pmap exists. 3072 * 2. Not wired. 3073 * 3. Read access. 3074 * 4. No page table pages. 3075 * but is *MUCH* faster than pmap_enter... 3076 */ 3077 3078 void 3079 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3080 { 3081 struct rwlock *lock; 3082 3083 lock = NULL; 3084 rw_rlock(&pvh_global_lock); 3085 PMAP_LOCK(pmap); 3086 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3087 if (lock != NULL) 3088 rw_wunlock(lock); 3089 rw_runlock(&pvh_global_lock); 3090 PMAP_UNLOCK(pmap); 3091 } 3092 3093 static vm_page_t 3094 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3095 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3096 { 3097 struct spglist free; 3098 vm_paddr_t phys; 3099 pd_entry_t *l2; 3100 pt_entry_t *l3, newl3; 3101 3102 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3103 (m->oflags & VPO_UNMANAGED) != 0, 3104 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3105 rw_assert(&pvh_global_lock, RA_LOCKED); 3106 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3107 3108 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3109 /* 3110 * In the case that a page table page is not 3111 * resident, we are creating it here. 3112 */ 3113 if (va < VM_MAXUSER_ADDRESS) { 3114 vm_pindex_t l2pindex; 3115 3116 /* 3117 * Calculate pagetable page index 3118 */ 3119 l2pindex = pmap_l2_pindex(va); 3120 if (mpte && (mpte->pindex == l2pindex)) { 3121 mpte->ref_count++; 3122 } else { 3123 /* 3124 * Get the l2 entry 3125 */ 3126 l2 = pmap_l2(pmap, va); 3127 3128 /* 3129 * If the page table page is mapped, we just increment 3130 * the hold count, and activate it. Otherwise, we 3131 * attempt to allocate a page table page. If this 3132 * attempt fails, we don't retry. Instead, we give up. 3133 */ 3134 if (l2 != NULL && pmap_load(l2) != 0) { 3135 phys = PTE_TO_PHYS(pmap_load(l2)); 3136 mpte = PHYS_TO_VM_PAGE(phys); 3137 mpte->ref_count++; 3138 } else { 3139 /* 3140 * Pass NULL instead of the PV list lock 3141 * pointer, because we don't intend to sleep. 3142 */ 3143 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3144 if (mpte == NULL) 3145 return (mpte); 3146 } 3147 } 3148 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3149 l3 = &l3[pmap_l3_index(va)]; 3150 } else { 3151 mpte = NULL; 3152 l3 = pmap_l3(kernel_pmap, va); 3153 } 3154 if (l3 == NULL) 3155 panic("pmap_enter_quick_locked: No l3"); 3156 if (pmap_load(l3) != 0) { 3157 if (mpte != NULL) { 3158 mpte->ref_count--; 3159 mpte = NULL; 3160 } 3161 return (mpte); 3162 } 3163 3164 /* 3165 * Enter on the PV list if part of our managed memory. 3166 */ 3167 if ((m->oflags & VPO_UNMANAGED) == 0 && 3168 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3169 if (mpte != NULL) { 3170 SLIST_INIT(&free); 3171 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3172 pmap_invalidate_page(pmap, va); 3173 vm_page_free_pages_toq(&free, false); 3174 } 3175 mpte = NULL; 3176 } 3177 return (mpte); 3178 } 3179 3180 /* 3181 * Increment counters 3182 */ 3183 pmap_resident_count_inc(pmap, 1); 3184 3185 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3186 PTE_V | PTE_R; 3187 if ((prot & VM_PROT_EXECUTE) != 0) 3188 newl3 |= PTE_X; 3189 if ((m->oflags & VPO_UNMANAGED) == 0) 3190 newl3 |= PTE_SW_MANAGED; 3191 if (va < VM_MAX_USER_ADDRESS) 3192 newl3 |= PTE_U; 3193 3194 /* 3195 * Sync the i-cache on all harts before updating the PTE 3196 * if the new PTE is executable. 3197 */ 3198 if (prot & VM_PROT_EXECUTE) 3199 pmap_sync_icache(pmap, va, PAGE_SIZE); 3200 3201 pmap_store(l3, newl3); 3202 3203 pmap_invalidate_page(pmap, va); 3204 return (mpte); 3205 } 3206 3207 /* 3208 * This code maps large physical mmap regions into the 3209 * processor address space. Note that some shortcuts 3210 * are taken, but the code works. 3211 */ 3212 void 3213 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3214 vm_pindex_t pindex, vm_size_t size) 3215 { 3216 3217 VM_OBJECT_ASSERT_WLOCKED(object); 3218 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3219 ("pmap_object_init_pt: non-device object")); 3220 } 3221 3222 /* 3223 * Clear the wired attribute from the mappings for the specified range of 3224 * addresses in the given pmap. Every valid mapping within that range 3225 * must have the wired attribute set. In contrast, invalid mappings 3226 * cannot have the wired attribute set, so they are ignored. 3227 * 3228 * The wired attribute of the page table entry is not a hardware feature, 3229 * so there is no need to invalidate any TLB entries. 3230 */ 3231 void 3232 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3233 { 3234 vm_offset_t va_next; 3235 pd_entry_t *l1, *l2, l2e; 3236 pt_entry_t *l3, l3e; 3237 bool pv_lists_locked; 3238 3239 pv_lists_locked = false; 3240 retry: 3241 PMAP_LOCK(pmap); 3242 for (; sva < eva; sva = va_next) { 3243 l1 = pmap_l1(pmap, sva); 3244 if (pmap_load(l1) == 0) { 3245 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3246 if (va_next < sva) 3247 va_next = eva; 3248 continue; 3249 } 3250 3251 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3252 if (va_next < sva) 3253 va_next = eva; 3254 3255 l2 = pmap_l1_to_l2(l1, sva); 3256 if ((l2e = pmap_load(l2)) == 0) 3257 continue; 3258 if ((l2e & PTE_RWX) != 0) { 3259 if (sva + L2_SIZE == va_next && eva >= va_next) { 3260 if ((l2e & PTE_SW_WIRED) == 0) 3261 panic("pmap_unwire: l2 %#jx is missing " 3262 "PTE_SW_WIRED", (uintmax_t)l2e); 3263 pmap_clear_bits(l2, PTE_SW_WIRED); 3264 continue; 3265 } else { 3266 if (!pv_lists_locked) { 3267 pv_lists_locked = true; 3268 if (!rw_try_rlock(&pvh_global_lock)) { 3269 PMAP_UNLOCK(pmap); 3270 rw_rlock(&pvh_global_lock); 3271 /* Repeat sva. */ 3272 goto retry; 3273 } 3274 } 3275 if (!pmap_demote_l2(pmap, l2, sva)) 3276 panic("pmap_unwire: demotion failed"); 3277 } 3278 } 3279 3280 if (va_next > eva) 3281 va_next = eva; 3282 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3283 sva += L3_SIZE) { 3284 if ((l3e = pmap_load(l3)) == 0) 3285 continue; 3286 if ((l3e & PTE_SW_WIRED) == 0) 3287 panic("pmap_unwire: l3 %#jx is missing " 3288 "PTE_SW_WIRED", (uintmax_t)l3e); 3289 3290 /* 3291 * PG_W must be cleared atomically. Although the pmap 3292 * lock synchronizes access to PG_W, another processor 3293 * could be setting PG_M and/or PG_A concurrently. 3294 */ 3295 pmap_clear_bits(l3, PTE_SW_WIRED); 3296 pmap->pm_stats.wired_count--; 3297 } 3298 } 3299 if (pv_lists_locked) 3300 rw_runlock(&pvh_global_lock); 3301 PMAP_UNLOCK(pmap); 3302 } 3303 3304 /* 3305 * Copy the range specified by src_addr/len 3306 * from the source map to the range dst_addr/len 3307 * in the destination map. 3308 * 3309 * This routine is only advisory and need not do anything. 3310 */ 3311 3312 void 3313 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3314 vm_offset_t src_addr) 3315 { 3316 3317 } 3318 3319 /* 3320 * pmap_zero_page zeros the specified hardware page by mapping 3321 * the page into KVM and using bzero to clear its contents. 3322 */ 3323 void 3324 pmap_zero_page(vm_page_t m) 3325 { 3326 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3327 3328 pagezero((void *)va); 3329 } 3330 3331 /* 3332 * pmap_zero_page_area zeros the specified hardware page by mapping 3333 * the page into KVM and using bzero to clear its contents. 3334 * 3335 * off and size may not cover an area beyond a single hardware page. 3336 */ 3337 void 3338 pmap_zero_page_area(vm_page_t m, int off, int size) 3339 { 3340 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3341 3342 if (off == 0 && size == PAGE_SIZE) 3343 pagezero((void *)va); 3344 else 3345 bzero((char *)va + off, size); 3346 } 3347 3348 /* 3349 * pmap_copy_page copies the specified (machine independent) 3350 * page by mapping the page into virtual memory and using 3351 * bcopy to copy the page, one machine dependent page at a 3352 * time. 3353 */ 3354 void 3355 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3356 { 3357 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3358 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3359 3360 pagecopy((void *)src, (void *)dst); 3361 } 3362 3363 int unmapped_buf_allowed = 1; 3364 3365 void 3366 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3367 vm_offset_t b_offset, int xfersize) 3368 { 3369 void *a_cp, *b_cp; 3370 vm_page_t m_a, m_b; 3371 vm_paddr_t p_a, p_b; 3372 vm_offset_t a_pg_offset, b_pg_offset; 3373 int cnt; 3374 3375 while (xfersize > 0) { 3376 a_pg_offset = a_offset & PAGE_MASK; 3377 m_a = ma[a_offset >> PAGE_SHIFT]; 3378 p_a = m_a->phys_addr; 3379 b_pg_offset = b_offset & PAGE_MASK; 3380 m_b = mb[b_offset >> PAGE_SHIFT]; 3381 p_b = m_b->phys_addr; 3382 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3383 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3384 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3385 panic("!DMAP a %lx", p_a); 3386 } else { 3387 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3388 } 3389 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3390 panic("!DMAP b %lx", p_b); 3391 } else { 3392 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3393 } 3394 bcopy(a_cp, b_cp, cnt); 3395 a_offset += cnt; 3396 b_offset += cnt; 3397 xfersize -= cnt; 3398 } 3399 } 3400 3401 vm_offset_t 3402 pmap_quick_enter_page(vm_page_t m) 3403 { 3404 3405 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3406 } 3407 3408 void 3409 pmap_quick_remove_page(vm_offset_t addr) 3410 { 3411 } 3412 3413 /* 3414 * Returns true if the pmap's pv is one of the first 3415 * 16 pvs linked to from this page. This count may 3416 * be changed upwards or downwards in the future; it 3417 * is only necessary that true be returned for a small 3418 * subset of pmaps for proper page aging. 3419 */ 3420 boolean_t 3421 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3422 { 3423 struct md_page *pvh; 3424 struct rwlock *lock; 3425 pv_entry_t pv; 3426 int loops = 0; 3427 boolean_t rv; 3428 3429 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3430 ("pmap_page_exists_quick: page %p is not managed", m)); 3431 rv = FALSE; 3432 rw_rlock(&pvh_global_lock); 3433 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3434 rw_rlock(lock); 3435 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3436 if (PV_PMAP(pv) == pmap) { 3437 rv = TRUE; 3438 break; 3439 } 3440 loops++; 3441 if (loops >= 16) 3442 break; 3443 } 3444 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3445 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3446 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3447 if (PV_PMAP(pv) == pmap) { 3448 rv = TRUE; 3449 break; 3450 } 3451 loops++; 3452 if (loops >= 16) 3453 break; 3454 } 3455 } 3456 rw_runlock(lock); 3457 rw_runlock(&pvh_global_lock); 3458 return (rv); 3459 } 3460 3461 /* 3462 * pmap_page_wired_mappings: 3463 * 3464 * Return the number of managed mappings to the given physical page 3465 * that are wired. 3466 */ 3467 int 3468 pmap_page_wired_mappings(vm_page_t m) 3469 { 3470 struct md_page *pvh; 3471 struct rwlock *lock; 3472 pmap_t pmap; 3473 pd_entry_t *l2; 3474 pt_entry_t *l3; 3475 pv_entry_t pv; 3476 int count, md_gen, pvh_gen; 3477 3478 if ((m->oflags & VPO_UNMANAGED) != 0) 3479 return (0); 3480 rw_rlock(&pvh_global_lock); 3481 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3482 rw_rlock(lock); 3483 restart: 3484 count = 0; 3485 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3486 pmap = PV_PMAP(pv); 3487 if (!PMAP_TRYLOCK(pmap)) { 3488 md_gen = m->md.pv_gen; 3489 rw_runlock(lock); 3490 PMAP_LOCK(pmap); 3491 rw_rlock(lock); 3492 if (md_gen != m->md.pv_gen) { 3493 PMAP_UNLOCK(pmap); 3494 goto restart; 3495 } 3496 } 3497 l3 = pmap_l3(pmap, pv->pv_va); 3498 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3499 count++; 3500 PMAP_UNLOCK(pmap); 3501 } 3502 if ((m->flags & PG_FICTITIOUS) == 0) { 3503 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3504 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3505 pmap = PV_PMAP(pv); 3506 if (!PMAP_TRYLOCK(pmap)) { 3507 md_gen = m->md.pv_gen; 3508 pvh_gen = pvh->pv_gen; 3509 rw_runlock(lock); 3510 PMAP_LOCK(pmap); 3511 rw_rlock(lock); 3512 if (md_gen != m->md.pv_gen || 3513 pvh_gen != pvh->pv_gen) { 3514 PMAP_UNLOCK(pmap); 3515 goto restart; 3516 } 3517 } 3518 l2 = pmap_l2(pmap, pv->pv_va); 3519 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3520 count++; 3521 PMAP_UNLOCK(pmap); 3522 } 3523 } 3524 rw_runlock(lock); 3525 rw_runlock(&pvh_global_lock); 3526 return (count); 3527 } 3528 3529 /* 3530 * Returns true if the given page is mapped individually or as part of 3531 * a 2mpage. Otherwise, returns false. 3532 */ 3533 bool 3534 pmap_page_is_mapped(vm_page_t m) 3535 { 3536 struct rwlock *lock; 3537 bool rv; 3538 3539 if ((m->oflags & VPO_UNMANAGED) != 0) 3540 return (false); 3541 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3542 rw_rlock(lock); 3543 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3544 ((m->flags & PG_FICTITIOUS) == 0 && 3545 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3546 rw_runlock(lock); 3547 return (rv); 3548 } 3549 3550 static void 3551 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3552 struct spglist *free, bool superpage) 3553 { 3554 struct md_page *pvh; 3555 vm_page_t mpte, mt; 3556 3557 if (superpage) { 3558 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3559 pvh = pa_to_pvh(m->phys_addr); 3560 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3561 pvh->pv_gen++; 3562 if (TAILQ_EMPTY(&pvh->pv_list)) { 3563 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3564 if (TAILQ_EMPTY(&mt->md.pv_list) && 3565 (mt->a.flags & PGA_WRITEABLE) != 0) 3566 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3567 } 3568 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3569 if (mpte != NULL) { 3570 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3571 ("pmap_remove_pages: pte page not promoted")); 3572 pmap_resident_count_dec(pmap, 1); 3573 KASSERT(mpte->ref_count == Ln_ENTRIES, 3574 ("pmap_remove_pages: pte page ref count error")); 3575 mpte->ref_count = 0; 3576 pmap_add_delayed_free_list(mpte, free, FALSE); 3577 } 3578 } else { 3579 pmap_resident_count_dec(pmap, 1); 3580 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3581 m->md.pv_gen++; 3582 if (TAILQ_EMPTY(&m->md.pv_list) && 3583 (m->a.flags & PGA_WRITEABLE) != 0) { 3584 pvh = pa_to_pvh(m->phys_addr); 3585 if (TAILQ_EMPTY(&pvh->pv_list)) 3586 vm_page_aflag_clear(m, PGA_WRITEABLE); 3587 } 3588 } 3589 } 3590 3591 /* 3592 * Destroy all managed, non-wired mappings in the given user-space 3593 * pmap. This pmap cannot be active on any processor besides the 3594 * caller. 3595 * 3596 * This function cannot be applied to the kernel pmap. Moreover, it 3597 * is not intended for general use. It is only to be used during 3598 * process termination. Consequently, it can be implemented in ways 3599 * that make it faster than pmap_remove(). First, it can more quickly 3600 * destroy mappings by iterating over the pmap's collection of PV 3601 * entries, rather than searching the page table. Second, it doesn't 3602 * have to test and clear the page table entries atomically, because 3603 * no processor is currently accessing the user address space. In 3604 * particular, a page table entry's dirty bit won't change state once 3605 * this function starts. 3606 */ 3607 void 3608 pmap_remove_pages(pmap_t pmap) 3609 { 3610 struct spglist free; 3611 pd_entry_t ptepde; 3612 pt_entry_t *pte, tpte; 3613 vm_page_t m, mt; 3614 pv_entry_t pv; 3615 struct pv_chunk *pc, *npc; 3616 struct rwlock *lock; 3617 int64_t bit; 3618 uint64_t inuse, bitmask; 3619 int allfree, field, freed, idx; 3620 bool superpage; 3621 3622 lock = NULL; 3623 3624 SLIST_INIT(&free); 3625 rw_rlock(&pvh_global_lock); 3626 PMAP_LOCK(pmap); 3627 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3628 allfree = 1; 3629 freed = 0; 3630 for (field = 0; field < _NPCM; field++) { 3631 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3632 while (inuse != 0) { 3633 bit = ffsl(inuse) - 1; 3634 bitmask = 1UL << bit; 3635 idx = field * 64 + bit; 3636 pv = &pc->pc_pventry[idx]; 3637 inuse &= ~bitmask; 3638 3639 pte = pmap_l1(pmap, pv->pv_va); 3640 ptepde = pmap_load(pte); 3641 pte = pmap_l1_to_l2(pte, pv->pv_va); 3642 tpte = pmap_load(pte); 3643 if ((tpte & PTE_RWX) != 0) { 3644 superpage = true; 3645 } else { 3646 ptepde = tpte; 3647 pte = pmap_l2_to_l3(pte, pv->pv_va); 3648 tpte = pmap_load(pte); 3649 superpage = false; 3650 } 3651 3652 /* 3653 * We cannot remove wired pages from a 3654 * process' mapping at this time. 3655 */ 3656 if (tpte & PTE_SW_WIRED) { 3657 allfree = 0; 3658 continue; 3659 } 3660 3661 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3662 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3663 m < &vm_page_array[vm_page_array_size], 3664 ("pmap_remove_pages: bad pte %#jx", 3665 (uintmax_t)tpte)); 3666 3667 pmap_clear(pte); 3668 3669 /* 3670 * Update the vm_page_t clean/reference bits. 3671 */ 3672 if ((tpte & (PTE_D | PTE_W)) == 3673 (PTE_D | PTE_W)) { 3674 if (superpage) 3675 for (mt = m; 3676 mt < &m[Ln_ENTRIES]; mt++) 3677 vm_page_dirty(mt); 3678 else 3679 vm_page_dirty(m); 3680 } 3681 3682 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3683 3684 /* Mark free */ 3685 pc->pc_map[field] |= bitmask; 3686 3687 pmap_remove_pages_pv(pmap, m, pv, &free, 3688 superpage); 3689 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3690 freed++; 3691 } 3692 } 3693 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3694 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3695 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3696 if (allfree) { 3697 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3698 free_pv_chunk(pc); 3699 } 3700 } 3701 if (lock != NULL) 3702 rw_wunlock(lock); 3703 pmap_invalidate_all(pmap); 3704 rw_runlock(&pvh_global_lock); 3705 PMAP_UNLOCK(pmap); 3706 vm_page_free_pages_toq(&free, false); 3707 } 3708 3709 static bool 3710 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3711 { 3712 struct md_page *pvh; 3713 struct rwlock *lock; 3714 pd_entry_t *l2; 3715 pt_entry_t *l3, mask; 3716 pv_entry_t pv; 3717 pmap_t pmap; 3718 int md_gen, pvh_gen; 3719 bool rv; 3720 3721 mask = 0; 3722 if (modified) 3723 mask |= PTE_D; 3724 if (accessed) 3725 mask |= PTE_A; 3726 3727 rv = FALSE; 3728 rw_rlock(&pvh_global_lock); 3729 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3730 rw_rlock(lock); 3731 restart: 3732 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3733 pmap = PV_PMAP(pv); 3734 if (!PMAP_TRYLOCK(pmap)) { 3735 md_gen = m->md.pv_gen; 3736 rw_runlock(lock); 3737 PMAP_LOCK(pmap); 3738 rw_rlock(lock); 3739 if (md_gen != m->md.pv_gen) { 3740 PMAP_UNLOCK(pmap); 3741 goto restart; 3742 } 3743 } 3744 l3 = pmap_l3(pmap, pv->pv_va); 3745 rv = (pmap_load(l3) & mask) == mask; 3746 PMAP_UNLOCK(pmap); 3747 if (rv) 3748 goto out; 3749 } 3750 if ((m->flags & PG_FICTITIOUS) == 0) { 3751 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3752 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3753 pmap = PV_PMAP(pv); 3754 if (!PMAP_TRYLOCK(pmap)) { 3755 md_gen = m->md.pv_gen; 3756 pvh_gen = pvh->pv_gen; 3757 rw_runlock(lock); 3758 PMAP_LOCK(pmap); 3759 rw_rlock(lock); 3760 if (md_gen != m->md.pv_gen || 3761 pvh_gen != pvh->pv_gen) { 3762 PMAP_UNLOCK(pmap); 3763 goto restart; 3764 } 3765 } 3766 l2 = pmap_l2(pmap, pv->pv_va); 3767 rv = (pmap_load(l2) & mask) == mask; 3768 PMAP_UNLOCK(pmap); 3769 if (rv) 3770 goto out; 3771 } 3772 } 3773 out: 3774 rw_runlock(lock); 3775 rw_runlock(&pvh_global_lock); 3776 return (rv); 3777 } 3778 3779 /* 3780 * pmap_is_modified: 3781 * 3782 * Return whether or not the specified physical page was modified 3783 * in any physical maps. 3784 */ 3785 boolean_t 3786 pmap_is_modified(vm_page_t m) 3787 { 3788 3789 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3790 ("pmap_is_modified: page %p is not managed", m)); 3791 3792 /* 3793 * If the page is not busied then this check is racy. 3794 */ 3795 if (!pmap_page_is_write_mapped(m)) 3796 return (FALSE); 3797 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3798 } 3799 3800 /* 3801 * pmap_is_prefaultable: 3802 * 3803 * Return whether or not the specified virtual address is eligible 3804 * for prefault. 3805 */ 3806 boolean_t 3807 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3808 { 3809 pt_entry_t *l3; 3810 boolean_t rv; 3811 3812 rv = FALSE; 3813 PMAP_LOCK(pmap); 3814 l3 = pmap_l3(pmap, addr); 3815 if (l3 != NULL && pmap_load(l3) != 0) { 3816 rv = TRUE; 3817 } 3818 PMAP_UNLOCK(pmap); 3819 return (rv); 3820 } 3821 3822 /* 3823 * pmap_is_referenced: 3824 * 3825 * Return whether or not the specified physical page was referenced 3826 * in any physical maps. 3827 */ 3828 boolean_t 3829 pmap_is_referenced(vm_page_t m) 3830 { 3831 3832 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3833 ("pmap_is_referenced: page %p is not managed", m)); 3834 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3835 } 3836 3837 /* 3838 * Clear the write and modified bits in each of the given page's mappings. 3839 */ 3840 void 3841 pmap_remove_write(vm_page_t m) 3842 { 3843 struct md_page *pvh; 3844 struct rwlock *lock; 3845 pmap_t pmap; 3846 pd_entry_t *l2; 3847 pt_entry_t *l3, oldl3, newl3; 3848 pv_entry_t next_pv, pv; 3849 vm_offset_t va; 3850 int md_gen, pvh_gen; 3851 3852 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3853 ("pmap_remove_write: page %p is not managed", m)); 3854 vm_page_assert_busied(m); 3855 3856 if (!pmap_page_is_write_mapped(m)) 3857 return; 3858 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3859 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 3860 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3861 rw_rlock(&pvh_global_lock); 3862 retry_pv_loop: 3863 rw_wlock(lock); 3864 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3865 pmap = PV_PMAP(pv); 3866 if (!PMAP_TRYLOCK(pmap)) { 3867 pvh_gen = pvh->pv_gen; 3868 rw_wunlock(lock); 3869 PMAP_LOCK(pmap); 3870 rw_wlock(lock); 3871 if (pvh_gen != pvh->pv_gen) { 3872 PMAP_UNLOCK(pmap); 3873 rw_wunlock(lock); 3874 goto retry_pv_loop; 3875 } 3876 } 3877 va = pv->pv_va; 3878 l2 = pmap_l2(pmap, va); 3879 if ((pmap_load(l2) & PTE_W) != 0) 3880 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3881 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3882 ("inconsistent pv lock %p %p for page %p", 3883 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3884 PMAP_UNLOCK(pmap); 3885 } 3886 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3887 pmap = PV_PMAP(pv); 3888 if (!PMAP_TRYLOCK(pmap)) { 3889 pvh_gen = pvh->pv_gen; 3890 md_gen = m->md.pv_gen; 3891 rw_wunlock(lock); 3892 PMAP_LOCK(pmap); 3893 rw_wlock(lock); 3894 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3895 PMAP_UNLOCK(pmap); 3896 rw_wunlock(lock); 3897 goto retry_pv_loop; 3898 } 3899 } 3900 l3 = pmap_l3(pmap, pv->pv_va); 3901 oldl3 = pmap_load(l3); 3902 retry: 3903 if ((oldl3 & PTE_W) != 0) { 3904 newl3 = oldl3 & ~(PTE_D | PTE_W); 3905 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3906 goto retry; 3907 if ((oldl3 & PTE_D) != 0) 3908 vm_page_dirty(m); 3909 pmap_invalidate_page(pmap, pv->pv_va); 3910 } 3911 PMAP_UNLOCK(pmap); 3912 } 3913 rw_wunlock(lock); 3914 vm_page_aflag_clear(m, PGA_WRITEABLE); 3915 rw_runlock(&pvh_global_lock); 3916 } 3917 3918 /* 3919 * pmap_ts_referenced: 3920 * 3921 * Return a count of reference bits for a page, clearing those bits. 3922 * It is not necessary for every reference bit to be cleared, but it 3923 * is necessary that 0 only be returned when there are truly no 3924 * reference bits set. 3925 * 3926 * As an optimization, update the page's dirty field if a modified bit is 3927 * found while counting reference bits. This opportunistic update can be 3928 * performed at low cost and can eliminate the need for some future calls 3929 * to pmap_is_modified(). However, since this function stops after 3930 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3931 * dirty pages. Those dirty pages will only be detected by a future call 3932 * to pmap_is_modified(). 3933 */ 3934 int 3935 pmap_ts_referenced(vm_page_t m) 3936 { 3937 struct spglist free; 3938 struct md_page *pvh; 3939 struct rwlock *lock; 3940 pv_entry_t pv, pvf; 3941 pmap_t pmap; 3942 pd_entry_t *l2, l2e; 3943 pt_entry_t *l3, l3e; 3944 vm_paddr_t pa; 3945 vm_offset_t va; 3946 int cleared, md_gen, not_cleared, pvh_gen; 3947 3948 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3949 ("pmap_ts_referenced: page %p is not managed", m)); 3950 SLIST_INIT(&free); 3951 cleared = 0; 3952 pa = VM_PAGE_TO_PHYS(m); 3953 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 3954 3955 lock = PHYS_TO_PV_LIST_LOCK(pa); 3956 rw_rlock(&pvh_global_lock); 3957 rw_wlock(lock); 3958 retry: 3959 not_cleared = 0; 3960 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 3961 goto small_mappings; 3962 pv = pvf; 3963 do { 3964 pmap = PV_PMAP(pv); 3965 if (!PMAP_TRYLOCK(pmap)) { 3966 pvh_gen = pvh->pv_gen; 3967 rw_wunlock(lock); 3968 PMAP_LOCK(pmap); 3969 rw_wlock(lock); 3970 if (pvh_gen != pvh->pv_gen) { 3971 PMAP_UNLOCK(pmap); 3972 goto retry; 3973 } 3974 } 3975 va = pv->pv_va; 3976 l2 = pmap_l2(pmap, va); 3977 l2e = pmap_load(l2); 3978 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 3979 /* 3980 * Although l2e is mapping a 2MB page, because 3981 * this function is called at a 4KB page granularity, 3982 * we only update the 4KB page under test. 3983 */ 3984 vm_page_dirty(m); 3985 } 3986 if ((l2e & PTE_A) != 0) { 3987 /* 3988 * Since this reference bit is shared by 512 4KB 3989 * pages, it should not be cleared every time it is 3990 * tested. Apply a simple "hash" function on the 3991 * physical page number, the virtual superpage number, 3992 * and the pmap address to select one 4KB page out of 3993 * the 512 on which testing the reference bit will 3994 * result in clearing that reference bit. This 3995 * function is designed to avoid the selection of the 3996 * same 4KB page for every 2MB page mapping. 3997 * 3998 * On demotion, a mapping that hasn't been referenced 3999 * is simply destroyed. To avoid the possibility of a 4000 * subsequent page fault on a demoted wired mapping, 4001 * always leave its reference bit set. Moreover, 4002 * since the superpage is wired, the current state of 4003 * its reference bit won't affect page replacement. 4004 */ 4005 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4006 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4007 (l2e & PTE_SW_WIRED) == 0) { 4008 pmap_clear_bits(l2, PTE_A); 4009 pmap_invalidate_page(pmap, va); 4010 cleared++; 4011 } else 4012 not_cleared++; 4013 } 4014 PMAP_UNLOCK(pmap); 4015 /* Rotate the PV list if it has more than one entry. */ 4016 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4017 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4018 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4019 pvh->pv_gen++; 4020 } 4021 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4022 goto out; 4023 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4024 small_mappings: 4025 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4026 goto out; 4027 pv = pvf; 4028 do { 4029 pmap = PV_PMAP(pv); 4030 if (!PMAP_TRYLOCK(pmap)) { 4031 pvh_gen = pvh->pv_gen; 4032 md_gen = m->md.pv_gen; 4033 rw_wunlock(lock); 4034 PMAP_LOCK(pmap); 4035 rw_wlock(lock); 4036 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4037 PMAP_UNLOCK(pmap); 4038 goto retry; 4039 } 4040 } 4041 l2 = pmap_l2(pmap, pv->pv_va); 4042 4043 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4044 ("pmap_ts_referenced: found an invalid l2 table")); 4045 4046 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4047 l3e = pmap_load(l3); 4048 if ((l3e & PTE_D) != 0) 4049 vm_page_dirty(m); 4050 if ((l3e & PTE_A) != 0) { 4051 if ((l3e & PTE_SW_WIRED) == 0) { 4052 /* 4053 * Wired pages cannot be paged out so 4054 * doing accessed bit emulation for 4055 * them is wasted effort. We do the 4056 * hard work for unwired pages only. 4057 */ 4058 pmap_clear_bits(l3, PTE_A); 4059 pmap_invalidate_page(pmap, pv->pv_va); 4060 cleared++; 4061 } else 4062 not_cleared++; 4063 } 4064 PMAP_UNLOCK(pmap); 4065 /* Rotate the PV list if it has more than one entry. */ 4066 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4067 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4068 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4069 m->md.pv_gen++; 4070 } 4071 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4072 not_cleared < PMAP_TS_REFERENCED_MAX); 4073 out: 4074 rw_wunlock(lock); 4075 rw_runlock(&pvh_global_lock); 4076 vm_page_free_pages_toq(&free, false); 4077 return (cleared + not_cleared); 4078 } 4079 4080 /* 4081 * Apply the given advice to the specified range of addresses within the 4082 * given pmap. Depending on the advice, clear the referenced and/or 4083 * modified flags in each mapping and set the mapped page's dirty field. 4084 */ 4085 void 4086 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4087 { 4088 } 4089 4090 /* 4091 * Clear the modify bits on the specified physical page. 4092 */ 4093 void 4094 pmap_clear_modify(vm_page_t m) 4095 { 4096 struct md_page *pvh; 4097 struct rwlock *lock; 4098 pmap_t pmap; 4099 pv_entry_t next_pv, pv; 4100 pd_entry_t *l2, oldl2; 4101 pt_entry_t *l3; 4102 vm_offset_t va; 4103 int md_gen, pvh_gen; 4104 4105 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4106 ("pmap_clear_modify: page %p is not managed", m)); 4107 vm_page_assert_busied(m); 4108 4109 if (!pmap_page_is_write_mapped(m)) 4110 return; 4111 4112 /* 4113 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4114 * If the object containing the page is locked and the page is not 4115 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4116 */ 4117 if ((m->a.flags & PGA_WRITEABLE) == 0) 4118 return; 4119 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4120 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4121 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4122 rw_rlock(&pvh_global_lock); 4123 rw_wlock(lock); 4124 restart: 4125 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4126 pmap = PV_PMAP(pv); 4127 if (!PMAP_TRYLOCK(pmap)) { 4128 pvh_gen = pvh->pv_gen; 4129 rw_wunlock(lock); 4130 PMAP_LOCK(pmap); 4131 rw_wlock(lock); 4132 if (pvh_gen != pvh->pv_gen) { 4133 PMAP_UNLOCK(pmap); 4134 goto restart; 4135 } 4136 } 4137 va = pv->pv_va; 4138 l2 = pmap_l2(pmap, va); 4139 oldl2 = pmap_load(l2); 4140 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4141 if ((oldl2 & PTE_W) != 0 && 4142 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4143 (oldl2 & PTE_SW_WIRED) == 0) { 4144 /* 4145 * Write protect the mapping to a single page so that 4146 * a subsequent write access may repromote. 4147 */ 4148 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4149 l3 = pmap_l2_to_l3(l2, va); 4150 pmap_clear_bits(l3, PTE_D | PTE_W); 4151 vm_page_dirty(m); 4152 pmap_invalidate_page(pmap, va); 4153 } 4154 PMAP_UNLOCK(pmap); 4155 } 4156 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4157 pmap = PV_PMAP(pv); 4158 if (!PMAP_TRYLOCK(pmap)) { 4159 md_gen = m->md.pv_gen; 4160 pvh_gen = pvh->pv_gen; 4161 rw_wunlock(lock); 4162 PMAP_LOCK(pmap); 4163 rw_wlock(lock); 4164 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4165 PMAP_UNLOCK(pmap); 4166 goto restart; 4167 } 4168 } 4169 l2 = pmap_l2(pmap, pv->pv_va); 4170 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4171 ("pmap_clear_modify: found a 2mpage in page %p's pv list", 4172 m)); 4173 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4174 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4175 pmap_clear_bits(l3, PTE_D | PTE_W); 4176 pmap_invalidate_page(pmap, pv->pv_va); 4177 } 4178 PMAP_UNLOCK(pmap); 4179 } 4180 rw_wunlock(lock); 4181 rw_runlock(&pvh_global_lock); 4182 } 4183 4184 void * 4185 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4186 { 4187 4188 return ((void *)PHYS_TO_DMAP(pa)); 4189 } 4190 4191 void 4192 pmap_unmapbios(vm_paddr_t pa, vm_size_t size) 4193 { 4194 } 4195 4196 /* 4197 * Sets the memory attribute for the specified page. 4198 */ 4199 void 4200 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4201 { 4202 4203 m->md.pv_memattr = ma; 4204 } 4205 4206 /* 4207 * Perform the pmap work for mincore(2). If the page is not both referenced and 4208 * modified by this pmap, returns its physical address so that the caller can 4209 * find other mappings. 4210 */ 4211 int 4212 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4213 { 4214 pt_entry_t *l2, *l3, tpte; 4215 vm_paddr_t pa; 4216 int val; 4217 bool managed; 4218 4219 PMAP_LOCK(pmap); 4220 l2 = pmap_l2(pmap, addr); 4221 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4222 if ((tpte & PTE_RWX) != 0) { 4223 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4224 val = MINCORE_INCORE | MINCORE_SUPER; 4225 } else { 4226 l3 = pmap_l2_to_l3(l2, addr); 4227 tpte = pmap_load(l3); 4228 if ((tpte & PTE_V) == 0) { 4229 PMAP_UNLOCK(pmap); 4230 return (0); 4231 } 4232 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4233 val = MINCORE_INCORE; 4234 } 4235 4236 if ((tpte & PTE_D) != 0) 4237 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4238 if ((tpte & PTE_A) != 0) 4239 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4240 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4241 } else { 4242 managed = false; 4243 val = 0; 4244 } 4245 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4246 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4247 *pap = pa; 4248 } 4249 PMAP_UNLOCK(pmap); 4250 return (val); 4251 } 4252 4253 void 4254 pmap_activate_sw(struct thread *td) 4255 { 4256 pmap_t oldpmap, pmap; 4257 u_int hart; 4258 4259 oldpmap = PCPU_GET(curpmap); 4260 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4261 if (pmap == oldpmap) 4262 return; 4263 load_satp(pmap->pm_satp); 4264 4265 hart = PCPU_GET(hart); 4266 #ifdef SMP 4267 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4268 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4269 #else 4270 CPU_SET(hart, &pmap->pm_active); 4271 CPU_CLR(hart, &oldpmap->pm_active); 4272 #endif 4273 PCPU_SET(curpmap, pmap); 4274 4275 sfence_vma(); 4276 } 4277 4278 void 4279 pmap_activate(struct thread *td) 4280 { 4281 4282 critical_enter(); 4283 pmap_activate_sw(td); 4284 critical_exit(); 4285 } 4286 4287 void 4288 pmap_activate_boot(pmap_t pmap) 4289 { 4290 u_int hart; 4291 4292 hart = PCPU_GET(hart); 4293 #ifdef SMP 4294 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4295 #else 4296 CPU_SET(hart, &pmap->pm_active); 4297 #endif 4298 PCPU_SET(curpmap, pmap); 4299 } 4300 4301 void 4302 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4303 { 4304 cpuset_t mask; 4305 4306 /* 4307 * From the RISC-V User-Level ISA V2.2: 4308 * 4309 * "To make a store to instruction memory visible to all 4310 * RISC-V harts, the writing hart has to execute a data FENCE 4311 * before requesting that all remote RISC-V harts execute a 4312 * FENCE.I." 4313 * 4314 * However, this is slightly misleading; we still need to 4315 * perform a FENCE.I for the local hart, as FENCE does nothing 4316 * for its icache. FENCE.I alone is also sufficient for the 4317 * local hart. 4318 */ 4319 sched_pin(); 4320 mask = all_harts; 4321 CPU_CLR(PCPU_GET(hart), &mask); 4322 fence_i(); 4323 if (!CPU_EMPTY(&mask) && smp_started) { 4324 fence(); 4325 sbi_remote_fence_i(mask.__bits); 4326 } 4327 sched_unpin(); 4328 } 4329 4330 /* 4331 * Increase the starting virtual address of the given mapping if a 4332 * different alignment might result in more superpage mappings. 4333 */ 4334 void 4335 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4336 vm_offset_t *addr, vm_size_t size) 4337 { 4338 vm_offset_t superpage_offset; 4339 4340 if (size < L2_SIZE) 4341 return; 4342 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4343 offset += ptoa(object->pg_color); 4344 superpage_offset = offset & L2_OFFSET; 4345 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4346 (*addr & L2_OFFSET) == superpage_offset) 4347 return; 4348 if ((*addr & L2_OFFSET) < superpage_offset) 4349 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4350 else 4351 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4352 } 4353 4354 /** 4355 * Get the kernel virtual address of a set of physical pages. If there are 4356 * physical addresses not covered by the DMAP perform a transient mapping 4357 * that will be removed when calling pmap_unmap_io_transient. 4358 * 4359 * \param page The pages the caller wishes to obtain the virtual 4360 * address on the kernel memory map. 4361 * \param vaddr On return contains the kernel virtual memory address 4362 * of the pages passed in the page parameter. 4363 * \param count Number of pages passed in. 4364 * \param can_fault TRUE if the thread using the mapped pages can take 4365 * page faults, FALSE otherwise. 4366 * 4367 * \returns TRUE if the caller must call pmap_unmap_io_transient when 4368 * finished or FALSE otherwise. 4369 * 4370 */ 4371 boolean_t 4372 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4373 boolean_t can_fault) 4374 { 4375 vm_paddr_t paddr; 4376 boolean_t needs_mapping; 4377 int error, i; 4378 4379 /* 4380 * Allocate any KVA space that we need, this is done in a separate 4381 * loop to prevent calling vmem_alloc while pinned. 4382 */ 4383 needs_mapping = FALSE; 4384 for (i = 0; i < count; i++) { 4385 paddr = VM_PAGE_TO_PHYS(page[i]); 4386 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4387 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4388 M_BESTFIT | M_WAITOK, &vaddr[i]); 4389 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4390 needs_mapping = TRUE; 4391 } else { 4392 vaddr[i] = PHYS_TO_DMAP(paddr); 4393 } 4394 } 4395 4396 /* Exit early if everything is covered by the DMAP */ 4397 if (!needs_mapping) 4398 return (FALSE); 4399 4400 if (!can_fault) 4401 sched_pin(); 4402 for (i = 0; i < count; i++) { 4403 paddr = VM_PAGE_TO_PHYS(page[i]); 4404 if (paddr >= DMAP_MAX_PHYSADDR) { 4405 panic( 4406 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4407 } 4408 } 4409 4410 return (needs_mapping); 4411 } 4412 4413 void 4414 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4415 boolean_t can_fault) 4416 { 4417 vm_paddr_t paddr; 4418 int i; 4419 4420 if (!can_fault) 4421 sched_unpin(); 4422 for (i = 0; i < count; i++) { 4423 paddr = VM_PAGE_TO_PHYS(page[i]); 4424 if (paddr >= DMAP_MAX_PHYSADDR) { 4425 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4426 } 4427 } 4428 } 4429 4430 boolean_t 4431 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4432 { 4433 4434 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4435 } 4436 4437 bool 4438 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4439 pt_entry_t **l3) 4440 { 4441 pd_entry_t *l1p, *l2p; 4442 4443 /* Get l1 directory entry. */ 4444 l1p = pmap_l1(pmap, va); 4445 *l1 = l1p; 4446 4447 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4448 return (false); 4449 4450 if ((pmap_load(l1p) & PTE_RX) != 0) { 4451 *l2 = NULL; 4452 *l3 = NULL; 4453 return (true); 4454 } 4455 4456 /* Get l2 directory entry. */ 4457 l2p = pmap_l1_to_l2(l1p, va); 4458 *l2 = l2p; 4459 4460 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4461 return (false); 4462 4463 if ((pmap_load(l2p) & PTE_RX) != 0) { 4464 *l3 = NULL; 4465 return (true); 4466 } 4467 4468 /* Get l3 page table entry. */ 4469 *l3 = pmap_l2_to_l3(l2p, va); 4470 4471 return (true); 4472 } 4473 4474 /* 4475 * Track a range of the kernel's virtual address space that is contiguous 4476 * in various mapping attributes. 4477 */ 4478 struct pmap_kernel_map_range { 4479 vm_offset_t sva; 4480 pt_entry_t attrs; 4481 int l3pages; 4482 int l2pages; 4483 int l1pages; 4484 }; 4485 4486 static void 4487 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4488 vm_offset_t eva) 4489 { 4490 4491 if (eva <= range->sva) 4492 return; 4493 4494 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4495 range->sva, eva, 4496 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4497 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4498 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4499 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4500 range->l1pages, range->l2pages, range->l3pages); 4501 4502 /* Reset to sentinel value. */ 4503 range->sva = 0xfffffffffffffffful; 4504 } 4505 4506 /* 4507 * Determine whether the attributes specified by a page table entry match those 4508 * being tracked by the current range. 4509 */ 4510 static bool 4511 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4512 { 4513 4514 return (range->attrs == attrs); 4515 } 4516 4517 static void 4518 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4519 pt_entry_t attrs) 4520 { 4521 4522 memset(range, 0, sizeof(*range)); 4523 range->sva = va; 4524 range->attrs = attrs; 4525 } 4526 4527 /* 4528 * Given a leaf PTE, derive the mapping's attributes. If they do not match 4529 * those of the current run, dump the address range and its attributes, and 4530 * begin a new run. 4531 */ 4532 static void 4533 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 4534 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 4535 { 4536 pt_entry_t attrs; 4537 4538 /* The PTE global bit is inherited by lower levels. */ 4539 attrs = l1e & PTE_G; 4540 if ((l1e & PTE_RWX) != 0) 4541 attrs |= l1e & (PTE_RWX | PTE_U); 4542 else if (l2e != 0) 4543 attrs |= l2e & PTE_G; 4544 if ((l2e & PTE_RWX) != 0) 4545 attrs |= l2e & (PTE_RWX | PTE_U); 4546 else if (l3e != 0) 4547 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 4548 4549 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 4550 sysctl_kmaps_dump(sb, range, va); 4551 sysctl_kmaps_reinit(range, va, attrs); 4552 } 4553 } 4554 4555 static int 4556 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 4557 { 4558 struct pmap_kernel_map_range range; 4559 struct sbuf sbuf, *sb; 4560 pd_entry_t l1e, *l2, l2e; 4561 pt_entry_t *l3, l3e; 4562 vm_offset_t sva; 4563 vm_paddr_t pa; 4564 int error, i, j, k; 4565 4566 error = sysctl_wire_old_buffer(req, 0); 4567 if (error != 0) 4568 return (error); 4569 sb = &sbuf; 4570 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 4571 4572 /* Sentinel value. */ 4573 range.sva = 0xfffffffffffffffful; 4574 4575 /* 4576 * Iterate over the kernel page tables without holding the kernel pmap 4577 * lock. Kernel page table pages are never freed, so at worst we will 4578 * observe inconsistencies in the output. 4579 */ 4580 sva = VM_MIN_KERNEL_ADDRESS; 4581 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 4582 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 4583 sbuf_printf(sb, "\nDirect map:\n"); 4584 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 4585 sbuf_printf(sb, "\nKernel map:\n"); 4586 4587 l1e = kernel_pmap->pm_l1[i]; 4588 if ((l1e & PTE_V) == 0) { 4589 sysctl_kmaps_dump(sb, &range, sva); 4590 sva += L1_SIZE; 4591 continue; 4592 } 4593 if ((l1e & PTE_RWX) != 0) { 4594 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 4595 range.l1pages++; 4596 sva += L1_SIZE; 4597 continue; 4598 } 4599 pa = PTE_TO_PHYS(l1e); 4600 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4601 4602 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 4603 l2e = l2[j]; 4604 if ((l2e & PTE_V) == 0) { 4605 sysctl_kmaps_dump(sb, &range, sva); 4606 sva += L2_SIZE; 4607 continue; 4608 } 4609 if ((l2e & PTE_RWX) != 0) { 4610 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 4611 range.l2pages++; 4612 sva += L2_SIZE; 4613 continue; 4614 } 4615 pa = PTE_TO_PHYS(l2e); 4616 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4617 4618 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 4619 sva += L3_SIZE) { 4620 l3e = l3[k]; 4621 if ((l3e & PTE_V) == 0) { 4622 sysctl_kmaps_dump(sb, &range, sva); 4623 continue; 4624 } 4625 sysctl_kmaps_check(sb, &range, sva, 4626 l1e, l2e, l3e); 4627 range.l3pages++; 4628 } 4629 } 4630 } 4631 4632 error = sbuf_finish(sb); 4633 sbuf_delete(sb); 4634 return (error); 4635 } 4636 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 4637 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 4638 NULL, 0, sysctl_kmaps, "A", 4639 "Dump kernel address layout"); 4640