/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 *	All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 *	All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 *	All rights reserved.
 * Copyright (c) 2014 The FreeBSD Foundation
 *	All rights reserved.
 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Portions of this software were developed by Andrew Turner under
 * sponsorship from The FreeBSD Foundation.
 *
 * Portions of this software were developed by SRI International and the
 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
 *
 * Portions of this software were developed by the University of Cambridge
 * Computer Laboratory as part of the CTSRD Project, with support from the
 * UK Higher Education Innovation Fund (HEIF).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

#define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
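
/*
 * Illustrative sketch of the locking pattern the macros above support:
 * callers that visit many pages keep the current PV list lock in a
 * local variable, initially NULL, and let CHANGE_PV_LIST_LOCK_TO_PHYS()
 * swap locks only when a page hashes to a different lock:
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	(manipulate m's PV list)
 *	...
 *	if (lock != NULL)
 *		rw_wunlock(lock);
 *
 * This is the pattern used by pmap_remove() and pmap_pv_demote_l2()
 * below; it amortizes lock acquisitions across pages that share a lock.
 */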

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

extern cpuset_t all_harts;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, vm_prot_t prot, vm_page_t mpte,
		    struct rwlock **lockp);
static int	pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
		    pd_entry_t ptepde, struct spglist *free,
		    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
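
/*
 * The macros above funnel all PTE access through 64-bit atomics so each
 * update is a single indivisible store.  As an illustration, a teardown
 * path reads and clears an entry in one step and then consults the old
 * bits (this mirrors pmap_remove_l3() later in this file):
 *
 *	old_l3 = pmap_load_clear(l3);		(atomic_swap_64(l3, 0))
 *	if ((old_l3 & PTE_D) != 0)
 *		vm_page_dirty(m);		(hardware-set dirty bit)
 */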

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte)	((pte >> PTE_PPN0_S) * PAGE_SIZE)

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}
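
/*
 * For illustration: under Sv39, assuming 4 KiB base pages and
 * Ln_ENTRIES == 512 (so L1_SHIFT == 30, L2_SHIFT == 21, L3_SHIFT == 12),
 * the walkers above decompose a virtual address as
 *
 *	pmap_l1_index(va) == (va >> 30) & 0x1ff		(1 GiB per entry)
 *	pmap_l2_index(va) == (va >> 21) & 0x1ff		(2 MiB per entry)
 *	pmap_l3_index(va) == (va >> 12) & 0x1ff		(4 KiB per entry)
 *
 * pmap_l2() and pmap_l3() return NULL when an upper-level entry is
 * invalid (PTE_V clear) or is itself a leaf superpage (PTE_RX set), so
 * a non-NULL result always points at a next-level page table entry.
 */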

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/* Distribute new kernel L1 entry to all the user pmaps */
	if (pmap != kernel_pmap)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_l1[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
    u_int *l2_slot)
{
	pt_entry_t *l2;
	pd_entry_t *l1;

	l1 = (pd_entry_t *)l1pt;
	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;

	/* Check locore has used a table L1 map */
	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
	    ("Invalid bootstrap L1 table"));

	/* Find the address of the L2 table */
	l2 = (pt_entry_t *)init_pt_va;
	*l2_slot = pmap_l2_index(va);

	return (l2);
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	u_int l1_slot, l2_slot;
	pt_entry_t *l2;
	vm_paddr_t ret;

	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);

	/* Check locore has used L2 superpages */
	KASSERT((l2[l2_slot] & PTE_RX) != 0,
	    ("Invalid bootstrap L2 table"));

	/* The L2 entry is a superpage mapping. */
	ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT;
	ret += (va & L2_OFFSET);

	return (ret);
}

static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
	vm_offset_t va;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;
	pt_entry_t entry;
	pn_t pn;

	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
	va = DMAP_MIN_ADDRESS;
	l1 = (pd_entry_t *)kern_l1;
	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);

	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		/* superpages */
		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l1[l1_slot], entry);
	}

	/* Set the upper limit of the DMAP region */
	dmap_phys_max = pa;
	dmap_max_addr = va;

	sfence_vma();
}
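
/*
 * The loop above implies the direct map translation
 *
 *	va == DMAP_MIN_ADDRESS + (pa - dmap_phys_base)
 *
 * for dmap_phys_base <= pa < dmap_phys_max.  For example, assuming
 * dmap_phys_base == 0x80000000, physical page 0x80001000 is reachable
 * at DMAP_MIN_ADDRESS + 0x1000; PHYS_TO_DMAP() and DMAP_TO_PHYS() are
 * expected to apply exactly this offset.
 */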

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	pt_entry_t entry;
	pd_entry_t *l2;
	vm_paddr_t pa;
	u_int l2_slot;
	pn_t pn;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pn = (pa / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l2[l2_slot], entry);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return (l3pt);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
{
	u_int l1_slot, l2_slot;
	vm_offset_t freemempos;
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t max_pa, min_pa, pa;
	pt_entry_t *l2p;
	int i;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
	PMAP_LOCK_INIT(kernel_pmap);

	rw_init(&pvh_global_lock, "pmap pv global");

	CPU_FILL(&kernel_pmap->pm_active);

	/* Assume the address we were loaded to is a valid physical address. */
	min_pa = max_pa = kernstart;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address.  physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
		if (physmap[i + 1] > max_pa)
			max_pa = physmap[i + 1];
	}
	printf("physmap_idx %lx\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);

	/*
	 * Read the page table to find out what is already mapped.
	 * This assumes we have mapped a block of memory from KERNBASE
	 * using a single L1 entry.
	 */
	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);

	/* Sanity check the index, KERNBASE should be the first VA */
	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));

	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);

	/* Create the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);

	/*
	 * Invalidate the mapping we created for the DTB.  At this point a copy
	 * has been created, and we no longer need it.  We want to avoid the
	 * possibility of an aliased mapping in the future.
	 */
	l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS);
	KASSERT((pmap_load(l2p) & PTE_V) != 0, ("dtb not mapped"));
	pmap_clear(l2p);

	sfence_vma();

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	virtual_avail = roundup2(freemempos, L2_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(l1pt, freemempos);

	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the pv chunk and pmap list mutexes.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	if (superpages_enabled)
		pagesizes[1] = L2_SIZE;
}
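
/*
 * Sizing example for the pv head table allocated in pmap_init() above:
 * one struct md_page is reserved per potential 2 MiB superpage frame up
 * to the end of the last physical segment.  Assuming that segment ends
 * at the 1 GiB boundary, pv_npg == howmany(1 GiB, L2_SIZE) == 512, so
 * the table occupies 512 * sizeof(struct md_page) bytes, rounded up to
 * a whole page by round_page().
 */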

#ifdef SMP
/*
 * For SMP, these functions have to use IPIs for coherence.
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence.  BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p, l3;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Start with the l2 table.  We are unable to allocate
	 * pages in the l1 table.
	 */
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL) {
		l2 = pmap_load(l2p);
		if ((l2 & PTE_RX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				l3 = pmap_load(l3p);
				pa = PTE_TO_PHYS(l3);
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* The L2 entry is a superpage mapping. */
			pa = (l2 >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}
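
/*
 * Physical address reassembly example for the superpage case above:
 * a 2 MiB leaf ignores the low PPN field, so only PPN[2:1] contribute,
 *
 *	pa = (l2 >> PTE_PPN1_S) << L2_SHIFT;	(2 MiB-aligned frame)
 *	pa |= (va & L2_OFFSET);			(offset within the frame)
 *
 * whereas a 4 KiB leaf uses PTE_TO_PHYS(l3) | (va & L3_OFFSET), as in
 * pmap_extract() above and pmap_kextract() below.
 */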

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			m = PHYS_TO_VM_PAGE(phys);
			if (!vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		if ((pmap_load(l2) & PTE_RX) != 0) {
			/* superpages */
			pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(l2, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *l3;

	l3 = pmap_l3(kernel_pmap, va);
	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));

	pmap_clear(l3);
	sfence_vma();
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
		pmap_clear(l3);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}

	pmap_invalidate_range(kernel_pmap, sva, va);
}
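
/*
 * Usage sketch (hypothetical caller): mapping one page of device
 * registers at a page-aligned physical address pa might look like
 *
 *	va = kva_alloc(PAGE_SIZE);		(caller provides the KVA)
 *	pmap_kenter_device(va, PAGE_SIZE, pa);
 *	(access the registers through va)
 *	pmap_kremove_device(va, PAGE_SIZE);
 *
 * Both routines require page-aligned arguments, as the KASSERTs above
 * enforce.
 */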

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{

	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *l3, pa;
	vm_offset_t va;
	vm_page_t m;
	pt_entry_t entry;
	pn_t pn;
	int i;

	va = sva;
	for (i = 0; i < count; i++) {
		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m);
		pn = (pa / PAGE_SIZE);
		l3 = pmap_l3(kernel_pmap, va);

		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));

	for (va = sva; count-- > 0; va += PAGE_SIZE) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
		pmap_clear(l3);
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled);
}
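
/*
 * Pairing example: a typical consumer wires a set of pages, maps them
 * with pmap_qenter(), performs temporary accesses through the mapping,
 * and then removes it with a matching count:
 *
 *	pmap_qenter(sva, ma, npages);
 *	(use the mapping at sva)
 *	pmap_qremove(sva, npages);
 *
 * No PV entries are created, so the pages must remain wired for the
 * lifetime of the mapping, per the comments above.
 */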

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 *
 * If "promoted" is false, then the page table page "ml3" must be zero filled.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0;
	return (vm_radix_insert(&pmap->pm_root, ml3));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->ref_count;
	if (m->ref_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else {
		return (FALSE);
	}
}

static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	vm_paddr_t phys;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (m->pindex >= NUL1E) {
		pd_entry_t *l1;
		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
	} else {
		pd_entry_t *l2;
		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL1E) {
		pd_entry_t *l1;
		vm_page_t pdpg;

		l1 = pmap_l1(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l1));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	}
	pmap_invalidate_page(pmap, va);

	vm_wire_sub(1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the reference count.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_l1 = kernel_pmap->pm_l1;
	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
	CPU_ZERO(&pmap->pm_active);
	pmap_activate_boot(pmap);
}

int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t l1phys;
	vm_page_t l1pt;

	/*
	 * allocate the l1 page
	 */
	while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		vm_wait(NULL);

	l1phys = VM_PAGE_TO_PHYS(l1pt);
	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys);
	pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT);

	if ((l1pt->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_l1);

	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	CPU_ZERO(&pmap->pm_active);

	/* Install kernel pagetables */
	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);

	/* Add to the list of all user pmaps */
	mtx_lock(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	vm_radix_init(&pmap->pm_root);

	return (1);
}
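
/*
 * SATP encoding example: pm_satp above packs the translation mode with
 * the physical page number of the root (L1) page table.  Assuming the
 * L1 page sits at physical address 0x80200000,
 *
 *	pm_satp = SATP_MODE_SV39 | (0x80200000 >> PAGE_SHIFT)
 *
 * i.e. Sv39 mode with root PPN 0x80200; this is the value loaded into
 * the satp CSR when the pmap is activated.
 */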

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, /*pdppg, */pdpg;
	pt_entry_t entry;
	vm_paddr_t phys;
	pn_t pn;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			vm_wait(NULL);
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}

	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= NUL1E) {
		pd_entry_t *l1;
		vm_pindex_t l1index;

		l1index = ptepindex - NUL1E;
		l1 = &pmap->pm_l1[l1index];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l1, entry);
		pmap_distribute_l1(pmap, l1index, entry);
	} else {
		vm_pindex_t l1index;
		pd_entry_t *l1, *l2;

		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
		l1 = &pmap->pm_l1[l1index];
		if (pmap_load(l1) == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			phys = PTE_TO_PHYS(pmap_load(l1));
			pdpg = PHYS_TO_VM_PAGE(phys);
			pdpg->ref_count++;
		}

		phys = PTE_TO_PHYS(pmap_load(l1));
		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

static vm_page_t
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	pd_entry_t *l1;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
		/* Add a reference to the L2 page. */
		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
		l2pg->ref_count++;
	} else {
		/*
		 * Allocate a L2 page.  L2 page table pages use the pindex
		 * range starting at NUL1E; see _pmap_alloc_l3().
		 */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL1E + l2pindex, lockp);
		if (l2pg == NULL && lockp != NULL)
			goto retry;
	}
	return (l2pg);
}

static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *l2;
	vm_paddr_t phys;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	l2 = pmap_l2(pmap, va);

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (l2 != NULL && pmap_load(l2) != 0) {
		phys = PTE_TO_PHYS(pmap_load(l2));
		m = PHYS_TO_VM_PAGE(phys);
		m->ref_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}
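
/*
 * Pindex namespace example for the allocators above: the page table
 * page backing L3 entries for "va" has pindex pmap_l2_pindex(va), i.e.
 * va >> L2_SHIFT, which stays below NUL1E for user addresses, while the
 * L2 page table page for the same address lives at NUL1E plus the L1
 * slot number.  Keeping the two ranges disjoint lets _pmap_alloc_l3()
 * and _pmap_unwire_ptp() distinguish the levels with a single
 * comparison against NUL1E.
 */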

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	mtx_lock(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1));
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l1, *l2;
	pt_entry_t entry;
	pn_t pn;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(nkpg);
			paddr = VM_PAGE_TO_PHYS(nkpg);

			pn = (paddr / PAGE_SIZE);
			entry = (PTE_V);
			entry |= (pn << PTE_PPN0_S);
			pmap_store(l1, entry);
			pmap_distribute_l1(kernel_pmap,
			    pmap_l1_index(kernel_vm_end), entry);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & PTE_V) != 0 &&
		    (pmap_load(l2) & PTE_RWX) == 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		if ((nkpg->flags & PG_ZERO) == 0)
			pmap_zero_page(nkpg);
		paddr = VM_PAGE_TO_PHYS(nkpg);

		pn = (paddr / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);

		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}
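
/*
 * Stepping example: pmap_growkernel() above always advances in whole L2
 * units.  Assuming L2_SIZE == 2 MiB, a request to grow by even a single
 * page still populates a full L3 table and moves kernel_vm_end to the
 * next 2 MiB boundary:
 *
 *	kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 *
 * clamped to vm_map_max(kernel_map).
 */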

/***************************************************
 * page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define	PV_PMAP(pv)	(pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{

	panic("RISCVTODO: reclaim_pv_chunk");
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}
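
/*
 * Bitmap layout example: a pv_chunk is a single page holding
 * _NPCPV == 168 pv entries, tracked by the three 64-bit words of
 * pc_map as 64 + 64 + 40 free bits (hence PC_FREE2 sets only its low
 * 40 bits).  free_pv_entry() above recovers an entry's position as
 *
 *	idx = pv - &pc->pc_pventry[0];
 *	field = idx / 64;
 *	bit = idx % 64;
 *
 * and marks it free by setting that bit in pc->pc_map[field].
 */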

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
			    pc->pc_map[2] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	pc->pc_map[2] = PC_FREE2;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	TAILQ_INIT(&new_tail);
retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		/* XXX PV STATS */
#if 0
		dump_add_page(m->phys_addr);
#endif
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREE0;
		pc->pc_map[1] = PC_FREE1;
		pc->pc_map[2] = PC_FREE2;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	if (!TAILQ_EMPTY(&new_tail)) {
		mtx_lock(&pv_chunks_mutex);
		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.  This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);

	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void __unused
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_page_t m;
	vm_offset_t va_last;
	int bit, field;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va &= ~L2_OFFSET;
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining 511 pv entries. */
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field] != 0) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
				    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	/* XXX PV stats */
}
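
/*
 * Accounting example: demoting one 2 MiB mapping leaves
 * L2_SIZE / PAGE_SIZE == 512 4 KiB mappings.  pmap_pv_demote_l2() above
 * moves the existing pv entry to the first 4 KiB page and carves the
 * remaining 511 entries out of the pmap's pv chunks, which is why the
 * "missing spare" KASSERT insists that enough spares were set aside
 * beforehand (see reserve_pv_entries() above).
 */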
PAGE_SIZE; 1927 pmap_pvh_free(&m->md, pmap, va); 1928 } while (va < va_last); 1929 } 1930 #endif /* VM_NRESERVLEVEL > 0 */ 1931 1932 /* 1933 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1934 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1935 * false if the PV entry cannot be allocated without resorting to reclamation. 1936 */ 1937 static bool 1938 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1939 struct rwlock **lockp) 1940 { 1941 struct md_page *pvh; 1942 pv_entry_t pv; 1943 vm_paddr_t pa; 1944 1945 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1946 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1947 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1948 NULL : lockp)) == NULL) 1949 return (false); 1950 pv->pv_va = va; 1951 pa = PTE_TO_PHYS(l2e); 1952 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1953 pvh = pa_to_pvh(pa); 1954 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1955 pvh->pv_gen++; 1956 return (true); 1957 } 1958 1959 static void 1960 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1961 { 1962 pt_entry_t newl2, oldl2; 1963 vm_page_t ml3; 1964 vm_paddr_t ml3pa; 1965 1966 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1967 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1968 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1969 1970 ml3 = pmap_remove_pt_page(pmap, va); 1971 if (ml3 == NULL) 1972 panic("pmap_remove_kernel_l2: Missing pt page"); 1973 1974 ml3pa = VM_PAGE_TO_PHYS(ml3); 1975 newl2 = ml3pa | PTE_V; 1976 1977 /* 1978 * If this page table page was unmapped by a promotion, then it 1979 * contains valid mappings. Zero it to invalidate those mappings. 1980 */ 1981 if (ml3->valid != 0) 1982 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 1983 1984 /* 1985 * Demote the mapping. 1986 */ 1987 oldl2 = pmap_load_store(l2, newl2); 1988 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 1989 __func__, l2, oldl2)); 1990 } 1991 1992 /* 1993 * pmap_remove_l2: Remove a 2MB superpage mapping from the page table. 1994 */ 1995 static int 1996 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 1997 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 1998 { 1999 struct md_page *pvh; 2000 pt_entry_t oldl2; 2001 vm_offset_t eva, va; 2002 vm_page_t m, ml3; 2003 2004 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2005 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2006 oldl2 = pmap_load_clear(l2); 2007 KASSERT((oldl2 & PTE_RWX) != 0, 2008 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2009 2010 /* 2011 * The sfence.vma documentation states that it is sufficient to specify 2012 * a single address within a superpage mapping. However, since we do 2013 * not perform any invalidation upon promotion, TLBs may still be 2014 * caching 4KB mappings within the superpage, so we must invalidate the 2015 * entire range. 
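 *
 * As an illustrative sketch only (pmap_invalidate_range() below is
 * the real implementation, which may differ), invalidating the whole
 * range amounts to one fence per 4KB page:
 *
 *	for (va = sva; va < sva + L2_SIZE; va += PAGE_SIZE)
 *		sfence_vma_page(va);
 *
 * together with notifying any other harts sharing this pmap;
 * sfence_vma_page() is assumed here to be a per-page fence helper.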
2016 */ 2017 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2018 if ((oldl2 & PTE_SW_WIRED) != 0) 2019 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2020 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2021 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2022 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2023 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2024 pmap_pvh_free(pvh, pmap, sva); 2025 eva = sva + L2_SIZE; 2026 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2027 va < eva; va += PAGE_SIZE, m++) { 2028 if ((oldl2 & PTE_D) != 0) 2029 vm_page_dirty(m); 2030 if ((oldl2 & PTE_A) != 0) 2031 vm_page_aflag_set(m, PGA_REFERENCED); 2032 if (TAILQ_EMPTY(&m->md.pv_list) && 2033 TAILQ_EMPTY(&pvh->pv_list)) 2034 vm_page_aflag_clear(m, PGA_WRITEABLE); 2035 } 2036 } 2037 if (pmap == kernel_pmap) { 2038 pmap_remove_kernel_l2(pmap, l2, sva); 2039 } else { 2040 ml3 = pmap_remove_pt_page(pmap, sva); 2041 if (ml3 != NULL) { 2042 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2043 ("pmap_remove_l2: l3 page not promoted")); 2044 pmap_resident_count_dec(pmap, 1); 2045 KASSERT(ml3->ref_count == Ln_ENTRIES, 2046 ("pmap_remove_l2: l3 page ref count error")); 2047 ml3->ref_count = 1; 2048 vm_page_unwire_noq(ml3); 2049 pmap_add_delayed_free_list(ml3, free, FALSE); 2050 } 2051 } 2052 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2053 } 2054 2055 /* 2056 * pmap_remove_l3: Remove a single 4KB page mapping from the page table. 2057 */ 2058 static int 2059 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2060 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2061 { 2062 struct md_page *pvh; 2063 pt_entry_t old_l3; 2064 vm_paddr_t phys; 2065 vm_page_t m; 2066 2067 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2068 old_l3 = pmap_load_clear(l3); 2069 pmap_invalidate_page(pmap, va); 2070 if (old_l3 & PTE_SW_WIRED) 2071 pmap->pm_stats.wired_count -= 1; 2072 pmap_resident_count_dec(pmap, 1); 2073 if (old_l3 & PTE_SW_MANAGED) { 2074 phys = PTE_TO_PHYS(old_l3); 2075 m = PHYS_TO_VM_PAGE(phys); 2076 if ((old_l3 & PTE_D) != 0) 2077 vm_page_dirty(m); 2078 if (old_l3 & PTE_A) 2079 vm_page_aflag_set(m, PGA_REFERENCED); 2080 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2081 pmap_pvh_free(&m->md, pmap, va); 2082 if (TAILQ_EMPTY(&m->md.pv_list) && 2083 (m->flags & PG_FICTITIOUS) == 0) { 2084 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2085 if (TAILQ_EMPTY(&pvh->pv_list)) 2086 vm_page_aflag_clear(m, PGA_WRITEABLE); 2087 } 2088 } 2089 2090 return (pmap_unuse_pt(pmap, va, l2e, free)); 2091 } 2092 2093 /* 2094 * Remove the given range of addresses from the specified map. 2095 * 2096 * It is assumed that the start and end are properly 2097 * rounded to the page size. 2098 */ 2099 void 2100 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2101 { 2102 struct spglist free; 2103 struct rwlock *lock; 2104 vm_offset_t va, va_next; 2105 pd_entry_t *l1, *l2, l2e; 2106 pt_entry_t *l3; 2107 2108 /* 2109 * Perform an unsynchronized read. This is, however, safe: the count 2110 * is used only as an optimization to skip an empty pmap. */ 2111 if (pmap->pm_stats.resident_count == 0) 2112 return; 2113 2114 SLIST_INIT(&free); 2115 2116 rw_rlock(&pvh_global_lock); 2117 PMAP_LOCK(pmap); 2118 2119 lock = NULL; 2120 for (; sva < eva; sva = va_next) { 2121 if (pmap->pm_stats.resident_count == 0) 2122 break; 2123 2124 l1 = pmap_l1(pmap, sva); 2125 if (pmap_load(l1) == 0) { 2126 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2127 if (va_next < sva) 2128 va_next = eva; 2129 continue; 2130 } 2131 2132 /* 2133 * Calculate the address of the next L2 (2MB) boundary. 
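 * For example (illustrative numbers), with L2_SIZE == 2MB and
 * sva == 0x2031000: va_next == (0x2031000 + 0x200000) & ~0x1fffff ==
 * 0x2200000, the base of the next 2MB region. The overflow check
 * below substitutes eva when the addition wraps past the top of the
 * address space.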
2134 */ 2135 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2136 if (va_next < sva) 2137 va_next = eva; 2138 2139 l2 = pmap_l1_to_l2(l1, sva); 2140 if (l2 == NULL) 2141 continue; 2142 if ((l2e = pmap_load(l2)) == 0) 2143 continue; 2144 if ((l2e & PTE_RWX) != 0) { 2145 if (sva + L2_SIZE == va_next && eva >= va_next) { 2146 (void)pmap_remove_l2(pmap, l2, sva, 2147 pmap_load(l1), &free, &lock); 2148 continue; 2149 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2150 &lock)) { 2151 /* 2152 * The large page mapping was destroyed. 2153 */ 2154 continue; 2155 } 2156 l2e = pmap_load(l2); 2157 } 2158 2159 /* 2160 * Limit our scan to either the end of the va represented 2161 * by the current page table page, or to the end of the 2162 * range being removed. 2163 */ 2164 if (va_next > eva) 2165 va_next = eva; 2166 2167 va = va_next; 2168 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2169 sva += L3_SIZE) { 2170 if (pmap_load(l3) == 0) { 2171 if (va != va_next) { 2172 pmap_invalidate_range(pmap, va, sva); 2173 va = va_next; 2174 } 2175 continue; 2176 } 2177 if (va == va_next) 2178 va = sva; 2179 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2180 sva += L3_SIZE; 2181 break; 2182 } 2183 } 2184 if (va != va_next) 2185 pmap_invalidate_range(pmap, va, sva); 2186 } 2187 if (lock != NULL) 2188 rw_wunlock(lock); 2189 rw_runlock(&pvh_global_lock); 2190 PMAP_UNLOCK(pmap); 2191 vm_page_free_pages_toq(&free, false); 2192 } 2193 2194 /* 2195 * Routine: pmap_remove_all 2196 * Function: 2197 * Removes this physical page from 2198 * all physical maps in which it resides. 2199 * Reflects back modify bits to the pager. 2200 * 2201 * Notes: 2202 * Original versions of this routine were very 2203 * inefficient because they iteratively called 2204 * pmap_remove (slow...) 2205 */ 2206 2207 void 2208 pmap_remove_all(vm_page_t m) 2209 { 2210 struct spglist free; 2211 struct md_page *pvh; 2212 pmap_t pmap; 2213 pt_entry_t *l3, l3e; 2214 pd_entry_t *l2, l2e; 2215 pv_entry_t pv; 2216 vm_offset_t va; 2217 2218 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2219 ("pmap_remove_all: page %p is not managed", m)); 2220 SLIST_INIT(&free); 2221 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2222 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2223 2224 rw_wlock(&pvh_global_lock); 2225 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2226 pmap = PV_PMAP(pv); 2227 PMAP_LOCK(pmap); 2228 va = pv->pv_va; 2229 l2 = pmap_l2(pmap, va); 2230 (void)pmap_demote_l2(pmap, l2, va); 2231 PMAP_UNLOCK(pmap); 2232 } 2233 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2234 pmap = PV_PMAP(pv); 2235 PMAP_LOCK(pmap); 2236 pmap_resident_count_dec(pmap, 1); 2237 l2 = pmap_l2(pmap, pv->pv_va); 2238 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2239 l2e = pmap_load(l2); 2240 2241 KASSERT((l2e & PTE_RX) == 0, 2242 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2243 2244 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2245 l3e = pmap_load_clear(l3); 2246 pmap_invalidate_page(pmap, pv->pv_va); 2247 if (l3e & PTE_SW_WIRED) 2248 pmap->pm_stats.wired_count--; 2249 if ((l3e & PTE_A) != 0) 2250 vm_page_aflag_set(m, PGA_REFERENCED); 2251 2252 /* 2253 * Update the vm_page_t clean and reference bits. 
2254 */ 2255 if ((l3e & PTE_D) != 0) 2256 vm_page_dirty(m); 2257 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2258 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2259 m->md.pv_gen++; 2260 free_pv_entry(pmap, pv); 2261 PMAP_UNLOCK(pmap); 2262 } 2263 vm_page_aflag_clear(m, PGA_WRITEABLE); 2264 rw_wunlock(&pvh_global_lock); 2265 vm_page_free_pages_toq(&free, false); 2266 } 2267 2268 /* 2269 * Set the physical protection on the 2270 * specified range of this map as requested. 2271 */ 2272 void 2273 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2274 { 2275 pd_entry_t *l1, *l2, l2e; 2276 pt_entry_t *l3, l3e, mask; 2277 vm_page_t m, mt; 2278 vm_paddr_t pa; 2279 vm_offset_t va_next; 2280 bool anychanged, pv_lists_locked; 2281 2282 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2283 pmap_remove(pmap, sva, eva); 2284 return; 2285 } 2286 2287 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2288 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2289 return; 2290 2291 anychanged = false; 2292 pv_lists_locked = false; 2293 mask = 0; 2294 if ((prot & VM_PROT_WRITE) == 0) 2295 mask |= PTE_W | PTE_D; 2296 if ((prot & VM_PROT_EXECUTE) == 0) 2297 mask |= PTE_X; 2298 resume: 2299 PMAP_LOCK(pmap); 2300 for (; sva < eva; sva = va_next) { 2301 l1 = pmap_l1(pmap, sva); 2302 if (pmap_load(l1) == 0) { 2303 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2304 if (va_next < sva) 2305 va_next = eva; 2306 continue; 2307 } 2308 2309 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2310 if (va_next < sva) 2311 va_next = eva; 2312 2313 l2 = pmap_l1_to_l2(l1, sva); 2314 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2315 continue; 2316 if ((l2e & PTE_RWX) != 0) { 2317 if (sva + L2_SIZE == va_next && eva >= va_next) { 2318 retryl2: 2319 if ((prot & VM_PROT_WRITE) == 0 && 2320 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2321 (PTE_SW_MANAGED | PTE_D)) { 2322 pa = PTE_TO_PHYS(l2e); 2323 m = PHYS_TO_VM_PAGE(pa); 2324 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2325 vm_page_dirty(mt); 2326 } 2327 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2328 goto retryl2; 2329 anychanged = true; 2330 continue; 2331 } else { 2332 if (!pv_lists_locked) { 2333 pv_lists_locked = true; 2334 if (!rw_try_rlock(&pvh_global_lock)) { 2335 if (anychanged) 2336 pmap_invalidate_all( 2337 pmap); 2338 PMAP_UNLOCK(pmap); 2339 rw_rlock(&pvh_global_lock); 2340 goto resume; 2341 } 2342 } 2343 if (!pmap_demote_l2(pmap, l2, sva)) { 2344 /* 2345 * The large page mapping was destroyed. 
2346 */ 2347 continue; 2348 } 2349 } 2350 } 2351 2352 if (va_next > eva) 2353 va_next = eva; 2354 2355 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2356 sva += L3_SIZE) { 2357 l3e = pmap_load(l3); 2358 retryl3: 2359 if ((l3e & PTE_V) == 0) 2360 continue; 2361 if ((prot & VM_PROT_WRITE) == 0 && 2362 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2363 (PTE_SW_MANAGED | PTE_D)) { 2364 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2365 vm_page_dirty(m); 2366 } 2367 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2368 goto retryl3; 2369 anychanged = true; 2370 } 2371 } 2372 if (anychanged) 2373 pmap_invalidate_all(pmap); 2374 if (pv_lists_locked) 2375 rw_runlock(&pvh_global_lock); 2376 PMAP_UNLOCK(pmap); 2377 } 2378 2379 int 2380 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2381 { 2382 pd_entry_t *l2, l2e; 2383 pt_entry_t bits, *pte, oldpte; 2384 int rv; 2385 2386 rv = 0; 2387 PMAP_LOCK(pmap); 2388 l2 = pmap_l2(pmap, va); 2389 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2390 goto done; 2391 if ((l2e & PTE_RWX) == 0) { 2392 pte = pmap_l2_to_l3(l2, va); 2393 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2394 goto done; 2395 } else { 2396 pte = l2; 2397 oldpte = l2e; 2398 } 2399 2400 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2401 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2402 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2403 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2404 goto done; 2405 2406 bits = PTE_A; 2407 if (ftype == VM_PROT_WRITE) 2408 bits |= PTE_D; 2409 2410 /* 2411 * Spurious faults can occur if the implementation caches invalid 2412 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2413 * race with each other. 2414 */ 2415 if ((oldpte & bits) != bits) 2416 pmap_store_bits(pte, bits); 2417 sfence_vma(); 2418 rv = 1; 2419 done: 2420 PMAP_UNLOCK(pmap); 2421 return (rv); 2422 } 2423 2424 static bool 2425 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2426 { 2427 struct rwlock *lock; 2428 bool rv; 2429 2430 lock = NULL; 2431 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2432 if (lock != NULL) 2433 rw_wunlock(lock); 2434 return (rv); 2435 } 2436 2437 /* 2438 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2439 * mapping is invalidated. 2440 */ 2441 static bool 2442 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2443 struct rwlock **lockp) 2444 { 2445 struct spglist free; 2446 vm_page_t mpte; 2447 pd_entry_t newl2, oldl2; 2448 pt_entry_t *firstl3, newl3; 2449 vm_paddr_t mptepa; 2450 int i; 2451 2452 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2453 2454 oldl2 = pmap_load(l2); 2455 KASSERT((oldl2 & PTE_RWX) != 0, 2456 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2457 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2458 NULL) { 2459 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, 2460 pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 2461 VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == 2462 NULL) { 2463 SLIST_INIT(&free); 2464 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2465 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2466 vm_page_free_pages_toq(&free, true); 2467 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2468 "failure for va %#lx in pmap %p", va, pmap); 2469 return (false); 2470 } 2471 if (va < VM_MAXUSER_ADDRESS) { 2472 mpte->ref_count = Ln_ENTRIES; 2473 pmap_resident_count_inc(pmap, 1); 2474 } 2475 } 2476 mptepa = VM_PAGE_TO_PHYS(mpte); 2477 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2478 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2479 KASSERT((oldl2 & PTE_A) != 0, 2480 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2481 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2482 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2483 newl3 = oldl2; 2484 2485 /* 2486 * If the page table page is not leftover from an earlier promotion, 2487 * initialize it. 2488 */ 2489 if (mpte->valid == 0) { 2490 for (i = 0; i < Ln_ENTRIES; i++) 2491 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2492 } 2493 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2494 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2495 "addresses")); 2496 2497 /* 2498 * If the mapping has changed attributes, update the page table 2499 * entries. 2500 */ 2501 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2502 for (i = 0; i < Ln_ENTRIES; i++) 2503 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2504 2505 /* 2506 * The spare PV entries must be reserved prior to demoting the 2507 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2508 * state of the L2 entry and the PV lists will be inconsistent, which 2509 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2510 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2511 * expected PV entry for the 2MB page mapping that is being demoted. 2512 */ 2513 if ((oldl2 & PTE_SW_MANAGED) != 0) 2514 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2515 2516 /* 2517 * Demote the mapping. 2518 */ 2519 pmap_store(l2, newl2); 2520 2521 /* 2522 * Demote the PV entry. 
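 * That is, replace the single PV entry for the 2MB mapping with
 * Ln_ENTRIES (512) entries for 4KB mappings: pmap_pv_demote_l2()
 * recycles the existing entry for the first 4KB page and draws the
 * remaining 511 from the spares reserved above.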
2523 */ 2524 if ((oldl2 & PTE_SW_MANAGED) != 0) 2525 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2526 2527 atomic_add_long(&pmap_l2_demotions, 1); 2528 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2529 va, pmap); 2530 return (true); 2531 } 2532 2533 #if VM_NRESERVLEVEL > 0 2534 static void 2535 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2536 struct rwlock **lockp) 2537 { 2538 pt_entry_t *firstl3, *l3; 2539 vm_paddr_t pa; 2540 vm_page_t ml3; 2541 2542 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2543 2544 va &= ~L2_OFFSET; 2545 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2546 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2547 2548 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2549 pa = PTE_TO_PHYS(pmap_load(firstl3)); 2550 if ((pa & L2_OFFSET) != 0) { 2551 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2552 va, pmap); 2553 atomic_add_long(&pmap_l2_p_failures, 1); 2554 return; 2555 } 2556 2557 pa += PAGE_SIZE; 2558 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2559 if (PTE_TO_PHYS(pmap_load(l3)) != pa) { 2560 CTR2(KTR_PMAP, 2561 "pmap_promote_l2: failure for va %#lx pmap %p", 2562 va, pmap); 2563 atomic_add_long(&pmap_l2_p_failures, 1); 2564 return; 2565 } 2566 if ((pmap_load(l3) & PTE_PROMOTE) != 2567 (pmap_load(firstl3) & PTE_PROMOTE)) { 2568 CTR2(KTR_PMAP, 2569 "pmap_promote_l2: failure for va %#lx pmap %p", 2570 va, pmap); 2571 atomic_add_long(&pmap_l2_p_failures, 1); 2572 return; 2573 } 2574 pa += PAGE_SIZE; 2575 } 2576 2577 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2578 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2579 ("pmap_promote_l2: page table page's pindex is wrong")); 2580 if (pmap_insert_pt_page(pmap, ml3, true)) { 2581 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2582 va, pmap); 2583 atomic_add_long(&pmap_l2_p_failures, 1); 2584 return; 2585 } 2586 2587 if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) 2588 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), 2589 lockp); 2590 2591 pmap_store(l2, pmap_load(firstl3)); 2592 2593 atomic_add_long(&pmap_l2_promotions, 1); 2594 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2595 pmap); 2596 } 2597 #endif 2598 2599 /* 2600 * Insert the given physical page (p) at 2601 * the specified virtual address (v) in the 2602 * target physical map with the protection requested. 2603 * 2604 * If specified, the page will be wired down, meaning 2605 * that the related pte can not be reclaimed. 2606 * 2607 * NB: This is the only routine which MAY NOT lazy-evaluate 2608 * or lose information. That is, this routine must actually 2609 * insert this page into the given map NOW. 
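 *
 * An illustrative call, a sketch rather than a quote of any actual
 * caller, creating a wired, writeable 4KB mapping of a busied page
 * "m":
 *
 *	error = pmap_enter(vmspace_pmap(curproc->p_vmspace), va, m,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_ENTER_WIRED, 0);
 *
 * Note that "flags" carries both the access type (VM_PROT_* bits)
 * and the PMAP_ENTER_* flags, and that psind 0 requests a 4KB rather
 * than a 2MB mapping.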
2610 */ 2611 int 2612 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2613 u_int flags, int8_t psind) 2614 { 2615 struct rwlock *lock; 2616 pd_entry_t *l1, *l2, l2e; 2617 pt_entry_t new_l3, orig_l3; 2618 pt_entry_t *l3; 2619 pv_entry_t pv; 2620 vm_paddr_t opa, pa, l2_pa, l3_pa; 2621 vm_page_t mpte, om, l2_m, l3_m; 2622 pt_entry_t entry; 2623 pn_t l2_pn, l3_pn, pn; 2624 int rv; 2625 bool nosleep; 2626 2627 va = trunc_page(va); 2628 if ((m->oflags & VPO_UNMANAGED) == 0) 2629 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2630 pa = VM_PAGE_TO_PHYS(m); 2631 pn = (pa / PAGE_SIZE); 2632 2633 new_l3 = PTE_V | PTE_R | PTE_A; 2634 if (prot & VM_PROT_EXECUTE) 2635 new_l3 |= PTE_X; 2636 if (flags & VM_PROT_WRITE) 2637 new_l3 |= PTE_D; 2638 if (prot & VM_PROT_WRITE) 2639 new_l3 |= PTE_W; 2640 if (va < VM_MAX_USER_ADDRESS) 2641 new_l3 |= PTE_U; 2642 2643 new_l3 |= (pn << PTE_PPN0_S); 2644 if ((flags & PMAP_ENTER_WIRED) != 0) 2645 new_l3 |= PTE_SW_WIRED; 2646 2647 /* 2648 * Set modified bit gratuitously for writeable mappings if 2649 * the page is unmanaged. We do not want to take a fault 2650 * to do the dirty bit accounting for these mappings. 2651 */ 2652 if ((m->oflags & VPO_UNMANAGED) != 0) { 2653 if (prot & VM_PROT_WRITE) 2654 new_l3 |= PTE_D; 2655 } else 2656 new_l3 |= PTE_SW_MANAGED; 2657 2658 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2659 2660 lock = NULL; 2661 mpte = NULL; 2662 rw_rlock(&pvh_global_lock); 2663 PMAP_LOCK(pmap); 2664 if (psind == 1) { 2665 /* Assert the required virtual and physical alignment. */ 2666 KASSERT((va & L2_OFFSET) == 0, 2667 ("pmap_enter: va %#lx unaligned", va)); 2668 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2669 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2670 goto out; 2671 } 2672 2673 l2 = pmap_l2(pmap, va); 2674 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2675 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2676 va, &lock))) { 2677 l3 = pmap_l2_to_l3(l2, va); 2678 if (va < VM_MAXUSER_ADDRESS) { 2679 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2680 mpte->ref_count++; 2681 } 2682 } else if (va < VM_MAXUSER_ADDRESS) { 2683 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2684 mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); 2685 if (mpte == NULL && nosleep) { 2686 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2687 if (lock != NULL) 2688 rw_wunlock(lock); 2689 rw_runlock(&pvh_global_lock); 2690 PMAP_UNLOCK(pmap); 2691 return (KERN_RESOURCE_SHORTAGE); 2692 } 2693 l3 = pmap_l3(pmap, va); 2694 } else { 2695 l3 = pmap_l3(pmap, va); 2696 /* TODO: This is not optimal, but should mostly work */ 2697 if (l3 == NULL) { 2698 if (l2 == NULL) { 2699 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2700 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2701 VM_ALLOC_ZERO); 2702 if (l2_m == NULL) 2703 panic("pmap_enter: l2 pte_m == NULL"); 2704 if ((l2_m->flags & PG_ZERO) == 0) 2705 pmap_zero_page(l2_m); 2706 2707 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2708 l2_pn = (l2_pa / PAGE_SIZE); 2709 2710 l1 = pmap_l1(pmap, va); 2711 entry = (PTE_V); 2712 entry |= (l2_pn << PTE_PPN0_S); 2713 pmap_store(l1, entry); 2714 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2715 l2 = pmap_l1_to_l2(l1, va); 2716 } 2717 2718 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2719 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2720 if (l3_m == NULL) 2721 panic("pmap_enter: l3 pte_m == NULL"); 2722 if ((l3_m->flags & PG_ZERO) == 0) 2723 pmap_zero_page(l3_m); 2724 2725 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2726 l3_pn = (l3_pa / PAGE_SIZE); 2727 entry = (PTE_V); 2728 entry |= (l3_pn << PTE_PPN0_S); 2729 pmap_store(l2, entry); 2730 l3 = pmap_l2_to_l3(l2, va); 2731 } 2732 pmap_invalidate_page(pmap, va); 2733 } 2734 2735 orig_l3 = pmap_load(l3); 2736 opa = PTE_TO_PHYS(orig_l3); 2737 pv = NULL; 2738 2739 /* 2740 * Is the specified virtual address already mapped? 2741 */ 2742 if ((orig_l3 & PTE_V) != 0) { 2743 /* 2744 * Wiring change, just update stats. We don't worry about 2745 * wiring PT pages as they remain resident as long as there 2746 * are valid mappings in them. Hence, if a user page is wired, 2747 * the PT page will be also. 2748 */ 2749 if ((flags & PMAP_ENTER_WIRED) != 0 && 2750 (orig_l3 & PTE_SW_WIRED) == 0) 2751 pmap->pm_stats.wired_count++; 2752 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2753 (orig_l3 & PTE_SW_WIRED) != 0) 2754 pmap->pm_stats.wired_count--; 2755 2756 /* 2757 * Remove the extra PT page reference. 2758 */ 2759 if (mpte != NULL) { 2760 mpte->ref_count--; 2761 KASSERT(mpte->ref_count > 0, 2762 ("pmap_enter: missing reference to page table page," 2763 " va: 0x%lx", va)); 2764 } 2765 2766 /* 2767 * Has the physical page changed? 2768 */ 2769 if (opa == pa) { 2770 /* 2771 * No, might be a protection or wiring change. 2772 */ 2773 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2774 (new_l3 & PTE_W) != 0) 2775 vm_page_aflag_set(m, PGA_WRITEABLE); 2776 goto validate; 2777 } 2778 2779 /* 2780 * The physical page has changed. Temporarily invalidate 2781 * the mapping. This ensures that all threads sharing the 2782 * pmap keep a consistent view of the mapping, which is 2783 * necessary for the correct handling of COW faults. It 2784 * also permits reuse of the old mapping's PV entry, 2785 * avoiding an allocation. 2786 * 2787 * For consistency, handle unmanaged mappings the same way. 2788 */ 2789 orig_l3 = pmap_load_clear(l3); 2790 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2791 ("pmap_enter: unexpected pa update for %#lx", va)); 2792 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2793 om = PHYS_TO_VM_PAGE(opa); 2794 2795 /* 2796 * The pmap lock is sufficient to synchronize with 2797 * concurrent calls to pmap_page_test_mappings() and 2798 * pmap_ts_referenced(). 
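 * Both of those functions acquire the pmap lock, via PMAP_TRYLOCK()
 * or PMAP_LOCK(), before inspecting any PTE, so neither can observe
 * this mapping in a half-updated state.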
2799 */ 2800 if ((orig_l3 & PTE_D) != 0) 2801 vm_page_dirty(om); 2802 if ((orig_l3 & PTE_A) != 0) 2803 vm_page_aflag_set(om, PGA_REFERENCED); 2804 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2805 pv = pmap_pvh_remove(&om->md, pmap, va); 2806 KASSERT(pv != NULL, 2807 ("pmap_enter: no PV entry for %#lx", va)); 2808 if ((new_l3 & PTE_SW_MANAGED) == 0) 2809 free_pv_entry(pmap, pv); 2810 if ((om->a.flags & PGA_WRITEABLE) != 0 && 2811 TAILQ_EMPTY(&om->md.pv_list) && 2812 ((om->flags & PG_FICTITIOUS) != 0 || 2813 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 2814 vm_page_aflag_clear(om, PGA_WRITEABLE); 2815 } 2816 pmap_invalidate_page(pmap, va); 2817 orig_l3 = 0; 2818 } else { 2819 /* 2820 * Increment the counters. 2821 */ 2822 if ((new_l3 & PTE_SW_WIRED) != 0) 2823 pmap->pm_stats.wired_count++; 2824 pmap_resident_count_inc(pmap, 1); 2825 } 2826 /* 2827 * Enter on the PV list if part of our managed memory. 2828 */ 2829 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2830 if (pv == NULL) { 2831 pv = get_pv_entry(pmap, &lock); 2832 pv->pv_va = va; 2833 } 2834 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2835 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2836 m->md.pv_gen++; 2837 if ((new_l3 & PTE_W) != 0) 2838 vm_page_aflag_set(m, PGA_WRITEABLE); 2839 } 2840 2841 validate: 2842 /* 2843 * Sync the i-cache on all harts before updating the PTE 2844 * if the new PTE is executable. 2845 */ 2846 if (prot & VM_PROT_EXECUTE) 2847 pmap_sync_icache(pmap, va, PAGE_SIZE); 2848 2849 /* 2850 * Update the L3 entry. 2851 */ 2852 if (orig_l3 != 0) { 2853 orig_l3 = pmap_load_store(l3, new_l3); 2854 pmap_invalidate_page(pmap, va); 2855 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2856 ("pmap_enter: invalid update")); 2857 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2858 (PTE_D | PTE_SW_MANAGED)) 2859 vm_page_dirty(m); 2860 } else { 2861 pmap_store(l3, new_l3); 2862 } 2863 2864 #if VM_NRESERVLEVEL > 0 2865 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 2866 pmap_ps_enabled(pmap) && 2867 (m->flags & PG_FICTITIOUS) == 0 && 2868 vm_reserv_level_iffullpop(m) == 0) 2869 pmap_promote_l2(pmap, l2, va, &lock); 2870 #endif 2871 2872 rv = KERN_SUCCESS; 2873 out: 2874 if (lock != NULL) 2875 rw_wunlock(lock); 2876 rw_runlock(&pvh_global_lock); 2877 PMAP_UNLOCK(pmap); 2878 return (rv); 2879 } 2880 2881 /* 2882 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2883 * if successful. Returns false if (1) a page table page cannot be allocated 2884 * without sleeping, (2) a mapping already exists at the specified virtual 2885 * address, or (3) a PV entry cannot be allocated without reclaiming another 2886 * PV entry. 2887 */ 2888 static bool 2889 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2890 struct rwlock **lockp) 2891 { 2892 pd_entry_t new_l2; 2893 pn_t pn; 2894 2895 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2896 2897 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2898 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2899 if ((m->oflags & VPO_UNMANAGED) == 0) 2900 new_l2 |= PTE_SW_MANAGED; 2901 if ((prot & VM_PROT_EXECUTE) != 0) 2902 new_l2 |= PTE_X; 2903 if (va < VM_MAXUSER_ADDRESS) 2904 new_l2 |= PTE_U; 2905 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2906 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2907 KERN_SUCCESS); 2908 } 2909 2910 /* 2911 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2912 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2913 * otherwise. 
Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2914 * a mapping already exists at the specified virtual address. Returns 2915 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2916 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2917 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2918 * 2919 * The parameter "m" is only used when creating a managed, writeable mapping. 2920 */ 2921 static int 2922 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2923 vm_page_t m, struct rwlock **lockp) 2924 { 2925 struct spglist free; 2926 pd_entry_t *l2, *l3, oldl2; 2927 vm_offset_t sva; 2928 vm_page_t l2pg, mt; 2929 2930 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2931 2932 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2933 NULL : lockp)) == NULL) { 2934 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2935 va, pmap); 2936 return (KERN_RESOURCE_SHORTAGE); 2937 } 2938 2939 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2940 l2 = &l2[pmap_l2_index(va)]; 2941 if ((oldl2 = pmap_load(l2)) != 0) { 2942 KASSERT(l2pg->ref_count > 1, 2943 ("pmap_enter_l2: l2pg's ref count is too low")); 2944 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2945 l2pg->ref_count--; 2946 CTR2(KTR_PMAP, 2947 "pmap_enter_l2: failure for va %#lx in pmap %p", 2948 va, pmap); 2949 return (KERN_FAILURE); 2950 } 2951 SLIST_INIT(&free); 2952 if ((oldl2 & PTE_RWX) != 0) 2953 (void)pmap_remove_l2(pmap, l2, va, 2954 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2955 else 2956 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 2957 l3 = pmap_l2_to_l3(l2, sva); 2958 if ((pmap_load(l3) & PTE_V) != 0 && 2959 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 2960 lockp) != 0) 2961 break; 2962 } 2963 vm_page_free_pages_toq(&free, true); 2964 if (va >= VM_MAXUSER_ADDRESS) { 2965 /* 2966 * Both pmap_remove_l2() and pmap_remove_l3() will 2967 * leave the kernel page table page zero filled. 2968 */ 2969 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2970 if (pmap_insert_pt_page(pmap, mt, false)) 2971 panic("pmap_enter_l2: trie insert failed"); 2972 } else 2973 KASSERT(pmap_load(l2) == 0, 2974 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 2975 } 2976 2977 if ((new_l2 & PTE_SW_MANAGED) != 0) { 2978 /* 2979 * Abort this mapping if its PV entry could not be created. 2980 */ 2981 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 2982 SLIST_INIT(&free); 2983 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 2984 /* 2985 * Although "va" is not mapped, paging-structure 2986 * caches could nonetheless have entries that 2987 * refer to the freed page table pages. 2988 * Invalidate those entries. 2989 */ 2990 pmap_invalidate_page(pmap, va); 2991 vm_page_free_pages_toq(&free, true); 2992 } 2993 CTR2(KTR_PMAP, 2994 "pmap_enter_l2: failure for va %#lx in pmap %p", 2995 va, pmap); 2996 return (KERN_RESOURCE_SHORTAGE); 2997 } 2998 if ((new_l2 & PTE_W) != 0) 2999 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3000 vm_page_aflag_set(mt, PGA_WRITEABLE); 3001 } 3002 3003 /* 3004 * Increment counters. 3005 */ 3006 if ((new_l2 & PTE_SW_WIRED) != 0) 3007 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3008 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3009 3010 /* 3011 * Map the superpage. 
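 *
 * For example (illustrative values): a read/write 2MB mapping of
 * physical address 0x80200000 might arrive here as new_l2 ==
 * (0x80200 << PTE_PPN0_S) | PTE_A | PTE_D | PTE_W | PTE_R | PTE_V
 * (plus any PTE_SW_* bits). The low nine PPN bits are zero, as Sv39
 * requires of a 2MB leaf PTE.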
3012 */ 3013 pmap_store(l2, new_l2); 3014 3015 atomic_add_long(&pmap_l2_mappings, 1); 3016 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3017 va, pmap); 3018 3019 return (KERN_SUCCESS); 3020 } 3021 3022 /* 3023 * Maps a sequence of resident pages belonging to the same object. 3024 * The sequence begins with the given page m_start. This page is 3025 * mapped at the given virtual address start. Each subsequent page is 3026 * mapped at a virtual address that is offset from start by the same 3027 * amount as the page is offset from m_start within the object. The 3028 * last page in the sequence is the page with the largest offset from 3029 * m_start that can be mapped at a virtual address less than the given 3030 * virtual address end. Not every virtual page between start and end 3031 * is mapped; only those for which a resident page exists with the 3032 * corresponding offset from m_start are mapped. 3033 */ 3034 void 3035 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3036 vm_page_t m_start, vm_prot_t prot) 3037 { 3038 struct rwlock *lock; 3039 vm_offset_t va; 3040 vm_page_t m, mpte; 3041 vm_pindex_t diff, psize; 3042 3043 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3044 3045 psize = atop(end - start); 3046 mpte = NULL; 3047 m = m_start; 3048 lock = NULL; 3049 rw_rlock(&pvh_global_lock); 3050 PMAP_LOCK(pmap); 3051 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3052 va = start + ptoa(diff); 3053 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3054 m->psind == 1 && pmap_ps_enabled(pmap) && 3055 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3056 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3057 else 3058 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3059 &lock); 3060 m = TAILQ_NEXT(m, listq); 3061 } 3062 if (lock != NULL) 3063 rw_wunlock(lock); 3064 rw_runlock(&pvh_global_lock); 3065 PMAP_UNLOCK(pmap); 3066 } 3067 3068 /* 3069 * this code makes some *MAJOR* assumptions: 3070 * 1. Current pmap & pmap exists. 3071 * 2. Not wired. 3072 * 3. Read access. 3073 * 4. No page table pages. 3074 * but is *MUCH* faster than pmap_enter... 3075 */ 3076 3077 void 3078 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3079 { 3080 struct rwlock *lock; 3081 3082 lock = NULL; 3083 rw_rlock(&pvh_global_lock); 3084 PMAP_LOCK(pmap); 3085 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3086 if (lock != NULL) 3087 rw_wunlock(lock); 3088 rw_runlock(&pvh_global_lock); 3089 PMAP_UNLOCK(pmap); 3090 } 3091 3092 static vm_page_t 3093 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3094 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3095 { 3096 struct spglist free; 3097 vm_paddr_t phys; 3098 pd_entry_t *l2; 3099 pt_entry_t *l3, newl3; 3100 3101 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3102 (m->oflags & VPO_UNMANAGED) != 0, 3103 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3104 rw_assert(&pvh_global_lock, RA_LOCKED); 3105 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3106 3107 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3108 /* 3109 * In the case that a page table page is not 3110 * resident, we are creating it here. 
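 * The caller may hand us the page table page that it used on its
 * previous iteration (see pmap_enter_object()); when that "mpte"
 * already covers "va", only its reference count needs to be bumped.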
3111 */ 3112 if (va < VM_MAXUSER_ADDRESS) { 3113 vm_pindex_t l2pindex; 3114 3115 /* 3116 * Calculate pagetable page index 3117 */ 3118 l2pindex = pmap_l2_pindex(va); 3119 if (mpte && (mpte->pindex == l2pindex)) { 3120 mpte->ref_count++; 3121 } else { 3122 /* 3123 * Get the l2 entry 3124 */ 3125 l2 = pmap_l2(pmap, va); 3126 3127 /* 3128 * If the page table page is mapped, we just increment 3129 * the hold count, and activate it. Otherwise, we 3130 * attempt to allocate a page table page. If this 3131 * attempt fails, we don't retry. Instead, we give up. 3132 */ 3133 if (l2 != NULL && pmap_load(l2) != 0) { 3134 phys = PTE_TO_PHYS(pmap_load(l2)); 3135 mpte = PHYS_TO_VM_PAGE(phys); 3136 mpte->ref_count++; 3137 } else { 3138 /* 3139 * Pass NULL instead of the PV list lock 3140 * pointer, because we don't intend to sleep. 3141 */ 3142 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3143 if (mpte == NULL) 3144 return (mpte); 3145 } 3146 } 3147 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3148 l3 = &l3[pmap_l3_index(va)]; 3149 } else { 3150 mpte = NULL; 3151 l3 = pmap_l3(kernel_pmap, va); 3152 } 3153 if (l3 == NULL) 3154 panic("pmap_enter_quick_locked: No l3"); 3155 if (pmap_load(l3) != 0) { 3156 if (mpte != NULL) { 3157 mpte->ref_count--; 3158 mpte = NULL; 3159 } 3160 return (mpte); 3161 } 3162 3163 /* 3164 * Enter on the PV list if part of our managed memory. 3165 */ 3166 if ((m->oflags & VPO_UNMANAGED) == 0 && 3167 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3168 if (mpte != NULL) { 3169 SLIST_INIT(&free); 3170 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3171 pmap_invalidate_page(pmap, va); 3172 vm_page_free_pages_toq(&free, false); 3173 } 3174 mpte = NULL; 3175 } 3176 return (mpte); 3177 } 3178 3179 /* 3180 * Increment counters 3181 */ 3182 pmap_resident_count_inc(pmap, 1); 3183 3184 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3185 PTE_V | PTE_R; 3186 if ((prot & VM_PROT_EXECUTE) != 0) 3187 newl3 |= PTE_X; 3188 if ((m->oflags & VPO_UNMANAGED) == 0) 3189 newl3 |= PTE_SW_MANAGED; 3190 if (va < VM_MAX_USER_ADDRESS) 3191 newl3 |= PTE_U; 3192 3193 /* 3194 * Sync the i-cache on all harts before updating the PTE 3195 * if the new PTE is executable. 3196 */ 3197 if (prot & VM_PROT_EXECUTE) 3198 pmap_sync_icache(pmap, va, PAGE_SIZE); 3199 3200 pmap_store(l3, newl3); 3201 3202 pmap_invalidate_page(pmap, va); 3203 return (mpte); 3204 } 3205 3206 /* 3207 * This code maps large physical mmap regions into the 3208 * processor address space. Note that some shortcuts 3209 * are taken, but the code works. 3210 */ 3211 void 3212 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3213 vm_pindex_t pindex, vm_size_t size) 3214 { 3215 3216 VM_OBJECT_ASSERT_WLOCKED(object); 3217 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3218 ("pmap_object_init_pt: non-device object")); 3219 } 3220 3221 /* 3222 * Clear the wired attribute from the mappings for the specified range of 3223 * addresses in the given pmap. Every valid mapping within that range 3224 * must have the wired attribute set. In contrast, invalid mappings 3225 * cannot have the wired attribute set, so they are ignored. 3226 * 3227 * The wired attribute of the page table entry is not a hardware feature, 3228 * so there is no need to invalidate any TLB entries. 
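 * (PTE_SW_WIRED lives in the PTE bits that the RISC-V privileged
 * specification reserves for software use, so clearing it changes
 * nothing visible to the hardware page-table walker or the TLB.)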
3229 */ 3230 void 3231 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3232 { 3233 vm_offset_t va_next; 3234 pd_entry_t *l1, *l2, l2e; 3235 pt_entry_t *l3, l3e; 3236 bool pv_lists_locked; 3237 3238 pv_lists_locked = false; 3239 retry: 3240 PMAP_LOCK(pmap); 3241 for (; sva < eva; sva = va_next) { 3242 l1 = pmap_l1(pmap, sva); 3243 if (pmap_load(l1) == 0) { 3244 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3245 if (va_next < sva) 3246 va_next = eva; 3247 continue; 3248 } 3249 3250 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3251 if (va_next < sva) 3252 va_next = eva; 3253 3254 l2 = pmap_l1_to_l2(l1, sva); 3255 if ((l2e = pmap_load(l2)) == 0) 3256 continue; 3257 if ((l2e & PTE_RWX) != 0) { 3258 if (sva + L2_SIZE == va_next && eva >= va_next) { 3259 if ((l2e & PTE_SW_WIRED) == 0) 3260 panic("pmap_unwire: l2 %#jx is missing " 3261 "PTE_SW_WIRED", (uintmax_t)l2e); 3262 pmap_clear_bits(l2, PTE_SW_WIRED); 3263 continue; 3264 } else { 3265 if (!pv_lists_locked) { 3266 pv_lists_locked = true; 3267 if (!rw_try_rlock(&pvh_global_lock)) { 3268 PMAP_UNLOCK(pmap); 3269 rw_rlock(&pvh_global_lock); 3270 /* Repeat sva. */ 3271 goto retry; 3272 } 3273 } 3274 if (!pmap_demote_l2(pmap, l2, sva)) 3275 panic("pmap_unwire: demotion failed"); 3276 } 3277 } 3278 3279 if (va_next > eva) 3280 va_next = eva; 3281 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3282 sva += L3_SIZE) { 3283 if ((l3e = pmap_load(l3)) == 0) 3284 continue; 3285 if ((l3e & PTE_SW_WIRED) == 0) 3286 panic("pmap_unwire: l3 %#jx is missing " 3287 "PTE_SW_WIRED", (uintmax_t)l3e); 3288 3289 /* 3290 * PTE_SW_WIRED must be cleared atomically. Although the pmap 3291 * lock synchronizes access to PTE_SW_WIRED, another processor 3292 * could be setting PTE_D and/or PTE_A concurrently. 3293 */ 3294 pmap_clear_bits(l3, PTE_SW_WIRED); 3295 pmap->pm_stats.wired_count--; 3296 } 3297 } 3298 if (pv_lists_locked) 3299 rw_runlock(&pvh_global_lock); 3300 PMAP_UNLOCK(pmap); 3301 } 3302 3303 /* 3304 * Copy the range specified by src_addr/len 3305 * from the source map to the range dst_addr/len 3306 * in the destination map. 3307 * 3308 * This routine is only advisory and need not do anything. 3309 */ 3310 3311 void 3312 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3313 vm_offset_t src_addr) 3314 { 3315 3316 } 3317 3318 /* 3319 * pmap_zero_page zeros the specified hardware page by mapping 3320 * the page into KVM and using bzero to clear its contents. 3321 */ 3322 void 3323 pmap_zero_page(vm_page_t m) 3324 { 3325 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3326 3327 pagezero((void *)va); 3328 } 3329 3330 /* 3331 * pmap_zero_page_area zeros the specified hardware page by mapping 3332 * the page into KVM and using bzero to clear its contents. 3333 * 3334 * off and size may not cover an area beyond a single hardware page. 3335 */ 3336 void 3337 pmap_zero_page_area(vm_page_t m, int off, int size) 3338 { 3339 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3340 3341 if (off == 0 && size == PAGE_SIZE) 3342 pagezero((void *)va); 3343 else 3344 bzero((char *)va + off, size); 3345 } 3346 3347 /* 3348 * pmap_copy_page copies the specified (machine independent) 3349 * page by mapping the page into virtual memory and using 3350 * bcopy to copy the page, one machine dependent page at a 3351 * time. 
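 *
 * On this platform all of physical memory is direct-mapped, so the
 * source and destination are simply addressed through the DMAP and
 * no transient mappings are created or destroyed.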
3352 */ 3353 void 3354 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3355 { 3356 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3357 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3358 3359 pagecopy((void *)src, (void *)dst); 3360 } 3361 3362 int unmapped_buf_allowed = 1; 3363 3364 void 3365 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3366 vm_offset_t b_offset, int xfersize) 3367 { 3368 void *a_cp, *b_cp; 3369 vm_page_t m_a, m_b; 3370 vm_paddr_t p_a, p_b; 3371 vm_offset_t a_pg_offset, b_pg_offset; 3372 int cnt; 3373 3374 while (xfersize > 0) { 3375 a_pg_offset = a_offset & PAGE_MASK; 3376 m_a = ma[a_offset >> PAGE_SHIFT]; 3377 p_a = m_a->phys_addr; 3378 b_pg_offset = b_offset & PAGE_MASK; 3379 m_b = mb[b_offset >> PAGE_SHIFT]; 3380 p_b = m_b->phys_addr; 3381 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3382 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3383 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3384 panic("!DMAP a %lx", p_a); 3385 } else { 3386 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3387 } 3388 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3389 panic("!DMAP b %lx", p_b); 3390 } else { 3391 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3392 } 3393 bcopy(a_cp, b_cp, cnt); 3394 a_offset += cnt; 3395 b_offset += cnt; 3396 xfersize -= cnt; 3397 } 3398 } 3399 3400 vm_offset_t 3401 pmap_quick_enter_page(vm_page_t m) 3402 { 3403 3404 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3405 } 3406 3407 void 3408 pmap_quick_remove_page(vm_offset_t addr) 3409 { 3410 } 3411 3412 /* 3413 * Returns true if the pmap's pv is one of the first 3414 * 16 pvs linked to from this page. This count may 3415 * be changed upwards or downwards in the future; it 3416 * is only necessary that true be returned for a small 3417 * subset of pmaps for proper page aging. 3418 */ 3419 boolean_t 3420 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3421 { 3422 struct md_page *pvh; 3423 struct rwlock *lock; 3424 pv_entry_t pv; 3425 int loops = 0; 3426 boolean_t rv; 3427 3428 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3429 ("pmap_page_exists_quick: page %p is not managed", m)); 3430 rv = FALSE; 3431 rw_rlock(&pvh_global_lock); 3432 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3433 rw_rlock(lock); 3434 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3435 if (PV_PMAP(pv) == pmap) { 3436 rv = TRUE; 3437 break; 3438 } 3439 loops++; 3440 if (loops >= 16) 3441 break; 3442 } 3443 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3444 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3445 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3446 if (PV_PMAP(pv) == pmap) { 3447 rv = TRUE; 3448 break; 3449 } 3450 loops++; 3451 if (loops >= 16) 3452 break; 3453 } 3454 } 3455 rw_runlock(lock); 3456 rw_runlock(&pvh_global_lock); 3457 return (rv); 3458 } 3459 3460 /* 3461 * pmap_page_wired_mappings: 3462 * 3463 * Return the number of managed mappings to the given physical page 3464 * that are wired. 
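 *
 * If a pmap lock cannot be acquired opportunistically, the PV list
 * lock is dropped so that the pmap lock may be taken in the proper
 * order; the saved generation counts then reveal whether the PV
 * lists changed in the interim, in which case the scan restarts.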
3465 */ 3466 int 3467 pmap_page_wired_mappings(vm_page_t m) 3468 { 3469 struct md_page *pvh; 3470 struct rwlock *lock; 3471 pmap_t pmap; 3472 pd_entry_t *l2; 3473 pt_entry_t *l3; 3474 pv_entry_t pv; 3475 int count, md_gen, pvh_gen; 3476 3477 if ((m->oflags & VPO_UNMANAGED) != 0) 3478 return (0); 3479 rw_rlock(&pvh_global_lock); 3480 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3481 rw_rlock(lock); 3482 restart: 3483 count = 0; 3484 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3485 pmap = PV_PMAP(pv); 3486 if (!PMAP_TRYLOCK(pmap)) { 3487 md_gen = m->md.pv_gen; 3488 rw_runlock(lock); 3489 PMAP_LOCK(pmap); 3490 rw_rlock(lock); 3491 if (md_gen != m->md.pv_gen) { 3492 PMAP_UNLOCK(pmap); 3493 goto restart; 3494 } 3495 } 3496 l3 = pmap_l3(pmap, pv->pv_va); 3497 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3498 count++; 3499 PMAP_UNLOCK(pmap); 3500 } 3501 if ((m->flags & PG_FICTITIOUS) == 0) { 3502 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3503 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3504 pmap = PV_PMAP(pv); 3505 if (!PMAP_TRYLOCK(pmap)) { 3506 md_gen = m->md.pv_gen; 3507 pvh_gen = pvh->pv_gen; 3508 rw_runlock(lock); 3509 PMAP_LOCK(pmap); 3510 rw_rlock(lock); 3511 if (md_gen != m->md.pv_gen || 3512 pvh_gen != pvh->pv_gen) { 3513 PMAP_UNLOCK(pmap); 3514 goto restart; 3515 } 3516 } 3517 l2 = pmap_l2(pmap, pv->pv_va); 3518 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3519 count++; 3520 PMAP_UNLOCK(pmap); 3521 } 3522 } 3523 rw_runlock(lock); 3524 rw_runlock(&pvh_global_lock); 3525 return (count); 3526 } 3527 3528 /* 3529 * Returns true if the given page is mapped individually or as part of 3530 * a 2mpage. Otherwise, returns false. 3531 */ 3532 bool 3533 pmap_page_is_mapped(vm_page_t m) 3534 { 3535 struct rwlock *lock; 3536 bool rv; 3537 3538 if ((m->oflags & VPO_UNMANAGED) != 0) 3539 return (false); 3540 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3541 rw_rlock(lock); 3542 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3543 ((m->flags & PG_FICTITIOUS) == 0 && 3544 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3545 rw_runlock(lock); 3546 return (rv); 3547 } 3548 3549 static void 3550 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3551 struct spglist *free, bool superpage) 3552 { 3553 struct md_page *pvh; 3554 vm_page_t mpte, mt; 3555 3556 if (superpage) { 3557 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3558 pvh = pa_to_pvh(m->phys_addr); 3559 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3560 pvh->pv_gen++; 3561 if (TAILQ_EMPTY(&pvh->pv_list)) { 3562 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3563 if (TAILQ_EMPTY(&mt->md.pv_list) && 3564 (mt->a.flags & PGA_WRITEABLE) != 0) 3565 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3566 } 3567 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3568 if (mpte != NULL) { 3569 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3570 ("pmap_remove_pages: pte page not promoted")); 3571 pmap_resident_count_dec(pmap, 1); 3572 KASSERT(mpte->ref_count == Ln_ENTRIES, 3573 ("pmap_remove_pages: pte page ref count error")); 3574 mpte->ref_count = 0; 3575 pmap_add_delayed_free_list(mpte, free, FALSE); 3576 } 3577 } else { 3578 pmap_resident_count_dec(pmap, 1); 3579 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3580 m->md.pv_gen++; 3581 if (TAILQ_EMPTY(&m->md.pv_list) && 3582 (m->a.flags & PGA_WRITEABLE) != 0) { 3583 pvh = pa_to_pvh(m->phys_addr); 3584 if (TAILQ_EMPTY(&pvh->pv_list)) 3585 vm_page_aflag_clear(m, PGA_WRITEABLE); 3586 } 3587 } 3588 } 3589 3590 /* 3591 * Destroy all managed, non-wired mappings in the given user-space 3592 * pmap. This pmap cannot be active on any processor besides the 3593 * caller. 
3594 * 3595 * This function cannot be applied to the kernel pmap. Moreover, it 3596 * is not intended for general use. It is only to be used during 3597 * process termination. Consequently, it can be implemented in ways 3598 * that make it faster than pmap_remove(). First, it can more quickly 3599 * destroy mappings by iterating over the pmap's collection of PV 3600 * entries, rather than searching the page table. Second, it doesn't 3601 * have to test and clear the page table entries atomically, because 3602 * no processor is currently accessing the user address space. In 3603 * particular, a page table entry's dirty bit won't change state once 3604 * this function starts. 3605 */ 3606 void 3607 pmap_remove_pages(pmap_t pmap) 3608 { 3609 struct spglist free; 3610 pd_entry_t ptepde; 3611 pt_entry_t *pte, tpte; 3612 vm_page_t m, mt; 3613 pv_entry_t pv; 3614 struct pv_chunk *pc, *npc; 3615 struct rwlock *lock; 3616 int64_t bit; 3617 uint64_t inuse, bitmask; 3618 int allfree, field, freed, idx; 3619 bool superpage; 3620 3621 lock = NULL; 3622 3623 SLIST_INIT(&free); 3624 rw_rlock(&pvh_global_lock); 3625 PMAP_LOCK(pmap); 3626 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3627 allfree = 1; 3628 freed = 0; 3629 for (field = 0; field < _NPCM; field++) { 3630 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3631 while (inuse != 0) { 3632 bit = ffsl(inuse) - 1; 3633 bitmask = 1UL << bit; 3634 idx = field * 64 + bit; 3635 pv = &pc->pc_pventry[idx]; 3636 inuse &= ~bitmask; 3637 3638 pte = pmap_l1(pmap, pv->pv_va); 3639 ptepde = pmap_load(pte); 3640 pte = pmap_l1_to_l2(pte, pv->pv_va); 3641 tpte = pmap_load(pte); 3642 if ((tpte & PTE_RWX) != 0) { 3643 superpage = true; 3644 } else { 3645 ptepde = tpte; 3646 pte = pmap_l2_to_l3(pte, pv->pv_va); 3647 tpte = pmap_load(pte); 3648 superpage = false; 3649 } 3650 3651 /* 3652 * We cannot remove wired pages from a 3653 * process' mapping at this time. 3654 */ 3655 if (tpte & PTE_SW_WIRED) { 3656 allfree = 0; 3657 continue; 3658 } 3659 3660 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3661 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3662 m < &vm_page_array[vm_page_array_size], 3663 ("pmap_remove_pages: bad pte %#jx", 3664 (uintmax_t)tpte)); 3665 3666 pmap_clear(pte); 3667 3668 /* 3669 * Update the vm_page_t clean/reference bits. 
3670 */ 3671 if ((tpte & (PTE_D | PTE_W)) == 3672 (PTE_D | PTE_W)) { 3673 if (superpage) 3674 for (mt = m; 3675 mt < &m[Ln_ENTRIES]; mt++) 3676 vm_page_dirty(mt); 3677 else 3678 vm_page_dirty(m); 3679 } 3680 3681 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3682 3683 /* Mark free */ 3684 pc->pc_map[field] |= bitmask; 3685 3686 pmap_remove_pages_pv(pmap, m, pv, &free, 3687 superpage); 3688 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3689 freed++; 3690 } 3691 } 3692 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3693 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3694 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3695 if (allfree) { 3696 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3697 free_pv_chunk(pc); 3698 } 3699 } 3700 if (lock != NULL) 3701 rw_wunlock(lock); 3702 pmap_invalidate_all(pmap); 3703 rw_runlock(&pvh_global_lock); 3704 PMAP_UNLOCK(pmap); 3705 vm_page_free_pages_toq(&free, false); 3706 } 3707 3708 static bool 3709 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3710 { 3711 struct md_page *pvh; 3712 struct rwlock *lock; 3713 pd_entry_t *l2; 3714 pt_entry_t *l3, mask; 3715 pv_entry_t pv; 3716 pmap_t pmap; 3717 int md_gen, pvh_gen; 3718 bool rv; 3719 3720 mask = 0; 3721 if (modified) 3722 mask |= PTE_D; 3723 if (accessed) 3724 mask |= PTE_A; 3725 3726 rv = FALSE; 3727 rw_rlock(&pvh_global_lock); 3728 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3729 rw_rlock(lock); 3730 restart: 3731 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3732 pmap = PV_PMAP(pv); 3733 if (!PMAP_TRYLOCK(pmap)) { 3734 md_gen = m->md.pv_gen; 3735 rw_runlock(lock); 3736 PMAP_LOCK(pmap); 3737 rw_rlock(lock); 3738 if (md_gen != m->md.pv_gen) { 3739 PMAP_UNLOCK(pmap); 3740 goto restart; 3741 } 3742 } 3743 l3 = pmap_l3(pmap, pv->pv_va); 3744 rv = (pmap_load(l3) & mask) == mask; 3745 PMAP_UNLOCK(pmap); 3746 if (rv) 3747 goto out; 3748 } 3749 if ((m->flags & PG_FICTITIOUS) == 0) { 3750 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3751 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3752 pmap = PV_PMAP(pv); 3753 if (!PMAP_TRYLOCK(pmap)) { 3754 md_gen = m->md.pv_gen; 3755 pvh_gen = pvh->pv_gen; 3756 rw_runlock(lock); 3757 PMAP_LOCK(pmap); 3758 rw_rlock(lock); 3759 if (md_gen != m->md.pv_gen || 3760 pvh_gen != pvh->pv_gen) { 3761 PMAP_UNLOCK(pmap); 3762 goto restart; 3763 } 3764 } 3765 l2 = pmap_l2(pmap, pv->pv_va); 3766 rv = (pmap_load(l2) & mask) == mask; 3767 PMAP_UNLOCK(pmap); 3768 if (rv) 3769 goto out; 3770 } 3771 } 3772 out: 3773 rw_runlock(lock); 3774 rw_runlock(&pvh_global_lock); 3775 return (rv); 3776 } 3777 3778 /* 3779 * pmap_is_modified: 3780 * 3781 * Return whether or not the specified physical page was modified 3782 * in any physical maps. 3783 */ 3784 boolean_t 3785 pmap_is_modified(vm_page_t m) 3786 { 3787 3788 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3789 ("pmap_is_modified: page %p is not managed", m)); 3790 3791 /* 3792 * If the page is not busied then this check is racy. 3793 */ 3794 if (!pmap_page_is_write_mapped(m)) 3795 return (FALSE); 3796 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3797 } 3798 3799 /* 3800 * pmap_is_prefaultable: 3801 * 3802 * Return whether or not the specified virtual address is eligible 3803 * for prefault. 
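 * That is, return TRUE only when a page table page exists for the
 * address but no mapping is currently present, so that a subsequent
 * pmap_enter_quick() can cheaply install one.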
3804 */ 3805 boolean_t 3806 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3807 { 3808 pt_entry_t *l3; 3809 boolean_t rv; 3810 3811 rv = FALSE; 3812 PMAP_LOCK(pmap); 3813 l3 = pmap_l3(pmap, addr); /* A prefault helps only if the L3 slot exists but is empty. */ 3814 if (l3 != NULL && pmap_load(l3) == 0) { 3815 rv = TRUE; 3816 } 3817 PMAP_UNLOCK(pmap); 3818 return (rv); 3819 } 3820 3821 /* 3822 * pmap_is_referenced: 3823 * 3824 * Return whether or not the specified physical page was referenced 3825 * in any physical maps. 3826 */ 3827 boolean_t 3828 pmap_is_referenced(vm_page_t m) 3829 { 3830 3831 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3832 ("pmap_is_referenced: page %p is not managed", m)); 3833 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3834 } 3835 3836 /* 3837 * Clear the write and modified bits in each of the given page's mappings. 3838 */ 3839 void 3840 pmap_remove_write(vm_page_t m) 3841 { 3842 struct md_page *pvh; 3843 struct rwlock *lock; 3844 pmap_t pmap; 3845 pd_entry_t *l2; 3846 pt_entry_t *l3, oldl3, newl3; 3847 pv_entry_t next_pv, pv; 3848 vm_offset_t va; 3849 int md_gen, pvh_gen; 3850 3851 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3852 ("pmap_remove_write: page %p is not managed", m)); 3853 vm_page_assert_busied(m); 3854 3855 if (!pmap_page_is_write_mapped(m)) 3856 return; 3857 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3858 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 3859 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3860 rw_rlock(&pvh_global_lock); 3861 retry_pv_loop: 3862 rw_wlock(lock); 3863 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3864 pmap = PV_PMAP(pv); 3865 if (!PMAP_TRYLOCK(pmap)) { 3866 pvh_gen = pvh->pv_gen; 3867 rw_wunlock(lock); 3868 PMAP_LOCK(pmap); 3869 rw_wlock(lock); 3870 if (pvh_gen != pvh->pv_gen) { 3871 PMAP_UNLOCK(pmap); 3872 rw_wunlock(lock); 3873 goto retry_pv_loop; 3874 } 3875 } 3876 va = pv->pv_va; 3877 l2 = pmap_l2(pmap, va); 3878 if ((pmap_load(l2) & PTE_W) != 0) 3879 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3880 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3881 ("inconsistent pv lock %p %p for page %p", 3882 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3883 PMAP_UNLOCK(pmap); 3884 } 3885 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3886 pmap = PV_PMAP(pv); 3887 if (!PMAP_TRYLOCK(pmap)) { 3888 pvh_gen = pvh->pv_gen; 3889 md_gen = m->md.pv_gen; 3890 rw_wunlock(lock); 3891 PMAP_LOCK(pmap); 3892 rw_wlock(lock); 3893 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3894 PMAP_UNLOCK(pmap); 3895 rw_wunlock(lock); 3896 goto retry_pv_loop; 3897 } 3898 } 3899 l3 = pmap_l3(pmap, pv->pv_va); 3900 oldl3 = pmap_load(l3); 3901 retry: 3902 if ((oldl3 & PTE_W) != 0) { 3903 newl3 = oldl3 & ~(PTE_D | PTE_W); 3904 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3905 goto retry; 3906 if ((oldl3 & PTE_D) != 0) 3907 vm_page_dirty(m); 3908 pmap_invalidate_page(pmap, pv->pv_va); 3909 } 3910 PMAP_UNLOCK(pmap); 3911 } 3912 rw_wunlock(lock); 3913 vm_page_aflag_clear(m, PGA_WRITEABLE); 3914 rw_runlock(&pvh_global_lock); 3915 } 3916 3917 /* 3918 * pmap_ts_referenced: 3919 * 3920 * Return a count of reference bits for a page, clearing those bits. 3921 * It is not necessary for every reference bit to be cleared, but it 3922 * is necessary that 0 only be returned when there are truly no 3923 * reference bits set. 3924 * 3925 * As an optimization, update the page's dirty field if a modified bit is 3926 * found while counting reference bits. This opportunistic update can be 3927 * performed at low cost and can eliminate the need for some future calls 3928 * to pmap_is_modified(). 
However, since this function stops after 3929 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3930 * dirty pages. Those dirty pages will only be detected by a future call 3931 * to pmap_is_modified(). 3932 */ 3933 int 3934 pmap_ts_referenced(vm_page_t m) 3935 { 3936 struct spglist free; 3937 struct md_page *pvh; 3938 struct rwlock *lock; 3939 pv_entry_t pv, pvf; 3940 pmap_t pmap; 3941 pd_entry_t *l2, l2e; 3942 pt_entry_t *l3, l3e; 3943 vm_paddr_t pa; 3944 vm_offset_t va; 3945 int cleared, md_gen, not_cleared, pvh_gen; 3946 3947 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3948 ("pmap_ts_referenced: page %p is not managed", m)); 3949 SLIST_INIT(&free); 3950 cleared = 0; 3951 pa = VM_PAGE_TO_PHYS(m); 3952 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 3953 3954 lock = PHYS_TO_PV_LIST_LOCK(pa); 3955 rw_rlock(&pvh_global_lock); 3956 rw_wlock(lock); 3957 retry: 3958 not_cleared = 0; 3959 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 3960 goto small_mappings; 3961 pv = pvf; 3962 do { 3963 pmap = PV_PMAP(pv); 3964 if (!PMAP_TRYLOCK(pmap)) { 3965 pvh_gen = pvh->pv_gen; 3966 rw_wunlock(lock); 3967 PMAP_LOCK(pmap); 3968 rw_wlock(lock); 3969 if (pvh_gen != pvh->pv_gen) { 3970 PMAP_UNLOCK(pmap); 3971 goto retry; 3972 } 3973 } 3974 va = pv->pv_va; 3975 l2 = pmap_l2(pmap, va); 3976 l2e = pmap_load(l2); 3977 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 3978 /* 3979 * Although l2e is mapping a 2MB page, because 3980 * this function is called at a 4KB page granularity, 3981 * we only update the 4KB page under test. 3982 */ 3983 vm_page_dirty(m); 3984 } 3985 if ((l2e & PTE_A) != 0) { 3986 /* 3987 * Since this reference bit is shared by 512 4KB 3988 * pages, it should not be cleared every time it is 3989 * tested. Apply a simple "hash" function on the 3990 * physical page number, the virtual superpage number, 3991 * and the pmap address to select one 4KB page out of 3992 * the 512 on which testing the reference bit will 3993 * result in clearing that reference bit. This 3994 * function is designed to avoid the selection of the 3995 * same 4KB page for every 2MB page mapping. 3996 * 3997 * On demotion, a mapping that hasn't been referenced 3998 * is simply destroyed. To avoid the possibility of a 3999 * subsequent page fault on a demoted wired mapping, 4000 * always leave its reference bit set. Moreover, 4001 * since the superpage is wired, the current state of 4002 * its reference bit won't affect page replacement. 4003 */ 4004 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4005 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4006 (l2e & PTE_SW_WIRED) == 0) { 4007 pmap_clear_bits(l2, PTE_A); 4008 pmap_invalidate_page(pmap, va); 4009 cleared++; 4010 } else 4011 not_cleared++; 4012 } 4013 PMAP_UNLOCK(pmap); 4014 /* Rotate the PV list if it has more than one entry. 
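 * Rotating ensures that the rest of this scan and any future call start
 * with a different entry, so every mapping of a widely shared page is
 * eventually examined. */

/*
 * A worked example for the reference-bit hash above: with PAGE_SHIFT == 12,
 * L2_SHIFT == 21, and Ln_ENTRIES == 512, the term (pa >> PAGE_SHIFT) takes
 * 512 consecutive values across the 4KB frames backing one 2MB superpage,
 * so its low nine bits run through every residue exactly once, while the
 * va and pmap terms are fixed for a given mapping. The test is therefore
 * true for exactly one constituent page: each pass clears the shared PTE_A
 * bit on behalf of a single 4KB page, and distinct mappings tend to select
 * distinct pages.
 */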
4015 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4016 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4017 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4018 pvh->pv_gen++;
4019 }
4020 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4021 goto out;
4022 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4023 small_mappings:
4024 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4025 goto out;
4026 pv = pvf;
4027 do {
4028 pmap = PV_PMAP(pv);
4029 if (!PMAP_TRYLOCK(pmap)) {
4030 pvh_gen = pvh->pv_gen;
4031 md_gen = m->md.pv_gen;
4032 rw_wunlock(lock);
4033 PMAP_LOCK(pmap);
4034 rw_wlock(lock);
4035 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4036 PMAP_UNLOCK(pmap);
4037 goto retry;
4038 }
4039 }
4040 l2 = pmap_l2(pmap, pv->pv_va);
4041
4042 KASSERT((pmap_load(l2) & PTE_RX) == 0,
4043 ("pmap_ts_referenced: found an L2 superpage in page %p's pv list", m));
4044
4045 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4046 l3e = pmap_load(l3);
4047 if ((l3e & PTE_D) != 0)
4048 vm_page_dirty(m);
4049 if ((l3e & PTE_A) != 0) {
4050 if ((l3e & PTE_SW_WIRED) == 0) {
4051 /*
4052 * Wired pages cannot be paged out so
4053 * doing accessed bit emulation for
4054 * them is wasted effort. We do the
4055 * hard work for unwired pages only.
4056 */
4057 pmap_clear_bits(l3, PTE_A);
4058 pmap_invalidate_page(pmap, pv->pv_va);
4059 cleared++;
4060 } else
4061 not_cleared++;
4062 }
4063 PMAP_UNLOCK(pmap);
4064 /* Rotate the PV list if it has more than one entry. */
4065 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4066 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4067 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4068 m->md.pv_gen++;
4069 }
4070 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4071 not_cleared < PMAP_TS_REFERENCED_MAX);
4072 out:
4073 rw_wunlock(lock);
4074 rw_runlock(&pvh_global_lock);
4075 vm_page_free_pages_toq(&free, false);
4076 return (cleared + not_cleared);
4077 }
4078
4079 /*
4080 * Apply the given advice to the specified range of addresses within the
4081 * given pmap. Depending on the advice, clear the referenced and/or
4082 * modified flags in each mapping and set the mapped page's dirty field.
 * This is not yet implemented on riscv; the stub below accepts and
 * ignores the advice.
4083 */
4084 void
4085 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4086 {
4087 }
4088
4089 /*
4090 * Clear the modify bits on the specified physical page.
4091 */
4092 void
4093 pmap_clear_modify(vm_page_t m)
4094 {
4095 struct md_page *pvh;
4096 struct rwlock *lock;
4097 pmap_t pmap;
4098 pv_entry_t next_pv, pv;
4099 pd_entry_t *l2, oldl2;
4100 pt_entry_t *l3;
4101 vm_offset_t va;
4102 int md_gen, pvh_gen;
4103
4104 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4105 ("pmap_clear_modify: page %p is not managed", m));
4106 vm_page_assert_busied(m);
4107
4111 /*
4112 * If the page is not PGA_WRITEABLE, then no L3 entries can have PTE_D
4113 * set. If the object containing the page is locked and the page is not
4114 * exclusively busied, then PGA_WRITEABLE cannot be concurrently set.
4115 */
4116 if ((m->a.flags & PGA_WRITEABLE) == 0)
4117 return;
4118 pvh = (m->flags & PG_FICTITIOUS) != 0 ?
&pv_dummy : 4119 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4120 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4121 rw_rlock(&pvh_global_lock); 4122 rw_wlock(lock); 4123 restart: 4124 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4125 pmap = PV_PMAP(pv); 4126 if (!PMAP_TRYLOCK(pmap)) { 4127 pvh_gen = pvh->pv_gen; 4128 rw_wunlock(lock); 4129 PMAP_LOCK(pmap); 4130 rw_wlock(lock); 4131 if (pvh_gen != pvh->pv_gen) { 4132 PMAP_UNLOCK(pmap); 4133 goto restart; 4134 } 4135 } 4136 va = pv->pv_va; 4137 l2 = pmap_l2(pmap, va); 4138 oldl2 = pmap_load(l2); 4139 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4140 if ((oldl2 & PTE_W) != 0 && 4141 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4142 (oldl2 & PTE_SW_WIRED) == 0) { 4143 /* 4144 * Write protect the mapping to a single page so that 4145 * a subsequent write access may repromote. 4146 */ 4147 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4148 l3 = pmap_l2_to_l3(l2, va); 4149 pmap_clear_bits(l3, PTE_D | PTE_W); 4150 vm_page_dirty(m); 4151 pmap_invalidate_page(pmap, va); 4152 } 4153 PMAP_UNLOCK(pmap); 4154 } 4155 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4156 pmap = PV_PMAP(pv); 4157 if (!PMAP_TRYLOCK(pmap)) { 4158 md_gen = m->md.pv_gen; 4159 pvh_gen = pvh->pv_gen; 4160 rw_wunlock(lock); 4161 PMAP_LOCK(pmap); 4162 rw_wlock(lock); 4163 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4164 PMAP_UNLOCK(pmap); 4165 goto restart; 4166 } 4167 } 4168 l2 = pmap_l2(pmap, pv->pv_va); 4169 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4170 ("pmap_clear_modify: found a 2mpage in page %p's pv list", 4171 m)); 4172 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4173 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4174 pmap_clear_bits(l3, PTE_D | PTE_W); 4175 pmap_invalidate_page(pmap, pv->pv_va); 4176 } 4177 PMAP_UNLOCK(pmap); 4178 } 4179 rw_wunlock(lock); 4180 rw_runlock(&pvh_global_lock); 4181 } 4182 4183 void * 4184 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4185 { 4186 4187 return ((void *)PHYS_TO_DMAP(pa)); 4188 } 4189 4190 void 4191 pmap_unmapbios(vm_paddr_t pa, vm_size_t size) 4192 { 4193 } 4194 4195 /* 4196 * Sets the memory attribute for the specified page. 4197 */ 4198 void 4199 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4200 { 4201 4202 m->md.pv_memattr = ma; 4203 } 4204 4205 /* 4206 * Perform the pmap work for mincore(2). If the page is not both referenced and 4207 * modified by this pmap, returns its physical address so that the caller can 4208 * find other mappings. 
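 * The address is stored through *pap only for a managed mapping whose
 * referenced and modified state is still incomplete; the mincore(2)
 * implementation then inspects the page's other mappings directly.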
4209 */ 4210 int 4211 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4212 { 4213 pt_entry_t *l2, *l3, tpte; 4214 vm_paddr_t pa; 4215 int val; 4216 bool managed; 4217 4218 PMAP_LOCK(pmap); 4219 l2 = pmap_l2(pmap, addr); 4220 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4221 if ((tpte & PTE_RWX) != 0) { 4222 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4223 val = MINCORE_INCORE | MINCORE_SUPER; 4224 } else { 4225 l3 = pmap_l2_to_l3(l2, addr); 4226 tpte = pmap_load(l3); 4227 if ((tpte & PTE_V) == 0) { 4228 PMAP_UNLOCK(pmap); 4229 return (0); 4230 } 4231 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4232 val = MINCORE_INCORE; 4233 } 4234 4235 if ((tpte & PTE_D) != 0) 4236 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4237 if ((tpte & PTE_A) != 0) 4238 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4239 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4240 } else { 4241 managed = false; 4242 val = 0; 4243 } 4244 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4245 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4246 *pap = pa; 4247 } 4248 PMAP_UNLOCK(pmap); 4249 return (val); 4250 } 4251 4252 void 4253 pmap_activate_sw(struct thread *td) 4254 { 4255 pmap_t oldpmap, pmap; 4256 u_int hart; 4257 4258 oldpmap = PCPU_GET(curpmap); 4259 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4260 if (pmap == oldpmap) 4261 return; 4262 load_satp(pmap->pm_satp); 4263 4264 hart = PCPU_GET(hart); 4265 #ifdef SMP 4266 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4267 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4268 #else 4269 CPU_SET(hart, &pmap->pm_active); 4270 CPU_CLR(hart, &oldpmap->pm_active); 4271 #endif 4272 PCPU_SET(curpmap, pmap); 4273 4274 sfence_vma(); 4275 } 4276 4277 void 4278 pmap_activate(struct thread *td) 4279 { 4280 4281 critical_enter(); 4282 pmap_activate_sw(td); 4283 critical_exit(); 4284 } 4285 4286 void 4287 pmap_activate_boot(pmap_t pmap) 4288 { 4289 u_int hart; 4290 4291 hart = PCPU_GET(hart); 4292 #ifdef SMP 4293 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4294 #else 4295 CPU_SET(hart, &pmap->pm_active); 4296 #endif 4297 PCPU_SET(curpmap, pmap); 4298 } 4299 4300 void 4301 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4302 { 4303 cpuset_t mask; 4304 4305 /* 4306 * From the RISC-V User-Level ISA V2.2: 4307 * 4308 * "To make a store to instruction memory visible to all 4309 * RISC-V harts, the writing hart has to execute a data FENCE 4310 * before requesting that all remote RISC-V harts execute a 4311 * FENCE.I." 4312 * 4313 * However, this is slightly misleading; we still need to 4314 * perform a FENCE.I for the local hart, as FENCE does nothing 4315 * for its icache. FENCE.I alone is also sufficient for the 4316 * local hart. 4317 */ 4318 sched_pin(); 4319 mask = all_harts; 4320 CPU_CLR(PCPU_GET(hart), &mask); 4321 fence_i(); 4322 if (!CPU_EMPTY(&mask) && smp_started) { 4323 fence(); 4324 sbi_remote_fence_i(mask.__bits); 4325 } 4326 sched_unpin(); 4327 } 4328 4329 /* 4330 * Increase the starting virtual address of the given mapping if a 4331 * different alignment might result in more superpage mappings. 
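 *
 * For example, with 2MB superpages: a 4MB mapping of an object at offset
 * 0x2300000 given the hint *addr = 0x10000000 is bumped to 0x10100000.
 * Virtual addresses and object offsets then agree modulo L2_SIZE, so the
 * 2MB-aligned portion of the mapping can be promoted.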
4332 */ 4333 void 4334 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4335 vm_offset_t *addr, vm_size_t size) 4336 { 4337 vm_offset_t superpage_offset; 4338 4339 if (size < L2_SIZE) 4340 return; 4341 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4342 offset += ptoa(object->pg_color); 4343 superpage_offset = offset & L2_OFFSET; 4344 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4345 (*addr & L2_OFFSET) == superpage_offset) 4346 return; 4347 if ((*addr & L2_OFFSET) < superpage_offset) 4348 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4349 else 4350 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4351 } 4352 4353 /** 4354 * Get the kernel virtual address of a set of physical pages. If there are 4355 * physical addresses not covered by the DMAP perform a transient mapping 4356 * that will be removed when calling pmap_unmap_io_transient. 4357 * 4358 * \param page The pages the caller wishes to obtain the virtual 4359 * address on the kernel memory map. 4360 * \param vaddr On return contains the kernel virtual memory address 4361 * of the pages passed in the page parameter. 4362 * \param count Number of pages passed in. 4363 * \param can_fault TRUE if the thread using the mapped pages can take 4364 * page faults, FALSE otherwise. 4365 * 4366 * \returns TRUE if the caller must call pmap_unmap_io_transient when 4367 * finished or FALSE otherwise. 4368 * 4369 */ 4370 boolean_t 4371 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4372 boolean_t can_fault) 4373 { 4374 vm_paddr_t paddr; 4375 boolean_t needs_mapping; 4376 int error, i; 4377 4378 /* 4379 * Allocate any KVA space that we need, this is done in a separate 4380 * loop to prevent calling vmem_alloc while pinned. 4381 */ 4382 needs_mapping = FALSE; 4383 for (i = 0; i < count; i++) { 4384 paddr = VM_PAGE_TO_PHYS(page[i]); 4385 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4386 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4387 M_BESTFIT | M_WAITOK, &vaddr[i]); 4388 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4389 needs_mapping = TRUE; 4390 } else { 4391 vaddr[i] = PHYS_TO_DMAP(paddr); 4392 } 4393 } 4394 4395 /* Exit early if everything is covered by the DMAP */ 4396 if (!needs_mapping) 4397 return (FALSE); 4398 4399 if (!can_fault) 4400 sched_pin(); 4401 for (i = 0; i < count; i++) { 4402 paddr = VM_PAGE_TO_PHYS(page[i]); 4403 if (paddr >= DMAP_MAX_PHYSADDR) { 4404 panic( 4405 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4406 } 4407 } 4408 4409 return (needs_mapping); 4410 } 4411 4412 void 4413 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4414 boolean_t can_fault) 4415 { 4416 vm_paddr_t paddr; 4417 int i; 4418 4419 if (!can_fault) 4420 sched_unpin(); 4421 for (i = 0; i < count; i++) { 4422 paddr = VM_PAGE_TO_PHYS(page[i]); 4423 if (paddr >= DMAP_MAX_PHYSADDR) { 4424 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4425 } 4426 } 4427 } 4428 4429 boolean_t 4430 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4431 { 4432 4433 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4434 } 4435 4436 bool 4437 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4438 pt_entry_t **l3) 4439 { 4440 pd_entry_t *l1p, *l2p; 4441 4442 /* Get l1 directory entry. 
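 A valid leaf at this
 * level maps a 1GB gigapage; that case is handled below by leaving *l2
 * and *l3 NULL.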
*/ 4443 l1p = pmap_l1(pmap, va); 4444 *l1 = l1p; 4445 4446 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4447 return (false); 4448 4449 if ((pmap_load(l1p) & PTE_RX) != 0) { 4450 *l2 = NULL; 4451 *l3 = NULL; 4452 return (true); 4453 } 4454 4455 /* Get l2 directory entry. */ 4456 l2p = pmap_l1_to_l2(l1p, va); 4457 *l2 = l2p; 4458 4459 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4460 return (false); 4461 4462 if ((pmap_load(l2p) & PTE_RX) != 0) { 4463 *l3 = NULL; 4464 return (true); 4465 } 4466 4467 /* Get l3 page table entry. */ 4468 *l3 = pmap_l2_to_l3(l2p, va); 4469 4470 return (true); 4471 } 4472 4473 /* 4474 * Track a range of the kernel's virtual address space that is contiguous 4475 * in various mapping attributes. 4476 */ 4477 struct pmap_kernel_map_range { 4478 vm_offset_t sva; 4479 pt_entry_t attrs; 4480 int l3pages; 4481 int l2pages; 4482 int l1pages; 4483 }; 4484 4485 static void 4486 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4487 vm_offset_t eva) 4488 { 4489 4490 if (eva <= range->sva) 4491 return; 4492 4493 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4494 range->sva, eva, 4495 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4496 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4497 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4498 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4499 range->l1pages, range->l2pages, range->l3pages); 4500 4501 /* Reset to sentinel value. */ 4502 range->sva = 0xfffffffffffffffful; 4503 } 4504 4505 /* 4506 * Determine whether the attributes specified by a page table entry match those 4507 * being tracked by the current range. 4508 */ 4509 static bool 4510 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4511 { 4512 4513 return (range->attrs == attrs); 4514 } 4515 4516 static void 4517 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4518 pt_entry_t attrs) 4519 { 4520 4521 memset(range, 0, sizeof(*range)); 4522 range->sva = va; 4523 range->attrs = attrs; 4524 } 4525 4526 /* 4527 * Given a leaf PTE, derive the mapping's attributes. If they do not match 4528 * those of the current run, dump the address range and its attributes, and 4529 * begin a new run. 4530 */ 4531 static void 4532 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 4533 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 4534 { 4535 pt_entry_t attrs; 4536 4537 /* The PTE global bit is inherited by lower levels. */ 4538 attrs = l1e & PTE_G; 4539 if ((l1e & PTE_RWX) != 0) 4540 attrs |= l1e & (PTE_RWX | PTE_U); 4541 else if (l2e != 0) 4542 attrs |= l2e & PTE_G; 4543 if ((l2e & PTE_RWX) != 0) 4544 attrs |= l2e & (PTE_RWX | PTE_U); 4545 else if (l3e != 0) 4546 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 4547 4548 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 4549 sysctl_kmaps_dump(sb, range, va); 4550 sysctl_kmaps_reinit(range, va, attrs); 4551 } 4552 } 4553 4554 static int 4555 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 4556 { 4557 struct pmap_kernel_map_range range; 4558 struct sbuf sbuf, *sb; 4559 pd_entry_t l1e, *l2, l2e; 4560 pt_entry_t *l3, l3e; 4561 vm_offset_t sva; 4562 vm_paddr_t pa; 4563 int error, i, j, k; 4564 4565 error = sysctl_wire_old_buffer(req, 0); 4566 if (error != 0) 4567 return (error); 4568 sb = &sbuf; 4569 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 4570 4571 /* Sentinel value. */ 4572 range.sva = 0xfffffffffffffffful; 4573 4574 /* 4575 * Iterate over the kernel page tables without holding the kernel pmap 4576 * lock. 
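 * The walk may therefore race with concurrent updates to those tables.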
Kernel page table pages are never freed, so at worst we will 4577 * observe inconsistencies in the output. 4578 */ 4579 sva = VM_MIN_KERNEL_ADDRESS; 4580 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 4581 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 4582 sbuf_printf(sb, "\nDirect map:\n"); 4583 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 4584 sbuf_printf(sb, "\nKernel map:\n"); 4585 4586 l1e = kernel_pmap->pm_l1[i]; 4587 if ((l1e & PTE_V) == 0) { 4588 sysctl_kmaps_dump(sb, &range, sva); 4589 sva += L1_SIZE; 4590 continue; 4591 } 4592 if ((l1e & PTE_RWX) != 0) { 4593 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 4594 range.l1pages++; 4595 sva += L1_SIZE; 4596 continue; 4597 } 4598 pa = PTE_TO_PHYS(l1e); 4599 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4600 4601 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 4602 l2e = l2[j]; 4603 if ((l2e & PTE_V) == 0) { 4604 sysctl_kmaps_dump(sb, &range, sva); 4605 sva += L2_SIZE; 4606 continue; 4607 } 4608 if ((l2e & PTE_RWX) != 0) { 4609 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 4610 range.l2pages++; 4611 sva += L2_SIZE; 4612 continue; 4613 } 4614 pa = PTE_TO_PHYS(l2e); 4615 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4616 4617 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 4618 sva += L3_SIZE) { 4619 l3e = l3[k]; 4620 if ((l3e & PTE_V) == 0) { 4621 sysctl_kmaps_dump(sb, &range, sva); 4622 continue; 4623 } 4624 sysctl_kmaps_check(sb, &range, sva, 4625 l1e, l2e, l3e); 4626 range.l3pages++; 4627 } 4628 } 4629 } 4630 4631 error = sbuf_finish(sb); 4632 sbuf_delete(sb); 4633 return (error); 4634 } 4635 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 4636 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 4637 NULL, 0, sysctl_kmaps, "A", 4638 "Dump kernel address layout"); 4639
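
/*
 * The dump produced by sysctl_kmaps() above is exported as the string
 * sysctl "vm.pmap.kernel_maps" and can be read with sysctl(8) or
 * sysctlbyname(3). Each output line reports a VA range, its attributes
 * (write, execute, user/supervisor, global), and the number of 1GB, 2MB,
 * and 4KB pages mapping it.
 */

/*
 * An illustrative sketch of the trylock/generation-count pattern used by
 * the pv-list walkers in this file (pmap_page_test_mappings(),
 * pmap_remove_write(), pmap_ts_referenced(), pmap_clear_modify()).
 * PMAP_EXAMPLES is not a real kernel option and example_pv_walk() is a
 * hypothetical name; the block documents the pattern and is never
 * compiled.
 */
#ifdef PMAP_EXAMPLES
static bool
example_pv_walk(vm_page_t m)
{
	struct rwlock *lock;
	pv_entry_t pv;
	pmap_t pmap;
	int md_gen;

	rw_rlock(&pvh_global_lock);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock ranks above the pv list lock, so it
			 * cannot be blocked on while the list lock is held.
			 * Record the list's generation, drop the list lock,
			 * and take both locks in the proper order.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				/* The list changed; "pv" may be stale. */
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		/* ... examine the mapping at pv->pv_va here ... */
		PMAP_UNLOCK(pmap);
	}
	rw_runlock(lock);
	rw_runlock(&pvh_global_lock);
	return (true);
}
#endif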