1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 * 64 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 65 */ 66 /*- 67 * Copyright (c) 2003 Networks Associates Technology, Inc. 68 * All rights reserved. 69 * 70 * This software was developed for the FreeBSD Project by Jake Burkholder, 71 * Safeport Network Services, and Network Associates Laboratories, the 72 * Security Research Division of Network Associates, Inc. 
under 73 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 74 * CHATS research program. 75 * 76 * Redistribution and use in source and binary forms, with or without 77 * modification, are permitted provided that the following conditions 78 * are met: 79 * 1. Redistributions of source code must retain the above copyright 80 * notice, this list of conditions and the following disclaimer. 81 * 2. Redistributions in binary form must reproduce the above copyright 82 * notice, this list of conditions and the following disclaimer in the 83 * documentation and/or other materials provided with the distribution. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 * SUCH DAMAGE. 96 */ 97 98 #include <sys/cdefs.h> 99 __FBSDID("$FreeBSD$"); 100 101 /* 102 * Manages physical address maps. 103 * 104 * Since the information managed by this module is 105 * also stored by the logical address mapping module, 106 * this module may throw away valid virtual-to-physical 107 * mappings at almost any time. However, invalidations 108 * of virtual-to-physical mappings must be done as 109 * requested. 110 * 111 * In order to cope with hardware architectures which 112 * make virtual-to-physical map invalidates expensive, 113 * this module may delay invalidate or reduced protection 114 * operations until such time as they are actually 115 * necessary. This module is given full information as 116 * to which processors are currently using which maps, 117 * and to when physical maps must be made correct. 
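 *
 * This is the RISC-V implementation, using the Sv39 translation mode:
 * a three-level page table (L1, L2, L3) with 512 entries per level,
 * 4KB base pages and 2MB L2 superpages.  The level constants
 * (L1_SHIFT, L2_SHIFT, Ln_ENTRIES and friends) are defined in the
 * machine headers.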
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

#define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
		PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 242 CTLFLAG_RDTUN, &superpages_enabled, 0, 243 "Enable support for transparent superpages"); 244 245 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 246 "2MB page mapping counters"); 247 248 static u_long pmap_l2_demotions; 249 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 250 &pmap_l2_demotions, 0, 251 "2MB page demotions"); 252 253 static u_long pmap_l2_mappings; 254 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 255 &pmap_l2_mappings, 0, 256 "2MB page mappings"); 257 258 static u_long pmap_l2_p_failures; 259 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 260 &pmap_l2_p_failures, 0, 261 "2MB page promotion failures"); 262 263 static u_long pmap_l2_promotions; 264 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 265 &pmap_l2_promotions, 0, 266 "2MB page promotions"); 267 268 /* 269 * Data for the pv entry allocation mechanism 270 */ 271 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 272 static struct mtx pv_chunks_mutex; 273 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 274 static struct md_page *pv_table; 275 static struct md_page pv_dummy; 276 277 extern cpuset_t all_harts; 278 279 /* 280 * Internal flags for pmap_enter()'s helper functions. 281 */ 282 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 283 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 284 285 static void free_pv_chunk(struct pv_chunk *pc); 286 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 287 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 288 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 289 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 290 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 291 vm_offset_t va); 292 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); 293 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, 294 vm_offset_t va, struct rwlock **lockp); 295 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 296 u_int flags, vm_page_t m, struct rwlock **lockp); 297 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 298 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 299 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 300 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 301 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 302 vm_page_t m, struct rwlock **lockp); 303 304 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 305 struct rwlock **lockp); 306 307 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 308 struct spglist *free); 309 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 310 311 #define pmap_clear(pte) pmap_store(pte, 0) 312 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) 313 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) 314 #define pmap_load_clear(pte) pmap_load_store(pte, 0) 315 #define pmap_load(pte) atomic_load_64(pte) 316 #define pmap_store(pte, entry) atomic_store_64(pte, entry) 317 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) 318 319 /********************/ 320 /* Inline functions */ 321 /********************/ 322 323 static __inline void 324 pagecopy(void *s, void *d) 325 { 326 327 memcpy(d, s, PAGE_SIZE); 328 } 329 
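/*
 * Page table indexing, briefly: an Sv39 virtual address is resolved in
 * three steps using the pmap_l*_index() macros below.  Bits [38:30]
 * select the L1 entry, bits [29:21] the L2 entry, bits [20:12] the L3
 * entry, and bits [11:0] are the offset within the 4KB page.  For
 * example, va 0xffffffc000201000 has L1 index 0x100, L2 index 1 and
 * L3 index 1.
 */

#if 0
/*
 * Illustrative sketch only (not compiled): how this file assembles a
 * leaf PTE for a 4KB kernel mapping of physical address "pa".  The
 * same pattern appears in pmap_bootstrap_dmap(), pmap_kenter_device()
 * and pmap_qenter() below; the helper name is hypothetical.
 */
static pt_entry_t
pmap_example_leaf_pte(vm_paddr_t pa)
{
	pn_t pn;
	pt_entry_t entry;

	pn = (pa / PAGE_SIZE);		/* physical page number */
	entry = PTE_KERN;		/* kernel read/write attributes */
	entry |= (pn << PTE_PPN0_S);	/* PPN field starts at PTE_PPN0_S */
	return (entry);
}
#endif
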
static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte)	((pte >> PTE_PPN0_S) * PAGE_SIZE)

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/* Distribute new kernel L1 entry to all the user pmaps */
	if (pmap != kernel_pmap)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_l1[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
    u_int *l2_slot)
{
	pt_entry_t *l2;
	pd_entry_t *l1;

	l1 = (pd_entry_t *)l1pt;
	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;

	/* Check locore has used a table L1 map */
	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
	    ("Invalid bootstrap L1 table"));

	/* Find the address of the L2 table */
	l2 = (pt_entry_t *)init_pt_va;
	*l2_slot = pmap_l2_index(va);

	return (l2);
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	u_int l1_slot, l2_slot;
	pt_entry_t *l2;
	vm_paddr_t ret;

	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);

	/* Check locore has used L2 superpages */
	KASSERT((l2[l2_slot] & PTE_RX) != 0,
	    ("Invalid bootstrap L2 table"));

	/* L2 is superpages */
	ret =
	    (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT;
	ret += (va & L2_OFFSET);

	return (ret);
}

static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
	vm_offset_t va;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;
	pt_entry_t entry;
	pn_t pn;

	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
	va = DMAP_MIN_ADDRESS;
	l1 = (pd_entry_t *)kern_l1;
	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);

	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		/* superpages */
		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l1[l1_slot], entry);
	}

	/* Set the upper limit of the DMAP region */
	dmap_phys_max = pa;
	dmap_max_addr = va;

	sfence_vma();
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	pt_entry_t entry;
	pd_entry_t *l2;
	vm_paddr_t pa;
	u_int l2_slot;
	pn_t pn;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pn = (pa / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l2[l2_slot], entry);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return (l3pt);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
{
	u_int l1_slot, l2_slot, avail_slot, map_slot;
	vm_offset_t freemempos;
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t end, max_pa, min_pa, pa, start;
	int i;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
	printf("%lx\n", l1pt);
	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
	PMAP_LOCK_INIT(kernel_pmap);

	rw_init(&pvh_global_lock, "pmap pv global");

	CPU_FILL(&kernel_pmap->pm_active);

	/* Assume the address we were loaded to is a valid physical address. */
	min_pa = max_pa = kernstart;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
		if (physmap[i + 1] > max_pa)
			max_pa = physmap[i + 1];
	}
	printf("physmap_idx %lx\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);

	/*
	 * Read the page table to find out what is already mapped.
	 * This assumes we have mapped a block of memory from KERNBASE
	 * using a single L1 entry.
601 */ 602 (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 603 604 /* Sanity check the index, KERNBASE should be the first VA */ 605 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 606 607 freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); 608 609 /* Create the l3 tables for the early devmap */ 610 freemempos = pmap_bootstrap_l3(l1pt, 611 VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); 612 613 sfence_vma(); 614 615 #define alloc_pages(var, np) \ 616 (var) = freemempos; \ 617 freemempos += (np * PAGE_SIZE); \ 618 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 619 620 /* Allocate dynamic per-cpu area. */ 621 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 622 dpcpu_init((void *)dpcpu, 0); 623 624 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 625 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 626 msgbufp = (void *)msgbufpv; 627 628 virtual_avail = roundup2(freemempos, L2_SIZE); 629 virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; 630 kernel_vm_end = virtual_avail; 631 632 pa = pmap_early_vtophys(l1pt, freemempos); 633 634 /* Initialize phys_avail and dump_avail. */ 635 for (avail_slot = map_slot = physmem = 0; map_slot < physmap_idx * 2; 636 map_slot += 2) { 637 start = physmap[map_slot]; 638 end = physmap[map_slot + 1]; 639 640 if (start == end) 641 continue; 642 dump_avail[map_slot] = start; 643 dump_avail[map_slot + 1] = end; 644 realmem += atop((vm_offset_t)(end - start)); 645 646 if (start >= kernstart && end <= pa) 647 continue; 648 649 if (start < kernstart && end > kernstart) 650 end = kernstart; 651 else if (start < pa && end > pa) 652 start = pa; 653 phys_avail[avail_slot] = start; 654 phys_avail[avail_slot + 1] = end; 655 physmem += (end - start) >> PAGE_SHIFT; 656 avail_slot += 2; 657 658 if (end != physmap[map_slot + 1] && end > pa) { 659 phys_avail[avail_slot] = pa; 660 phys_avail[avail_slot + 1] = physmap[map_slot + 1]; 661 physmem += (physmap[map_slot + 1] - pa) >> PAGE_SHIFT; 662 avail_slot += 2; 663 } 664 } 665 phys_avail[avail_slot] = 0; 666 phys_avail[avail_slot + 1] = 0; 667 668 /* 669 * Maxmem isn't the "maximum memory", it's one larger than the 670 * highest page of the physical address space. It should be 671 * called something like "Maxphyspage". 672 */ 673 Maxmem = atop(phys_avail[avail_slot - 1]); 674 } 675 676 /* 677 * Initialize a vm_page's machine-dependent fields. 678 */ 679 void 680 pmap_page_init(vm_page_t m) 681 { 682 683 TAILQ_INIT(&m->md.pv_list); 684 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 685 } 686 687 /* 688 * Initialize the pmap module. 689 * Called by vm_init, to initialize any structures that the pmap 690 * system needs to map virtual memory. 691 */ 692 void 693 pmap_init(void) 694 { 695 vm_size_t s; 696 int i, pv_npg; 697 698 /* 699 * Initialize the pv chunk and pmap list mutexes. 700 */ 701 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 702 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 703 704 /* 705 * Initialize the pool of pv list locks. 706 */ 707 for (i = 0; i < NPV_LIST_LOCKS; i++) 708 rw_init(&pv_list_locks[i], "pmap pv list"); 709 710 /* 711 * Calculate the size of the pv head table for superpages. 712 */ 713 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 714 715 /* 716 * Allocate memory for the pv head table for superpages. 
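	 * One pv list head is kept per L2_SIZE (2MB) of physical address
	 * space, up to the end of the highest physical segment.  For
	 * example, if that end is at 8GB, pv_npg is 8GB / 2MB = 4096.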
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	if (superpages_enabled)
		pagesizes[1] = L2_SIZE;
}

#ifdef SMP
/*
 * For SMP, these functions have to use IPIs for coherence.
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence.  BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p, l3;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Start with the l2 table.  We are unable to allocate
	 * pages in the l1 table.
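	 * A valid L2 entry with any of the read/execute bits set is itself
	 * a leaf mapping a 2MB superpage; in that case the physical address
	 * comes from the L2 entry's PPN plus the low 21 bits of the va.
	 * Otherwise the walk continues to the L3 table and the low 12 bits
	 * of the va supply the page offset.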
	 */
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL) {
		l2 = pmap_load(l2p);
		if ((l2 & PTE_RX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				l3 = pmap_load(l3p);
				pa = PTE_TO_PHYS(l3);
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* L2 is superpages */
			pa = (l2 >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			m = PHYS_TO_VM_PAGE(phys);
			if (!vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		if ((pmap_load(l2) & PTE_RX) != 0) {
			/* superpages */
			pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(l2, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	   ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	   ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
959 */ 960 PMAP_INLINE void 961 pmap_kremove(vm_offset_t va) 962 { 963 pt_entry_t *l3; 964 965 l3 = pmap_l3(kernel_pmap, va); 966 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 967 968 pmap_clear(l3); 969 sfence_vma(); 970 } 971 972 void 973 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 974 { 975 pt_entry_t *l3; 976 vm_offset_t va; 977 978 KASSERT((sva & L3_OFFSET) == 0, 979 ("pmap_kremove_device: Invalid virtual address")); 980 KASSERT((size & PAGE_MASK) == 0, 981 ("pmap_kremove_device: Mapping is not page-sized")); 982 983 va = sva; 984 while (size != 0) { 985 l3 = pmap_l3(kernel_pmap, va); 986 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 987 pmap_clear(l3); 988 989 va += PAGE_SIZE; 990 size -= PAGE_SIZE; 991 } 992 993 pmap_invalidate_range(kernel_pmap, sva, va); 994 } 995 996 /* 997 * Used to map a range of physical addresses into kernel 998 * virtual address space. 999 * 1000 * The value passed in '*virt' is a suggested virtual address for 1001 * the mapping. Architectures which can support a direct-mapped 1002 * physical to virtual region can return the appropriate address 1003 * within that region, leaving '*virt' unchanged. Other 1004 * architectures should map the pages starting at '*virt' and 1005 * update '*virt' with the first usable address after the mapped 1006 * region. 1007 */ 1008 vm_offset_t 1009 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1010 { 1011 1012 return PHYS_TO_DMAP(start); 1013 } 1014 1015 1016 /* 1017 * Add a list of wired pages to the kva 1018 * this routine is only used for temporary 1019 * kernel mappings that do not need to have 1020 * page modification or references recorded. 1021 * Note that old mappings are simply written 1022 * over. The page *must* be wired. 1023 * Note: SMP coherent. Uses a ranged shootdown IPI. 1024 */ 1025 void 1026 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1027 { 1028 pt_entry_t *l3, pa; 1029 vm_offset_t va; 1030 vm_page_t m; 1031 pt_entry_t entry; 1032 pn_t pn; 1033 int i; 1034 1035 va = sva; 1036 for (i = 0; i < count; i++) { 1037 m = ma[i]; 1038 pa = VM_PAGE_TO_PHYS(m); 1039 pn = (pa / PAGE_SIZE); 1040 l3 = pmap_l3(kernel_pmap, va); 1041 1042 entry = PTE_KERN; 1043 entry |= (pn << PTE_PPN0_S); 1044 pmap_store(l3, entry); 1045 1046 va += L3_SIZE; 1047 } 1048 pmap_invalidate_range(kernel_pmap, sva, va); 1049 } 1050 1051 /* 1052 * This routine tears out page mappings from the 1053 * kernel -- it is meant only for temporary mappings. 1054 * Note: SMP coherent. Uses a ranged shootdown IPI. 1055 */ 1056 void 1057 pmap_qremove(vm_offset_t sva, int count) 1058 { 1059 pt_entry_t *l3; 1060 vm_offset_t va; 1061 1062 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1063 1064 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1065 l3 = pmap_l3(kernel_pmap, va); 1066 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1067 pmap_clear(l3); 1068 } 1069 pmap_invalidate_range(kernel_pmap, sva, va); 1070 } 1071 1072 bool 1073 pmap_ps_enabled(pmap_t pmap __unused) 1074 { 1075 1076 return (superpages_enabled); 1077 } 1078 1079 /*************************************************** 1080 * Page table page management routines..... 1081 ***************************************************/ 1082 /* 1083 * Schedule the specified unused page table page to be freed. Specifically, 1084 * add the page to the specified list of pages that will be released to the 1085 * physical memory manager after the TLB has been updated. 
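 * Freeing the page immediately would not be safe: until the shootdown
 * completes, another hart may still hold stale TLB entries that were
 * translated through the old page table page.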
1086 */ 1087 static __inline void 1088 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1089 boolean_t set_PG_ZERO) 1090 { 1091 1092 if (set_PG_ZERO) 1093 m->flags |= PG_ZERO; 1094 else 1095 m->flags &= ~PG_ZERO; 1096 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1097 } 1098 1099 /* 1100 * Inserts the specified page table page into the specified pmap's collection 1101 * of idle page table pages. Each of a pmap's page table pages is responsible 1102 * for mapping a distinct range of virtual addresses. The pmap's collection is 1103 * ordered by this virtual address range. 1104 * 1105 * If "promoted" is false, then the page table page "ml3" must be zero filled. 1106 */ 1107 static __inline int 1108 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) 1109 { 1110 1111 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1112 ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; 1113 return (vm_radix_insert(&pmap->pm_root, ml3)); 1114 } 1115 1116 /* 1117 * Removes the page table page mapping the specified virtual address from the 1118 * specified pmap's collection of idle page table pages, and returns it. 1119 * Otherwise, returns NULL if there is no page table page corresponding to the 1120 * specified virtual address. 1121 */ 1122 static __inline vm_page_t 1123 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1124 { 1125 1126 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1127 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1128 } 1129 1130 /* 1131 * Decrements a page table page's reference count, which is used to record the 1132 * number of valid page table entries within the page. If the reference count 1133 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1134 * page table page was unmapped and FALSE otherwise. 1135 */ 1136 static inline boolean_t 1137 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1138 { 1139 1140 --m->ref_count; 1141 if (m->ref_count == 0) { 1142 _pmap_unwire_ptp(pmap, va, m, free); 1143 return (TRUE); 1144 } else { 1145 return (FALSE); 1146 } 1147 } 1148 1149 static void 1150 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1151 { 1152 vm_paddr_t phys; 1153 1154 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1155 if (m->pindex >= NUL1E) { 1156 pd_entry_t *l1; 1157 l1 = pmap_l1(pmap, va); 1158 pmap_clear(l1); 1159 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1160 } else { 1161 pd_entry_t *l2; 1162 l2 = pmap_l2(pmap, va); 1163 pmap_clear(l2); 1164 } 1165 pmap_resident_count_dec(pmap, 1); 1166 if (m->pindex < NUL1E) { 1167 pd_entry_t *l1; 1168 vm_page_t pdpg; 1169 1170 l1 = pmap_l1(pmap, va); 1171 phys = PTE_TO_PHYS(pmap_load(l1)); 1172 pdpg = PHYS_TO_VM_PAGE(phys); 1173 pmap_unwire_ptp(pmap, va, pdpg, free); 1174 } 1175 pmap_invalidate_page(pmap, va); 1176 1177 vm_wire_sub(1); 1178 1179 /* 1180 * Put page on a list so that it is released after 1181 * *ALL* TLB shootdown is done 1182 */ 1183 pmap_add_delayed_free_list(m, free, TRUE); 1184 } 1185 1186 /* 1187 * After removing a page table entry, this routine is used to 1188 * conditionally free the page, and manage the reference count. 
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_l1 = kernel_pmap->pm_l1;
	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
	CPU_ZERO(&pmap->pm_active);
	pmap_activate_boot(pmap);
}

int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t l1phys;
	vm_page_t l1pt;

	/*
	 * allocate the l1 page
	 */
	while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		vm_wait(NULL);

	l1phys = VM_PAGE_TO_PHYS(l1pt);
	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys);
	pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT);

	if ((l1pt->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_l1);

	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	CPU_ZERO(&pmap->pm_active);

	/* Install kernel pagetables */
	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);

	/* Add to the list of all user pmaps */
	mtx_lock(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	vm_radix_init(&pmap->pm_root);

	return (1);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, /*pdppg, */pdpg;
	pt_entry_t entry;
	vm_paddr_t phys;
	pn_t pn;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			vm_wait(NULL);
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}

	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
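	 * The pindex encodes which level is being populated: values below
	 * NUL1E name L3 page table pages (one per 2MB of va), while larger
	 * values name L2 page directory pages, whose entries are installed
	 * in the L1 table (and, for the kernel pmap, copied to every user
	 * pmap by pmap_distribute_l1()).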
	 */

	if (ptepindex >= NUL1E) {
		pd_entry_t *l1;
		vm_pindex_t l1index;

		l1index = ptepindex - NUL1E;
		l1 = &pmap->pm_l1[l1index];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l1, entry);
		pmap_distribute_l1(pmap, l1index, entry);
	} else {
		vm_pindex_t l1index;
		pd_entry_t *l1, *l2;

		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
		l1 = &pmap->pm_l1[l1index];
		if (pmap_load(l1) == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			phys = PTE_TO_PHYS(pmap_load(l1));
			pdpg = PHYS_TO_VM_PAGE(phys);
			pdpg->ref_count++;
		}

		phys = PTE_TO_PHYS(pmap_load(l1));
		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

static vm_page_t
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	pd_entry_t *l1;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
		/* Add a reference to the L2 page. */
		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
		l2pg->ref_count++;
	} else {
		/* Allocate a L2 page. */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL && lockp != NULL)
			goto retry;
	}
	return (l2pg);
}

static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *l2;
	vm_paddr_t phys;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	l2 = pmap_l2(pmap, va);

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (l2 != NULL && pmap_load(l2) != 0) {
		phys = PTE_TO_PHYS(pmap_load(l2));
		m = PHYS_TO_VM_PAGE(phys);
		m->ref_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
1419 */ 1420 void 1421 pmap_release(pmap_t pmap) 1422 { 1423 vm_page_t m; 1424 1425 KASSERT(pmap->pm_stats.resident_count == 0, 1426 ("pmap_release: pmap resident count %ld != 0", 1427 pmap->pm_stats.resident_count)); 1428 KASSERT(CPU_EMPTY(&pmap->pm_active), 1429 ("releasing active pmap %p", pmap)); 1430 1431 mtx_lock(&allpmaps_lock); 1432 LIST_REMOVE(pmap, pm_list); 1433 mtx_unlock(&allpmaps_lock); 1434 1435 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); 1436 vm_page_unwire_noq(m); 1437 vm_page_free(m); 1438 } 1439 1440 static int 1441 kvm_size(SYSCTL_HANDLER_ARGS) 1442 { 1443 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1444 1445 return sysctl_handle_long(oidp, &ksize, 0, req); 1446 } 1447 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1448 0, 0, kvm_size, "LU", 1449 "Size of KVM"); 1450 1451 static int 1452 kvm_free(SYSCTL_HANDLER_ARGS) 1453 { 1454 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1455 1456 return sysctl_handle_long(oidp, &kfree, 0, req); 1457 } 1458 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1459 0, 0, kvm_free, "LU", 1460 "Amount of KVM free"); 1461 1462 /* 1463 * grow the number of kernel page table entries, if needed 1464 */ 1465 void 1466 pmap_growkernel(vm_offset_t addr) 1467 { 1468 vm_paddr_t paddr; 1469 vm_page_t nkpg; 1470 pd_entry_t *l1, *l2; 1471 pt_entry_t entry; 1472 pn_t pn; 1473 1474 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1475 1476 addr = roundup2(addr, L2_SIZE); 1477 if (addr - 1 >= vm_map_max(kernel_map)) 1478 addr = vm_map_max(kernel_map); 1479 while (kernel_vm_end < addr) { 1480 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1481 if (pmap_load(l1) == 0) { 1482 /* We need a new PDP entry */ 1483 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, 1484 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 1485 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1486 if (nkpg == NULL) 1487 panic("pmap_growkernel: no memory to grow kernel"); 1488 if ((nkpg->flags & PG_ZERO) == 0) 1489 pmap_zero_page(nkpg); 1490 paddr = VM_PAGE_TO_PHYS(nkpg); 1491 1492 pn = (paddr / PAGE_SIZE); 1493 entry = (PTE_V); 1494 entry |= (pn << PTE_PPN0_S); 1495 pmap_store(l1, entry); 1496 pmap_distribute_l1(kernel_pmap, 1497 pmap_l1_index(kernel_vm_end), entry); 1498 continue; /* try again */ 1499 } 1500 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1501 if ((pmap_load(l2) & PTE_V) != 0 && 1502 (pmap_load(l2) & PTE_RWX) == 0) { 1503 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1504 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1505 kernel_vm_end = vm_map_max(kernel_map); 1506 break; 1507 } 1508 continue; 1509 } 1510 1511 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, 1512 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1513 VM_ALLOC_ZERO); 1514 if (nkpg == NULL) 1515 panic("pmap_growkernel: no memory to grow kernel"); 1516 if ((nkpg->flags & PG_ZERO) == 0) { 1517 pmap_zero_page(nkpg); 1518 } 1519 paddr = VM_PAGE_TO_PHYS(nkpg); 1520 1521 pn = (paddr / PAGE_SIZE); 1522 entry = (PTE_V); 1523 entry |= (pn << PTE_PPN0_S); 1524 pmap_store(l2, entry); 1525 1526 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1527 1528 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1529 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1530 kernel_vm_end = vm_map_max(kernel_map); 1531 break; 1532 } 1533 } 1534 } 1535 1536 1537 /*************************************************** 1538 * page management routines. 
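 *
 * PV ("physical-to-virtual") entries record each mapping of a managed
 * page.  They are allocated from page-sized pv_chunk structures; each
 * chunk holds _NPCPV (168) entries and three 64-bit pc_map words that
 * track the free slots (3 * 64 - 168 = 24 bits of the last word are
 * never used, which is why PC_FREE2 below is not all ones).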
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{

	panic("RISCVTODO: reclaim_pv_chunk");
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list.
*/ 1627 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1628 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1629 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1630 } 1631 return; 1632 } 1633 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1634 free_pv_chunk(pc); 1635 } 1636 1637 static void 1638 free_pv_chunk(struct pv_chunk *pc) 1639 { 1640 vm_page_t m; 1641 1642 mtx_lock(&pv_chunks_mutex); 1643 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1644 mtx_unlock(&pv_chunks_mutex); 1645 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1646 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1647 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1648 /* entire chunk is free, return it */ 1649 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1650 dump_drop_page(m->phys_addr); 1651 vm_page_unwire_noq(m); 1652 vm_page_free(m); 1653 } 1654 1655 /* 1656 * Returns a new PV entry, allocating a new PV chunk from the system when 1657 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1658 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1659 * returned. 1660 * 1661 * The given PV list lock may be released. 1662 */ 1663 static pv_entry_t 1664 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1665 { 1666 int bit, field; 1667 pv_entry_t pv; 1668 struct pv_chunk *pc; 1669 vm_page_t m; 1670 1671 rw_assert(&pvh_global_lock, RA_LOCKED); 1672 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1673 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1674 retry: 1675 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1676 if (pc != NULL) { 1677 for (field = 0; field < _NPCM; field++) { 1678 if (pc->pc_map[field]) { 1679 bit = ffsl(pc->pc_map[field]) - 1; 1680 break; 1681 } 1682 } 1683 if (field < _NPCM) { 1684 pv = &pc->pc_pventry[field * 64 + bit]; 1685 pc->pc_map[field] &= ~(1ul << bit); 1686 /* If this was the last item, move it to tail */ 1687 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 1688 pc->pc_map[2] == 0) { 1689 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1690 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1691 pc_list); 1692 } 1693 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1694 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1695 return (pv); 1696 } 1697 } 1698 /* No free items, allocate another chunk */ 1699 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1700 VM_ALLOC_WIRED); 1701 if (m == NULL) { 1702 if (lockp == NULL) { 1703 PV_STAT(pc_chunk_tryfail++); 1704 return (NULL); 1705 } 1706 m = reclaim_pv_chunk(pmap, lockp); 1707 if (m == NULL) 1708 goto retry; 1709 } 1710 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1711 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1712 dump_add_page(m->phys_addr); 1713 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1714 pc->pc_pmap = pmap; 1715 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1716 pc->pc_map[1] = PC_FREE1; 1717 pc->pc_map[2] = PC_FREE2; 1718 mtx_lock(&pv_chunks_mutex); 1719 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1720 mtx_unlock(&pv_chunks_mutex); 1721 pv = &pc->pc_pventry[0]; 1722 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1723 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1724 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1725 return (pv); 1726 } 1727 1728 /* 1729 * Ensure that the number of spare PV entries in the specified pmap meets or 1730 * exceeds the given count, "needed". 1731 * 1732 * The given PV list lock may be released. 
1733 */ 1734 static void 1735 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1736 { 1737 struct pch new_tail; 1738 struct pv_chunk *pc; 1739 vm_page_t m; 1740 int avail, free; 1741 bool reclaimed; 1742 1743 rw_assert(&pvh_global_lock, RA_LOCKED); 1744 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1745 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1746 1747 /* 1748 * Newly allocated PV chunks must be stored in a private list until 1749 * the required number of PV chunks have been allocated. Otherwise, 1750 * reclaim_pv_chunk() could recycle one of these chunks. In 1751 * contrast, these chunks must be added to the pmap upon allocation. 1752 */ 1753 TAILQ_INIT(&new_tail); 1754 retry: 1755 avail = 0; 1756 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1757 bit_count((bitstr_t *)pc->pc_map, 0, 1758 sizeof(pc->pc_map) * NBBY, &free); 1759 if (free == 0) 1760 break; 1761 avail += free; 1762 if (avail >= needed) 1763 break; 1764 } 1765 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1766 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1767 VM_ALLOC_WIRED); 1768 if (m == NULL) { 1769 m = reclaim_pv_chunk(pmap, lockp); 1770 if (m == NULL) 1771 goto retry; 1772 reclaimed = true; 1773 } 1774 /* XXX PV STATS */ 1775 #if 0 1776 dump_add_page(m->phys_addr); 1777 #endif 1778 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1779 pc->pc_pmap = pmap; 1780 pc->pc_map[0] = PC_FREE0; 1781 pc->pc_map[1] = PC_FREE1; 1782 pc->pc_map[2] = PC_FREE2; 1783 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1784 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1785 1786 /* 1787 * The reclaim might have freed a chunk from the current pmap. 1788 * If that chunk contained available entries, we need to 1789 * re-count the number of available entries. 1790 */ 1791 if (reclaimed) 1792 goto retry; 1793 } 1794 if (!TAILQ_EMPTY(&new_tail)) { 1795 mtx_lock(&pv_chunks_mutex); 1796 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1797 mtx_unlock(&pv_chunks_mutex); 1798 } 1799 } 1800 1801 /* 1802 * First find and then remove the pv entry for the specified pmap and virtual 1803 * address from the specified pv list. Returns the pv entry if found and NULL 1804 * otherwise. This operation can be performed on pv lists for either 4KB or 1805 * 2MB page mappings. 1806 */ 1807 static __inline pv_entry_t 1808 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1809 { 1810 pv_entry_t pv; 1811 1812 rw_assert(&pvh_global_lock, RA_LOCKED); 1813 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1814 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1815 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1816 pvh->pv_gen++; 1817 break; 1818 } 1819 } 1820 return (pv); 1821 } 1822 1823 /* 1824 * First find and then destroy the pv entry for the specified pmap and virtual 1825 * address. This operation can be performed on pv lists for either 4KB or 2MB 1826 * page mappings. 1827 */ 1828 static void 1829 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1830 { 1831 pv_entry_t pv; 1832 1833 pv = pmap_pvh_remove(pvh, pmap, va); 1834 1835 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 1836 free_pv_entry(pmap, pv); 1837 } 1838 1839 /* 1840 * Conditionally create the PV entry for a 4KB page mapping if the required 1841 * memory can be allocated without resorting to reclamation. 
1842 */ 1843 static boolean_t 1844 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1845 struct rwlock **lockp) 1846 { 1847 pv_entry_t pv; 1848 1849 rw_assert(&pvh_global_lock, RA_LOCKED); 1850 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1851 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1852 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 1853 pv->pv_va = va; 1854 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1855 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1856 m->md.pv_gen++; 1857 return (TRUE); 1858 } else 1859 return (FALSE); 1860 } 1861 1862 /* 1863 * After demotion from a 2MB page mapping to 512 4KB page mappings, 1864 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 1865 * entries for each of the 4KB page mappings. 1866 */ 1867 static void __unused 1868 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1869 struct rwlock **lockp) 1870 { 1871 struct md_page *pvh; 1872 struct pv_chunk *pc; 1873 pv_entry_t pv; 1874 vm_page_t m; 1875 vm_offset_t va_last; 1876 int bit, field; 1877 1878 rw_assert(&pvh_global_lock, RA_LOCKED); 1879 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1880 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1881 1882 /* 1883 * Transfer the 2mpage's pv entry for this mapping to the first 1884 * page's pv list. Once this transfer begins, the pv list lock 1885 * must not be released until the last pv entry is reinstantiated. 1886 */ 1887 pvh = pa_to_pvh(pa); 1888 va &= ~L2_OFFSET; 1889 pv = pmap_pvh_remove(pvh, pmap, va); 1890 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 1891 m = PHYS_TO_VM_PAGE(pa); 1892 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1893 m->md.pv_gen++; 1894 /* Instantiate the remaining 511 pv entries. */ 1895 va_last = va + L2_SIZE - PAGE_SIZE; 1896 for (;;) { 1897 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1898 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 1899 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 1900 for (field = 0; field < _NPCM; field++) { 1901 while (pc->pc_map[field] != 0) { 1902 bit = ffsl(pc->pc_map[field]) - 1; 1903 pc->pc_map[field] &= ~(1ul << bit); 1904 pv = &pc->pc_pventry[field * 64 + bit]; 1905 va += PAGE_SIZE; 1906 pv->pv_va = va; 1907 m++; 1908 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1909 ("pmap_pv_demote_l2: page %p is not managed", m)); 1910 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1911 m->md.pv_gen++; 1912 if (va == va_last) 1913 goto out; 1914 } 1915 } 1916 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1917 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1918 } 1919 out: 1920 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 1921 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1922 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1923 } 1924 /* XXX PV stats */ 1925 } 1926 1927 #if VM_NRESERVLEVEL > 0 1928 static void 1929 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1930 struct rwlock **lockp) 1931 { 1932 struct md_page *pvh; 1933 pv_entry_t pv; 1934 vm_page_t m; 1935 vm_offset_t va_last; 1936 1937 rw_assert(&pvh_global_lock, RA_LOCKED); 1938 KASSERT((va & L2_OFFSET) == 0, 1939 ("pmap_pv_promote_l2: misaligned va %#lx", va)); 1940 1941 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1942 1943 m = PHYS_TO_VM_PAGE(pa); 1944 pv = pmap_pvh_remove(&m->md, pmap, va); 1945 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 1946 pvh = pa_to_pvh(pa); 1947 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1948 pvh->pv_gen++; 1949 1950 va_last = va + L2_SIZE - PAGE_SIZE; 1951 do { 1952 m++; 1953 va += 
PAGE_SIZE; 1954 pmap_pvh_free(&m->md, pmap, va); 1955 } while (va < va_last); 1956 } 1957 #endif /* VM_NRESERVLEVEL > 0 */ 1958 1959 /* 1960 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1961 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1962 * false if the PV entry cannot be allocated without resorting to reclamation. 1963 */ 1964 static bool 1965 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1966 struct rwlock **lockp) 1967 { 1968 struct md_page *pvh; 1969 pv_entry_t pv; 1970 vm_paddr_t pa; 1971 1972 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1973 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1974 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1975 NULL : lockp)) == NULL) 1976 return (false); 1977 pv->pv_va = va; 1978 pa = PTE_TO_PHYS(l2e); 1979 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1980 pvh = pa_to_pvh(pa); 1981 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1982 pvh->pv_gen++; 1983 return (true); 1984 } 1985 1986 static void 1987 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1988 { 1989 pt_entry_t newl2, oldl2; 1990 vm_page_t ml3; 1991 vm_paddr_t ml3pa; 1992 1993 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1994 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1995 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1996 1997 ml3 = pmap_remove_pt_page(pmap, va); 1998 if (ml3 == NULL) 1999 panic("pmap_remove_kernel_l2: Missing pt page"); 2000 2001 ml3pa = VM_PAGE_TO_PHYS(ml3); 2002 newl2 = ml3pa | PTE_V; 2003 2004 /* 2005 * If this page table page was unmapped by a promotion, then it 2006 * contains valid mappings. Zero it to invalidate those mappings. 2007 */ 2008 if (ml3->valid != 0) 2009 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2010 2011 /* 2012 * Demote the mapping. 2013 */ 2014 oldl2 = pmap_load_store(l2, newl2); 2015 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2016 __func__, l2, oldl2)); 2017 } 2018 2019 /* 2020 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2021 */ 2022 static int 2023 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2024 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2025 { 2026 struct md_page *pvh; 2027 pt_entry_t oldl2; 2028 vm_offset_t eva, va; 2029 vm_page_t m, ml3; 2030 2031 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2032 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2033 oldl2 = pmap_load_clear(l2); 2034 KASSERT((oldl2 & PTE_RWX) != 0, 2035 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2036 2037 /* 2038 * The sfence.vma documentation states that it is sufficient to specify 2039 * a single address within a superpage mapping. However, since we do 2040 * not perform any invalidation upon promotion, TLBs may still be 2041 * caching 4KB mappings within the superpage, so we must invalidate the 2042 * entire range. 
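 *
 * For example, with 4KB base pages and a 2MB L2 superpage, the range
 * below covers L2_SIZE / PAGE_SIZE == 512 4KB pages, i.e. the
 * half-open range [sva, sva + L2_SIZE).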
2043 */ 2044 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2045 if ((oldl2 & PTE_SW_WIRED) != 0) 2046 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2047 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2048 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2049 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2050 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2051 pmap_pvh_free(pvh, pmap, sva); 2052 eva = sva + L2_SIZE; 2053 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2054 va < eva; va += PAGE_SIZE, m++) { 2055 if ((oldl2 & PTE_D) != 0) 2056 vm_page_dirty(m); 2057 if ((oldl2 & PTE_A) != 0) 2058 vm_page_aflag_set(m, PGA_REFERENCED); 2059 if (TAILQ_EMPTY(&m->md.pv_list) && 2060 TAILQ_EMPTY(&pvh->pv_list)) 2061 vm_page_aflag_clear(m, PGA_WRITEABLE); 2062 } 2063 } 2064 if (pmap == kernel_pmap) { 2065 pmap_remove_kernel_l2(pmap, l2, sva); 2066 } else { 2067 ml3 = pmap_remove_pt_page(pmap, sva); 2068 if (ml3 != NULL) { 2069 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2070 ("pmap_remove_l2: l3 page not promoted")); 2071 pmap_resident_count_dec(pmap, 1); 2072 KASSERT(ml3->ref_count == Ln_ENTRIES, 2073 ("pmap_remove_l2: l3 page ref count error")); 2074 ml3->ref_count = 1; 2075 vm_page_unwire_noq(ml3); 2076 pmap_add_delayed_free_list(ml3, free, FALSE); 2077 } 2078 } 2079 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2080 } 2081 2082 /* 2083 * pmap_remove_l3: do the things to unmap a page in a process 2084 */ 2085 static int 2086 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2087 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2088 { 2089 struct md_page *pvh; 2090 pt_entry_t old_l3; 2091 vm_paddr_t phys; 2092 vm_page_t m; 2093 2094 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2095 old_l3 = pmap_load_clear(l3); 2096 pmap_invalidate_page(pmap, va); 2097 if (old_l3 & PTE_SW_WIRED) 2098 pmap->pm_stats.wired_count -= 1; 2099 pmap_resident_count_dec(pmap, 1); 2100 if (old_l3 & PTE_SW_MANAGED) { 2101 phys = PTE_TO_PHYS(old_l3); 2102 m = PHYS_TO_VM_PAGE(phys); 2103 if ((old_l3 & PTE_D) != 0) 2104 vm_page_dirty(m); 2105 if (old_l3 & PTE_A) 2106 vm_page_aflag_set(m, PGA_REFERENCED); 2107 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2108 pmap_pvh_free(&m->md, pmap, va); 2109 if (TAILQ_EMPTY(&m->md.pv_list) && 2110 (m->flags & PG_FICTITIOUS) == 0) { 2111 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2112 if (TAILQ_EMPTY(&pvh->pv_list)) 2113 vm_page_aflag_clear(m, PGA_WRITEABLE); 2114 } 2115 } 2116 2117 return (pmap_unuse_pt(pmap, va, l2e, free)); 2118 } 2119 2120 /* 2121 * Remove the given range of addresses from the specified map. 2122 * 2123 * It is assumed that the start and end are properly 2124 * rounded to the page size. 2125 */ 2126 void 2127 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2128 { 2129 struct spglist free; 2130 struct rwlock *lock; 2131 vm_offset_t va, va_next; 2132 pd_entry_t *l1, *l2, l2e; 2133 pt_entry_t *l3; 2134 2135 /* 2136 * Perform an unsynchronized read. This is, however, safe. 2137 */ 2138 if (pmap->pm_stats.resident_count == 0) 2139 return; 2140 2141 SLIST_INIT(&free); 2142 2143 rw_rlock(&pvh_global_lock); 2144 PMAP_LOCK(pmap); 2145 2146 lock = NULL; 2147 for (; sva < eva; sva = va_next) { 2148 if (pmap->pm_stats.resident_count == 0) 2149 break; 2150 2151 l1 = pmap_l1(pmap, sva); 2152 if (pmap_load(l1) == 0) { 2153 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2154 if (va_next < sva) 2155 va_next = eva; 2156 continue; 2157 } 2158 2159 /* 2160 * Calculate index for next page table. 
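 * va_next is rounded up to the start of the next L2 (2MB) region; for
 * instance, sva == 0x40301000 gives (0x40301000 + 0x200000) &
 * ~0x1fffff == 0x40400000.  The check below clamps va_next to eva if
 * the addition wraps at the top of the address space.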
2161 */ 2162 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2163 if (va_next < sva) 2164 va_next = eva; 2165 2166 l2 = pmap_l1_to_l2(l1, sva); 2167 if (l2 == NULL) 2168 continue; 2169 if ((l2e = pmap_load(l2)) == 0) 2170 continue; 2171 if ((l2e & PTE_RWX) != 0) { 2172 if (sva + L2_SIZE == va_next && eva >= va_next) { 2173 (void)pmap_remove_l2(pmap, l2, sva, 2174 pmap_load(l1), &free, &lock); 2175 continue; 2176 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2177 &lock)) { 2178 /* 2179 * The large page mapping was destroyed. 2180 */ 2181 continue; 2182 } 2183 l2e = pmap_load(l2); 2184 } 2185 2186 /* 2187 * Limit our scan to either the end of the va represented 2188 * by the current page table page, or to the end of the 2189 * range being removed. 2190 */ 2191 if (va_next > eva) 2192 va_next = eva; 2193 2194 va = va_next; 2195 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2196 sva += L3_SIZE) { 2197 if (pmap_load(l3) == 0) { 2198 if (va != va_next) { 2199 pmap_invalidate_range(pmap, va, sva); 2200 va = va_next; 2201 } 2202 continue; 2203 } 2204 if (va == va_next) 2205 va = sva; 2206 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2207 sva += L3_SIZE; 2208 break; 2209 } 2210 } 2211 if (va != va_next) 2212 pmap_invalidate_range(pmap, va, sva); 2213 } 2214 if (lock != NULL) 2215 rw_wunlock(lock); 2216 rw_runlock(&pvh_global_lock); 2217 PMAP_UNLOCK(pmap); 2218 vm_page_free_pages_toq(&free, false); 2219 } 2220 2221 /* 2222 * Routine: pmap_remove_all 2223 * Function: 2224 * Removes this physical page from 2225 * all physical maps in which it resides. 2226 * Reflects back modify bits to the pager. 2227 * 2228 * Notes: 2229 * Original versions of this routine were very 2230 * inefficient because they iteratively called 2231 * pmap_remove (slow...) 2232 */ 2233 2234 void 2235 pmap_remove_all(vm_page_t m) 2236 { 2237 struct spglist free; 2238 struct md_page *pvh; 2239 pmap_t pmap; 2240 pt_entry_t *l3, l3e; 2241 pd_entry_t *l2, l2e; 2242 pv_entry_t pv; 2243 vm_offset_t va; 2244 2245 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2246 ("pmap_remove_all: page %p is not managed", m)); 2247 SLIST_INIT(&free); 2248 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2249 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2250 2251 rw_wlock(&pvh_global_lock); 2252 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2253 pmap = PV_PMAP(pv); 2254 PMAP_LOCK(pmap); 2255 va = pv->pv_va; 2256 l2 = pmap_l2(pmap, va); 2257 (void)pmap_demote_l2(pmap, l2, va); 2258 PMAP_UNLOCK(pmap); 2259 } 2260 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2261 pmap = PV_PMAP(pv); 2262 PMAP_LOCK(pmap); 2263 pmap_resident_count_dec(pmap, 1); 2264 l2 = pmap_l2(pmap, pv->pv_va); 2265 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2266 l2e = pmap_load(l2); 2267 2268 KASSERT((l2e & PTE_RX) == 0, 2269 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2270 2271 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2272 l3e = pmap_load_clear(l3); 2273 pmap_invalidate_page(pmap, pv->pv_va); 2274 if (l3e & PTE_SW_WIRED) 2275 pmap->pm_stats.wired_count--; 2276 if ((l3e & PTE_A) != 0) 2277 vm_page_aflag_set(m, PGA_REFERENCED); 2278 2279 /* 2280 * Update the vm_page_t clean and reference bits. 
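 *
 * A set PTE_D (dirty) bit is reflected to the machine-independent
 * layer via vm_page_dirty(); the accessed bit was already folded
 * into PGA_REFERENCED above.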
2281 */ 2282 if ((l3e & PTE_D) != 0) 2283 vm_page_dirty(m); 2284 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2285 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2286 m->md.pv_gen++; 2287 free_pv_entry(pmap, pv); 2288 PMAP_UNLOCK(pmap); 2289 } 2290 vm_page_aflag_clear(m, PGA_WRITEABLE); 2291 rw_wunlock(&pvh_global_lock); 2292 vm_page_free_pages_toq(&free, false); 2293 } 2294 2295 /* 2296 * Set the physical protection on the 2297 * specified range of this map as requested. 2298 */ 2299 void 2300 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2301 { 2302 pd_entry_t *l1, *l2, l2e; 2303 pt_entry_t *l3, l3e, mask; 2304 vm_page_t m, mt; 2305 vm_paddr_t pa; 2306 vm_offset_t va_next; 2307 bool anychanged, pv_lists_locked; 2308 2309 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2310 pmap_remove(pmap, sva, eva); 2311 return; 2312 } 2313 2314 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2315 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2316 return; 2317 2318 anychanged = false; 2319 pv_lists_locked = false; 2320 mask = 0; 2321 if ((prot & VM_PROT_WRITE) == 0) 2322 mask |= PTE_W | PTE_D; 2323 if ((prot & VM_PROT_EXECUTE) == 0) 2324 mask |= PTE_X; 2325 resume: 2326 PMAP_LOCK(pmap); 2327 for (; sva < eva; sva = va_next) { 2328 l1 = pmap_l1(pmap, sva); 2329 if (pmap_load(l1) == 0) { 2330 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2331 if (va_next < sva) 2332 va_next = eva; 2333 continue; 2334 } 2335 2336 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2337 if (va_next < sva) 2338 va_next = eva; 2339 2340 l2 = pmap_l1_to_l2(l1, sva); 2341 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2342 continue; 2343 if ((l2e & PTE_RWX) != 0) { 2344 if (sva + L2_SIZE == va_next && eva >= va_next) { 2345 retryl2: 2346 if ((prot & VM_PROT_WRITE) == 0 && 2347 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2348 (PTE_SW_MANAGED | PTE_D)) { 2349 pa = PTE_TO_PHYS(l2e); 2350 m = PHYS_TO_VM_PAGE(pa); 2351 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2352 vm_page_dirty(mt); 2353 } 2354 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2355 goto retryl2; 2356 anychanged = true; 2357 } else { 2358 if (!pv_lists_locked) { 2359 pv_lists_locked = true; 2360 if (!rw_try_rlock(&pvh_global_lock)) { 2361 if (anychanged) 2362 pmap_invalidate_all( 2363 pmap); 2364 PMAP_UNLOCK(pmap); 2365 rw_rlock(&pvh_global_lock); 2366 goto resume; 2367 } 2368 } 2369 if (!pmap_demote_l2(pmap, l2, sva)) { 2370 /* 2371 * The large page mapping was destroyed. 
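 * A failed demotion removes the 2MB mapping entirely, so there is
 * nothing left in this range for the 4KB loop below to adjust;
 * move on to the next L2 region.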
2372 */ 2373 continue; 2374 } 2375 } 2376 } 2377 2378 if (va_next > eva) 2379 va_next = eva; 2380 2381 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2382 sva += L3_SIZE) { 2383 l3e = pmap_load(l3); 2384 retryl3: 2385 if ((l3e & PTE_V) == 0) 2386 continue; 2387 if ((prot & VM_PROT_WRITE) == 0 && 2388 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2389 (PTE_SW_MANAGED | PTE_D)) { 2390 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2391 vm_page_dirty(m); 2392 } 2393 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2394 goto retryl3; 2395 anychanged = true; 2396 } 2397 } 2398 if (anychanged) 2399 pmap_invalidate_all(pmap); 2400 if (pv_lists_locked) 2401 rw_runlock(&pvh_global_lock); 2402 PMAP_UNLOCK(pmap); 2403 } 2404 2405 int 2406 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2407 { 2408 pd_entry_t *l2, l2e; 2409 pt_entry_t bits, *pte, oldpte; 2410 int rv; 2411 2412 rv = 0; 2413 PMAP_LOCK(pmap); 2414 l2 = pmap_l2(pmap, va); 2415 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2416 goto done; 2417 if ((l2e & PTE_RWX) == 0) { 2418 pte = pmap_l2_to_l3(l2, va); 2419 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2420 goto done; 2421 } else { 2422 pte = l2; 2423 oldpte = l2e; 2424 } 2425 2426 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2427 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2428 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2429 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2430 goto done; 2431 2432 bits = PTE_A; 2433 if (ftype == VM_PROT_WRITE) 2434 bits |= PTE_D; 2435 2436 /* 2437 * Spurious faults can occur if the implementation caches invalid 2438 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2439 * race with each other. 2440 */ 2441 if ((oldpte & bits) != bits) 2442 pmap_store_bits(pte, bits); 2443 sfence_vma(); 2444 rv = 1; 2445 done: 2446 PMAP_UNLOCK(pmap); 2447 return (rv); 2448 } 2449 2450 static bool 2451 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2452 { 2453 struct rwlock *lock; 2454 bool rv; 2455 2456 lock = NULL; 2457 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2458 if (lock != NULL) 2459 rw_wunlock(lock); 2460 return (rv); 2461 } 2462 2463 /* 2464 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2465 * mapping is invalidated. 2466 */ 2467 static bool 2468 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2469 struct rwlock **lockp) 2470 { 2471 struct spglist free; 2472 vm_page_t mpte; 2473 pd_entry_t newl2, oldl2; 2474 pt_entry_t *firstl3, newl3; 2475 vm_paddr_t mptepa; 2476 int i; 2477 2478 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2479 2480 oldl2 = pmap_load(l2); 2481 KASSERT((oldl2 & PTE_RWX) != 0, 2482 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2483 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2484 NULL) { 2485 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, 2486 pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ?
VM_ALLOC_INTERRUPT : 2487 VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == 2488 NULL) { 2489 SLIST_INIT(&free); 2490 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2491 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2492 vm_page_free_pages_toq(&free, true); 2493 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2494 "failure for va %#lx in pmap %p", va, pmap); 2495 return (false); 2496 } 2497 if (va < VM_MAXUSER_ADDRESS) { 2498 mpte->ref_count = Ln_ENTRIES; 2499 pmap_resident_count_inc(pmap, 1); 2500 } 2501 } 2502 mptepa = VM_PAGE_TO_PHYS(mpte); 2503 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2504 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2505 KASSERT((oldl2 & PTE_A) != 0, 2506 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2507 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2508 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2509 newl3 = oldl2; 2510 2511 /* 2512 * If the page table page is not leftover from an earlier promotion, 2513 * initialize it. 2514 */ 2515 if (mpte->valid == 0) { 2516 for (i = 0; i < Ln_ENTRIES; i++) 2517 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2518 } 2519 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2520 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2521 "addresses")); 2522 2523 /* 2524 * If the mapping has changed attributes, update the page table 2525 * entries. 2526 */ 2527 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2528 for (i = 0; i < Ln_ENTRIES; i++) 2529 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2530 2531 /* 2532 * The spare PV entries must be reserved prior to demoting the 2533 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2534 * state of the L2 entry and the PV lists will be inconsistent, which 2535 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2536 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2537 * expected PV entry for the 2MB page mapping that is being demoted. 2538 */ 2539 if ((oldl2 & PTE_SW_MANAGED) != 0) 2540 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2541 2542 /* 2543 * Demote the mapping. 2544 */ 2545 pmap_store(l2, newl2); 2546 2547 /* 2548 * Demote the PV entry. 
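 *
 * The 2MB mapping's single PV entry is transferred to the first 4KB
 * page, and the remaining 511 entries come from the PV chunks set
 * aside by reserve_pv_entries() above; see pmap_pv_demote_l2().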
2549 */ 2550 if ((oldl2 & PTE_SW_MANAGED) != 0) 2551 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2552 2553 atomic_add_long(&pmap_l2_demotions, 1); 2554 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2555 va, pmap); 2556 return (true); 2557 } 2558 2559 #if VM_NRESERVLEVEL > 0 2560 static void 2561 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2562 struct rwlock **lockp) 2563 { 2564 pt_entry_t *firstl3, *l3; 2565 vm_paddr_t pa; 2566 vm_page_t ml3; 2567 2568 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2569 2570 va &= ~L2_OFFSET; 2571 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2572 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2573 2574 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2575 pa = PTE_TO_PHYS(pmap_load(firstl3)); 2576 if ((pa & L2_OFFSET) != 0) { 2577 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2578 va, pmap); 2579 atomic_add_long(&pmap_l2_p_failures, 1); 2580 return; 2581 } 2582 2583 pa += PAGE_SIZE; 2584 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2585 if (PTE_TO_PHYS(pmap_load(l3)) != pa) { 2586 CTR2(KTR_PMAP, 2587 "pmap_promote_l2: failure for va %#lx pmap %p", 2588 va, pmap); 2589 atomic_add_long(&pmap_l2_p_failures, 1); 2590 return; 2591 } 2592 if ((pmap_load(l3) & PTE_PROMOTE) != 2593 (pmap_load(firstl3) & PTE_PROMOTE)) { 2594 CTR2(KTR_PMAP, 2595 "pmap_promote_l2: failure for va %#lx pmap %p", 2596 va, pmap); 2597 atomic_add_long(&pmap_l2_p_failures, 1); 2598 return; 2599 } 2600 pa += PAGE_SIZE; 2601 } 2602 2603 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2604 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2605 ("pmap_promote_l2: page table page's pindex is wrong")); 2606 if (pmap_insert_pt_page(pmap, ml3, true)) { 2607 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2608 va, pmap); 2609 atomic_add_long(&pmap_l2_p_failures, 1); 2610 return; 2611 } 2612 2613 if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) 2614 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), 2615 lockp); 2616 2617 pmap_store(l2, pmap_load(firstl3)); 2618 2619 atomic_add_long(&pmap_l2_promotions, 1); 2620 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2621 pmap); 2622 } 2623 #endif 2624 2625 /* 2626 * Insert the given physical page (p) at 2627 * the specified virtual address (v) in the 2628 * target physical map with the protection requested. 2629 * 2630 * If specified, the page will be wired down, meaning 2631 * that the related pte can not be reclaimed. 2632 * 2633 * NB: This is the only routine which MAY NOT lazy-evaluate 2634 * or lose information. That is, this routine must actually 2635 * insert this page into the given map NOW. 
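 *
 * As implemented below, this returns KERN_SUCCESS once the mapping
 * has been installed, KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP
 * is specified and a page table page cannot be allocated, or, for
 * psind == 1, whatever status pmap_enter_l2() reports.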
2636 */ 2637 int 2638 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2639 u_int flags, int8_t psind) 2640 { 2641 struct rwlock *lock; 2642 pd_entry_t *l1, *l2, l2e; 2643 pt_entry_t new_l3, orig_l3; 2644 pt_entry_t *l3; 2645 pv_entry_t pv; 2646 vm_paddr_t opa, pa, l2_pa, l3_pa; 2647 vm_page_t mpte, om, l2_m, l3_m; 2648 pt_entry_t entry; 2649 pn_t l2_pn, l3_pn, pn; 2650 int rv; 2651 bool nosleep; 2652 2653 va = trunc_page(va); 2654 if ((m->oflags & VPO_UNMANAGED) == 0) 2655 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2656 pa = VM_PAGE_TO_PHYS(m); 2657 pn = (pa / PAGE_SIZE); 2658 2659 new_l3 = PTE_V | PTE_R | PTE_A; 2660 if (prot & VM_PROT_EXECUTE) 2661 new_l3 |= PTE_X; 2662 if (flags & VM_PROT_WRITE) 2663 new_l3 |= PTE_D; 2664 if (prot & VM_PROT_WRITE) 2665 new_l3 |= PTE_W; 2666 if (va < VM_MAX_USER_ADDRESS) 2667 new_l3 |= PTE_U; 2668 2669 new_l3 |= (pn << PTE_PPN0_S); 2670 if ((flags & PMAP_ENTER_WIRED) != 0) 2671 new_l3 |= PTE_SW_WIRED; 2672 2673 /* 2674 * Set modified bit gratuitously for writeable mappings if 2675 * the page is unmanaged. We do not want to take a fault 2676 * to do the dirty bit accounting for these mappings. 2677 */ 2678 if ((m->oflags & VPO_UNMANAGED) != 0) { 2679 if (prot & VM_PROT_WRITE) 2680 new_l3 |= PTE_D; 2681 } else 2682 new_l3 |= PTE_SW_MANAGED; 2683 2684 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2685 2686 lock = NULL; 2687 mpte = NULL; 2688 rw_rlock(&pvh_global_lock); 2689 PMAP_LOCK(pmap); 2690 if (psind == 1) { 2691 /* Assert the required virtual and physical alignment. */ 2692 KASSERT((va & L2_OFFSET) == 0, 2693 ("pmap_enter: va %#lx unaligned", va)); 2694 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2695 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2696 goto out; 2697 } 2698 2699 l2 = pmap_l2(pmap, va); 2700 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2701 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2702 va, &lock))) { 2703 l3 = pmap_l2_to_l3(l2, va); 2704 if (va < VM_MAXUSER_ADDRESS) { 2705 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2706 mpte->ref_count++; 2707 } 2708 } else if (va < VM_MAXUSER_ADDRESS) { 2709 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2710 mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); 2711 if (mpte == NULL && nosleep) { 2712 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2713 if (lock != NULL) 2714 rw_wunlock(lock); 2715 rw_runlock(&pvh_global_lock); 2716 PMAP_UNLOCK(pmap); 2717 return (KERN_RESOURCE_SHORTAGE); 2718 } 2719 l3 = pmap_l3(pmap, va); 2720 } else { 2721 l3 = pmap_l3(pmap, va); 2722 /* TODO: This is not optimal, but should mostly work */ 2723 if (l3 == NULL) { 2724 if (l2 == NULL) { 2725 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2726 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2727 VM_ALLOC_ZERO); 2728 if (l2_m == NULL) 2729 panic("pmap_enter: l2 pte_m == NULL"); 2730 if ((l2_m->flags & PG_ZERO) == 0) 2731 pmap_zero_page(l2_m); 2732 2733 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2734 l2_pn = (l2_pa / PAGE_SIZE); 2735 2736 l1 = pmap_l1(pmap, va); 2737 entry = (PTE_V); 2738 entry |= (l2_pn << PTE_PPN0_S); 2739 pmap_store(l1, entry); 2740 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2741 l2 = pmap_l1_to_l2(l1, va); 2742 } 2743 2744 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2745 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2746 if (l3_m == NULL) 2747 panic("pmap_enter: l3 pte_m == NULL"); 2748 if ((l3_m->flags & PG_ZERO) == 0) 2749 pmap_zero_page(l3_m); 2750 2751 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2752 l3_pn = (l3_pa / PAGE_SIZE); 2753 entry = (PTE_V); 2754 entry |= (l3_pn << PTE_PPN0_S); 2755 pmap_store(l2, entry); 2756 l3 = pmap_l2_to_l3(l2, va); 2757 } 2758 pmap_invalidate_page(pmap, va); 2759 } 2760 2761 orig_l3 = pmap_load(l3); 2762 opa = PTE_TO_PHYS(orig_l3); 2763 pv = NULL; 2764 2765 /* 2766 * Is the specified virtual address already mapped? 2767 */ 2768 if ((orig_l3 & PTE_V) != 0) { 2769 /* 2770 * Wiring change, just update stats. We don't worry about 2771 * wiring PT pages as they remain resident as long as there 2772 * are valid mappings in them. Hence, if a user page is wired, 2773 * the PT page will be also. 2774 */ 2775 if ((flags & PMAP_ENTER_WIRED) != 0 && 2776 (orig_l3 & PTE_SW_WIRED) == 0) 2777 pmap->pm_stats.wired_count++; 2778 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2779 (orig_l3 & PTE_SW_WIRED) != 0) 2780 pmap->pm_stats.wired_count--; 2781 2782 /* 2783 * Remove the extra PT page reference. 2784 */ 2785 if (mpte != NULL) { 2786 mpte->ref_count--; 2787 KASSERT(mpte->ref_count > 0, 2788 ("pmap_enter: missing reference to page table page," 2789 " va: 0x%lx", va)); 2790 } 2791 2792 /* 2793 * Has the physical page changed? 2794 */ 2795 if (opa == pa) { 2796 /* 2797 * No, might be a protection or wiring change. 2798 */ 2799 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2800 (new_l3 & PTE_W) != 0) 2801 vm_page_aflag_set(m, PGA_WRITEABLE); 2802 goto validate; 2803 } 2804 2805 /* 2806 * The physical page has changed. Temporarily invalidate 2807 * the mapping. This ensures that all threads sharing the 2808 * pmap keep a consistent view of the mapping, which is 2809 * necessary for the correct handling of COW faults. It 2810 * also permits reuse of the old mapping's PV entry, 2811 * avoiding an allocation. 2812 * 2813 * For consistency, handle unmanaged mappings the same way. 2814 */ 2815 orig_l3 = pmap_load_clear(l3); 2816 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2817 ("pmap_enter: unexpected pa update for %#lx", va)); 2818 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2819 om = PHYS_TO_VM_PAGE(opa); 2820 2821 /* 2822 * The pmap lock is sufficient to synchronize with 2823 * concurrent calls to pmap_page_test_mappings() and 2824 * pmap_ts_referenced(). 
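 * Both of those functions acquire the pmap lock (retrying if
 * PMAP_TRYLOCK fails) before examining a PTE, so clearing the old
 * entry while this lock is held cannot race with them.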
2825 */ 2826 if ((orig_l3 & PTE_D) != 0) 2827 vm_page_dirty(om); 2828 if ((orig_l3 & PTE_A) != 0) 2829 vm_page_aflag_set(om, PGA_REFERENCED); 2830 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2831 pv = pmap_pvh_remove(&om->md, pmap, va); 2832 KASSERT(pv != NULL, 2833 ("pmap_enter: no PV entry for %#lx", va)); 2834 if ((new_l3 & PTE_SW_MANAGED) == 0) 2835 free_pv_entry(pmap, pv); 2836 if ((om->a.flags & PGA_WRITEABLE) != 0 && 2837 TAILQ_EMPTY(&om->md.pv_list) && 2838 ((om->flags & PG_FICTITIOUS) != 0 || 2839 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 2840 vm_page_aflag_clear(om, PGA_WRITEABLE); 2841 } 2842 pmap_invalidate_page(pmap, va); 2843 orig_l3 = 0; 2844 } else { 2845 /* 2846 * Increment the counters. 2847 */ 2848 if ((new_l3 & PTE_SW_WIRED) != 0) 2849 pmap->pm_stats.wired_count++; 2850 pmap_resident_count_inc(pmap, 1); 2851 } 2852 /* 2853 * Enter on the PV list if part of our managed memory. 2854 */ 2855 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2856 if (pv == NULL) { 2857 pv = get_pv_entry(pmap, &lock); 2858 pv->pv_va = va; 2859 } 2860 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2861 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2862 m->md.pv_gen++; 2863 if ((new_l3 & PTE_W) != 0) 2864 vm_page_aflag_set(m, PGA_WRITEABLE); 2865 } 2866 2867 validate: 2868 /* 2869 * Sync the i-cache on all harts before updating the PTE 2870 * if the new PTE is executable. 2871 */ 2872 if (prot & VM_PROT_EXECUTE) 2873 pmap_sync_icache(pmap, va, PAGE_SIZE); 2874 2875 /* 2876 * Update the L3 entry. 2877 */ 2878 if (orig_l3 != 0) { 2879 orig_l3 = pmap_load_store(l3, new_l3); 2880 pmap_invalidate_page(pmap, va); 2881 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2882 ("pmap_enter: invalid update")); 2883 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2884 (PTE_D | PTE_SW_MANAGED)) 2885 vm_page_dirty(m); 2886 } else { 2887 pmap_store(l3, new_l3); 2888 } 2889 2890 #if VM_NRESERVLEVEL > 0 2891 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 2892 pmap_ps_enabled(pmap) && 2893 (m->flags & PG_FICTITIOUS) == 0 && 2894 vm_reserv_level_iffullpop(m) == 0) 2895 pmap_promote_l2(pmap, l2, va, &lock); 2896 #endif 2897 2898 rv = KERN_SUCCESS; 2899 out: 2900 if (lock != NULL) 2901 rw_wunlock(lock); 2902 rw_runlock(&pvh_global_lock); 2903 PMAP_UNLOCK(pmap); 2904 return (rv); 2905 } 2906 2907 /* 2908 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2909 * if successful. Returns false if (1) a page table page cannot be allocated 2910 * without sleeping, (2) a mapping already exists at the specified virtual 2911 * address, or (3) a PV entry cannot be allocated without reclaiming another 2912 * PV entry. 2913 */ 2914 static bool 2915 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2916 struct rwlock **lockp) 2917 { 2918 pd_entry_t new_l2; 2919 pn_t pn; 2920 2921 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2922 2923 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2924 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2925 if ((m->oflags & VPO_UNMANAGED) == 0) 2926 new_l2 |= PTE_SW_MANAGED; 2927 if ((prot & VM_PROT_EXECUTE) != 0) 2928 new_l2 |= PTE_X; 2929 if (va < VM_MAXUSER_ADDRESS) 2930 new_l2 |= PTE_U; 2931 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2932 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2933 KERN_SUCCESS); 2934 } 2935 2936 /* 2937 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2938 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2939 * otherwise. 
Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2940 * a mapping already exists at the specified virtual address. Returns 2941 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2942 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2943 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2944 * 2945 * The parameter "m" is only used when creating a managed, writeable mapping. 2946 */ 2947 static int 2948 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2949 vm_page_t m, struct rwlock **lockp) 2950 { 2951 struct spglist free; 2952 pd_entry_t *l2, *l3, oldl2; 2953 vm_offset_t sva; 2954 vm_page_t l2pg, mt; 2955 2956 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2957 2958 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2959 NULL : lockp)) == NULL) { 2960 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2961 va, pmap); 2962 return (KERN_RESOURCE_SHORTAGE); 2963 } 2964 2965 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2966 l2 = &l2[pmap_l2_index(va)]; 2967 if ((oldl2 = pmap_load(l2)) != 0) { 2968 KASSERT(l2pg->ref_count > 1, 2969 ("pmap_enter_l2: l2pg's ref count is too low")); 2970 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2971 l2pg->ref_count--; 2972 CTR2(KTR_PMAP, 2973 "pmap_enter_l2: failure for va %#lx in pmap %p", 2974 va, pmap); 2975 return (KERN_FAILURE); 2976 } 2977 SLIST_INIT(&free); 2978 if ((oldl2 & PTE_RWX) != 0) 2979 (void)pmap_remove_l2(pmap, l2, va, 2980 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2981 else 2982 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 2983 l3 = pmap_l2_to_l3(l2, sva); 2984 if ((pmap_load(l3) & PTE_V) != 0 && 2985 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 2986 lockp) != 0) 2987 break; 2988 } 2989 vm_page_free_pages_toq(&free, true); 2990 if (va >= VM_MAXUSER_ADDRESS) { 2991 /* 2992 * Both pmap_remove_l2() and pmap_remove_l3() will 2993 * leave the kernel page table page zero filled. 2994 */ 2995 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2996 if (pmap_insert_pt_page(pmap, mt, false)) 2997 panic("pmap_enter_l2: trie insert failed"); 2998 } else 2999 KASSERT(pmap_load(l2) == 0, 3000 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3001 } 3002 3003 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3004 /* 3005 * Abort this mapping if its PV entry could not be created. 3006 */ 3007 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3008 SLIST_INIT(&free); 3009 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3010 /* 3011 * Although "va" is not mapped, paging-structure 3012 * caches could nonetheless have entries that 3013 * refer to the freed page table pages. 3014 * Invalidate those entries. 3015 */ 3016 pmap_invalidate_page(pmap, va); 3017 vm_page_free_pages_toq(&free, true); 3018 } 3019 CTR2(KTR_PMAP, 3020 "pmap_enter_l2: failure for va %#lx in pmap %p", 3021 va, pmap); 3022 return (KERN_RESOURCE_SHORTAGE); 3023 } 3024 if ((new_l2 & PTE_W) != 0) 3025 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3026 vm_page_aflag_set(mt, PGA_WRITEABLE); 3027 } 3028 3029 /* 3030 * Increment counters. 3031 */ 3032 if ((new_l2 & PTE_SW_WIRED) != 0) 3033 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3034 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3035 3036 /* 3037 * Map the superpage. 
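 *
 * A single store of the leaf L2 entry makes the whole 2MB range
 * visible at once; no individual L3 PTEs are written for the 512
 * constituent pages.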
3038 */ 3039 pmap_store(l2, new_l2); 3040 3041 atomic_add_long(&pmap_l2_mappings, 1); 3042 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3043 va, pmap); 3044 3045 return (KERN_SUCCESS); 3046 } 3047 3048 /* 3049 * Maps a sequence of resident pages belonging to the same object. 3050 * The sequence begins with the given page m_start. This page is 3051 * mapped at the given virtual address start. Each subsequent page is 3052 * mapped at a virtual address that is offset from start by the same 3053 * amount as the page is offset from m_start within the object. The 3054 * last page in the sequence is the page with the largest offset from 3055 * m_start that can be mapped at a virtual address less than the given 3056 * virtual address end. Not every virtual page between start and end 3057 * is mapped; only those for which a resident page exists with the 3058 * corresponding offset from m_start are mapped. 3059 */ 3060 void 3061 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3062 vm_page_t m_start, vm_prot_t prot) 3063 { 3064 struct rwlock *lock; 3065 vm_offset_t va; 3066 vm_page_t m, mpte; 3067 vm_pindex_t diff, psize; 3068 3069 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3070 3071 psize = atop(end - start); 3072 mpte = NULL; 3073 m = m_start; 3074 lock = NULL; 3075 rw_rlock(&pvh_global_lock); 3076 PMAP_LOCK(pmap); 3077 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3078 va = start + ptoa(diff); 3079 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3080 m->psind == 1 && pmap_ps_enabled(pmap) && 3081 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3082 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3083 else 3084 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3085 &lock); 3086 m = TAILQ_NEXT(m, listq); 3087 } 3088 if (lock != NULL) 3089 rw_wunlock(lock); 3090 rw_runlock(&pvh_global_lock); 3091 PMAP_UNLOCK(pmap); 3092 } 3093 3094 /* 3095 * this code makes some *MAJOR* assumptions: 3096 * 1. Current pmap & pmap exists. 3097 * 2. Not wired. 3098 * 3. Read access. 3099 * 4. No page table pages. 3100 * but is *MUCH* faster than pmap_enter... 3101 */ 3102 3103 void 3104 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3105 { 3106 struct rwlock *lock; 3107 3108 lock = NULL; 3109 rw_rlock(&pvh_global_lock); 3110 PMAP_LOCK(pmap); 3111 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3112 if (lock != NULL) 3113 rw_wunlock(lock); 3114 rw_runlock(&pvh_global_lock); 3115 PMAP_UNLOCK(pmap); 3116 } 3117 3118 static vm_page_t 3119 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3120 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3121 { 3122 struct spglist free; 3123 vm_paddr_t phys; 3124 pd_entry_t *l2; 3125 pt_entry_t *l3, newl3; 3126 3127 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3128 (m->oflags & VPO_UNMANAGED) != 0, 3129 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3130 rw_assert(&pvh_global_lock, RA_LOCKED); 3131 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3132 3133 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3134 /* 3135 * In the case that a page table page is not 3136 * resident, we are creating it here. 
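 *
 * Only user addresses take this path; kernel addresses fall through
 * to the preallocated kernel page tables via pmap_l3(kernel_pmap, va)
 * below, and no page table page is allocated for them.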
3137 */ 3138 if (va < VM_MAXUSER_ADDRESS) { 3139 vm_pindex_t l2pindex; 3140 3141 /* 3142 * Calculate pagetable page index 3143 */ 3144 l2pindex = pmap_l2_pindex(va); 3145 if (mpte && (mpte->pindex == l2pindex)) { 3146 mpte->ref_count++; 3147 } else { 3148 /* 3149 * Get the l2 entry 3150 */ 3151 l2 = pmap_l2(pmap, va); 3152 3153 /* 3154 * If the page table page is mapped, we just increment 3155 * the hold count, and activate it. Otherwise, we 3156 * attempt to allocate a page table page. If this 3157 * attempt fails, we don't retry. Instead, we give up. 3158 */ 3159 if (l2 != NULL && pmap_load(l2) != 0) { 3160 phys = PTE_TO_PHYS(pmap_load(l2)); 3161 mpte = PHYS_TO_VM_PAGE(phys); 3162 mpte->ref_count++; 3163 } else { 3164 /* 3165 * Pass NULL instead of the PV list lock 3166 * pointer, because we don't intend to sleep. 3167 */ 3168 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3169 if (mpte == NULL) 3170 return (mpte); 3171 } 3172 } 3173 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3174 l3 = &l3[pmap_l3_index(va)]; 3175 } else { 3176 mpte = NULL; 3177 l3 = pmap_l3(kernel_pmap, va); 3178 } 3179 if (l3 == NULL) 3180 panic("pmap_enter_quick_locked: No l3"); 3181 if (pmap_load(l3) != 0) { 3182 if (mpte != NULL) { 3183 mpte->ref_count--; 3184 mpte = NULL; 3185 } 3186 return (mpte); 3187 } 3188 3189 /* 3190 * Enter on the PV list if part of our managed memory. 3191 */ 3192 if ((m->oflags & VPO_UNMANAGED) == 0 && 3193 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3194 if (mpte != NULL) { 3195 SLIST_INIT(&free); 3196 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3197 pmap_invalidate_page(pmap, va); 3198 vm_page_free_pages_toq(&free, false); 3199 } 3200 mpte = NULL; 3201 } 3202 return (mpte); 3203 } 3204 3205 /* 3206 * Increment counters 3207 */ 3208 pmap_resident_count_inc(pmap, 1); 3209 3210 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3211 PTE_V | PTE_R; 3212 if ((prot & VM_PROT_EXECUTE) != 0) 3213 newl3 |= PTE_X; 3214 if ((m->oflags & VPO_UNMANAGED) == 0) 3215 newl3 |= PTE_SW_MANAGED; 3216 if (va < VM_MAX_USER_ADDRESS) 3217 newl3 |= PTE_U; 3218 3219 /* 3220 * Sync the i-cache on all harts before updating the PTE 3221 * if the new PTE is executable. 3222 */ 3223 if (prot & VM_PROT_EXECUTE) 3224 pmap_sync_icache(pmap, va, PAGE_SIZE); 3225 3226 pmap_store(l3, newl3); 3227 3228 pmap_invalidate_page(pmap, va); 3229 return (mpte); 3230 } 3231 3232 /* 3233 * This code maps large physical mmap regions into the 3234 * processor address space. Note that some shortcuts 3235 * are taken, but the code works. 3236 */ 3237 void 3238 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3239 vm_pindex_t pindex, vm_size_t size) 3240 { 3241 3242 VM_OBJECT_ASSERT_WLOCKED(object); 3243 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3244 ("pmap_object_init_pt: non-device object")); 3245 } 3246 3247 /* 3248 * Clear the wired attribute from the mappings for the specified range of 3249 * addresses in the given pmap. Every valid mapping within that range 3250 * must have the wired attribute set. In contrast, invalid mappings 3251 * cannot have the wired attribute set, so they are ignored. 3252 * 3253 * The wired attribute of the page table entry is not a hardware feature, 3254 * so there is no need to invalidate any TLB entries. 
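 *
 * PTE_SW_WIRED is a software-defined bit in the PTE, so clearing it
 * below changes nothing that the MMU consults during translation.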
3255 */ 3256 void 3257 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3258 { 3259 vm_offset_t va_next; 3260 pd_entry_t *l1, *l2, l2e; 3261 pt_entry_t *l3, l3e; 3262 bool pv_lists_locked; 3263 3264 pv_lists_locked = false; 3265 retry: 3266 PMAP_LOCK(pmap); 3267 for (; sva < eva; sva = va_next) { 3268 l1 = pmap_l1(pmap, sva); 3269 if (pmap_load(l1) == 0) { 3270 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3271 if (va_next < sva) 3272 va_next = eva; 3273 continue; 3274 } 3275 3276 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3277 if (va_next < sva) 3278 va_next = eva; 3279 3280 l2 = pmap_l1_to_l2(l1, sva); 3281 if ((l2e = pmap_load(l2)) == 0) 3282 continue; 3283 if ((l2e & PTE_RWX) != 0) { 3284 if (sva + L2_SIZE == va_next && eva >= va_next) { 3285 if ((l2e & PTE_SW_WIRED) == 0) 3286 panic("pmap_unwire: l2 %#jx is missing " 3287 "PTE_SW_WIRED", (uintmax_t)l2e); 3288 pmap_clear_bits(l2, PTE_SW_WIRED); 3289 continue; 3290 } else { 3291 if (!pv_lists_locked) { 3292 pv_lists_locked = true; 3293 if (!rw_try_rlock(&pvh_global_lock)) { 3294 PMAP_UNLOCK(pmap); 3295 rw_rlock(&pvh_global_lock); 3296 /* Repeat sva. */ 3297 goto retry; 3298 } 3299 } 3300 if (!pmap_demote_l2(pmap, l2, sva)) 3301 panic("pmap_unwire: demotion failed"); 3302 } 3303 } 3304 3305 if (va_next > eva) 3306 va_next = eva; 3307 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3308 sva += L3_SIZE) { 3309 if ((l3e = pmap_load(l3)) == 0) 3310 continue; 3311 if ((l3e & PTE_SW_WIRED) == 0) 3312 panic("pmap_unwire: l3 %#jx is missing " 3313 "PTE_SW_WIRED", (uintmax_t)l3e); 3314 3315 /* 3316 * PG_W must be cleared atomically. Although the pmap 3317 * lock synchronizes access to PG_W, another processor 3318 * could be setting PG_M and/or PG_A concurrently. 3319 */ 3320 pmap_clear_bits(l3, PTE_SW_WIRED); 3321 pmap->pm_stats.wired_count--; 3322 } 3323 } 3324 if (pv_lists_locked) 3325 rw_runlock(&pvh_global_lock); 3326 PMAP_UNLOCK(pmap); 3327 } 3328 3329 /* 3330 * Copy the range specified by src_addr/len 3331 * from the source map to the range dst_addr/len 3332 * in the destination map. 3333 * 3334 * This routine is only advisory and need not do anything. 3335 */ 3336 3337 void 3338 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3339 vm_offset_t src_addr) 3340 { 3341 3342 } 3343 3344 /* 3345 * pmap_zero_page zeros the specified hardware page by mapping 3346 * the page into KVM and using bzero to clear its contents. 3347 */ 3348 void 3349 pmap_zero_page(vm_page_t m) 3350 { 3351 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3352 3353 pagezero((void *)va); 3354 } 3355 3356 /* 3357 * pmap_zero_page_area zeros the specified hardware page by mapping 3358 * the page into KVM and using bzero to clear its contents. 3359 * 3360 * off and size may not cover an area beyond a single hardware page. 3361 */ 3362 void 3363 pmap_zero_page_area(vm_page_t m, int off, int size) 3364 { 3365 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3366 3367 if (off == 0 && size == PAGE_SIZE) 3368 pagezero((void *)va); 3369 else 3370 bzero((char *)va + off, size); 3371 } 3372 3373 /* 3374 * pmap_copy_page copies the specified (machine independent) 3375 * page by mapping the page into virtual memory and using 3376 * bcopy to copy the page, one machine dependent page at a 3377 * time. 
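 *
 * With the direct map both pages are already addressable, so the copy
 * below is a single pagecopy() between their DMAP addresses rather
 * than a transient kernel mapping.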
3378 */ 3379 void 3380 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3381 { 3382 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3383 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3384 3385 pagecopy((void *)src, (void *)dst); 3386 } 3387 3388 int unmapped_buf_allowed = 1; 3389 3390 void 3391 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3392 vm_offset_t b_offset, int xfersize) 3393 { 3394 void *a_cp, *b_cp; 3395 vm_page_t m_a, m_b; 3396 vm_paddr_t p_a, p_b; 3397 vm_offset_t a_pg_offset, b_pg_offset; 3398 int cnt; 3399 3400 while (xfersize > 0) { 3401 a_pg_offset = a_offset & PAGE_MASK; 3402 m_a = ma[a_offset >> PAGE_SHIFT]; 3403 p_a = m_a->phys_addr; 3404 b_pg_offset = b_offset & PAGE_MASK; 3405 m_b = mb[b_offset >> PAGE_SHIFT]; 3406 p_b = m_b->phys_addr; 3407 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3408 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3409 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3410 panic("!DMAP a %lx", p_a); 3411 } else { 3412 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3413 } 3414 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3415 panic("!DMAP b %lx", p_b); 3416 } else { 3417 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3418 } 3419 bcopy(a_cp, b_cp, cnt); 3420 a_offset += cnt; 3421 b_offset += cnt; 3422 xfersize -= cnt; 3423 } 3424 } 3425 3426 vm_offset_t 3427 pmap_quick_enter_page(vm_page_t m) 3428 { 3429 3430 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3431 } 3432 3433 void 3434 pmap_quick_remove_page(vm_offset_t addr) 3435 { 3436 } 3437 3438 /* 3439 * Returns true if the pmap's pv is one of the first 3440 * 16 pvs linked to from this page. This count may 3441 * be changed upwards or downwards in the future; it 3442 * is only necessary that true be returned for a small 3443 * subset of pmaps for proper page aging. 3444 */ 3445 boolean_t 3446 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3447 { 3448 struct md_page *pvh; 3449 struct rwlock *lock; 3450 pv_entry_t pv; 3451 int loops = 0; 3452 boolean_t rv; 3453 3454 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3455 ("pmap_page_exists_quick: page %p is not managed", m)); 3456 rv = FALSE; 3457 rw_rlock(&pvh_global_lock); 3458 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3459 rw_rlock(lock); 3460 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3461 if (PV_PMAP(pv) == pmap) { 3462 rv = TRUE; 3463 break; 3464 } 3465 loops++; 3466 if (loops >= 16) 3467 break; 3468 } 3469 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3470 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3471 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3472 if (PV_PMAP(pv) == pmap) { 3473 rv = TRUE; 3474 break; 3475 } 3476 loops++; 3477 if (loops >= 16) 3478 break; 3479 } 3480 } 3481 rw_runlock(lock); 3482 rw_runlock(&pvh_global_lock); 3483 return (rv); 3484 } 3485 3486 /* 3487 * pmap_page_wired_mappings: 3488 * 3489 * Return the number of managed mappings to the given physical page 3490 * that are wired. 
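 *
 * Both 4KB mappings (on the page's own pv list) and 2MB mappings (on
 * the pv list of the containing superpage) are counted; the scan
 * restarts whenever a pv generation count changes while the locks
 * are dropped and reacquired.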
3491 */ 3492 int 3493 pmap_page_wired_mappings(vm_page_t m) 3494 { 3495 struct md_page *pvh; 3496 struct rwlock *lock; 3497 pmap_t pmap; 3498 pd_entry_t *l2; 3499 pt_entry_t *l3; 3500 pv_entry_t pv; 3501 int count, md_gen, pvh_gen; 3502 3503 if ((m->oflags & VPO_UNMANAGED) != 0) 3504 return (0); 3505 rw_rlock(&pvh_global_lock); 3506 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3507 rw_rlock(lock); 3508 restart: 3509 count = 0; 3510 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3511 pmap = PV_PMAP(pv); 3512 if (!PMAP_TRYLOCK(pmap)) { 3513 md_gen = m->md.pv_gen; 3514 rw_runlock(lock); 3515 PMAP_LOCK(pmap); 3516 rw_rlock(lock); 3517 if (md_gen != m->md.pv_gen) { 3518 PMAP_UNLOCK(pmap); 3519 goto restart; 3520 } 3521 } 3522 l3 = pmap_l3(pmap, pv->pv_va); 3523 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3524 count++; 3525 PMAP_UNLOCK(pmap); 3526 } 3527 if ((m->flags & PG_FICTITIOUS) == 0) { 3528 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3529 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3530 pmap = PV_PMAP(pv); 3531 if (!PMAP_TRYLOCK(pmap)) { 3532 md_gen = m->md.pv_gen; 3533 pvh_gen = pvh->pv_gen; 3534 rw_runlock(lock); 3535 PMAP_LOCK(pmap); 3536 rw_rlock(lock); 3537 if (md_gen != m->md.pv_gen || 3538 pvh_gen != pvh->pv_gen) { 3539 PMAP_UNLOCK(pmap); 3540 goto restart; 3541 } 3542 } 3543 l2 = pmap_l2(pmap, pv->pv_va); 3544 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3545 count++; 3546 PMAP_UNLOCK(pmap); 3547 } 3548 } 3549 rw_runlock(lock); 3550 rw_runlock(&pvh_global_lock); 3551 return (count); 3552 } 3553 3554 /* 3555 * Returns true if the given page is mapped individually or as part of 3556 * a 2mpage. Otherwise, returns false. 3557 */ 3558 bool 3559 pmap_page_is_mapped(vm_page_t m) 3560 { 3561 struct rwlock *lock; 3562 bool rv; 3563 3564 if ((m->oflags & VPO_UNMANAGED) != 0) 3565 return (false); 3566 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3567 rw_rlock(lock); 3568 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3569 ((m->flags & PG_FICTITIOUS) == 0 && 3570 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3571 rw_runlock(lock); 3572 return (rv); 3573 } 3574 3575 static void 3576 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3577 struct spglist *free, bool superpage) 3578 { 3579 struct md_page *pvh; 3580 vm_page_t mpte, mt; 3581 3582 if (superpage) { 3583 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3584 pvh = pa_to_pvh(m->phys_addr); 3585 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3586 pvh->pv_gen++; 3587 if (TAILQ_EMPTY(&pvh->pv_list)) { 3588 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3589 if (TAILQ_EMPTY(&mt->md.pv_list) && 3590 (mt->a.flags & PGA_WRITEABLE) != 0) 3591 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3592 } 3593 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3594 if (mpte != NULL) { 3595 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3596 ("pmap_remove_pages: pte page not promoted")); 3597 pmap_resident_count_dec(pmap, 1); 3598 KASSERT(mpte->ref_count == Ln_ENTRIES, 3599 ("pmap_remove_pages: pte page ref count error")); 3600 mpte->ref_count = 0; 3601 pmap_add_delayed_free_list(mpte, free, FALSE); 3602 } 3603 } else { 3604 pmap_resident_count_dec(pmap, 1); 3605 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3606 m->md.pv_gen++; 3607 if (TAILQ_EMPTY(&m->md.pv_list) && 3608 (m->a.flags & PGA_WRITEABLE) != 0) { 3609 pvh = pa_to_pvh(m->phys_addr); 3610 if (TAILQ_EMPTY(&pvh->pv_list)) 3611 vm_page_aflag_clear(m, PGA_WRITEABLE); 3612 } 3613 } 3614 } 3615 3616 /* 3617 * Destroy all managed, non-wired mappings in the given user-space 3618 * pmap. This pmap cannot be active on any processor besides the 3619 * caller. 
3620 * 3621 * This function cannot be applied to the kernel pmap. Moreover, it 3622 * is not intended for general use. It is only to be used during 3623 * process termination. Consequently, it can be implemented in ways 3624 * that make it faster than pmap_remove(). First, it can more quickly 3625 * destroy mappings by iterating over the pmap's collection of PV 3626 * entries, rather than searching the page table. Second, it doesn't 3627 * have to test and clear the page table entries atomically, because 3628 * no processor is currently accessing the user address space. In 3629 * particular, a page table entry's dirty bit won't change state once 3630 * this function starts. 3631 */ 3632 void 3633 pmap_remove_pages(pmap_t pmap) 3634 { 3635 struct spglist free; 3636 pd_entry_t ptepde; 3637 pt_entry_t *pte, tpte; 3638 vm_page_t m, mt; 3639 pv_entry_t pv; 3640 struct pv_chunk *pc, *npc; 3641 struct rwlock *lock; 3642 int64_t bit; 3643 uint64_t inuse, bitmask; 3644 int allfree, field, freed, idx; 3645 bool superpage; 3646 3647 lock = NULL; 3648 3649 SLIST_INIT(&free); 3650 rw_rlock(&pvh_global_lock); 3651 PMAP_LOCK(pmap); 3652 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3653 allfree = 1; 3654 freed = 0; 3655 for (field = 0; field < _NPCM; field++) { 3656 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3657 while (inuse != 0) { 3658 bit = ffsl(inuse) - 1; 3659 bitmask = 1UL << bit; 3660 idx = field * 64 + bit; 3661 pv = &pc->pc_pventry[idx]; 3662 inuse &= ~bitmask; 3663 3664 pte = pmap_l1(pmap, pv->pv_va); 3665 ptepde = pmap_load(pte); 3666 pte = pmap_l1_to_l2(pte, pv->pv_va); 3667 tpte = pmap_load(pte); 3668 if ((tpte & PTE_RWX) != 0) { 3669 superpage = true; 3670 } else { 3671 ptepde = tpte; 3672 pte = pmap_l2_to_l3(pte, pv->pv_va); 3673 tpte = pmap_load(pte); 3674 superpage = false; 3675 } 3676 3677 /* 3678 * We cannot remove wired pages from a 3679 * process' mapping at this time. 3680 */ 3681 if (tpte & PTE_SW_WIRED) { 3682 allfree = 0; 3683 continue; 3684 } 3685 3686 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3687 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3688 m < &vm_page_array[vm_page_array_size], 3689 ("pmap_remove_pages: bad pte %#jx", 3690 (uintmax_t)tpte)); 3691 3692 pmap_clear(pte); 3693 3694 /* 3695 * Update the vm_page_t clean/reference bits. 
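 *
 * Only an entry that is both dirty and writable (PTE_D | PTE_W)
 * dirties the page here; for a superpage, every constituent vm_page
 * is marked dirty.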
3696 */ 3697 if ((tpte & (PTE_D | PTE_W)) == 3698 (PTE_D | PTE_W)) { 3699 if (superpage) 3700 for (mt = m; 3701 mt < &m[Ln_ENTRIES]; mt++) 3702 vm_page_dirty(mt); 3703 else 3704 vm_page_dirty(m); 3705 } 3706 3707 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3708 3709 /* Mark free */ 3710 pc->pc_map[field] |= bitmask; 3711 3712 pmap_remove_pages_pv(pmap, m, pv, &free, 3713 superpage); 3714 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3715 freed++; 3716 } 3717 } 3718 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3719 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3720 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3721 if (allfree) { 3722 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3723 free_pv_chunk(pc); 3724 } 3725 } 3726 if (lock != NULL) 3727 rw_wunlock(lock); 3728 pmap_invalidate_all(pmap); 3729 rw_runlock(&pvh_global_lock); 3730 PMAP_UNLOCK(pmap); 3731 vm_page_free_pages_toq(&free, false); 3732 } 3733 3734 static bool 3735 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3736 { 3737 struct md_page *pvh; 3738 struct rwlock *lock; 3739 pd_entry_t *l2; 3740 pt_entry_t *l3, mask; 3741 pv_entry_t pv; 3742 pmap_t pmap; 3743 int md_gen, pvh_gen; 3744 bool rv; 3745 3746 mask = 0; 3747 if (modified) 3748 mask |= PTE_D; 3749 if (accessed) 3750 mask |= PTE_A; 3751 3752 rv = FALSE; 3753 rw_rlock(&pvh_global_lock); 3754 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3755 rw_rlock(lock); 3756 restart: 3757 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3758 pmap = PV_PMAP(pv); 3759 if (!PMAP_TRYLOCK(pmap)) { 3760 md_gen = m->md.pv_gen; 3761 rw_runlock(lock); 3762 PMAP_LOCK(pmap); 3763 rw_rlock(lock); 3764 if (md_gen != m->md.pv_gen) { 3765 PMAP_UNLOCK(pmap); 3766 goto restart; 3767 } 3768 } 3769 l3 = pmap_l3(pmap, pv->pv_va); 3770 rv = (pmap_load(l3) & mask) == mask; 3771 PMAP_UNLOCK(pmap); 3772 if (rv) 3773 goto out; 3774 } 3775 if ((m->flags & PG_FICTITIOUS) == 0) { 3776 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3777 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3778 pmap = PV_PMAP(pv); 3779 if (!PMAP_TRYLOCK(pmap)) { 3780 md_gen = m->md.pv_gen; 3781 pvh_gen = pvh->pv_gen; 3782 rw_runlock(lock); 3783 PMAP_LOCK(pmap); 3784 rw_rlock(lock); 3785 if (md_gen != m->md.pv_gen || 3786 pvh_gen != pvh->pv_gen) { 3787 PMAP_UNLOCK(pmap); 3788 goto restart; 3789 } 3790 } 3791 l2 = pmap_l2(pmap, pv->pv_va); 3792 rv = (pmap_load(l2) & mask) == mask; 3793 PMAP_UNLOCK(pmap); 3794 if (rv) 3795 goto out; 3796 } 3797 } 3798 out: 3799 rw_runlock(lock); 3800 rw_runlock(&pvh_global_lock); 3801 return (rv); 3802 } 3803 3804 /* 3805 * pmap_is_modified: 3806 * 3807 * Return whether or not the specified physical page was modified 3808 * in any physical maps. 3809 */ 3810 boolean_t 3811 pmap_is_modified(vm_page_t m) 3812 { 3813 3814 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3815 ("pmap_is_modified: page %p is not managed", m)); 3816 3817 /* 3818 * If the page is not busied then this check is racy. 3819 */ 3820 if (!pmap_page_is_write_mapped(m)) 3821 return (FALSE); 3822 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3823 } 3824 3825 /* 3826 * pmap_is_prefaultable: 3827 * 3828 * Return whether or not the specified virtual address is eligible 3829 * for prefault. 
3830 */ 3831 boolean_t 3832 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3833 { 3834 pt_entry_t *l3; 3835 boolean_t rv; 3836 3837 rv = FALSE; 3838 PMAP_LOCK(pmap); 3839 l3 = pmap_l3(pmap, addr); 3840 if (l3 != NULL && pmap_load(l3) != 0) { 3841 rv = TRUE; 3842 } 3843 PMAP_UNLOCK(pmap); 3844 return (rv); 3845 } 3846 3847 /* 3848 * pmap_is_referenced: 3849 * 3850 * Return whether or not the specified physical page was referenced 3851 * in any physical maps. 3852 */ 3853 boolean_t 3854 pmap_is_referenced(vm_page_t m) 3855 { 3856 3857 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3858 ("pmap_is_referenced: page %p is not managed", m)); 3859 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3860 } 3861 3862 /* 3863 * Clear the write and modified bits in each of the given page's mappings. 3864 */ 3865 void 3866 pmap_remove_write(vm_page_t m) 3867 { 3868 struct md_page *pvh; 3869 struct rwlock *lock; 3870 pmap_t pmap; 3871 pd_entry_t *l2; 3872 pt_entry_t *l3, oldl3, newl3; 3873 pv_entry_t next_pv, pv; 3874 vm_offset_t va; 3875 int md_gen, pvh_gen; 3876 3877 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3878 ("pmap_remove_write: page %p is not managed", m)); 3879 vm_page_assert_busied(m); 3880 3881 if (!pmap_page_is_write_mapped(m)) 3882 return; 3883 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3884 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 3885 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3886 rw_rlock(&pvh_global_lock); 3887 retry_pv_loop: 3888 rw_wlock(lock); 3889 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3890 pmap = PV_PMAP(pv); 3891 if (!PMAP_TRYLOCK(pmap)) { 3892 pvh_gen = pvh->pv_gen; 3893 rw_wunlock(lock); 3894 PMAP_LOCK(pmap); 3895 rw_wlock(lock); 3896 if (pvh_gen != pvh->pv_gen) { 3897 PMAP_UNLOCK(pmap); 3898 rw_wunlock(lock); 3899 goto retry_pv_loop; 3900 } 3901 } 3902 va = pv->pv_va; 3903 l2 = pmap_l2(pmap, va); 3904 if ((pmap_load(l2) & PTE_W) != 0) 3905 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3906 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3907 ("inconsistent pv lock %p %p for page %p", 3908 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3909 PMAP_UNLOCK(pmap); 3910 } 3911 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3912 pmap = PV_PMAP(pv); 3913 if (!PMAP_TRYLOCK(pmap)) { 3914 pvh_gen = pvh->pv_gen; 3915 md_gen = m->md.pv_gen; 3916 rw_wunlock(lock); 3917 PMAP_LOCK(pmap); 3918 rw_wlock(lock); 3919 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3920 PMAP_UNLOCK(pmap); 3921 rw_wunlock(lock); 3922 goto retry_pv_loop; 3923 } 3924 } 3925 l3 = pmap_l3(pmap, pv->pv_va); 3926 oldl3 = pmap_load(l3); 3927 retry: 3928 if ((oldl3 & PTE_W) != 0) { 3929 newl3 = oldl3 & ~(PTE_D | PTE_W); 3930 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3931 goto retry; 3932 if ((oldl3 & PTE_D) != 0) 3933 vm_page_dirty(m); 3934 pmap_invalidate_page(pmap, pv->pv_va); 3935 } 3936 PMAP_UNLOCK(pmap); 3937 } 3938 rw_wunlock(lock); 3939 vm_page_aflag_clear(m, PGA_WRITEABLE); 3940 rw_runlock(&pvh_global_lock); 3941 } 3942 3943 /* 3944 * pmap_ts_referenced: 3945 * 3946 * Return a count of reference bits for a page, clearing those bits. 3947 * It is not necessary for every reference bit to be cleared, but it 3948 * is necessary that 0 only be returned when there are truly no 3949 * reference bits set. 3950 * 3951 * As an optimization, update the page's dirty field if a modified bit is 3952 * found while counting reference bits. This opportunistic update can be 3953 * performed at low cost and can eliminate the need for some future calls 3954 * to pmap_is_modified(). 
/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct spglist free;
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	pd_entry_t *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_paddr_t pa;
	vm_offset_t va;
	int cleared, md_gen, not_cleared, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pa = VM_PAGE_TO_PHYS(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);

	lock = PHYS_TO_PV_LIST_LOCK(pa);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
retry:
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		l2e = pmap_load(l2);
		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
			/*
			 * Although l2e is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((l2e & PTE_A) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB
			 * pages, it should not be cleared every time it is
			 * tested.  Apply a simple "hash" function on the
			 * physical page number, the virtual superpage number,
			 * and the pmap address to select one 4KB page out of
			 * the 512 on which testing the reference bit will
			 * result in clearing that reference bit.  This
			 * function is designed to avoid the selection of the
			 * same 4KB page for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (l2e & PTE_SW_WIRED) == 0) {
				pmap_clear_bits(l2, PTE_A);
				pmap_invalidate_page(pmap, va);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);

		KASSERT((pmap_load(l2) & PTE_RX) == 0,
		    ("pmap_ts_referenced: found an invalid l2 table"));

		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		l3e = pmap_load(l3);
		if ((l3e & PTE_D) != 0)
			vm_page_dirty(m);
		if ((l3e & PTE_A) != 0) {
			if ((l3e & PTE_SW_WIRED) == 0) {
				/*
				 * Wired pages cannot be paged out so
				 * doing accessed bit emulation for
				 * them is wasted effort.  We do the
				 * hard work for unwired pages only.
				 */
				pmap_clear_bits(l3, PTE_A);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	vm_page_free_pages_toq(&free, false);
	return (cleared + not_cleared);
}
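/*
 * Illustrative sketch of the superpage reference-bit selection used above
 * (hypothetical numbers, for exposition only): for a given 2MB mapping,
 * "pv->pv_va >> L2_SHIFT" and "(uintptr_t)pmap" are constants, while the
 * physical page numbers of the 512 constituent 4KB pages take 512
 * consecutive values and therefore cover every residue modulo Ln_ENTRIES
 * (512) exactly once.  If, say, the XOR of the two constants ends in 0x0a3,
 * then as pmap_ts_referenced() is applied to each constituent page, only
 * the page whose physical page number ends in 0x0a3 causes PTE_A to be
 * cleared for that mapping; the other 511 pages merely count the bit as
 * "not_cleared".  A different mapping of the same physical pages hashes to
 * a different constituent page, so repeated scans do not always penalize
 * the same 4KB page.
 */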
/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set.
	 * If the object containing the page is locked and the page is not
	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->a.flags & PGA_WRITEABLE) == 0)
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
restart:
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
		if ((oldl2 & PTE_W) != 0 &&
		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
		    (oldl2 & PTE_SW_WIRED) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 */
			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
			l3 = pmap_l2_to_l3(l2, va);
			pmap_clear_bits(l3, PTE_D | PTE_W);
			vm_page_dirty(m);
			pmap_invalidate_page(pmap, va);
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
		    ("pmap_clear_modify: found a 2mpage in page %p's pv list",
		    m));
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
			pmap_clear_bits(l3, PTE_D | PTE_W);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return ((void *)PHYS_TO_DMAP(pa));
}

void
pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
{
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pv_memattr = ma;
}
/*
 * Perform the pmap work for mincore(2).  If the page is not both referenced
 * and modified by this pmap, returns its physical address so that the caller
 * can find other mappings.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
	pt_entry_t *l2, *l3, tpte;
	vm_paddr_t pa;
	int val;
	bool managed;

	PMAP_LOCK(pmap);
	l2 = pmap_l2(pmap, addr);
	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
		if ((tpte & PTE_RWX) != 0) {
			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
			val = MINCORE_INCORE | MINCORE_SUPER;
		} else {
			l3 = pmap_l2_to_l3(l2, addr);
			tpte = pmap_load(l3);
			if ((tpte & PTE_V) == 0) {
				PMAP_UNLOCK(pmap);
				return (0);
			}
			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
			val = MINCORE_INCORE;
		}

		if ((tpte & PTE_D) != 0)
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((tpte & PTE_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
	} else {
		managed = false;
		val = 0;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
		*pap = pa;
	}
	PMAP_UNLOCK(pmap);
	return (val);
}
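/*
 * Illustrative userland sketch (not part of this file's kernel code): the
 * MINCORE_* bits assembled above are what a process sees through mincore(2),
 * one status byte per page.  A hypothetical caller could check the first
 * page of a region like this:
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_first_page(void *addr, size_t len, size_t pagesize)
 *	{
 *		char vec[(len + pagesize - 1) / pagesize];
 *
 *		if (mincore(addr, len, vec) == 0 &&
 *		    (vec[0] & MINCORE_INCORE) != 0)
 *			printf("resident%s\n", (vec[0] & MINCORE_SUPER) != 0 ?
 *			    ", via a superpage mapping" : "");
 *	}
 *
 * MINCORE_MODIFIED and MINCORE_REFERENCED reflect the PTE_D and PTE_A bits
 * tested above.
 */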
void
pmap_activate_sw(struct thread *td)
{
	pmap_t oldpmap, pmap;
	u_int hart;

	oldpmap = PCPU_GET(curpmap);
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (pmap == oldpmap)
		return;
	load_satp(pmap->pm_satp);

	hart = PCPU_GET(hart);
#ifdef SMP
	CPU_SET_ATOMIC(hart, &pmap->pm_active);
	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
#else
	CPU_SET(hart, &pmap->pm_active);
	CPU_CLR(hart, &oldpmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);

	sfence_vma();
}

void
pmap_activate(struct thread *td)
{

	critical_enter();
	pmap_activate_sw(td);
	critical_exit();
}

void
pmap_activate_boot(pmap_t pmap)
{
	u_int hart;

	hart = PCPU_GET(hart);
#ifdef SMP
	CPU_SET_ATOMIC(hart, &pmap->pm_active);
#else
	CPU_SET(hart, &pmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
	cpuset_t mask;

	/*
	 * From the RISC-V User-Level ISA V2.2:
	 *
	 * "To make a store to instruction memory visible to all
	 * RISC-V harts, the writing hart has to execute a data FENCE
	 * before requesting that all remote RISC-V harts execute a
	 * FENCE.I."
	 */
	sched_pin();
	mask = all_harts;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_fence_i(mask.__bits);
	sched_unpin();
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < L2_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L2_OFFSET;
	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
	    (*addr & L2_OFFSET) == superpage_offset)
		return;
	if ((*addr & L2_OFFSET) < superpage_offset)
		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
	else
		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}
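/*
 * A worked example of the adjustment above, with hypothetical numbers: with
 * 2MB superpages, L2_OFFSET is 0x1fffff.  Suppose the object offset yields
 * superpage_offset = 0x145000 and the caller proposed *addr = 0x80070000.
 * Since (*addr & L2_OFFSET) = 0x70000 is less than superpage_offset, the
 * address is advanced within the same 2MB region to
 * (0x80070000 & ~0x1fffff) + 0x145000 = 0x80145000.  The mapping's virtual
 * address and its object offset now agree modulo L2_SIZE, so any fully
 * resident, 2MB-aligned run of the object can later be mapped by an L2
 * superpage.
 */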
/**
 * Get the kernel virtual address of a set of physical pages.  If there are
 * physical addresses not covered by the DMAP perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page		The pages the caller wishes to obtain the virtual
 *			address on the kernel memory map.
 * \param vaddr		On return contains the kernel virtual memory address
 *			of the pages passed in the page parameter.
 * \param count		Number of pages passed in.
 * \param can_fault	TRUE if the thread using the mapped pages can take
 *			page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient when
 *	    finished or FALSE otherwise.
 *
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	int error, i;

	/*
	 * Allocate any KVA space that we need, this is done in a separate
	 * loop to prevent calling vmem_alloc while pinned.
	 */
	needs_mapping = FALSE;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = TRUE;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (FALSE);

	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= DMAP_MAX_PHYSADDR) {
			panic(
			    "pmap_map_io_transient: TODO: Map out of DMAP data");
		}
	}

	return (needs_mapping);
}

void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= DMAP_MAX_PHYSADDR) {
			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
		}
	}
}

boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK);
}

bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
    pt_entry_t **l3)
{
	pd_entry_t *l1p, *l2p;

	/* Get l1 directory entry. */
	l1p = pmap_l1(pmap, va);
	*l1 = l1p;

	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
		return (false);

	if ((pmap_load(l1p) & PTE_RX) != 0) {
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	/* Get l2 directory entry. */
	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
		return (false);

	if ((pmap_load(l2p) & PTE_RX) != 0) {
		*l3 = NULL;
		return (true);
	}

	/* Get l3 page table entry. */
	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}
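/*
 * A summary of pmap_get_tables()'s return convention, as implemented above
 * (not an additional contract): when it returns true, the level of the leaf
 * entry can be recovered from which output pointers were left NULL.  A
 * hypothetical caller might decode the result like this:
 *
 *	pd_entry_t *l1, *l2;
 *	pt_entry_t *l3;
 *
 *	if (pmap_get_tables(pmap, va, &l1, &l2, &l3)) {
 *		if (l2 == NULL)
 *			printf("1GB leaf in the L1 table\n");
 *		else if (l3 == NULL)
 *			printf("2MB leaf in the L2 table\n");
 *		else
 *			printf("4KB entry in the L3 table\n");
 *	}
 *
 * Note that the final case only hands back a pointer to the L3 entry; the
 * caller is responsible for checking PTE_V on it.
 */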
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int l3pages;
	int l2pages;
	int l1pages;
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{

	if (eva <= range->sva)
		return;

	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
	    range->l1pages, range->l2pages, range->l3pages);

	/* Reset to sentinel value. */
	range->sva = 0xfffffffffffffffful;
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{

	return (range->attrs == attrs);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
{
	pt_entry_t attrs;

	/* The PTE global bit is inherited by lower levels. */
	attrs = l1e & PTE_G;
	if ((l1e & PTE_RWX) != 0)
		attrs |= l1e & (PTE_RWX | PTE_U);
	else if (l2e != 0)
		attrs |= l2e & PTE_G;
	if ((l2e & PTE_RWX) != 0)
		attrs |= l2e & (PTE_RWX | PTE_U);
	else if (l3e != 0)
		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);

	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xfffffffffffffffful;

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Kernel page table pages are never freed, so at worst we will
	 * observe inconsistencies in the output.
	 */
	sva = VM_MIN_KERNEL_ADDRESS;
	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
			sbuf_printf(sb, "\nDirect map:\n");
		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
			sbuf_printf(sb, "\nKernel map:\n");

		l1e = kernel_pmap->pm_l1[i];
		if ((l1e & PTE_V) == 0) {
			sysctl_kmaps_dump(sb, &range, sva);
			sva += L1_SIZE;
			continue;
		}
		if ((l1e & PTE_RWX) != 0) {
			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
			range.l1pages++;
			sva += L1_SIZE;
			continue;
		}
		pa = PTE_TO_PHYS(l1e);
		l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
			l2e = l2[j];
			if ((l2e & PTE_V) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += L2_SIZE;
				continue;
			}
			if ((l2e & PTE_RWX) != 0) {
				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
				range.l2pages++;
				sva += L2_SIZE;
				continue;
			}
			pa = PTE_TO_PHYS(l2e);
			l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
			    sva += L3_SIZE) {
				l3e = l3[k];
				if ((l3e & PTE_V) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					continue;
				}
				sysctl_kmaps_check(sb, &range, sva,
				    l1e, l2e, l3e);
				range.l3pages++;
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
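/*
 * Usage sketch (illustrative): the layout produced by the handler above can
 * be read from userland with "sysctl vm.pmap.kernel_maps".  Each output line
 * follows the sbuf_printf() format in sysctl_kmaps_dump(): the virtual
 * address range, a permission field in which 'r' is printed unconditionally
 * followed by 'w' or '-', 'x' or '-', 'u' (user) or 's' (supervisor), and
 * 'g' or '-' (global), and then the number of 1GB (L1), 2MB (L2), and 4KB
 * (L3) pages making up the range, in that order.
 */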