/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 *	All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 *	All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 *	All rights reserved.
 * Copyright (c) 2014 The FreeBSD Foundation
 *	All rights reserved.
 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Portions of this software were developed by Andrew Turner under
 * sponsorship from The FreeBSD Foundation.
 *
 * Portions of this software were developed by SRI International and the
 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
 *
 * Portions of this software were developed by the University of Cambridge
 * Computer Laboratory as part of the CTSRD Project, with support from the
 * UK Higher Education Innovation Fund (HEIF).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

#define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
    "VM/pmap parameters");

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte)	((pte >> PTE_PPN0_S) * PAGE_SIZE)

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/* Distribute new kernel L1 entry to all the user pmaps */
	if (pmap != kernel_pmap)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_l1[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
    u_int *l2_slot)
{
	pt_entry_t *l2;
	pd_entry_t *l1;

	l1 = (pd_entry_t *)l1pt;
	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;

	/* Check locore has used a table L1 map */
	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
	    ("Invalid bootstrap L1 table"));

	/* Find the address of the L2 table */
	l2 = (pt_entry_t *)init_pt_va;
	*l2_slot = pmap_l2_index(va);

	return (l2);
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	u_int l1_slot, l2_slot;
	pt_entry_t *l2;
	vm_paddr_t ret;

	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);

	/* Check locore has used L2 superpages */
	KASSERT((l2[l2_slot] & PTE_RX) != 0,
	    ("Invalid bootstrap L2 table"));

	/* L2 is superpages */
	ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT;
	ret += (va & L2_OFFSET);

	return (ret);
}
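
/*
 * Illustrative note (assuming the standard Sv39 PTE layout, in which the
 * PPN fields are contiguous starting at bit PTE_PPN0_S == 10): a 4KB leaf
 * PTE translates as
 *
 *	pa = ((pte >> PTE_PPN0_S) << PAGE_SHIFT) | (va & L3_OFFSET)
 *
 * which is what PTE_TO_PHYS() above computes, while a 2MB (L2) leaf, as
 * read by pmap_early_vtophys(), translates as
 *
 *	pa = ((pte >> PTE_PPN1_S) << L2_SHIFT) | (va & L2_OFFSET)
 *
 * i.e. PPN[0] is ignored and the low 21 bits come from the virtual address.
 */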

static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
	vm_offset_t va;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;
	pt_entry_t entry;
	pn_t pn;

	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
	va = DMAP_MIN_ADDRESS;
	l1 = (pd_entry_t *)kern_l1;
	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);

	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		/* superpages */
		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l1[l1_slot], entry);
	}

	/* Set the upper limit of the DMAP region */
	dmap_phys_max = pa;
	dmap_max_addr = va;

	sfence_vma();
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	pt_entry_t entry;
	pd_entry_t *l2;
	vm_paddr_t pa;
	u_int l2_slot;
	pn_t pn;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pn = (pa / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l2[l2_slot], entry);
		l3pt += PAGE_SIZE;
	}


	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return (l3pt);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
{
	u_int l1_slot, l2_slot, avail_slot, map_slot;
	vm_offset_t freemempos;
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t end, max_pa, min_pa, pa, start;
	int i;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
	printf("%lx\n", l1pt);
	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
	PMAP_LOCK_INIT(kernel_pmap);

	rw_init(&pvh_global_lock, "pmap pv global");

	CPU_FILL(&kernel_pmap->pm_active);

	/* Assume the address we were loaded to is a valid physical address. */
	min_pa = max_pa = kernstart;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
		if (physmap[i + 1] > max_pa)
			max_pa = physmap[i + 1];
	}
	printf("physmap_idx %lx\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);

	/*
	 * Read the page table to find out what is already mapped.
	 * This assumes we have mapped a block of memory from KERNBASE
	 * using a single L1 entry.
	 */
	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);

	/* Sanity check the index, KERNBASE should be the first VA */
	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));

	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);

	/* Create the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);

	sfence_vma();

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	virtual_avail = roundup2(freemempos, L2_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(l1pt, freemempos);

	/* Initialize phys_avail and dump_avail. */
	for (avail_slot = map_slot = physmem = 0; map_slot < physmap_idx * 2;
	    map_slot += 2) {
		start = physmap[map_slot];
		end = physmap[map_slot + 1];

		if (start == end)
			continue;
		dump_avail[map_slot] = start;
		dump_avail[map_slot + 1] = end;

		if (start >= kernstart && end <= pa)
			continue;

		if (start < kernstart && end > kernstart)
			end = kernstart;
		else if (start < pa && end > pa)
			start = pa;
		phys_avail[avail_slot] = start;
		phys_avail[avail_slot + 1] = end;
		physmem += (end - start) >> PAGE_SHIFT;
		avail_slot += 2;

		if (end != physmap[map_slot + 1] && end > pa) {
			phys_avail[avail_slot] = pa;
			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
			physmem += (physmap[map_slot + 1] - pa) >> PAGE_SHIFT;
			avail_slot += 2;
		}
	}
	phys_avail[avail_slot] = 0;
	phys_avail[avail_slot + 1] = 0;

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".
	 */
	Maxmem = atop(phys_avail[avail_slot - 1]);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the pv chunk and pmap list mutexes.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	if (superpages_enabled)
		pagesizes[1] = L2_SIZE;
}

#ifdef SMP
/*
 * For SMP, these functions have to use IPIs for coherence.
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(cpuid), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(cpuid), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(cpuid), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence.  BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p, l3;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Start with the l2 table.  We are unable to allocate
	 * pages in the l1 table.
	 */
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL) {
		l2 = pmap_load(l2p);
		if ((l2 & PTE_RX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				l3 = pmap_load(l3p);
				pa = PTE_TO_PHYS(l3);
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* L2 is superpages */
			pa = (l2 >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_paddr_t pa;
	vm_page_t m;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			if (vm_page_pa_tryrelock(pmap, phys, &pa))
				goto retry;
			m = PHYS_TO_VM_PAGE(phys);
			vm_page_hold(m);
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		if ((pmap_load(l2) & PTE_RX) != 0) {
			/* superpages */
			pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(l2, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
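 * Only the local hart's TLB is flushed (a plain sfence.vma); callers that
 * need cross-CPU coherence must handle remote invalidation themselves.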
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *l3;

	l3 = pmap_l3(kernel_pmap, va);
	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));

	pmap_clear(l3);
	sfence_vma();
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
		pmap_clear(l3);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}

	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping. Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged. Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{

	return PHYS_TO_DMAP(start);
}


/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *l3, pa;
	vm_offset_t va;
	vm_page_t m;
	pt_entry_t entry;
	pn_t pn;
	int i;

	va = sva;
	for (i = 0; i < count; i++) {
		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m);
		pn = (pa / PAGE_SIZE);
		l3 = pmap_l3(kernel_pmap, va);

		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));

	for (va = sva; count-- > 0; va += PAGE_SIZE) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
		pmap_clear(l3);
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_insert(&pmap->pm_root, ml3));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Decrements a page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->wire_count;
	if (m->wire_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else {
		return (FALSE);
	}
}

static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	vm_paddr_t phys;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (m->pindex >= NUL1E) {
		pd_entry_t *l1;
		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
	} else {
		pd_entry_t *l2;
		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL1E) {
		pd_entry_t *l1;
		vm_page_t pdpg;

		l1 = pmap_l1(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l1));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	}
	pmap_invalidate_page(pmap, va);

	vm_wire_sub(1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_l1 = kernel_pmap->pm_l1;
	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
	CPU_ZERO(&pmap->pm_active);
	pmap_activate_boot(pmap);
}

int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t l1phys;
	vm_page_t l1pt;

	/*
	 * allocate the l1 page
	 */
	while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		vm_wait(NULL);

	l1phys = VM_PAGE_TO_PHYS(l1pt);
	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys);
	pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT);

	if ((l1pt->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_l1);

	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	CPU_ZERO(&pmap->pm_active);

	/* Install kernel pagetables */
	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);

	/* Add to the list of all user pmaps */
	mtx_lock(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	vm_radix_init(&pmap->pm_root);

	return (1);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, /*pdppg, */pdpg;
	pt_entry_t entry;
	vm_paddr_t phys;
	pn_t pn;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			vm_wait(NULL);
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}

	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= NUL1E) {
		pd_entry_t *l1;
		vm_pindex_t l1index;

		l1index = ptepindex - NUL1E;
		l1 = &pmap->pm_l1[l1index];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l1, entry);
		pmap_distribute_l1(pmap, l1index, entry);
	} else {
		vm_pindex_t l1index;
		pd_entry_t *l1, *l2;

		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
		l1 = &pmap->pm_l1[l1index];
		if (pmap_load(l1) == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			phys = PTE_TO_PHYS(pmap_load(l1));
			pdpg = PHYS_TO_VM_PAGE(phys);
			pdpg->wire_count++;
		}

		phys = PTE_TO_PHYS(pmap_load(l1));
		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

static vm_page_t
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	pd_entry_t *l1;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
		/* Add a reference to the L2 page. */
		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
		l2pg->wire_count++;
	} else {
		/* Allocate a L2 page. */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL && lockp != NULL)
			goto retry;
	}
	return (l2pg);
}

static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *l2;
	vm_paddr_t phys;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	l2 = pmap_l2(pmap, va);

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (l2 != NULL && pmap_load(l2) != 0) {
		phys = PTE_TO_PHYS(pmap_load(l2));
		m = PHYS_TO_VM_PAGE(phys);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}


/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	mtx_lock(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1));
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

#if 0
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "LU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "LU", "Amount of KVM free");
#endif /* 0 */

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l1, *l2;
	pt_entry_t entry;
	pn_t pn;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new L1 entry */
			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(nkpg);
			paddr = VM_PAGE_TO_PHYS(nkpg);

			pn = (paddr / PAGE_SIZE);
			entry = (PTE_V);
			entry |= (pn << PTE_PPN0_S);
			pmap_store(l1, entry);
			pmap_distribute_l1(kernel_pmap,
			    pmap_l1_index(kernel_vm_end), entry);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & PTE_V) != 0 &&
		    (pmap_load(l2) & PTE_RWX) == 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		if ((nkpg->flags & PG_ZERO) == 0) {
			pmap_zero_page(nkpg);
		}
		paddr = VM_PAGE_TO_PHYS(nkpg);

		pn = (paddr / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);

		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}


/***************************************************
 * page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{

	panic("RISCVTODO: reclaim_pv_chunk");
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire(m, PQ_NONE);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
			    pc->pc_map[2] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	pc->pc_map[2] = PC_FREE2;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	TAILQ_INIT(&new_tail);
retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		/* XXX PV STATS */
#if 0
		dump_add_page(m->phys_addr);
#endif
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREE0;
		pc->pc_map[1] = PC_FREE1;
		pc->pc_map[2] = PC_FREE2;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	if (!TAILQ_EMPTY(&new_tail)) {
		mtx_lock(&pv_chunks_mutex);
		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.  This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);

	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void __unused
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_page_t m;
	vm_offset_t va_last;
	int bit, field;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va &= ~L2_OFFSET;
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining 511 pv entries. */
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field] != 0) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
				    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	/* XXX PV stats */
}

#if VM_NRESERVLEVEL > 0
static void
pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_page_t m;
	vm_offset_t va_last;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	KASSERT((va & L2_OFFSET) == 0,
	    ("pmap_pv_promote_l2: misaligned va %#lx", va));

	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	m = PHYS_TO_VM_PAGE(pa);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;

	va_last = va + L2_SIZE - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
PAGE_SIZE; 1952 pmap_pvh_free(&m->md, pmap, va); 1953 } while (va < va_last); 1954 } 1955 #endif /* VM_NRESERVLEVEL > 0 */ 1956 1957 /* 1958 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1959 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1960 * false if the PV entry cannot be allocated without resorting to reclamation. 1961 */ 1962 static bool 1963 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1964 struct rwlock **lockp) 1965 { 1966 struct md_page *pvh; 1967 pv_entry_t pv; 1968 vm_paddr_t pa; 1969 1970 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1971 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1972 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1973 NULL : lockp)) == NULL) 1974 return (false); 1975 pv->pv_va = va; 1976 pa = PTE_TO_PHYS(l2e); 1977 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1978 pvh = pa_to_pvh(pa); 1979 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1980 pvh->pv_gen++; 1981 return (true); 1982 } 1983 1984 static void 1985 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1986 { 1987 pt_entry_t newl2, oldl2; 1988 vm_page_t ml3; 1989 vm_paddr_t ml3pa; 1990 1991 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1992 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1993 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1994 1995 ml3 = pmap_remove_pt_page(pmap, va); 1996 if (ml3 == NULL) 1997 panic("pmap_remove_kernel_l2: Missing pt page"); 1998 1999 ml3pa = VM_PAGE_TO_PHYS(ml3); 2000 newl2 = ml3pa | PTE_V; 2001 2002 /* 2003 * Initialize the page table page. 2004 */ 2005 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2006 2007 /* 2008 * Demote the mapping. 2009 */ 2010 oldl2 = pmap_load_store(l2, newl2); 2011 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2012 __func__, l2, oldl2)); 2013 } 2014 2015 /* 2016 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2017 */ 2018 static int 2019 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2020 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2021 { 2022 struct md_page *pvh; 2023 pt_entry_t oldl2; 2024 vm_offset_t eva, va; 2025 vm_page_t m, ml3; 2026 2027 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2028 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2029 oldl2 = pmap_load_clear(l2); 2030 KASSERT((oldl2 & PTE_RWX) != 0, 2031 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2032 2033 /* 2034 * The sfence.vma documentation states that it is sufficient to specify 2035 * a single address within a superpage mapping. However, since we do 2036 * not perform any invalidation upon promotion, TLBs may still be 2037 * caching 4KB mappings within the superpage, so we must invalidate the 2038 * entire range. 
2039 */ 2040 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2041 if ((oldl2 & PTE_SW_WIRED) != 0) 2042 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2043 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2044 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2045 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2046 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2047 pmap_pvh_free(pvh, pmap, sva); 2048 eva = sva + L2_SIZE; 2049 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2050 va < eva; va += PAGE_SIZE, m++) { 2051 if ((oldl2 & PTE_D) != 0) 2052 vm_page_dirty(m); 2053 if ((oldl2 & PTE_A) != 0) 2054 vm_page_aflag_set(m, PGA_REFERENCED); 2055 if (TAILQ_EMPTY(&m->md.pv_list) && 2056 TAILQ_EMPTY(&pvh->pv_list)) 2057 vm_page_aflag_clear(m, PGA_WRITEABLE); 2058 } 2059 } 2060 if (pmap == kernel_pmap) { 2061 pmap_remove_kernel_l2(pmap, l2, sva); 2062 } else { 2063 ml3 = pmap_remove_pt_page(pmap, sva); 2064 if (ml3 != NULL) { 2065 pmap_resident_count_dec(pmap, 1); 2066 KASSERT(ml3->wire_count == Ln_ENTRIES, 2067 ("pmap_remove_l2: l3 page wire count error")); 2068 ml3->wire_count = 1; 2069 vm_page_unwire_noq(ml3); 2070 pmap_add_delayed_free_list(ml3, free, FALSE); 2071 } 2072 } 2073 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2074 } 2075 2076 /* 2077 * pmap_remove_l3: do the things to unmap a page in a process 2078 */ 2079 static int 2080 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2081 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2082 { 2083 pt_entry_t old_l3; 2084 vm_paddr_t phys; 2085 vm_page_t m; 2086 2087 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2088 old_l3 = pmap_load_clear(l3); 2089 pmap_invalidate_page(pmap, va); 2090 if (old_l3 & PTE_SW_WIRED) 2091 pmap->pm_stats.wired_count -= 1; 2092 pmap_resident_count_dec(pmap, 1); 2093 if (old_l3 & PTE_SW_MANAGED) { 2094 phys = PTE_TO_PHYS(old_l3); 2095 m = PHYS_TO_VM_PAGE(phys); 2096 if ((old_l3 & PTE_D) != 0) 2097 vm_page_dirty(m); 2098 if (old_l3 & PTE_A) 2099 vm_page_aflag_set(m, PGA_REFERENCED); 2100 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2101 pmap_pvh_free(&m->md, pmap, va); 2102 } 2103 2104 return (pmap_unuse_pt(pmap, va, l2e, free)); 2105 } 2106 2107 /* 2108 * Remove the given range of addresses from the specified map. 2109 * 2110 * It is assumed that the start and end are properly 2111 * rounded to the page size. 2112 */ 2113 void 2114 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2115 { 2116 struct spglist free; 2117 struct rwlock *lock; 2118 vm_offset_t va, va_next; 2119 pd_entry_t *l1, *l2, l2e; 2120 pt_entry_t *l3; 2121 2122 /* 2123 * Perform an unsynchronized read. This is, however, safe. 2124 */ 2125 if (pmap->pm_stats.resident_count == 0) 2126 return; 2127 2128 SLIST_INIT(&free); 2129 2130 rw_rlock(&pvh_global_lock); 2131 PMAP_LOCK(pmap); 2132 2133 lock = NULL; 2134 for (; sva < eva; sva = va_next) { 2135 if (pmap->pm_stats.resident_count == 0) 2136 break; 2137 2138 l1 = pmap_l1(pmap, sva); 2139 if (pmap_load(l1) == 0) { 2140 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2141 if (va_next < sva) 2142 va_next = eva; 2143 continue; 2144 } 2145 2146 /* 2147 * Calculate index for next page table. 
2148 */ 2149 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2150 if (va_next < sva) 2151 va_next = eva; 2152 2153 l2 = pmap_l1_to_l2(l1, sva); 2154 if (l2 == NULL) 2155 continue; 2156 if ((l2e = pmap_load(l2)) == 0) 2157 continue; 2158 if ((l2e & PTE_RWX) != 0) { 2159 if (sva + L2_SIZE == va_next && eva >= va_next) { 2160 (void)pmap_remove_l2(pmap, l2, sva, 2161 pmap_load(l1), &free, &lock); 2162 continue; 2163 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2164 &lock)) { 2165 /* 2166 * The large page mapping was destroyed. 2167 */ 2168 continue; 2169 } 2170 l2e = pmap_load(l2); 2171 } 2172 2173 /* 2174 * Limit our scan to either the end of the va represented 2175 * by the current page table page, or to the end of the 2176 * range being removed. 2177 */ 2178 if (va_next > eva) 2179 va_next = eva; 2180 2181 va = va_next; 2182 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2183 sva += L3_SIZE) { 2184 if (pmap_load(l3) == 0) { 2185 if (va != va_next) { 2186 pmap_invalidate_range(pmap, va, sva); 2187 va = va_next; 2188 } 2189 continue; 2190 } 2191 if (va == va_next) 2192 va = sva; 2193 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2194 sva += L3_SIZE; 2195 break; 2196 } 2197 } 2198 if (va != va_next) 2199 pmap_invalidate_range(pmap, va, sva); 2200 } 2201 if (lock != NULL) 2202 rw_wunlock(lock); 2203 rw_runlock(&pvh_global_lock); 2204 PMAP_UNLOCK(pmap); 2205 vm_page_free_pages_toq(&free, false); 2206 } 2207 2208 /* 2209 * Routine: pmap_remove_all 2210 * Function: 2211 * Removes this physical page from 2212 * all physical maps in which it resides. 2213 * Reflects back modify bits to the pager. 2214 * 2215 * Notes: 2216 * Original versions of this routine were very 2217 * inefficient because they iteratively called 2218 * pmap_remove (slow...) 2219 */ 2220 2221 void 2222 pmap_remove_all(vm_page_t m) 2223 { 2224 struct spglist free; 2225 struct md_page *pvh; 2226 pmap_t pmap; 2227 pt_entry_t *l3, l3e; 2228 pd_entry_t *l2, l2e; 2229 pv_entry_t pv; 2230 vm_offset_t va; 2231 2232 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2233 ("pmap_remove_all: page %p is not managed", m)); 2234 SLIST_INIT(&free); 2235 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2236 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2237 2238 rw_wlock(&pvh_global_lock); 2239 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2240 pmap = PV_PMAP(pv); 2241 PMAP_LOCK(pmap); 2242 va = pv->pv_va; 2243 l2 = pmap_l2(pmap, va); 2244 (void)pmap_demote_l2(pmap, l2, va); 2245 PMAP_UNLOCK(pmap); 2246 } 2247 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2248 pmap = PV_PMAP(pv); 2249 PMAP_LOCK(pmap); 2250 pmap_resident_count_dec(pmap, 1); 2251 l2 = pmap_l2(pmap, pv->pv_va); 2252 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2253 l2e = pmap_load(l2); 2254 2255 KASSERT((l2e & PTE_RX) == 0, 2256 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2257 2258 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2259 l3e = pmap_load_clear(l3); 2260 pmap_invalidate_page(pmap, pv->pv_va); 2261 if (l3e & PTE_SW_WIRED) 2262 pmap->pm_stats.wired_count--; 2263 if ((l3e & PTE_A) != 0) 2264 vm_page_aflag_set(m, PGA_REFERENCED); 2265 2266 /* 2267 * Update the vm_page_t clean and reference bits. 
2268 */ 2269 if ((l3e & PTE_D) != 0) 2270 vm_page_dirty(m); 2271 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2272 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2273 m->md.pv_gen++; 2274 free_pv_entry(pmap, pv); 2275 PMAP_UNLOCK(pmap); 2276 } 2277 vm_page_aflag_clear(m, PGA_WRITEABLE); 2278 rw_wunlock(&pvh_global_lock); 2279 vm_page_free_pages_toq(&free, false); 2280 } 2281 2282 /* 2283 * Set the physical protection on the 2284 * specified range of this map as requested. 2285 */ 2286 void 2287 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2288 { 2289 pd_entry_t *l1, *l2, l2e; 2290 pt_entry_t *l3, l3e, mask; 2291 vm_page_t m; 2292 vm_paddr_t pa; 2293 vm_offset_t va, va_next; 2294 bool anychanged, pv_lists_locked; 2295 2296 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2297 pmap_remove(pmap, sva, eva); 2298 return; 2299 } 2300 2301 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2302 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2303 return; 2304 2305 anychanged = false; 2306 pv_lists_locked = false; 2307 mask = 0; 2308 if ((prot & VM_PROT_WRITE) == 0) 2309 mask |= PTE_W | PTE_D; 2310 if ((prot & VM_PROT_EXECUTE) == 0) 2311 mask |= PTE_X; 2312 resume: 2313 PMAP_LOCK(pmap); 2314 for (; sva < eva; sva = va_next) { 2315 l1 = pmap_l1(pmap, sva); 2316 if (pmap_load(l1) == 0) { 2317 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2318 if (va_next < sva) 2319 va_next = eva; 2320 continue; 2321 } 2322 2323 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2324 if (va_next < sva) 2325 va_next = eva; 2326 2327 l2 = pmap_l1_to_l2(l1, sva); 2328 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2329 continue; 2330 if ((l2e & PTE_RWX) != 0) { 2331 if (sva + L2_SIZE == va_next && eva >= va_next) { 2332 retryl2: 2333 if ((l2e & (PTE_SW_MANAGED | PTE_D)) == 2334 (PTE_SW_MANAGED | PTE_D)) { 2335 pa = PTE_TO_PHYS(l2e); 2336 for (va = sva, m = PHYS_TO_VM_PAGE(pa); 2337 va < va_next; m++, va += PAGE_SIZE) 2338 vm_page_dirty(m); 2339 } 2340 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2341 goto retryl2; 2342 anychanged = true; 2343 } else { 2344 if (!pv_lists_locked) { 2345 pv_lists_locked = true; 2346 if (!rw_try_rlock(&pvh_global_lock)) { 2347 if (anychanged) 2348 pmap_invalidate_all( 2349 pmap); 2350 PMAP_UNLOCK(pmap); 2351 rw_rlock(&pvh_global_lock); 2352 goto resume; 2353 } 2354 } 2355 if (!pmap_demote_l2(pmap, l2, sva)) { 2356 /* 2357 * The large page mapping was destroyed. 
2358 */ 2359 continue; 2360 } 2361 } 2362 } 2363 2364 if (va_next > eva) 2365 va_next = eva; 2366 2367 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2368 sva += L3_SIZE) { 2369 l3e = pmap_load(l3); 2370 retryl3: 2371 if ((l3e & PTE_V) == 0) 2372 continue; 2373 if ((prot & VM_PROT_WRITE) == 0 && 2374 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2375 (PTE_SW_MANAGED | PTE_D)) { 2376 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2377 vm_page_dirty(m); 2378 } 2379 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2380 goto retryl3; 2381 anychanged = true; 2382 } 2383 } 2384 if (anychanged) 2385 pmap_invalidate_all(pmap); 2386 if (pv_lists_locked) 2387 rw_runlock(&pvh_global_lock); 2388 PMAP_UNLOCK(pmap); 2389 } 2390 2391 int 2392 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2393 { 2394 pd_entry_t *l2, l2e; 2395 pt_entry_t bits, *pte, oldpte; 2396 int rv; 2397 2398 rv = 0; 2399 PMAP_LOCK(pmap); 2400 l2 = pmap_l2(pmap, va); 2401 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2402 goto done; 2403 if ((l2e & PTE_RWX) == 0) { 2404 pte = pmap_l2_to_l3(l2, va); 2405 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2406 goto done; 2407 } else { 2408 pte = l2; 2409 oldpte = l2e; 2410 } 2411 2412 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2413 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2414 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2415 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2416 goto done; 2417 2418 bits = PTE_A; 2419 if (ftype == VM_PROT_WRITE) 2420 bits |= PTE_D; 2421 2422 /* 2423 * Spurious faults can occur if the implementation caches invalid 2424 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2425 * race with each other. 2426 */ 2427 if ((oldpte & bits) != bits) 2428 pmap_store_bits(pte, bits); 2429 sfence_vma(); 2430 rv = 1; 2431 done: 2432 PMAP_UNLOCK(pmap); 2433 return (rv); 2434 } 2435 2436 static bool 2437 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2438 { 2439 struct rwlock *lock; 2440 bool rv; 2441 2442 lock = NULL; 2443 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2444 if (lock != NULL) 2445 rw_wunlock(lock); 2446 return (rv); 2447 } 2448 2449 /* 2450 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2451 * mapping is invalidated. 2452 */ 2453 static bool 2454 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2455 struct rwlock **lockp) 2456 { 2457 struct spglist free; 2458 vm_page_t mpte; 2459 pd_entry_t newl2, oldl2; 2460 pt_entry_t *firstl3, newl3; 2461 vm_paddr_t mptepa; 2462 int i; 2463 2464 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2465 2466 oldl2 = pmap_load(l2); 2467 KASSERT((oldl2 & PTE_RWX) != 0, 2468 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2469 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2470 NULL) { 2471 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, 2472 pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ?
VM_ALLOC_INTERRUPT : 2473 VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == 2474 NULL) { 2475 SLIST_INIT(&free); 2476 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2477 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2478 vm_page_free_pages_toq(&free, true); 2479 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2480 "failure for va %#lx in pmap %p", va, pmap); 2481 return (false); 2482 } 2483 if (va < VM_MAXUSER_ADDRESS) 2484 pmap_resident_count_inc(pmap, 1); 2485 } 2486 mptepa = VM_PAGE_TO_PHYS(mpte); 2487 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2488 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2489 KASSERT((oldl2 & PTE_A) != 0, 2490 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2491 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2492 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2493 newl3 = oldl2; 2494 2495 /* 2496 * If the page table page is new, initialize it. 2497 */ 2498 if (mpte->wire_count == 1) { 2499 mpte->wire_count = Ln_ENTRIES; 2500 for (i = 0; i < Ln_ENTRIES; i++) 2501 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2502 } 2503 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2504 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2505 "addresses")); 2506 2507 /* 2508 * If the mapping has changed attributes, update the page table 2509 * entries. 2510 */ 2511 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2512 for (i = 0; i < Ln_ENTRIES; i++) 2513 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2514 2515 /* 2516 * The spare PV entries must be reserved prior to demoting the 2517 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2518 * state of the L2 entry and the PV lists will be inconsistent, which 2519 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2520 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2521 * expected PV entry for the 2MB page mapping that is being demoted. 2522 */ 2523 if ((oldl2 & PTE_SW_MANAGED) != 0) 2524 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2525 2526 /* 2527 * Demote the mapping. 2528 */ 2529 pmap_store(l2, newl2); 2530 2531 /* 2532 * Demote the PV entry. 
2533 */ 2534 if ((oldl2 & PTE_SW_MANAGED) != 0) 2535 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2536 2537 atomic_add_long(&pmap_l2_demotions, 1); 2538 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2539 va, pmap); 2540 return (true); 2541 } 2542 2543 #if VM_NRESERVLEVEL > 0 2544 static void 2545 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2546 struct rwlock **lockp) 2547 { 2548 pt_entry_t *firstl3, *l3; 2549 vm_paddr_t pa; 2550 vm_page_t ml3; 2551 2552 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2553 2554 va &= ~L2_OFFSET; 2555 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2556 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2557 2558 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2559 pa = PTE_TO_PHYS(pmap_load(firstl3)); 2560 if ((pa & L2_OFFSET) != 0) { 2561 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2562 va, pmap); 2563 atomic_add_long(&pmap_l2_p_failures, 1); 2564 return; 2565 } 2566 2567 pa += PAGE_SIZE; 2568 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2569 if (PTE_TO_PHYS(pmap_load(l3)) != pa) { 2570 CTR2(KTR_PMAP, 2571 "pmap_promote_l2: failure for va %#lx pmap %p", 2572 va, pmap); 2573 atomic_add_long(&pmap_l2_p_failures, 1); 2574 return; 2575 } 2576 if ((pmap_load(l3) & PTE_PROMOTE) != 2577 (pmap_load(firstl3) & PTE_PROMOTE)) { 2578 CTR2(KTR_PMAP, 2579 "pmap_promote_l2: failure for va %#lx pmap %p", 2580 va, pmap); 2581 atomic_add_long(&pmap_l2_p_failures, 1); 2582 return; 2583 } 2584 pa += PAGE_SIZE; 2585 } 2586 2587 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2588 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2589 ("pmap_promote_l2: page table page's pindex is wrong")); 2590 if (pmap_insert_pt_page(pmap, ml3)) { 2591 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2592 va, pmap); 2593 atomic_add_long(&pmap_l2_p_failures, 1); 2594 return; 2595 } 2596 2597 if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) 2598 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), 2599 lockp); 2600 2601 pmap_store(l2, pmap_load(firstl3)); 2602 2603 atomic_add_long(&pmap_l2_promotions, 1); 2604 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2605 pmap); 2606 } 2607 #endif 2608 2609 /* 2610 * Insert the given physical page (p) at 2611 * the specified virtual address (v) in the 2612 * target physical map with the protection requested. 2613 * 2614 * If specified, the page will be wired down, meaning 2615 * that the related pte can not be reclaimed. 2616 * 2617 * NB: This is the only routine which MAY NOT lazy-evaluate 2618 * or lose information. That is, this routine must actually 2619 * insert this page into the given map NOW. 
2620 */ 2621 int 2622 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2623 u_int flags, int8_t psind) 2624 { 2625 struct rwlock *lock; 2626 pd_entry_t *l1, *l2, l2e; 2627 pt_entry_t new_l3, orig_l3; 2628 pt_entry_t *l3; 2629 pv_entry_t pv; 2630 vm_paddr_t opa, pa, l2_pa, l3_pa; 2631 vm_page_t mpte, om, l2_m, l3_m; 2632 pt_entry_t entry; 2633 pn_t l2_pn, l3_pn, pn; 2634 int rv; 2635 bool nosleep; 2636 2637 va = trunc_page(va); 2638 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 2639 VM_OBJECT_ASSERT_LOCKED(m->object); 2640 pa = VM_PAGE_TO_PHYS(m); 2641 pn = (pa / PAGE_SIZE); 2642 2643 new_l3 = PTE_V | PTE_R | PTE_A; 2644 if (prot & VM_PROT_EXECUTE) 2645 new_l3 |= PTE_X; 2646 if (flags & VM_PROT_WRITE) 2647 new_l3 |= PTE_D; 2648 if (prot & VM_PROT_WRITE) 2649 new_l3 |= PTE_W; 2650 if (va < VM_MAX_USER_ADDRESS) 2651 new_l3 |= PTE_U; 2652 2653 new_l3 |= (pn << PTE_PPN0_S); 2654 if ((flags & PMAP_ENTER_WIRED) != 0) 2655 new_l3 |= PTE_SW_WIRED; 2656 2657 /* 2658 * Set modified bit gratuitously for writeable mappings if 2659 * the page is unmanaged. We do not want to take a fault 2660 * to do the dirty bit accounting for these mappings. 2661 */ 2662 if ((m->oflags & VPO_UNMANAGED) != 0) { 2663 if (prot & VM_PROT_WRITE) 2664 new_l3 |= PTE_D; 2665 } else 2666 new_l3 |= PTE_SW_MANAGED; 2667 2668 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2669 2670 lock = NULL; 2671 mpte = NULL; 2672 rw_rlock(&pvh_global_lock); 2673 PMAP_LOCK(pmap); 2674 if (psind == 1) { 2675 /* Assert the required virtual and physical alignment. */ 2676 KASSERT((va & L2_OFFSET) == 0, 2677 ("pmap_enter: va %#lx unaligned", va)); 2678 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2679 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2680 goto out; 2681 } 2682 2683 l2 = pmap_l2(pmap, va); 2684 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2685 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2686 va, &lock))) { 2687 l3 = pmap_l2_to_l3(l2, va); 2688 if (va < VM_MAXUSER_ADDRESS) { 2689 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2690 mpte->wire_count++; 2691 } 2692 } else if (va < VM_MAXUSER_ADDRESS) { 2693 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2694 mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); 2695 if (mpte == NULL && nosleep) { 2696 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2697 if (lock != NULL) 2698 rw_wunlock(lock); 2699 rw_runlock(&pvh_global_lock); 2700 PMAP_UNLOCK(pmap); 2701 return (KERN_RESOURCE_SHORTAGE); 2702 } 2703 l3 = pmap_l3(pmap, va); 2704 } else { 2705 l3 = pmap_l3(pmap, va); 2706 /* TODO: This is not optimal, but should mostly work */ 2707 if (l3 == NULL) { 2708 if (l2 == NULL) { 2709 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2710 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2711 VM_ALLOC_ZERO); 2712 if (l2_m == NULL) 2713 panic("pmap_enter: l2 pte_m == NULL"); 2714 if ((l2_m->flags & PG_ZERO) == 0) 2715 pmap_zero_page(l2_m); 2716 2717 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2718 l2_pn = (l2_pa / PAGE_SIZE); 2719 2720 l1 = pmap_l1(pmap, va); 2721 entry = (PTE_V); 2722 entry |= (l2_pn << PTE_PPN0_S); 2723 pmap_store(l1, entry); 2724 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2725 l2 = pmap_l1_to_l2(l1, va); 2726 } 2727 2728 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2729 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2730 if (l3_m == NULL) 2731 panic("pmap_enter: l3 pte_m == NULL"); 2732 if ((l3_m->flags & PG_ZERO) == 0) 2733 pmap_zero_page(l3_m); 2734 2735 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2736 l3_pn = (l3_pa / PAGE_SIZE); 2737 entry = (PTE_V); 2738 entry |= (l3_pn << PTE_PPN0_S); 2739 pmap_store(l2, entry); 2740 l3 = pmap_l2_to_l3(l2, va); 2741 } 2742 pmap_invalidate_page(pmap, va); 2743 } 2744 2745 orig_l3 = pmap_load(l3); 2746 opa = PTE_TO_PHYS(orig_l3); 2747 pv = NULL; 2748 2749 /* 2750 * Is the specified virtual address already mapped? 2751 */ 2752 if ((orig_l3 & PTE_V) != 0) { 2753 /* 2754 * Wiring change, just update stats. We don't worry about 2755 * wiring PT pages as they remain resident as long as there 2756 * are valid mappings in them. Hence, if a user page is wired, 2757 * the PT page will be also. 2758 */ 2759 if ((flags & PMAP_ENTER_WIRED) != 0 && 2760 (orig_l3 & PTE_SW_WIRED) == 0) 2761 pmap->pm_stats.wired_count++; 2762 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2763 (orig_l3 & PTE_SW_WIRED) != 0) 2764 pmap->pm_stats.wired_count--; 2765 2766 /* 2767 * Remove the extra PT page reference. 2768 */ 2769 if (mpte != NULL) { 2770 mpte->wire_count--; 2771 KASSERT(mpte->wire_count > 0, 2772 ("pmap_enter: missing reference to page table page," 2773 " va: 0x%lx", va)); 2774 } 2775 2776 /* 2777 * Has the physical page changed? 2778 */ 2779 if (opa == pa) { 2780 /* 2781 * No, might be a protection or wiring change. 2782 */ 2783 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2784 (new_l3 & PTE_W) != 0) 2785 vm_page_aflag_set(m, PGA_WRITEABLE); 2786 goto validate; 2787 } 2788 2789 /* 2790 * The physical page has changed. Temporarily invalidate 2791 * the mapping. This ensures that all threads sharing the 2792 * pmap keep a consistent view of the mapping, which is 2793 * necessary for the correct handling of COW faults. It 2794 * also permits reuse of the old mapping's PV entry, 2795 * avoiding an allocation. 2796 * 2797 * For consistency, handle unmanaged mappings the same way. 2798 */ 2799 orig_l3 = pmap_load_clear(l3); 2800 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2801 ("pmap_enter: unexpected pa update for %#lx", va)); 2802 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2803 om = PHYS_TO_VM_PAGE(opa); 2804 2805 /* 2806 * The pmap lock is sufficient to synchronize with 2807 * concurrent calls to pmap_page_test_mappings() and 2808 * pmap_ts_referenced(). 
2809 */ 2810 if ((orig_l3 & PTE_D) != 0) 2811 vm_page_dirty(om); 2812 if ((orig_l3 & PTE_A) != 0) 2813 vm_page_aflag_set(om, PGA_REFERENCED); 2814 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2815 pv = pmap_pvh_remove(&om->md, pmap, va); 2816 KASSERT(pv != NULL, 2817 ("pmap_enter: no PV entry for %#lx", va)); 2818 if ((new_l3 & PTE_SW_MANAGED) == 0) 2819 free_pv_entry(pmap, pv); 2820 if ((om->aflags & PGA_WRITEABLE) != 0 && 2821 TAILQ_EMPTY(&om->md.pv_list)) 2822 vm_page_aflag_clear(om, PGA_WRITEABLE); 2823 } 2824 pmap_invalidate_page(pmap, va); 2825 orig_l3 = 0; 2826 } else { 2827 /* 2828 * Increment the counters. 2829 */ 2830 if ((new_l3 & PTE_SW_WIRED) != 0) 2831 pmap->pm_stats.wired_count++; 2832 pmap_resident_count_inc(pmap, 1); 2833 } 2834 /* 2835 * Enter on the PV list if part of our managed memory. 2836 */ 2837 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2838 if (pv == NULL) { 2839 pv = get_pv_entry(pmap, &lock); 2840 pv->pv_va = va; 2841 } 2842 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2843 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2844 m->md.pv_gen++; 2845 if ((new_l3 & PTE_W) != 0) 2846 vm_page_aflag_set(m, PGA_WRITEABLE); 2847 } 2848 2849 validate: 2850 /* 2851 * Sync the i-cache on all harts before updating the PTE 2852 * if the new PTE is executable. 2853 */ 2854 if (prot & VM_PROT_EXECUTE) 2855 pmap_sync_icache(pmap, va, PAGE_SIZE); 2856 2857 /* 2858 * Update the L3 entry. 2859 */ 2860 if (orig_l3 != 0) { 2861 orig_l3 = pmap_load_store(l3, new_l3); 2862 pmap_invalidate_page(pmap, va); 2863 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2864 ("pmap_enter: invalid update")); 2865 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2866 (PTE_D | PTE_SW_MANAGED)) 2867 vm_page_dirty(m); 2868 } else { 2869 pmap_store(l3, new_l3); 2870 } 2871 2872 #if VM_NRESERVLEVEL > 0 2873 if (mpte != NULL && mpte->wire_count == Ln_ENTRIES && 2874 pmap_ps_enabled(pmap) && 2875 (m->flags & PG_FICTITIOUS) == 0 && 2876 vm_reserv_level_iffullpop(m) == 0) 2877 pmap_promote_l2(pmap, l2, va, &lock); 2878 #endif 2879 2880 rv = KERN_SUCCESS; 2881 out: 2882 if (lock != NULL) 2883 rw_wunlock(lock); 2884 rw_runlock(&pvh_global_lock); 2885 PMAP_UNLOCK(pmap); 2886 return (rv); 2887 } 2888 2889 /* 2890 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2891 * if successful. Returns false if (1) a page table page cannot be allocated 2892 * without sleeping, (2) a mapping already exists at the specified virtual 2893 * address, or (3) a PV entry cannot be allocated without reclaiming another 2894 * PV entry. 2895 */ 2896 static bool 2897 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2898 struct rwlock **lockp) 2899 { 2900 pd_entry_t new_l2; 2901 pn_t pn; 2902 2903 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2904 2905 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2906 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2907 if ((m->oflags & VPO_UNMANAGED) == 0) 2908 new_l2 |= PTE_SW_MANAGED; 2909 if ((prot & VM_PROT_EXECUTE) != 0) 2910 new_l2 |= PTE_X; 2911 if (va < VM_MAXUSER_ADDRESS) 2912 new_l2 |= PTE_U; 2913 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2914 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2915 KERN_SUCCESS); 2916 } 2917 2918 /* 2919 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2920 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2921 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2922 * a mapping already exists at the specified virtual address. 
Returns 2923 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2924 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2925 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2926 * 2927 * The parameter "m" is only used when creating a managed, writeable mapping. 2928 */ 2929 static int 2930 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2931 vm_page_t m, struct rwlock **lockp) 2932 { 2933 struct spglist free; 2934 pd_entry_t *l2, *l3, oldl2; 2935 vm_offset_t sva; 2936 vm_page_t l2pg, mt; 2937 2938 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2939 2940 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2941 NULL : lockp)) == NULL) { 2942 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2943 va, pmap); 2944 return (KERN_RESOURCE_SHORTAGE); 2945 } 2946 2947 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2948 l2 = &l2[pmap_l2_index(va)]; 2949 if ((oldl2 = pmap_load(l2)) != 0) { 2950 KASSERT(l2pg->wire_count > 1, 2951 ("pmap_enter_l2: l2pg's wire count is too low")); 2952 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2953 l2pg->wire_count--; 2954 CTR2(KTR_PMAP, 2955 "pmap_enter_l2: failure for va %#lx in pmap %p", 2956 va, pmap); 2957 return (KERN_FAILURE); 2958 } 2959 SLIST_INIT(&free); 2960 if ((oldl2 & PTE_RWX) != 0) 2961 (void)pmap_remove_l2(pmap, l2, va, 2962 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2963 else 2964 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 2965 l3 = pmap_l2_to_l3(l2, sva); 2966 if ((pmap_load(l3) & PTE_V) != 0 && 2967 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 2968 lockp) != 0) 2969 break; 2970 } 2971 vm_page_free_pages_toq(&free, true); 2972 if (va >= VM_MAXUSER_ADDRESS) { 2973 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2974 if (pmap_insert_pt_page(pmap, mt)) { 2975 /* 2976 * XXX Currently, this can't happen because 2977 * we do not perform pmap_enter(psind == 1) 2978 * on the kernel pmap. 2979 */ 2980 panic("pmap_enter_l2: trie insert failed"); 2981 } 2982 } else 2983 KASSERT(pmap_load(l2) == 0, 2984 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 2985 } 2986 2987 if ((new_l2 & PTE_SW_MANAGED) != 0) { 2988 /* 2989 * Abort this mapping if its PV entry could not be created. 2990 */ 2991 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 2992 SLIST_INIT(&free); 2993 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 2994 /* 2995 * Although "va" is not mapped, paging-structure 2996 * caches could nonetheless have entries that 2997 * refer to the freed page table pages. 2998 * Invalidate those entries. 2999 */ 3000 pmap_invalidate_page(pmap, va); 3001 vm_page_free_pages_toq(&free, true); 3002 } 3003 CTR2(KTR_PMAP, 3004 "pmap_enter_l2: failure for va %#lx in pmap %p", 3005 va, pmap); 3006 return (KERN_RESOURCE_SHORTAGE); 3007 } 3008 if ((new_l2 & PTE_W) != 0) 3009 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3010 vm_page_aflag_set(mt, PGA_WRITEABLE); 3011 } 3012 3013 /* 3014 * Increment counters. 3015 */ 3016 if ((new_l2 & PTE_SW_WIRED) != 0) 3017 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3018 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3019 3020 /* 3021 * Map the superpage. 3022 */ 3023 pmap_store(l2, new_l2); 3024 3025 atomic_add_long(&pmap_l2_mappings, 1); 3026 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3027 va, pmap); 3028 3029 return (KERN_SUCCESS); 3030 } 3031 3032 /* 3033 * Maps a sequence of resident pages belonging to the same object.
3034 * The sequence begins with the given page m_start. This page is 3035 * mapped at the given virtual address start. Each subsequent page is 3036 * mapped at a virtual address that is offset from start by the same 3037 * amount as the page is offset from m_start within the object. The 3038 * last page in the sequence is the page with the largest offset from 3039 * m_start that can be mapped at a virtual address less than the given 3040 * virtual address end. Not every virtual page between start and end 3041 * is mapped; only those for which a resident page exists with the 3042 * corresponding offset from m_start are mapped. 3043 */ 3044 void 3045 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3046 vm_page_t m_start, vm_prot_t prot) 3047 { 3048 struct rwlock *lock; 3049 vm_offset_t va; 3050 vm_page_t m, mpte; 3051 vm_pindex_t diff, psize; 3052 3053 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3054 3055 psize = atop(end - start); 3056 mpte = NULL; 3057 m = m_start; 3058 lock = NULL; 3059 rw_rlock(&pvh_global_lock); 3060 PMAP_LOCK(pmap); 3061 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3062 va = start + ptoa(diff); 3063 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3064 m->psind == 1 && pmap_ps_enabled(pmap) && 3065 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3066 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3067 else 3068 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3069 &lock); 3070 m = TAILQ_NEXT(m, listq); 3071 } 3072 if (lock != NULL) 3073 rw_wunlock(lock); 3074 rw_runlock(&pvh_global_lock); 3075 PMAP_UNLOCK(pmap); 3076 } 3077 3078 /* 3079 * this code makes some *MAJOR* assumptions: 3080 * 1. Current pmap & pmap exists. 3081 * 2. Not wired. 3082 * 3. Read access. 3083 * 4. No page table pages. 3084 * but is *MUCH* faster than pmap_enter... 3085 */ 3086 3087 void 3088 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3089 { 3090 struct rwlock *lock; 3091 3092 lock = NULL; 3093 rw_rlock(&pvh_global_lock); 3094 PMAP_LOCK(pmap); 3095 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3096 if (lock != NULL) 3097 rw_wunlock(lock); 3098 rw_runlock(&pvh_global_lock); 3099 PMAP_UNLOCK(pmap); 3100 } 3101 3102 static vm_page_t 3103 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3104 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3105 { 3106 struct spglist free; 3107 vm_paddr_t phys; 3108 pd_entry_t *l2; 3109 pt_entry_t *l3, newl3; 3110 3111 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3112 (m->oflags & VPO_UNMANAGED) != 0, 3113 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3114 rw_assert(&pvh_global_lock, RA_LOCKED); 3115 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3116 3117 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3118 /* 3119 * In the case that a page table page is not 3120 * resident, we are creating it here. 3121 */ 3122 if (va < VM_MAXUSER_ADDRESS) { 3123 vm_pindex_t l2pindex; 3124 3125 /* 3126 * Calculate pagetable page index 3127 */ 3128 l2pindex = pmap_l2_pindex(va); 3129 if (mpte && (mpte->pindex == l2pindex)) { 3130 mpte->wire_count++; 3131 } else { 3132 /* 3133 * Get the l2 entry 3134 */ 3135 l2 = pmap_l2(pmap, va); 3136 3137 /* 3138 * If the page table page is mapped, we just increment 3139 * the hold count, and activate it. Otherwise, we 3140 * attempt to allocate a page table page. If this 3141 * attempt fails, we don't retry. Instead, we give up. 
3142 */ 3143 if (l2 != NULL && pmap_load(l2) != 0) { 3144 phys = PTE_TO_PHYS(pmap_load(l2)); 3145 mpte = PHYS_TO_VM_PAGE(phys); 3146 mpte->wire_count++; 3147 } else { 3148 /* 3149 * Pass NULL instead of the PV list lock 3150 * pointer, because we don't intend to sleep. 3151 */ 3152 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3153 if (mpte == NULL) 3154 return (mpte); 3155 } 3156 } 3157 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3158 l3 = &l3[pmap_l3_index(va)]; 3159 } else { 3160 mpte = NULL; 3161 l3 = pmap_l3(kernel_pmap, va); 3162 } 3163 if (l3 == NULL) 3164 panic("pmap_enter_quick_locked: No l3"); 3165 if (pmap_load(l3) != 0) { 3166 if (mpte != NULL) { 3167 mpte->wire_count--; 3168 mpte = NULL; 3169 } 3170 return (mpte); 3171 } 3172 3173 /* 3174 * Enter on the PV list if part of our managed memory. 3175 */ 3176 if ((m->oflags & VPO_UNMANAGED) == 0 && 3177 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3178 if (mpte != NULL) { 3179 SLIST_INIT(&free); 3180 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3181 pmap_invalidate_page(pmap, va); 3182 vm_page_free_pages_toq(&free, false); 3183 } 3184 mpte = NULL; 3185 } 3186 return (mpte); 3187 } 3188 3189 /* 3190 * Increment counters 3191 */ 3192 pmap_resident_count_inc(pmap, 1); 3193 3194 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3195 PTE_V | PTE_R; 3196 if ((prot & VM_PROT_EXECUTE) != 0) 3197 newl3 |= PTE_X; 3198 if ((m->oflags & VPO_UNMANAGED) == 0) 3199 newl3 |= PTE_SW_MANAGED; 3200 if (va < VM_MAX_USER_ADDRESS) 3201 newl3 |= PTE_U; 3202 3203 /* 3204 * Sync the i-cache on all harts before updating the PTE 3205 * if the new PTE is executable. 3206 */ 3207 if (prot & VM_PROT_EXECUTE) 3208 pmap_sync_icache(pmap, va, PAGE_SIZE); 3209 3210 pmap_store(l3, newl3); 3211 3212 pmap_invalidate_page(pmap, va); 3213 return (mpte); 3214 } 3215 3216 /* 3217 * This code maps large physical mmap regions into the 3218 * processor address space. Note that some shortcuts 3219 * are taken, but the code works. 3220 */ 3221 void 3222 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3223 vm_pindex_t pindex, vm_size_t size) 3224 { 3225 3226 VM_OBJECT_ASSERT_WLOCKED(object); 3227 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3228 ("pmap_object_init_pt: non-device object")); 3229 } 3230 3231 /* 3232 * Clear the wired attribute from the mappings for the specified range of 3233 * addresses in the given pmap. Every valid mapping within that range 3234 * must have the wired attribute set. In contrast, invalid mappings 3235 * cannot have the wired attribute set, so they are ignored. 3236 * 3237 * The wired attribute of the page table entry is not a hardware feature, 3238 * so there is no need to invalidate any TLB entries. 
3239 */ 3240 void 3241 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3242 { 3243 vm_offset_t va_next; 3244 pd_entry_t *l1, *l2, l2e; 3245 pt_entry_t *l3, l3e; 3246 bool pv_lists_locked; 3247 3248 pv_lists_locked = false; 3249 retry: 3250 PMAP_LOCK(pmap); 3251 for (; sva < eva; sva = va_next) { 3252 l1 = pmap_l1(pmap, sva); 3253 if (pmap_load(l1) == 0) { 3254 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3255 if (va_next < sva) 3256 va_next = eva; 3257 continue; 3258 } 3259 3260 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3261 if (va_next < sva) 3262 va_next = eva; 3263 3264 l2 = pmap_l1_to_l2(l1, sva); 3265 if ((l2e = pmap_load(l2)) == 0) 3266 continue; 3267 if ((l2e & PTE_RWX) != 0) { 3268 if (sva + L2_SIZE == va_next && eva >= va_next) { 3269 if ((l2e & PTE_SW_WIRED) == 0) 3270 panic("pmap_unwire: l2 %#jx is missing " 3271 "PTE_SW_WIRED", (uintmax_t)l2e); 3272 pmap_clear_bits(l2, PTE_SW_WIRED); 3273 continue; 3274 } else { 3275 if (!pv_lists_locked) { 3276 pv_lists_locked = true; 3277 if (!rw_try_rlock(&pvh_global_lock)) { 3278 PMAP_UNLOCK(pmap); 3279 rw_rlock(&pvh_global_lock); 3280 /* Repeat sva. */ 3281 goto retry; 3282 } 3283 } 3284 if (!pmap_demote_l2(pmap, l2, sva)) 3285 panic("pmap_unwire: demotion failed"); 3286 } 3287 } 3288 3289 if (va_next > eva) 3290 va_next = eva; 3291 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3292 sva += L3_SIZE) { 3293 if ((l3e = pmap_load(l3)) == 0) 3294 continue; 3295 if ((l3e & PTE_SW_WIRED) == 0) 3296 panic("pmap_unwire: l3 %#jx is missing " 3297 "PTE_SW_WIRED", (uintmax_t)l3e); 3298 3299 /* 3300 * PG_W must be cleared atomically. Although the pmap 3301 * lock synchronizes access to PG_W, another processor 3302 * could be setting PG_M and/or PG_A concurrently. 3303 */ 3304 pmap_clear_bits(l3, PTE_SW_WIRED); 3305 pmap->pm_stats.wired_count--; 3306 } 3307 } 3308 if (pv_lists_locked) 3309 rw_runlock(&pvh_global_lock); 3310 PMAP_UNLOCK(pmap); 3311 } 3312 3313 /* 3314 * Copy the range specified by src_addr/len 3315 * from the source map to the range dst_addr/len 3316 * in the destination map. 3317 * 3318 * This routine is only advisory and need not do anything. 3319 */ 3320 3321 void 3322 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3323 vm_offset_t src_addr) 3324 { 3325 3326 } 3327 3328 /* 3329 * pmap_zero_page zeros the specified hardware page by mapping 3330 * the page into KVM and using bzero to clear its contents. 3331 */ 3332 void 3333 pmap_zero_page(vm_page_t m) 3334 { 3335 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3336 3337 pagezero((void *)va); 3338 } 3339 3340 /* 3341 * pmap_zero_page_area zeros the specified hardware page by mapping 3342 * the page into KVM and using bzero to clear its contents. 3343 * 3344 * off and size may not cover an area beyond a single hardware page. 3345 */ 3346 void 3347 pmap_zero_page_area(vm_page_t m, int off, int size) 3348 { 3349 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3350 3351 if (off == 0 && size == PAGE_SIZE) 3352 pagezero((void *)va); 3353 else 3354 bzero((char *)va + off, size); 3355 } 3356 3357 /* 3358 * pmap_copy_page copies the specified (machine independent) 3359 * page by mapping the page into virtual memory and using 3360 * bcopy to copy the page, one machine dependent page at a 3361 * time. 
3362 */ 3363 void 3364 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3365 { 3366 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3367 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3368 3369 pagecopy((void *)src, (void *)dst); 3370 } 3371 3372 int unmapped_buf_allowed = 1; 3373 3374 void 3375 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3376 vm_offset_t b_offset, int xfersize) 3377 { 3378 void *a_cp, *b_cp; 3379 vm_page_t m_a, m_b; 3380 vm_paddr_t p_a, p_b; 3381 vm_offset_t a_pg_offset, b_pg_offset; 3382 int cnt; 3383 3384 while (xfersize > 0) { 3385 a_pg_offset = a_offset & PAGE_MASK; 3386 m_a = ma[a_offset >> PAGE_SHIFT]; 3387 p_a = m_a->phys_addr; 3388 b_pg_offset = b_offset & PAGE_MASK; 3389 m_b = mb[b_offset >> PAGE_SHIFT]; 3390 p_b = m_b->phys_addr; 3391 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3392 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3393 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3394 panic("!DMAP a %lx", p_a); 3395 } else { 3396 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3397 } 3398 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3399 panic("!DMAP b %lx", p_b); 3400 } else { 3401 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3402 } 3403 bcopy(a_cp, b_cp, cnt); 3404 a_offset += cnt; 3405 b_offset += cnt; 3406 xfersize -= cnt; 3407 } 3408 } 3409 3410 vm_offset_t 3411 pmap_quick_enter_page(vm_page_t m) 3412 { 3413 3414 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3415 } 3416 3417 void 3418 pmap_quick_remove_page(vm_offset_t addr) 3419 { 3420 } 3421 3422 /* 3423 * Returns true if the pmap's pv is one of the first 3424 * 16 pvs linked to from this page. This count may 3425 * be changed upwards or downwards in the future; it 3426 * is only necessary that true be returned for a small 3427 * subset of pmaps for proper page aging. 3428 */ 3429 boolean_t 3430 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3431 { 3432 struct md_page *pvh; 3433 struct rwlock *lock; 3434 pv_entry_t pv; 3435 int loops = 0; 3436 boolean_t rv; 3437 3438 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3439 ("pmap_page_exists_quick: page %p is not managed", m)); 3440 rv = FALSE; 3441 rw_rlock(&pvh_global_lock); 3442 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3443 rw_rlock(lock); 3444 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3445 if (PV_PMAP(pv) == pmap) { 3446 rv = TRUE; 3447 break; 3448 } 3449 loops++; 3450 if (loops >= 16) 3451 break; 3452 } 3453 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3454 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3455 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3456 if (PV_PMAP(pv) == pmap) { 3457 rv = TRUE; 3458 break; 3459 } 3460 loops++; 3461 if (loops >= 16) 3462 break; 3463 } 3464 } 3465 rw_runlock(lock); 3466 rw_runlock(&pvh_global_lock); 3467 return (rv); 3468 } 3469 3470 /* 3471 * pmap_page_wired_mappings: 3472 * 3473 * Return the number of managed mappings to the given physical page 3474 * that are wired. 
3475 */ 3476 int 3477 pmap_page_wired_mappings(vm_page_t m) 3478 { 3479 struct md_page *pvh; 3480 struct rwlock *lock; 3481 pmap_t pmap; 3482 pd_entry_t *l2; 3483 pt_entry_t *l3; 3484 pv_entry_t pv; 3485 int count, md_gen, pvh_gen; 3486 3487 if ((m->oflags & VPO_UNMANAGED) != 0) 3488 return (0); 3489 rw_rlock(&pvh_global_lock); 3490 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3491 rw_rlock(lock); 3492 restart: 3493 count = 0; 3494 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3495 pmap = PV_PMAP(pv); 3496 if (!PMAP_TRYLOCK(pmap)) { 3497 md_gen = m->md.pv_gen; 3498 rw_runlock(lock); 3499 PMAP_LOCK(pmap); 3500 rw_rlock(lock); 3501 if (md_gen != m->md.pv_gen) { 3502 PMAP_UNLOCK(pmap); 3503 goto restart; 3504 } 3505 } 3506 l3 = pmap_l3(pmap, pv->pv_va); 3507 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3508 count++; 3509 PMAP_UNLOCK(pmap); 3510 } 3511 if ((m->flags & PG_FICTITIOUS) == 0) { 3512 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3513 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3514 pmap = PV_PMAP(pv); 3515 if (!PMAP_TRYLOCK(pmap)) { 3516 md_gen = m->md.pv_gen; 3517 pvh_gen = pvh->pv_gen; 3518 rw_runlock(lock); 3519 PMAP_LOCK(pmap); 3520 rw_rlock(lock); 3521 if (md_gen != m->md.pv_gen || 3522 pvh_gen != pvh->pv_gen) { 3523 PMAP_UNLOCK(pmap); 3524 goto restart; 3525 } 3526 } 3527 l2 = pmap_l2(pmap, pv->pv_va); 3528 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3529 count++; 3530 PMAP_UNLOCK(pmap); 3531 } 3532 } 3533 rw_runlock(lock); 3534 rw_runlock(&pvh_global_lock); 3535 return (count); 3536 } 3537 3538 static void 3539 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3540 struct spglist *free, bool superpage) 3541 { 3542 struct md_page *pvh; 3543 vm_page_t mpte, mt; 3544 3545 if (superpage) { 3546 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3547 pvh = pa_to_pvh(m->phys_addr); 3548 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3549 pvh->pv_gen++; 3550 if (TAILQ_EMPTY(&pvh->pv_list)) { 3551 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3552 if (TAILQ_EMPTY(&mt->md.pv_list) && 3553 (mt->aflags & PGA_WRITEABLE) != 0) 3554 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3555 } 3556 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3557 if (mpte != NULL) { 3558 pmap_resident_count_dec(pmap, 1); 3559 KASSERT(mpte->wire_count == Ln_ENTRIES, 3560 ("pmap_remove_pages: pte page wire count error")); 3561 mpte->wire_count = 0; 3562 pmap_add_delayed_free_list(mpte, free, FALSE); 3563 } 3564 } else { 3565 pmap_resident_count_dec(pmap, 1); 3566 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3567 m->md.pv_gen++; 3568 if (TAILQ_EMPTY(&m->md.pv_list) && 3569 (m->aflags & PGA_WRITEABLE) != 0) { 3570 pvh = pa_to_pvh(m->phys_addr); 3571 if (TAILQ_EMPTY(&pvh->pv_list)) 3572 vm_page_aflag_clear(m, PGA_WRITEABLE); 3573 } 3574 } 3575 } 3576 3577 /* 3578 * Destroy all managed, non-wired mappings in the given user-space 3579 * pmap. This pmap cannot be active on any processor besides the 3580 * caller. 3581 * 3582 * This function cannot be applied to the kernel pmap. Moreover, it 3583 * is not intended for general use. It is only to be used during 3584 * process termination. Consequently, it can be implemented in ways 3585 * that make it faster than pmap_remove(). First, it can more quickly 3586 * destroy mappings by iterating over the pmap's collection of PV 3587 * entries, rather than searching the page table. Second, it doesn't 3588 * have to test and clear the page table entries atomically, because 3589 * no processor is currently accessing the user address space. 
In 3590 * particular, a page table entry's dirty bit won't change state once 3591 * this function starts. 3592 */ 3593 void 3594 pmap_remove_pages(pmap_t pmap) 3595 { 3596 struct spglist free; 3597 pd_entry_t ptepde; 3598 pt_entry_t *pte, tpte; 3599 vm_page_t m, mt; 3600 pv_entry_t pv; 3601 struct pv_chunk *pc, *npc; 3602 struct rwlock *lock; 3603 int64_t bit; 3604 uint64_t inuse, bitmask; 3605 int allfree, field, freed, idx; 3606 bool superpage; 3607 3608 lock = NULL; 3609 3610 SLIST_INIT(&free); 3611 rw_rlock(&pvh_global_lock); 3612 PMAP_LOCK(pmap); 3613 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3614 allfree = 1; 3615 freed = 0; 3616 for (field = 0; field < _NPCM; field++) { 3617 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3618 while (inuse != 0) { 3619 bit = ffsl(inuse) - 1; 3620 bitmask = 1UL << bit; 3621 idx = field * 64 + bit; 3622 pv = &pc->pc_pventry[idx]; 3623 inuse &= ~bitmask; 3624 3625 pte = pmap_l1(pmap, pv->pv_va); 3626 ptepde = pmap_load(pte); 3627 pte = pmap_l1_to_l2(pte, pv->pv_va); 3628 tpte = pmap_load(pte); 3629 if ((tpte & PTE_RWX) != 0) { 3630 superpage = true; 3631 } else { 3632 ptepde = tpte; 3633 pte = pmap_l2_to_l3(pte, pv->pv_va); 3634 tpte = pmap_load(pte); 3635 superpage = false; 3636 } 3637 3638 /* 3639 * We cannot remove wired pages from a 3640 * process' mapping at this time. 3641 */ 3642 if (tpte & PTE_SW_WIRED) { 3643 allfree = 0; 3644 continue; 3645 } 3646 3647 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3648 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3649 m < &vm_page_array[vm_page_array_size], 3650 ("pmap_remove_pages: bad pte %#jx", 3651 (uintmax_t)tpte)); 3652 3653 pmap_clear(pte); 3654 3655 /* 3656 * Update the vm_page_t clean/reference bits. 3657 */ 3658 if ((tpte & (PTE_D | PTE_W)) == 3659 (PTE_D | PTE_W)) { 3660 if (superpage) 3661 for (mt = m; 3662 mt < &m[Ln_ENTRIES]; mt++) 3663 vm_page_dirty(mt); 3664 else 3665 vm_page_dirty(m); 3666 } 3667 3668 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3669 3670 /* Mark free */ 3671 pc->pc_map[field] |= bitmask; 3672 3673 pmap_remove_pages_pv(pmap, m, pv, &free, 3674 superpage); 3675 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3676 freed++; 3677 } 3678 } 3679 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3680 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3681 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3682 if (allfree) { 3683 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3684 free_pv_chunk(pc); 3685 } 3686 } 3687 if (lock != NULL) 3688 rw_wunlock(lock); 3689 pmap_invalidate_all(pmap); 3690 rw_runlock(&pvh_global_lock); 3691 PMAP_UNLOCK(pmap); 3692 vm_page_free_pages_toq(&free, false); 3693 } 3694 3695 static bool 3696 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3697 { 3698 struct md_page *pvh; 3699 struct rwlock *lock; 3700 pd_entry_t *l2; 3701 pt_entry_t *l3, mask; 3702 pv_entry_t pv; 3703 pmap_t pmap; 3704 int md_gen, pvh_gen; 3705 bool rv; 3706 3707 mask = 0; 3708 if (modified) 3709 mask |= PTE_D; 3710 if (accessed) 3711 mask |= PTE_A; 3712 3713 rv = FALSE; 3714 rw_rlock(&pvh_global_lock); 3715 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3716 rw_rlock(lock); 3717 restart: 3718 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3719 pmap = PV_PMAP(pv); 3720 if (!PMAP_TRYLOCK(pmap)) { 3721 md_gen = m->md.pv_gen; 3722 rw_runlock(lock); 3723 PMAP_LOCK(pmap); 3724 rw_rlock(lock); 3725 if (md_gen != m->md.pv_gen) { 3726 PMAP_UNLOCK(pmap); 3727 goto restart; 3728 } 3729 } 3730 l3 = pmap_l3(pmap, pv->pv_va); 3731 rv = (pmap_load(l3) & mask) == mask; 3732 
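/* One mapping with all of the tested bits set is sufficient, so drop the pmap lock and stop scanning the PV list as soon as such a mapping is found. */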
PMAP_UNLOCK(pmap); 3733 if (rv) 3734 goto out; 3735 } 3736 if ((m->flags & PG_FICTITIOUS) == 0) { 3737 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3738 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3739 pmap = PV_PMAP(pv); 3740 if (!PMAP_TRYLOCK(pmap)) { 3741 md_gen = m->md.pv_gen; 3742 pvh_gen = pvh->pv_gen; 3743 rw_runlock(lock); 3744 PMAP_LOCK(pmap); 3745 rw_rlock(lock); 3746 if (md_gen != m->md.pv_gen || 3747 pvh_gen != pvh->pv_gen) { 3748 PMAP_UNLOCK(pmap); 3749 goto restart; 3750 } 3751 } 3752 l2 = pmap_l2(pmap, pv->pv_va); 3753 rv = (pmap_load(l2) & mask) == mask; 3754 PMAP_UNLOCK(pmap); 3755 if (rv) 3756 goto out; 3757 } 3758 } 3759 out: 3760 rw_runlock(lock); 3761 rw_runlock(&pvh_global_lock); 3762 return (rv); 3763 } 3764 3765 /* 3766 * pmap_is_modified: 3767 * 3768 * Return whether or not the specified physical page was modified 3769 * in any physical maps. 3770 */ 3771 boolean_t 3772 pmap_is_modified(vm_page_t m) 3773 { 3774 3775 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3776 ("pmap_is_modified: page %p is not managed", m)); 3777 3778 /* 3779 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 3780 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 3781 * is clear, no PTEs can have PG_M set. 3782 */ 3783 VM_OBJECT_ASSERT_WLOCKED(m->object); 3784 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 3785 return (FALSE); 3786 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3787 } 3788 3789 /* 3790 * pmap_is_prefaultable: 3791 * 3792 * Return whether or not the specified virtual address is eligible 3793 * for prefault. 3794 */ 3795 boolean_t 3796 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3797 { 3798 pt_entry_t *l3; 3799 boolean_t rv; 3800 3801 rv = FALSE; 3802 PMAP_LOCK(pmap); 3803 l3 = pmap_l3(pmap, addr); 3804 if (l3 != NULL && pmap_load(l3) != 0) { 3805 rv = TRUE; 3806 } 3807 PMAP_UNLOCK(pmap); 3808 return (rv); 3809 } 3810 3811 /* 3812 * pmap_is_referenced: 3813 * 3814 * Return whether or not the specified physical page was referenced 3815 * in any physical maps. 3816 */ 3817 boolean_t 3818 pmap_is_referenced(vm_page_t m) 3819 { 3820 3821 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3822 ("pmap_is_referenced: page %p is not managed", m)); 3823 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3824 } 3825 3826 /* 3827 * Clear the write and modified bits in each of the given page's mappings. 3828 */ 3829 void 3830 pmap_remove_write(vm_page_t m) 3831 { 3832 struct md_page *pvh; 3833 struct rwlock *lock; 3834 pmap_t pmap; 3835 pd_entry_t *l2; 3836 pt_entry_t *l3, oldl3, newl3; 3837 pv_entry_t next_pv, pv; 3838 vm_offset_t va; 3839 int md_gen, pvh_gen; 3840 3841 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3842 ("pmap_remove_write: page %p is not managed", m)); 3843 3844 /* 3845 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 3846 * set by another thread while the object is locked. Thus, 3847 * if PGA_WRITEABLE is clear, no page table entries need updating. 3848 */ 3849 VM_OBJECT_ASSERT_WLOCKED(m->object); 3850 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 3851 return; 3852 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3853 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 3854 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3855 rw_rlock(&pvh_global_lock); 3856 retry_pv_loop: 3857 rw_wlock(lock); 3858 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3859 pmap = PV_PMAP(pv); 3860 if (!PMAP_TRYLOCK(pmap)) { 3861 pvh_gen = pvh->pv_gen; 3862 rw_wunlock(lock); 3863 PMAP_LOCK(pmap); 3864 rw_wlock(lock); 3865 if (pvh_gen != pvh->pv_gen) { 3866 PMAP_UNLOCK(pmap); 3867 rw_wunlock(lock); 3868 goto retry_pv_loop; 3869 } 3870 } 3871 va = pv->pv_va; 3872 l2 = pmap_l2(pmap, va); 3873 if ((pmap_load(l2) & PTE_W) != 0) 3874 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3875 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3876 ("inconsistent pv lock %p %p for page %p", 3877 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3878 PMAP_UNLOCK(pmap); 3879 } 3880 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3881 pmap = PV_PMAP(pv); 3882 if (!PMAP_TRYLOCK(pmap)) { 3883 pvh_gen = pvh->pv_gen; 3884 md_gen = m->md.pv_gen; 3885 rw_wunlock(lock); 3886 PMAP_LOCK(pmap); 3887 rw_wlock(lock); 3888 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3889 PMAP_UNLOCK(pmap); 3890 rw_wunlock(lock); 3891 goto retry_pv_loop; 3892 } 3893 } 3894 l3 = pmap_l3(pmap, pv->pv_va); 3895 oldl3 = pmap_load(l3); 3896 retry: 3897 if ((oldl3 & PTE_W) != 0) { 3898 newl3 = oldl3 & ~(PTE_D | PTE_W); 3899 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3900 goto retry; 3901 if ((oldl3 & PTE_D) != 0) 3902 vm_page_dirty(m); 3903 pmap_invalidate_page(pmap, pv->pv_va); 3904 } 3905 PMAP_UNLOCK(pmap); 3906 } 3907 rw_wunlock(lock); 3908 vm_page_aflag_clear(m, PGA_WRITEABLE); 3909 rw_runlock(&pvh_global_lock); 3910 } 3911 3912 /* 3913 * pmap_ts_referenced: 3914 * 3915 * Return a count of reference bits for a page, clearing those bits. 3916 * It is not necessary for every reference bit to be cleared, but it 3917 * is necessary that 0 only be returned when there are truly no 3918 * reference bits set. 3919 * 3920 * As an optimization, update the page's dirty field if a modified bit is 3921 * found while counting reference bits. This opportunistic update can be 3922 * performed at low cost and can eliminate the need for some future calls 3923 * to pmap_is_modified(). However, since this function stops after 3924 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3925 * dirty pages. Those dirty pages will only be detected by a future call 3926 * to pmap_is_modified(). 3927 */ 3928 int 3929 pmap_ts_referenced(vm_page_t m) 3930 { 3931 struct spglist free; 3932 struct md_page *pvh; 3933 struct rwlock *lock; 3934 pv_entry_t pv, pvf; 3935 pmap_t pmap; 3936 pd_entry_t *l2, l2e; 3937 pt_entry_t *l3, l3e; 3938 vm_paddr_t pa; 3939 vm_offset_t va; 3940 int md_gen, pvh_gen, ret; 3941 3942 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3943 ("pmap_ts_referenced: page %p is not managed", m)); 3944 SLIST_INIT(&free); 3945 ret = 0; 3946 pa = VM_PAGE_TO_PHYS(m); 3947 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct spglist free;
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	pd_entry_t *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_paddr_t pa;
	vm_offset_t va;
	int md_gen, pvh_gen, ret;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	ret = 0;
	pa = VM_PAGE_TO_PHYS(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);

	lock = PHYS_TO_PV_LIST_LOCK(pa);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
retry:
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		l2e = pmap_load(l2);
		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
			/*
			 * Although l2e is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((l2e & PTE_A) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB
			 * pages, it should not be cleared every time it is
			 * tested.  Apply a simple "hash" function on the
			 * physical page number, the virtual superpage number,
			 * and the pmap address to select one 4KB page out of
			 * the 512 on which testing the reference bit will
			 * result in clearing that reference bit.  This
			 * function is designed to avoid the selection of the
			 * same 4KB page for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (l2e & PTE_SW_WIRED) == 0) {
				pmap_clear_bits(l2, PTE_A);
				pmap_invalidate_page(pmap, va);
			}
			ret++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (ret >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);

		KASSERT((pmap_load(l2) & PTE_RX) == 0,
		    ("pmap_ts_referenced: found an invalid l2 table"));

		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		l3e = pmap_load(l3);
		if ((l3e & PTE_D) != 0)
			vm_page_dirty(m);
		if ((l3e & PTE_A) != 0) {
			if ((l3e & PTE_SW_WIRED) == 0) {
				/*
				 * Wired pages cannot be paged out so
				 * doing accessed bit emulation for
				 * them is wasted effort.  We do the
				 * hard work for unwired pages only.
				 */
				pmap_clear_bits(l3, PTE_A);
				pmap_invalidate_page(pmap, pv->pv_va);
			}
			ret++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && ret <
	    PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	vm_page_free_pages_toq(&free, false);
	return (ret);
}
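
/*
 * A worked example of the superpage "hash" in pmap_ts_referenced() above,
 * using made-up values: suppose the low nine bits of (va >> L2_SHIFT) are
 * 0x0a5 and the low nine bits of (uintptr_t)pmap are 0x137.  Their XOR is
 * 0x192, so within that 2MB mapping only the 4KB page whose page frame
 * number ends in 0x192 (the page at offset 0x192000 from the start of the
 * superpage) makes the "== 0" test true and has PTE_A cleared; the other
 * 511 pages merely count the reference.  Because "pa" is the physical
 * address of the 4KB page under test, each page of the superpage hashes to
 * a different value, so the page selected for clearing varies with both the
 * mapping's virtual address and the pmap rather than always being the first
 * page of the superpage.
 */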

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	/* Not yet implemented on RISC-V; the advice is ignored. */
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set.
	 * If the object containing the page is locked and the page is not
	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
restart:
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		if ((oldl2 & PTE_W) != 0) {
			if (pmap_demote_l2_locked(pmap, l2, va, &lock)) {
				if ((oldl2 & PTE_SW_WIRED) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) -
					    PTE_TO_PHYS(oldl2);
					l3 = pmap_l2_to_l3(l2, va);
					oldl3 = pmap_load(l3);
					if ((oldl3 & PTE_V) != 0) {
						while (!atomic_fcmpset_long(l3,
						    &oldl3, oldl3 & ~(PTE_D |
						    PTE_W)))
							cpu_spinwait();
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
		    ("pmap_clear_modify: found a 2mpage in page %p's pv list",
		    m));
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
			pmap_clear_bits(l3, PTE_D);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
}
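
/*
 * A note on the address arithmetic in pmap_clear_modify() above: for a
 * demoted 2MB mapping, "va" starts out as the superpage's (2MB-aligned)
 * virtual address, and VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2) is the byte
 * offset of page "m" within the superpage's physical range.  With made-up
 * values, a superpage mapping VA 0x2000000 to PA 0x80200000 and a page "m"
 * at PA 0x80273000 give an offset of 0x73000, so only the 4KB mapping at
 * VA 0x2073000 is write protected and has its dirty state transferred to
 * the vm_page.
 */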

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return ((void *)PHYS_TO_DMAP(pa));
}

void
pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
{
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pv_memattr = ma;

	/*
	 * RISCVTODO: Implement the below (from the amd64 pmap)
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    PHYS_IN_DMAP(VM_PAGE_TO_PHYS(m)))
		panic("RISCVTODO: pmap_page_set_memattr");
}

/*
 * Perform the pmap work for mincore(2).
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pt_entry_t *l2, *l3, tpte;
	vm_paddr_t pa;
	int val;
	bool managed;

	PMAP_LOCK(pmap);
retry:
	managed = false;
	val = 0;

	l2 = pmap_l2(pmap, addr);
	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
		if ((tpte & PTE_RWX) != 0) {
			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
			val = MINCORE_INCORE | MINCORE_SUPER;
		} else {
			l3 = pmap_l2_to_l3(l2, addr);
			tpte = pmap_load(l3);
			if ((tpte & PTE_V) == 0)
				goto done;
			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
			val = MINCORE_INCORE;
		}

		if ((tpte & PTE_D) != 0)
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((tpte & PTE_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
	}

done:
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}
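
/*
 * Illustrative example for pmap_mincore() above, with hypothetical PTE
 * state: a resident 2MB mapping whose entry has both PTE_A and PTE_D set
 * would return
 *
 *	MINCORE_INCORE | MINCORE_SUPER |
 *	MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER |
 *	MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER
 *
 * whereas a resident but never-accessed 4KB mapping would return only
 * MINCORE_INCORE.  In the latter case, because the *_OTHER bits are not all
 * set and the mapping is managed, the function returns with the physical
 * address locked via vm_page_pa_tryrelock() so that the caller can safely
 * examine the vm_page for references or modifications made through other
 * mappings.
 */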

void
pmap_activate_sw(struct thread *td)
{
	pmap_t oldpmap, pmap;
	u_int cpu;

	oldpmap = PCPU_GET(curpmap);
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (pmap == oldpmap)
		return;
	load_satp(pmap->pm_satp);

	cpu = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpu, &pmap->pm_active);
	CPU_CLR_ATOMIC(cpu, &oldpmap->pm_active);
#else
	CPU_SET(cpu, &pmap->pm_active);
	CPU_CLR(cpu, &oldpmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);

	sfence_vma();
}

void
pmap_activate(struct thread *td)
{

	critical_enter();
	pmap_activate_sw(td);
	critical_exit();
}

void
pmap_activate_boot(pmap_t pmap)
{
	u_int cpu;

	cpu = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpu, &pmap->pm_active);
#else
	CPU_SET(cpu, &pmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
	cpuset_t mask;

	/*
	 * From the RISC-V User-Level ISA V2.2:
	 *
	 * "To make a store to instruction memory visible to all
	 * RISC-V harts, the writing hart has to execute a data FENCE
	 * before requesting that all remote RISC-V harts execute a
	 * FENCE.I."
	 */
	sched_pin();
	mask = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_fence_i(mask.__bits);
	sched_unpin();
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < L2_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L2_OFFSET;
	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
	    (*addr & L2_OFFSET) == superpage_offset)
		return;
	if ((*addr & L2_OFFSET) < superpage_offset)
		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
	else
		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}
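
/*
 * A worked example for pmap_align_superpage() above, with made-up values:
 * for offset = 0x2180000, superpage_offset is 0x180000.  With
 * *addr = 0x10000000 and size = 8MB, the leading partial chunk is
 * L2_SIZE - 0x180000 = 0x80000 bytes, which still leaves at least L2_SIZE
 * bytes of the request, so the address is adjusted.  Since
 * (*addr & L2_OFFSET) == 0 is less than 0x180000, the result is
 * *addr = 0x10180000.  The virtual address and the object offset are then
 * congruent modulo L2_SIZE, which is what makes 2MB superpage mappings of
 * the backing object possible.
 */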
4373 * 4374 */ 4375 boolean_t 4376 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4377 boolean_t can_fault) 4378 { 4379 vm_paddr_t paddr; 4380 boolean_t needs_mapping; 4381 int error, i; 4382 4383 /* 4384 * Allocate any KVA space that we need, this is done in a separate 4385 * loop to prevent calling vmem_alloc while pinned. 4386 */ 4387 needs_mapping = FALSE; 4388 for (i = 0; i < count; i++) { 4389 paddr = VM_PAGE_TO_PHYS(page[i]); 4390 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4391 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4392 M_BESTFIT | M_WAITOK, &vaddr[i]); 4393 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4394 needs_mapping = TRUE; 4395 } else { 4396 vaddr[i] = PHYS_TO_DMAP(paddr); 4397 } 4398 } 4399 4400 /* Exit early if everything is covered by the DMAP */ 4401 if (!needs_mapping) 4402 return (FALSE); 4403 4404 if (!can_fault) 4405 sched_pin(); 4406 for (i = 0; i < count; i++) { 4407 paddr = VM_PAGE_TO_PHYS(page[i]); 4408 if (paddr >= DMAP_MAX_PHYSADDR) { 4409 panic( 4410 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4411 } 4412 } 4413 4414 return (needs_mapping); 4415 } 4416 4417 void 4418 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4419 boolean_t can_fault) 4420 { 4421 vm_paddr_t paddr; 4422 int i; 4423 4424 if (!can_fault) 4425 sched_unpin(); 4426 for (i = 0; i < count; i++) { 4427 paddr = VM_PAGE_TO_PHYS(page[i]); 4428 if (paddr >= DMAP_MAX_PHYSADDR) { 4429 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4430 } 4431 } 4432 } 4433 4434 boolean_t 4435 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4436 { 4437 4438 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4439 } 4440 4441 bool 4442 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4443 pt_entry_t **l3) 4444 { 4445 pd_entry_t *l1p, *l2p; 4446 4447 /* Get l1 directory entry. */ 4448 l1p = pmap_l1(pmap, va); 4449 *l1 = l1p; 4450 4451 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4452 return (false); 4453 4454 if ((pmap_load(l1p) & PTE_RX) != 0) { 4455 *l2 = NULL; 4456 *l3 = NULL; 4457 return (true); 4458 } 4459 4460 /* Get l2 directory entry. */ 4461 l2p = pmap_l1_to_l2(l1p, va); 4462 *l2 = l2p; 4463 4464 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4465 return (false); 4466 4467 if ((pmap_load(l2p) & PTE_RX) != 0) { 4468 *l3 = NULL; 4469 return (true); 4470 } 4471 4472 /* Get l3 page table entry. */ 4473 *l3 = pmap_l2_to_l3(l2p, va); 4474 4475 return (true); 4476 } 4477