/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 *	All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 *	All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 *	All rights reserved.
 * Copyright (c) 2014 The FreeBSD Foundation
 *	All rights reserved.
 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Portions of this software were developed by Andrew Turner under
 * sponsorship from The FreeBSD Foundation.
 *
 * Portions of this software were developed by SRI International and the
 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
 *
 * Portions of this software were developed by the University of Cambridge
 * Computer Laboratory as part of the CTSRD Project, with support from the
 * UK Higher Education Innovation Fund (HEIF).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

#define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
		(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)	do {		\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
		PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
    "VM/pmap parameters");

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO,
    superpages_enabled, CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, vm_prot_t prot, vm_page_t mpte,
		    struct rwlock **lockp);
static int	pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
		    pd_entry_t ptepde, struct spglist *free,
		    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		    struct rwlock **lockp);

static void	_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    struct spglist *free);
static int	pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte)	((pte >> PTE_PPN0_S) * PAGE_SIZE)

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/* Distribute new kernel L1 entry to all the user pmaps */
	if (pmap != kernel_pmap)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_l1[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
    u_int *l2_slot)
{
	pt_entry_t *l2;
	pd_entry_t *l1;

	l1 = (pd_entry_t *)l1pt;
	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;

	/* Check locore has used a table L1 map */
	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
	    ("Invalid bootstrap L1 table"));

	/* Find the address of the L2 table */
	l2 = (pt_entry_t *)init_pt_va;
	*l2_slot = pmap_l2_index(va);

	return (l2);
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	u_int l1_slot, l2_slot;
	pt_entry_t *l2;
	u_int ret;

	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);

	/* Check locore has used L2 superpages */
	KASSERT((l2[l2_slot] & PTE_RX) != 0,
	    ("Invalid bootstrap L2 table"));

	/* L2 is superpages */
	ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT;
	ret += (va & L2_OFFSET);

	return (ret);
}
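
/*
 * Build the direct map (DMAP): map physical memory from min_pa (rounded down
 * to an L1 boundary) up to max_pa at DMAP_MIN_ADDRESS using L1 superpage
 * entries, and record the resulting limits in dmap_phys_max and
 * dmap_max_addr.
 */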
static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
	vm_offset_t va;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;
	pt_entry_t entry;
	pn_t pn;

	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
	va = DMAP_MIN_ADDRESS;
	l1 = (pd_entry_t *)kern_l1;
	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);

	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		/* superpages */
		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l1[l1_slot], entry);
	}

	/* Set the upper limit of the DMAP region */
	dmap_phys_max = pa;
	dmap_max_addr = va;

	sfence_vma();
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	pt_entry_t entry;
	pd_entry_t *l2;
	vm_paddr_t pa;
	u_int l2_slot;
	pn_t pn;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pn = (pa / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l2[l2_slot], entry);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return (l3pt);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
{
	u_int l1_slot, l2_slot, avail_slot, map_slot;
	vm_offset_t freemempos;
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t end, max_pa, min_pa, pa, start;
	int i;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
	printf("%lx\n", l1pt);
	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
	PMAP_LOCK_INIT(kernel_pmap);

	rw_init(&pvh_global_lock, "pmap pv global");

	CPU_FILL(&kernel_pmap->pm_active);

	/* Assume the address we were loaded to is a valid physical address. */
	min_pa = max_pa = kernstart;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
		if (physmap[i + 1] > max_pa)
			max_pa = physmap[i + 1];
	}
	printf("physmap_idx %lx\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);

	/*
	 * Read the page table to find out what is already mapped.
	 * This assumes we have mapped a block of memory from KERNBASE
	 * using a single L1 entry.
	 */
	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);

	/* Sanity check the index, KERNBASE should be the first VA */
	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));

	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);

	/* Create the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);

	sfence_vma();

#define	alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	virtual_avail = roundup2(freemempos, L2_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(l1pt, freemempos);

	/* Initialize phys_avail. */
	for (avail_slot = map_slot = physmem = 0; map_slot < physmap_idx * 2;
	    map_slot += 2) {
		start = physmap[map_slot];
		end = physmap[map_slot + 1];

		if (start == end)
			continue;
		if (start >= kernstart && end <= pa)
			continue;

		if (start < kernstart && end > kernstart)
			end = kernstart;
		else if (start < pa && end > pa)
			start = pa;
		phys_avail[avail_slot] = start;
		phys_avail[avail_slot + 1] = end;
		physmem += (end - start) >> PAGE_SHIFT;
		avail_slot += 2;

		if (end != physmap[map_slot + 1] && end > pa) {
			phys_avail[avail_slot] = pa;
			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
			physmem += (physmap[map_slot + 1] - pa) >> PAGE_SHIFT;
			avail_slot += 2;
		}
	}
	phys_avail[avail_slot] = 0;
	phys_avail[avail_slot + 1] = 0;

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".
	 */
	Maxmem = atop(phys_avail[avail_slot - 1]);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the pv chunk and pmap list mutexes.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	if (superpages_enabled)
		pagesizes[1] = L2_SIZE;
}

#ifdef SMP
/*
 * For SMP, these functions have to use IPIs for coherence.
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(cpuid), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(cpuid), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(cpuid), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence.  BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p, l3;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Start with the l2 table.  We are unable to allocate
	 * pages in the l1 table.
	 */
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL) {
		l2 = pmap_load(l2p);
		if ((l2 & PTE_RX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				l3 = pmap_load(l3p);
				pa = PTE_TO_PHYS(l3);
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* L2 is superpages */
			pa = (l2 >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_paddr_t pa;
	vm_page_t m;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			if (vm_page_pa_tryrelock(pmap, phys, &pa))
				goto retry;
			m = PHYS_TO_VM_PAGE(phys);
			vm_page_hold(m);
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		if ((pmap_load(l2) & PTE_RX) != 0) {
			/* superpages */
			pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT;
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(l2, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *l3;

	l3 = pmap_l3(kernel_pmap, va);
	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));

	pmap_clear(l3);
	sfence_vma();
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
		pmap_clear(l3);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}

	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{

	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *l3, pa;
	vm_offset_t va;
	vm_page_t m;
	pt_entry_t entry;
	pn_t pn;
	int i;

	va = sva;
	for (i = 0; i < count; i++) {
		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m);
		pn = (pa / PAGE_SIZE);
		l3 = pmap_l3(kernel_pmap, va);

		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));

	for (va = sva; count-- > 0; va += PAGE_SIZE) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
		pmap_clear(l3);
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_insert(&pmap->pm_root, ml3));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Decrements a page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->wire_count;
	if (m->wire_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else {
		return (FALSE);
	}
}

static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	vm_paddr_t phys;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (m->pindex >= NUL1E) {
		pd_entry_t *l1;
		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
	} else {
		pd_entry_t *l2;
		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL1E) {
		pd_entry_t *l1;
		vm_page_t pdpg;

		l1 = pmap_l1(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l1));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	}
	pmap_invalidate_page(pmap, va);

	vm_wire_sub(1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_l1 = kernel_pmap->pm_l1;
	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
	CPU_ZERO(&pmap->pm_active);
	pmap_activate_boot(pmap);
}

int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t l1phys;
	vm_page_t l1pt;

	/*
	 * allocate the l1 page
	 */
	while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		vm_wait(NULL);

	l1phys = VM_PAGE_TO_PHYS(l1pt);
	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys);
	pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT);

	if ((l1pt->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_l1);

	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	CPU_ZERO(&pmap->pm_active);

	/* Install kernel pagetables */
	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);

	/* Add to the list of all user pmaps */
	mtx_lock(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	vm_radix_init(&pmap->pm_root);

	return (1);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, /*pdppg, */pdpg;
	pt_entry_t entry;
	vm_paddr_t phys;
	pn_t pn;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			vm_wait(NULL);
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}

	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= NUL1E) {
		pd_entry_t *l1;
		vm_pindex_t l1index;

		l1index = ptepindex - NUL1E;
		l1 = &pmap->pm_l1[l1index];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l1, entry);
		pmap_distribute_l1(pmap, l1index, entry);
	} else {
		vm_pindex_t l1index;
		pd_entry_t *l1, *l2;

		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
		l1 = &pmap->pm_l1[l1index];
		if (pmap_load(l1) == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			phys = PTE_TO_PHYS(pmap_load(l1));
			pdpg = PHYS_TO_VM_PAGE(phys);
			pdpg->wire_count++;
		}

		phys = PTE_TO_PHYS(pmap_load(l1));
		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];

		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

static vm_page_t
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	pd_entry_t *l1;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
		/* Add a reference to the L2 page. */
		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
		l2pg->wire_count++;
	} else {
		/* Allocate a L2 page. */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL && lockp != NULL)
			goto retry;
	}
	return (l2pg);
}

static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *l2;
	vm_paddr_t phys;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	l2 = pmap_l2(pmap, va);

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (l2 != NULL && pmap_load(l2) != 0) {
		phys = PTE_TO_PHYS(pmap_load(l2));
		m = PHYS_TO_VM_PAGE(phys);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	mtx_lock(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock(&allpmaps_lock);

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1));
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

#if 0
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "LU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "LU", "Amount of KVM free");
#endif /* 0 */

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l1, *l2;
	pt_entry_t entry;
	pn_t pn;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(nkpg);
			paddr = VM_PAGE_TO_PHYS(nkpg);

			pn = (paddr / PAGE_SIZE);
			entry = (PTE_V);
			entry |= (pn << PTE_PPN0_S);
			pmap_store(l1, entry);
			pmap_distribute_l1(kernel_pmap,
			    pmap_l1_index(kernel_vm_end), entry);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & PTE_V) != 0 &&
		    (pmap_load(l2) & PTE_RWX) == 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		if ((nkpg->flags & PG_ZERO) == 0) {
			pmap_zero_page(nkpg);
		}
		paddr = VM_PAGE_TO_PHYS(nkpg);

		pn = (paddr / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);

		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}

/***************************************************
 * page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define	PV_PMAP(pv)	(pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{

	panic("RISCVTODO: reclaim_pv_chunk");
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
#if 0 /* TODO: For minidump */
	dump_drop_page(m->phys_addr);
#endif
	vm_page_unwire(m, PQ_NONE);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
			    pc->pc_map[2] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
#if 0 /* TODO: This is for minidump */
	dump_add_page(m->phys_addr);
#endif
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	pc->pc_map[2] = PC_FREE2;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	TAILQ_INIT(&new_tail);
retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		/* XXX PV STATS */
#if 0
		dump_add_page(m->phys_addr);
#endif
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREE0;
		pc->pc_map[1] = PC_FREE1;
		pc->pc_map[2] = PC_FREE2;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	if (!TAILQ_EMPTY(&new_tail)) {
		mtx_lock(&pv_chunks_mutex);
		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.  This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);

	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void __unused
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_page_t m;
	vm_offset_t va_last;
	int bit, field;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va &= ~L2_OFFSET;
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining 511 pv entries. */
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field] != 0) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
				    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	/* XXX PV stats */
}

#if VM_NRESERVLEVEL > 0
static void
pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_page_t m;
	vm_offset_t va_last;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	KASSERT((va & L2_OFFSET) == 0,
	    ("pmap_pv_promote_l2: misaligned va %#lx", va));

	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	m = PHYS_TO_VM_PAGE(pa);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;

	va_last = va + L2_SIZE - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
PAGE_SIZE; 1953 pmap_pvh_free(&m->md, pmap, va); 1954 } while (va < va_last); 1955 } 1956 #endif /* VM_NRESERVLEVEL > 0 */ 1957 1958 /* 1959 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1960 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1961 * false if the PV entry cannot be allocated without resorting to reclamation. 1962 */ 1963 static bool 1964 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1965 struct rwlock **lockp) 1966 { 1967 struct md_page *pvh; 1968 pv_entry_t pv; 1969 vm_paddr_t pa; 1970 1971 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1972 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1973 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1974 NULL : lockp)) == NULL) 1975 return (false); 1976 pv->pv_va = va; 1977 pa = PTE_TO_PHYS(l2e); 1978 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1979 pvh = pa_to_pvh(pa); 1980 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1981 pvh->pv_gen++; 1982 return (true); 1983 } 1984 1985 static void 1986 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1987 { 1988 pt_entry_t newl2, oldl2; 1989 vm_page_t ml3; 1990 vm_paddr_t ml3pa; 1991 1992 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1993 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1994 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1995 1996 ml3 = pmap_remove_pt_page(pmap, va); 1997 if (ml3 == NULL) 1998 panic("pmap_remove_kernel_l2: Missing pt page"); 1999 2000 ml3pa = VM_PAGE_TO_PHYS(ml3); 2001 newl2 = ml3pa | PTE_V; 2002 2003 /* 2004 * Initialize the page table page. 2005 */ 2006 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2007 2008 /* 2009 * Demote the mapping. 2010 */ 2011 oldl2 = pmap_load_store(l2, newl2); 2012 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2013 __func__, l2, oldl2)); 2014 } 2015 2016 /* 2017 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2018 */ 2019 static int 2020 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2021 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2022 { 2023 struct md_page *pvh; 2024 pt_entry_t oldl2; 2025 vm_offset_t eva, va; 2026 vm_page_t m, ml3; 2027 2028 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2029 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2030 oldl2 = pmap_load_clear(l2); 2031 KASSERT((oldl2 & PTE_RWX) != 0, 2032 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2033 2034 /* 2035 * The sfence.vma documentation states that it is sufficient to specify 2036 * a single address within a superpage mapping. However, since we do 2037 * not perform any invalidation upon promotion, TLBs may still be 2038 * caching 4KB mappings within the superpage, so we must invalidate the 2039 * entire range. 
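* Naming only a single 4KB page in the sfence.vma could otherwise leave stale translations for the remaining pages of the region in the TLB.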
2040 */ 2041 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2042 if ((oldl2 & PTE_SW_WIRED) != 0) 2043 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2044 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2045 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2046 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2047 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2048 pmap_pvh_free(pvh, pmap, sva); 2049 eva = sva + L2_SIZE; 2050 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2051 va < eva; va += PAGE_SIZE, m++) { 2052 if ((oldl2 & PTE_D) != 0) 2053 vm_page_dirty(m); 2054 if ((oldl2 & PTE_A) != 0) 2055 vm_page_aflag_set(m, PGA_REFERENCED); 2056 if (TAILQ_EMPTY(&m->md.pv_list) && 2057 TAILQ_EMPTY(&pvh->pv_list)) 2058 vm_page_aflag_clear(m, PGA_WRITEABLE); 2059 } 2060 } 2061 if (pmap == kernel_pmap) { 2062 pmap_remove_kernel_l2(pmap, l2, sva); 2063 } else { 2064 ml3 = pmap_remove_pt_page(pmap, sva); 2065 if (ml3 != NULL) { 2066 pmap_resident_count_dec(pmap, 1); 2067 KASSERT(ml3->wire_count == Ln_ENTRIES, 2068 ("pmap_remove_l2: l3 page wire count error")); 2069 ml3->wire_count = 1; 2070 vm_page_unwire_noq(ml3); 2071 pmap_add_delayed_free_list(ml3, free, FALSE); 2072 } 2073 } 2074 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2075 } 2076 2077 /* 2078 * pmap_remove_l3: do the things to unmap a page in a process 2079 */ 2080 static int 2081 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2082 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2083 { 2084 pt_entry_t old_l3; 2085 vm_paddr_t phys; 2086 vm_page_t m; 2087 2088 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2089 old_l3 = pmap_load_clear(l3); 2090 pmap_invalidate_page(pmap, va); 2091 if (old_l3 & PTE_SW_WIRED) 2092 pmap->pm_stats.wired_count -= 1; 2093 pmap_resident_count_dec(pmap, 1); 2094 if (old_l3 & PTE_SW_MANAGED) { 2095 phys = PTE_TO_PHYS(old_l3); 2096 m = PHYS_TO_VM_PAGE(phys); 2097 if ((old_l3 & PTE_D) != 0) 2098 vm_page_dirty(m); 2099 if (old_l3 & PTE_A) 2100 vm_page_aflag_set(m, PGA_REFERENCED); 2101 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2102 pmap_pvh_free(&m->md, pmap, va); 2103 } 2104 2105 return (pmap_unuse_pt(pmap, va, l2e, free)); 2106 } 2107 2108 /* 2109 * Remove the given range of addresses from the specified map. 2110 * 2111 * It is assumed that the start and end are properly 2112 * rounded to the page size. 2113 */ 2114 void 2115 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2116 { 2117 struct spglist free; 2118 struct rwlock *lock; 2119 vm_offset_t va, va_next; 2120 pd_entry_t *l1, *l2, l2e; 2121 pt_entry_t *l3; 2122 2123 /* 2124 * Perform an unsynchronized read. This is, however, safe. 2125 */ 2126 if (pmap->pm_stats.resident_count == 0) 2127 return; 2128 2129 SLIST_INIT(&free); 2130 2131 rw_rlock(&pvh_global_lock); 2132 PMAP_LOCK(pmap); 2133 2134 lock = NULL; 2135 for (; sva < eva; sva = va_next) { 2136 if (pmap->pm_stats.resident_count == 0) 2137 break; 2138 2139 l1 = pmap_l1(pmap, sva); 2140 if (pmap_load(l1) == 0) { 2141 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2142 if (va_next < sva) 2143 va_next = eva; 2144 continue; 2145 } 2146 2147 /* 2148 * Calculate index for next page table. 
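* That is, va_next is the first address translated by the next L2 entry; the wrap-around check below clamps it to eva at the top of the address space.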
2149 */ 2150 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2151 if (va_next < sva) 2152 va_next = eva; 2153 2154 l2 = pmap_l1_to_l2(l1, sva); 2155 if (l2 == NULL) 2156 continue; 2157 if ((l2e = pmap_load(l2)) == 0) 2158 continue; 2159 if ((l2e & PTE_RWX) != 0) { 2160 if (sva + L2_SIZE == va_next && eva >= va_next) { 2161 (void)pmap_remove_l2(pmap, l2, sva, 2162 pmap_load(l1), &free, &lock); 2163 continue; 2164 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2165 &lock)) { 2166 /* 2167 * The large page mapping was destroyed. 2168 */ 2169 continue; 2170 } 2171 l2e = pmap_load(l2); 2172 } 2173 2174 /* 2175 * Limit our scan to either the end of the va represented 2176 * by the current page table page, or to the end of the 2177 * range being removed. 2178 */ 2179 if (va_next > eva) 2180 va_next = eva; 2181 2182 va = va_next; 2183 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2184 sva += L3_SIZE) { 2185 if (pmap_load(l3) == 0) { 2186 if (va != va_next) { 2187 pmap_invalidate_range(pmap, va, sva); 2188 va = va_next; 2189 } 2190 continue; 2191 } 2192 if (va == va_next) 2193 va = sva; 2194 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2195 sva += L3_SIZE; 2196 break; 2197 } 2198 } 2199 if (va != va_next) 2200 pmap_invalidate_range(pmap, va, sva); 2201 } 2202 if (lock != NULL) 2203 rw_wunlock(lock); 2204 rw_runlock(&pvh_global_lock); 2205 PMAP_UNLOCK(pmap); 2206 vm_page_free_pages_toq(&free, false); 2207 } 2208 2209 /* 2210 * Routine: pmap_remove_all 2211 * Function: 2212 * Removes this physical page from 2213 * all physical maps in which it resides. 2214 * Reflects back modify bits to the pager. 2215 * 2216 * Notes: 2217 * Original versions of this routine were very 2218 * inefficient because they iteratively called 2219 * pmap_remove (slow...) 2220 */ 2221 2222 void 2223 pmap_remove_all(vm_page_t m) 2224 { 2225 struct spglist free; 2226 struct md_page *pvh; 2227 pmap_t pmap; 2228 pt_entry_t *l3, l3e; 2229 pd_entry_t *l2, l2e; 2230 pv_entry_t pv; 2231 vm_offset_t va; 2232 2233 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2234 ("pmap_remove_all: page %p is not managed", m)); 2235 SLIST_INIT(&free); 2236 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2237 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2238 2239 rw_wlock(&pvh_global_lock); 2240 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2241 pmap = PV_PMAP(pv); 2242 PMAP_LOCK(pmap); 2243 va = pv->pv_va; 2244 l2 = pmap_l2(pmap, va); 2245 (void)pmap_demote_l2(pmap, l2, va); 2246 PMAP_UNLOCK(pmap); 2247 } 2248 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2249 pmap = PV_PMAP(pv); 2250 PMAP_LOCK(pmap); 2251 pmap_resident_count_dec(pmap, 1); 2252 l2 = pmap_l2(pmap, pv->pv_va); 2253 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2254 l2e = pmap_load(l2); 2255 2256 KASSERT((l2e & PTE_RX) == 0, 2257 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2258 2259 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2260 l3e = pmap_load_clear(l3); 2261 pmap_invalidate_page(pmap, pv->pv_va); 2262 if (l3e & PTE_SW_WIRED) 2263 pmap->pm_stats.wired_count--; 2264 if ((l3e & PTE_A) != 0) 2265 vm_page_aflag_set(m, PGA_REFERENCED); 2266 2267 /* 2268 * Update the vm_page_t clean and reference bits. 
2269 */ 2270 if ((l3e & PTE_D) != 0) 2271 vm_page_dirty(m); 2272 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2273 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2274 m->md.pv_gen++; 2275 free_pv_entry(pmap, pv); 2276 PMAP_UNLOCK(pmap); 2277 } 2278 vm_page_aflag_clear(m, PGA_WRITEABLE); 2279 rw_wunlock(&pvh_global_lock); 2280 vm_page_free_pages_toq(&free, false); 2281 } 2282 2283 /* 2284 * Set the physical protection on the 2285 * specified range of this map as requested. 2286 */ 2287 void 2288 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2289 { 2290 pd_entry_t *l1, *l2, l2e; 2291 pt_entry_t *l3, l3e, mask; 2292 vm_page_t m; 2293 vm_paddr_t pa; 2294 vm_offset_t va, va_next; 2295 bool anychanged, pv_lists_locked; 2296 2297 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2298 pmap_remove(pmap, sva, eva); 2299 return; 2300 } 2301 2302 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2303 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2304 return; 2305 2306 anychanged = false; 2307 pv_lists_locked = false; 2308 mask = 0; 2309 if ((prot & VM_PROT_WRITE) == 0) 2310 mask |= PTE_W | PTE_D; 2311 if ((prot & VM_PROT_EXECUTE) == 0) 2312 mask |= PTE_X; 2313 resume: 2314 PMAP_LOCK(pmap); 2315 for (; sva < eva; sva = va_next) { 2316 l1 = pmap_l1(pmap, sva); 2317 if (pmap_load(l1) == 0) { 2318 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2319 if (va_next < sva) 2320 va_next = eva; 2321 continue; 2322 } 2323 2324 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2325 if (va_next < sva) 2326 va_next = eva; 2327 2328 l2 = pmap_l1_to_l2(l1, sva); 2329 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2330 continue; 2331 if ((l2e & PTE_RWX) != 0) { 2332 if (sva + L2_SIZE == va_next && eva >= va_next) { 2333 retryl2: 2334 if ((l2e & (PTE_SW_MANAGED | PTE_D)) == 2335 (PTE_SW_MANAGED | PTE_D)) { 2336 pa = PTE_TO_PHYS(l2e); 2337 for (va = sva, m = PHYS_TO_VM_PAGE(pa); 2338 va < va_next; m++, va += PAGE_SIZE) 2339 vm_page_dirty(m); 2340 } 2341 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2342 goto retryl2; 2343 anychanged = true; 2344 } else { 2345 if (!pv_lists_locked) { 2346 pv_lists_locked = true; 2347 if (!rw_try_rlock(&pvh_global_lock)) { 2348 if (anychanged) 2349 pmap_invalidate_all( 2350 pmap); 2351 PMAP_UNLOCK(pmap); 2352 rw_rlock(&pvh_global_lock); 2353 goto resume; 2354 } 2355 } 2356 if (!pmap_demote_l2(pmap, l2, sva)) { 2357 /* 2358 * The large page mapping was destroyed. 
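* A failed demotion removes every mapping in this 2MB region (see pmap_demote_l2_locked()), so there is nothing left to protect here.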
2359 */ 2360 continue; 2361 } 2362 } 2363 } 2364 2365 if (va_next > eva) 2366 va_next = eva; 2367 2368 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2369 sva += L3_SIZE) { 2370 l3e = pmap_load(l3); 2371 retryl3: 2372 if ((l3e & PTE_V) == 0) 2373 continue; 2374 if ((prot & VM_PROT_WRITE) == 0 && 2375 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2376 (PTE_SW_MANAGED | PTE_D)) { 2377 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2378 vm_page_dirty(m); 2379 } 2380 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2381 goto retryl3; 2382 anychanged = true; 2383 } 2384 } 2385 if (anychanged) 2386 pmap_invalidate_all(pmap); 2387 if (pv_lists_locked) 2388 rw_runlock(&pvh_global_lock); 2389 PMAP_UNLOCK(pmap); 2390 } 2391 2392 int 2393 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2394 { 2395 pd_entry_t *l2, l2e; 2396 pt_entry_t bits, *pte, oldpte; 2397 int rv; 2398 2399 rv = 0; 2400 PMAP_LOCK(pmap); 2401 l2 = pmap_l2(pmap, va); 2402 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2403 goto done; 2404 if ((l2e & PTE_RWX) == 0) { 2405 pte = pmap_l2_to_l3(l2, va); 2406 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2407 goto done; 2408 } else { 2409 pte = l2; 2410 oldpte = l2e; 2411 } 2412 2413 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2414 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2415 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2416 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2417 goto done; 2418 2419 bits = PTE_A; 2420 if (ftype == VM_PROT_WRITE) 2421 bits |= PTE_D; 2422 2423 /* 2424 * Spurious faults can occur if the implementation caches invalid 2425 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2426 * race with each other. 2427 */ 2428 if ((oldpte & bits) != bits) 2429 pmap_store_bits(pte, bits); 2430 sfence_vma(); 2431 rv = 1; 2432 done: 2433 PMAP_UNLOCK(pmap); 2434 return (rv); 2435 } 2436 2437 static bool 2438 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2439 { 2440 struct rwlock *lock; 2441 bool rv; 2442 2443 lock = NULL; 2444 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2445 if (lock != NULL) 2446 rw_wunlock(lock); 2447 return (rv); 2448 } 2449 2450 /* 2451 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2452 * mapping is invalidated. 2453 */ 2454 static bool 2455 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2456 struct rwlock **lockp) 2457 { 2458 struct spglist free; 2459 vm_page_t mpte; 2460 pd_entry_t newl2, oldl2; 2461 pt_entry_t *firstl3, newl3; 2462 vm_paddr_t mptepa; 2463 int i; 2464 2465 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2466 2467 oldl2 = pmap_load(l2); 2468 KASSERT((oldl2 & PTE_RWX) != 0, 2469 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2470 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2471 NULL) { 2472 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, 2473 pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ?
VM_ALLOC_INTERRUPT : 2474 VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == 2475 NULL) { 2476 SLIST_INIT(&free); 2477 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2478 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2479 vm_page_free_pages_toq(&free, true); 2480 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2481 "failure for va %#lx in pmap %p", va, pmap); 2482 return (false); 2483 } 2484 if (va < VM_MAXUSER_ADDRESS) 2485 pmap_resident_count_inc(pmap, 1); 2486 } 2487 mptepa = VM_PAGE_TO_PHYS(mpte); 2488 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2489 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2490 KASSERT((oldl2 & PTE_A) != 0, 2491 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2492 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2493 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2494 newl3 = oldl2; 2495 2496 /* 2497 * If the page table page is new, initialize it. 2498 */ 2499 if (mpte->wire_count == 1) { 2500 mpte->wire_count = Ln_ENTRIES; 2501 for (i = 0; i < Ln_ENTRIES; i++) 2502 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2503 } 2504 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2505 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2506 "addresses")); 2507 2508 /* 2509 * If the mapping has changed attributes, update the page table 2510 * entries. 2511 */ 2512 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2513 for (i = 0; i < Ln_ENTRIES; i++) 2514 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2515 2516 /* 2517 * The spare PV entries must be reserved prior to demoting the 2518 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2519 * state of the L2 entry and the PV lists will be inconsistent, which 2520 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2521 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2522 * expected PV entry for the 2MB page mapping that is being demoted. 2523 */ 2524 if ((oldl2 & PTE_SW_MANAGED) != 0) 2525 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2526 2527 /* 2528 * Demote the mapping. 2529 */ 2530 pmap_store(l2, newl2); 2531 2532 /* 2533 * Demote the PV entry. 
2534 */ 2535 if ((oldl2 & PTE_SW_MANAGED) != 0) 2536 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2537 2538 atomic_add_long(&pmap_l2_demotions, 1); 2539 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2540 va, pmap); 2541 return (true); 2542 } 2543 2544 #if VM_NRESERVLEVEL > 0 2545 static void 2546 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2547 struct rwlock **lockp) 2548 { 2549 pt_entry_t *firstl3, *l3; 2550 vm_paddr_t pa; 2551 vm_page_t ml3; 2552 2553 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2554 2555 va &= ~L2_OFFSET; 2556 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2557 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2558 2559 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2560 pa = PTE_TO_PHYS(pmap_load(firstl3)); 2561 if ((pa & L2_OFFSET) != 0) { 2562 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2563 va, pmap); 2564 atomic_add_long(&pmap_l2_p_failures, 1); 2565 return; 2566 } 2567 2568 pa += PAGE_SIZE; 2569 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2570 if (PTE_TO_PHYS(pmap_load(l3)) != pa) { 2571 CTR2(KTR_PMAP, 2572 "pmap_promote_l2: failure for va %#lx pmap %p", 2573 va, pmap); 2574 atomic_add_long(&pmap_l2_p_failures, 1); 2575 return; 2576 } 2577 if ((pmap_load(l3) & PTE_PROMOTE) != 2578 (pmap_load(firstl3) & PTE_PROMOTE)) { 2579 CTR2(KTR_PMAP, 2580 "pmap_promote_l2: failure for va %#lx pmap %p", 2581 va, pmap); 2582 atomic_add_long(&pmap_l2_p_failures, 1); 2583 return; 2584 } 2585 pa += PAGE_SIZE; 2586 } 2587 2588 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2589 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2590 ("pmap_promote_l2: page table page's pindex is wrong")); 2591 if (pmap_insert_pt_page(pmap, ml3)) { 2592 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2593 va, pmap); 2594 atomic_add_long(&pmap_l2_p_failures, 1); 2595 return; 2596 } 2597 2598 if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) 2599 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), 2600 lockp); 2601 2602 pmap_store(l2, pmap_load(firstl3)); 2603 2604 atomic_add_long(&pmap_l2_promotions, 1); 2605 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2606 pmap); 2607 } 2608 #endif 2609 2610 /* 2611 * Insert the given physical page (p) at 2612 * the specified virtual address (v) in the 2613 * target physical map with the protection requested. 2614 * 2615 * If specified, the page will be wired down, meaning 2616 * that the related pte can not be reclaimed. 2617 * 2618 * NB: This is the only routine which MAY NOT lazy-evaluate 2619 * or lose information. That is, this routine must actually 2620 * insert this page into the given map NOW. 
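* Returns KERN_SUCCESS when the mapping is established. If PMAP_ENTER_NOSLEEP is specified and a page table page cannot be allocated, KERN_RESOURCE_SHORTAGE is returned instead.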
2621 */ 2622 int 2623 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2624 u_int flags, int8_t psind) 2625 { 2626 struct rwlock *lock; 2627 pd_entry_t *l1, *l2, l2e; 2628 pt_entry_t new_l3, orig_l3; 2629 pt_entry_t *l3; 2630 pv_entry_t pv; 2631 vm_paddr_t opa, pa, l2_pa, l3_pa; 2632 vm_page_t mpte, om, l2_m, l3_m; 2633 pt_entry_t entry; 2634 pn_t l2_pn, l3_pn, pn; 2635 int rv; 2636 bool nosleep; 2637 2638 va = trunc_page(va); 2639 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 2640 VM_OBJECT_ASSERT_LOCKED(m->object); 2641 pa = VM_PAGE_TO_PHYS(m); 2642 pn = (pa / PAGE_SIZE); 2643 2644 new_l3 = PTE_V | PTE_R | PTE_A; 2645 if (prot & VM_PROT_EXECUTE) 2646 new_l3 |= PTE_X; 2647 if (flags & VM_PROT_WRITE) 2648 new_l3 |= PTE_D; 2649 if (prot & VM_PROT_WRITE) 2650 new_l3 |= PTE_W; 2651 if (va < VM_MAX_USER_ADDRESS) 2652 new_l3 |= PTE_U; 2653 2654 new_l3 |= (pn << PTE_PPN0_S); 2655 if ((flags & PMAP_ENTER_WIRED) != 0) 2656 new_l3 |= PTE_SW_WIRED; 2657 2658 /* 2659 * Set modified bit gratuitously for writeable mappings if 2660 * the page is unmanaged. We do not want to take a fault 2661 * to do the dirty bit accounting for these mappings. 2662 */ 2663 if ((m->oflags & VPO_UNMANAGED) != 0) { 2664 if (prot & VM_PROT_WRITE) 2665 new_l3 |= PTE_D; 2666 } else 2667 new_l3 |= PTE_SW_MANAGED; 2668 2669 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2670 2671 lock = NULL; 2672 mpte = NULL; 2673 rw_rlock(&pvh_global_lock); 2674 PMAP_LOCK(pmap); 2675 if (psind == 1) { 2676 /* Assert the required virtual and physical alignment. */ 2677 KASSERT((va & L2_OFFSET) == 0, 2678 ("pmap_enter: va %#lx unaligned", va)); 2679 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2680 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2681 goto out; 2682 } 2683 2684 l2 = pmap_l2(pmap, va); 2685 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2686 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2687 va, &lock))) { 2688 l3 = pmap_l2_to_l3(l2, va); 2689 if (va < VM_MAXUSER_ADDRESS) { 2690 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2691 mpte->wire_count++; 2692 } 2693 } else if (va < VM_MAXUSER_ADDRESS) { 2694 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2695 mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); 2696 if (mpte == NULL && nosleep) { 2697 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2698 if (lock != NULL) 2699 rw_wunlock(lock); 2700 rw_runlock(&pvh_global_lock); 2701 PMAP_UNLOCK(pmap); 2702 return (KERN_RESOURCE_SHORTAGE); 2703 } 2704 l3 = pmap_l3(pmap, va); 2705 } else { 2706 l3 = pmap_l3(pmap, va); 2707 /* TODO: This is not optimal, but should mostly work */ 2708 if (l3 == NULL) { 2709 if (l2 == NULL) { 2710 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2711 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2712 VM_ALLOC_ZERO); 2713 if (l2_m == NULL) 2714 panic("pmap_enter: l2 pte_m == NULL"); 2715 if ((l2_m->flags & PG_ZERO) == 0) 2716 pmap_zero_page(l2_m); 2717 2718 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2719 l2_pn = (l2_pa / PAGE_SIZE); 2720 2721 l1 = pmap_l1(pmap, va); 2722 entry = (PTE_V); 2723 entry |= (l2_pn << PTE_PPN0_S); 2724 pmap_store(l1, entry); 2725 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2726 l2 = pmap_l1_to_l2(l1, va); 2727 } 2728 2729 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2730 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2731 if (l3_m == NULL) 2732 panic("pmap_enter: l3 pte_m == NULL"); 2733 if ((l3_m->flags & PG_ZERO) == 0) 2734 pmap_zero_page(l3_m); 2735 2736 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2737 l3_pn = (l3_pa / PAGE_SIZE); 2738 entry = (PTE_V); 2739 entry |= (l3_pn << PTE_PPN0_S); 2740 pmap_store(l2, entry); 2741 l3 = pmap_l2_to_l3(l2, va); 2742 } 2743 pmap_invalidate_page(pmap, va); 2744 } 2745 2746 orig_l3 = pmap_load(l3); 2747 opa = PTE_TO_PHYS(orig_l3); 2748 pv = NULL; 2749 2750 /* 2751 * Is the specified virtual address already mapped? 2752 */ 2753 if ((orig_l3 & PTE_V) != 0) { 2754 /* 2755 * Wiring change, just update stats. We don't worry about 2756 * wiring PT pages as they remain resident as long as there 2757 * are valid mappings in them. Hence, if a user page is wired, 2758 * the PT page will be also. 2759 */ 2760 if ((flags & PMAP_ENTER_WIRED) != 0 && 2761 (orig_l3 & PTE_SW_WIRED) == 0) 2762 pmap->pm_stats.wired_count++; 2763 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2764 (orig_l3 & PTE_SW_WIRED) != 0) 2765 pmap->pm_stats.wired_count--; 2766 2767 /* 2768 * Remove the extra PT page reference. 2769 */ 2770 if (mpte != NULL) { 2771 mpte->wire_count--; 2772 KASSERT(mpte->wire_count > 0, 2773 ("pmap_enter: missing reference to page table page," 2774 " va: 0x%lx", va)); 2775 } 2776 2777 /* 2778 * Has the physical page changed? 2779 */ 2780 if (opa == pa) { 2781 /* 2782 * No, might be a protection or wiring change. 2783 */ 2784 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2785 (new_l3 & PTE_W) != 0) 2786 vm_page_aflag_set(m, PGA_WRITEABLE); 2787 goto validate; 2788 } 2789 2790 /* 2791 * The physical page has changed. Temporarily invalidate 2792 * the mapping. This ensures that all threads sharing the 2793 * pmap keep a consistent view of the mapping, which is 2794 * necessary for the correct handling of COW faults. It 2795 * also permits reuse of the old mapping's PV entry, 2796 * avoiding an allocation. 2797 * 2798 * For consistency, handle unmanaged mappings the same way. 2799 */ 2800 orig_l3 = pmap_load_clear(l3); 2801 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2802 ("pmap_enter: unexpected pa update for %#lx", va)); 2803 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2804 om = PHYS_TO_VM_PAGE(opa); 2805 2806 /* 2807 * The pmap lock is sufficient to synchronize with 2808 * concurrent calls to pmap_page_test_mappings() and 2809 * pmap_ts_referenced(). 
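* Both of those functions acquire the pmap lock before examining the PTE.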
2810 */ 2811 if ((orig_l3 & PTE_D) != 0) 2812 vm_page_dirty(om); 2813 if ((orig_l3 & PTE_A) != 0) 2814 vm_page_aflag_set(om, PGA_REFERENCED); 2815 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2816 pv = pmap_pvh_remove(&om->md, pmap, va); 2817 KASSERT(pv != NULL, 2818 ("pmap_enter: no PV entry for %#lx", va)); 2819 if ((new_l3 & PTE_SW_MANAGED) == 0) 2820 free_pv_entry(pmap, pv); 2821 if ((om->aflags & PGA_WRITEABLE) != 0 && 2822 TAILQ_EMPTY(&om->md.pv_list)) 2823 vm_page_aflag_clear(om, PGA_WRITEABLE); 2824 } 2825 pmap_invalidate_page(pmap, va); 2826 orig_l3 = 0; 2827 } else { 2828 /* 2829 * Increment the counters. 2830 */ 2831 if ((new_l3 & PTE_SW_WIRED) != 0) 2832 pmap->pm_stats.wired_count++; 2833 pmap_resident_count_inc(pmap, 1); 2834 } 2835 /* 2836 * Enter on the PV list if part of our managed memory. 2837 */ 2838 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2839 if (pv == NULL) { 2840 pv = get_pv_entry(pmap, &lock); 2841 pv->pv_va = va; 2842 } 2843 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2844 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2845 m->md.pv_gen++; 2846 if ((new_l3 & PTE_W) != 0) 2847 vm_page_aflag_set(m, PGA_WRITEABLE); 2848 } 2849 2850 validate: 2851 /* 2852 * Sync the i-cache on all harts before updating the PTE 2853 * if the new PTE is executable. 2854 */ 2855 if (prot & VM_PROT_EXECUTE) 2856 pmap_sync_icache(pmap, va, PAGE_SIZE); 2857 2858 /* 2859 * Update the L3 entry. 2860 */ 2861 if (orig_l3 != 0) { 2862 orig_l3 = pmap_load_store(l3, new_l3); 2863 pmap_invalidate_page(pmap, va); 2864 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2865 ("pmap_enter: invalid update")); 2866 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2867 (PTE_D | PTE_SW_MANAGED)) 2868 vm_page_dirty(m); 2869 } else { 2870 pmap_store(l3, new_l3); 2871 } 2872 2873 #if VM_NRESERVLEVEL > 0 2874 if (mpte != NULL && mpte->wire_count == Ln_ENTRIES && 2875 pmap_ps_enabled(pmap) && 2876 (m->flags & PG_FICTITIOUS) == 0 && 2877 vm_reserv_level_iffullpop(m) == 0) 2878 pmap_promote_l2(pmap, l2, va, &lock); 2879 #endif 2880 2881 rv = KERN_SUCCESS; 2882 out: 2883 if (lock != NULL) 2884 rw_wunlock(lock); 2885 rw_runlock(&pvh_global_lock); 2886 PMAP_UNLOCK(pmap); 2887 return (rv); 2888 } 2889 2890 /* 2891 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2892 * if successful. Returns false if (1) a page table page cannot be allocated 2893 * without sleeping, (2) a mapping already exists at the specified virtual 2894 * address, or (3) a PV entry cannot be allocated without reclaiming another 2895 * PV entry. 2896 */ 2897 static bool 2898 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2899 struct rwlock **lockp) 2900 { 2901 pd_entry_t new_l2; 2902 pn_t pn; 2903 2904 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2905 2906 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2907 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2908 if ((m->oflags & VPO_UNMANAGED) == 0) 2909 new_l2 |= PTE_SW_MANAGED; 2910 if ((prot & VM_PROT_EXECUTE) != 0) 2911 new_l2 |= PTE_X; 2912 if (va < VM_MAXUSER_ADDRESS) 2913 new_l2 |= PTE_U; 2914 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2915 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2916 KERN_SUCCESS); 2917 } 2918 2919 /* 2920 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2921 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2922 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2923 * a mapping already exists at the specified virtual address. 
Returns 2924 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2925 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2926 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2927 * 2928 * The parameter "m" is only used when creating a managed, writeable mapping. 2929 */ 2930 static int 2931 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2932 vm_page_t m, struct rwlock **lockp) 2933 { 2934 struct spglist free; 2935 pd_entry_t *l2, *l3, oldl2; 2936 vm_offset_t sva; 2937 vm_page_t l2pg, mt; 2938 2939 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2940 2941 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2942 NULL : lockp)) == NULL) { 2943 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2944 va, pmap); 2945 return (KERN_RESOURCE_SHORTAGE); 2946 } 2947 2948 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2949 l2 = &l2[pmap_l2_index(va)]; 2950 if ((oldl2 = pmap_load(l2)) != 0) { 2951 KASSERT(l2pg->wire_count > 1, 2952 ("pmap_enter_l2: l2pg's wire count is too low")); 2953 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2954 l2pg->wire_count--; 2955 CTR2(KTR_PMAP, 2956 "pmap_enter_l2: failure for va %#lx in pmap %p", 2957 va, pmap); 2958 return (KERN_FAILURE); 2959 } 2960 SLIST_INIT(&free); 2961 if ((oldl2 & PTE_RWX) != 0) 2962 (void)pmap_remove_l2(pmap, l2, va, 2963 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2964 else 2965 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 2966 l3 = pmap_l2_to_l3(l2, sva); 2967 if ((pmap_load(l3) & PTE_V) != 0 && 2968 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 2969 lockp) != 0) 2970 break; 2971 } 2972 vm_page_free_pages_toq(&free, true); 2973 if (va >= VM_MAXUSER_ADDRESS) { 2974 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2975 if (pmap_insert_pt_page(pmap, mt)) { 2976 /* 2977 * XXX Currently, this can't happen because 2978 * we do not perform pmap_enter(psind == 1) 2979 * on the kernel pmap. 2980 */ 2981 panic("pmap_enter_l2: trie insert failed"); 2982 } 2983 } else 2984 KASSERT(pmap_load(l2) == 0, 2985 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 2986 } 2987 2988 if ((new_l2 & PTE_SW_MANAGED) != 0) { 2989 /* 2990 * Abort this mapping if its PV entry could not be created. 2991 */ 2992 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 2993 SLIST_INIT(&free); 2994 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 2995 /* 2996 * Although "va" is not mapped, paging-structure 2997 * caches could nonetheless have entries that 2998 * refer to the freed page table pages. 2999 * Invalidate those entries. 3000 */ 3001 pmap_invalidate_page(pmap, va); 3002 vm_page_free_pages_toq(&free, true); 3003 } 3004 CTR2(KTR_PMAP, 3005 "pmap_enter_l2: failure for va %#lx in pmap %p", 3006 va, pmap); 3007 return (KERN_RESOURCE_SHORTAGE); 3008 } 3009 if ((new_l2 & PTE_W) != 0) 3010 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3011 vm_page_aflag_set(mt, PGA_WRITEABLE); 3012 } 3013 3014 /* 3015 * Increment counters. 3016 */ 3017 if ((new_l2 & PTE_SW_WIRED) != 0) 3018 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3019 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3020 3021 /* 3022 * Map the superpage. 3023 */ 3024 pmap_store(l2, new_l2); 3025 3026 atomic_add_long(&pmap_l2_mappings, 1); 3027 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3028 va, pmap); 3029 3030 return (KERN_SUCCESS); 3031 } 3032 3033 /* 3034 * Maps a sequence of resident pages belonging to the same object.
3035 * The sequence begins with the given page m_start. This page is 3036 * mapped at the given virtual address start. Each subsequent page is 3037 * mapped at a virtual address that is offset from start by the same 3038 * amount as the page is offset from m_start within the object. The 3039 * last page in the sequence is the page with the largest offset from 3040 * m_start that can be mapped at a virtual address less than the given 3041 * virtual address end. Not every virtual page between start and end 3042 * is mapped; only those for which a resident page exists with the 3043 * corresponding offset from m_start are mapped. 3044 */ 3045 void 3046 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3047 vm_page_t m_start, vm_prot_t prot) 3048 { 3049 struct rwlock *lock; 3050 vm_offset_t va; 3051 vm_page_t m, mpte; 3052 vm_pindex_t diff, psize; 3053 3054 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3055 3056 psize = atop(end - start); 3057 mpte = NULL; 3058 m = m_start; 3059 lock = NULL; 3060 rw_rlock(&pvh_global_lock); 3061 PMAP_LOCK(pmap); 3062 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3063 va = start + ptoa(diff); 3064 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3065 m->psind == 1 && pmap_ps_enabled(pmap) && 3066 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3067 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3068 else 3069 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3070 &lock); 3071 m = TAILQ_NEXT(m, listq); 3072 } 3073 if (lock != NULL) 3074 rw_wunlock(lock); 3075 rw_runlock(&pvh_global_lock); 3076 PMAP_UNLOCK(pmap); 3077 } 3078 3079 /* 3080 * this code makes some *MAJOR* assumptions: 3081 * 1. Current pmap & pmap exists. 3082 * 2. Not wired. 3083 * 3. Read access. 3084 * 4. No page table pages. 3085 * but is *MUCH* faster than pmap_enter... 3086 */ 3087 3088 void 3089 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3090 { 3091 struct rwlock *lock; 3092 3093 lock = NULL; 3094 rw_rlock(&pvh_global_lock); 3095 PMAP_LOCK(pmap); 3096 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3097 if (lock != NULL) 3098 rw_wunlock(lock); 3099 rw_runlock(&pvh_global_lock); 3100 PMAP_UNLOCK(pmap); 3101 } 3102 3103 static vm_page_t 3104 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3105 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3106 { 3107 struct spglist free; 3108 vm_paddr_t phys; 3109 pd_entry_t *l2; 3110 pt_entry_t *l3, newl3; 3111 3112 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3113 (m->oflags & VPO_UNMANAGED) != 0, 3114 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3115 rw_assert(&pvh_global_lock, RA_LOCKED); 3116 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3117 3118 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3119 /* 3120 * In the case that a page table page is not 3121 * resident, we are creating it here. 3122 */ 3123 if (va < VM_MAXUSER_ADDRESS) { 3124 vm_pindex_t l2pindex; 3125 3126 /* 3127 * Calculate pagetable page index 3128 */ 3129 l2pindex = pmap_l2_pindex(va); 3130 if (mpte && (mpte->pindex == l2pindex)) { 3131 mpte->wire_count++; 3132 } else { 3133 /* 3134 * Get the l2 entry 3135 */ 3136 l2 = pmap_l2(pmap, va); 3137 3138 /* 3139 * If the page table page is mapped, we just increment 3140 * the hold count, and activate it. Otherwise, we 3141 * attempt to allocate a page table page. If this 3142 * attempt fails, we don't retry. Instead, we give up. 
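* Giving up is harmless here: these mappings are opportunistic, and the page can still be mapped by pmap_enter() if it is actually needed.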
3143 */ 3144 if (l2 != NULL && pmap_load(l2) != 0) { 3145 phys = PTE_TO_PHYS(pmap_load(l2)); 3146 mpte = PHYS_TO_VM_PAGE(phys); 3147 mpte->wire_count++; 3148 } else { 3149 /* 3150 * Pass NULL instead of the PV list lock 3151 * pointer, because we don't intend to sleep. 3152 */ 3153 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3154 if (mpte == NULL) 3155 return (mpte); 3156 } 3157 } 3158 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3159 l3 = &l3[pmap_l3_index(va)]; 3160 } else { 3161 mpte = NULL; 3162 l3 = pmap_l3(kernel_pmap, va); 3163 } 3164 if (l3 == NULL) 3165 panic("pmap_enter_quick_locked: No l3"); 3166 if (pmap_load(l3) != 0) { 3167 if (mpte != NULL) { 3168 mpte->wire_count--; 3169 mpte = NULL; 3170 } 3171 return (mpte); 3172 } 3173 3174 /* 3175 * Enter on the PV list if part of our managed memory. 3176 */ 3177 if ((m->oflags & VPO_UNMANAGED) == 0 && 3178 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3179 if (mpte != NULL) { 3180 SLIST_INIT(&free); 3181 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3182 pmap_invalidate_page(pmap, va); 3183 vm_page_free_pages_toq(&free, false); 3184 } 3185 mpte = NULL; 3186 } 3187 return (mpte); 3188 } 3189 3190 /* 3191 * Increment counters 3192 */ 3193 pmap_resident_count_inc(pmap, 1); 3194 3195 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3196 PTE_V | PTE_R; 3197 if ((prot & VM_PROT_EXECUTE) != 0) 3198 newl3 |= PTE_X; 3199 if ((m->oflags & VPO_UNMANAGED) == 0) 3200 newl3 |= PTE_SW_MANAGED; 3201 if (va < VM_MAX_USER_ADDRESS) 3202 newl3 |= PTE_U; 3203 3204 /* 3205 * Sync the i-cache on all harts before updating the PTE 3206 * if the new PTE is executable. 3207 */ 3208 if (prot & VM_PROT_EXECUTE) 3209 pmap_sync_icache(pmap, va, PAGE_SIZE); 3210 3211 pmap_store(l3, newl3); 3212 3213 pmap_invalidate_page(pmap, va); 3214 return (mpte); 3215 } 3216 3217 /* 3218 * This code maps large physical mmap regions into the 3219 * processor address space. Note that some shortcuts 3220 * are taken, but the code works. 3221 */ 3222 void 3223 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3224 vm_pindex_t pindex, vm_size_t size) 3225 { 3226 3227 VM_OBJECT_ASSERT_WLOCKED(object); 3228 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3229 ("pmap_object_init_pt: non-device object")); 3230 } 3231 3232 /* 3233 * Clear the wired attribute from the mappings for the specified range of 3234 * addresses in the given pmap. Every valid mapping within that range 3235 * must have the wired attribute set. In contrast, invalid mappings 3236 * cannot have the wired attribute set, so they are ignored. 3237 * 3238 * The wired attribute of the page table entry is not a hardware feature, 3239 * so there is no need to invalidate any TLB entries. 
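* (PTE_SW_WIRED lives in the software-defined bits of the PTE, which the hardware page table walker ignores.)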
3240 */ 3241 void 3242 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3243 { 3244 vm_offset_t va_next; 3245 pd_entry_t *l1, *l2, l2e; 3246 pt_entry_t *l3, l3e; 3247 bool pv_lists_locked; 3248 3249 pv_lists_locked = false; 3250 retry: 3251 PMAP_LOCK(pmap); 3252 for (; sva < eva; sva = va_next) { 3253 l1 = pmap_l1(pmap, sva); 3254 if (pmap_load(l1) == 0) { 3255 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3256 if (va_next < sva) 3257 va_next = eva; 3258 continue; 3259 } 3260 3261 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3262 if (va_next < sva) 3263 va_next = eva; 3264 3265 l2 = pmap_l1_to_l2(l1, sva); 3266 if ((l2e = pmap_load(l2)) == 0) 3267 continue; 3268 if ((l2e & PTE_RWX) != 0) { 3269 if (sva + L2_SIZE == va_next && eva >= va_next) { 3270 if ((l2e & PTE_SW_WIRED) == 0) 3271 panic("pmap_unwire: l2 %#jx is missing " 3272 "PTE_SW_WIRED", (uintmax_t)l2e); 3273 pmap_clear_bits(l2, PTE_SW_WIRED); 3274 continue; 3275 } else { 3276 if (!pv_lists_locked) { 3277 pv_lists_locked = true; 3278 if (!rw_try_rlock(&pvh_global_lock)) { 3279 PMAP_UNLOCK(pmap); 3280 rw_rlock(&pvh_global_lock); 3281 /* Repeat sva. */ 3282 goto retry; 3283 } 3284 } 3285 if (!pmap_demote_l2(pmap, l2, sva)) 3286 panic("pmap_unwire: demotion failed"); 3287 } 3288 } 3289 3290 if (va_next > eva) 3291 va_next = eva; 3292 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3293 sva += L3_SIZE) { 3294 if ((l3e = pmap_load(l3)) == 0) 3295 continue; 3296 if ((l3e & PTE_SW_WIRED) == 0) 3297 panic("pmap_unwire: l3 %#jx is missing " 3298 "PTE_SW_WIRED", (uintmax_t)l3e); 3299 3300 /* 3301 * PG_W must be cleared atomically. Although the pmap 3302 * lock synchronizes access to PG_W, another processor 3303 * could be setting PG_M and/or PG_A concurrently. 3304 */ 3305 pmap_clear_bits(l3, PTE_SW_WIRED); 3306 pmap->pm_stats.wired_count--; 3307 } 3308 } 3309 if (pv_lists_locked) 3310 rw_runlock(&pvh_global_lock); 3311 PMAP_UNLOCK(pmap); 3312 } 3313 3314 /* 3315 * Copy the range specified by src_addr/len 3316 * from the source map to the range dst_addr/len 3317 * in the destination map. 3318 * 3319 * This routine is only advisory and need not do anything. 3320 */ 3321 3322 void 3323 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3324 vm_offset_t src_addr) 3325 { 3326 3327 } 3328 3329 /* 3330 * pmap_zero_page zeros the specified hardware page by mapping 3331 * the page into KVM and using bzero to clear its contents. 3332 */ 3333 void 3334 pmap_zero_page(vm_page_t m) 3335 { 3336 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3337 3338 pagezero((void *)va); 3339 } 3340 3341 /* 3342 * pmap_zero_page_area zeros the specified hardware page by mapping 3343 * the page into KVM and using bzero to clear its contents. 3344 * 3345 * off and size may not cover an area beyond a single hardware page. 3346 */ 3347 void 3348 pmap_zero_page_area(vm_page_t m, int off, int size) 3349 { 3350 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3351 3352 if (off == 0 && size == PAGE_SIZE) 3353 pagezero((void *)va); 3354 else 3355 bzero((char *)va + off, size); 3356 } 3357 3358 /* 3359 * pmap_copy_page copies the specified (machine independent) 3360 * page by mapping the page into virtual memory and using 3361 * bcopy to copy the page, one machine dependent page at a 3362 * time. 
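* On this pmap both pages are accessed through the direct map, so no temporary kernel mapping is created.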
3363 */ 3364 void 3365 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3366 { 3367 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3368 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3369 3370 pagecopy((void *)src, (void *)dst); 3371 } 3372 3373 int unmapped_buf_allowed = 1; 3374 3375 void 3376 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3377 vm_offset_t b_offset, int xfersize) 3378 { 3379 void *a_cp, *b_cp; 3380 vm_page_t m_a, m_b; 3381 vm_paddr_t p_a, p_b; 3382 vm_offset_t a_pg_offset, b_pg_offset; 3383 int cnt; 3384 3385 while (xfersize > 0) { 3386 a_pg_offset = a_offset & PAGE_MASK; 3387 m_a = ma[a_offset >> PAGE_SHIFT]; 3388 p_a = m_a->phys_addr; 3389 b_pg_offset = b_offset & PAGE_MASK; 3390 m_b = mb[b_offset >> PAGE_SHIFT]; 3391 p_b = m_b->phys_addr; 3392 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3393 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3394 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3395 panic("!DMAP a %lx", p_a); 3396 } else { 3397 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3398 } 3399 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3400 panic("!DMAP b %lx", p_b); 3401 } else { 3402 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3403 } 3404 bcopy(a_cp, b_cp, cnt); 3405 a_offset += cnt; 3406 b_offset += cnt; 3407 xfersize -= cnt; 3408 } 3409 } 3410 3411 vm_offset_t 3412 pmap_quick_enter_page(vm_page_t m) 3413 { 3414 3415 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3416 } 3417 3418 void 3419 pmap_quick_remove_page(vm_offset_t addr) 3420 { 3421 } 3422 3423 /* 3424 * Returns true if the pmap's pv is one of the first 3425 * 16 pvs linked to from this page. This count may 3426 * be changed upwards or downwards in the future; it 3427 * is only necessary that true be returned for a small 3428 * subset of pmaps for proper page aging. 3429 */ 3430 boolean_t 3431 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3432 { 3433 struct md_page *pvh; 3434 struct rwlock *lock; 3435 pv_entry_t pv; 3436 int loops = 0; 3437 boolean_t rv; 3438 3439 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3440 ("pmap_page_exists_quick: page %p is not managed", m)); 3441 rv = FALSE; 3442 rw_rlock(&pvh_global_lock); 3443 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3444 rw_rlock(lock); 3445 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3446 if (PV_PMAP(pv) == pmap) { 3447 rv = TRUE; 3448 break; 3449 } 3450 loops++; 3451 if (loops >= 16) 3452 break; 3453 } 3454 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3455 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3456 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3457 if (PV_PMAP(pv) == pmap) { 3458 rv = TRUE; 3459 break; 3460 } 3461 loops++; 3462 if (loops >= 16) 3463 break; 3464 } 3465 } 3466 rw_runlock(lock); 3467 rw_runlock(&pvh_global_lock); 3468 return (rv); 3469 } 3470 3471 /* 3472 * pmap_page_wired_mappings: 3473 * 3474 * Return the number of managed mappings to the given physical page 3475 * that are wired. 
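* Both 4KB mappings on the page's own pv list and 2MB mappings on the containing superpage's pv list are counted.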
3476 */ 3477 int 3478 pmap_page_wired_mappings(vm_page_t m) 3479 { 3480 struct md_page *pvh; 3481 struct rwlock *lock; 3482 pmap_t pmap; 3483 pd_entry_t *l2; 3484 pt_entry_t *l3; 3485 pv_entry_t pv; 3486 int count, md_gen, pvh_gen; 3487 3488 if ((m->oflags & VPO_UNMANAGED) != 0) 3489 return (0); 3490 rw_rlock(&pvh_global_lock); 3491 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3492 rw_rlock(lock); 3493 restart: 3494 count = 0; 3495 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3496 pmap = PV_PMAP(pv); 3497 if (!PMAP_TRYLOCK(pmap)) { 3498 md_gen = m->md.pv_gen; 3499 rw_runlock(lock); 3500 PMAP_LOCK(pmap); 3501 rw_rlock(lock); 3502 if (md_gen != m->md.pv_gen) { 3503 PMAP_UNLOCK(pmap); 3504 goto restart; 3505 } 3506 } 3507 l3 = pmap_l3(pmap, pv->pv_va); 3508 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3509 count++; 3510 PMAP_UNLOCK(pmap); 3511 } 3512 if ((m->flags & PG_FICTITIOUS) == 0) { 3513 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3514 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3515 pmap = PV_PMAP(pv); 3516 if (!PMAP_TRYLOCK(pmap)) { 3517 md_gen = m->md.pv_gen; 3518 pvh_gen = pvh->pv_gen; 3519 rw_runlock(lock); 3520 PMAP_LOCK(pmap); 3521 rw_rlock(lock); 3522 if (md_gen != m->md.pv_gen || 3523 pvh_gen != pvh->pv_gen) { 3524 PMAP_UNLOCK(pmap); 3525 goto restart; 3526 } 3527 } 3528 l2 = pmap_l2(pmap, pv->pv_va); 3529 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3530 count++; 3531 PMAP_UNLOCK(pmap); 3532 } 3533 } 3534 rw_runlock(lock); 3535 rw_runlock(&pvh_global_lock); 3536 return (count); 3537 } 3538 3539 static void 3540 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3541 struct spglist *free, bool superpage) 3542 { 3543 struct md_page *pvh; 3544 vm_page_t mpte, mt; 3545 3546 if (superpage) { 3547 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3548 pvh = pa_to_pvh(m->phys_addr); 3549 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3550 pvh->pv_gen++; 3551 if (TAILQ_EMPTY(&pvh->pv_list)) { 3552 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3553 if (TAILQ_EMPTY(&mt->md.pv_list) && 3554 (mt->aflags & PGA_WRITEABLE) != 0) 3555 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3556 } 3557 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3558 if (mpte != NULL) { 3559 pmap_resident_count_dec(pmap, 1); 3560 KASSERT(mpte->wire_count == Ln_ENTRIES, 3561 ("pmap_remove_pages: pte page wire count error")); 3562 mpte->wire_count = 0; 3563 pmap_add_delayed_free_list(mpte, free, FALSE); 3564 } 3565 } else { 3566 pmap_resident_count_dec(pmap, 1); 3567 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3568 m->md.pv_gen++; 3569 if (TAILQ_EMPTY(&m->md.pv_list) && 3570 (m->aflags & PGA_WRITEABLE) != 0) { 3571 pvh = pa_to_pvh(m->phys_addr); 3572 if (TAILQ_EMPTY(&pvh->pv_list)) 3573 vm_page_aflag_clear(m, PGA_WRITEABLE); 3574 } 3575 } 3576 } 3577 3578 /* 3579 * Destroy all managed, non-wired mappings in the given user-space 3580 * pmap. This pmap cannot be active on any processor besides the 3581 * caller. 3582 * 3583 * This function cannot be applied to the kernel pmap. Moreover, it 3584 * is not intended for general use. It is only to be used during 3585 * process termination. Consequently, it can be implemented in ways 3586 * that make it faster than pmap_remove(). First, it can more quickly 3587 * destroy mappings by iterating over the pmap's collection of PV 3588 * entries, rather than searching the page table. Second, it doesn't 3589 * have to test and clear the page table entries atomically, because 3590 * no processor is currently accessing the user address space. 
In 3591 * particular, a page table entry's dirty bit won't change state once 3592 * this function starts. 3593 */ 3594 void 3595 pmap_remove_pages(pmap_t pmap) 3596 { 3597 struct spglist free; 3598 pd_entry_t ptepde; 3599 pt_entry_t *pte, tpte; 3600 vm_page_t m, mt; 3601 pv_entry_t pv; 3602 struct pv_chunk *pc, *npc; 3603 struct rwlock *lock; 3604 int64_t bit; 3605 uint64_t inuse, bitmask; 3606 int allfree, field, freed, idx; 3607 bool superpage; 3608 3609 lock = NULL; 3610 3611 SLIST_INIT(&free); 3612 rw_rlock(&pvh_global_lock); 3613 PMAP_LOCK(pmap); 3614 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3615 allfree = 1; 3616 freed = 0; 3617 for (field = 0; field < _NPCM; field++) { 3618 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3619 while (inuse != 0) { 3620 bit = ffsl(inuse) - 1; 3621 bitmask = 1UL << bit; 3622 idx = field * 64 + bit; 3623 pv = &pc->pc_pventry[idx]; 3624 inuse &= ~bitmask; 3625 3626 pte = pmap_l1(pmap, pv->pv_va); 3627 ptepde = pmap_load(pte); 3628 pte = pmap_l1_to_l2(pte, pv->pv_va); 3629 tpte = pmap_load(pte); 3630 if ((tpte & PTE_RWX) != 0) { 3631 superpage = true; 3632 } else { 3633 ptepde = tpte; 3634 pte = pmap_l2_to_l3(pte, pv->pv_va); 3635 tpte = pmap_load(pte); 3636 superpage = false; 3637 } 3638 3639 /* 3640 * We cannot remove wired pages from a 3641 * process' mapping at this time. 3642 */ 3643 if (tpte & PTE_SW_WIRED) { 3644 allfree = 0; 3645 continue; 3646 } 3647 3648 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3649 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3650 m < &vm_page_array[vm_page_array_size], 3651 ("pmap_remove_pages: bad pte %#jx", 3652 (uintmax_t)tpte)); 3653 3654 pmap_clear(pte); 3655 3656 /* 3657 * Update the vm_page_t clean/reference bits. 3658 */ 3659 if ((tpte & (PTE_D | PTE_W)) == 3660 (PTE_D | PTE_W)) { 3661 if (superpage) 3662 for (mt = m; 3663 mt < &m[Ln_ENTRIES]; mt++) 3664 vm_page_dirty(mt); 3665 else 3666 vm_page_dirty(m); 3667 } 3668 3669 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3670 3671 /* Mark free */ 3672 pc->pc_map[field] |= bitmask; 3673 3674 pmap_remove_pages_pv(pmap, m, pv, &free, 3675 superpage); 3676 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3677 freed++; 3678 } 3679 } 3680 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3681 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3682 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3683 if (allfree) { 3684 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3685 free_pv_chunk(pc); 3686 } 3687 } 3688 if (lock != NULL) 3689 rw_wunlock(lock); 3690 pmap_invalidate_all(pmap); 3691 rw_runlock(&pvh_global_lock); 3692 PMAP_UNLOCK(pmap); 3693 vm_page_free_pages_toq(&free, false); 3694 } 3695 3696 static bool 3697 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3698 { 3699 struct md_page *pvh; 3700 struct rwlock *lock; 3701 pd_entry_t *l2; 3702 pt_entry_t *l3, mask; 3703 pv_entry_t pv; 3704 pmap_t pmap; 3705 int md_gen, pvh_gen; 3706 bool rv; 3707 3708 mask = 0; 3709 if (modified) 3710 mask |= PTE_D; 3711 if (accessed) 3712 mask |= PTE_A; 3713 3714 rv = FALSE; 3715 rw_rlock(&pvh_global_lock); 3716 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3717 rw_rlock(lock); 3718 restart: 3719 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3720 pmap = PV_PMAP(pv); 3721 if (!PMAP_TRYLOCK(pmap)) { 3722 md_gen = m->md.pv_gen; 3723 rw_runlock(lock); 3724 PMAP_LOCK(pmap); 3725 rw_rlock(lock); 3726 if (md_gen != m->md.pv_gen) { 3727 PMAP_UNLOCK(pmap); 3728 goto restart; 3729 } 3730 } 3731 l3 = pmap_l3(pmap, pv->pv_va); 3732 rv = (pmap_load(l3) & mask) == mask; 3733 
PMAP_UNLOCK(pmap); 3734 if (rv) 3735 goto out; 3736 } 3737 if ((m->flags & PG_FICTITIOUS) == 0) { 3738 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3739 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3740 pmap = PV_PMAP(pv); 3741 if (!PMAP_TRYLOCK(pmap)) { 3742 md_gen = m->md.pv_gen; 3743 pvh_gen = pvh->pv_gen; 3744 rw_runlock(lock); 3745 PMAP_LOCK(pmap); 3746 rw_rlock(lock); 3747 if (md_gen != m->md.pv_gen || 3748 pvh_gen != pvh->pv_gen) { 3749 PMAP_UNLOCK(pmap); 3750 goto restart; 3751 } 3752 } 3753 l2 = pmap_l2(pmap, pv->pv_va); 3754 rv = (pmap_load(l2) & mask) == mask; 3755 PMAP_UNLOCK(pmap); 3756 if (rv) 3757 goto out; 3758 } 3759 } 3760 out: 3761 rw_runlock(lock); 3762 rw_runlock(&pvh_global_lock); 3763 return (rv); 3764 } 3765 3766 /* 3767 * pmap_is_modified: 3768 * 3769 * Return whether or not the specified physical page was modified 3770 * in any physical maps. 3771 */ 3772 boolean_t 3773 pmap_is_modified(vm_page_t m) 3774 { 3775 3776 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3777 ("pmap_is_modified: page %p is not managed", m)); 3778 3779 /* 3780 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 3781 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 3782 * is clear, no PTEs can have PG_M set. 3783 */ 3784 VM_OBJECT_ASSERT_WLOCKED(m->object); 3785 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 3786 return (FALSE); 3787 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3788 } 3789 3790 /* 3791 * pmap_is_prefaultable: 3792 * 3793 * Return whether or not the specified virtual address is eligible 3794 * for prefault. 3795 */ 3796 boolean_t 3797 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3798 { 3799 pt_entry_t *l3; 3800 boolean_t rv; 3801 3802 rv = FALSE; 3803 PMAP_LOCK(pmap); 3804 l3 = pmap_l3(pmap, addr); 3805 if (l3 != NULL && pmap_load(l3) != 0) { 3806 rv = TRUE; 3807 } 3808 PMAP_UNLOCK(pmap); 3809 return (rv); 3810 } 3811 3812 /* 3813 * pmap_is_referenced: 3814 * 3815 * Return whether or not the specified physical page was referenced 3816 * in any physical maps. 3817 */ 3818 boolean_t 3819 pmap_is_referenced(vm_page_t m) 3820 { 3821 3822 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3823 ("pmap_is_referenced: page %p is not managed", m)); 3824 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3825 } 3826 3827 /* 3828 * Clear the write and modified bits in each of the given page's mappings. 3829 */ 3830 void 3831 pmap_remove_write(vm_page_t m) 3832 { 3833 struct md_page *pvh; 3834 struct rwlock *lock; 3835 pmap_t pmap; 3836 pd_entry_t *l2; 3837 pt_entry_t *l3, oldl3, newl3; 3838 pv_entry_t next_pv, pv; 3839 vm_offset_t va; 3840 int md_gen, pvh_gen; 3841 3842 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3843 ("pmap_remove_write: page %p is not managed", m)); 3844 3845 /* 3846 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 3847 * set by another thread while the object is locked. Thus, 3848 * if PGA_WRITEABLE is clear, no page table entries need updating. 3849 */ 3850 VM_OBJECT_ASSERT_WLOCKED(m->object); 3851 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 3852 return; 3853 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3854 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 3855 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3856 rw_rlock(&pvh_global_lock); 3857 retry_pv_loop: 3858 rw_wlock(lock); 3859 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3860 pmap = PV_PMAP(pv); 3861 if (!PMAP_TRYLOCK(pmap)) { 3862 pvh_gen = pvh->pv_gen; 3863 rw_wunlock(lock); 3864 PMAP_LOCK(pmap); 3865 rw_wlock(lock); 3866 if (pvh_gen != pvh->pv_gen) { 3867 PMAP_UNLOCK(pmap); 3868 rw_wunlock(lock); 3869 goto retry_pv_loop; 3870 } 3871 } 3872 va = pv->pv_va; 3873 l2 = pmap_l2(pmap, va); 3874 if ((pmap_load(l2) & PTE_W) != 0) 3875 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3876 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3877 ("inconsistent pv lock %p %p for page %p", 3878 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3879 PMAP_UNLOCK(pmap); 3880 } 3881 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3882 pmap = PV_PMAP(pv); 3883 if (!PMAP_TRYLOCK(pmap)) { 3884 pvh_gen = pvh->pv_gen; 3885 md_gen = m->md.pv_gen; 3886 rw_wunlock(lock); 3887 PMAP_LOCK(pmap); 3888 rw_wlock(lock); 3889 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3890 PMAP_UNLOCK(pmap); 3891 rw_wunlock(lock); 3892 goto retry_pv_loop; 3893 } 3894 } 3895 l3 = pmap_l3(pmap, pv->pv_va); 3896 oldl3 = pmap_load(l3); 3897 retry: 3898 if ((oldl3 & PTE_W) != 0) { 3899 newl3 = oldl3 & ~(PTE_D | PTE_W); 3900 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3901 goto retry; 3902 if ((oldl3 & PTE_D) != 0) 3903 vm_page_dirty(m); 3904 pmap_invalidate_page(pmap, pv->pv_va); 3905 } 3906 PMAP_UNLOCK(pmap); 3907 } 3908 rw_wunlock(lock); 3909 vm_page_aflag_clear(m, PGA_WRITEABLE); 3910 rw_runlock(&pvh_global_lock); 3911 } 3912 3913 /* 3914 * pmap_ts_referenced: 3915 * 3916 * Return a count of reference bits for a page, clearing those bits. 3917 * It is not necessary for every reference bit to be cleared, but it 3918 * is necessary that 0 only be returned when there are truly no 3919 * reference bits set. 3920 * 3921 * As an optimization, update the page's dirty field if a modified bit is 3922 * found while counting reference bits. This opportunistic update can be 3923 * performed at low cost and can eliminate the need for some future calls 3924 * to pmap_is_modified(). However, since this function stops after 3925 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3926 * dirty pages. Those dirty pages will only be detected by a future call 3927 * to pmap_is_modified(). 3928 */ 3929 int 3930 pmap_ts_referenced(vm_page_t m) 3931 { 3932 struct spglist free; 3933 struct md_page *pvh; 3934 struct rwlock *lock; 3935 pv_entry_t pv, pvf; 3936 pmap_t pmap; 3937 pd_entry_t *l2, l2e; 3938 pt_entry_t *l3, l3e; 3939 vm_paddr_t pa; 3940 vm_offset_t va; 3941 int md_gen, pvh_gen, ret; 3942 3943 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3944 ("pmap_ts_referenced: page %p is not managed", m)); 3945 SLIST_INIT(&free); 3946 ret = 0; 3947 pa = VM_PAGE_TO_PHYS(m); 3948 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 

	lock = PHYS_TO_PV_LIST_LOCK(pa);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
retry:
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		l2e = pmap_load(l2);
		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
			/*
			 * Although l2e is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((l2e & PTE_A) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB
			 * pages, it should not be cleared every time it is
			 * tested.  Apply a simple "hash" function on the
			 * physical page number, the virtual superpage number,
			 * and the pmap address to select one 4KB page out of
			 * the 512 on which testing the reference bit will
			 * result in clearing that reference bit.  This
			 * function is designed to avoid the selection of the
			 * same 4KB page for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
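			/*
			 * For example: within a given 2MB mapping, the
			 * superpage virtual page number and the pmap address
			 * are fixed, so as "pa" ranges over the mapping's 512
			 * page frames the low nine bits of the XOR below take
			 * each value exactly once.  Consequently, exactly one
			 * of the 512 4KB pages clears the shared reference
			 * bit when it is tested; the other 511 leave it
			 * untouched.
			 */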
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (l2e & PTE_SW_WIRED) == 0) {
				pmap_clear_bits(l2, PTE_A);
				pmap_invalidate_page(pmap, va);
			}
			ret++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (ret >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);

		KASSERT((pmap_load(l2) & PTE_RX) == 0,
		    ("pmap_ts_referenced: found an invalid l2 table"));

		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		l3e = pmap_load(l3);
		if ((l3e & PTE_D) != 0)
			vm_page_dirty(m);
		if ((l3e & PTE_A) != 0) {
			if ((l3e & PTE_SW_WIRED) == 0) {
				/*
				 * Wired pages cannot be paged out so
				 * doing accessed bit emulation for
				 * them is wasted effort.  We do the
				 * hard work for unwired pages only.
				 */
				pmap_clear_bits(l3, PTE_A);
				pmap_invalidate_page(pmap, pv->pv_va);
			}
			ret++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && ret <
	    PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
	vm_page_free_pages_toq(&free, false);
	return (ret);
}

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	/* Not implemented; the advice is currently ignored. */
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set.
	 * If the object containing the page is locked and the page is not
	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(&pvh_global_lock);
	rw_wlock(lock);
restart:
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		if ((oldl2 & PTE_W) != 0) {
			if (pmap_demote_l2_locked(pmap, l2, va, &lock)) {
				if ((oldl2 & PTE_SW_WIRED) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) -
					    PTE_TO_PHYS(oldl2);
					l3 = pmap_l2_to_l3(l2, va);
					oldl3 = pmap_load(l3);
					if ((oldl3 & PTE_V) != 0) {
						while (!atomic_fcmpset_long(l3,
						    &oldl3, oldl3 & ~(PTE_D |
						    PTE_W)))
							cpu_spinwait();
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
		    ("pmap_clear_modify: found a 2mpage in page %p's pv list",
		    m));
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
			pmap_clear_bits(l3, PTE_D);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	rw_runlock(&pvh_global_lock);
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return ((void *)PHYS_TO_DMAP(pa));
}

void
pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
{
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pv_memattr = ma;

	/*
	 * RISCVTODO: Implement the below (from the amd64 pmap)
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    PHYS_IN_DMAP(VM_PAGE_TO_PHYS(m)))
		panic("RISCVTODO: pmap_page_set_memattr");
}

/*
 * perform the pmap work for mincore
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pt_entry_t *l2, *l3, tpte;
	vm_paddr_t pa;
	int val;
	bool managed;

	PMAP_LOCK(pmap);
retry:
	managed = false;
	val = 0;

	l2 = pmap_l2(pmap, addr);
	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
		if ((tpte & PTE_RWX) != 0) {
			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
			val = MINCORE_INCORE | MINCORE_SUPER;
		} else {
			l3 = pmap_l2_to_l3(l2, addr);
			tpte = pmap_load(l3);
			if ((tpte & PTE_V) == 0)
				goto done;
			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
			val = MINCORE_INCORE;
		}

		if ((tpte & PTE_D) != 0)
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((tpte & PTE_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
	}

done:
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}

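/*
 * Switch the current hart to the given thread's address space: install the
 * new page-table root in the satp register, move this CPU from the old
 * pmap's set of active CPUs to the new pmap's, and discard stale
 * translations with sfence.vma.  Nothing is done if the pmap is already
 * active.
 */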
void
pmap_activate_sw(struct thread *td)
{
	pmap_t oldpmap, pmap;
	u_int cpu;

	oldpmap = PCPU_GET(curpmap);
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (pmap == oldpmap)
		return;
	load_satp(pmap->pm_satp);

	cpu = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpu, &pmap->pm_active);
	CPU_CLR_ATOMIC(cpu, &oldpmap->pm_active);
#else
	CPU_SET(cpu, &pmap->pm_active);
	CPU_CLR(cpu, &oldpmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);

	sfence_vma();
}

void
pmap_activate(struct thread *td)
{

	critical_enter();
	pmap_activate_sw(td);
	critical_exit();
}

void
pmap_activate_boot(pmap_t pmap)
{
	u_int cpu;

	cpu = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpu, &pmap->pm_active);
#else
	CPU_SET(cpu, &pmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
	cpuset_t mask;

	/*
	 * From the RISC-V User-Level ISA V2.2:
	 *
	 * "To make a store to instruction memory visible to all
	 * RISC-V harts, the writing hart has to execute a data FENCE
	 * before requesting that all remote RISC-V harts execute a
	 * FENCE.I."
	 */
	sched_pin();
	mask = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_fence_i(mask.__bits);
	sched_unpin();
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < L2_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L2_OFFSET;
	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
	    (*addr & L2_OFFSET) == superpage_offset)
		return;
	if ((*addr & L2_OFFSET) < superpage_offset)
		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
	else
		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}

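/*
 * For example (illustrative, assuming L2_SIZE is 2MB): for a 4MB request
 * with an object offset of 0x11000 and a requested *addr of
 * 0x7f0000400000, superpage_offset is 0x11000 and (*addr & L2_OFFSET) is
 * 0, so *addr is advanced to 0x7f0000411000.  The virtual address is then
 * congruent with the offset modulo L2_SIZE, which allows 2MB mappings to
 * be created for the object later on.
 */
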
/**
 * Get the kernel virtual address of a set of physical pages.  If there are
 * physical addresses not covered by the DMAP, perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page        The pages for which the caller wishes to obtain
 *                    kernel virtual addresses.
 * \param vaddr       On return contains the kernel virtual memory address
 *                    of the pages passed in the page parameter.
 * \param count       Number of pages passed in.
 * \param can_fault   TRUE if the thread using the mapped pages can take
 *                    page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient when
 *          finished or FALSE otherwise.
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	int error, i;

	/*
	 * Allocate any KVA space that we need; this is done in a separate
	 * loop to avoid calling vmem_alloc while pinned.
	 */
	needs_mapping = FALSE;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = TRUE;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (FALSE);

	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= DMAP_MAX_PHYSADDR) {
			panic(
			   "pmap_map_io_transient: TODO: Map out of DMAP data");
		}
	}

	return (needs_mapping);
}

void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= DMAP_MAX_PHYSADDR) {
			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
		}
	}
}

boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK);
}
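
/*
 * Illustrative sketch of how the transient I/O mapping helpers above are
 * typically used (hypothetical caller, single page "m"):
 *
 *	vm_offset_t va;
 *	boolean_t mapped;
 *
 *	mapped = pmap_map_io_transient(&m, &va, 1, FALSE);
 *	... access the page's contents through "va" ...
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, &va, 1, FALSE);
 */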