/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Portions of this software were developed by Andrew Turner under
 * sponsorship from The FreeBSD Foundation.
 *
 * Portions of this software were developed by SRI International and the
 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
 *
 * Portions of this software were developed by the University of Cambridge
 * Computer Laboratory as part of the CTSRD Project, with support from the
 * UK Higher Education Innovation Fund (HEIF).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time. However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary. This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

/*
 * Boundary values for the page table page index space:
 *
 *	L3 pages: [0, NUL2E)
 *	L2 pages: [NUL2E, NUL2E + NUL1E)
 *	L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
 *
 * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the
 * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
 * in a set of page tables.
 */
#define	NUL0E		Ln_ENTRIES
#define	NUL1E		(Ln_ENTRIES * NUL0E)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
	(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)	do {		\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_mode, 0,
    "translation mode, 0 = SV39, 1 = SV48");

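/*
 * Storage for the kernel pmap; the kernel_pmap pointer used throughout this
 * file refers to it (pmap_bootstrap() initializes kernel_pmap_store.pm_top
 * and then operates on kernel_pmap).
 */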
struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

extern cpuset_t all_harts;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
    vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int	pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
    struct rwlock **lockp);

static void	_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int	pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

static int	pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte) \
    ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
#define	L2PTE_TO_PHYS(l2) \
    ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)

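/*
 * PTE_TO_PHYS() masks off the high-order bits of a PTE, shifts out the
 * low-order attribute bits, and converts the remaining physical page number
 * (PPN) into a byte address.  L2PTE_TO_PHYS() does the same for a 2MB L2
 * leaf, keeping only the upper PPN fields so the result is L2_SIZE-aligned.
 *
 * In the page-table walkers below, a valid entry with PTE_R or PTE_X set
 * (PTE_RX) is a leaf mapping rather than a pointer to a lower-level table,
 * which is why pmap_l1(), pmap_l2(), and pmap_l3() stop the walk when they
 * encounter one.
 */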
static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{
	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));
	return (&pmap->pm_top[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l1;

	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
	phys = PTE_TO_PHYS(pmap_load(l0));
	l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));
	if (pmap_mode == PMAP_MODE_SV39) {
		return (&pmap->pm_top[pmap_l1_index(va)]);
	} else {
		l0 = pmap_l0(pmap, va);
		if ((pmap_load(l0) & PTE_V) == 0)
			return (NULL);
		if ((pmap_load(l0) & PTE_RX) != 0)
			return (NULL);
		return (pmap_l0_to_l1(l0, va));
	}
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL)
		return (NULL);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/*
	 * Distribute new kernel L1 entry to all the user pmaps. This is only
	 * necessary with three-level paging configured: with four-level paging
	 * the kernel's half of the top-level page table page is static and can
	 * simply be copied at pmap initialization time.
	 */
	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_top[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
    u_int *l2_slot)
{
	pt_entry_t *l2;
	pd_entry_t *l1 __diagused;

	l1 = (pd_entry_t *)l1pt;
	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;

	/* Check locore has used a table L1 map */
	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
	    ("Invalid bootstrap L1 table"));

	/* Find the address of the L2 table */
	l2 = (pt_entry_t *)init_pt_va;
	*l2_slot = pmap_l2_index(va);

	return (l2);
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	u_int l1_slot, l2_slot;
	pt_entry_t *l2;
	vm_paddr_t ret;

	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);

	/* Check locore has used L2 superpages */
	KASSERT((l2[l2_slot] & PTE_RX) != 0,
	    ("Invalid bootstrap L2 table"));

	/* L2 is superpages */
	ret = L2PTE_TO_PHYS(l2[l2_slot]);
	ret += (va & L2_OFFSET);

	return (ret);
}

static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
	vm_offset_t va;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;
	pt_entry_t entry;
	pn_t pn;

	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
	va = DMAP_MIN_ADDRESS;
	l1 = (pd_entry_t *)kern_l1;
	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);

	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		/* superpages */
		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l1[l1_slot], entry);
	}

	/* Set the upper limit of the DMAP region */
	dmap_phys_max = pa;
	dmap_max_addr = va;

	sfence_vma();
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	pt_entry_t entry;
	pd_entry_t *l2;
	vm_paddr_t pa;
	u_int l2_slot;
	pn_t pn;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pn = (pa / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l2[l2_slot], entry);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return (l3pt);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
{
	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
	uint64_t satp;
	vm_offset_t dpcpu, freemempos, l0pv, msgbufpv;
	vm_paddr_t l0pa, l1pa, max_pa, min_pa, pa;
	pd_entry_t *l0p;
	pt_entry_t *l2p;
	u_int l1_slot, l2_slot;
	u_int physmap_idx;
	int i, mode;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_top = (pd_entry_t *)l1pt;
	PMAP_LOCK_INIT(kernel_pmap);

	rw_init(&pvh_global_lock, "pmap pv global");

	/*
	 * Set the current CPU as active in the kernel pmap. Secondary cores
	 * will add themselves later in init_secondary(). The SBI firmware
	 * may rely on this mask being precise, so CPU_FILL() is not used.
	 */
	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);

	/* Assume the address we were loaded to is a valid physical address. */
	min_pa = max_pa = kernstart;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address. physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
		if (physmap[i + 1] > max_pa)
			max_pa = physmap[i + 1];
	}
	printf("physmap_idx %u\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);

	/*
	 * Read the page table to find out what is already mapped.
	 * This assumes we have mapped a block of memory from KERNBASE
	 * using a single L1 entry.
	 */
	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);

	/* Sanity check the index, KERNBASE should be the first VA */
	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));

	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);

	/* Create the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);

	/*
	 * Invalidate the mapping we created for the DTB. At this point a copy
	 * has been created, and we no longer need it. We want to avoid the
	 * possibility of an aliased mapping in the future.
	 */
	l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS);
	if ((pmap_load(l2p) & PTE_V) != 0)
		pmap_clear(l2p);

	sfence_vma();

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	mode = 0;
	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
	if (mode == PMAP_MODE_SV48) {
		/*
		 * Enable SV48 mode: allocate an L0 page and set SV48 mode in
		 * SATP. If the implementation does not provide SV48 mode,
		 * the mode read back from the (WARL) SATP register will be
		 * unchanged, and we continue in SV39 mode.
		 */
		alloc_pages(l0pv, 1);
		l0p = (void *)l0pv;
		l1pa = pmap_early_vtophys(l1pt, l1pt);
		l0p[pmap_l0_index(KERNBASE)] = PTE_V | PTE_A | PTE_D |
		    ((l1pa >> PAGE_SHIFT) << PTE_PPN0_S);

		l0pa = pmap_early_vtophys(l1pt, l0pv);
		csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48);
		satp = csr_read(satp);
		if ((satp & SATP_MODE_M) == SATP_MODE_SV48) {
			pmap_mode = PMAP_MODE_SV48;
			kernel_pmap_store.pm_top = l0p;
		} else {
			/* Mode didn't change, give the page back. */
			freemempos -= PAGE_SIZE;
		}
	}

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	virtual_avail = roundup2(freemempos, L2_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(l1pt, freemempos);

	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the pv chunk and pmap list mutexes.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	if (superpages_enabled)
		pagesizes[1] = L2_SIZE;
}

#ifdef SMP
/*
 * For SMP, these functions have to use IPIs for coherence.
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence. BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p;
	vm_paddr_t pa;

	pa = 0;

	/*
	 * Start with an L2 lookup, L1 superpages are currently not implemented.
	 */
	PMAP_LOCK(pmap);
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
		if ((l2 & PTE_RWX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				pa = PTE_TO_PHYS(pmap_load(l3p));
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* L2 is a superpage mapping. */
			pa = L2PTE_TO_PHYS(l2);
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			m = PHYS_TO_VM_PAGE(phys);
			if (!vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2, l2e;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		l2e = pmap_load(l2);
		/*
		 * Beware of concurrent promotion and demotion! We must
		 * use l2e rather than loading from l2 multiple times to
		 * ensure we see a consistent state, including the
		 * implicit load in pmap_l2_to_l3. It is, however, safe
		 * to use an old l2e because the L3 page is preserved by
		 * promotion.
		 */
		if ((l2e & PTE_RX) != 0) {
			/* superpages */
			pa = L2PTE_TO_PHYS(l2e);
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(&l2e, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *l3;

	l3 = pmap_l3(kernel_pmap, va);
	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));

	pmap_clear(l3);
	sfence_vma();
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
		pmap_clear(l3);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}

	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping. Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged. Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{

	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over. The page *must* be wired.
 * Note: SMP coherent. Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *l3, pa;
	vm_offset_t va;
	vm_page_t m;
	pt_entry_t entry;
	pn_t pn;
	int i;

	va = sva;
	for (i = 0; i < count; i++) {
		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m);
		pn = (pa / PAGE_SIZE);
		l3 = pmap_l3(kernel_pmap, va);

		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent. Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));

	for (va = sva; count-- > 0; va += PAGE_SIZE) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
		pmap_clear(l3);
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed. Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages. Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses. The pmap's collection is
 * ordered by this virtual address range.
 *
 * If "promoted" is false, then the page table page "ml3" must be zero filled.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0;
	return (vm_radix_insert(&pmap->pm_root, ml3));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page. If the reference count
 * drops to zero, then the page table page is unmapped. Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	KASSERT(m->ref_count > 0,
	    ("%s: page %p ref count underflow", __func__, m));

	--m->ref_count;
	if (m->ref_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else {
		return (FALSE);
	}
}

static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	vm_paddr_t phys;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (m->pindex >= NUL2E + NUL1E) {
		pd_entry_t *l0;
		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		pd_entry_t *l1;
		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
	} else {
		pd_entry_t *l2;
		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL2E) {
		pd_entry_t *l1;
		vm_page_t pdpg;

		l1 = pmap_l1(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l1));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	} else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
		pd_entry_t *l0;
		vm_page_t pdpg;

		MPASS(pmap_mode != PMAP_MODE_SV39);
		l0 = pmap_l0(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l0));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	}
	pmap_invalidate_page(pmap, va);

	vm_wire_sub(1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the reference count.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

static uint64_t
pmap_satp_mode(void)
{
	return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
}

void
pmap_pinit0(pmap_t pmap)
{
	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_top = kernel_pmap->pm_top;
	pmap->pm_satp = pmap_satp_mode() |
	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
	CPU_ZERO(&pmap->pm_active);
	pmap_activate_boot(pmap);
}

int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t topphys;
	vm_page_t mtop;
	size_t i;

	mtop = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
	    VM_ALLOC_WAITOK);

	topphys = VM_PAGE_TO_PHYS(mtop);
	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);

	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	CPU_ZERO(&pmap->pm_active);

	if (pmap_mode == PMAP_MODE_SV39) {
		/*
		 * Copy L1 entries from the kernel pmap. This must be done with
		 * the allpmaps lock held to avoid races with
		 * pmap_distribute_l1().
		 */
		mtx_lock(&allpmaps_lock);
		LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
		for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
		    i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
			pmap->pm_top[i] = kernel_pmap->pm_top[i];
		for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
		    i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
			pmap->pm_top[i] = kernel_pmap->pm_top[i];
		mtx_unlock(&allpmaps_lock);
	} else {
		i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
		pmap->pm_top[i] = kernel_pmap->pm_top[i];
	}

	vm_radix_init(&pmap->pm_root);

	return (1);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL. It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards. This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, pdpg;
	pt_entry_t entry;
	vm_paddr_t phys;
	pn_t pn;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	if (m == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			vm_wait(NULL);
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry. While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	m->pindex = ptepindex;

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */
	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
	if (ptepindex >= NUL2E + NUL1E) {
		pd_entry_t *l0;
		vm_pindex_t l0index;

		KASSERT(pmap_mode != PMAP_MODE_SV39,
		    ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
		KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
		    ("%s: pindex %#lx out of range", __func__, ptepindex));

		l0index = ptepindex - (NUL2E + NUL1E);
		l0 = &pmap->pm_top[l0index];
		KASSERT((pmap_load(l0) & PTE_V) == 0,
		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));

		entry = PTE_V | (pn << PTE_PPN0_S);
		pmap_store(l0, entry);
	} else if (ptepindex >= NUL2E) {
		pd_entry_t *l0, *l1;
		vm_pindex_t l0index, l1index;

		l1index = ptepindex - NUL2E;
		if (pmap_mode == PMAP_MODE_SV39) {
			l1 = &pmap->pm_top[l1index];
		} else {
			l0index = l1index >> Ln_ENTRIES_SHIFT;
			l0 = &pmap->pm_top[l0index];
			if (pmap_load(l0) == 0) {
				/* Recurse to allocate the L1 page. */
				if (_pmap_alloc_l3(pmap,
				    NUL2E + NUL1E + l0index, lockp) == NULL)
					goto fail;
				phys = PTE_TO_PHYS(pmap_load(l0));
			} else {
				phys = PTE_TO_PHYS(pmap_load(l0));
				pdpg = PHYS_TO_VM_PAGE(phys);
				pdpg->ref_count++;
			}
			l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
			l1 = &l1[ptepindex & Ln_ADDR_MASK];
		}
		KASSERT((pmap_load(l1) & PTE_V) == 0,
		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));

		entry = PTE_V | (pn << PTE_PPN0_S);
		pmap_store(l1, entry);
		pmap_distribute_l1(pmap, l1index, entry);
	} else {
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1, *l2;

		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
		if (pmap_mode == PMAP_MODE_SV39) {
			l1 = &pmap->pm_top[l1index];
			if (pmap_load(l1) == 0) {
				/* recurse for allocating page dir */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL)
					goto fail;
			} else {
				phys = PTE_TO_PHYS(pmap_load(l1));
				pdpg = PHYS_TO_VM_PAGE(phys);
				pdpg->ref_count++;
			}
		} else {
			l0index = l1index >> Ln_ENTRIES_SHIFT;
			l0 = &pmap->pm_top[l0index];
			if (pmap_load(l0) == 0) {
				/* Recurse to allocate the L1 entry. */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL)
					goto fail;
				phys = PTE_TO_PHYS(pmap_load(l0));
				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
				l1 = &l1[l1index & Ln_ADDR_MASK];
			} else {
				phys = PTE_TO_PHYS(pmap_load(l0));
				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
				l1 = &l1[l1index & Ln_ADDR_MASK];
				if (pmap_load(l1) == 0) {
					/* Recurse to allocate the L2 page. */
					if (_pmap_alloc_l3(pmap,
					    NUL2E + l1index, lockp) == NULL)
						goto fail;
				} else {
					phys = PTE_TO_PHYS(pmap_load(l1));
					pdpg = PHYS_TO_VM_PAGE(phys);
					pdpg->ref_count++;
				}
			}
		}

		phys = PTE_TO_PHYS(pmap_load(l1));
		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l2) & PTE_V) == 0,
		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));

		entry = PTE_V | (pn << PTE_PPN0_S);
		pmap_store(l2, entry);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);

fail:
	vm_page_unwire_noq(m);
	vm_page_free_zero(m);
	return (NULL);
}

static vm_page_t
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	pd_entry_t *l1;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
		    pmap_load(l1), va));
		/* Add a reference to the L2 page. */
		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
		l2pg->ref_count++;
	} else {
		/* Allocate a L2 page. */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL && lockp != NULL)
			goto retry;
	}
	return (l2pg);
}

static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *l2;
	vm_paddr_t phys;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	l2 = pmap_l2(pmap, va);

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (l2 != NULL && pmap_load(l2) != 0) {
		phys = PTE_TO_PHYS(pmap_load(l2));
		m = PHYS_TO_VM_PAGE(phys);
		m->ref_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	if (pmap_mode == PMAP_MODE_SV39) {
		mtx_lock(&allpmaps_lock);
		LIST_REMOVE(pmap, pm_list);
		mtx_unlock(&allpmaps_lock);
	}

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l1, *l2;
	pt_entry_t entry;
	pn_t pn;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
			paddr = VM_PAGE_TO_PHYS(nkpg);

			pn = (paddr / PAGE_SIZE);
			entry = (PTE_V);
			entry |= (pn << PTE_PPN0_S);
			pmap_store(l1, entry);
			pmap_distribute_l1(kernel_pmap,
			    pmap_l1_index(kernel_vm_end), entry);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & PTE_V) != 0 &&
		    (pmap_load(l2) & PTE_RWX) == 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
		paddr = VM_PAGE_TO_PHYS(nkpg);

		pn = (paddr / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);

		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}

/***************************************************
 * page management routines.
 ***************************************************/

static const uint64_t pc_freemask[_NPCM] = {
	[0 ... _NPCM - 2] = PC_FREEN,
	[_NPCM - 1] = PC_FREEL
};

#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{

	panic("RISCVTODO: reclaim_pv_chunk");
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (!pc_is_free(pc)) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

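/*
 * Each pv_chunk carries a small bitmap, pc_map[], with one bit per pv_entry
 * in the chunk; a set bit means the corresponding entry is free.
 * free_pv_entry() above sets the bit for the entry being released, while
 * get_pv_entry() below searches for a set bit with ffsl() and clears it.
 */
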
/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc_is_full(pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREEN & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREEN;
	pc->pc_map[2] = PC_FREEL;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
1907 */ 1908 TAILQ_INIT(&new_tail); 1909 retry: 1910 avail = 0; 1911 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1912 bit_count((bitstr_t *)pc->pc_map, 0, 1913 sizeof(pc->pc_map) * NBBY, &free); 1914 if (free == 0) 1915 break; 1916 avail += free; 1917 if (avail >= needed) 1918 break; 1919 } 1920 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1921 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1922 if (m == NULL) { 1923 m = reclaim_pv_chunk(pmap, lockp); 1924 if (m == NULL) 1925 goto retry; 1926 reclaimed = true; 1927 } 1928 /* XXX PV STATS */ 1929 #if 0 1930 dump_add_page(m->phys_addr); 1931 #endif 1932 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1933 pc->pc_pmap = pmap; 1934 pc->pc_map[0] = PC_FREEN; 1935 pc->pc_map[1] = PC_FREEN; 1936 pc->pc_map[2] = PC_FREEL; 1937 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1938 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1939 1940 /* 1941 * The reclaim might have freed a chunk from the current pmap. 1942 * If that chunk contained available entries, we need to 1943 * re-count the number of available entries. 1944 */ 1945 if (reclaimed) 1946 goto retry; 1947 } 1948 if (!TAILQ_EMPTY(&new_tail)) { 1949 mtx_lock(&pv_chunks_mutex); 1950 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1951 mtx_unlock(&pv_chunks_mutex); 1952 } 1953 } 1954 1955 /* 1956 * First find and then remove the pv entry for the specified pmap and virtual 1957 * address from the specified pv list. Returns the pv entry if found and NULL 1958 * otherwise. This operation can be performed on pv lists for either 4KB or 1959 * 2MB page mappings. 1960 */ 1961 static __inline pv_entry_t 1962 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1963 { 1964 pv_entry_t pv; 1965 1966 rw_assert(&pvh_global_lock, RA_LOCKED); 1967 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1968 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1969 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1970 pvh->pv_gen++; 1971 break; 1972 } 1973 } 1974 return (pv); 1975 } 1976 1977 /* 1978 * First find and then destroy the pv entry for the specified pmap and virtual 1979 * address. This operation can be performed on pv lists for either 4KB or 2MB 1980 * page mappings. 1981 */ 1982 static void 1983 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1984 { 1985 pv_entry_t pv; 1986 1987 pv = pmap_pvh_remove(pvh, pmap, va); 1988 1989 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 1990 free_pv_entry(pmap, pv); 1991 } 1992 1993 /* 1994 * Conditionally create the PV entry for a 4KB page mapping if the required 1995 * memory can be allocated without resorting to reclamation. 1996 */ 1997 static boolean_t 1998 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1999 struct rwlock **lockp) 2000 { 2001 pv_entry_t pv; 2002 2003 rw_assert(&pvh_global_lock, RA_LOCKED); 2004 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2005 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2006 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2007 pv->pv_va = va; 2008 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2009 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2010 m->md.pv_gen++; 2011 return (TRUE); 2012 } else 2013 return (FALSE); 2014 } 2015 2016 /* 2017 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2018 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2019 * entries for each of the 4KB page mappings. 
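 * The caller is expected to have already reserved the PV entries for the 4KB
 * mappings (see reserve_pv_entries() and its use in pmap_demote_l2_locked()),
 * so the loop below can assert that a chunk with a spare entry is always at
 * the head of pm_pvchunk.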
2020  */
2021 static void __unused
2022 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2023     struct rwlock **lockp)
2024 {
2025         struct md_page *pvh;
2026         struct pv_chunk *pc;
2027         pv_entry_t pv;
2028         vm_page_t m;
2029         vm_offset_t va_last;
2030         int bit, field;
2031
2032         rw_assert(&pvh_global_lock, RA_LOCKED);
2033         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2034         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2035
2036         /*
2037          * Transfer the 2mpage's pv entry for this mapping to the first
2038          * page's pv list. Once this transfer begins, the pv list lock
2039          * must not be released until the last pv entry is reinstantiated.
2040          */
2041         pvh = pa_to_pvh(pa);
2042         va &= ~L2_OFFSET;
2043         pv = pmap_pvh_remove(pvh, pmap, va);
2044         KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2045         m = PHYS_TO_VM_PAGE(pa);
2046         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2047         m->md.pv_gen++;
2048         /* Instantiate the remaining 511 pv entries. */
2049         va_last = va + L2_SIZE - PAGE_SIZE;
2050         for (;;) {
2051                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2052                 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
2053                 for (field = 0; field < _NPCM; field++) {
2054                         while (pc->pc_map[field] != 0) {
2055                                 bit = ffsl(pc->pc_map[field]) - 1;
2056                                 pc->pc_map[field] &= ~(1ul << bit);
2057                                 pv = &pc->pc_pventry[field * 64 + bit];
2058                                 va += PAGE_SIZE;
2059                                 pv->pv_va = va;
2060                                 m++;
2061                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2062                                     ("pmap_pv_demote_l2: page %p is not managed", m));
2063                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2064                                 m->md.pv_gen++;
2065                                 if (va == va_last)
2066                                         goto out;
2067                         }
2068                 }
2069                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2070                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2071         }
2072 out:
2073         if (pc_is_full(pc)) {
2074                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2075                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2076         }
2077         /* XXX PV stats */
2078 }
2079
2080 #if VM_NRESERVLEVEL > 0
2081 static void
2082 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2083     struct rwlock **lockp)
2084 {
2085         struct md_page *pvh;
2086         pv_entry_t pv;
2087         vm_page_t m;
2088         vm_offset_t va_last;
2089
2090         rw_assert(&pvh_global_lock, RA_LOCKED);
2091         KASSERT((pa & L2_OFFSET) == 0,
2092             ("pmap_pv_promote_l2: misaligned pa %#lx", pa));
2093
2094         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2095
2096         m = PHYS_TO_VM_PAGE(pa);
2097         va = va & ~L2_OFFSET;
2098         pv = pmap_pvh_remove(&m->md, pmap, va);
2099         KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
2100         pvh = pa_to_pvh(pa);
2101         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2102         pvh->pv_gen++;
2103
2104         va_last = va + L2_SIZE - PAGE_SIZE;
2105         do {
2106                 m++;
2107                 va += PAGE_SIZE;
2108                 pmap_pvh_free(&m->md, pmap, va);
2109         } while (va < va_last);
2110 }
2111 #endif /* VM_NRESERVLEVEL > 0 */
2112
2113 /*
2114  * Create the PV entry for a 2MB page mapping. Always returns true unless the
2115  * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
2116  * false if the PV entry cannot be allocated without resorting to reclamation.
2117  */
2118 static bool
2119 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2120     struct rwlock **lockp)
2121 {
2122         struct md_page *pvh;
2123         pv_entry_t pv;
2124         vm_paddr_t pa;
2125
2126         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2127         /* Pass NULL instead of the lock pointer to disable reclamation. */
2128         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2129 NULL : lockp)) == NULL) 2130 return (false); 2131 pv->pv_va = va; 2132 pa = PTE_TO_PHYS(l2e); 2133 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2134 pvh = pa_to_pvh(pa); 2135 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2136 pvh->pv_gen++; 2137 return (true); 2138 } 2139 2140 static void 2141 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2142 { 2143 pt_entry_t newl2, oldl2 __diagused; 2144 vm_page_t ml3; 2145 vm_paddr_t ml3pa; 2146 2147 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2148 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2149 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2150 2151 ml3 = pmap_remove_pt_page(pmap, va); 2152 if (ml3 == NULL) 2153 panic("pmap_remove_kernel_l2: Missing pt page"); 2154 2155 ml3pa = VM_PAGE_TO_PHYS(ml3); 2156 newl2 = ml3pa | PTE_V; 2157 2158 /* 2159 * If this page table page was unmapped by a promotion, then it 2160 * contains valid mappings. Zero it to invalidate those mappings. 2161 */ 2162 if (ml3->valid != 0) 2163 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2164 2165 /* 2166 * Demote the mapping. 2167 */ 2168 oldl2 = pmap_load_store(l2, newl2); 2169 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2170 __func__, l2, oldl2)); 2171 } 2172 2173 /* 2174 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2175 */ 2176 static int 2177 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2178 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2179 { 2180 struct md_page *pvh; 2181 pt_entry_t oldl2; 2182 vm_offset_t eva, va; 2183 vm_page_t m, ml3; 2184 2185 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2186 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2187 oldl2 = pmap_load_clear(l2); 2188 KASSERT((oldl2 & PTE_RWX) != 0, 2189 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2190 2191 /* 2192 * The sfence.vma documentation states that it is sufficient to specify 2193 * a single address within a superpage mapping. However, since we do 2194 * not perform any invalidation upon promotion, TLBs may still be 2195 * caching 4KB mappings within the superpage, so we must invalidate the 2196 * entire range. 
2197 */ 2198 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2199 if ((oldl2 & PTE_SW_WIRED) != 0) 2200 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2201 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2202 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2203 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2204 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2205 pmap_pvh_free(pvh, pmap, sva); 2206 eva = sva + L2_SIZE; 2207 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2208 va < eva; va += PAGE_SIZE, m++) { 2209 if ((oldl2 & PTE_D) != 0) 2210 vm_page_dirty(m); 2211 if ((oldl2 & PTE_A) != 0) 2212 vm_page_aflag_set(m, PGA_REFERENCED); 2213 if (TAILQ_EMPTY(&m->md.pv_list) && 2214 TAILQ_EMPTY(&pvh->pv_list)) 2215 vm_page_aflag_clear(m, PGA_WRITEABLE); 2216 } 2217 } 2218 if (pmap == kernel_pmap) { 2219 pmap_remove_kernel_l2(pmap, l2, sva); 2220 } else { 2221 ml3 = pmap_remove_pt_page(pmap, sva); 2222 if (ml3 != NULL) { 2223 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2224 ("pmap_remove_l2: l3 page not promoted")); 2225 pmap_resident_count_dec(pmap, 1); 2226 KASSERT(ml3->ref_count == Ln_ENTRIES, 2227 ("pmap_remove_l2: l3 page ref count error")); 2228 ml3->ref_count = 1; 2229 vm_page_unwire_noq(ml3); 2230 pmap_add_delayed_free_list(ml3, free, FALSE); 2231 } 2232 } 2233 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2234 } 2235 2236 /* 2237 * pmap_remove_l3: do the things to unmap a page in a process 2238 */ 2239 static int 2240 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2241 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2242 { 2243 struct md_page *pvh; 2244 pt_entry_t old_l3; 2245 vm_paddr_t phys; 2246 vm_page_t m; 2247 2248 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2249 old_l3 = pmap_load_clear(l3); 2250 pmap_invalidate_page(pmap, va); 2251 if (old_l3 & PTE_SW_WIRED) 2252 pmap->pm_stats.wired_count -= 1; 2253 pmap_resident_count_dec(pmap, 1); 2254 if (old_l3 & PTE_SW_MANAGED) { 2255 phys = PTE_TO_PHYS(old_l3); 2256 m = PHYS_TO_VM_PAGE(phys); 2257 if ((old_l3 & PTE_D) != 0) 2258 vm_page_dirty(m); 2259 if (old_l3 & PTE_A) 2260 vm_page_aflag_set(m, PGA_REFERENCED); 2261 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2262 pmap_pvh_free(&m->md, pmap, va); 2263 if (TAILQ_EMPTY(&m->md.pv_list) && 2264 (m->flags & PG_FICTITIOUS) == 0) { 2265 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2266 if (TAILQ_EMPTY(&pvh->pv_list)) 2267 vm_page_aflag_clear(m, PGA_WRITEABLE); 2268 } 2269 } 2270 2271 return (pmap_unuse_pt(pmap, va, l2e, free)); 2272 } 2273 2274 /* 2275 * Remove the given range of addresses from the specified map. 2276 * 2277 * It is assumed that the start and end are properly 2278 * rounded to the page size. 2279 */ 2280 void 2281 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2282 { 2283 struct spglist free; 2284 struct rwlock *lock; 2285 vm_offset_t va, va_next; 2286 pd_entry_t *l0, *l1, *l2, l2e; 2287 pt_entry_t *l3; 2288 2289 /* 2290 * Perform an unsynchronized read. This is, however, safe. 
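 * A resident count of zero means that the pmap contains no valid mappings,
 * so the removal loop below can be skipped entirely.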
2291 */ 2292 if (pmap->pm_stats.resident_count == 0) 2293 return; 2294 2295 SLIST_INIT(&free); 2296 2297 rw_rlock(&pvh_global_lock); 2298 PMAP_LOCK(pmap); 2299 2300 lock = NULL; 2301 for (; sva < eva; sva = va_next) { 2302 if (pmap->pm_stats.resident_count == 0) 2303 break; 2304 2305 if (pmap_mode == PMAP_MODE_SV48) { 2306 l0 = pmap_l0(pmap, sva); 2307 if (pmap_load(l0) == 0) { 2308 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2309 if (va_next < sva) 2310 va_next = eva; 2311 continue; 2312 } 2313 l1 = pmap_l0_to_l1(l0, sva); 2314 } else { 2315 l1 = pmap_l1(pmap, sva); 2316 } 2317 2318 if (pmap_load(l1) == 0) { 2319 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2320 if (va_next < sva) 2321 va_next = eva; 2322 continue; 2323 } 2324 2325 /* 2326 * Calculate index for next page table. 2327 */ 2328 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2329 if (va_next < sva) 2330 va_next = eva; 2331 2332 l2 = pmap_l1_to_l2(l1, sva); 2333 if (l2 == NULL) 2334 continue; 2335 if ((l2e = pmap_load(l2)) == 0) 2336 continue; 2337 if ((l2e & PTE_RWX) != 0) { 2338 if (sva + L2_SIZE == va_next && eva >= va_next) { 2339 (void)pmap_remove_l2(pmap, l2, sva, 2340 pmap_load(l1), &free, &lock); 2341 continue; 2342 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2343 &lock)) { 2344 /* 2345 * The large page mapping was destroyed. 2346 */ 2347 continue; 2348 } 2349 l2e = pmap_load(l2); 2350 } 2351 2352 /* 2353 * Limit our scan to either the end of the va represented 2354 * by the current page table page, or to the end of the 2355 * range being removed. 2356 */ 2357 if (va_next > eva) 2358 va_next = eva; 2359 2360 va = va_next; 2361 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2362 sva += L3_SIZE) { 2363 if (pmap_load(l3) == 0) { 2364 if (va != va_next) { 2365 pmap_invalidate_range(pmap, va, sva); 2366 va = va_next; 2367 } 2368 continue; 2369 } 2370 if (va == va_next) 2371 va = sva; 2372 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2373 sva += L3_SIZE; 2374 break; 2375 } 2376 } 2377 if (va != va_next) 2378 pmap_invalidate_range(pmap, va, sva); 2379 } 2380 if (lock != NULL) 2381 rw_wunlock(lock); 2382 rw_runlock(&pvh_global_lock); 2383 PMAP_UNLOCK(pmap); 2384 vm_page_free_pages_toq(&free, false); 2385 } 2386 2387 /* 2388 * Routine: pmap_remove_all 2389 * Function: 2390 * Removes this physical page from 2391 * all physical maps in which it resides. 2392 * Reflects back modify bits to the pager. 2393 * 2394 * Notes: 2395 * Original versions of this routine were very 2396 * inefficient because they iteratively called 2397 * pmap_remove (slow...) 2398 */ 2399 2400 void 2401 pmap_remove_all(vm_page_t m) 2402 { 2403 struct spglist free; 2404 struct md_page *pvh; 2405 pmap_t pmap; 2406 pt_entry_t *l3, l3e; 2407 pd_entry_t *l2, l2e __diagused; 2408 pv_entry_t pv; 2409 vm_offset_t va; 2410 2411 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2412 ("pmap_remove_all: page %p is not managed", m)); 2413 SLIST_INIT(&free); 2414 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 2415 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2416 2417 rw_wlock(&pvh_global_lock); 2418 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2419 pmap = PV_PMAP(pv); 2420 PMAP_LOCK(pmap); 2421 va = pv->pv_va; 2422 l2 = pmap_l2(pmap, va); 2423 (void)pmap_demote_l2(pmap, l2, va); 2424 PMAP_UNLOCK(pmap); 2425 } 2426 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2427 pmap = PV_PMAP(pv); 2428 PMAP_LOCK(pmap); 2429 pmap_resident_count_dec(pmap, 1); 2430 l2 = pmap_l2(pmap, pv->pv_va); 2431 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2432 l2e = pmap_load(l2); 2433 2434 KASSERT((l2e & PTE_RX) == 0, 2435 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2436 2437 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2438 l3e = pmap_load_clear(l3); 2439 pmap_invalidate_page(pmap, pv->pv_va); 2440 if (l3e & PTE_SW_WIRED) 2441 pmap->pm_stats.wired_count--; 2442 if ((l3e & PTE_A) != 0) 2443 vm_page_aflag_set(m, PGA_REFERENCED); 2444 2445 /* 2446 * Update the vm_page_t clean and reference bits. 2447 */ 2448 if ((l3e & PTE_D) != 0) 2449 vm_page_dirty(m); 2450 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2451 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2452 m->md.pv_gen++; 2453 free_pv_entry(pmap, pv); 2454 PMAP_UNLOCK(pmap); 2455 } 2456 vm_page_aflag_clear(m, PGA_WRITEABLE); 2457 rw_wunlock(&pvh_global_lock); 2458 vm_page_free_pages_toq(&free, false); 2459 } 2460 2461 /* 2462 * Set the physical protection on the 2463 * specified range of this map as requested. 2464 */ 2465 void 2466 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2467 { 2468 pd_entry_t *l0, *l1, *l2, l2e; 2469 pt_entry_t *l3, l3e, mask; 2470 vm_page_t m, mt; 2471 vm_paddr_t pa; 2472 vm_offset_t va_next; 2473 bool anychanged, pv_lists_locked; 2474 2475 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2476 pmap_remove(pmap, sva, eva); 2477 return; 2478 } 2479 2480 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2481 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2482 return; 2483 2484 anychanged = false; 2485 pv_lists_locked = false; 2486 mask = 0; 2487 if ((prot & VM_PROT_WRITE) == 0) 2488 mask |= PTE_W | PTE_D; 2489 if ((prot & VM_PROT_EXECUTE) == 0) 2490 mask |= PTE_X; 2491 resume: 2492 PMAP_LOCK(pmap); 2493 for (; sva < eva; sva = va_next) { 2494 if (pmap_mode == PMAP_MODE_SV48) { 2495 l0 = pmap_l0(pmap, sva); 2496 if (pmap_load(l0) == 0) { 2497 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2498 if (va_next < sva) 2499 va_next = eva; 2500 continue; 2501 } 2502 l1 = pmap_l0_to_l1(l0, sva); 2503 } else { 2504 l1 = pmap_l1(pmap, sva); 2505 } 2506 2507 if (pmap_load(l1) == 0) { 2508 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2509 if (va_next < sva) 2510 va_next = eva; 2511 continue; 2512 } 2513 2514 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2515 if (va_next < sva) 2516 va_next = eva; 2517 2518 l2 = pmap_l1_to_l2(l1, sva); 2519 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2520 continue; 2521 if ((l2e & PTE_RWX) != 0) { 2522 if (sva + L2_SIZE == va_next && eva >= va_next) { 2523 retryl2: 2524 if ((prot & VM_PROT_WRITE) == 0 && 2525 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2526 (PTE_SW_MANAGED | PTE_D)) { 2527 pa = PTE_TO_PHYS(l2e); 2528 m = PHYS_TO_VM_PAGE(pa); 2529 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2530 vm_page_dirty(mt); 2531 } 2532 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2533 goto retryl2; 2534 anychanged = true; 2535 continue; 2536 } else { 2537 if (!pv_lists_locked) { 2538 pv_lists_locked = true; 2539 if (!rw_try_rlock(&pvh_global_lock)) { 2540 if (anychanged) 2541 pmap_invalidate_all( 2542 pmap); 
2543 PMAP_UNLOCK(pmap); 2544 rw_rlock(&pvh_global_lock); 2545 goto resume; 2546 } 2547 } 2548 if (!pmap_demote_l2(pmap, l2, sva)) { 2549 /* 2550 * The large page mapping was destroyed. 2551 */ 2552 continue; 2553 } 2554 } 2555 } 2556 2557 if (va_next > eva) 2558 va_next = eva; 2559 2560 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2561 sva += L3_SIZE) { 2562 l3e = pmap_load(l3); 2563 retryl3: 2564 if ((l3e & PTE_V) == 0) 2565 continue; 2566 if ((prot & VM_PROT_WRITE) == 0 && 2567 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2568 (PTE_SW_MANAGED | PTE_D)) { 2569 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2570 vm_page_dirty(m); 2571 } 2572 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2573 goto retryl3; 2574 anychanged = true; 2575 } 2576 } 2577 if (anychanged) 2578 pmap_invalidate_all(pmap); 2579 if (pv_lists_locked) 2580 rw_runlock(&pvh_global_lock); 2581 PMAP_UNLOCK(pmap); 2582 } 2583 2584 int 2585 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2586 { 2587 pd_entry_t *l2, l2e; 2588 pt_entry_t bits, *pte, oldpte; 2589 int rv; 2590 2591 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va)); 2592 2593 rv = 0; 2594 PMAP_LOCK(pmap); 2595 l2 = pmap_l2(pmap, va); 2596 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2597 goto done; 2598 if ((l2e & PTE_RWX) == 0) { 2599 pte = pmap_l2_to_l3(l2, va); 2600 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2601 goto done; 2602 } else { 2603 pte = l2; 2604 oldpte = l2e; 2605 } 2606 2607 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2608 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2609 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2610 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2611 goto done; 2612 2613 bits = PTE_A; 2614 if (ftype == VM_PROT_WRITE) 2615 bits |= PTE_D; 2616 2617 /* 2618 * Spurious faults can occur if the implementation caches invalid 2619 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2620 * race with each other. 2621 */ 2622 if ((oldpte & bits) != bits) 2623 pmap_store_bits(pte, bits); 2624 sfence_vma(); 2625 rv = 1; 2626 done: 2627 PMAP_UNLOCK(pmap); 2628 return (rv); 2629 } 2630 2631 static bool 2632 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2633 { 2634 struct rwlock *lock; 2635 bool rv; 2636 2637 lock = NULL; 2638 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2639 if (lock != NULL) 2640 rw_wunlock(lock); 2641 return (rv); 2642 } 2643 2644 /* 2645 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2646 * mapping is invalidated. 2647 */ 2648 static bool 2649 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2650 struct rwlock **lockp) 2651 { 2652 struct spglist free; 2653 vm_page_t mpte; 2654 pd_entry_t newl2, oldl2; 2655 pt_entry_t *firstl3, newl3; 2656 vm_paddr_t mptepa; 2657 int i; 2658 2659 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2660 2661 oldl2 = pmap_load(l2); 2662 KASSERT((oldl2 & PTE_RWX) != 0, 2663 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2664 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2665 NULL) { 2666 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2667 (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 0) | 2668 VM_ALLOC_WIRED)) == NULL) { 2669 SLIST_INIT(&free); 2670 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2671 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2672 vm_page_free_pages_toq(&free, true); 2673 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2674 "failure for va %#lx in pmap %p", va, pmap); 2675 return (false); 2676 } 2677 mpte->pindex = pmap_l2_pindex(va); 2678 if (va < VM_MAXUSER_ADDRESS) { 2679 mpte->ref_count = Ln_ENTRIES; 2680 pmap_resident_count_inc(pmap, 1); 2681 } 2682 } 2683 mptepa = VM_PAGE_TO_PHYS(mpte); 2684 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2685 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2686 KASSERT((oldl2 & PTE_A) != 0, 2687 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2688 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2689 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2690 newl3 = oldl2; 2691 2692 /* 2693 * If the page table page is not leftover from an earlier promotion, 2694 * initialize it. 2695 */ 2696 if (mpte->valid == 0) { 2697 for (i = 0; i < Ln_ENTRIES; i++) 2698 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2699 } 2700 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2701 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2702 "addresses")); 2703 2704 /* 2705 * If the mapping has changed attributes, update the page table 2706 * entries. 2707 */ 2708 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2709 for (i = 0; i < Ln_ENTRIES; i++) 2710 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2711 2712 /* 2713 * The spare PV entries must be reserved prior to demoting the 2714 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2715 * state of the L2 entry and the PV lists will be inconsistent, which 2716 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2717 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2718 * expected PV entry for the 2MB page mapping that is being demoted. 2719 */ 2720 if ((oldl2 & PTE_SW_MANAGED) != 0) 2721 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2722 2723 /* 2724 * Demote the mapping. 2725 */ 2726 pmap_store(l2, newl2); 2727 2728 /* 2729 * Demote the PV entry. 2730 */ 2731 if ((oldl2 & PTE_SW_MANAGED) != 0) 2732 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2733 2734 atomic_add_long(&pmap_l2_demotions, 1); 2735 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2736 va, pmap); 2737 return (true); 2738 } 2739 2740 #if VM_NRESERVLEVEL > 0 2741 static void 2742 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3, 2743 struct rwlock **lockp) 2744 { 2745 pt_entry_t *firstl3, firstl3e, *l3, l3e; 2746 vm_paddr_t pa; 2747 2748 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2749 2750 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2751 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2752 2753 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2754 firstl3e = pmap_load(firstl3); 2755 pa = PTE_TO_PHYS(firstl3e); 2756 if ((pa & L2_OFFSET) != 0) { 2757 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2758 va, pmap); 2759 atomic_add_long(&pmap_l2_p_failures, 1); 2760 return; 2761 } 2762 2763 /* 2764 * Downgrade a clean, writable mapping to read-only to ensure that the 2765 * hardware does not set PTE_D while we are comparing PTEs. 2766 * 2767 * Upon a write access to a clean mapping, the implementation will 2768 * either atomically check protections and set PTE_D, or raise a page 2769 * fault. 
In the latter case, the pmap lock provides atomicity. Thus, 2770 * we do not issue an sfence.vma here and instead rely on pmap_fault() 2771 * to do so lazily. 2772 */ 2773 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 2774 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 2775 firstl3e &= ~PTE_W; 2776 break; 2777 } 2778 } 2779 2780 pa += PAGE_SIZE; 2781 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2782 l3e = pmap_load(l3); 2783 if (PTE_TO_PHYS(l3e) != pa) { 2784 CTR2(KTR_PMAP, 2785 "pmap_promote_l2: failure for va %#lx pmap %p", 2786 va, pmap); 2787 atomic_add_long(&pmap_l2_p_failures, 1); 2788 return; 2789 } 2790 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 2791 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 2792 l3e &= ~PTE_W; 2793 break; 2794 } 2795 } 2796 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 2797 CTR2(KTR_PMAP, 2798 "pmap_promote_l2: failure for va %#lx pmap %p", 2799 va, pmap); 2800 atomic_add_long(&pmap_l2_p_failures, 1); 2801 return; 2802 } 2803 pa += PAGE_SIZE; 2804 } 2805 2806 if (ml3 == NULL) 2807 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2808 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2809 ("pmap_promote_l2: page table page's pindex is wrong")); 2810 if (pmap_insert_pt_page(pmap, ml3, true)) { 2811 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2812 va, pmap); 2813 atomic_add_long(&pmap_l2_p_failures, 1); 2814 return; 2815 } 2816 2817 if ((firstl3e & PTE_SW_MANAGED) != 0) 2818 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 2819 2820 pmap_store(l2, firstl3e); 2821 2822 atomic_add_long(&pmap_l2_promotions, 1); 2823 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2824 pmap); 2825 } 2826 #endif 2827 2828 /* 2829 * Insert the given physical page (p) at 2830 * the specified virtual address (v) in the 2831 * target physical map with the protection requested. 2832 * 2833 * If specified, the page will be wired down, meaning 2834 * that the related pte can not be reclaimed. 2835 * 2836 * NB: This is the only routine which MAY NOT lazy-evaluate 2837 * or lose information. That is, this routine must actually 2838 * insert this page into the given map NOW. 2839 */ 2840 int 2841 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2842 u_int flags, int8_t psind) 2843 { 2844 struct rwlock *lock; 2845 pd_entry_t *l1, *l2, l2e; 2846 pt_entry_t new_l3, orig_l3; 2847 pt_entry_t *l3; 2848 pv_entry_t pv; 2849 vm_paddr_t opa, pa, l2_pa, l3_pa; 2850 vm_page_t mpte, om, l2_m, l3_m; 2851 pt_entry_t entry; 2852 pn_t l2_pn, l3_pn, pn; 2853 int rv; 2854 bool nosleep; 2855 2856 va = trunc_page(va); 2857 if ((m->oflags & VPO_UNMANAGED) == 0) 2858 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2859 pa = VM_PAGE_TO_PHYS(m); 2860 pn = (pa / PAGE_SIZE); 2861 2862 new_l3 = PTE_V | PTE_R | PTE_A; 2863 if (prot & VM_PROT_EXECUTE) 2864 new_l3 |= PTE_X; 2865 if (flags & VM_PROT_WRITE) 2866 new_l3 |= PTE_D; 2867 if (prot & VM_PROT_WRITE) 2868 new_l3 |= PTE_W; 2869 if (va < VM_MAX_USER_ADDRESS) 2870 new_l3 |= PTE_U; 2871 2872 new_l3 |= (pn << PTE_PPN0_S); 2873 if ((flags & PMAP_ENTER_WIRED) != 0) 2874 new_l3 |= PTE_SW_WIRED; 2875 2876 /* 2877 * Set modified bit gratuitously for writeable mappings if 2878 * the page is unmanaged. We do not want to take a fault 2879 * to do the dirty bit accounting for these mappings. 
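 * Managed pages are instead tagged with PTE_SW_MANAGED so that dirty and
 * referenced state can later be reflected back to the vm_page.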
2880 */ 2881 if ((m->oflags & VPO_UNMANAGED) != 0) { 2882 if (prot & VM_PROT_WRITE) 2883 new_l3 |= PTE_D; 2884 } else 2885 new_l3 |= PTE_SW_MANAGED; 2886 2887 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2888 2889 lock = NULL; 2890 mpte = NULL; 2891 rw_rlock(&pvh_global_lock); 2892 PMAP_LOCK(pmap); 2893 if (psind == 1) { 2894 /* Assert the required virtual and physical alignment. */ 2895 KASSERT((va & L2_OFFSET) == 0, 2896 ("pmap_enter: va %#lx unaligned", va)); 2897 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2898 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2899 goto out; 2900 } 2901 2902 l2 = pmap_l2(pmap, va); 2903 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2904 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2905 va, &lock))) { 2906 l3 = pmap_l2_to_l3(l2, va); 2907 if (va < VM_MAXUSER_ADDRESS) { 2908 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2909 mpte->ref_count++; 2910 } 2911 } else if (va < VM_MAXUSER_ADDRESS) { 2912 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2913 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 2914 if (mpte == NULL && nosleep) { 2915 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2916 if (lock != NULL) 2917 rw_wunlock(lock); 2918 rw_runlock(&pvh_global_lock); 2919 PMAP_UNLOCK(pmap); 2920 return (KERN_RESOURCE_SHORTAGE); 2921 } 2922 l3 = pmap_l3(pmap, va); 2923 } else { 2924 l3 = pmap_l3(pmap, va); 2925 /* TODO: This is not optimal, but should mostly work */ 2926 if (l3 == NULL) { 2927 if (l2 == NULL) { 2928 l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2929 VM_ALLOC_ZERO); 2930 if (l2_m == NULL) 2931 panic("pmap_enter: l2 pte_m == NULL"); 2932 2933 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2934 l2_pn = (l2_pa / PAGE_SIZE); 2935 2936 l1 = pmap_l1(pmap, va); 2937 entry = (PTE_V); 2938 entry |= (l2_pn << PTE_PPN0_S); 2939 pmap_store(l1, entry); 2940 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2941 l2 = pmap_l1_to_l2(l1, va); 2942 } 2943 2944 l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2945 VM_ALLOC_ZERO); 2946 if (l3_m == NULL) 2947 panic("pmap_enter: l3 pte_m == NULL"); 2948 2949 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2950 l3_pn = (l3_pa / PAGE_SIZE); 2951 entry = (PTE_V); 2952 entry |= (l3_pn << PTE_PPN0_S); 2953 pmap_store(l2, entry); 2954 l3 = pmap_l2_to_l3(l2, va); 2955 } 2956 pmap_invalidate_page(pmap, va); 2957 } 2958 2959 orig_l3 = pmap_load(l3); 2960 opa = PTE_TO_PHYS(orig_l3); 2961 pv = NULL; 2962 2963 /* 2964 * Is the specified virtual address already mapped? 2965 */ 2966 if ((orig_l3 & PTE_V) != 0) { 2967 /* 2968 * Wiring change, just update stats. We don't worry about 2969 * wiring PT pages as they remain resident as long as there 2970 * are valid mappings in them. Hence, if a user page is wired, 2971 * the PT page will be also. 2972 */ 2973 if ((flags & PMAP_ENTER_WIRED) != 0 && 2974 (orig_l3 & PTE_SW_WIRED) == 0) 2975 pmap->pm_stats.wired_count++; 2976 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2977 (orig_l3 & PTE_SW_WIRED) != 0) 2978 pmap->pm_stats.wired_count--; 2979 2980 /* 2981 * Remove the extra PT page reference. 2982 */ 2983 if (mpte != NULL) { 2984 mpte->ref_count--; 2985 KASSERT(mpte->ref_count > 0, 2986 ("pmap_enter: missing reference to page table page," 2987 " va: 0x%lx", va)); 2988 } 2989 2990 /* 2991 * Has the physical page changed? 2992 */ 2993 if (opa == pa) { 2994 /* 2995 * No, might be a protection or wiring change. 
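 * The existing PV entry, if any, is reused; only the PTE itself is
 * updated below.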
2996 */ 2997 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2998 (new_l3 & PTE_W) != 0) 2999 vm_page_aflag_set(m, PGA_WRITEABLE); 3000 goto validate; 3001 } 3002 3003 /* 3004 * The physical page has changed. Temporarily invalidate 3005 * the mapping. This ensures that all threads sharing the 3006 * pmap keep a consistent view of the mapping, which is 3007 * necessary for the correct handling of COW faults. It 3008 * also permits reuse of the old mapping's PV entry, 3009 * avoiding an allocation. 3010 * 3011 * For consistency, handle unmanaged mappings the same way. 3012 */ 3013 orig_l3 = pmap_load_clear(l3); 3014 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 3015 ("pmap_enter: unexpected pa update for %#lx", va)); 3016 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 3017 om = PHYS_TO_VM_PAGE(opa); 3018 3019 /* 3020 * The pmap lock is sufficient to synchronize with 3021 * concurrent calls to pmap_page_test_mappings() and 3022 * pmap_ts_referenced(). 3023 */ 3024 if ((orig_l3 & PTE_D) != 0) 3025 vm_page_dirty(om); 3026 if ((orig_l3 & PTE_A) != 0) 3027 vm_page_aflag_set(om, PGA_REFERENCED); 3028 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3029 pv = pmap_pvh_remove(&om->md, pmap, va); 3030 KASSERT(pv != NULL, 3031 ("pmap_enter: no PV entry for %#lx", va)); 3032 if ((new_l3 & PTE_SW_MANAGED) == 0) 3033 free_pv_entry(pmap, pv); 3034 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3035 TAILQ_EMPTY(&om->md.pv_list) && 3036 ((om->flags & PG_FICTITIOUS) != 0 || 3037 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3038 vm_page_aflag_clear(om, PGA_WRITEABLE); 3039 } 3040 pmap_invalidate_page(pmap, va); 3041 orig_l3 = 0; 3042 } else { 3043 /* 3044 * Increment the counters. 3045 */ 3046 if ((new_l3 & PTE_SW_WIRED) != 0) 3047 pmap->pm_stats.wired_count++; 3048 pmap_resident_count_inc(pmap, 1); 3049 } 3050 /* 3051 * Enter on the PV list if part of our managed memory. 3052 */ 3053 if ((new_l3 & PTE_SW_MANAGED) != 0) { 3054 if (pv == NULL) { 3055 pv = get_pv_entry(pmap, &lock); 3056 pv->pv_va = va; 3057 } 3058 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3059 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3060 m->md.pv_gen++; 3061 if ((new_l3 & PTE_W) != 0) 3062 vm_page_aflag_set(m, PGA_WRITEABLE); 3063 } 3064 3065 validate: 3066 /* 3067 * Sync the i-cache on all harts before updating the PTE 3068 * if the new PTE is executable. 3069 */ 3070 if (prot & VM_PROT_EXECUTE) 3071 pmap_sync_icache(pmap, va, PAGE_SIZE); 3072 3073 /* 3074 * Update the L3 entry. 3075 */ 3076 if (orig_l3 != 0) { 3077 orig_l3 = pmap_load_store(l3, new_l3); 3078 pmap_invalidate_page(pmap, va); 3079 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 3080 ("pmap_enter: invalid update")); 3081 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 3082 (PTE_D | PTE_SW_MANAGED)) 3083 vm_page_dirty(m); 3084 } else { 3085 pmap_store(l3, new_l3); 3086 } 3087 3088 #if VM_NRESERVLEVEL > 0 3089 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 3090 pmap_ps_enabled(pmap) && 3091 (m->flags & PG_FICTITIOUS) == 0 && 3092 vm_reserv_level_iffullpop(m) == 0) 3093 pmap_promote_l2(pmap, l2, va, mpte, &lock); 3094 #endif 3095 3096 rv = KERN_SUCCESS; 3097 out: 3098 if (lock != NULL) 3099 rw_wunlock(lock); 3100 rw_runlock(&pvh_global_lock); 3101 PMAP_UNLOCK(pmap); 3102 return (rv); 3103 } 3104 3105 /* 3106 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 3107 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 3108 * value. See pmap_enter_l2() for the possible error values when "no sleep", 3109 * "no replace", and "no reclaim" are specified. 
3110 */ 3111 static int 3112 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3113 struct rwlock **lockp) 3114 { 3115 pd_entry_t new_l2; 3116 pn_t pn; 3117 3118 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3119 3120 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 3121 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 3122 if ((m->oflags & VPO_UNMANAGED) == 0) 3123 new_l2 |= PTE_SW_MANAGED; 3124 if ((prot & VM_PROT_EXECUTE) != 0) 3125 new_l2 |= PTE_X; 3126 if (va < VM_MAXUSER_ADDRESS) 3127 new_l2 |= PTE_U; 3128 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3129 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 3130 } 3131 3132 /* 3133 * Returns true if every page table entry in the specified page table is 3134 * zero. 3135 */ 3136 static bool 3137 pmap_every_pte_zero(vm_paddr_t pa) 3138 { 3139 pt_entry_t *pt_end, *pte; 3140 3141 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 3142 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 3143 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 3144 if (*pte != 0) 3145 return (false); 3146 } 3147 return (true); 3148 } 3149 3150 /* 3151 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3152 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 3153 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3154 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 3155 * within the 2MB virtual address range starting at the specified virtual 3156 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 3157 * 2MB page mapping already exists at the specified virtual address. Returns 3158 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 3159 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 3160 * and a PV entry allocation failed. 3161 * 3162 * The parameter "m" is only used when creating a managed, writeable mapping. 3163 */ 3164 static int 3165 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3166 vm_page_t m, struct rwlock **lockp) 3167 { 3168 struct spglist free; 3169 pd_entry_t *l2, *l3, oldl2; 3170 vm_offset_t sva; 3171 vm_page_t l2pg, mt; 3172 3173 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3174 3175 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 
3176 NULL : lockp)) == NULL) { 3177 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page" 3178 " for va %#lx in pmap %p", va, pmap); 3179 return (KERN_RESOURCE_SHORTAGE); 3180 } 3181 3182 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 3183 l2 = &l2[pmap_l2_index(va)]; 3184 if ((oldl2 = pmap_load(l2)) != 0) { 3185 KASSERT(l2pg->ref_count > 1, 3186 ("pmap_enter_l2: l2pg's ref count is too low")); 3187 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3188 if ((oldl2 & PTE_RWX) != 0) { 3189 l2pg->ref_count--; 3190 CTR2(KTR_PMAP, 3191 "pmap_enter_l2: no space for va %#lx" 3192 " in pmap %p", va, pmap); 3193 return (KERN_NO_SPACE); 3194 } else if (va < VM_MAXUSER_ADDRESS || 3195 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) { 3196 l2pg->ref_count--; 3197 CTR2(KTR_PMAP, "pmap_enter_l2:" 3198 " failed to replace existing mapping" 3199 " for va %#lx in pmap %p", va, pmap); 3200 return (KERN_FAILURE); 3201 } 3202 } 3203 SLIST_INIT(&free); 3204 if ((oldl2 & PTE_RWX) != 0) 3205 (void)pmap_remove_l2(pmap, l2, va, 3206 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3207 else 3208 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3209 l3 = pmap_l2_to_l3(l2, sva); 3210 if ((pmap_load(l3) & PTE_V) != 0 && 3211 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3212 lockp) != 0) 3213 break; 3214 } 3215 vm_page_free_pages_toq(&free, true); 3216 if (va >= VM_MAXUSER_ADDRESS) { 3217 /* 3218 * Both pmap_remove_l2() and pmap_remove_l3() will 3219 * leave the kernel page table page zero filled. 3220 */ 3221 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 3222 if (pmap_insert_pt_page(pmap, mt, false)) 3223 panic("pmap_enter_l2: trie insert failed"); 3224 } else 3225 KASSERT(pmap_load(l2) == 0, 3226 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3227 } 3228 3229 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3230 /* 3231 * Abort this mapping if its PV entry could not be created. 3232 */ 3233 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3234 SLIST_INIT(&free); 3235 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3236 /* 3237 * Although "va" is not mapped, paging-structure 3238 * caches could nonetheless have entries that 3239 * refer to the freed page table pages. 3240 * Invalidate those entries. 3241 */ 3242 pmap_invalidate_page(pmap, va); 3243 vm_page_free_pages_toq(&free, true); 3244 } 3245 CTR2(KTR_PMAP, 3246 "pmap_enter_l2: failed to create PV entry" 3247 " for va %#lx in pmap %p", va, pmap); 3248 return (KERN_RESOURCE_SHORTAGE); 3249 } 3250 if ((new_l2 & PTE_W) != 0) 3251 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3252 vm_page_aflag_set(mt, PGA_WRITEABLE); 3253 } 3254 3255 /* 3256 * Increment counters. 3257 */ 3258 if ((new_l2 & PTE_SW_WIRED) != 0) 3259 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3260 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3261 3262 /* 3263 * Map the superpage. 3264 */ 3265 pmap_store(l2, new_l2); 3266 3267 atomic_add_long(&pmap_l2_mappings, 1); 3268 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3269 va, pmap); 3270 3271 return (KERN_SUCCESS); 3272 } 3273 3274 /* 3275 * Maps a sequence of resident pages belonging to the same object. 3276 * The sequence begins with the given page m_start. This page is 3277 * mapped at the given virtual address start. Each subsequent page is 3278 * mapped at a virtual address that is offset from start by the same 3279 * amount as the page is offset from m_start within the object. 
The 3280 * last page in the sequence is the page with the largest offset from 3281 * m_start that can be mapped at a virtual address less than the given 3282 * virtual address end. Not every virtual page between start and end 3283 * is mapped; only those for which a resident page exists with the 3284 * corresponding offset from m_start are mapped. 3285 */ 3286 void 3287 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3288 vm_page_t m_start, vm_prot_t prot) 3289 { 3290 struct rwlock *lock; 3291 vm_offset_t va; 3292 vm_page_t m, mpte; 3293 vm_pindex_t diff, psize; 3294 int rv; 3295 3296 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3297 3298 psize = atop(end - start); 3299 mpte = NULL; 3300 m = m_start; 3301 lock = NULL; 3302 rw_rlock(&pvh_global_lock); 3303 PMAP_LOCK(pmap); 3304 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3305 va = start + ptoa(diff); 3306 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3307 m->psind == 1 && pmap_ps_enabled(pmap) && 3308 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 3309 KERN_SUCCESS || rv == KERN_NO_SPACE)) 3310 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3311 else 3312 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3313 &lock); 3314 m = TAILQ_NEXT(m, listq); 3315 } 3316 if (lock != NULL) 3317 rw_wunlock(lock); 3318 rw_runlock(&pvh_global_lock); 3319 PMAP_UNLOCK(pmap); 3320 } 3321 3322 /* 3323 * this code makes some *MAJOR* assumptions: 3324 * 1. Current pmap & pmap exists. 3325 * 2. Not wired. 3326 * 3. Read access. 3327 * 4. No page table pages. 3328 * but is *MUCH* faster than pmap_enter... 3329 */ 3330 3331 void 3332 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3333 { 3334 struct rwlock *lock; 3335 3336 lock = NULL; 3337 rw_rlock(&pvh_global_lock); 3338 PMAP_LOCK(pmap); 3339 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3340 if (lock != NULL) 3341 rw_wunlock(lock); 3342 rw_runlock(&pvh_global_lock); 3343 PMAP_UNLOCK(pmap); 3344 } 3345 3346 static vm_page_t 3347 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3348 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3349 { 3350 struct spglist free; 3351 vm_paddr_t phys; 3352 pd_entry_t *l2; 3353 pt_entry_t *l3, newl3; 3354 3355 KASSERT(!VA_IS_CLEANMAP(va) || 3356 (m->oflags & VPO_UNMANAGED) != 0, 3357 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3358 rw_assert(&pvh_global_lock, RA_LOCKED); 3359 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3360 3361 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3362 /* 3363 * In the case that a page table page is not 3364 * resident, we are creating it here. 3365 */ 3366 if (va < VM_MAXUSER_ADDRESS) { 3367 vm_pindex_t l2pindex; 3368 3369 /* 3370 * Calculate pagetable page index 3371 */ 3372 l2pindex = pmap_l2_pindex(va); 3373 if (mpte && (mpte->pindex == l2pindex)) { 3374 mpte->ref_count++; 3375 } else { 3376 /* 3377 * Get the l2 entry 3378 */ 3379 l2 = pmap_l2(pmap, va); 3380 3381 /* 3382 * If the page table page is mapped, we just increment 3383 * the hold count, and activate it. Otherwise, we 3384 * attempt to allocate a page table page. If this 3385 * attempt fails, we don't retry. Instead, we give up. 
3386 */ 3387 if (l2 != NULL && pmap_load(l2) != 0) { 3388 if ((pmap_load(l2) & PTE_RWX) != 0) 3389 return (NULL); 3390 phys = PTE_TO_PHYS(pmap_load(l2)); 3391 mpte = PHYS_TO_VM_PAGE(phys); 3392 mpte->ref_count++; 3393 } else { 3394 /* 3395 * Pass NULL instead of the PV list lock 3396 * pointer, because we don't intend to sleep. 3397 */ 3398 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3399 if (mpte == NULL) 3400 return (mpte); 3401 } 3402 } 3403 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3404 l3 = &l3[pmap_l3_index(va)]; 3405 } else { 3406 mpte = NULL; 3407 l3 = pmap_l3(kernel_pmap, va); 3408 } 3409 if (l3 == NULL) 3410 panic("pmap_enter_quick_locked: No l3"); 3411 if (pmap_load(l3) != 0) { 3412 if (mpte != NULL) { 3413 mpte->ref_count--; 3414 mpte = NULL; 3415 } 3416 return (mpte); 3417 } 3418 3419 /* 3420 * Enter on the PV list if part of our managed memory. 3421 */ 3422 if ((m->oflags & VPO_UNMANAGED) == 0 && 3423 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3424 if (mpte != NULL) { 3425 SLIST_INIT(&free); 3426 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3427 pmap_invalidate_page(pmap, va); 3428 vm_page_free_pages_toq(&free, false); 3429 } 3430 mpte = NULL; 3431 } 3432 return (mpte); 3433 } 3434 3435 /* 3436 * Increment counters 3437 */ 3438 pmap_resident_count_inc(pmap, 1); 3439 3440 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3441 PTE_V | PTE_R; 3442 if ((prot & VM_PROT_EXECUTE) != 0) 3443 newl3 |= PTE_X; 3444 if ((m->oflags & VPO_UNMANAGED) == 0) 3445 newl3 |= PTE_SW_MANAGED; 3446 if (va < VM_MAX_USER_ADDRESS) 3447 newl3 |= PTE_U; 3448 3449 /* 3450 * Sync the i-cache on all harts before updating the PTE 3451 * if the new PTE is executable. 3452 */ 3453 if (prot & VM_PROT_EXECUTE) 3454 pmap_sync_icache(pmap, va, PAGE_SIZE); 3455 3456 pmap_store(l3, newl3); 3457 3458 pmap_invalidate_page(pmap, va); 3459 return (mpte); 3460 } 3461 3462 /* 3463 * This code maps large physical mmap regions into the 3464 * processor address space. Note that some shortcuts 3465 * are taken, but the code works. 3466 */ 3467 void 3468 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3469 vm_pindex_t pindex, vm_size_t size) 3470 { 3471 3472 VM_OBJECT_ASSERT_WLOCKED(object); 3473 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3474 ("pmap_object_init_pt: non-device object")); 3475 } 3476 3477 /* 3478 * Clear the wired attribute from the mappings for the specified range of 3479 * addresses in the given pmap. Every valid mapping within that range 3480 * must have the wired attribute set. In contrast, invalid mappings 3481 * cannot have the wired attribute set, so they are ignored. 3482 * 3483 * The wired attribute of the page table entry is not a hardware feature, 3484 * so there is no need to invalidate any TLB entries. 
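 * PTE_SW_WIRED lives in the PTE's software-defined bits, which the MMU
 * ignores; clearing it therefore never changes how the hardware translates
 * the address.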
3485 */ 3486 void 3487 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3488 { 3489 vm_offset_t va_next; 3490 pd_entry_t *l0, *l1, *l2, l2e; 3491 pt_entry_t *l3, l3e; 3492 bool pv_lists_locked; 3493 3494 pv_lists_locked = false; 3495 retry: 3496 PMAP_LOCK(pmap); 3497 for (; sva < eva; sva = va_next) { 3498 if (pmap_mode == PMAP_MODE_SV48) { 3499 l0 = pmap_l0(pmap, sva); 3500 if (pmap_load(l0) == 0) { 3501 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3502 if (va_next < sva) 3503 va_next = eva; 3504 continue; 3505 } 3506 l1 = pmap_l0_to_l1(l0, sva); 3507 } else { 3508 l1 = pmap_l1(pmap, sva); 3509 } 3510 3511 if (pmap_load(l1) == 0) { 3512 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3513 if (va_next < sva) 3514 va_next = eva; 3515 continue; 3516 } 3517 3518 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3519 if (va_next < sva) 3520 va_next = eva; 3521 3522 l2 = pmap_l1_to_l2(l1, sva); 3523 if ((l2e = pmap_load(l2)) == 0) 3524 continue; 3525 if ((l2e & PTE_RWX) != 0) { 3526 if (sva + L2_SIZE == va_next && eva >= va_next) { 3527 if ((l2e & PTE_SW_WIRED) == 0) 3528 panic("pmap_unwire: l2 %#jx is missing " 3529 "PTE_SW_WIRED", (uintmax_t)l2e); 3530 pmap_clear_bits(l2, PTE_SW_WIRED); 3531 continue; 3532 } else { 3533 if (!pv_lists_locked) { 3534 pv_lists_locked = true; 3535 if (!rw_try_rlock(&pvh_global_lock)) { 3536 PMAP_UNLOCK(pmap); 3537 rw_rlock(&pvh_global_lock); 3538 /* Repeat sva. */ 3539 goto retry; 3540 } 3541 } 3542 if (!pmap_demote_l2(pmap, l2, sva)) 3543 panic("pmap_unwire: demotion failed"); 3544 } 3545 } 3546 3547 if (va_next > eva) 3548 va_next = eva; 3549 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3550 sva += L3_SIZE) { 3551 if ((l3e = pmap_load(l3)) == 0) 3552 continue; 3553 if ((l3e & PTE_SW_WIRED) == 0) 3554 panic("pmap_unwire: l3 %#jx is missing " 3555 "PTE_SW_WIRED", (uintmax_t)l3e); 3556 3557 /* 3558 * PG_W must be cleared atomically. Although the pmap 3559 * lock synchronizes access to PG_W, another processor 3560 * could be setting PG_M and/or PG_A concurrently. 3561 */ 3562 pmap_clear_bits(l3, PTE_SW_WIRED); 3563 pmap->pm_stats.wired_count--; 3564 } 3565 } 3566 if (pv_lists_locked) 3567 rw_runlock(&pvh_global_lock); 3568 PMAP_UNLOCK(pmap); 3569 } 3570 3571 /* 3572 * Copy the range specified by src_addr/len 3573 * from the source map to the range dst_addr/len 3574 * in the destination map. 3575 * 3576 * This routine is only advisory and need not do anything. 3577 */ 3578 3579 void 3580 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3581 vm_offset_t src_addr) 3582 { 3583 3584 } 3585 3586 /* 3587 * pmap_zero_page zeros the specified hardware page by mapping 3588 * the page into KVM and using bzero to clear its contents. 3589 */ 3590 void 3591 pmap_zero_page(vm_page_t m) 3592 { 3593 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3594 3595 pagezero((void *)va); 3596 } 3597 3598 /* 3599 * pmap_zero_page_area zeros the specified hardware page by mapping 3600 * the page into KVM and using bzero to clear its contents. 3601 * 3602 * off and size may not cover an area beyond a single hardware page. 
3603 */ 3604 void 3605 pmap_zero_page_area(vm_page_t m, int off, int size) 3606 { 3607 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3608 3609 if (off == 0 && size == PAGE_SIZE) 3610 pagezero((void *)va); 3611 else 3612 bzero((char *)va + off, size); 3613 } 3614 3615 /* 3616 * pmap_copy_page copies the specified (machine independent) 3617 * page by mapping the page into virtual memory and using 3618 * bcopy to copy the page, one machine dependent page at a 3619 * time. 3620 */ 3621 void 3622 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3623 { 3624 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3625 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3626 3627 pagecopy((void *)src, (void *)dst); 3628 } 3629 3630 int unmapped_buf_allowed = 1; 3631 3632 void 3633 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3634 vm_offset_t b_offset, int xfersize) 3635 { 3636 void *a_cp, *b_cp; 3637 vm_page_t m_a, m_b; 3638 vm_paddr_t p_a, p_b; 3639 vm_offset_t a_pg_offset, b_pg_offset; 3640 int cnt; 3641 3642 while (xfersize > 0) { 3643 a_pg_offset = a_offset & PAGE_MASK; 3644 m_a = ma[a_offset >> PAGE_SHIFT]; 3645 p_a = m_a->phys_addr; 3646 b_pg_offset = b_offset & PAGE_MASK; 3647 m_b = mb[b_offset >> PAGE_SHIFT]; 3648 p_b = m_b->phys_addr; 3649 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3650 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3651 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3652 panic("!DMAP a %lx", p_a); 3653 } else { 3654 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3655 } 3656 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3657 panic("!DMAP b %lx", p_b); 3658 } else { 3659 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3660 } 3661 bcopy(a_cp, b_cp, cnt); 3662 a_offset += cnt; 3663 b_offset += cnt; 3664 xfersize -= cnt; 3665 } 3666 } 3667 3668 vm_offset_t 3669 pmap_quick_enter_page(vm_page_t m) 3670 { 3671 3672 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3673 } 3674 3675 void 3676 pmap_quick_remove_page(vm_offset_t addr) 3677 { 3678 } 3679 3680 /* 3681 * Returns true if the pmap's pv is one of the first 3682 * 16 pvs linked to from this page. This count may 3683 * be changed upwards or downwards in the future; it 3684 * is only necessary that true be returned for a small 3685 * subset of pmaps for proper page aging. 3686 */ 3687 boolean_t 3688 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3689 { 3690 struct md_page *pvh; 3691 struct rwlock *lock; 3692 pv_entry_t pv; 3693 int loops = 0; 3694 boolean_t rv; 3695 3696 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3697 ("pmap_page_exists_quick: page %p is not managed", m)); 3698 rv = FALSE; 3699 rw_rlock(&pvh_global_lock); 3700 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3701 rw_rlock(lock); 3702 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3703 if (PV_PMAP(pv) == pmap) { 3704 rv = TRUE; 3705 break; 3706 } 3707 loops++; 3708 if (loops >= 16) 3709 break; 3710 } 3711 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3712 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3713 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3714 if (PV_PMAP(pv) == pmap) { 3715 rv = TRUE; 3716 break; 3717 } 3718 loops++; 3719 if (loops >= 16) 3720 break; 3721 } 3722 } 3723 rw_runlock(lock); 3724 rw_runlock(&pvh_global_lock); 3725 return (rv); 3726 } 3727 3728 /* 3729 * pmap_page_wired_mappings: 3730 * 3731 * Return the number of managed mappings to the given physical page 3732 * that are wired. 
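 * Both 4KB mappings on the page's own pv list and 2MB mappings on the
 * corresponding pa_to_pvh() list contribute to the returned count.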
3733 */ 3734 int 3735 pmap_page_wired_mappings(vm_page_t m) 3736 { 3737 struct md_page *pvh; 3738 struct rwlock *lock; 3739 pmap_t pmap; 3740 pd_entry_t *l2; 3741 pt_entry_t *l3; 3742 pv_entry_t pv; 3743 int count, md_gen, pvh_gen; 3744 3745 if ((m->oflags & VPO_UNMANAGED) != 0) 3746 return (0); 3747 rw_rlock(&pvh_global_lock); 3748 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3749 rw_rlock(lock); 3750 restart: 3751 count = 0; 3752 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3753 pmap = PV_PMAP(pv); 3754 if (!PMAP_TRYLOCK(pmap)) { 3755 md_gen = m->md.pv_gen; 3756 rw_runlock(lock); 3757 PMAP_LOCK(pmap); 3758 rw_rlock(lock); 3759 if (md_gen != m->md.pv_gen) { 3760 PMAP_UNLOCK(pmap); 3761 goto restart; 3762 } 3763 } 3764 l2 = pmap_l2(pmap, pv->pv_va); 3765 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3766 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3767 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3768 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3769 count++; 3770 PMAP_UNLOCK(pmap); 3771 } 3772 if ((m->flags & PG_FICTITIOUS) == 0) { 3773 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3774 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3775 pmap = PV_PMAP(pv); 3776 if (!PMAP_TRYLOCK(pmap)) { 3777 md_gen = m->md.pv_gen; 3778 pvh_gen = pvh->pv_gen; 3779 rw_runlock(lock); 3780 PMAP_LOCK(pmap); 3781 rw_rlock(lock); 3782 if (md_gen != m->md.pv_gen || 3783 pvh_gen != pvh->pv_gen) { 3784 PMAP_UNLOCK(pmap); 3785 goto restart; 3786 } 3787 } 3788 l2 = pmap_l2(pmap, pv->pv_va); 3789 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3790 count++; 3791 PMAP_UNLOCK(pmap); 3792 } 3793 } 3794 rw_runlock(lock); 3795 rw_runlock(&pvh_global_lock); 3796 return (count); 3797 } 3798 3799 /* 3800 * Returns true if the given page is mapped individually or as part of 3801 * a 2mpage. Otherwise, returns false. 3802 */ 3803 bool 3804 pmap_page_is_mapped(vm_page_t m) 3805 { 3806 struct rwlock *lock; 3807 bool rv; 3808 3809 if ((m->oflags & VPO_UNMANAGED) != 0) 3810 return (false); 3811 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3812 rw_rlock(lock); 3813 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3814 ((m->flags & PG_FICTITIOUS) == 0 && 3815 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3816 rw_runlock(lock); 3817 return (rv); 3818 } 3819 3820 static void 3821 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3822 struct spglist *free, bool superpage) 3823 { 3824 struct md_page *pvh; 3825 vm_page_t mpte, mt; 3826 3827 if (superpage) { 3828 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3829 pvh = pa_to_pvh(m->phys_addr); 3830 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3831 pvh->pv_gen++; 3832 if (TAILQ_EMPTY(&pvh->pv_list)) { 3833 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3834 if (TAILQ_EMPTY(&mt->md.pv_list) && 3835 (mt->a.flags & PGA_WRITEABLE) != 0) 3836 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3837 } 3838 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3839 if (mpte != NULL) { 3840 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3841 ("pmap_remove_pages: pte page not promoted")); 3842 pmap_resident_count_dec(pmap, 1); 3843 KASSERT(mpte->ref_count == Ln_ENTRIES, 3844 ("pmap_remove_pages: pte page ref count error")); 3845 mpte->ref_count = 0; 3846 pmap_add_delayed_free_list(mpte, free, FALSE); 3847 } 3848 } else { 3849 pmap_resident_count_dec(pmap, 1); 3850 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3851 m->md.pv_gen++; 3852 if (TAILQ_EMPTY(&m->md.pv_list) && 3853 (m->a.flags & PGA_WRITEABLE) != 0) { 3854 pvh = pa_to_pvh(m->phys_addr); 3855 if (TAILQ_EMPTY(&pvh->pv_list)) 3856 vm_page_aflag_clear(m, PGA_WRITEABLE); 3857 } 3858 } 3859 } 3860 3861 /* 3862 * Destroy 
all managed, non-wired mappings in the given user-space 3863 * pmap. This pmap cannot be active on any processor besides the 3864 * caller. 3865 * 3866 * This function cannot be applied to the kernel pmap. Moreover, it 3867 * is not intended for general use. It is only to be used during 3868 * process termination. Consequently, it can be implemented in ways 3869 * that make it faster than pmap_remove(). First, it can more quickly 3870 * destroy mappings by iterating over the pmap's collection of PV 3871 * entries, rather than searching the page table. Second, it doesn't 3872 * have to test and clear the page table entries atomically, because 3873 * no processor is currently accessing the user address space. In 3874 * particular, a page table entry's dirty bit won't change state once 3875 * this function starts. 3876 */ 3877 void 3878 pmap_remove_pages(pmap_t pmap) 3879 { 3880 struct spglist free; 3881 pd_entry_t ptepde; 3882 pt_entry_t *pte, tpte; 3883 vm_page_t m, mt; 3884 pv_entry_t pv; 3885 struct pv_chunk *pc, *npc; 3886 struct rwlock *lock; 3887 int64_t bit; 3888 uint64_t inuse, bitmask; 3889 int allfree, field, freed, idx; 3890 bool superpage; 3891 3892 lock = NULL; 3893 3894 SLIST_INIT(&free); 3895 rw_rlock(&pvh_global_lock); 3896 PMAP_LOCK(pmap); 3897 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3898 allfree = 1; 3899 freed = 0; 3900 for (field = 0; field < _NPCM; field++) { 3901 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3902 while (inuse != 0) { 3903 bit = ffsl(inuse) - 1; 3904 bitmask = 1UL << bit; 3905 idx = field * 64 + bit; 3906 pv = &pc->pc_pventry[idx]; 3907 inuse &= ~bitmask; 3908 3909 pte = pmap_l1(pmap, pv->pv_va); 3910 ptepde = pmap_load(pte); 3911 pte = pmap_l1_to_l2(pte, pv->pv_va); 3912 tpte = pmap_load(pte); 3913 3914 KASSERT((tpte & PTE_V) != 0, 3915 ("L2 PTE is invalid... bogus PV entry? " 3916 "va=%#lx, pte=%#lx", pv->pv_va, tpte)); 3917 if ((tpte & PTE_RWX) != 0) { 3918 superpage = true; 3919 } else { 3920 ptepde = tpte; 3921 pte = pmap_l2_to_l3(pte, pv->pv_va); 3922 tpte = pmap_load(pte); 3923 superpage = false; 3924 } 3925 3926 /* 3927 * We cannot remove wired pages from a 3928 * process' mapping at this time. 3929 */ 3930 if (tpte & PTE_SW_WIRED) { 3931 allfree = 0; 3932 continue; 3933 } 3934 3935 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3936 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3937 m < &vm_page_array[vm_page_array_size], 3938 ("pmap_remove_pages: bad pte %#jx", 3939 (uintmax_t)tpte)); 3940 3941 pmap_clear(pte); 3942 3943 /* 3944 * Update the vm_page_t clean/reference bits. 
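 * A page is marked dirty here only when its mapping is both writable
 * (PTE_W) and has the hardware dirty bit (PTE_D) set; for a 2MB
 * superpage, every constituent 4KB page is dirtied, since the single
 * dirty bit covers the whole mapping.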
3945 */ 3946 if ((tpte & (PTE_D | PTE_W)) == 3947 (PTE_D | PTE_W)) { 3948 if (superpage) 3949 for (mt = m; 3950 mt < &m[Ln_ENTRIES]; mt++) 3951 vm_page_dirty(mt); 3952 else 3953 vm_page_dirty(m); 3954 } 3955 3956 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3957 3958 /* Mark free */ 3959 pc->pc_map[field] |= bitmask; 3960 3961 pmap_remove_pages_pv(pmap, m, pv, &free, 3962 superpage); 3963 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3964 freed++; 3965 } 3966 } 3967 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3968 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3969 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3970 if (allfree) { 3971 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3972 free_pv_chunk(pc); 3973 } 3974 } 3975 if (lock != NULL) 3976 rw_wunlock(lock); 3977 pmap_invalidate_all(pmap); 3978 rw_runlock(&pvh_global_lock); 3979 PMAP_UNLOCK(pmap); 3980 vm_page_free_pages_toq(&free, false); 3981 } 3982 3983 static bool 3984 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3985 { 3986 struct md_page *pvh; 3987 struct rwlock *lock; 3988 pd_entry_t *l2; 3989 pt_entry_t *l3, mask; 3990 pv_entry_t pv; 3991 pmap_t pmap; 3992 int md_gen, pvh_gen; 3993 bool rv; 3994 3995 mask = 0; 3996 if (modified) 3997 mask |= PTE_D; 3998 if (accessed) 3999 mask |= PTE_A; 4000 4001 rv = FALSE; 4002 rw_rlock(&pvh_global_lock); 4003 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4004 rw_rlock(lock); 4005 restart: 4006 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4007 pmap = PV_PMAP(pv); 4008 if (!PMAP_TRYLOCK(pmap)) { 4009 md_gen = m->md.pv_gen; 4010 rw_runlock(lock); 4011 PMAP_LOCK(pmap); 4012 rw_rlock(lock); 4013 if (md_gen != m->md.pv_gen) { 4014 PMAP_UNLOCK(pmap); 4015 goto restart; 4016 } 4017 } 4018 l2 = pmap_l2(pmap, pv->pv_va); 4019 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4020 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4021 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4022 rv = (pmap_load(l3) & mask) == mask; 4023 PMAP_UNLOCK(pmap); 4024 if (rv) 4025 goto out; 4026 } 4027 if ((m->flags & PG_FICTITIOUS) == 0) { 4028 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4029 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4030 pmap = PV_PMAP(pv); 4031 if (!PMAP_TRYLOCK(pmap)) { 4032 md_gen = m->md.pv_gen; 4033 pvh_gen = pvh->pv_gen; 4034 rw_runlock(lock); 4035 PMAP_LOCK(pmap); 4036 rw_rlock(lock); 4037 if (md_gen != m->md.pv_gen || 4038 pvh_gen != pvh->pv_gen) { 4039 PMAP_UNLOCK(pmap); 4040 goto restart; 4041 } 4042 } 4043 l2 = pmap_l2(pmap, pv->pv_va); 4044 rv = (pmap_load(l2) & mask) == mask; 4045 PMAP_UNLOCK(pmap); 4046 if (rv) 4047 goto out; 4048 } 4049 } 4050 out: 4051 rw_runlock(lock); 4052 rw_runlock(&pvh_global_lock); 4053 return (rv); 4054 } 4055 4056 /* 4057 * pmap_is_modified: 4058 * 4059 * Return whether or not the specified physical page was modified 4060 * in any physical maps. 4061 */ 4062 boolean_t 4063 pmap_is_modified(vm_page_t m) 4064 { 4065 4066 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4067 ("pmap_is_modified: page %p is not managed", m)); 4068 4069 /* 4070 * If the page is not busied then this check is racy. 4071 */ 4072 if (!pmap_page_is_write_mapped(m)) 4073 return (FALSE); 4074 return (pmap_page_test_mappings(m, FALSE, TRUE)); 4075 } 4076 4077 /* 4078 * pmap_is_prefaultable: 4079 * 4080 * Return whether or not the specified virtual address is eligible 4081 * for prefault. 
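 * An address is prefaultable when its L3 page table page is already
 * allocated but the L3 entry itself is still invalid, so a mapping can
 * be entered without first allocating page table pages.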
4082 */ 4083 boolean_t 4084 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4085 { 4086 pt_entry_t *l3; 4087 boolean_t rv; 4088 4089 /* 4090 * Return TRUE if and only if the L3 entry for the specified virtual 4091 * address is allocated but invalid. 4092 */ 4093 rv = FALSE; 4094 PMAP_LOCK(pmap); 4095 l3 = pmap_l3(pmap, addr); 4096 if (l3 != NULL && pmap_load(l3) == 0) { 4097 rv = TRUE; 4098 } 4099 PMAP_UNLOCK(pmap); 4100 return (rv); 4101 } 4102 4103 /* 4104 * pmap_is_referenced: 4105 * 4106 * Return whether or not the specified physical page was referenced 4107 * in any physical maps. 4108 */ 4109 boolean_t 4110 pmap_is_referenced(vm_page_t m) 4111 { 4112 4113 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4114 ("pmap_is_referenced: page %p is not managed", m)); 4115 return (pmap_page_test_mappings(m, TRUE, FALSE)); 4116 } 4117 4118 /* 4119 * Clear the write and modified bits in each of the given page's mappings. 4120 */ 4121 void 4122 pmap_remove_write(vm_page_t m) 4123 { 4124 struct md_page *pvh; 4125 struct rwlock *lock; 4126 pmap_t pmap; 4127 pd_entry_t *l2; 4128 pt_entry_t *l3, oldl3, newl3; 4129 pv_entry_t next_pv, pv; 4130 vm_offset_t va; 4131 int md_gen, pvh_gen; 4132 4133 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4134 ("pmap_remove_write: page %p is not managed", m)); 4135 vm_page_assert_busied(m); 4136 4137 if (!pmap_page_is_write_mapped(m)) 4138 return; 4139 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4140 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4141 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4142 rw_rlock(&pvh_global_lock); 4143 retry_pv_loop: 4144 rw_wlock(lock); 4145 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4146 pmap = PV_PMAP(pv); 4147 if (!PMAP_TRYLOCK(pmap)) { 4148 pvh_gen = pvh->pv_gen; 4149 rw_wunlock(lock); 4150 PMAP_LOCK(pmap); 4151 rw_wlock(lock); 4152 if (pvh_gen != pvh->pv_gen) { 4153 PMAP_UNLOCK(pmap); 4154 rw_wunlock(lock); 4155 goto retry_pv_loop; 4156 } 4157 } 4158 va = pv->pv_va; 4159 l2 = pmap_l2(pmap, va); 4160 if ((pmap_load(l2) & PTE_W) != 0) 4161 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 4162 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4163 ("inconsistent pv lock %p %p for page %p", 4164 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4165 PMAP_UNLOCK(pmap); 4166 } 4167 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4168 pmap = PV_PMAP(pv); 4169 if (!PMAP_TRYLOCK(pmap)) { 4170 pvh_gen = pvh->pv_gen; 4171 md_gen = m->md.pv_gen; 4172 rw_wunlock(lock); 4173 PMAP_LOCK(pmap); 4174 rw_wlock(lock); 4175 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4176 PMAP_UNLOCK(pmap); 4177 rw_wunlock(lock); 4178 goto retry_pv_loop; 4179 } 4180 } 4181 l2 = pmap_l2(pmap, pv->pv_va); 4182 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4183 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4184 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4185 oldl3 = pmap_load(l3); 4186 retry: 4187 if ((oldl3 & PTE_W) != 0) { 4188 newl3 = oldl3 & ~(PTE_D | PTE_W); 4189 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 4190 goto retry; 4191 if ((oldl3 & PTE_D) != 0) 4192 vm_page_dirty(m); 4193 pmap_invalidate_page(pmap, pv->pv_va); 4194 } 4195 PMAP_UNLOCK(pmap); 4196 } 4197 rw_wunlock(lock); 4198 vm_page_aflag_clear(m, PGA_WRITEABLE); 4199 rw_runlock(&pvh_global_lock); 4200 } 4201 4202 /* 4203 * pmap_ts_referenced: 4204 * 4205 * Return a count of reference bits for a page, clearing those bits. 4206 * It is not necessary for every reference bit to be cleared, but it 4207 * is necessary that 0 only be returned when there are truly no 4208 * reference bits set. 
4209 * 4210 * As an optimization, update the page's dirty field if a modified bit is 4211 * found while counting reference bits. This opportunistic update can be 4212 * performed at low cost and can eliminate the need for some future calls 4213 * to pmap_is_modified(). However, since this function stops after 4214 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4215 * dirty pages. Those dirty pages will only be detected by a future call 4216 * to pmap_is_modified(). 4217 */ 4218 int 4219 pmap_ts_referenced(vm_page_t m) 4220 { 4221 struct spglist free; 4222 struct md_page *pvh; 4223 struct rwlock *lock; 4224 pv_entry_t pv, pvf; 4225 pmap_t pmap; 4226 pd_entry_t *l2, l2e; 4227 pt_entry_t *l3, l3e; 4228 vm_paddr_t pa; 4229 vm_offset_t va; 4230 int cleared, md_gen, not_cleared, pvh_gen; 4231 4232 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4233 ("pmap_ts_referenced: page %p is not managed", m)); 4234 SLIST_INIT(&free); 4235 cleared = 0; 4236 pa = VM_PAGE_TO_PHYS(m); 4237 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 4238 4239 lock = PHYS_TO_PV_LIST_LOCK(pa); 4240 rw_rlock(&pvh_global_lock); 4241 rw_wlock(lock); 4242 retry: 4243 not_cleared = 0; 4244 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4245 goto small_mappings; 4246 pv = pvf; 4247 do { 4248 pmap = PV_PMAP(pv); 4249 if (!PMAP_TRYLOCK(pmap)) { 4250 pvh_gen = pvh->pv_gen; 4251 rw_wunlock(lock); 4252 PMAP_LOCK(pmap); 4253 rw_wlock(lock); 4254 if (pvh_gen != pvh->pv_gen) { 4255 PMAP_UNLOCK(pmap); 4256 goto retry; 4257 } 4258 } 4259 va = pv->pv_va; 4260 l2 = pmap_l2(pmap, va); 4261 l2e = pmap_load(l2); 4262 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4263 /* 4264 * Although l2e is mapping a 2MB page, because 4265 * this function is called at a 4KB page granularity, 4266 * we only update the 4KB page under test. 4267 */ 4268 vm_page_dirty(m); 4269 } 4270 if ((l2e & PTE_A) != 0) { 4271 /* 4272 * Since this reference bit is shared by 512 4KB 4273 * pages, it should not be cleared every time it is 4274 * tested. Apply a simple "hash" function on the 4275 * physical page number, the virtual superpage number, 4276 * and the pmap address to select one 4KB page out of 4277 * the 512 on which testing the reference bit will 4278 * result in clearing that reference bit. This 4279 * function is designed to avoid the selection of the 4280 * same 4KB page for every 2MB page mapping. 4281 * 4282 * On demotion, a mapping that hasn't been referenced 4283 * is simply destroyed. To avoid the possibility of a 4284 * subsequent page fault on a demoted wired mapping, 4285 * always leave its reference bit set. Moreover, 4286 * since the superpage is wired, the current state of 4287 * its reference bit won't affect page replacement. 4288 */ 4289 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4290 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4291 (l2e & PTE_SW_WIRED) == 0) { 4292 pmap_clear_bits(l2, PTE_A); 4293 pmap_invalidate_page(pmap, va); 4294 cleared++; 4295 } else 4296 not_cleared++; 4297 } 4298 PMAP_UNLOCK(pmap); 4299 /* Rotate the PV list if it has more than one entry. 
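 * Moving the just-inspected entry to the tail spreads the
 * reference-bit sampling across all of the page's mappings on
 * successive calls.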
*/ 4300 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4301 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4302 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4303 pvh->pv_gen++; 4304 } 4305 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4306 goto out; 4307 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4308 small_mappings: 4309 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4310 goto out; 4311 pv = pvf; 4312 do { 4313 pmap = PV_PMAP(pv); 4314 if (!PMAP_TRYLOCK(pmap)) { 4315 pvh_gen = pvh->pv_gen; 4316 md_gen = m->md.pv_gen; 4317 rw_wunlock(lock); 4318 PMAP_LOCK(pmap); 4319 rw_wlock(lock); 4320 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4321 PMAP_UNLOCK(pmap); 4322 goto retry; 4323 } 4324 } 4325 l2 = pmap_l2(pmap, pv->pv_va); 4326 4327 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4328 ("pmap_ts_referenced: found an invalid l2 table")); 4329 4330 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4331 l3e = pmap_load(l3); 4332 if ((l3e & PTE_D) != 0) 4333 vm_page_dirty(m); 4334 if ((l3e & PTE_A) != 0) { 4335 if ((l3e & PTE_SW_WIRED) == 0) { 4336 /* 4337 * Wired pages cannot be paged out so 4338 * doing accessed bit emulation for 4339 * them is wasted effort. We do the 4340 * hard work for unwired pages only. 4341 */ 4342 pmap_clear_bits(l3, PTE_A); 4343 pmap_invalidate_page(pmap, pv->pv_va); 4344 cleared++; 4345 } else 4346 not_cleared++; 4347 } 4348 PMAP_UNLOCK(pmap); 4349 /* Rotate the PV list if it has more than one entry. */ 4350 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4351 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4352 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4353 m->md.pv_gen++; 4354 } 4355 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4356 not_cleared < PMAP_TS_REFERENCED_MAX); 4357 out: 4358 rw_wunlock(lock); 4359 rw_runlock(&pvh_global_lock); 4360 vm_page_free_pages_toq(&free, false); 4361 return (cleared + not_cleared); 4362 } 4363 4364 /* 4365 * Apply the given advice to the specified range of addresses within the 4366 * given pmap. Depending on the advice, clear the referenced and/or 4367 * modified flags in each mapping and set the mapped page's dirty field. 4368 */ 4369 void 4370 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4371 { 4372 } 4373 4374 /* 4375 * Clear the modify bits on the specified physical page. 4376 */ 4377 void 4378 pmap_clear_modify(vm_page_t m) 4379 { 4380 struct md_page *pvh; 4381 struct rwlock *lock; 4382 pmap_t pmap; 4383 pv_entry_t next_pv, pv; 4384 pd_entry_t *l2, oldl2; 4385 pt_entry_t *l3; 4386 vm_offset_t va; 4387 int md_gen, pvh_gen; 4388 4389 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4390 ("pmap_clear_modify: page %p is not managed", m)); 4391 vm_page_assert_busied(m); 4392 4393 if (!pmap_page_is_write_mapped(m)) 4394 return; 4395 4396 /* 4397 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4398 * If the object containing the page is locked and the page is not 4399 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4400 */ 4401 if ((m->a.flags & PGA_WRITEABLE) == 0) 4402 return; 4403 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 4404 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4405 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4406 rw_rlock(&pvh_global_lock); 4407 rw_wlock(lock); 4408 restart: 4409 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4410 pmap = PV_PMAP(pv); 4411 if (!PMAP_TRYLOCK(pmap)) { 4412 pvh_gen = pvh->pv_gen; 4413 rw_wunlock(lock); 4414 PMAP_LOCK(pmap); 4415 rw_wlock(lock); 4416 if (pvh_gen != pvh->pv_gen) { 4417 PMAP_UNLOCK(pmap); 4418 goto restart; 4419 } 4420 } 4421 va = pv->pv_va; 4422 l2 = pmap_l2(pmap, va); 4423 oldl2 = pmap_load(l2); 4424 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4425 if ((oldl2 & PTE_W) != 0 && 4426 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4427 (oldl2 & PTE_SW_WIRED) == 0) { 4428 /* 4429 * Write protect the mapping to a single page so that 4430 * a subsequent write access may repromote. 4431 */ 4432 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4433 l3 = pmap_l2_to_l3(l2, va); 4434 pmap_clear_bits(l3, PTE_D | PTE_W); 4435 vm_page_dirty(m); 4436 pmap_invalidate_page(pmap, va); 4437 } 4438 PMAP_UNLOCK(pmap); 4439 } 4440 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4441 pmap = PV_PMAP(pv); 4442 if (!PMAP_TRYLOCK(pmap)) { 4443 md_gen = m->md.pv_gen; 4444 pvh_gen = pvh->pv_gen; 4445 rw_wunlock(lock); 4446 PMAP_LOCK(pmap); 4447 rw_wlock(lock); 4448 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4449 PMAP_UNLOCK(pmap); 4450 goto restart; 4451 } 4452 } 4453 l2 = pmap_l2(pmap, pv->pv_va); 4454 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4455 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4456 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4457 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4458 pmap_clear_bits(l3, PTE_D | PTE_W); 4459 pmap_invalidate_page(pmap, pv->pv_va); 4460 } 4461 PMAP_UNLOCK(pmap); 4462 } 4463 rw_wunlock(lock); 4464 rw_runlock(&pvh_global_lock); 4465 } 4466 4467 void * 4468 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4469 { 4470 4471 return ((void *)PHYS_TO_DMAP(pa)); 4472 } 4473 4474 void 4475 pmap_unmapbios(void *p, vm_size_t size) 4476 { 4477 } 4478 4479 /* 4480 * Sets the memory attribute for the specified page. 4481 */ 4482 void 4483 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4484 { 4485 4486 m->md.pv_memattr = ma; 4487 4488 /* 4489 * If "m" is a normal page, update its direct mapping. This update 4490 * can be relied upon to perform any cache operations that are 4491 * required for data coherence. 4492 */ 4493 if ((m->flags & PG_FICTITIOUS) == 0 && 4494 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4495 m->md.pv_memattr) != 0) 4496 panic("memory attribute change on the direct map failed"); 4497 } 4498 4499 /* 4500 * Changes the specified virtual address range's memory type to that given by 4501 * the parameter "mode". The specified virtual address range must be 4502 * completely contained within either the direct map or the kernel map. 4503 * 4504 * Returns zero if the change completed successfully, and either EINVAL or 4505 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4506 * of the virtual address range was not mapped, and ENOMEM is returned if 4507 * there was insufficient memory available to complete the change. In the 4508 * latter case, the memory type may have been changed on some part of the 4509 * virtual address range. 
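 *
 * Note that, pending Svpbmt support, pmap_change_attr_locked() below only
 * verifies that the whole range is mapped; existing leaf PTE attributes
 * are not rewritten yet (see its TODO comments).  A minimal caller
 * sketch, with purely illustrative error handling:
 *
 *	if (pmap_change_attr(va, size, VM_MEMATTR_DEVICE) != 0)
 *		printf("pmap_change_attr failed\n");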
4510 */ 4511 int 4512 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4513 { 4514 int error; 4515 4516 PMAP_LOCK(kernel_pmap); 4517 error = pmap_change_attr_locked(va, size, mode); 4518 PMAP_UNLOCK(kernel_pmap); 4519 return (error); 4520 } 4521 4522 static int 4523 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4524 { 4525 vm_offset_t base, offset, tmpva; 4526 pd_entry_t *l1, l1e; 4527 pd_entry_t *l2, l2e; 4528 pt_entry_t *l3, l3e; 4529 4530 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4531 base = trunc_page(va); 4532 offset = va & PAGE_MASK; 4533 size = round_page(offset + size); 4534 4535 if (!VIRT_IN_DMAP(base) && 4536 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4537 return (EINVAL); 4538 4539 for (tmpva = base; tmpva < base + size; ) { 4540 l1 = pmap_l1(kernel_pmap, tmpva); 4541 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4542 return (EINVAL); 4543 if ((l1e & PTE_RWX) != 0) { 4544 /* 4545 * TODO: Demote if attributes don't match and there 4546 * isn't an L1 page left in the range, and update the 4547 * L1 entry if the attributes don't match but there is 4548 * an L1 page left in the range, once we support the 4549 * upcoming Svpbmt extension. 4550 */ 4551 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4552 continue; 4553 } 4554 l2 = pmap_l1_to_l2(l1, tmpva); 4555 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 4556 return (EINVAL); 4557 if ((l2e & PTE_RWX) != 0) { 4558 /* 4559 * TODO: Demote if attributes don't match and there 4560 * isn't an L2 page left in the range, and update the 4561 * L2 entry if the attributes don't match but there is 4562 * an L2 page left in the range, once we support the 4563 * upcoming Svpbmt extension. 4564 */ 4565 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4566 continue; 4567 } 4568 l3 = pmap_l2_to_l3(l2, tmpva); 4569 if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0) 4570 return (EINVAL); 4571 /* 4572 * TODO: Update the L3 entry if the attributes don't match once 4573 * we support the upcoming Svpbmt extension. 4574 */ 4575 tmpva += PAGE_SIZE; 4576 } 4577 4578 return (0); 4579 } 4580 4581 /* 4582 * Perform the pmap work for mincore(2). If the page is not both referenced and 4583 * modified by this pmap, returns its physical address so that the caller can 4584 * find other mappings. 
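 * The address is reported through *pap only for managed mappings, since
 * only managed pages have PV lists through which the caller can find
 * those other mappings.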
4585 */ 4586 int 4587 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4588 { 4589 pt_entry_t *l2, *l3, tpte; 4590 vm_paddr_t pa; 4591 int val; 4592 bool managed; 4593 4594 PMAP_LOCK(pmap); 4595 l2 = pmap_l2(pmap, addr); 4596 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4597 if ((tpte & PTE_RWX) != 0) { 4598 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4599 val = MINCORE_INCORE | MINCORE_PSIND(1); 4600 } else { 4601 l3 = pmap_l2_to_l3(l2, addr); 4602 tpte = pmap_load(l3); 4603 if ((tpte & PTE_V) == 0) { 4604 PMAP_UNLOCK(pmap); 4605 return (0); 4606 } 4607 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4608 val = MINCORE_INCORE; 4609 } 4610 4611 if ((tpte & PTE_D) != 0) 4612 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4613 if ((tpte & PTE_A) != 0) 4614 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4615 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4616 } else { 4617 managed = false; 4618 val = 0; 4619 } 4620 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4621 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4622 *pap = pa; 4623 } 4624 PMAP_UNLOCK(pmap); 4625 return (val); 4626 } 4627 4628 void 4629 pmap_activate_sw(struct thread *td) 4630 { 4631 pmap_t oldpmap, pmap; 4632 u_int hart; 4633 4634 oldpmap = PCPU_GET(curpmap); 4635 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4636 if (pmap == oldpmap) 4637 return; 4638 csr_write(satp, pmap->pm_satp); 4639 4640 hart = PCPU_GET(hart); 4641 #ifdef SMP 4642 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4643 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4644 #else 4645 CPU_SET(hart, &pmap->pm_active); 4646 CPU_CLR(hart, &oldpmap->pm_active); 4647 #endif 4648 PCPU_SET(curpmap, pmap); 4649 4650 sfence_vma(); 4651 } 4652 4653 void 4654 pmap_activate(struct thread *td) 4655 { 4656 4657 critical_enter(); 4658 pmap_activate_sw(td); 4659 critical_exit(); 4660 } 4661 4662 void 4663 pmap_activate_boot(pmap_t pmap) 4664 { 4665 u_int hart; 4666 4667 hart = PCPU_GET(hart); 4668 #ifdef SMP 4669 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4670 #else 4671 CPU_SET(hart, &pmap->pm_active); 4672 #endif 4673 PCPU_SET(curpmap, pmap); 4674 } 4675 4676 void 4677 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4678 { 4679 cpuset_t mask; 4680 4681 /* 4682 * From the RISC-V User-Level ISA V2.2: 4683 * 4684 * "To make a store to instruction memory visible to all 4685 * RISC-V harts, the writing hart has to execute a data FENCE 4686 * before requesting that all remote RISC-V harts execute a 4687 * FENCE.I." 4688 * 4689 * However, this is slightly misleading; we still need to 4690 * perform a FENCE.I for the local hart, as FENCE does nothing 4691 * for its icache. FENCE.I alone is also sufficient for the 4692 * local hart. 4693 */ 4694 sched_pin(); 4695 mask = all_harts; 4696 CPU_CLR(PCPU_GET(hart), &mask); 4697 fence_i(); 4698 if (!CPU_EMPTY(&mask) && smp_started) { 4699 fence(); 4700 sbi_remote_fence_i(mask.__bits); 4701 } 4702 sched_unpin(); 4703 } 4704 4705 /* 4706 * Increase the starting virtual address of the given mapping if a 4707 * different alignment might result in more superpage mappings. 
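 *
 * For example, if "offset" lies 4KB past a 2MB boundary while the
 * proposed "*addr" is 2MB-aligned (and the request is large enough to
 * span a full 2MB page), *addr is advanced by 4KB so that
 * (*addr & L2_OFFSET) == (offset & L2_OFFSET).  The address is only
 * ever increased; requests smaller than L2_SIZE are left unchanged.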
4708 */ 4709 void 4710 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4711 vm_offset_t *addr, vm_size_t size) 4712 { 4713 vm_offset_t superpage_offset; 4714 4715 if (size < L2_SIZE) 4716 return; 4717 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4718 offset += ptoa(object->pg_color); 4719 superpage_offset = offset & L2_OFFSET; 4720 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4721 (*addr & L2_OFFSET) == superpage_offset) 4722 return; 4723 if ((*addr & L2_OFFSET) < superpage_offset) 4724 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4725 else 4726 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4727 } 4728 4729 /** 4730 * Get the kernel virtual address of a set of physical pages. If there are 4731 * physical addresses not covered by the DMAP perform a transient mapping 4732 * that will be removed when calling pmap_unmap_io_transient. 4733 * 4734 * \param page The pages the caller wishes to obtain the virtual 4735 * address on the kernel memory map. 4736 * \param vaddr On return contains the kernel virtual memory address 4737 * of the pages passed in the page parameter. 4738 * \param count Number of pages passed in. 4739 * \param can_fault TRUE if the thread using the mapped pages can take 4740 * page faults, FALSE otherwise. 4741 * 4742 * \returns TRUE if the caller must call pmap_unmap_io_transient when 4743 * finished or FALSE otherwise. 4744 * 4745 */ 4746 boolean_t 4747 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4748 boolean_t can_fault) 4749 { 4750 vm_paddr_t paddr; 4751 boolean_t needs_mapping; 4752 int error __diagused, i; 4753 4754 /* 4755 * Allocate any KVA space that we need, this is done in a separate 4756 * loop to prevent calling vmem_alloc while pinned. 4757 */ 4758 needs_mapping = FALSE; 4759 for (i = 0; i < count; i++) { 4760 paddr = VM_PAGE_TO_PHYS(page[i]); 4761 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4762 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4763 M_BESTFIT | M_WAITOK, &vaddr[i]); 4764 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4765 needs_mapping = TRUE; 4766 } else { 4767 vaddr[i] = PHYS_TO_DMAP(paddr); 4768 } 4769 } 4770 4771 /* Exit early if everything is covered by the DMAP */ 4772 if (!needs_mapping) 4773 return (FALSE); 4774 4775 if (!can_fault) 4776 sched_pin(); 4777 for (i = 0; i < count; i++) { 4778 paddr = VM_PAGE_TO_PHYS(page[i]); 4779 if (paddr >= DMAP_MAX_PHYSADDR) { 4780 panic( 4781 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4782 } 4783 } 4784 4785 return (needs_mapping); 4786 } 4787 4788 void 4789 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4790 boolean_t can_fault) 4791 { 4792 vm_paddr_t paddr; 4793 int i; 4794 4795 if (!can_fault) 4796 sched_unpin(); 4797 for (i = 0; i < count; i++) { 4798 paddr = VM_PAGE_TO_PHYS(page[i]); 4799 if (paddr >= DMAP_MAX_PHYSADDR) { 4800 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4801 } 4802 } 4803 } 4804 4805 boolean_t 4806 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4807 { 4808 4809 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4810 } 4811 4812 bool 4813 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4814 pt_entry_t **l3) 4815 { 4816 pd_entry_t *l1p, *l2p; 4817 4818 /* Get l1 directory entry. 
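 * A leaf entry (PTE_RX set) at this level ends the walk, leaving *l2
 * and *l3 NULL.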
*/ 4819 l1p = pmap_l1(pmap, va); 4820 *l1 = l1p; 4821 4822 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4823 return (false); 4824 4825 if ((pmap_load(l1p) & PTE_RX) != 0) { 4826 *l2 = NULL; 4827 *l3 = NULL; 4828 return (true); 4829 } 4830 4831 /* Get l2 directory entry. */ 4832 l2p = pmap_l1_to_l2(l1p, va); 4833 *l2 = l2p; 4834 4835 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4836 return (false); 4837 4838 if ((pmap_load(l2p) & PTE_RX) != 0) { 4839 *l3 = NULL; 4840 return (true); 4841 } 4842 4843 /* Get l3 page table entry. */ 4844 *l3 = pmap_l2_to_l3(l2p, va); 4845 4846 return (true); 4847 } 4848 4849 /* 4850 * Track a range of the kernel's virtual address space that is contiguous 4851 * in various mapping attributes. 4852 */ 4853 struct pmap_kernel_map_range { 4854 vm_offset_t sva; 4855 pt_entry_t attrs; 4856 int l3pages; 4857 int l2pages; 4858 int l1pages; 4859 }; 4860 4861 static void 4862 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4863 vm_offset_t eva) 4864 { 4865 4866 if (eva <= range->sva) 4867 return; 4868 4869 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4870 range->sva, eva, 4871 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4872 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4873 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4874 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4875 range->l1pages, range->l2pages, range->l3pages); 4876 4877 /* Reset to sentinel value. */ 4878 range->sva = 0xfffffffffffffffful; 4879 } 4880 4881 /* 4882 * Determine whether the attributes specified by a page table entry match those 4883 * being tracked by the current range. 4884 */ 4885 static bool 4886 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4887 { 4888 4889 return (range->attrs == attrs); 4890 } 4891 4892 static void 4893 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4894 pt_entry_t attrs) 4895 { 4896 4897 memset(range, 0, sizeof(*range)); 4898 range->sva = va; 4899 range->attrs = attrs; 4900 } 4901 4902 /* 4903 * Given a leaf PTE, derive the mapping's attributes. If they do not match 4904 * those of the current run, dump the address range and its attributes, and 4905 * begin a new run. 4906 */ 4907 static void 4908 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 4909 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 4910 { 4911 pt_entry_t attrs; 4912 4913 /* The PTE global bit is inherited by lower levels. */ 4914 attrs = l1e & PTE_G; 4915 if ((l1e & PTE_RWX) != 0) 4916 attrs |= l1e & (PTE_RWX | PTE_U); 4917 else if (l2e != 0) 4918 attrs |= l2e & PTE_G; 4919 if ((l2e & PTE_RWX) != 0) 4920 attrs |= l2e & (PTE_RWX | PTE_U); 4921 else if (l3e != 0) 4922 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 4923 4924 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 4925 sysctl_kmaps_dump(sb, range, va); 4926 sysctl_kmaps_reinit(range, va, attrs); 4927 } 4928 } 4929 4930 static int 4931 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 4932 { 4933 struct pmap_kernel_map_range range; 4934 struct sbuf sbuf, *sb; 4935 pd_entry_t l1e, *l2, l2e; 4936 pt_entry_t *l3, l3e; 4937 vm_offset_t sva; 4938 vm_paddr_t pa; 4939 int error, i, j, k; 4940 4941 error = sysctl_wire_old_buffer(req, 0); 4942 if (error != 0) 4943 return (error); 4944 sb = &sbuf; 4945 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 4946 4947 /* Sentinel value. */ 4948 range.sva = 0xfffffffffffffffful; 4949 4950 /* 4951 * Iterate over the kernel page tables without holding the kernel pmap 4952 * lock. 
Kernel page table pages are never freed, so at worst we will 4953 * observe inconsistencies in the output. 4954 */ 4955 sva = VM_MIN_KERNEL_ADDRESS; 4956 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 4957 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 4958 sbuf_printf(sb, "\nDirect map:\n"); 4959 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 4960 sbuf_printf(sb, "\nKernel map:\n"); 4961 4962 l1e = kernel_pmap->pm_top[i]; 4963 if ((l1e & PTE_V) == 0) { 4964 sysctl_kmaps_dump(sb, &range, sva); 4965 sva += L1_SIZE; 4966 continue; 4967 } 4968 if ((l1e & PTE_RWX) != 0) { 4969 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 4970 range.l1pages++; 4971 sva += L1_SIZE; 4972 continue; 4973 } 4974 pa = PTE_TO_PHYS(l1e); 4975 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4976 4977 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 4978 l2e = l2[j]; 4979 if ((l2e & PTE_V) == 0) { 4980 sysctl_kmaps_dump(sb, &range, sva); 4981 sva += L2_SIZE; 4982 continue; 4983 } 4984 if ((l2e & PTE_RWX) != 0) { 4985 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 4986 range.l2pages++; 4987 sva += L2_SIZE; 4988 continue; 4989 } 4990 pa = PTE_TO_PHYS(l2e); 4991 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4992 4993 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 4994 sva += L3_SIZE) { 4995 l3e = l3[k]; 4996 if ((l3e & PTE_V) == 0) { 4997 sysctl_kmaps_dump(sb, &range, sva); 4998 continue; 4999 } 5000 sysctl_kmaps_check(sb, &range, sva, 5001 l1e, l2e, l3e); 5002 range.l3pages++; 5003 } 5004 } 5005 } 5006 5007 error = sbuf_finish(sb); 5008 sbuf_delete(sb); 5009 return (error); 5010 } 5011 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 5012 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 5013 NULL, 0, sysctl_kmaps, "A", 5014 "Dump kernel address layout"); 5015