/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 *	All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 *	All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 *	All rights reserved.
 * Copyright (c) 2014 The FreeBSD Foundation
 *	All rights reserved.
 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Portions of this software were developed by Andrew Turner under
 * sponsorship from The FreeBSD Foundation.
 *
 * Portions of this software were developed by SRI International and the
 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
 *
 * Portions of this software were developed by the University of Cambridge
 * Computer Laboratory as part of the CTSRD Project, with support from the
 * UK Higher Education Innovation Fund (HEIF).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

/*
 * Boundary values for the page table page index space:
 *
 *	L3 pages: [0, NUL2E)
 *	L2 pages: [NUL2E, NUL2E + NUL1E)
 *	L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
 *
 * Note that these ranges are used in both SV39 and SV48 mode.  In SV39 mode
 * the ranges are not fully populated since there are at most Ln_ENTRIES^2
 * L3 pages in a set of page tables.
 */
#define	NUL0E		Ln_ENTRIES
#define	NUL1E		(Ln_ENTRIES * NUL0E)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
		PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_mode, 0,
    "translation mode, 0 = SV39, 1 = SV48");

struct pmap kernel_pmap_store;
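
/*
 * Note: kernel_pmap_store is assumed to provide the storage behind the
 * kernel_pmap pointer used throughout this file; kernel_pmap is expected to
 * resolve to &kernel_pmap_store, typically via a definition in the pmap
 * headers rather than in this file.
 */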

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

extern cpuset_t all_harts;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
    struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte) \
    ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
#define	L2PTE_TO_PHYS(l2) \
    ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{
	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));
	return (&pmap->pm_top[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l1;

	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
	phys = PTE_TO_PHYS(pmap_load(l0));
	l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));
	if (pmap_mode == PMAP_MODE_SV39) {
		return (&pmap->pm_top[pmap_l1_index(va)]);
	} else {
		l0 = pmap_l0(pmap, va);
		if ((pmap_load(l0) & PTE_V) == 0)
			return (NULL);
		if ((pmap_load(l0) & PTE_RX) != 0)
			return (NULL);
		return (pmap_l0_to_l1(l0, va));
	}
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL)
		return (NULL);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/*
	 * Distribute new kernel L1 entry to all the user pmaps.  This is only
	 * necessary with three-level paging configured: with four-level paging
	 * the kernel's half of the top-level page table page is static and can
	 * simply be copied at pmap initialization time.
	 */
	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_top[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
    u_int *l2_slot)
{
	pt_entry_t *l2;
	pd_entry_t *l1 __diagused;

	l1 = (pd_entry_t *)l1pt;
	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;

	/* Check that locore used an L1 table mapping */
	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
	    ("Invalid bootstrap L1 table"));

	/* Find the address of the L2 table */
	l2 = (pt_entry_t *)init_pt_va;
	*l2_slot = pmap_l2_index(va);

	return (l2);
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
	u_int l1_slot, l2_slot;
	pt_entry_t *l2;
	vm_paddr_t ret;

	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);

	/* Check that locore used L2 superpages */
	KASSERT((l2[l2_slot] & PTE_RX) != 0,
	    ("Invalid bootstrap L2 table"));

	/* The L2 entry is a superpage mapping. */
	ret = L2PTE_TO_PHYS(l2[l2_slot]);
	ret += (va & L2_OFFSET);

	return (ret);
}

static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
	vm_offset_t va;
	vm_paddr_t pa;
	pd_entry_t *l1;
	u_int l1_slot;
	pt_entry_t entry;
	pn_t pn;

	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
	va = DMAP_MIN_ADDRESS;
	l1 = (pd_entry_t *)kern_l1;
	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);

	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));

		/* superpages */
		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l1[l1_slot], entry);
	}

	/* Set the upper limit of the DMAP region */
	dmap_phys_max = pa;
	dmap_max_addr = va;

	sfence_vma();
}

static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
	vm_offset_t l3pt;
	pt_entry_t entry;
	pd_entry_t *l2;
	vm_paddr_t pa;
	u_int l2_slot;
	pn_t pn;

	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	l2 = pmap_l2(kernel_pmap, va);
	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
	l2_slot = pmap_l2_index(va);
	l3pt = l3_start;

	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));

		pa = pmap_early_vtophys(l1pt, l3pt);
		pn = (pa / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(&l2[l2_slot], entry);
		l3pt += PAGE_SIZE;
	}

	/* Clean the L2 page table */
	memset((void *)l3_start, 0, l3pt - l3_start);

	return (l3pt);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
{
	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
	uint64_t satp;
	vm_offset_t dpcpu, freemempos, l0pv, msgbufpv;
	vm_paddr_t l0pa, l1pa, max_pa, min_pa, pa;
	pd_entry_t *l0p;
	pt_entry_t *l2p;
	u_int l1_slot, l2_slot;
	u_int physmap_idx;
	int i, mode;

	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_top = (pd_entry_t *)l1pt;
	PMAP_LOCK_INIT(kernel_pmap);

	rw_init(&pvh_global_lock, "pmap pv global");

	/*
	 * Set the current CPU as active in the kernel pmap.  Secondary cores
	 * will add themselves later in init_secondary().  The SBI firmware
	 * may rely on this mask being precise, so CPU_FILL() is not used.
	 */
	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);

	/* Assume the address we were loaded to is a valid physical address. */
	min_pa = max_pa = kernstart;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address.  physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
		if (physmap[i + 1] > max_pa)
			max_pa = physmap[i + 1];
	}
	printf("physmap_idx %u\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);

	/*
	 * Read the page table to find out what is already mapped.
	 * This assumes we have mapped a block of memory from KERNBASE
	 * using a single L1 entry.
	 */
	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);

	/* Sanity check the index, KERNBASE should be the first VA */
	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));

	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);

	/* Create the l3 tables for the early devmap */
	freemempos = pmap_bootstrap_l3(l1pt,
	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);

	/*
	 * Invalidate the mapping we created for the DTB.  At this point a copy
	 * has been created, and we no longer need it.  We want to avoid the
	 * possibility of an aliased mapping in the future.
	 */
	l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS);
	if ((pmap_load(l2p) & PTE_V) != 0)
		pmap_clear(l2p);

	sfence_vma();

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	mode = 0;
	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
	if (mode == PMAP_MODE_SV48) {
		/*
		 * Enable SV48 mode: allocate an L0 page and set SV48 mode in
		 * SATP.  If the implementation does not provide SV48 mode,
		 * the mode read back from the (WARL) SATP register will be
		 * unchanged, and we continue in SV39 mode.
		 */
		alloc_pages(l0pv, 1);
		l0p = (void *)l0pv;
		l1pa = pmap_early_vtophys(l1pt, l1pt);
		l0p[pmap_l0_index(KERNBASE)] = PTE_V | PTE_A | PTE_D |
		    ((l1pa >> PAGE_SHIFT) << PTE_PPN0_S);

		l0pa = pmap_early_vtophys(l1pt, l0pv);
		csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48);
		satp = csr_read(satp);
		if ((satp & SATP_MODE_M) == SATP_MODE_SV48) {
			pmap_mode = PMAP_MODE_SV48;
			kernel_pmap_store.pm_top = l0p;
		} else {
			/* Mode didn't change, give the page back. */
			freemempos -= PAGE_SIZE;
		}
	}

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	virtual_avail = roundup2(freemempos, L2_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(l1pt, freemempos);

	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the pv chunk and pmap list mutexes.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	if (superpages_enabled)
		pagesizes[1] = L2_SIZE;
}

#ifdef SMP
/*
 * For SMP, these functions have to use IPIs for coherence.
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence.  BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p, l3;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Start with the l2 table.  We are unable to allocate
	 * pages in the l1 table.
	 */
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL) {
		l2 = pmap_load(l2p);
		if ((l2 & PTE_RX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				l3 = pmap_load(l3p);
				pa = PTE_TO_PHYS(l3);
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* The L2 entry is a superpage mapping. */
			pa = L2PTE_TO_PHYS(l2);
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			m = PHYS_TO_VM_PAGE(phys);
			if (!vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2, l2e;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		l2e = pmap_load(l2);
		/*
		 * Beware of concurrent promotion and demotion!  We must
		 * use l2e rather than loading from l2 multiple times to
		 * ensure we see a consistent state, including the
		 * implicit load in pmap_l2_to_l3.  It is, however, safe
		 * to use an old l2e because the L3 page is preserved by
		 * promotion.
		 */
		if ((l2e & PTE_RX) != 0) {
			/* superpages */
			pa = L2PTE_TO_PHYS(l2e);
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(&l2e, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *l3;

	l3 = pmap_l3(kernel_pmap, va);
	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));

	pmap_clear(l3);
	sfence_vma();
}

void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
		pmap_clear(l3);

		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}

	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{

	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *l3, pa;
	vm_offset_t va;
	vm_page_t m;
	pt_entry_t entry;
	pn_t pn;
	int i;

	va = sva;
	for (i = 0; i < count; i++) {
		m = ma[i];
		pa = VM_PAGE_TO_PHYS(m);
		pn = (pa / PAGE_SIZE);
		l3 = pmap_l3(kernel_pmap, va);

		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += L3_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *l3;
	vm_offset_t va;

	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));

	for (va = sva; count-- > 0; va += PAGE_SIZE) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
		pmap_clear(l3);
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (superpages_enabled);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 *
 * If "promoted" is false, then the page table page "ml3" must be zero filled.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0;
	return (vm_radix_insert(&pmap->pm_root, ml3));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	KASSERT(m->ref_count > 0,
	    ("%s: page %p ref count underflow", __func__, m));

	--m->ref_count;
	if (m->ref_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else {
		return (FALSE);
	}
}

static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	vm_paddr_t phys;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (m->pindex >= NUL2E + NUL1E) {
		pd_entry_t *l0;
		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		pd_entry_t *l1;
		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
	} else {
		pd_entry_t *l2;
		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL2E) {
		pd_entry_t *l1;
		vm_page_t pdpg;

		l1 = pmap_l1(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l1));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	} else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
		pd_entry_t *l0;
		vm_page_t pdpg;

		MPASS(pmap_mode != PMAP_MODE_SV39);
		l0 = pmap_l0(pmap, va);
		phys = PTE_TO_PHYS(pmap_load(l0));
		pdpg = PHYS_TO_VM_PAGE(phys);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	}
	pmap_invalidate_page(pmap, va);

	vm_wire_sub(1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the reference count.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

static uint64_t
pmap_satp_mode(void)
{
	return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
}

void
pmap_pinit0(pmap_t pmap)
{
	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_top = kernel_pmap->pm_top;
	pmap->pm_satp = pmap_satp_mode() |
	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
	CPU_ZERO(&pmap->pm_active);
	pmap_activate_boot(pmap);
}

int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t topphys;
	vm_page_t mtop;
	size_t i;

	mtop = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
	    VM_ALLOC_WAITOK);

	topphys = VM_PAGE_TO_PHYS(mtop);
	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);

	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	CPU_ZERO(&pmap->pm_active);

	if (pmap_mode == PMAP_MODE_SV39) {
		/*
		 * Copy L1 entries from the kernel pmap.  This must be done with
		 * the allpmaps lock held to avoid races with
		 * pmap_distribute_l1().
		 */
		mtx_lock(&allpmaps_lock);
		LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
		for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
		    i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
			pmap->pm_top[i] = kernel_pmap->pm_top[i];
		for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
		    i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
			pmap->pm_top[i] = kernel_pmap->pm_top[i];
		mtx_unlock(&allpmaps_lock);
	} else {
		i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
		pmap->pm_top[i] = kernel_pmap->pm_top[i];
	}

	vm_radix_init(&pmap->pm_root);

	return (1);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, pdpg;
	pt_entry_t entry;
	vm_paddr_t phys;
	pn_t pn;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	if (m == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			rw_runlock(&pvh_global_lock);
			vm_wait(NULL);
			rw_rlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	m->pindex = ptepindex;

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */
	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
	if (ptepindex >= NUL2E + NUL1E) {
		pd_entry_t *l0;
		vm_pindex_t l0index;

		KASSERT(pmap_mode != PMAP_MODE_SV39,
		    ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
		KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
		    ("%s: pindex %#lx out of range", __func__, ptepindex));

		l0index = ptepindex - (NUL2E + NUL1E);
		l0 = &pmap->pm_top[l0index];
		KASSERT((pmap_load(l0) & PTE_V) == 0,
		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));

		entry = PTE_V | (pn << PTE_PPN0_S);
		pmap_store(l0, entry);
	} else if (ptepindex >= NUL2E) {
		pd_entry_t *l0, *l1;
		vm_pindex_t l0index, l1index;

		l1index = ptepindex - NUL2E;
		if (pmap_mode == PMAP_MODE_SV39) {
			l1 = &pmap->pm_top[l1index];
		} else {
			l0index = l1index >> Ln_ENTRIES_SHIFT;
			l0 = &pmap->pm_top[l0index];
			if (pmap_load(l0) == 0) {
				/* Recurse to allocate the L1 page. */
				if (_pmap_alloc_l3(pmap,
				    NUL2E + NUL1E + l0index, lockp) == NULL)
					goto fail;
				phys = PTE_TO_PHYS(pmap_load(l0));
			} else {
				phys = PTE_TO_PHYS(pmap_load(l0));
				pdpg = PHYS_TO_VM_PAGE(phys);
				pdpg->ref_count++;
			}
			l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
			l1 = &l1[ptepindex & Ln_ADDR_MASK];
		}
		KASSERT((pmap_load(l1) & PTE_V) == 0,
		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));

		entry = PTE_V | (pn << PTE_PPN0_S);
		pmap_store(l1, entry);
		pmap_distribute_l1(pmap, l1index, entry);
	} else {
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1, *l2;

		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
		if (pmap_mode == PMAP_MODE_SV39) {
			l1 = &pmap->pm_top[l1index];
			if (pmap_load(l1) == 0) {
				/* recurse for allocating page dir */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL)
					goto fail;
			} else {
				phys = PTE_TO_PHYS(pmap_load(l1));
				pdpg = PHYS_TO_VM_PAGE(phys);
				pdpg->ref_count++;
			}
		} else {
			l0index = l1index >> Ln_ENTRIES_SHIFT;
			l0 = &pmap->pm_top[l0index];
			if (pmap_load(l0) == 0) {
				/* Recurse to allocate the L1 entry. */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL)
					goto fail;
				phys = PTE_TO_PHYS(pmap_load(l0));
				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
				l1 = &l1[l1index & Ln_ADDR_MASK];
			} else {
				phys = PTE_TO_PHYS(pmap_load(l0));
				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
				l1 = &l1[l1index & Ln_ADDR_MASK];
				if (pmap_load(l1) == 0) {
					/* Recurse to allocate the L2 page. */
					if (_pmap_alloc_l3(pmap,
					    NUL2E + l1index, lockp) == NULL)
						goto fail;
				} else {
					phys = PTE_TO_PHYS(pmap_load(l1));
					pdpg = PHYS_TO_VM_PAGE(phys);
					pdpg->ref_count++;
				}
			}
		}

		phys = PTE_TO_PHYS(pmap_load(l1));
		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l2) & PTE_V) == 0,
		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));

		entry = PTE_V | (pn << PTE_PPN0_S);
		pmap_store(l2, entry);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);

fail:
	vm_page_unwire_noq(m);
	vm_page_free_zero(m);
	return (NULL);
}

static vm_page_t
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	pd_entry_t *l1;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
		    pmap_load(l1), va));
		/* Add a reference to the L2 page. */
		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
		l2pg->ref_count++;
	} else {
		/* Allocate an L2 page. */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL && lockp != NULL)
			goto retry;
	}
	return (l2pg);
}

static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *l2;
	vm_paddr_t phys;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	l2 = pmap_l2(pmap, va);

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (l2 != NULL && pmap_load(l2) != 0) {
		phys = PTE_TO_PHYS(pmap_load(l2));
		m = PHYS_TO_VM_PAGE(phys);
		m->ref_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	if (pmap_mode == PMAP_MODE_SV39) {
		mtx_lock(&allpmaps_lock);
		LIST_REMOVE(pmap, pm_list);
		mtx_unlock(&allpmaps_lock);
	}

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l1, *l2;
	pt_entry_t entry;
	pn_t pn;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
			paddr = VM_PAGE_TO_PHYS(nkpg);

			pn = (paddr / PAGE_SIZE);
			entry = (PTE_V);
			entry |= (pn << PTE_PPN0_S);
			pmap_store(l1, entry);
			pmap_distribute_l1(kernel_pmap,
			    pmap_l1_index(kernel_vm_end), entry);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & PTE_V) != 0 &&
		    (pmap_load(l2) & PTE_RWX) == 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
		paddr = VM_PAGE_TO_PHYS(nkpg);

		pn = (paddr / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);

		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}

/***************************************************
 * page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	((1ul << (_NPCPV % 64)) - 1)

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{

	panic("RISCVTODO: reclaim_pv_chunk");
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
			    pc->pc_map[2] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	pc->pc_map[2] = PC_FREE2;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
1925 */ 1926 TAILQ_INIT(&new_tail); 1927 retry: 1928 avail = 0; 1929 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1930 bit_count((bitstr_t *)pc->pc_map, 0, 1931 sizeof(pc->pc_map) * NBBY, &free); 1932 if (free == 0) 1933 break; 1934 avail += free; 1935 if (avail >= needed) 1936 break; 1937 } 1938 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1939 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1940 if (m == NULL) { 1941 m = reclaim_pv_chunk(pmap, lockp); 1942 if (m == NULL) 1943 goto retry; 1944 reclaimed = true; 1945 } 1946 /* XXX PV STATS */ 1947 #if 0 1948 dump_add_page(m->phys_addr); 1949 #endif 1950 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1951 pc->pc_pmap = pmap; 1952 pc->pc_map[0] = PC_FREE0; 1953 pc->pc_map[1] = PC_FREE1; 1954 pc->pc_map[2] = PC_FREE2; 1955 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1956 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1957 1958 /* 1959 * The reclaim might have freed a chunk from the current pmap. 1960 * If that chunk contained available entries, we need to 1961 * re-count the number of available entries. 1962 */ 1963 if (reclaimed) 1964 goto retry; 1965 } 1966 if (!TAILQ_EMPTY(&new_tail)) { 1967 mtx_lock(&pv_chunks_mutex); 1968 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1969 mtx_unlock(&pv_chunks_mutex); 1970 } 1971 } 1972 1973 /* 1974 * First find and then remove the pv entry for the specified pmap and virtual 1975 * address from the specified pv list. Returns the pv entry if found and NULL 1976 * otherwise. This operation can be performed on pv lists for either 4KB or 1977 * 2MB page mappings. 1978 */ 1979 static __inline pv_entry_t 1980 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1981 { 1982 pv_entry_t pv; 1983 1984 rw_assert(&pvh_global_lock, RA_LOCKED); 1985 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1986 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1987 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1988 pvh->pv_gen++; 1989 break; 1990 } 1991 } 1992 return (pv); 1993 } 1994 1995 /* 1996 * First find and then destroy the pv entry for the specified pmap and virtual 1997 * address. This operation can be performed on pv lists for either 4KB or 2MB 1998 * page mappings. 1999 */ 2000 static void 2001 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2002 { 2003 pv_entry_t pv; 2004 2005 pv = pmap_pvh_remove(pvh, pmap, va); 2006 2007 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 2008 free_pv_entry(pmap, pv); 2009 } 2010 2011 /* 2012 * Conditionally create the PV entry for a 4KB page mapping if the required 2013 * memory can be allocated without resorting to reclamation. 2014 */ 2015 static boolean_t 2016 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2017 struct rwlock **lockp) 2018 { 2019 pv_entry_t pv; 2020 2021 rw_assert(&pvh_global_lock, RA_LOCKED); 2022 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2023 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2024 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2025 pv->pv_va = va; 2026 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2027 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2028 m->md.pv_gen++; 2029 return (TRUE); 2030 } else 2031 return (FALSE); 2032 } 2033 2034 /* 2035 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2036 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2037 * entries for each of the 4KB page mappings. 
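 *
 * The 2MB mapping's own pv entry is reused for the first 4KB page, so
 * only Ln_ENTRIES - 1 additional entries are consumed; these must
 * already be available as spares (see reserve_pv_entries()).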
2038 */ 2039 static void __unused 2040 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2041 struct rwlock **lockp) 2042 { 2043 struct md_page *pvh; 2044 struct pv_chunk *pc; 2045 pv_entry_t pv; 2046 vm_page_t m; 2047 vm_offset_t va_last; 2048 int bit, field; 2049 2050 rw_assert(&pvh_global_lock, RA_LOCKED); 2051 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2052 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2053 2054 /* 2055 * Transfer the 2mpage's pv entry for this mapping to the first 2056 * page's pv list. Once this transfer begins, the pv list lock 2057 * must not be released until the last pv entry is reinstantiated. 2058 */ 2059 pvh = pa_to_pvh(pa); 2060 va &= ~L2_OFFSET; 2061 pv = pmap_pvh_remove(pvh, pmap, va); 2062 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2063 m = PHYS_TO_VM_PAGE(pa); 2064 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2065 m->md.pv_gen++; 2066 /* Instantiate the remaining 511 pv entries. */ 2067 va_last = va + L2_SIZE - PAGE_SIZE; 2068 for (;;) { 2069 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2070 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 2071 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 2072 for (field = 0; field < _NPCM; field++) { 2073 while (pc->pc_map[field] != 0) { 2074 bit = ffsl(pc->pc_map[field]) - 1; 2075 pc->pc_map[field] &= ~(1ul << bit); 2076 pv = &pc->pc_pventry[field * 64 + bit]; 2077 va += PAGE_SIZE; 2078 pv->pv_va = va; 2079 m++; 2080 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2081 ("pmap_pv_demote_l2: page %p is not managed", m)); 2082 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2083 m->md.pv_gen++; 2084 if (va == va_last) 2085 goto out; 2086 } 2087 } 2088 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2089 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2090 } 2091 out: 2092 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 2093 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2094 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2095 } 2096 /* XXX PV stats */ 2097 } 2098 2099 #if VM_NRESERVLEVEL > 0 2100 static void 2101 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2102 struct rwlock **lockp) 2103 { 2104 struct md_page *pvh; 2105 pv_entry_t pv; 2106 vm_page_t m; 2107 vm_offset_t va_last; 2108 2109 rw_assert(&pvh_global_lock, RA_LOCKED); 2110 KASSERT((va & L2_OFFSET) == 0, 2111 ("pmap_pv_promote_l2: misaligned va %#lx", va)); 2112 2113 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2114 2115 m = PHYS_TO_VM_PAGE(pa); 2116 pv = pmap_pvh_remove(&m->md, pmap, va); 2117 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 2118 pvh = pa_to_pvh(pa); 2119 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2120 pvh->pv_gen++; 2121 2122 va_last = va + L2_SIZE - PAGE_SIZE; 2123 do { 2124 m++; 2125 va += PAGE_SIZE; 2126 pmap_pvh_free(&m->md, pmap, va); 2127 } while (va < va_last); 2128 } 2129 #endif /* VM_NRESERVLEVEL > 0 */ 2130 2131 /* 2132 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2133 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2134 * false if the PV entry cannot be allocated without resorting to reclamation. 2135 */ 2136 static bool 2137 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2138 struct rwlock **lockp) 2139 { 2140 struct md_page *pvh; 2141 pv_entry_t pv; 2142 vm_paddr_t pa; 2143 2144 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2145 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2146 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
2147 NULL : lockp)) == NULL) 2148 return (false); 2149 pv->pv_va = va; 2150 pa = PTE_TO_PHYS(l2e); 2151 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2152 pvh = pa_to_pvh(pa); 2153 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2154 pvh->pv_gen++; 2155 return (true); 2156 } 2157 2158 static void 2159 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2160 { 2161 pt_entry_t newl2, oldl2 __diagused; 2162 vm_page_t ml3; 2163 vm_paddr_t ml3pa; 2164 2165 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2166 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2167 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2168 2169 ml3 = pmap_remove_pt_page(pmap, va); 2170 if (ml3 == NULL) 2171 panic("pmap_remove_kernel_l2: Missing pt page"); 2172 2173 ml3pa = VM_PAGE_TO_PHYS(ml3); 2174 newl2 = ml3pa | PTE_V; 2175 2176 /* 2177 * If this page table page was unmapped by a promotion, then it 2178 * contains valid mappings. Zero it to invalidate those mappings. 2179 */ 2180 if (ml3->valid != 0) 2181 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2182 2183 /* 2184 * Demote the mapping. 2185 */ 2186 oldl2 = pmap_load_store(l2, newl2); 2187 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2188 __func__, l2, oldl2)); 2189 } 2190 2191 /* 2192 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2193 */ 2194 static int 2195 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2196 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2197 { 2198 struct md_page *pvh; 2199 pt_entry_t oldl2; 2200 vm_offset_t eva, va; 2201 vm_page_t m, ml3; 2202 2203 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2204 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2205 oldl2 = pmap_load_clear(l2); 2206 KASSERT((oldl2 & PTE_RWX) != 0, 2207 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2208 2209 /* 2210 * The sfence.vma documentation states that it is sufficient to specify 2211 * a single address within a superpage mapping. However, since we do 2212 * not perform any invalidation upon promotion, TLBs may still be 2213 * caching 4KB mappings within the superpage, so we must invalidate the 2214 * entire range. 
2215 */ 2216 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2217 if ((oldl2 & PTE_SW_WIRED) != 0) 2218 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2219 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2220 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2221 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2222 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2223 pmap_pvh_free(pvh, pmap, sva); 2224 eva = sva + L2_SIZE; 2225 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2226 va < eva; va += PAGE_SIZE, m++) { 2227 if ((oldl2 & PTE_D) != 0) 2228 vm_page_dirty(m); 2229 if ((oldl2 & PTE_A) != 0) 2230 vm_page_aflag_set(m, PGA_REFERENCED); 2231 if (TAILQ_EMPTY(&m->md.pv_list) && 2232 TAILQ_EMPTY(&pvh->pv_list)) 2233 vm_page_aflag_clear(m, PGA_WRITEABLE); 2234 } 2235 } 2236 if (pmap == kernel_pmap) { 2237 pmap_remove_kernel_l2(pmap, l2, sva); 2238 } else { 2239 ml3 = pmap_remove_pt_page(pmap, sva); 2240 if (ml3 != NULL) { 2241 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2242 ("pmap_remove_l2: l3 page not promoted")); 2243 pmap_resident_count_dec(pmap, 1); 2244 KASSERT(ml3->ref_count == Ln_ENTRIES, 2245 ("pmap_remove_l2: l3 page ref count error")); 2246 ml3->ref_count = 1; 2247 vm_page_unwire_noq(ml3); 2248 pmap_add_delayed_free_list(ml3, free, FALSE); 2249 } 2250 } 2251 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2252 } 2253 2254 /* 2255 * pmap_remove_l3: do the things to unmap a page in a process 2256 */ 2257 static int 2258 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2259 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2260 { 2261 struct md_page *pvh; 2262 pt_entry_t old_l3; 2263 vm_paddr_t phys; 2264 vm_page_t m; 2265 2266 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2267 old_l3 = pmap_load_clear(l3); 2268 pmap_invalidate_page(pmap, va); 2269 if (old_l3 & PTE_SW_WIRED) 2270 pmap->pm_stats.wired_count -= 1; 2271 pmap_resident_count_dec(pmap, 1); 2272 if (old_l3 & PTE_SW_MANAGED) { 2273 phys = PTE_TO_PHYS(old_l3); 2274 m = PHYS_TO_VM_PAGE(phys); 2275 if ((old_l3 & PTE_D) != 0) 2276 vm_page_dirty(m); 2277 if (old_l3 & PTE_A) 2278 vm_page_aflag_set(m, PGA_REFERENCED); 2279 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2280 pmap_pvh_free(&m->md, pmap, va); 2281 if (TAILQ_EMPTY(&m->md.pv_list) && 2282 (m->flags & PG_FICTITIOUS) == 0) { 2283 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2284 if (TAILQ_EMPTY(&pvh->pv_list)) 2285 vm_page_aflag_clear(m, PGA_WRITEABLE); 2286 } 2287 } 2288 2289 return (pmap_unuse_pt(pmap, va, l2e, free)); 2290 } 2291 2292 /* 2293 * Remove the given range of addresses from the specified map. 2294 * 2295 * It is assumed that the start and end are properly 2296 * rounded to the page size. 2297 */ 2298 void 2299 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2300 { 2301 struct spglist free; 2302 struct rwlock *lock; 2303 vm_offset_t va, va_next; 2304 pd_entry_t *l0, *l1, *l2, l2e; 2305 pt_entry_t *l3; 2306 2307 /* 2308 * Perform an unsynchronized read. This is, however, safe. 
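 * The resident count is used here only as an optimization: if the pmap
 * contains no mappings at all, we can return before taking any locks.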
2309 */ 2310 if (pmap->pm_stats.resident_count == 0) 2311 return; 2312 2313 SLIST_INIT(&free); 2314 2315 rw_rlock(&pvh_global_lock); 2316 PMAP_LOCK(pmap); 2317 2318 lock = NULL; 2319 for (; sva < eva; sva = va_next) { 2320 if (pmap->pm_stats.resident_count == 0) 2321 break; 2322 2323 if (pmap_mode == PMAP_MODE_SV48) { 2324 l0 = pmap_l0(pmap, sva); 2325 if (pmap_load(l0) == 0) { 2326 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2327 if (va_next < sva) 2328 va_next = eva; 2329 continue; 2330 } 2331 l1 = pmap_l0_to_l1(l0, sva); 2332 } else { 2333 l1 = pmap_l1(pmap, sva); 2334 } 2335 2336 if (pmap_load(l1) == 0) { 2337 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2338 if (va_next < sva) 2339 va_next = eva; 2340 continue; 2341 } 2342 2343 /* 2344 * Calculate index for next page table. 2345 */ 2346 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2347 if (va_next < sva) 2348 va_next = eva; 2349 2350 l2 = pmap_l1_to_l2(l1, sva); 2351 if (l2 == NULL) 2352 continue; 2353 if ((l2e = pmap_load(l2)) == 0) 2354 continue; 2355 if ((l2e & PTE_RWX) != 0) { 2356 if (sva + L2_SIZE == va_next && eva >= va_next) { 2357 (void)pmap_remove_l2(pmap, l2, sva, 2358 pmap_load(l1), &free, &lock); 2359 continue; 2360 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2361 &lock)) { 2362 /* 2363 * The large page mapping was destroyed. 2364 */ 2365 continue; 2366 } 2367 l2e = pmap_load(l2); 2368 } 2369 2370 /* 2371 * Limit our scan to either the end of the va represented 2372 * by the current page table page, or to the end of the 2373 * range being removed. 2374 */ 2375 if (va_next > eva) 2376 va_next = eva; 2377 2378 va = va_next; 2379 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2380 sva += L3_SIZE) { 2381 if (pmap_load(l3) == 0) { 2382 if (va != va_next) { 2383 pmap_invalidate_range(pmap, va, sva); 2384 va = va_next; 2385 } 2386 continue; 2387 } 2388 if (va == va_next) 2389 va = sva; 2390 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2391 sva += L3_SIZE; 2392 break; 2393 } 2394 } 2395 if (va != va_next) 2396 pmap_invalidate_range(pmap, va, sva); 2397 } 2398 if (lock != NULL) 2399 rw_wunlock(lock); 2400 rw_runlock(&pvh_global_lock); 2401 PMAP_UNLOCK(pmap); 2402 vm_page_free_pages_toq(&free, false); 2403 } 2404 2405 /* 2406 * Routine: pmap_remove_all 2407 * Function: 2408 * Removes this physical page from 2409 * all physical maps in which it resides. 2410 * Reflects back modify bits to the pager. 2411 * 2412 * Notes: 2413 * Original versions of this routine were very 2414 * inefficient because they iteratively called 2415 * pmap_remove (slow...) 2416 */ 2417 2418 void 2419 pmap_remove_all(vm_page_t m) 2420 { 2421 struct spglist free; 2422 struct md_page *pvh; 2423 pmap_t pmap; 2424 pt_entry_t *l3, l3e; 2425 pd_entry_t *l2, l2e __diagused; 2426 pv_entry_t pv; 2427 vm_offset_t va; 2428 2429 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2430 ("pmap_remove_all: page %p is not managed", m)); 2431 SLIST_INIT(&free); 2432 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 2433 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2434 2435 rw_wlock(&pvh_global_lock); 2436 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2437 pmap = PV_PMAP(pv); 2438 PMAP_LOCK(pmap); 2439 va = pv->pv_va; 2440 l2 = pmap_l2(pmap, va); 2441 (void)pmap_demote_l2(pmap, l2, va); 2442 PMAP_UNLOCK(pmap); 2443 } 2444 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2445 pmap = PV_PMAP(pv); 2446 PMAP_LOCK(pmap); 2447 pmap_resident_count_dec(pmap, 1); 2448 l2 = pmap_l2(pmap, pv->pv_va); 2449 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2450 l2e = pmap_load(l2); 2451 2452 KASSERT((l2e & PTE_RX) == 0, 2453 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2454 2455 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2456 l3e = pmap_load_clear(l3); 2457 pmap_invalidate_page(pmap, pv->pv_va); 2458 if (l3e & PTE_SW_WIRED) 2459 pmap->pm_stats.wired_count--; 2460 if ((l3e & PTE_A) != 0) 2461 vm_page_aflag_set(m, PGA_REFERENCED); 2462 2463 /* 2464 * Update the vm_page_t clean and reference bits. 2465 */ 2466 if ((l3e & PTE_D) != 0) 2467 vm_page_dirty(m); 2468 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2469 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2470 m->md.pv_gen++; 2471 free_pv_entry(pmap, pv); 2472 PMAP_UNLOCK(pmap); 2473 } 2474 vm_page_aflag_clear(m, PGA_WRITEABLE); 2475 rw_wunlock(&pvh_global_lock); 2476 vm_page_free_pages_toq(&free, false); 2477 } 2478 2479 /* 2480 * Set the physical protection on the 2481 * specified range of this map as requested. 2482 */ 2483 void 2484 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2485 { 2486 pd_entry_t *l0, *l1, *l2, l2e; 2487 pt_entry_t *l3, l3e, mask; 2488 vm_page_t m, mt; 2489 vm_paddr_t pa; 2490 vm_offset_t va_next; 2491 bool anychanged, pv_lists_locked; 2492 2493 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2494 pmap_remove(pmap, sva, eva); 2495 return; 2496 } 2497 2498 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2499 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2500 return; 2501 2502 anychanged = false; 2503 pv_lists_locked = false; 2504 mask = 0; 2505 if ((prot & VM_PROT_WRITE) == 0) 2506 mask |= PTE_W | PTE_D; 2507 if ((prot & VM_PROT_EXECUTE) == 0) 2508 mask |= PTE_X; 2509 resume: 2510 PMAP_LOCK(pmap); 2511 for (; sva < eva; sva = va_next) { 2512 if (pmap_mode == PMAP_MODE_SV48) { 2513 l0 = pmap_l0(pmap, sva); 2514 if (pmap_load(l0) == 0) { 2515 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2516 if (va_next < sva) 2517 va_next = eva; 2518 continue; 2519 } 2520 l1 = pmap_l0_to_l1(l0, sva); 2521 } else { 2522 l1 = pmap_l1(pmap, sva); 2523 } 2524 2525 if (pmap_load(l1) == 0) { 2526 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2527 if (va_next < sva) 2528 va_next = eva; 2529 continue; 2530 } 2531 2532 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2533 if (va_next < sva) 2534 va_next = eva; 2535 2536 l2 = pmap_l1_to_l2(l1, sva); 2537 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2538 continue; 2539 if ((l2e & PTE_RWX) != 0) { 2540 if (sva + L2_SIZE == va_next && eva >= va_next) { 2541 retryl2: 2542 if ((prot & VM_PROT_WRITE) == 0 && 2543 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2544 (PTE_SW_MANAGED | PTE_D)) { 2545 pa = PTE_TO_PHYS(l2e); 2546 m = PHYS_TO_VM_PAGE(pa); 2547 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2548 vm_page_dirty(mt); 2549 } 2550 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2551 goto retryl2; 2552 anychanged = true; 2553 continue; 2554 } else { 2555 if (!pv_lists_locked) { 2556 pv_lists_locked = true; 2557 if (!rw_try_rlock(&pvh_global_lock)) { 2558 if (anychanged) 2559 pmap_invalidate_all( 2560 pmap); 
2561 PMAP_UNLOCK(pmap); 2562 rw_rlock(&pvh_global_lock); 2563 goto resume; 2564 } 2565 } 2566 if (!pmap_demote_l2(pmap, l2, sva)) { 2567 /* 2568 * The large page mapping was destroyed. 2569 */ 2570 continue; 2571 } 2572 } 2573 } 2574 2575 if (va_next > eva) 2576 va_next = eva; 2577 2578 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2579 sva += L3_SIZE) { 2580 l3e = pmap_load(l3); 2581 retryl3: 2582 if ((l3e & PTE_V) == 0) 2583 continue; 2584 if ((prot & VM_PROT_WRITE) == 0 && 2585 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2586 (PTE_SW_MANAGED | PTE_D)) { 2587 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2588 vm_page_dirty(m); 2589 } 2590 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2591 goto retryl3; 2592 anychanged = true; 2593 } 2594 } 2595 if (anychanged) 2596 pmap_invalidate_all(pmap); 2597 if (pv_lists_locked) 2598 rw_runlock(&pvh_global_lock); 2599 PMAP_UNLOCK(pmap); 2600 } 2601 2602 int 2603 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2604 { 2605 pd_entry_t *l2, l2e; 2606 pt_entry_t bits, *pte, oldpte; 2607 int rv; 2608 2609 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va)); 2610 2611 rv = 0; 2612 PMAP_LOCK(pmap); 2613 l2 = pmap_l2(pmap, va); 2614 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2615 goto done; 2616 if ((l2e & PTE_RWX) == 0) { 2617 pte = pmap_l2_to_l3(l2, va); 2618 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2619 goto done; 2620 } else { 2621 pte = l2; 2622 oldpte = l2e; 2623 } 2624 2625 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2626 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2627 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2628 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2629 goto done; 2630 2631 bits = PTE_A; 2632 if (ftype == VM_PROT_WRITE) 2633 bits |= PTE_D; 2634 2635 /* 2636 * Spurious faults can occur if the implementation caches invalid 2637 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2638 * race with each other. 2639 */ 2640 if ((oldpte & bits) != bits) 2641 pmap_store_bits(pte, bits); 2642 sfence_vma(); 2643 rv = 1; 2644 done: 2645 PMAP_UNLOCK(pmap); 2646 return (rv); 2647 } 2648 2649 static bool 2650 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2651 { 2652 struct rwlock *lock; 2653 bool rv; 2654 2655 lock = NULL; 2656 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2657 if (lock != NULL) 2658 rw_wunlock(lock); 2659 return (rv); 2660 } 2661 2662 /* 2663 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2664 * mapping is invalidated. 2665 */ 2666 static bool 2667 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2668 struct rwlock **lockp) 2669 { 2670 struct spglist free; 2671 vm_page_t mpte; 2672 pd_entry_t newl2, oldl2; 2673 pt_entry_t *firstl3, newl3; 2674 vm_paddr_t mptepa; 2675 int i; 2676 2677 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2678 2679 oldl2 = pmap_load(l2); 2680 KASSERT((oldl2 & PTE_RWX) != 0, 2681 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2682 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2683 NULL) { 2684 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2685 (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 0) | 2686 VM_ALLOC_WIRED)) == NULL) { 2687 SLIST_INIT(&free); 2688 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2689 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2690 vm_page_free_pages_toq(&free, true); 2691 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2692 "failure for va %#lx in pmap %p", va, pmap); 2693 return (false); 2694 } 2695 mpte->pindex = pmap_l2_pindex(va); 2696 if (va < VM_MAXUSER_ADDRESS) { 2697 mpte->ref_count = Ln_ENTRIES; 2698 pmap_resident_count_inc(pmap, 1); 2699 } 2700 } 2701 mptepa = VM_PAGE_TO_PHYS(mpte); 2702 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2703 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2704 KASSERT((oldl2 & PTE_A) != 0, 2705 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2706 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2707 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2708 newl3 = oldl2; 2709 2710 /* 2711 * If the page table page is not leftover from an earlier promotion, 2712 * initialize it. 2713 */ 2714 if (mpte->valid == 0) { 2715 for (i = 0; i < Ln_ENTRIES; i++) 2716 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2717 } 2718 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2719 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2720 "addresses")); 2721 2722 /* 2723 * If the mapping has changed attributes, update the page table 2724 * entries. 2725 */ 2726 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2727 for (i = 0; i < Ln_ENTRIES; i++) 2728 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2729 2730 /* 2731 * The spare PV entries must be reserved prior to demoting the 2732 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2733 * state of the L2 entry and the PV lists will be inconsistent, which 2734 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2735 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2736 * expected PV entry for the 2MB page mapping that is being demoted. 2737 */ 2738 if ((oldl2 & PTE_SW_MANAGED) != 0) 2739 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2740 2741 /* 2742 * Demote the mapping. 2743 */ 2744 pmap_store(l2, newl2); 2745 2746 /* 2747 * Demote the PV entry. 2748 */ 2749 if ((oldl2 & PTE_SW_MANAGED) != 0) 2750 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2751 2752 atomic_add_long(&pmap_l2_demotions, 1); 2753 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2754 va, pmap); 2755 return (true); 2756 } 2757 2758 #if VM_NRESERVLEVEL > 0 2759 static void 2760 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2761 struct rwlock **lockp) 2762 { 2763 pt_entry_t *firstl3, firstl3e, *l3, l3e; 2764 vm_paddr_t pa; 2765 vm_page_t ml3; 2766 2767 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2768 2769 va &= ~L2_OFFSET; 2770 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2771 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2772 2773 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2774 firstl3e = pmap_load(firstl3); 2775 pa = PTE_TO_PHYS(firstl3e); 2776 if ((pa & L2_OFFSET) != 0) { 2777 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2778 va, pmap); 2779 atomic_add_long(&pmap_l2_p_failures, 1); 2780 return; 2781 } 2782 2783 /* 2784 * Downgrade a clean, writable mapping to read-only to ensure that the 2785 * hardware does not set PTE_D while we are comparing PTEs. 
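 * Otherwise the hardware could set PTE_D in one of the 4KB PTEs after
 * we have examined it, and the promoted 2MB entry, which is copied from
 * the first PTE, would fail to record that the page had been written.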
2786 * 2787 * Upon a write access to a clean mapping, the implementation will 2788 * either atomically check protections and set PTE_D, or raise a page 2789 * fault. In the latter case, the pmap lock provides atomicity. Thus, 2790 * we do not issue an sfence.vma here and instead rely on pmap_fault() 2791 * to do so lazily. 2792 */ 2793 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 2794 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 2795 firstl3e &= ~PTE_W; 2796 break; 2797 } 2798 } 2799 2800 pa += PAGE_SIZE; 2801 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2802 l3e = pmap_load(l3); 2803 if (PTE_TO_PHYS(l3e) != pa) { 2804 CTR2(KTR_PMAP, 2805 "pmap_promote_l2: failure for va %#lx pmap %p", 2806 va, pmap); 2807 atomic_add_long(&pmap_l2_p_failures, 1); 2808 return; 2809 } 2810 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 2811 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 2812 l3e &= ~PTE_W; 2813 break; 2814 } 2815 } 2816 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 2817 CTR2(KTR_PMAP, 2818 "pmap_promote_l2: failure for va %#lx pmap %p", 2819 va, pmap); 2820 atomic_add_long(&pmap_l2_p_failures, 1); 2821 return; 2822 } 2823 pa += PAGE_SIZE; 2824 } 2825 2826 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2827 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2828 ("pmap_promote_l2: page table page's pindex is wrong")); 2829 if (pmap_insert_pt_page(pmap, ml3, true)) { 2830 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2831 va, pmap); 2832 atomic_add_long(&pmap_l2_p_failures, 1); 2833 return; 2834 } 2835 2836 if ((firstl3e & PTE_SW_MANAGED) != 0) 2837 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 2838 2839 pmap_store(l2, firstl3e); 2840 2841 atomic_add_long(&pmap_l2_promotions, 1); 2842 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2843 pmap); 2844 } 2845 #endif 2846 2847 /* 2848 * Insert the given physical page (p) at 2849 * the specified virtual address (v) in the 2850 * target physical map with the protection requested. 2851 * 2852 * If specified, the page will be wired down, meaning 2853 * that the related pte can not be reclaimed. 2854 * 2855 * NB: This is the only routine which MAY NOT lazy-evaluate 2856 * or lose information. That is, this routine must actually 2857 * insert this page into the given map NOW. 2858 */ 2859 int 2860 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2861 u_int flags, int8_t psind) 2862 { 2863 struct rwlock *lock; 2864 pd_entry_t *l1, *l2, l2e; 2865 pt_entry_t new_l3, orig_l3; 2866 pt_entry_t *l3; 2867 pv_entry_t pv; 2868 vm_paddr_t opa, pa, l2_pa, l3_pa; 2869 vm_page_t mpte, om, l2_m, l3_m; 2870 pt_entry_t entry; 2871 pn_t l2_pn, l3_pn, pn; 2872 int rv; 2873 bool nosleep; 2874 2875 va = trunc_page(va); 2876 if ((m->oflags & VPO_UNMANAGED) == 0) 2877 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2878 pa = VM_PAGE_TO_PHYS(m); 2879 pn = (pa / PAGE_SIZE); 2880 2881 new_l3 = PTE_V | PTE_R | PTE_A; 2882 if (prot & VM_PROT_EXECUTE) 2883 new_l3 |= PTE_X; 2884 if (flags & VM_PROT_WRITE) 2885 new_l3 |= PTE_D; 2886 if (prot & VM_PROT_WRITE) 2887 new_l3 |= PTE_W; 2888 if (va < VM_MAX_USER_ADDRESS) 2889 new_l3 |= PTE_U; 2890 2891 new_l3 |= (pn << PTE_PPN0_S); 2892 if ((flags & PMAP_ENTER_WIRED) != 0) 2893 new_l3 |= PTE_SW_WIRED; 2894 2895 /* 2896 * Set modified bit gratuitously for writeable mappings if 2897 * the page is unmanaged. We do not want to take a fault 2898 * to do the dirty bit accounting for these mappings. 
2899 */ 2900 if ((m->oflags & VPO_UNMANAGED) != 0) { 2901 if (prot & VM_PROT_WRITE) 2902 new_l3 |= PTE_D; 2903 } else 2904 new_l3 |= PTE_SW_MANAGED; 2905 2906 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2907 2908 lock = NULL; 2909 mpte = NULL; 2910 rw_rlock(&pvh_global_lock); 2911 PMAP_LOCK(pmap); 2912 if (psind == 1) { 2913 /* Assert the required virtual and physical alignment. */ 2914 KASSERT((va & L2_OFFSET) == 0, 2915 ("pmap_enter: va %#lx unaligned", va)); 2916 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2917 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2918 goto out; 2919 } 2920 2921 l2 = pmap_l2(pmap, va); 2922 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2923 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2924 va, &lock))) { 2925 l3 = pmap_l2_to_l3(l2, va); 2926 if (va < VM_MAXUSER_ADDRESS) { 2927 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2928 mpte->ref_count++; 2929 } 2930 } else if (va < VM_MAXUSER_ADDRESS) { 2931 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2932 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 2933 if (mpte == NULL && nosleep) { 2934 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2935 if (lock != NULL) 2936 rw_wunlock(lock); 2937 rw_runlock(&pvh_global_lock); 2938 PMAP_UNLOCK(pmap); 2939 return (KERN_RESOURCE_SHORTAGE); 2940 } 2941 l3 = pmap_l3(pmap, va); 2942 } else { 2943 l3 = pmap_l3(pmap, va); 2944 /* TODO: This is not optimal, but should mostly work */ 2945 if (l3 == NULL) { 2946 if (l2 == NULL) { 2947 l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2948 VM_ALLOC_ZERO); 2949 if (l2_m == NULL) 2950 panic("pmap_enter: l2 pte_m == NULL"); 2951 2952 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2953 l2_pn = (l2_pa / PAGE_SIZE); 2954 2955 l1 = pmap_l1(pmap, va); 2956 entry = (PTE_V); 2957 entry |= (l2_pn << PTE_PPN0_S); 2958 pmap_store(l1, entry); 2959 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2960 l2 = pmap_l1_to_l2(l1, va); 2961 } 2962 2963 l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2964 VM_ALLOC_ZERO); 2965 if (l3_m == NULL) 2966 panic("pmap_enter: l3 pte_m == NULL"); 2967 2968 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2969 l3_pn = (l3_pa / PAGE_SIZE); 2970 entry = (PTE_V); 2971 entry |= (l3_pn << PTE_PPN0_S); 2972 pmap_store(l2, entry); 2973 l3 = pmap_l2_to_l3(l2, va); 2974 } 2975 pmap_invalidate_page(pmap, va); 2976 } 2977 2978 orig_l3 = pmap_load(l3); 2979 opa = PTE_TO_PHYS(orig_l3); 2980 pv = NULL; 2981 2982 /* 2983 * Is the specified virtual address already mapped? 2984 */ 2985 if ((orig_l3 & PTE_V) != 0) { 2986 /* 2987 * Wiring change, just update stats. We don't worry about 2988 * wiring PT pages as they remain resident as long as there 2989 * are valid mappings in them. Hence, if a user page is wired, 2990 * the PT page will be also. 2991 */ 2992 if ((flags & PMAP_ENTER_WIRED) != 0 && 2993 (orig_l3 & PTE_SW_WIRED) == 0) 2994 pmap->pm_stats.wired_count++; 2995 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2996 (orig_l3 & PTE_SW_WIRED) != 0) 2997 pmap->pm_stats.wired_count--; 2998 2999 /* 3000 * Remove the extra PT page reference. 3001 */ 3002 if (mpte != NULL) { 3003 mpte->ref_count--; 3004 KASSERT(mpte->ref_count > 0, 3005 ("pmap_enter: missing reference to page table page," 3006 " va: 0x%lx", va)); 3007 } 3008 3009 /* 3010 * Has the physical page changed? 3011 */ 3012 if (opa == pa) { 3013 /* 3014 * No, might be a protection or wiring change. 
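 * The existing PV entry is kept and the PTE is simply rewritten in
 * place below.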
3015 */ 3016 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 3017 (new_l3 & PTE_W) != 0) 3018 vm_page_aflag_set(m, PGA_WRITEABLE); 3019 goto validate; 3020 } 3021 3022 /* 3023 * The physical page has changed. Temporarily invalidate 3024 * the mapping. This ensures that all threads sharing the 3025 * pmap keep a consistent view of the mapping, which is 3026 * necessary for the correct handling of COW faults. It 3027 * also permits reuse of the old mapping's PV entry, 3028 * avoiding an allocation. 3029 * 3030 * For consistency, handle unmanaged mappings the same way. 3031 */ 3032 orig_l3 = pmap_load_clear(l3); 3033 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 3034 ("pmap_enter: unexpected pa update for %#lx", va)); 3035 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 3036 om = PHYS_TO_VM_PAGE(opa); 3037 3038 /* 3039 * The pmap lock is sufficient to synchronize with 3040 * concurrent calls to pmap_page_test_mappings() and 3041 * pmap_ts_referenced(). 3042 */ 3043 if ((orig_l3 & PTE_D) != 0) 3044 vm_page_dirty(om); 3045 if ((orig_l3 & PTE_A) != 0) 3046 vm_page_aflag_set(om, PGA_REFERENCED); 3047 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3048 pv = pmap_pvh_remove(&om->md, pmap, va); 3049 KASSERT(pv != NULL, 3050 ("pmap_enter: no PV entry for %#lx", va)); 3051 if ((new_l3 & PTE_SW_MANAGED) == 0) 3052 free_pv_entry(pmap, pv); 3053 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3054 TAILQ_EMPTY(&om->md.pv_list) && 3055 ((om->flags & PG_FICTITIOUS) != 0 || 3056 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3057 vm_page_aflag_clear(om, PGA_WRITEABLE); 3058 } 3059 pmap_invalidate_page(pmap, va); 3060 orig_l3 = 0; 3061 } else { 3062 /* 3063 * Increment the counters. 3064 */ 3065 if ((new_l3 & PTE_SW_WIRED) != 0) 3066 pmap->pm_stats.wired_count++; 3067 pmap_resident_count_inc(pmap, 1); 3068 } 3069 /* 3070 * Enter on the PV list if part of our managed memory. 3071 */ 3072 if ((new_l3 & PTE_SW_MANAGED) != 0) { 3073 if (pv == NULL) { 3074 pv = get_pv_entry(pmap, &lock); 3075 pv->pv_va = va; 3076 } 3077 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3078 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3079 m->md.pv_gen++; 3080 if ((new_l3 & PTE_W) != 0) 3081 vm_page_aflag_set(m, PGA_WRITEABLE); 3082 } 3083 3084 validate: 3085 /* 3086 * Sync the i-cache on all harts before updating the PTE 3087 * if the new PTE is executable. 3088 */ 3089 if (prot & VM_PROT_EXECUTE) 3090 pmap_sync_icache(pmap, va, PAGE_SIZE); 3091 3092 /* 3093 * Update the L3 entry. 3094 */ 3095 if (orig_l3 != 0) { 3096 orig_l3 = pmap_load_store(l3, new_l3); 3097 pmap_invalidate_page(pmap, va); 3098 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 3099 ("pmap_enter: invalid update")); 3100 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 3101 (PTE_D | PTE_SW_MANAGED)) 3102 vm_page_dirty(m); 3103 } else { 3104 pmap_store(l3, new_l3); 3105 } 3106 3107 #if VM_NRESERVLEVEL > 0 3108 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 3109 pmap_ps_enabled(pmap) && 3110 (m->flags & PG_FICTITIOUS) == 0 && 3111 vm_reserv_level_iffullpop(m) == 0) 3112 pmap_promote_l2(pmap, l2, va, &lock); 3113 #endif 3114 3115 rv = KERN_SUCCESS; 3116 out: 3117 if (lock != NULL) 3118 rw_wunlock(lock); 3119 rw_runlock(&pvh_global_lock); 3120 PMAP_UNLOCK(pmap); 3121 return (rv); 3122 } 3123 3124 /* 3125 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 3126 * if successful. 
Returns false if (1) a page table page cannot be allocated 3127 * without sleeping, (2) a mapping already exists at the specified virtual 3128 * address, or (3) a PV entry cannot be allocated without reclaiming another 3129 * PV entry. 3130 */ 3131 static bool 3132 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3133 struct rwlock **lockp) 3134 { 3135 pd_entry_t new_l2; 3136 pn_t pn; 3137 3138 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3139 3140 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 3141 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 3142 if ((m->oflags & VPO_UNMANAGED) == 0) 3143 new_l2 |= PTE_SW_MANAGED; 3144 if ((prot & VM_PROT_EXECUTE) != 0) 3145 new_l2 |= PTE_X; 3146 if (va < VM_MAXUSER_ADDRESS) 3147 new_l2 |= PTE_U; 3148 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3149 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 3150 KERN_SUCCESS); 3151 } 3152 3153 /* 3154 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3155 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 3156 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 3157 * a mapping already exists at the specified virtual address. Returns 3158 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 3159 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 3160 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3161 * 3162 * The parameter "m" is only used when creating a managed, writeable mapping. 3163 */ 3164 static int 3165 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3166 vm_page_t m, struct rwlock **lockp) 3167 { 3168 struct spglist free; 3169 pd_entry_t *l2, *l3, oldl2; 3170 vm_offset_t sva; 3171 vm_page_t l2pg, mt; 3172 3173 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3174 3175 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3176 NULL : lockp)) == NULL) { 3177 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 3178 va, pmap); 3179 return (KERN_RESOURCE_SHORTAGE); 3180 } 3181 3182 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 3183 l2 = &l2[pmap_l2_index(va)]; 3184 if ((oldl2 = pmap_load(l2)) != 0) { 3185 KASSERT(l2pg->ref_count > 1, 3186 ("pmap_enter_l2: l2pg's ref count is too low")); 3187 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3188 l2pg->ref_count--; 3189 CTR2(KTR_PMAP, 3190 "pmap_enter_l2: failure for va %#lx in pmap %p", 3191 va, pmap); 3192 return (KERN_FAILURE); 3193 } 3194 SLIST_INIT(&free); 3195 if ((oldl2 & PTE_RWX) != 0) 3196 (void)pmap_remove_l2(pmap, l2, va, 3197 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3198 else 3199 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3200 l3 = pmap_l2_to_l3(l2, sva); 3201 if ((pmap_load(l3) & PTE_V) != 0 && 3202 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3203 lockp) != 0) 3204 break; 3205 } 3206 vm_page_free_pages_toq(&free, true); 3207 if (va >= VM_MAXUSER_ADDRESS) { 3208 /* 3209 * Both pmap_remove_l2() and pmap_remove_l3() will 3210 * leave the kernel page table page zero filled. 3211 */ 3212 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 3213 if (pmap_insert_pt_page(pmap, mt, false)) 3214 panic("pmap_enter_l2: trie insert failed"); 3215 } else 3216 KASSERT(pmap_load(l2) == 0, 3217 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3218 } 3219 3220 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3221 /* 3222 * Abort this mapping if its PV entry could not be created. 
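 * Note that failure is only possible when PMAP_ENTER_NORECLAIM was
 * passed in "flags"; otherwise pmap_pv_insert_l2() reclaims a PV
 * chunk from another pmap rather than fail.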
3223 */ 3224 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3225 SLIST_INIT(&free); 3226 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3227 /* 3228 * Although "va" is not mapped, paging-structure 3229 * caches could nonetheless have entries that 3230 * refer to the freed page table pages. 3231 * Invalidate those entries. 3232 */ 3233 pmap_invalidate_page(pmap, va); 3234 vm_page_free_pages_toq(&free, true); 3235 } 3236 CTR2(KTR_PMAP, 3237 "pmap_enter_l2: failure for va %#lx in pmap %p", 3238 va, pmap); 3239 return (KERN_RESOURCE_SHORTAGE); 3240 } 3241 if ((new_l2 & PTE_W) != 0) 3242 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3243 vm_page_aflag_set(mt, PGA_WRITEABLE); 3244 } 3245 3246 /* 3247 * Increment counters. 3248 */ 3249 if ((new_l2 & PTE_SW_WIRED) != 0) 3250 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3251 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3252 3253 /* 3254 * Map the superpage. 3255 */ 3256 pmap_store(l2, new_l2); 3257 3258 atomic_add_long(&pmap_l2_mappings, 1); 3259 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3260 va, pmap); 3261 3262 return (KERN_SUCCESS); 3263 } 3264 3265 /* 3266 * Maps a sequence of resident pages belonging to the same object. 3267 * The sequence begins with the given page m_start. This page is 3268 * mapped at the given virtual address start. Each subsequent page is 3269 * mapped at a virtual address that is offset from start by the same 3270 * amount as the page is offset from m_start within the object. The 3271 * last page in the sequence is the page with the largest offset from 3272 * m_start that can be mapped at a virtual address less than the given 3273 * virtual address end. Not every virtual page between start and end 3274 * is mapped; only those for which a resident page exists with the 3275 * corresponding offset from m_start are mapped. 3276 */ 3277 void 3278 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3279 vm_page_t m_start, vm_prot_t prot) 3280 { 3281 struct rwlock *lock; 3282 vm_offset_t va; 3283 vm_page_t m, mpte; 3284 vm_pindex_t diff, psize; 3285 3286 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3287 3288 psize = atop(end - start); 3289 mpte = NULL; 3290 m = m_start; 3291 lock = NULL; 3292 rw_rlock(&pvh_global_lock); 3293 PMAP_LOCK(pmap); 3294 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3295 va = start + ptoa(diff); 3296 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3297 m->psind == 1 && pmap_ps_enabled(pmap) && 3298 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3299 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3300 else 3301 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3302 &lock); 3303 m = TAILQ_NEXT(m, listq); 3304 } 3305 if (lock != NULL) 3306 rw_wunlock(lock); 3307 rw_runlock(&pvh_global_lock); 3308 PMAP_UNLOCK(pmap); 3309 } 3310 3311 /* 3312 * this code makes some *MAJOR* assumptions: 3313 * 1. Current pmap & pmap exists. 3314 * 2. Not wired. 3315 * 3. Read access. 3316 * 4. No page table pages. 3317 * but is *MUCH* faster than pmap_enter... 
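 *
 * Mappings created here are never wired, never replace an existing
 * mapping, and are silently skipped if a PV entry cannot be allocated
 * without reclamation.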
3318 */ 3319 3320 void 3321 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3322 { 3323 struct rwlock *lock; 3324 3325 lock = NULL; 3326 rw_rlock(&pvh_global_lock); 3327 PMAP_LOCK(pmap); 3328 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3329 if (lock != NULL) 3330 rw_wunlock(lock); 3331 rw_runlock(&pvh_global_lock); 3332 PMAP_UNLOCK(pmap); 3333 } 3334 3335 static vm_page_t 3336 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3337 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3338 { 3339 struct spglist free; 3340 vm_paddr_t phys; 3341 pd_entry_t *l2; 3342 pt_entry_t *l3, newl3; 3343 3344 KASSERT(!VA_IS_CLEANMAP(va) || 3345 (m->oflags & VPO_UNMANAGED) != 0, 3346 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3347 rw_assert(&pvh_global_lock, RA_LOCKED); 3348 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3349 3350 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3351 /* 3352 * In the case that a page table page is not 3353 * resident, we are creating it here. 3354 */ 3355 if (va < VM_MAXUSER_ADDRESS) { 3356 vm_pindex_t l2pindex; 3357 3358 /* 3359 * Calculate pagetable page index 3360 */ 3361 l2pindex = pmap_l2_pindex(va); 3362 if (mpte && (mpte->pindex == l2pindex)) { 3363 mpte->ref_count++; 3364 } else { 3365 /* 3366 * Get the l2 entry 3367 */ 3368 l2 = pmap_l2(pmap, va); 3369 3370 /* 3371 * If the page table page is mapped, we just increment 3372 * the hold count, and activate it. Otherwise, we 3373 * attempt to allocate a page table page. If this 3374 * attempt fails, we don't retry. Instead, we give up. 3375 */ 3376 if (l2 != NULL && pmap_load(l2) != 0) { 3377 phys = PTE_TO_PHYS(pmap_load(l2)); 3378 mpte = PHYS_TO_VM_PAGE(phys); 3379 mpte->ref_count++; 3380 } else { 3381 /* 3382 * Pass NULL instead of the PV list lock 3383 * pointer, because we don't intend to sleep. 3384 */ 3385 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3386 if (mpte == NULL) 3387 return (mpte); 3388 } 3389 } 3390 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3391 l3 = &l3[pmap_l3_index(va)]; 3392 } else { 3393 mpte = NULL; 3394 l3 = pmap_l3(kernel_pmap, va); 3395 } 3396 if (l3 == NULL) 3397 panic("pmap_enter_quick_locked: No l3"); 3398 if (pmap_load(l3) != 0) { 3399 if (mpte != NULL) { 3400 mpte->ref_count--; 3401 mpte = NULL; 3402 } 3403 return (mpte); 3404 } 3405 3406 /* 3407 * Enter on the PV list if part of our managed memory. 3408 */ 3409 if ((m->oflags & VPO_UNMANAGED) == 0 && 3410 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3411 if (mpte != NULL) { 3412 SLIST_INIT(&free); 3413 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3414 pmap_invalidate_page(pmap, va); 3415 vm_page_free_pages_toq(&free, false); 3416 } 3417 mpte = NULL; 3418 } 3419 return (mpte); 3420 } 3421 3422 /* 3423 * Increment counters 3424 */ 3425 pmap_resident_count_inc(pmap, 1); 3426 3427 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3428 PTE_V | PTE_R; 3429 if ((prot & VM_PROT_EXECUTE) != 0) 3430 newl3 |= PTE_X; 3431 if ((m->oflags & VPO_UNMANAGED) == 0) 3432 newl3 |= PTE_SW_MANAGED; 3433 if (va < VM_MAX_USER_ADDRESS) 3434 newl3 |= PTE_U; 3435 3436 /* 3437 * Sync the i-cache on all harts before updating the PTE 3438 * if the new PTE is executable. 
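 * Performing the sync before the PTE becomes visible ensures that no
 * hart can fetch stale instructions through the new mapping.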
3439 */ 3440 if (prot & VM_PROT_EXECUTE) 3441 pmap_sync_icache(pmap, va, PAGE_SIZE); 3442 3443 pmap_store(l3, newl3); 3444 3445 pmap_invalidate_page(pmap, va); 3446 return (mpte); 3447 } 3448 3449 /* 3450 * This code maps large physical mmap regions into the 3451 * processor address space. Note that some shortcuts 3452 * are taken, but the code works. 3453 */ 3454 void 3455 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3456 vm_pindex_t pindex, vm_size_t size) 3457 { 3458 3459 VM_OBJECT_ASSERT_WLOCKED(object); 3460 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3461 ("pmap_object_init_pt: non-device object")); 3462 } 3463 3464 /* 3465 * Clear the wired attribute from the mappings for the specified range of 3466 * addresses in the given pmap. Every valid mapping within that range 3467 * must have the wired attribute set. In contrast, invalid mappings 3468 * cannot have the wired attribute set, so they are ignored. 3469 * 3470 * The wired attribute of the page table entry is not a hardware feature, 3471 * so there is no need to invalidate any TLB entries. 3472 */ 3473 void 3474 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3475 { 3476 vm_offset_t va_next; 3477 pd_entry_t *l0, *l1, *l2, l2e; 3478 pt_entry_t *l3, l3e; 3479 bool pv_lists_locked; 3480 3481 pv_lists_locked = false; 3482 retry: 3483 PMAP_LOCK(pmap); 3484 for (; sva < eva; sva = va_next) { 3485 if (pmap_mode == PMAP_MODE_SV48) { 3486 l0 = pmap_l0(pmap, sva); 3487 if (pmap_load(l0) == 0) { 3488 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3489 if (va_next < sva) 3490 va_next = eva; 3491 continue; 3492 } 3493 l1 = pmap_l0_to_l1(l0, sva); 3494 } else { 3495 l1 = pmap_l1(pmap, sva); 3496 } 3497 3498 if (pmap_load(l1) == 0) { 3499 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3500 if (va_next < sva) 3501 va_next = eva; 3502 continue; 3503 } 3504 3505 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3506 if (va_next < sva) 3507 va_next = eva; 3508 3509 l2 = pmap_l1_to_l2(l1, sva); 3510 if ((l2e = pmap_load(l2)) == 0) 3511 continue; 3512 if ((l2e & PTE_RWX) != 0) { 3513 if (sva + L2_SIZE == va_next && eva >= va_next) { 3514 if ((l2e & PTE_SW_WIRED) == 0) 3515 panic("pmap_unwire: l2 %#jx is missing " 3516 "PTE_SW_WIRED", (uintmax_t)l2e); 3517 pmap_clear_bits(l2, PTE_SW_WIRED); 3518 continue; 3519 } else { 3520 if (!pv_lists_locked) { 3521 pv_lists_locked = true; 3522 if (!rw_try_rlock(&pvh_global_lock)) { 3523 PMAP_UNLOCK(pmap); 3524 rw_rlock(&pvh_global_lock); 3525 /* Repeat sva. */ 3526 goto retry; 3527 } 3528 } 3529 if (!pmap_demote_l2(pmap, l2, sva)) 3530 panic("pmap_unwire: demotion failed"); 3531 } 3532 } 3533 3534 if (va_next > eva) 3535 va_next = eva; 3536 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3537 sva += L3_SIZE) { 3538 if ((l3e = pmap_load(l3)) == 0) 3539 continue; 3540 if ((l3e & PTE_SW_WIRED) == 0) 3541 panic("pmap_unwire: l3 %#jx is missing " 3542 "PTE_SW_WIRED", (uintmax_t)l3e); 3543 3544 /* 3545 * PG_W must be cleared atomically. Although the pmap 3546 * lock synchronizes access to PG_W, another processor 3547 * could be setting PG_M and/or PG_A concurrently. 3548 */ 3549 pmap_clear_bits(l3, PTE_SW_WIRED); 3550 pmap->pm_stats.wired_count--; 3551 } 3552 } 3553 if (pv_lists_locked) 3554 rw_runlock(&pvh_global_lock); 3555 PMAP_UNLOCK(pmap); 3556 } 3557 3558 /* 3559 * Copy the range specified by src_addr/len 3560 * from the source map to the range dst_addr/len 3561 * in the destination map. 3562 * 3563 * This routine is only advisory and need not do anything. 
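 * This implementation does nothing; mappings in the destination pmap
 * are instead created on demand by the fault handler.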
3564 */ 3565 3566 void 3567 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3568 vm_offset_t src_addr) 3569 { 3570 3571 } 3572 3573 /* 3574 * pmap_zero_page zeros the specified hardware page by mapping 3575 * the page into KVM and using bzero to clear its contents. 3576 */ 3577 void 3578 pmap_zero_page(vm_page_t m) 3579 { 3580 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3581 3582 pagezero((void *)va); 3583 } 3584 3585 /* 3586 * pmap_zero_page_area zeros the specified hardware page by mapping 3587 * the page into KVM and using bzero to clear its contents. 3588 * 3589 * off and size may not cover an area beyond a single hardware page. 3590 */ 3591 void 3592 pmap_zero_page_area(vm_page_t m, int off, int size) 3593 { 3594 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3595 3596 if (off == 0 && size == PAGE_SIZE) 3597 pagezero((void *)va); 3598 else 3599 bzero((char *)va + off, size); 3600 } 3601 3602 /* 3603 * pmap_copy_page copies the specified (machine independent) 3604 * page by mapping the page into virtual memory and using 3605 * bcopy to copy the page, one machine dependent page at a 3606 * time. 3607 */ 3608 void 3609 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3610 { 3611 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3612 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3613 3614 pagecopy((void *)src, (void *)dst); 3615 } 3616 3617 int unmapped_buf_allowed = 1; 3618 3619 void 3620 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3621 vm_offset_t b_offset, int xfersize) 3622 { 3623 void *a_cp, *b_cp; 3624 vm_page_t m_a, m_b; 3625 vm_paddr_t p_a, p_b; 3626 vm_offset_t a_pg_offset, b_pg_offset; 3627 int cnt; 3628 3629 while (xfersize > 0) { 3630 a_pg_offset = a_offset & PAGE_MASK; 3631 m_a = ma[a_offset >> PAGE_SHIFT]; 3632 p_a = m_a->phys_addr; 3633 b_pg_offset = b_offset & PAGE_MASK; 3634 m_b = mb[b_offset >> PAGE_SHIFT]; 3635 p_b = m_b->phys_addr; 3636 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3637 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3638 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3639 panic("!DMAP a %lx", p_a); 3640 } else { 3641 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3642 } 3643 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3644 panic("!DMAP b %lx", p_b); 3645 } else { 3646 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3647 } 3648 bcopy(a_cp, b_cp, cnt); 3649 a_offset += cnt; 3650 b_offset += cnt; 3651 xfersize -= cnt; 3652 } 3653 } 3654 3655 vm_offset_t 3656 pmap_quick_enter_page(vm_page_t m) 3657 { 3658 3659 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3660 } 3661 3662 void 3663 pmap_quick_remove_page(vm_offset_t addr) 3664 { 3665 } 3666 3667 /* 3668 * Returns true if the pmap's pv is one of the first 3669 * 16 pvs linked to from this page. This count may 3670 * be changed upwards or downwards in the future; it 3671 * is only necessary that true be returned for a small 3672 * subset of pmaps for proper page aging. 
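 *
 * Both the page's own pv list and, for non-fictitious pages, the pv
 * list of the containing 2MB page are searched, with the 16-entry
 * limit applied to the two lists combined.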
3673 */ 3674 boolean_t 3675 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3676 { 3677 struct md_page *pvh; 3678 struct rwlock *lock; 3679 pv_entry_t pv; 3680 int loops = 0; 3681 boolean_t rv; 3682 3683 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3684 ("pmap_page_exists_quick: page %p is not managed", m)); 3685 rv = FALSE; 3686 rw_rlock(&pvh_global_lock); 3687 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3688 rw_rlock(lock); 3689 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3690 if (PV_PMAP(pv) == pmap) { 3691 rv = TRUE; 3692 break; 3693 } 3694 loops++; 3695 if (loops >= 16) 3696 break; 3697 } 3698 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3699 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3700 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3701 if (PV_PMAP(pv) == pmap) { 3702 rv = TRUE; 3703 break; 3704 } 3705 loops++; 3706 if (loops >= 16) 3707 break; 3708 } 3709 } 3710 rw_runlock(lock); 3711 rw_runlock(&pvh_global_lock); 3712 return (rv); 3713 } 3714 3715 /* 3716 * pmap_page_wired_mappings: 3717 * 3718 * Return the number of managed mappings to the given physical page 3719 * that are wired. 3720 */ 3721 int 3722 pmap_page_wired_mappings(vm_page_t m) 3723 { 3724 struct md_page *pvh; 3725 struct rwlock *lock; 3726 pmap_t pmap; 3727 pd_entry_t *l2; 3728 pt_entry_t *l3; 3729 pv_entry_t pv; 3730 int count, md_gen, pvh_gen; 3731 3732 if ((m->oflags & VPO_UNMANAGED) != 0) 3733 return (0); 3734 rw_rlock(&pvh_global_lock); 3735 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3736 rw_rlock(lock); 3737 restart: 3738 count = 0; 3739 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3740 pmap = PV_PMAP(pv); 3741 if (!PMAP_TRYLOCK(pmap)) { 3742 md_gen = m->md.pv_gen; 3743 rw_runlock(lock); 3744 PMAP_LOCK(pmap); 3745 rw_rlock(lock); 3746 if (md_gen != m->md.pv_gen) { 3747 PMAP_UNLOCK(pmap); 3748 goto restart; 3749 } 3750 } 3751 l2 = pmap_l2(pmap, pv->pv_va); 3752 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3753 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3754 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3755 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3756 count++; 3757 PMAP_UNLOCK(pmap); 3758 } 3759 if ((m->flags & PG_FICTITIOUS) == 0) { 3760 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3761 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3762 pmap = PV_PMAP(pv); 3763 if (!PMAP_TRYLOCK(pmap)) { 3764 md_gen = m->md.pv_gen; 3765 pvh_gen = pvh->pv_gen; 3766 rw_runlock(lock); 3767 PMAP_LOCK(pmap); 3768 rw_rlock(lock); 3769 if (md_gen != m->md.pv_gen || 3770 pvh_gen != pvh->pv_gen) { 3771 PMAP_UNLOCK(pmap); 3772 goto restart; 3773 } 3774 } 3775 l2 = pmap_l2(pmap, pv->pv_va); 3776 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3777 count++; 3778 PMAP_UNLOCK(pmap); 3779 } 3780 } 3781 rw_runlock(lock); 3782 rw_runlock(&pvh_global_lock); 3783 return (count); 3784 } 3785 3786 /* 3787 * Returns true if the given page is mapped individually or as part of 3788 * a 2mpage. Otherwise, returns false. 
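 * Unmanaged pages are not tracked by pv lists and always report false.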
3789 */ 3790 bool 3791 pmap_page_is_mapped(vm_page_t m) 3792 { 3793 struct rwlock *lock; 3794 bool rv; 3795 3796 if ((m->oflags & VPO_UNMANAGED) != 0) 3797 return (false); 3798 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3799 rw_rlock(lock); 3800 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3801 ((m->flags & PG_FICTITIOUS) == 0 && 3802 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3803 rw_runlock(lock); 3804 return (rv); 3805 } 3806 3807 static void 3808 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3809 struct spglist *free, bool superpage) 3810 { 3811 struct md_page *pvh; 3812 vm_page_t mpte, mt; 3813 3814 if (superpage) { 3815 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3816 pvh = pa_to_pvh(m->phys_addr); 3817 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3818 pvh->pv_gen++; 3819 if (TAILQ_EMPTY(&pvh->pv_list)) { 3820 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3821 if (TAILQ_EMPTY(&mt->md.pv_list) && 3822 (mt->a.flags & PGA_WRITEABLE) != 0) 3823 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3824 } 3825 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3826 if (mpte != NULL) { 3827 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3828 ("pmap_remove_pages: pte page not promoted")); 3829 pmap_resident_count_dec(pmap, 1); 3830 KASSERT(mpte->ref_count == Ln_ENTRIES, 3831 ("pmap_remove_pages: pte page ref count error")); 3832 mpte->ref_count = 0; 3833 pmap_add_delayed_free_list(mpte, free, FALSE); 3834 } 3835 } else { 3836 pmap_resident_count_dec(pmap, 1); 3837 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3838 m->md.pv_gen++; 3839 if (TAILQ_EMPTY(&m->md.pv_list) && 3840 (m->a.flags & PGA_WRITEABLE) != 0) { 3841 pvh = pa_to_pvh(m->phys_addr); 3842 if (TAILQ_EMPTY(&pvh->pv_list)) 3843 vm_page_aflag_clear(m, PGA_WRITEABLE); 3844 } 3845 } 3846 } 3847 3848 /* 3849 * Destroy all managed, non-wired mappings in the given user-space 3850 * pmap. This pmap cannot be active on any processor besides the 3851 * caller. 3852 * 3853 * This function cannot be applied to the kernel pmap. Moreover, it 3854 * is not intended for general use. It is only to be used during 3855 * process termination. Consequently, it can be implemented in ways 3856 * that make it faster than pmap_remove(). First, it can more quickly 3857 * destroy mappings by iterating over the pmap's collection of PV 3858 * entries, rather than searching the page table. Second, it doesn't 3859 * have to test and clear the page table entries atomically, because 3860 * no processor is currently accessing the user address space. In 3861 * particular, a page table entry's dirty bit won't change state once 3862 * this function starts. 
3863 */ 3864 void 3865 pmap_remove_pages(pmap_t pmap) 3866 { 3867 struct spglist free; 3868 pd_entry_t ptepde; 3869 pt_entry_t *pte, tpte; 3870 vm_page_t m, mt; 3871 pv_entry_t pv; 3872 struct pv_chunk *pc, *npc; 3873 struct rwlock *lock; 3874 int64_t bit; 3875 uint64_t inuse, bitmask; 3876 int allfree, field, freed, idx; 3877 bool superpage; 3878 3879 lock = NULL; 3880 3881 SLIST_INIT(&free); 3882 rw_rlock(&pvh_global_lock); 3883 PMAP_LOCK(pmap); 3884 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3885 allfree = 1; 3886 freed = 0; 3887 for (field = 0; field < _NPCM; field++) { 3888 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3889 while (inuse != 0) { 3890 bit = ffsl(inuse) - 1; 3891 bitmask = 1UL << bit; 3892 idx = field * 64 + bit; 3893 pv = &pc->pc_pventry[idx]; 3894 inuse &= ~bitmask; 3895 3896 pte = pmap_l1(pmap, pv->pv_va); 3897 ptepde = pmap_load(pte); 3898 pte = pmap_l1_to_l2(pte, pv->pv_va); 3899 tpte = pmap_load(pte); 3900 if ((tpte & PTE_RWX) != 0) { 3901 superpage = true; 3902 } else { 3903 ptepde = tpte; 3904 pte = pmap_l2_to_l3(pte, pv->pv_va); 3905 tpte = pmap_load(pte); 3906 superpage = false; 3907 } 3908 3909 /* 3910 * We cannot remove wired pages from a 3911 * process' mapping at this time. 3912 */ 3913 if (tpte & PTE_SW_WIRED) { 3914 allfree = 0; 3915 continue; 3916 } 3917 3918 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3919 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3920 m < &vm_page_array[vm_page_array_size], 3921 ("pmap_remove_pages: bad pte %#jx", 3922 (uintmax_t)tpte)); 3923 3924 pmap_clear(pte); 3925 3926 /* 3927 * Update the vm_page_t clean/reference bits. 3928 */ 3929 if ((tpte & (PTE_D | PTE_W)) == 3930 (PTE_D | PTE_W)) { 3931 if (superpage) 3932 for (mt = m; 3933 mt < &m[Ln_ENTRIES]; mt++) 3934 vm_page_dirty(mt); 3935 else 3936 vm_page_dirty(m); 3937 } 3938 3939 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3940 3941 /* Mark free */ 3942 pc->pc_map[field] |= bitmask; 3943 3944 pmap_remove_pages_pv(pmap, m, pv, &free, 3945 superpage); 3946 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3947 freed++; 3948 } 3949 } 3950 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3951 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3952 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3953 if (allfree) { 3954 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3955 free_pv_chunk(pc); 3956 } 3957 } 3958 if (lock != NULL) 3959 rw_wunlock(lock); 3960 pmap_invalidate_all(pmap); 3961 rw_runlock(&pvh_global_lock); 3962 PMAP_UNLOCK(pmap); 3963 vm_page_free_pages_toq(&free, false); 3964 } 3965 3966 static bool 3967 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3968 { 3969 struct md_page *pvh; 3970 struct rwlock *lock; 3971 pd_entry_t *l2; 3972 pt_entry_t *l3, mask; 3973 pv_entry_t pv; 3974 pmap_t pmap; 3975 int md_gen, pvh_gen; 3976 bool rv; 3977 3978 mask = 0; 3979 if (modified) 3980 mask |= PTE_D; 3981 if (accessed) 3982 mask |= PTE_A; 3983 3984 rv = FALSE; 3985 rw_rlock(&pvh_global_lock); 3986 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3987 rw_rlock(lock); 3988 restart: 3989 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3990 pmap = PV_PMAP(pv); 3991 if (!PMAP_TRYLOCK(pmap)) { 3992 md_gen = m->md.pv_gen; 3993 rw_runlock(lock); 3994 PMAP_LOCK(pmap); 3995 rw_rlock(lock); 3996 if (md_gen != m->md.pv_gen) { 3997 PMAP_UNLOCK(pmap); 3998 goto restart; 3999 } 4000 } 4001 l2 = pmap_l2(pmap, pv->pv_va); 4002 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4003 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4004 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4005 rv 
= (pmap_load(l3) & mask) == mask; 4006 PMAP_UNLOCK(pmap); 4007 if (rv) 4008 goto out; 4009 } 4010 if ((m->flags & PG_FICTITIOUS) == 0) { 4011 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4012 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4013 pmap = PV_PMAP(pv); 4014 if (!PMAP_TRYLOCK(pmap)) { 4015 md_gen = m->md.pv_gen; 4016 pvh_gen = pvh->pv_gen; 4017 rw_runlock(lock); 4018 PMAP_LOCK(pmap); 4019 rw_rlock(lock); 4020 if (md_gen != m->md.pv_gen || 4021 pvh_gen != pvh->pv_gen) { 4022 PMAP_UNLOCK(pmap); 4023 goto restart; 4024 } 4025 } 4026 l2 = pmap_l2(pmap, pv->pv_va); 4027 rv = (pmap_load(l2) & mask) == mask; 4028 PMAP_UNLOCK(pmap); 4029 if (rv) 4030 goto out; 4031 } 4032 } 4033 out: 4034 rw_runlock(lock); 4035 rw_runlock(&pvh_global_lock); 4036 return (rv); 4037 } 4038 4039 /* 4040 * pmap_is_modified: 4041 * 4042 * Return whether or not the specified physical page was modified 4043 * in any physical maps. 4044 */ 4045 boolean_t 4046 pmap_is_modified(vm_page_t m) 4047 { 4048 4049 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4050 ("pmap_is_modified: page %p is not managed", m)); 4051 4052 /* 4053 * If the page is not busied then this check is racy. 4054 */ 4055 if (!pmap_page_is_write_mapped(m)) 4056 return (FALSE); 4057 return (pmap_page_test_mappings(m, FALSE, TRUE)); 4058 } 4059 4060 /* 4061 * pmap_is_prefaultable: 4062 * 4063 * Return whether or not the specified virtual address is eligible 4064 * for prefault. 4065 */ 4066 boolean_t 4067 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4068 { 4069 pt_entry_t *l3; 4070 boolean_t rv; 4071 4072 /* 4073 * Return TRUE if and only if the L3 entry for the specified virtual 4074 * address is allocated but invalid. 4075 */ 4076 rv = FALSE; 4077 PMAP_LOCK(pmap); 4078 l3 = pmap_l3(pmap, addr); 4079 if (l3 != NULL && pmap_load(l3) == 0) { 4080 rv = TRUE; 4081 } 4082 PMAP_UNLOCK(pmap); 4083 return (rv); 4084 } 4085 4086 /* 4087 * pmap_is_referenced: 4088 * 4089 * Return whether or not the specified physical page was referenced 4090 * in any physical maps. 4091 */ 4092 boolean_t 4093 pmap_is_referenced(vm_page_t m) 4094 { 4095 4096 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4097 ("pmap_is_referenced: page %p is not managed", m)); 4098 return (pmap_page_test_mappings(m, TRUE, FALSE)); 4099 } 4100 4101 /* 4102 * Clear the write and modified bits in each of the given page's mappings. 4103 */ 4104 void 4105 pmap_remove_write(vm_page_t m) 4106 { 4107 struct md_page *pvh; 4108 struct rwlock *lock; 4109 pmap_t pmap; 4110 pd_entry_t *l2; 4111 pt_entry_t *l3, oldl3, newl3; 4112 pv_entry_t next_pv, pv; 4113 vm_offset_t va; 4114 int md_gen, pvh_gen; 4115 4116 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4117 ("pmap_remove_write: page %p is not managed", m)); 4118 vm_page_assert_busied(m); 4119 4120 if (!pmap_page_is_write_mapped(m)) 4121 return; 4122 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4123 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 4124 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4125 rw_rlock(&pvh_global_lock); 4126 retry_pv_loop: 4127 rw_wlock(lock); 4128 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4129 pmap = PV_PMAP(pv); 4130 if (!PMAP_TRYLOCK(pmap)) { 4131 pvh_gen = pvh->pv_gen; 4132 rw_wunlock(lock); 4133 PMAP_LOCK(pmap); 4134 rw_wlock(lock); 4135 if (pvh_gen != pvh->pv_gen) { 4136 PMAP_UNLOCK(pmap); 4137 rw_wunlock(lock); 4138 goto retry_pv_loop; 4139 } 4140 } 4141 va = pv->pv_va; 4142 l2 = pmap_l2(pmap, va); 4143 if ((pmap_load(l2) & PTE_W) != 0) 4144 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 4145 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4146 ("inconsistent pv lock %p %p for page %p", 4147 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4148 PMAP_UNLOCK(pmap); 4149 } 4150 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4151 pmap = PV_PMAP(pv); 4152 if (!PMAP_TRYLOCK(pmap)) { 4153 pvh_gen = pvh->pv_gen; 4154 md_gen = m->md.pv_gen; 4155 rw_wunlock(lock); 4156 PMAP_LOCK(pmap); 4157 rw_wlock(lock); 4158 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4159 PMAP_UNLOCK(pmap); 4160 rw_wunlock(lock); 4161 goto retry_pv_loop; 4162 } 4163 } 4164 l2 = pmap_l2(pmap, pv->pv_va); 4165 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4166 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4167 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4168 oldl3 = pmap_load(l3); 4169 retry: 4170 if ((oldl3 & PTE_W) != 0) { 4171 newl3 = oldl3 & ~(PTE_D | PTE_W); 4172 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 4173 goto retry; 4174 if ((oldl3 & PTE_D) != 0) 4175 vm_page_dirty(m); 4176 pmap_invalidate_page(pmap, pv->pv_va); 4177 } 4178 PMAP_UNLOCK(pmap); 4179 } 4180 rw_wunlock(lock); 4181 vm_page_aflag_clear(m, PGA_WRITEABLE); 4182 rw_runlock(&pvh_global_lock); 4183 } 4184 4185 /* 4186 * pmap_ts_referenced: 4187 * 4188 * Return a count of reference bits for a page, clearing those bits. 4189 * It is not necessary for every reference bit to be cleared, but it 4190 * is necessary that 0 only be returned when there are truly no 4191 * reference bits set. 4192 * 4193 * As an optimization, update the page's dirty field if a modified bit is 4194 * found while counting reference bits. This opportunistic update can be 4195 * performed at low cost and can eliminate the need for some future calls 4196 * to pmap_is_modified(). However, since this function stops after 4197 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4198 * dirty pages. Those dirty pages will only be detected by a future call 4199 * to pmap_is_modified(). 4200 */ 4201 int 4202 pmap_ts_referenced(vm_page_t m) 4203 { 4204 struct spglist free; 4205 struct md_page *pvh; 4206 struct rwlock *lock; 4207 pv_entry_t pv, pvf; 4208 pmap_t pmap; 4209 pd_entry_t *l2, l2e; 4210 pt_entry_t *l3, l3e; 4211 vm_paddr_t pa; 4212 vm_offset_t va; 4213 int cleared, md_gen, not_cleared, pvh_gen; 4214 4215 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4216 ("pmap_ts_referenced: page %p is not managed", m)); 4217 SLIST_INIT(&free); 4218 cleared = 0; 4219 pa = VM_PAGE_TO_PHYS(m); 4220 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 4221 4222 lock = PHYS_TO_PV_LIST_LOCK(pa); 4223 rw_rlock(&pvh_global_lock); 4224 rw_wlock(lock); 4225 retry: 4226 not_cleared = 0; 4227 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4228 goto small_mappings; 4229 pv = pvf; 4230 do { 4231 pmap = PV_PMAP(pv); 4232 if (!PMAP_TRYLOCK(pmap)) { 4233 pvh_gen = pvh->pv_gen; 4234 rw_wunlock(lock); 4235 PMAP_LOCK(pmap); 4236 rw_wlock(lock); 4237 if (pvh_gen != pvh->pv_gen) { 4238 PMAP_UNLOCK(pmap); 4239 goto retry; 4240 } 4241 } 4242 va = pv->pv_va; 4243 l2 = pmap_l2(pmap, va); 4244 l2e = pmap_load(l2); 4245 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4246 /* 4247 * Although l2e is mapping a 2MB page, because 4248 * this function is called at a 4KB page granularity, 4249 * we only update the 4KB page under test. 4250 */ 4251 vm_page_dirty(m); 4252 } 4253 if ((l2e & PTE_A) != 0) { 4254 /* 4255 * Since this reference bit is shared by 512 4KB 4256 * pages, it should not be cleared every time it is 4257 * tested. Apply a simple "hash" function on the 4258 * physical page number, the virtual superpage number, 4259 * and the pmap address to select one 4KB page out of 4260 * the 512 on which testing the reference bit will 4261 * result in clearing that reference bit. This 4262 * function is designed to avoid the selection of the 4263 * same 4KB page for every 2MB page mapping. 4264 * 4265 * On demotion, a mapping that hasn't been referenced 4266 * is simply destroyed. To avoid the possibility of a 4267 * subsequent page fault on a demoted wired mapping, 4268 * always leave its reference bit set. Moreover, 4269 * since the superpage is wired, the current state of 4270 * its reference bit won't affect page replacement. 4271 */ 4272 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4273 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4274 (l2e & PTE_SW_WIRED) == 0) { 4275 pmap_clear_bits(l2, PTE_A); 4276 pmap_invalidate_page(pmap, va); 4277 cleared++; 4278 } else 4279 not_cleared++; 4280 } 4281 PMAP_UNLOCK(pmap); 4282 /* Rotate the PV list if it has more than one entry. */ 4283 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4284 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4285 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4286 pvh->pv_gen++; 4287 } 4288 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4289 goto out; 4290 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4291 small_mappings: 4292 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4293 goto out; 4294 pv = pvf; 4295 do { 4296 pmap = PV_PMAP(pv); 4297 if (!PMAP_TRYLOCK(pmap)) { 4298 pvh_gen = pvh->pv_gen; 4299 md_gen = m->md.pv_gen; 4300 rw_wunlock(lock); 4301 PMAP_LOCK(pmap); 4302 rw_wlock(lock); 4303 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4304 PMAP_UNLOCK(pmap); 4305 goto retry; 4306 } 4307 } 4308 l2 = pmap_l2(pmap, pv->pv_va); 4309 4310 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4311 ("pmap_ts_referenced: found an invalid l2 table")); 4312 4313 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4314 l3e = pmap_load(l3); 4315 if ((l3e & PTE_D) != 0) 4316 vm_page_dirty(m); 4317 if ((l3e & PTE_A) != 0) { 4318 if ((l3e & PTE_SW_WIRED) == 0) { 4319 /* 4320 * Wired pages cannot be paged out so 4321 * doing accessed bit emulation for 4322 * them is wasted effort. We do the 4323 * hard work for unwired pages only. 4324 */ 4325 pmap_clear_bits(l3, PTE_A); 4326 pmap_invalidate_page(pmap, pv->pv_va); 4327 cleared++; 4328 } else 4329 not_cleared++; 4330 } 4331 PMAP_UNLOCK(pmap); 4332 /* Rotate the PV list if it has more than one entry. 
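 * This way, successive scans that stop early at PMAP_TS_REFERENCED_MAX
 * do not keep examining the same subset of a heavily shared page's
 * mappings first.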
*/ 4333 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4334 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4335 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4336 m->md.pv_gen++; 4337 } 4338 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4339 not_cleared < PMAP_TS_REFERENCED_MAX); 4340 out: 4341 rw_wunlock(lock); 4342 rw_runlock(&pvh_global_lock); 4343 vm_page_free_pages_toq(&free, false); 4344 return (cleared + not_cleared); 4345 } 4346 4347 /* 4348 * Apply the given advice to the specified range of addresses within the 4349 * given pmap. Depending on the advice, clear the referenced and/or 4350 * modified flags in each mapping and set the mapped page's dirty field. 4351 */ 4352 void 4353 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4354 { 4355 } 4356 4357 /* 4358 * Clear the modify bits on the specified physical page. 4359 */ 4360 void 4361 pmap_clear_modify(vm_page_t m) 4362 { 4363 struct md_page *pvh; 4364 struct rwlock *lock; 4365 pmap_t pmap; 4366 pv_entry_t next_pv, pv; 4367 pd_entry_t *l2, oldl2; 4368 pt_entry_t *l3; 4369 vm_offset_t va; 4370 int md_gen, pvh_gen; 4371 4372 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4373 ("pmap_clear_modify: page %p is not managed", m)); 4374 vm_page_assert_busied(m); 4375 4376 if (!pmap_page_is_write_mapped(m)) 4377 return; 4378 4379 /* 4380 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4381 * If the object containing the page is locked and the page is not 4382 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4383 */ 4384 if ((m->a.flags & PGA_WRITEABLE) == 0) 4385 return; 4386 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4387 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4388 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4389 rw_rlock(&pvh_global_lock); 4390 rw_wlock(lock); 4391 restart: 4392 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4393 pmap = PV_PMAP(pv); 4394 if (!PMAP_TRYLOCK(pmap)) { 4395 pvh_gen = pvh->pv_gen; 4396 rw_wunlock(lock); 4397 PMAP_LOCK(pmap); 4398 rw_wlock(lock); 4399 if (pvh_gen != pvh->pv_gen) { 4400 PMAP_UNLOCK(pmap); 4401 goto restart; 4402 } 4403 } 4404 va = pv->pv_va; 4405 l2 = pmap_l2(pmap, va); 4406 oldl2 = pmap_load(l2); 4407 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4408 if ((oldl2 & PTE_W) != 0 && 4409 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4410 (oldl2 & PTE_SW_WIRED) == 0) { 4411 /* 4412 * Write protect the mapping to a single page so that 4413 * a subsequent write access may repromote. 
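 * The demoted 2MB mapping covers Ln_ENTRIES 4KB pages; the arithmetic
 * below advances "va" to the one 4KB page within that range that maps
 * "m" before clearing PTE_D and PTE_W on its L3 entry.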
4414 */ 4415 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4416 l3 = pmap_l2_to_l3(l2, va); 4417 pmap_clear_bits(l3, PTE_D | PTE_W); 4418 vm_page_dirty(m); 4419 pmap_invalidate_page(pmap, va); 4420 } 4421 PMAP_UNLOCK(pmap); 4422 } 4423 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4424 pmap = PV_PMAP(pv); 4425 if (!PMAP_TRYLOCK(pmap)) { 4426 md_gen = m->md.pv_gen; 4427 pvh_gen = pvh->pv_gen; 4428 rw_wunlock(lock); 4429 PMAP_LOCK(pmap); 4430 rw_wlock(lock); 4431 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4432 PMAP_UNLOCK(pmap); 4433 goto restart; 4434 } 4435 } 4436 l2 = pmap_l2(pmap, pv->pv_va); 4437 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4438 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4439 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4440 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4441 pmap_clear_bits(l3, PTE_D | PTE_W); 4442 pmap_invalidate_page(pmap, pv->pv_va); 4443 } 4444 PMAP_UNLOCK(pmap); 4445 } 4446 rw_wunlock(lock); 4447 rw_runlock(&pvh_global_lock); 4448 } 4449 4450 void * 4451 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4452 { 4453 4454 return ((void *)PHYS_TO_DMAP(pa)); 4455 } 4456 4457 void 4458 pmap_unmapbios(vm_paddr_t pa, vm_size_t size) 4459 { 4460 } 4461 4462 /* 4463 * Sets the memory attribute for the specified page. 4464 */ 4465 void 4466 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4467 { 4468 4469 m->md.pv_memattr = ma; 4470 4471 /* 4472 * If "m" is a normal page, update its direct mapping. This update 4473 * can be relied upon to perform any cache operations that are 4474 * required for data coherence. 4475 */ 4476 if ((m->flags & PG_FICTITIOUS) == 0 && 4477 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4478 m->md.pv_memattr) != 0) 4479 panic("memory attribute change on the direct map failed"); 4480 } 4481 4482 /* 4483 * Changes the specified virtual address range's memory type to that given by 4484 * the parameter "mode". The specified virtual address range must be 4485 * completely contained within either the direct map or the kernel map. 4486 * 4487 * Returns zero if the change completed successfully, and either EINVAL or 4488 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4489 * of the virtual address range was not mapped, and ENOMEM is returned if 4490 * there was insufficient memory available to complete the change. In the 4491 * latter case, the memory type may have been changed on some part of the 4492 * virtual address range. 
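 *
 * As the TODO comments in pmap_change_attr_locked() note, this
 * implementation does not yet rewrite any PTE attributes; until the
 * Svpbmt extension is supported it only verifies that the entire range
 * is mapped.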
4493 */ 4494 int 4495 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4496 { 4497 int error; 4498 4499 PMAP_LOCK(kernel_pmap); 4500 error = pmap_change_attr_locked(va, size, mode); 4501 PMAP_UNLOCK(kernel_pmap); 4502 return (error); 4503 } 4504 4505 static int 4506 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4507 { 4508 vm_offset_t base, offset, tmpva; 4509 pd_entry_t *l1, l1e; 4510 pd_entry_t *l2, l2e; 4511 pt_entry_t *l3, l3e; 4512 4513 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4514 base = trunc_page(va); 4515 offset = va & PAGE_MASK; 4516 size = round_page(offset + size); 4517 4518 if (!VIRT_IN_DMAP(base) && 4519 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4520 return (EINVAL); 4521 4522 for (tmpva = base; tmpva < base + size; ) { 4523 l1 = pmap_l1(kernel_pmap, tmpva); 4524 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4525 return (EINVAL); 4526 if ((l1e & PTE_RWX) != 0) { 4527 /* 4528 * TODO: Demote if attributes don't match and there 4529 * isn't an L1 page left in the range, and update the 4530 * L1 entry if the attributes don't match but there is 4531 * an L1 page left in the range, once we support the 4532 * upcoming Svpbmt extension. 4533 */ 4534 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4535 continue; 4536 } 4537 l2 = pmap_l1_to_l2(l1, tmpva); 4538 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 4539 return (EINVAL); 4540 if ((l2e & PTE_RWX) != 0) { 4541 /* 4542 * TODO: Demote if attributes don't match and there 4543 * isn't an L2 page left in the range, and update the 4544 * L2 entry if the attributes don't match but there is 4545 * an L2 page left in the range, once we support the 4546 * upcoming Svpbmt extension. 4547 */ 4548 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4549 continue; 4550 } 4551 l3 = pmap_l2_to_l3(l2, tmpva); 4552 if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0) 4553 return (EINVAL); 4554 /* 4555 * TODO: Update the L3 entry if the attributes don't match once 4556 * we support the upcoming Svpbmt extension. 4557 */ 4558 tmpva += PAGE_SIZE; 4559 } 4560 4561 return (0); 4562 } 4563 4564 /* 4565 * Perform the pmap work for mincore(2). If the page is not both referenced and 4566 * modified by this pmap, returns its physical address so that the caller can 4567 * find other mappings. 
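 * The address is reported only for managed mappings; unmanaged mappings
 * have no PV list through which other mappings could be found.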
4568 */ 4569 int 4570 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4571 { 4572 pt_entry_t *l2, *l3, tpte; 4573 vm_paddr_t pa; 4574 int val; 4575 bool managed; 4576 4577 PMAP_LOCK(pmap); 4578 l2 = pmap_l2(pmap, addr); 4579 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4580 if ((tpte & PTE_RWX) != 0) { 4581 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4582 val = MINCORE_INCORE | MINCORE_PSIND(1); 4583 } else { 4584 l3 = pmap_l2_to_l3(l2, addr); 4585 tpte = pmap_load(l3); 4586 if ((tpte & PTE_V) == 0) { 4587 PMAP_UNLOCK(pmap); 4588 return (0); 4589 } 4590 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4591 val = MINCORE_INCORE; 4592 } 4593 4594 if ((tpte & PTE_D) != 0) 4595 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4596 if ((tpte & PTE_A) != 0) 4597 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4598 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4599 } else { 4600 managed = false; 4601 val = 0; 4602 } 4603 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4604 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4605 *pap = pa; 4606 } 4607 PMAP_UNLOCK(pmap); 4608 return (val); 4609 } 4610 4611 void 4612 pmap_activate_sw(struct thread *td) 4613 { 4614 pmap_t oldpmap, pmap; 4615 u_int hart; 4616 4617 oldpmap = PCPU_GET(curpmap); 4618 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4619 if (pmap == oldpmap) 4620 return; 4621 csr_write(satp, pmap->pm_satp); 4622 4623 hart = PCPU_GET(hart); 4624 #ifdef SMP 4625 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4626 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4627 #else 4628 CPU_SET(hart, &pmap->pm_active); 4629 CPU_CLR(hart, &oldpmap->pm_active); 4630 #endif 4631 PCPU_SET(curpmap, pmap); 4632 4633 sfence_vma(); 4634 } 4635 4636 void 4637 pmap_activate(struct thread *td) 4638 { 4639 4640 critical_enter(); 4641 pmap_activate_sw(td); 4642 critical_exit(); 4643 } 4644 4645 void 4646 pmap_activate_boot(pmap_t pmap) 4647 { 4648 u_int hart; 4649 4650 hart = PCPU_GET(hart); 4651 #ifdef SMP 4652 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4653 #else 4654 CPU_SET(hart, &pmap->pm_active); 4655 #endif 4656 PCPU_SET(curpmap, pmap); 4657 } 4658 4659 void 4660 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4661 { 4662 cpuset_t mask; 4663 4664 /* 4665 * From the RISC-V User-Level ISA V2.2: 4666 * 4667 * "To make a store to instruction memory visible to all 4668 * RISC-V harts, the writing hart has to execute a data FENCE 4669 * before requesting that all remote RISC-V harts execute a 4670 * FENCE.I." 4671 * 4672 * However, this is slightly misleading; we still need to 4673 * perform a FENCE.I for the local hart, as FENCE does nothing 4674 * for its icache. FENCE.I alone is also sufficient for the 4675 * local hart. 4676 */ 4677 sched_pin(); 4678 mask = all_harts; 4679 CPU_CLR(PCPU_GET(hart), &mask); 4680 fence_i(); 4681 if (!CPU_EMPTY(&mask) && smp_started) { 4682 fence(); 4683 sbi_remote_fence_i(mask.__bits); 4684 } 4685 sched_unpin(); 4686 } 4687 4688 /* 4689 * Increase the starting virtual address of the given mapping if a 4690 * different alignment might result in more superpage mappings. 
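 *
 * For example (illustrative numbers only): if the mapping's object
 * offset lies 0x3000 bytes into a 2MB-sized block and "*addr" is
 * currently 2MB-aligned, *addr is advanced so that
 * (*addr & L2_OFFSET) == 0x3000, letting object blocks and virtual
 * superpages coincide so that the bulk of the mapping can later be
 * promoted.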
4691 */ 4692 void 4693 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4694 vm_offset_t *addr, vm_size_t size) 4695 { 4696 vm_offset_t superpage_offset; 4697 4698 if (size < L2_SIZE) 4699 return; 4700 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4701 offset += ptoa(object->pg_color); 4702 superpage_offset = offset & L2_OFFSET; 4703 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4704 (*addr & L2_OFFSET) == superpage_offset) 4705 return; 4706 if ((*addr & L2_OFFSET) < superpage_offset) 4707 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4708 else 4709 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4710 } 4711 4712 /** 4713 * Get the kernel virtual address of a set of physical pages. If there are 4714 * physical addresses not covered by the DMAP perform a transient mapping 4715 * that will be removed when calling pmap_unmap_io_transient. 4716 * 4717 * \param page The pages the caller wishes to obtain the virtual 4718 * address on the kernel memory map. 4719 * \param vaddr On return contains the kernel virtual memory address 4720 * of the pages passed in the page parameter. 4721 * \param count Number of pages passed in. 4722 * \param can_fault TRUE if the thread using the mapped pages can take 4723 * page faults, FALSE otherwise. 4724 * 4725 * \returns TRUE if the caller must call pmap_unmap_io_transient when 4726 * finished or FALSE otherwise. 4727 * 4728 */ 4729 boolean_t 4730 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4731 boolean_t can_fault) 4732 { 4733 vm_paddr_t paddr; 4734 boolean_t needs_mapping; 4735 int error __diagused, i; 4736 4737 /* 4738 * Allocate any KVA space that we need, this is done in a separate 4739 * loop to prevent calling vmem_alloc while pinned. 4740 */ 4741 needs_mapping = FALSE; 4742 for (i = 0; i < count; i++) { 4743 paddr = VM_PAGE_TO_PHYS(page[i]); 4744 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4745 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4746 M_BESTFIT | M_WAITOK, &vaddr[i]); 4747 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4748 needs_mapping = TRUE; 4749 } else { 4750 vaddr[i] = PHYS_TO_DMAP(paddr); 4751 } 4752 } 4753 4754 /* Exit early if everything is covered by the DMAP */ 4755 if (!needs_mapping) 4756 return (FALSE); 4757 4758 if (!can_fault) 4759 sched_pin(); 4760 for (i = 0; i < count; i++) { 4761 paddr = VM_PAGE_TO_PHYS(page[i]); 4762 if (paddr >= DMAP_MAX_PHYSADDR) { 4763 panic( 4764 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4765 } 4766 } 4767 4768 return (needs_mapping); 4769 } 4770 4771 void 4772 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4773 boolean_t can_fault) 4774 { 4775 vm_paddr_t paddr; 4776 int i; 4777 4778 if (!can_fault) 4779 sched_unpin(); 4780 for (i = 0; i < count; i++) { 4781 paddr = VM_PAGE_TO_PHYS(page[i]); 4782 if (paddr >= DMAP_MAX_PHYSADDR) { 4783 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4784 } 4785 } 4786 } 4787 4788 boolean_t 4789 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4790 { 4791 4792 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4793 } 4794 4795 bool 4796 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4797 pt_entry_t **l3) 4798 { 4799 pd_entry_t *l1p, *l2p; 4800 4801 /* Get l1 directory entry. 
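 * If the l1 entry is itself a leaf, the walk stops there and *l2 and
 * *l3 are returned as NULL.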
*/ 4802 l1p = pmap_l1(pmap, va); 4803 *l1 = l1p; 4804 4805 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4806 return (false); 4807 4808 if ((pmap_load(l1p) & PTE_RX) != 0) { 4809 *l2 = NULL; 4810 *l3 = NULL; 4811 return (true); 4812 } 4813 4814 /* Get l2 directory entry. */ 4815 l2p = pmap_l1_to_l2(l1p, va); 4816 *l2 = l2p; 4817 4818 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4819 return (false); 4820 4821 if ((pmap_load(l2p) & PTE_RX) != 0) { 4822 *l3 = NULL; 4823 return (true); 4824 } 4825 4826 /* Get l3 page table entry. */ 4827 *l3 = pmap_l2_to_l3(l2p, va); 4828 4829 return (true); 4830 } 4831 4832 /* 4833 * Track a range of the kernel's virtual address space that is contiguous 4834 * in various mapping attributes. 4835 */ 4836 struct pmap_kernel_map_range { 4837 vm_offset_t sva; 4838 pt_entry_t attrs; 4839 int l3pages; 4840 int l2pages; 4841 int l1pages; 4842 }; 4843 4844 static void 4845 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4846 vm_offset_t eva) 4847 { 4848 4849 if (eva <= range->sva) 4850 return; 4851 4852 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4853 range->sva, eva, 4854 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4855 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4856 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4857 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4858 range->l1pages, range->l2pages, range->l3pages); 4859 4860 /* Reset to sentinel value. */ 4861 range->sva = 0xfffffffffffffffful; 4862 } 4863 4864 /* 4865 * Determine whether the attributes specified by a page table entry match those 4866 * being tracked by the current range. 4867 */ 4868 static bool 4869 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4870 { 4871 4872 return (range->attrs == attrs); 4873 } 4874 4875 static void 4876 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4877 pt_entry_t attrs) 4878 { 4879 4880 memset(range, 0, sizeof(*range)); 4881 range->sva = va; 4882 range->attrs = attrs; 4883 } 4884 4885 /* 4886 * Given a leaf PTE, derive the mapping's attributes. If they do not match 4887 * those of the current run, dump the address range and its attributes, and 4888 * begin a new run. 4889 */ 4890 static void 4891 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 4892 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 4893 { 4894 pt_entry_t attrs; 4895 4896 /* The PTE global bit is inherited by lower levels. */ 4897 attrs = l1e & PTE_G; 4898 if ((l1e & PTE_RWX) != 0) 4899 attrs |= l1e & (PTE_RWX | PTE_U); 4900 else if (l2e != 0) 4901 attrs |= l2e & PTE_G; 4902 if ((l2e & PTE_RWX) != 0) 4903 attrs |= l2e & (PTE_RWX | PTE_U); 4904 else if (l3e != 0) 4905 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 4906 4907 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 4908 sysctl_kmaps_dump(sb, range, va); 4909 sysctl_kmaps_reinit(range, va, attrs); 4910 } 4911 } 4912 4913 static int 4914 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 4915 { 4916 struct pmap_kernel_map_range range; 4917 struct sbuf sbuf, *sb; 4918 pd_entry_t l1e, *l2, l2e; 4919 pt_entry_t *l3, l3e; 4920 vm_offset_t sva; 4921 vm_paddr_t pa; 4922 int error, i, j, k; 4923 4924 error = sysctl_wire_old_buffer(req, 0); 4925 if (error != 0) 4926 return (error); 4927 sb = &sbuf; 4928 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 4929 4930 /* Sentinel value. */ 4931 range.sva = 0xfffffffffffffffful; 4932 4933 /* 4934 * Iterate over the kernel page tables without holding the kernel pmap 4935 * lock. 
Kernel page table pages are never freed, so at worst we will 4936 * observe inconsistencies in the output. 4937 */ 4938 sva = VM_MIN_KERNEL_ADDRESS; 4939 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 4940 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 4941 sbuf_printf(sb, "\nDirect map:\n"); 4942 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 4943 sbuf_printf(sb, "\nKernel map:\n"); 4944 4945 l1e = kernel_pmap->pm_top[i]; 4946 if ((l1e & PTE_V) == 0) { 4947 sysctl_kmaps_dump(sb, &range, sva); 4948 sva += L1_SIZE; 4949 continue; 4950 } 4951 if ((l1e & PTE_RWX) != 0) { 4952 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 4953 range.l1pages++; 4954 sva += L1_SIZE; 4955 continue; 4956 } 4957 pa = PTE_TO_PHYS(l1e); 4958 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4959 4960 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 4961 l2e = l2[j]; 4962 if ((l2e & PTE_V) == 0) { 4963 sysctl_kmaps_dump(sb, &range, sva); 4964 sva += L2_SIZE; 4965 continue; 4966 } 4967 if ((l2e & PTE_RWX) != 0) { 4968 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 4969 range.l2pages++; 4970 sva += L2_SIZE; 4971 continue; 4972 } 4973 pa = PTE_TO_PHYS(l2e); 4974 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4975 4976 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 4977 sva += L3_SIZE) { 4978 l3e = l3[k]; 4979 if ((l3e & PTE_V) == 0) { 4980 sysctl_kmaps_dump(sb, &range, sva); 4981 continue; 4982 } 4983 sysctl_kmaps_check(sb, &range, sva, 4984 l1e, l2e, l3e); 4985 range.l3pages++; 4986 } 4987 } 4988 } 4989 4990 error = sbuf_finish(sb); 4991 sbuf_delete(sb); 4992 return (error); 4993 } 4994 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 4995 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 4996 NULL, 0, sysctl_kmaps, "A", 4997 "Dump kernel address layout"); 4998
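/*
 * The dump produced by sysctl_kmaps() can be read from userspace with
 * "sysctl vm.pmap.kernel_maps"; because the OID is marked CTLFLAG_SKIP
 * it is not shown by a plain "sysctl -a" listing.
 */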