1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 */ 64 /*- 65 * Copyright (c) 2003 Networks Associates Technology, Inc. 66 * All rights reserved. 67 * 68 * This software was developed for the FreeBSD Project by Jake Burkholder, 69 * Safeport Network Services, and Network Associates Laboratories, the 70 * Security Research Division of Network Associates, Inc. 
under 71 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 72 * CHATS research program. 73 * 74 * Redistribution and use in source and binary forms, with or without 75 * modification, are permitted provided that the following conditions 76 * are met: 77 * 1. Redistributions of source code must retain the above copyright 78 * notice, this list of conditions and the following disclaimer. 79 * 2. Redistributions in binary form must reproduce the above copyright 80 * notice, this list of conditions and the following disclaimer in the 81 * documentation and/or other materials provided with the distribution. 82 * 83 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 84 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 85 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 86 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 93 * SUCH DAMAGE. 94 */ 95 96 #include <sys/cdefs.h> 97 /* 98 * Manages physical address maps. 99 * 100 * Since the information managed by this module is 101 * also stored by the logical address mapping module, 102 * this module may throw away valid virtual-to-physical 103 * mappings at almost any time. However, invalidations 104 * of virtual-to-physical mappings must be done as 105 * requested. 106 * 107 * In order to cope with hardware architectures which 108 * make virtual-to-physical map invalidates expensive, 109 * this module may delay invalidate or reduced protection 110 * operations until such time as they are actually 111 * necessary. This module is given full information as 112 * to which processors are currently using which maps, 113 * and to when physical maps must be made correct. 
114 */ 115 116 #include <sys/param.h> 117 #include <sys/systm.h> 118 #include <sys/bitstring.h> 119 #include <sys/bus.h> 120 #include <sys/cpuset.h> 121 #include <sys/kernel.h> 122 #include <sys/ktr.h> 123 #include <sys/lock.h> 124 #include <sys/malloc.h> 125 #include <sys/mman.h> 126 #include <sys/msgbuf.h> 127 #include <sys/mutex.h> 128 #include <sys/physmem.h> 129 #include <sys/proc.h> 130 #include <sys/rwlock.h> 131 #include <sys/sbuf.h> 132 #include <sys/sx.h> 133 #include <sys/vmem.h> 134 #include <sys/vmmeter.h> 135 #include <sys/sched.h> 136 #include <sys/sysctl.h> 137 #include <sys/smp.h> 138 139 #include <vm/vm.h> 140 #include <vm/vm_param.h> 141 #include <vm/vm_kern.h> 142 #include <vm/vm_page.h> 143 #include <vm/vm_map.h> 144 #include <vm/vm_object.h> 145 #include <vm/vm_extern.h> 146 #include <vm/vm_pageout.h> 147 #include <vm/vm_pager.h> 148 #include <vm/vm_phys.h> 149 #include <vm/vm_radix.h> 150 #include <vm/vm_reserv.h> 151 #include <vm/vm_dumpset.h> 152 #include <vm/uma.h> 153 154 #include <machine/machdep.h> 155 #include <machine/md_var.h> 156 #include <machine/pcb.h> 157 #include <machine/sbi.h> 158 159 /* 160 * Boundary values for the page table page index space: 161 * 162 * L3 pages: [0, NUL2E) 163 * L2 pages: [NUL2E, NUL2E + NUL1E) 164 * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) 165 * 166 * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the 167 * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages 168 * in a set of page tables. 169 */ 170 #define NUL0E Ln_ENTRIES 171 #define NUL1E (Ln_ENTRIES * NUL0E) 172 #define NUL2E (Ln_ENTRIES * NUL1E) 173 174 #ifdef PV_STATS 175 #define PV_STAT(x) do { x ; } while (0) 176 #define __pv_stat_used 177 #else 178 #define PV_STAT(x) do { } while (0) 179 #define __pv_stat_used __unused 180 #endif 181 182 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 183 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 184 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 185 186 #define NPV_LIST_LOCKS MAXCPU 187 188 #define PHYS_TO_PV_LIST_LOCK(pa) \ 189 (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) 190 191 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 192 struct rwlock **_lockp = (lockp); \ 193 struct rwlock *_new_lock; \ 194 \ 195 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 196 if (_new_lock != *_lockp) { \ 197 if (*_lockp != NULL) \ 198 rw_wunlock(*_lockp); \ 199 *_lockp = _new_lock; \ 200 rw_wlock(*_lockp); \ 201 } \ 202 } while (0) 203 204 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 205 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 206 207 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 208 struct rwlock **_lockp = (lockp); \ 209 \ 210 if (*_lockp != NULL) { \ 211 rw_wunlock(*_lockp); \ 212 *_lockp = NULL; \ 213 } \ 214 } while (0) 215 216 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 217 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 218 219 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 220 "VM/pmap parameters"); 221 222 /* The list of all the user pmaps */ 223 LIST_HEAD(pmaplist, pmap); 224 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); 225 226 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39; 227 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 228 &pmap_mode, 0, 229 "translation mode, 0 = SV39, 1 = SV48"); 230 231 struct pmap kernel_pmap_store; 232 233 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 234 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 235 
vm_offset_t kernel_vm_end = 0; 236 237 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 238 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 239 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 240 241 /* This code assumes all L1 DMAP entries will be used */ 242 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); 243 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); 244 245 /* 246 * This code assumes that the early DEVMAP is L2_SIZE aligned and is fully 247 * contained within a single L2 entry. The early DTB is mapped immediately 248 * before the devmap L2 entry. 249 */ 250 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0); 251 CTASSERT((VM_EARLY_DTB_ADDRESS & L2_OFFSET) == 0); 252 CTASSERT(VM_EARLY_DTB_ADDRESS < (VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE)); 253 254 static struct rwlock_padalign pvh_global_lock; 255 static struct mtx_padalign allpmaps_lock; 256 257 static int __read_frequently superpages_enabled = 1; 258 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 259 CTLFLAG_RDTUN, &superpages_enabled, 0, 260 "Enable support for transparent superpages"); 261 262 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 263 "2MB page mapping counters"); 264 265 static u_long pmap_l2_demotions; 266 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 267 &pmap_l2_demotions, 0, 268 "2MB page demotions"); 269 270 static u_long pmap_l2_mappings; 271 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 272 &pmap_l2_mappings, 0, 273 "2MB page mappings"); 274 275 static u_long pmap_l2_p_failures; 276 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 277 &pmap_l2_p_failures, 0, 278 "2MB page promotion failures"); 279 280 static u_long pmap_l2_promotions; 281 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 282 &pmap_l2_promotions, 0, 283 "2MB page promotions"); 284 285 /* 286 * Data for the pv entry allocation mechanism 287 */ 288 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 289 static struct mtx pv_chunks_mutex; 290 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 291 static struct md_page *pv_table; 292 static struct md_page pv_dummy; 293 294 extern cpuset_t all_harts; 295 296 /* 297 * Internal flags for pmap_enter()'s helper functions. 298 */ 299 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 300 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. 
*/ 301 302 static void free_pv_chunk(struct pv_chunk *pc); 303 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 304 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 305 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 306 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 307 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 308 vm_offset_t va); 309 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); 310 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, 311 vm_offset_t va, struct rwlock **lockp); 312 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 313 u_int flags, vm_page_t m, struct rwlock **lockp); 314 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 315 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 316 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 317 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 318 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 319 vm_page_t m, struct rwlock **lockp); 320 321 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 322 struct rwlock **lockp); 323 324 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 325 struct spglist *free); 326 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 327 328 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 329 330 #define pmap_clear(pte) pmap_store(pte, 0) 331 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) 332 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) 333 #define pmap_load_clear(pte) pmap_load_store(pte, 0) 334 #define pmap_load(pte) atomic_load_64(pte) 335 #define pmap_store(pte, entry) atomic_store_64(pte, entry) 336 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) 337 338 /********************/ 339 /* Inline functions */ 340 /********************/ 341 342 static __inline void 343 pagecopy(void *s, void *d) 344 { 345 346 memcpy(d, s, PAGE_SIZE); 347 } 348 349 static __inline void 350 pagezero(void *p) 351 { 352 353 bzero(p, PAGE_SIZE); 354 } 355 356 #define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK) 357 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) 358 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) 359 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) 360 361 #define PTE_TO_PHYS(pte) \ 362 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) 363 #define L2PTE_TO_PHYS(l2) \ 364 ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT) 365 366 static __inline pd_entry_t * 367 pmap_l0(pmap_t pmap, vm_offset_t va) 368 { 369 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__)); 370 KASSERT(VIRT_IS_VALID(va), 371 ("%s: malformed virtual address %#lx", __func__, va)); 372 return (&pmap->pm_top[pmap_l0_index(va)]); 373 } 374 375 static __inline pd_entry_t * 376 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 377 { 378 vm_paddr_t phys; 379 pd_entry_t *l1; 380 381 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__)); 382 phys = PTE_TO_PHYS(pmap_load(l0)); 383 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 384 385 return (&l1[pmap_l1_index(va)]); 386 } 387 388 static __inline pd_entry_t * 389 pmap_l1(pmap_t pmap, vm_offset_t va) 390 { 391 pd_entry_t *l0; 392 393 KASSERT(VIRT_IS_VALID(va), 394 ("%s: malformed virtual address %#lx", __func__, va)); 395 if (pmap_mode 
== PMAP_MODE_SV39) { 396 return (&pmap->pm_top[pmap_l1_index(va)]); 397 } else { 398 l0 = pmap_l0(pmap, va); 399 if ((pmap_load(l0) & PTE_V) == 0) 400 return (NULL); 401 if ((pmap_load(l0) & PTE_RX) != 0) 402 return (NULL); 403 return (pmap_l0_to_l1(l0, va)); 404 } 405 } 406 407 static __inline pd_entry_t * 408 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 409 { 410 vm_paddr_t phys; 411 pd_entry_t *l2; 412 413 phys = PTE_TO_PHYS(pmap_load(l1)); 414 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 415 416 return (&l2[pmap_l2_index(va)]); 417 } 418 419 static __inline pd_entry_t * 420 pmap_l2(pmap_t pmap, vm_offset_t va) 421 { 422 pd_entry_t *l1; 423 424 l1 = pmap_l1(pmap, va); 425 if (l1 == NULL) 426 return (NULL); 427 if ((pmap_load(l1) & PTE_V) == 0) 428 return (NULL); 429 if ((pmap_load(l1) & PTE_RX) != 0) 430 return (NULL); 431 432 return (pmap_l1_to_l2(l1, va)); 433 } 434 435 static __inline pt_entry_t * 436 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 437 { 438 vm_paddr_t phys; 439 pt_entry_t *l3; 440 441 phys = PTE_TO_PHYS(pmap_load(l2)); 442 l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); 443 444 return (&l3[pmap_l3_index(va)]); 445 } 446 447 static __inline pt_entry_t * 448 pmap_l3(pmap_t pmap, vm_offset_t va) 449 { 450 pd_entry_t *l2; 451 452 l2 = pmap_l2(pmap, va); 453 if (l2 == NULL) 454 return (NULL); 455 if ((pmap_load(l2) & PTE_V) == 0) 456 return (NULL); 457 if ((pmap_load(l2) & PTE_RX) != 0) 458 return (NULL); 459 460 return (pmap_l2_to_l3(l2, va)); 461 } 462 463 static __inline void 464 pmap_resident_count_inc(pmap_t pmap, int count) 465 { 466 467 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 468 pmap->pm_stats.resident_count += count; 469 } 470 471 static __inline void 472 pmap_resident_count_dec(pmap_t pmap, int count) 473 { 474 475 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 476 KASSERT(pmap->pm_stats.resident_count >= count, 477 ("pmap %p resident count underflow %ld %d", pmap, 478 pmap->pm_stats.resident_count, count)); 479 pmap->pm_stats.resident_count -= count; 480 } 481 482 static void 483 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, 484 pt_entry_t entry) 485 { 486 struct pmap *user_pmap; 487 pd_entry_t *l1; 488 489 /* 490 * Distribute new kernel L1 entry to all the user pmaps. This is only 491 * necessary with three-level paging configured: with four-level paging 492 * the kernel's half of the top-level page table page is static and can 493 * simply be copied at pmap initialization time. 
494 */ 495 if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39) 496 return; 497 498 mtx_lock(&allpmaps_lock); 499 LIST_FOREACH(user_pmap, &allpmaps, pm_list) { 500 l1 = &user_pmap->pm_top[l1index]; 501 pmap_store(l1, entry); 502 } 503 mtx_unlock(&allpmaps_lock); 504 } 505 506 static pt_entry_t * 507 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, 508 u_int *l2_slot) 509 { 510 pt_entry_t *l2; 511 pd_entry_t *l1 __diagused; 512 513 l1 = (pd_entry_t *)l1pt; 514 *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; 515 516 /* Check locore has used a table L1 map */ 517 KASSERT((l1[*l1_slot] & PTE_RX) == 0, 518 ("Invalid bootstrap L1 table")); 519 520 /* Find the address of the L2 table */ 521 l2 = (pt_entry_t *)init_pt_va; 522 *l2_slot = pmap_l2_index(va); 523 524 return (l2); 525 } 526 527 static vm_paddr_t 528 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 529 { 530 u_int l1_slot, l2_slot; 531 pt_entry_t *l2; 532 vm_paddr_t ret; 533 534 l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); 535 536 /* Check locore has used L2 superpages */ 537 KASSERT((l2[l2_slot] & PTE_RX) != 0, 538 ("Invalid bootstrap L2 table")); 539 540 /* L2 is superpages */ 541 ret = L2PTE_TO_PHYS(l2[l2_slot]); 542 ret += (va & L2_OFFSET); 543 544 return (ret); 545 } 546 547 static void 548 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) 549 { 550 vm_offset_t va; 551 vm_paddr_t pa; 552 pd_entry_t *l1; 553 u_int l1_slot; 554 pt_entry_t entry; 555 pn_t pn; 556 557 pa = dmap_phys_base = min_pa & ~L1_OFFSET; 558 va = DMAP_MIN_ADDRESS; 559 l1 = (pd_entry_t *)kern_l1; 560 l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); 561 562 for (; va < DMAP_MAX_ADDRESS && pa < max_pa; 563 pa += L1_SIZE, va += L1_SIZE, l1_slot++) { 564 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 565 566 /* superpages */ 567 pn = (pa / PAGE_SIZE); 568 entry = PTE_KERN; 569 entry |= (pn << PTE_PPN0_S); 570 pmap_store(&l1[l1_slot], entry); 571 } 572 573 /* Set the upper limit of the DMAP region */ 574 dmap_phys_max = pa; 575 dmap_max_addr = va; 576 577 sfence_vma(); 578 } 579 580 static vm_offset_t 581 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 582 { 583 vm_offset_t l3pt; 584 pt_entry_t entry; 585 pd_entry_t *l2; 586 vm_paddr_t pa; 587 u_int l2_slot; 588 pn_t pn; 589 590 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 591 592 l2 = pmap_l2(kernel_pmap, va); 593 l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); 594 l2_slot = pmap_l2_index(va); 595 l3pt = l3_start; 596 597 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 598 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 599 600 pa = pmap_early_vtophys(l1pt, l3pt); 601 pn = (pa / PAGE_SIZE); 602 entry = (PTE_V); 603 entry |= (pn << PTE_PPN0_S); 604 pmap_store(&l2[l2_slot], entry); 605 l3pt += PAGE_SIZE; 606 } 607 608 /* Clean the L2 page table */ 609 memset((void *)l3_start, 0, l3pt - l3_start); 610 611 return (l3pt); 612 } 613 614 /* 615 * Bootstrap the system enough to run with virtual memory. 
616 */ 617 void 618 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) 619 { 620 vm_paddr_t physmap[PHYS_AVAIL_ENTRIES]; 621 uint64_t satp; 622 vm_offset_t dpcpu, freemempos, l0pv, msgbufpv; 623 vm_paddr_t l0pa, l1pa, max_pa, min_pa, pa; 624 pd_entry_t *l0p; 625 pt_entry_t *l2p; 626 u_int l1_slot, l2_slot; 627 u_int physmap_idx; 628 int i, mode; 629 630 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 631 632 /* Set this early so we can use the pagetable walking functions */ 633 kernel_pmap_store.pm_top = (pd_entry_t *)l1pt; 634 PMAP_LOCK_INIT(kernel_pmap); 635 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 636 vm_radix_init(&kernel_pmap->pm_root); 637 638 rw_init(&pvh_global_lock, "pmap pv global"); 639 640 /* 641 * Set the current CPU as active in the kernel pmap. Secondary cores 642 * will add themselves later in init_secondary(). The SBI firmware 643 * may rely on this mask being precise, so CPU_FILL() is not used. 644 */ 645 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); 646 647 /* Assume the address we were loaded to is a valid physical address. */ 648 min_pa = max_pa = kernstart; 649 650 physmap_idx = physmem_avail(physmap, nitems(physmap)); 651 physmap_idx /= 2; 652 653 /* 654 * Find the minimum physical address. physmap is sorted, 655 * but may contain empty ranges. 656 */ 657 for (i = 0; i < physmap_idx * 2; i += 2) { 658 if (physmap[i] == physmap[i + 1]) 659 continue; 660 if (physmap[i] <= min_pa) 661 min_pa = physmap[i]; 662 if (physmap[i + 1] > max_pa) 663 max_pa = physmap[i + 1]; 664 } 665 printf("physmap_idx %u\n", physmap_idx); 666 printf("min_pa %lx\n", min_pa); 667 printf("max_pa %lx\n", max_pa); 668 669 /* Create a direct map region early so we can use it for pa -> va */ 670 pmap_bootstrap_dmap(l1pt, min_pa, max_pa); 671 672 /* 673 * Read the page table to find out what is already mapped. 674 * This assumes we have mapped a block of memory from KERNBASE 675 * using a single L1 entry. 676 */ 677 (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 678 679 /* Sanity check the index, KERNBASE should be the first VA */ 680 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 681 682 freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); 683 684 /* Create the l3 tables for the early devmap */ 685 freemempos = pmap_bootstrap_l3(l1pt, 686 VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE, freemempos); 687 688 /* 689 * Invalidate the mapping we created for the DTB. At this point a copy 690 * has been created, and we no longer need it. We want to avoid the 691 * possibility of an aliased mapping in the future. 692 */ 693 l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); 694 if ((pmap_load(l2p) & PTE_V) != 0) 695 pmap_clear(l2p); 696 697 sfence_vma(); 698 699 #define alloc_pages(var, np) \ 700 (var) = freemempos; \ 701 freemempos += (np * PAGE_SIZE); \ 702 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 703 704 mode = 0; 705 TUNABLE_INT_FETCH("vm.pmap.mode", &mode); 706 if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) { 707 /* 708 * Enable SV48 mode: allocate an L0 page and set SV48 mode in 709 * SATP. If the implementation does not provide SV48 mode, 710 * the mode read back from the (WARL) SATP register will be 711 * unchanged, and we continue in SV39 mode. 
712 */ 713 alloc_pages(l0pv, 1); 714 l0p = (void *)l0pv; 715 l1pa = pmap_early_vtophys(l1pt, l1pt); 716 l0p[pmap_l0_index(KERNBASE)] = PTE_V | PTE_A | PTE_D | 717 ((l1pa >> PAGE_SHIFT) << PTE_PPN0_S); 718 719 l0pa = pmap_early_vtophys(l1pt, l0pv); 720 csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48); 721 satp = csr_read(satp); 722 if ((satp & SATP_MODE_M) == SATP_MODE_SV48) { 723 pmap_mode = PMAP_MODE_SV48; 724 kernel_pmap_store.pm_top = l0p; 725 } else { 726 /* Mode didn't change, give the page back. */ 727 freemempos -= PAGE_SIZE; 728 } 729 } 730 731 /* Allocate dynamic per-cpu area. */ 732 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 733 dpcpu_init((void *)dpcpu, 0); 734 735 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 736 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 737 msgbufp = (void *)msgbufpv; 738 739 virtual_avail = roundup2(freemempos, L2_SIZE); 740 virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE; 741 kernel_vm_end = virtual_avail; 742 743 pa = pmap_early_vtophys(l1pt, freemempos); 744 745 physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); 746 } 747 748 /* 749 * Initialize a vm_page's machine-dependent fields. 750 */ 751 void 752 pmap_page_init(vm_page_t m) 753 { 754 755 TAILQ_INIT(&m->md.pv_list); 756 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 757 } 758 759 /* 760 * Initialize the pmap module. 761 * Called by vm_init, to initialize any structures that the pmap 762 * system needs to map virtual memory. 763 */ 764 void 765 pmap_init(void) 766 { 767 vm_size_t s; 768 int i, pv_npg; 769 770 /* 771 * Initialize the pv chunk and pmap list mutexes. 772 */ 773 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 774 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 775 776 /* 777 * Initialize the pool of pv list locks. 778 */ 779 for (i = 0; i < NPV_LIST_LOCKS; i++) 780 rw_init(&pv_list_locks[i], "pmap pv list"); 781 782 /* 783 * Calculate the size of the pv head table for superpages. 784 */ 785 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 786 787 /* 788 * Allocate memory for the pv head table for superpages. 789 */ 790 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 791 s = round_page(s); 792 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 793 for (i = 0; i < pv_npg; i++) 794 TAILQ_INIT(&pv_table[i].pv_list); 795 TAILQ_INIT(&pv_dummy.pv_list); 796 797 if (superpages_enabled) 798 pagesizes[1] = L2_SIZE; 799 } 800 801 #ifdef SMP 802 /* 803 * For SMP, these functions have to use IPIs for coherence. 804 * 805 * In general, the calling thread uses a plain fence to order the 806 * writes to the page tables before invoking an SBI callback to invoke 807 * sfence_vma() on remote CPUs. 808 */ 809 static void 810 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 811 { 812 cpuset_t mask; 813 814 sched_pin(); 815 mask = pmap->pm_active; 816 CPU_CLR(PCPU_GET(hart), &mask); 817 fence(); 818 if (!CPU_EMPTY(&mask) && smp_started) 819 sbi_remote_sfence_vma(mask.__bits, va, 1); 820 sfence_vma_page(va); 821 sched_unpin(); 822 } 823 824 static void 825 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 826 { 827 cpuset_t mask; 828 829 sched_pin(); 830 mask = pmap->pm_active; 831 CPU_CLR(PCPU_GET(hart), &mask); 832 fence(); 833 if (!CPU_EMPTY(&mask) && smp_started) 834 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); 835 836 /* 837 * Might consider a loop of sfence_vma_page() for a small 838 * number of pages in the future. 
839 */ 840 sfence_vma(); 841 sched_unpin(); 842 } 843 844 static void 845 pmap_invalidate_all(pmap_t pmap) 846 { 847 cpuset_t mask; 848 849 sched_pin(); 850 mask = pmap->pm_active; 851 CPU_CLR(PCPU_GET(hart), &mask); 852 853 /* 854 * XXX: The SBI doc doesn't detail how to specify x0 as the 855 * address to perform a global fence. BBL currently treats 856 * all sfence_vma requests as global however. 857 */ 858 fence(); 859 if (!CPU_EMPTY(&mask) && smp_started) 860 sbi_remote_sfence_vma(mask.__bits, 0, 0); 861 sfence_vma(); 862 sched_unpin(); 863 } 864 #else 865 /* 866 * Normal, non-SMP, invalidation functions. 867 * We inline these within pmap.c for speed. 868 */ 869 static __inline void 870 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 871 { 872 873 sfence_vma_page(va); 874 } 875 876 static __inline void 877 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 878 { 879 880 /* 881 * Might consider a loop of sfence_vma_page() for a small 882 * number of pages in the future. 883 */ 884 sfence_vma(); 885 } 886 887 static __inline void 888 pmap_invalidate_all(pmap_t pmap) 889 { 890 891 sfence_vma(); 892 } 893 #endif 894 895 /* 896 * Routine: pmap_extract 897 * Function: 898 * Extract the physical page address associated 899 * with the given map/virtual_address pair. 900 */ 901 vm_paddr_t 902 pmap_extract(pmap_t pmap, vm_offset_t va) 903 { 904 pd_entry_t *l2p, l2; 905 pt_entry_t *l3p; 906 vm_paddr_t pa; 907 908 pa = 0; 909 910 /* 911 * Start with an L2 lookup, L1 superpages are currently not implemented. 912 */ 913 PMAP_LOCK(pmap); 914 l2p = pmap_l2(pmap, va); 915 if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) { 916 if ((l2 & PTE_RWX) == 0) { 917 l3p = pmap_l2_to_l3(l2p, va); 918 if (l3p != NULL) { 919 pa = PTE_TO_PHYS(pmap_load(l3p)); 920 pa |= (va & L3_OFFSET); 921 } 922 } else { 923 /* L2 is a superpage mapping. */ 924 pa = L2PTE_TO_PHYS(l2); 925 pa |= (va & L2_OFFSET); 926 } 927 } 928 PMAP_UNLOCK(pmap); 929 return (pa); 930 } 931 932 /* 933 * Routine: pmap_extract_and_hold 934 * Function: 935 * Atomically extract and hold the physical page 936 * with the given pmap and virtual address pair 937 * if that mapping permits the given protection. 938 */ 939 vm_page_t 940 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 941 { 942 pt_entry_t *l3p, l3; 943 vm_paddr_t phys; 944 vm_page_t m; 945 946 m = NULL; 947 PMAP_LOCK(pmap); 948 l3p = pmap_l3(pmap, va); 949 if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { 950 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { 951 phys = PTE_TO_PHYS(l3); 952 m = PHYS_TO_VM_PAGE(phys); 953 if (!vm_page_wire_mapped(m)) 954 m = NULL; 955 } 956 } 957 PMAP_UNLOCK(pmap); 958 return (m); 959 } 960 961 /* 962 * Routine: pmap_kextract 963 * Function: 964 * Extract the physical page address associated with the given kernel 965 * virtual address. 966 */ 967 vm_paddr_t 968 pmap_kextract(vm_offset_t va) 969 { 970 pd_entry_t *l2, l2e; 971 pt_entry_t *l3; 972 vm_paddr_t pa; 973 974 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 975 pa = DMAP_TO_PHYS(va); 976 } else { 977 l2 = pmap_l2(kernel_pmap, va); 978 if (l2 == NULL) 979 panic("pmap_kextract: No l2"); 980 l2e = pmap_load(l2); 981 /* 982 * Beware of concurrent promotion and demotion! We must 983 * use l2e rather than loading from l2 multiple times to 984 * ensure we see a consistent state, including the 985 * implicit load in pmap_l2_to_l3. It is, however, safe 986 * to use an old l2e because the L3 page is preserved by 987 * promotion. 
988 */ 989 if ((l2e & PTE_RX) != 0) { 990 /* superpages */ 991 pa = L2PTE_TO_PHYS(l2e); 992 pa |= (va & L2_OFFSET); 993 return (pa); 994 } 995 996 l3 = pmap_l2_to_l3(&l2e, va); 997 if (l3 == NULL) 998 panic("pmap_kextract: No l3..."); 999 pa = PTE_TO_PHYS(pmap_load(l3)); 1000 pa |= (va & PAGE_MASK); 1001 } 1002 return (pa); 1003 } 1004 1005 /*************************************************** 1006 * Low level mapping routines..... 1007 ***************************************************/ 1008 1009 void 1010 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused) 1011 { 1012 pt_entry_t entry; 1013 pt_entry_t *l3; 1014 vm_offset_t va; 1015 pn_t pn; 1016 1017 KASSERT((pa & L3_OFFSET) == 0, 1018 ("pmap_kenter_device: Invalid physical address")); 1019 KASSERT((sva & L3_OFFSET) == 0, 1020 ("pmap_kenter_device: Invalid virtual address")); 1021 KASSERT((size & PAGE_MASK) == 0, 1022 ("pmap_kenter_device: Mapping is not page-sized")); 1023 1024 va = sva; 1025 while (size != 0) { 1026 l3 = pmap_l3(kernel_pmap, va); 1027 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1028 1029 pn = (pa / PAGE_SIZE); 1030 entry = PTE_KERN; 1031 entry |= (pn << PTE_PPN0_S); 1032 pmap_store(l3, entry); 1033 1034 va += PAGE_SIZE; 1035 pa += PAGE_SIZE; 1036 size -= PAGE_SIZE; 1037 } 1038 pmap_invalidate_range(kernel_pmap, sva, va); 1039 } 1040 1041 void 1042 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1043 { 1044 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1045 } 1046 1047 /* 1048 * Remove a page from the kernel pagetables. 1049 * Note: not SMP coherent. 1050 */ 1051 void 1052 pmap_kremove(vm_offset_t va) 1053 { 1054 pt_entry_t *l3; 1055 1056 l3 = pmap_l3(kernel_pmap, va); 1057 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1058 1059 pmap_clear(l3); 1060 sfence_vma(); 1061 } 1062 1063 void 1064 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1065 { 1066 pt_entry_t *l3; 1067 vm_offset_t va; 1068 1069 KASSERT((sva & L3_OFFSET) == 0, 1070 ("pmap_kremove_device: Invalid virtual address")); 1071 KASSERT((size & PAGE_MASK) == 0, 1072 ("pmap_kremove_device: Mapping is not page-sized")); 1073 1074 va = sva; 1075 while (size != 0) { 1076 l3 = pmap_l3(kernel_pmap, va); 1077 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1078 pmap_clear(l3); 1079 1080 va += PAGE_SIZE; 1081 size -= PAGE_SIZE; 1082 } 1083 1084 pmap_invalidate_range(kernel_pmap, sva, va); 1085 } 1086 1087 /* 1088 * Used to map a range of physical addresses into kernel 1089 * virtual address space. 1090 * 1091 * The value passed in '*virt' is a suggested virtual address for 1092 * the mapping. Architectures which can support a direct-mapped 1093 * physical to virtual region can return the appropriate address 1094 * within that region, leaving '*virt' unchanged. Other 1095 * architectures should map the pages starting at '*virt' and 1096 * update '*virt' with the first usable address after the mapped 1097 * region. 1098 */ 1099 vm_offset_t 1100 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1101 { 1102 1103 return PHYS_TO_DMAP(start); 1104 } 1105 1106 /* 1107 * Add a list of wired pages to the kva 1108 * this routine is only used for temporary 1109 * kernel mappings that do not need to have 1110 * page modification or references recorded. 1111 * Note that old mappings are simply written 1112 * over. The page *must* be wired. 1113 * Note: SMP coherent. Uses a ranged shootdown IPI. 
1114 */ 1115 void 1116 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1117 { 1118 pt_entry_t *l3, pa; 1119 vm_offset_t va; 1120 vm_page_t m; 1121 pt_entry_t entry; 1122 pn_t pn; 1123 int i; 1124 1125 va = sva; 1126 for (i = 0; i < count; i++) { 1127 m = ma[i]; 1128 pa = VM_PAGE_TO_PHYS(m); 1129 pn = (pa / PAGE_SIZE); 1130 l3 = pmap_l3(kernel_pmap, va); 1131 1132 entry = PTE_KERN; 1133 entry |= (pn << PTE_PPN0_S); 1134 pmap_store(l3, entry); 1135 1136 va += L3_SIZE; 1137 } 1138 pmap_invalidate_range(kernel_pmap, sva, va); 1139 } 1140 1141 /* 1142 * This routine tears out page mappings from the 1143 * kernel -- it is meant only for temporary mappings. 1144 * Note: SMP coherent. Uses a ranged shootdown IPI. 1145 */ 1146 void 1147 pmap_qremove(vm_offset_t sva, int count) 1148 { 1149 pt_entry_t *l3; 1150 vm_offset_t va; 1151 1152 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1153 1154 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1155 l3 = pmap_l3(kernel_pmap, va); 1156 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1157 pmap_clear(l3); 1158 } 1159 pmap_invalidate_range(kernel_pmap, sva, va); 1160 } 1161 1162 bool 1163 pmap_ps_enabled(pmap_t pmap __unused) 1164 { 1165 1166 return (superpages_enabled); 1167 } 1168 1169 /*************************************************** 1170 * Page table page management routines..... 1171 ***************************************************/ 1172 /* 1173 * Schedule the specified unused page table page to be freed. Specifically, 1174 * add the page to the specified list of pages that will be released to the 1175 * physical memory manager after the TLB has been updated. 1176 */ 1177 static __inline void 1178 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1179 boolean_t set_PG_ZERO) 1180 { 1181 1182 if (set_PG_ZERO) 1183 m->flags |= PG_ZERO; 1184 else 1185 m->flags &= ~PG_ZERO; 1186 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1187 } 1188 1189 /* 1190 * Inserts the specified page table page into the specified pmap's collection 1191 * of idle page table pages. Each of a pmap's page table pages is responsible 1192 * for mapping a distinct range of virtual addresses. The pmap's collection is 1193 * ordered by this virtual address range. 1194 * 1195 * If "promoted" is false, then the page table page "mpte" must be zero filled; 1196 * "mpte"'s valid field will be set to 0. 1197 * 1198 * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must 1199 * contain valid mappings with identical attributes except for PTE_A; 1200 * "mpte"'s valid field will be set to 1. 1201 * 1202 * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain 1203 * valid mappings with identical attributes including PTE_A; "mpte"'s valid 1204 * field will be set to VM_PAGE_BITS_ALL. 1205 */ 1206 static __inline int 1207 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1208 bool all_l3e_PTE_A_set) 1209 { 1210 1211 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1212 KASSERT(promoted || !all_l3e_PTE_A_set, 1213 ("a zero-filled PTP can't have PTE_A set in every PTE")); 1214 mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 1215 return (vm_radix_insert(&pmap->pm_root, mpte)); 1216 } 1217 1218 /* 1219 * Removes the page table page mapping the specified virtual address from the 1220 * specified pmap's collection of idle page table pages, and returns it. 1221 * Otherwise, returns NULL if there is no page table page corresponding to the 1222 * specified virtual address. 
1223 */ 1224 static __inline vm_page_t 1225 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1226 { 1227 1228 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1229 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1230 } 1231 1232 /* 1233 * Decrements a page table page's reference count, which is used to record the 1234 * number of valid page table entries within the page. If the reference count 1235 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1236 * page table page was unmapped and FALSE otherwise. 1237 */ 1238 static inline boolean_t 1239 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1240 { 1241 KASSERT(m->ref_count > 0, 1242 ("%s: page %p ref count underflow", __func__, m)); 1243 1244 --m->ref_count; 1245 if (m->ref_count == 0) { 1246 _pmap_unwire_ptp(pmap, va, m, free); 1247 return (TRUE); 1248 } else { 1249 return (FALSE); 1250 } 1251 } 1252 1253 static void 1254 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1255 { 1256 vm_paddr_t phys; 1257 1258 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1259 if (m->pindex >= NUL2E + NUL1E) { 1260 pd_entry_t *l0; 1261 l0 = pmap_l0(pmap, va); 1262 pmap_clear(l0); 1263 } else if (m->pindex >= NUL2E) { 1264 pd_entry_t *l1; 1265 l1 = pmap_l1(pmap, va); 1266 pmap_clear(l1); 1267 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1268 } else { 1269 pd_entry_t *l2; 1270 l2 = pmap_l2(pmap, va); 1271 pmap_clear(l2); 1272 } 1273 pmap_resident_count_dec(pmap, 1); 1274 if (m->pindex < NUL2E) { 1275 pd_entry_t *l1; 1276 vm_page_t pdpg; 1277 1278 l1 = pmap_l1(pmap, va); 1279 phys = PTE_TO_PHYS(pmap_load(l1)); 1280 pdpg = PHYS_TO_VM_PAGE(phys); 1281 pmap_unwire_ptp(pmap, va, pdpg, free); 1282 } else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) { 1283 pd_entry_t *l0; 1284 vm_page_t pdpg; 1285 1286 MPASS(pmap_mode != PMAP_MODE_SV39); 1287 l0 = pmap_l0(pmap, va); 1288 phys = PTE_TO_PHYS(pmap_load(l0)); 1289 pdpg = PHYS_TO_VM_PAGE(phys); 1290 pmap_unwire_ptp(pmap, va, pdpg, free); 1291 } 1292 pmap_invalidate_page(pmap, va); 1293 1294 vm_wire_sub(1); 1295 1296 /* 1297 * Put page on a list so that it is released after 1298 * *ALL* TLB shootdown is done 1299 */ 1300 pmap_add_delayed_free_list(m, free, TRUE); 1301 } 1302 1303 /* 1304 * After removing a page table entry, this routine is used to 1305 * conditionally free the page, and manage the reference count. 1306 */ 1307 static int 1308 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1309 struct spglist *free) 1310 { 1311 vm_page_t mpte; 1312 1313 if (va >= VM_MAXUSER_ADDRESS) 1314 return (0); 1315 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1316 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); 1317 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1318 } 1319 1320 static uint64_t 1321 pmap_satp_mode(void) 1322 { 1323 return (pmap_mode == PMAP_MODE_SV39 ? 
SATP_MODE_SV39 : SATP_MODE_SV48); 1324 } 1325 1326 void 1327 pmap_pinit0(pmap_t pmap) 1328 { 1329 PMAP_LOCK_INIT(pmap); 1330 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1331 pmap->pm_top = kernel_pmap->pm_top; 1332 pmap->pm_satp = pmap_satp_mode() | 1333 (vtophys(pmap->pm_top) >> PAGE_SHIFT); 1334 CPU_ZERO(&pmap->pm_active); 1335 TAILQ_INIT(&pmap->pm_pvchunk); 1336 vm_radix_init(&pmap->pm_root); 1337 pmap_activate_boot(pmap); 1338 } 1339 1340 int 1341 pmap_pinit(pmap_t pmap) 1342 { 1343 vm_paddr_t topphys; 1344 vm_page_t mtop; 1345 size_t i; 1346 1347 mtop = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | 1348 VM_ALLOC_WAITOK); 1349 1350 topphys = VM_PAGE_TO_PHYS(mtop); 1351 pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys); 1352 pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT); 1353 1354 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1355 1356 CPU_ZERO(&pmap->pm_active); 1357 1358 if (pmap_mode == PMAP_MODE_SV39) { 1359 /* 1360 * Copy L1 entries from the kernel pmap. This must be done with 1361 * the allpmaps lock held to avoid races with 1362 * pmap_distribute_l1(). 1363 */ 1364 mtx_lock(&allpmaps_lock); 1365 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1366 for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS); 1367 i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++) 1368 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1369 for (i = pmap_l1_index(DMAP_MIN_ADDRESS); 1370 i < pmap_l1_index(DMAP_MAX_ADDRESS); i++) 1371 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1372 mtx_unlock(&allpmaps_lock); 1373 } else { 1374 i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS); 1375 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1376 } 1377 1378 TAILQ_INIT(&pmap->pm_pvchunk); 1379 vm_radix_init(&pmap->pm_root); 1380 1381 return (1); 1382 } 1383 1384 /* 1385 * This routine is called if the desired page table page does not exist. 1386 * 1387 * If page table page allocation fails, this routine may sleep before 1388 * returning NULL. It sleeps only if a lock pointer was given. 1389 * 1390 * Note: If a page allocation fails at page table level two or three, 1391 * one or two pages may be held during the wait, only to be released 1392 * afterwards. This conservative approach is easily argued to avoid 1393 * race conditions. 1394 */ 1395 static vm_page_t 1396 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1397 { 1398 vm_page_t m, pdpg; 1399 pt_entry_t entry; 1400 vm_paddr_t phys; 1401 pn_t pn; 1402 1403 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1404 1405 /* 1406 * Allocate a page table page. 1407 */ 1408 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1409 if (m == NULL) { 1410 if (lockp != NULL) { 1411 RELEASE_PV_LIST_LOCK(lockp); 1412 PMAP_UNLOCK(pmap); 1413 rw_runlock(&pvh_global_lock); 1414 vm_wait(NULL); 1415 rw_rlock(&pvh_global_lock); 1416 PMAP_LOCK(pmap); 1417 } 1418 1419 /* 1420 * Indicate the need to retry. While waiting, the page table 1421 * page may have been allocated. 1422 */ 1423 return (NULL); 1424 } 1425 m->pindex = ptepindex; 1426 1427 /* 1428 * Map the pagetable page into the process address space, if 1429 * it isn't already there. 
1430 */ 1431 pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; 1432 if (ptepindex >= NUL2E + NUL1E) { 1433 pd_entry_t *l0; 1434 vm_pindex_t l0index; 1435 1436 KASSERT(pmap_mode != PMAP_MODE_SV39, 1437 ("%s: pindex %#lx in SV39 mode", __func__, ptepindex)); 1438 KASSERT(ptepindex < NUL2E + NUL1E + NUL0E, 1439 ("%s: pindex %#lx out of range", __func__, ptepindex)); 1440 1441 l0index = ptepindex - (NUL2E + NUL1E); 1442 l0 = &pmap->pm_top[l0index]; 1443 KASSERT((pmap_load(l0) & PTE_V) == 0, 1444 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0))); 1445 1446 entry = PTE_V | (pn << PTE_PPN0_S); 1447 pmap_store(l0, entry); 1448 } else if (ptepindex >= NUL2E) { 1449 pd_entry_t *l0, *l1; 1450 vm_pindex_t l0index, l1index; 1451 1452 l1index = ptepindex - NUL2E; 1453 if (pmap_mode == PMAP_MODE_SV39) { 1454 l1 = &pmap->pm_top[l1index]; 1455 } else { 1456 l0index = l1index >> Ln_ENTRIES_SHIFT; 1457 l0 = &pmap->pm_top[l0index]; 1458 if (pmap_load(l0) == 0) { 1459 /* Recurse to allocate the L1 page. */ 1460 if (_pmap_alloc_l3(pmap, 1461 NUL2E + NUL1E + l0index, lockp) == NULL) 1462 goto fail; 1463 phys = PTE_TO_PHYS(pmap_load(l0)); 1464 } else { 1465 phys = PTE_TO_PHYS(pmap_load(l0)); 1466 pdpg = PHYS_TO_VM_PAGE(phys); 1467 pdpg->ref_count++; 1468 } 1469 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1470 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1471 } 1472 KASSERT((pmap_load(l1) & PTE_V) == 0, 1473 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1474 1475 entry = PTE_V | (pn << PTE_PPN0_S); 1476 pmap_store(l1, entry); 1477 pmap_distribute_l1(pmap, l1index, entry); 1478 } else { 1479 vm_pindex_t l0index, l1index; 1480 pd_entry_t *l0, *l1, *l2; 1481 1482 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1483 if (pmap_mode == PMAP_MODE_SV39) { 1484 l1 = &pmap->pm_top[l1index]; 1485 if (pmap_load(l1) == 0) { 1486 /* recurse for allocating page dir */ 1487 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1488 lockp) == NULL) 1489 goto fail; 1490 } else { 1491 phys = PTE_TO_PHYS(pmap_load(l1)); 1492 pdpg = PHYS_TO_VM_PAGE(phys); 1493 pdpg->ref_count++; 1494 } 1495 } else { 1496 l0index = l1index >> Ln_ENTRIES_SHIFT; 1497 l0 = &pmap->pm_top[l0index]; 1498 if (pmap_load(l0) == 0) { 1499 /* Recurse to allocate the L1 entry. */ 1500 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1501 lockp) == NULL) 1502 goto fail; 1503 phys = PTE_TO_PHYS(pmap_load(l0)); 1504 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1505 l1 = &l1[l1index & Ln_ADDR_MASK]; 1506 } else { 1507 phys = PTE_TO_PHYS(pmap_load(l0)); 1508 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1509 l1 = &l1[l1index & Ln_ADDR_MASK]; 1510 if (pmap_load(l1) == 0) { 1511 /* Recurse to allocate the L2 page. 
*/ 1512 if (_pmap_alloc_l3(pmap, 1513 NUL2E + l1index, lockp) == NULL) 1514 goto fail; 1515 } else { 1516 phys = PTE_TO_PHYS(pmap_load(l1)); 1517 pdpg = PHYS_TO_VM_PAGE(phys); 1518 pdpg->ref_count++; 1519 } 1520 } 1521 } 1522 1523 phys = PTE_TO_PHYS(pmap_load(l1)); 1524 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1525 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1526 KASSERT((pmap_load(l2) & PTE_V) == 0, 1527 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 1528 1529 entry = PTE_V | (pn << PTE_PPN0_S); 1530 pmap_store(l2, entry); 1531 } 1532 1533 pmap_resident_count_inc(pmap, 1); 1534 1535 return (m); 1536 1537 fail: 1538 vm_page_unwire_noq(m); 1539 vm_page_free_zero(m); 1540 return (NULL); 1541 } 1542 1543 static vm_page_t 1544 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1545 { 1546 pd_entry_t *l1; 1547 vm_page_t l2pg; 1548 vm_pindex_t l2pindex; 1549 1550 retry: 1551 l1 = pmap_l1(pmap, va); 1552 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { 1553 KASSERT((pmap_load(l1) & PTE_RWX) == 0, 1554 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, 1555 pmap_load(l1), va)); 1556 /* Add a reference to the L2 page. */ 1557 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); 1558 l2pg->ref_count++; 1559 } else { 1560 /* Allocate a L2 page. */ 1561 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1562 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1563 if (l2pg == NULL && lockp != NULL) 1564 goto retry; 1565 } 1566 return (l2pg); 1567 } 1568 1569 static vm_page_t 1570 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1571 { 1572 vm_pindex_t ptepindex; 1573 pd_entry_t *l2; 1574 vm_paddr_t phys; 1575 vm_page_t m; 1576 1577 /* 1578 * Calculate pagetable page index 1579 */ 1580 ptepindex = pmap_l2_pindex(va); 1581 retry: 1582 /* 1583 * Get the page directory entry 1584 */ 1585 l2 = pmap_l2(pmap, va); 1586 1587 /* 1588 * If the page table page is mapped, we just increment the 1589 * hold count, and activate it. 1590 */ 1591 if (l2 != NULL && pmap_load(l2) != 0) { 1592 phys = PTE_TO_PHYS(pmap_load(l2)); 1593 m = PHYS_TO_VM_PAGE(phys); 1594 m->ref_count++; 1595 } else { 1596 /* 1597 * Here if the pte page isn't mapped, or if it has been 1598 * deallocated. 1599 */ 1600 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1601 if (m == NULL && lockp != NULL) 1602 goto retry; 1603 } 1604 return (m); 1605 } 1606 1607 /*************************************************** 1608 * Pmap allocation/deallocation routines. 1609 ***************************************************/ 1610 1611 /* 1612 * Release any resources held by the given physical map. 1613 * Called when a pmap initialized by pmap_pinit is being released. 1614 * Should only be called if the map contains no valid mappings. 
1615 */ 1616 void 1617 pmap_release(pmap_t pmap) 1618 { 1619 vm_page_t m; 1620 1621 KASSERT(pmap->pm_stats.resident_count == 0, 1622 ("pmap_release: pmap resident count %ld != 0", 1623 pmap->pm_stats.resident_count)); 1624 KASSERT(CPU_EMPTY(&pmap->pm_active), 1625 ("releasing active pmap %p", pmap)); 1626 1627 if (pmap_mode == PMAP_MODE_SV39) { 1628 mtx_lock(&allpmaps_lock); 1629 LIST_REMOVE(pmap, pm_list); 1630 mtx_unlock(&allpmaps_lock); 1631 } 1632 1633 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top)); 1634 vm_page_unwire_noq(m); 1635 vm_page_free(m); 1636 } 1637 1638 static int 1639 kvm_size(SYSCTL_HANDLER_ARGS) 1640 { 1641 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1642 1643 return sysctl_handle_long(oidp, &ksize, 0, req); 1644 } 1645 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1646 0, 0, kvm_size, "LU", 1647 "Size of KVM"); 1648 1649 static int 1650 kvm_free(SYSCTL_HANDLER_ARGS) 1651 { 1652 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1653 1654 return sysctl_handle_long(oidp, &kfree, 0, req); 1655 } 1656 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1657 0, 0, kvm_free, "LU", 1658 "Amount of KVM free"); 1659 1660 /* 1661 * grow the number of kernel page table entries, if needed 1662 */ 1663 void 1664 pmap_growkernel(vm_offset_t addr) 1665 { 1666 vm_paddr_t paddr; 1667 vm_page_t nkpg; 1668 pd_entry_t *l1, *l2; 1669 pt_entry_t entry; 1670 pn_t pn; 1671 1672 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1673 1674 addr = roundup2(addr, L2_SIZE); 1675 if (addr - 1 >= vm_map_max(kernel_map)) 1676 addr = vm_map_max(kernel_map); 1677 while (kernel_vm_end < addr) { 1678 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1679 if (pmap_load(l1) == 0) { 1680 /* We need a new PDP entry */ 1681 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 1682 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1683 if (nkpg == NULL) 1684 panic("pmap_growkernel: no memory to grow kernel"); 1685 nkpg->pindex = kernel_vm_end >> L1_SHIFT; 1686 paddr = VM_PAGE_TO_PHYS(nkpg); 1687 1688 pn = (paddr / PAGE_SIZE); 1689 entry = (PTE_V); 1690 entry |= (pn << PTE_PPN0_S); 1691 pmap_store(l1, entry); 1692 pmap_distribute_l1(kernel_pmap, 1693 pmap_l1_index(kernel_vm_end), entry); 1694 continue; /* try again */ 1695 } 1696 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1697 if ((pmap_load(l2) & PTE_V) != 0 && 1698 (pmap_load(l2) & PTE_RWX) == 0) { 1699 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1700 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1701 kernel_vm_end = vm_map_max(kernel_map); 1702 break; 1703 } 1704 continue; 1705 } 1706 1707 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 1708 VM_ALLOC_ZERO); 1709 if (nkpg == NULL) 1710 panic("pmap_growkernel: no memory to grow kernel"); 1711 nkpg->pindex = kernel_vm_end >> L2_SHIFT; 1712 paddr = VM_PAGE_TO_PHYS(nkpg); 1713 1714 pn = (paddr / PAGE_SIZE); 1715 entry = (PTE_V); 1716 entry |= (pn << PTE_PPN0_S); 1717 pmap_store(l2, entry); 1718 1719 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1720 1721 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1722 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1723 kernel_vm_end = vm_map_max(kernel_map); 1724 break; 1725 } 1726 } 1727 } 1728 1729 /*************************************************** 1730 * page management routines. 1731 ***************************************************/ 1732 1733 static const uint64_t pc_freemask[_NPCM] = { 1734 [0 ... 
_NPCM - 2] = PC_FREEN, 1735 [_NPCM - 1] = PC_FREEL 1736 }; 1737 1738 #if 0 1739 #ifdef PV_STATS 1740 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1741 1742 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1743 "Current number of pv entry chunks"); 1744 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1745 "Current number of pv entry chunks allocated"); 1746 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1747 "Current number of pv entry chunks frees"); 1748 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1749 "Number of times tried to get a chunk page but failed."); 1750 1751 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1752 static int pv_entry_spare; 1753 1754 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1755 "Current number of pv entry frees"); 1756 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1757 "Current number of pv entry allocs"); 1758 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1759 "Current number of pv entries"); 1760 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1761 "Current number of spare pv entries"); 1762 #endif 1763 #endif /* 0 */ 1764 1765 /* 1766 * We are in a serious low memory condition. Resort to 1767 * drastic measures to free some pages so we can allocate 1768 * another pv entry chunk. 1769 * 1770 * Returns NULL if PV entries were reclaimed from the specified pmap. 1771 * 1772 * We do not, however, unmap 2mpages because subsequent accesses will 1773 * allocate per-page pv entries until repromotion occurs, thereby 1774 * exacerbating the shortage of free pv entries. 1775 */ 1776 static vm_page_t 1777 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1778 { 1779 1780 panic("RISCVTODO: reclaim_pv_chunk"); 1781 } 1782 1783 /* 1784 * free the pv_entry back to the free list 1785 */ 1786 static void 1787 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1788 { 1789 struct pv_chunk *pc; 1790 int idx, field, bit; 1791 1792 rw_assert(&pvh_global_lock, RA_LOCKED); 1793 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1794 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1795 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1796 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1797 pc = pv_to_chunk(pv); 1798 idx = pv - &pc->pc_pventry[0]; 1799 field = idx / 64; 1800 bit = idx % 64; 1801 pc->pc_map[field] |= 1ul << bit; 1802 if (!pc_is_free(pc)) { 1803 /* 98% of the time, pc is already at the head of the list. 
*/ 1804 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1805 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1806 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1807 } 1808 return; 1809 } 1810 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1811 free_pv_chunk(pc); 1812 } 1813 1814 static void 1815 free_pv_chunk(struct pv_chunk *pc) 1816 { 1817 vm_page_t m; 1818 1819 mtx_lock(&pv_chunks_mutex); 1820 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1821 mtx_unlock(&pv_chunks_mutex); 1822 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1823 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1824 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1825 /* entire chunk is free, return it */ 1826 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1827 dump_drop_page(m->phys_addr); 1828 vm_page_unwire_noq(m); 1829 vm_page_free(m); 1830 } 1831 1832 /* 1833 * Returns a new PV entry, allocating a new PV chunk from the system when 1834 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1835 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1836 * returned. 1837 * 1838 * The given PV list lock may be released. 1839 */ 1840 static pv_entry_t 1841 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1842 { 1843 int bit, field; 1844 pv_entry_t pv; 1845 struct pv_chunk *pc; 1846 vm_page_t m; 1847 1848 rw_assert(&pvh_global_lock, RA_LOCKED); 1849 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1850 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1851 retry: 1852 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1853 if (pc != NULL) { 1854 for (field = 0; field < _NPCM; field++) { 1855 if (pc->pc_map[field]) { 1856 bit = ffsl(pc->pc_map[field]) - 1; 1857 break; 1858 } 1859 } 1860 if (field < _NPCM) { 1861 pv = &pc->pc_pventry[field * 64 + bit]; 1862 pc->pc_map[field] &= ~(1ul << bit); 1863 /* If this was the last item, move it to tail */ 1864 if (pc_is_full(pc)) { 1865 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1866 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1867 pc_list); 1868 } 1869 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1870 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1871 return (pv); 1872 } 1873 } 1874 /* No free items, allocate another chunk */ 1875 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1876 if (m == NULL) { 1877 if (lockp == NULL) { 1878 PV_STAT(pc_chunk_tryfail++); 1879 return (NULL); 1880 } 1881 m = reclaim_pv_chunk(pmap, lockp); 1882 if (m == NULL) 1883 goto retry; 1884 } 1885 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1886 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1887 dump_add_page(m->phys_addr); 1888 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1889 pc->pc_pmap = pmap; 1890 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 1891 pc->pc_map[1] = PC_FREEN; 1892 pc->pc_map[2] = PC_FREEL; 1893 mtx_lock(&pv_chunks_mutex); 1894 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1895 mtx_unlock(&pv_chunks_mutex); 1896 pv = &pc->pc_pventry[0]; 1897 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1898 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1899 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1900 return (pv); 1901 } 1902 1903 /* 1904 * Ensure that the number of spare PV entries in the specified pmap meets or 1905 * exceeds the given count, "needed". 1906 * 1907 * The given PV list lock may be released. 
1908 */ 1909 static void 1910 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1911 { 1912 struct pch new_tail; 1913 struct pv_chunk *pc; 1914 vm_page_t m; 1915 int avail, free; 1916 bool reclaimed; 1917 1918 rw_assert(&pvh_global_lock, RA_LOCKED); 1919 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1920 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1921 1922 /* 1923 * Newly allocated PV chunks must be stored in a private list until 1924 * the required number of PV chunks have been allocated. Otherwise, 1925 * reclaim_pv_chunk() could recycle one of these chunks. In 1926 * contrast, these chunks must be added to the pmap upon allocation. 1927 */ 1928 TAILQ_INIT(&new_tail); 1929 retry: 1930 avail = 0; 1931 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1932 bit_count((bitstr_t *)pc->pc_map, 0, 1933 sizeof(pc->pc_map) * NBBY, &free); 1934 if (free == 0) 1935 break; 1936 avail += free; 1937 if (avail >= needed) 1938 break; 1939 } 1940 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1941 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1942 if (m == NULL) { 1943 m = reclaim_pv_chunk(pmap, lockp); 1944 if (m == NULL) 1945 goto retry; 1946 reclaimed = true; 1947 } 1948 /* XXX PV STATS */ 1949 #if 0 1950 dump_add_page(m->phys_addr); 1951 #endif 1952 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1953 pc->pc_pmap = pmap; 1954 pc->pc_map[0] = PC_FREEN; 1955 pc->pc_map[1] = PC_FREEN; 1956 pc->pc_map[2] = PC_FREEL; 1957 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1958 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1959 1960 /* 1961 * The reclaim might have freed a chunk from the current pmap. 1962 * If that chunk contained available entries, we need to 1963 * re-count the number of available entries. 1964 */ 1965 if (reclaimed) 1966 goto retry; 1967 } 1968 if (!TAILQ_EMPTY(&new_tail)) { 1969 mtx_lock(&pv_chunks_mutex); 1970 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1971 mtx_unlock(&pv_chunks_mutex); 1972 } 1973 } 1974 1975 /* 1976 * First find and then remove the pv entry for the specified pmap and virtual 1977 * address from the specified pv list. Returns the pv entry if found and NULL 1978 * otherwise. This operation can be performed on pv lists for either 4KB or 1979 * 2MB page mappings. 1980 */ 1981 static __inline pv_entry_t 1982 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1983 { 1984 pv_entry_t pv; 1985 1986 rw_assert(&pvh_global_lock, RA_LOCKED); 1987 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1988 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1989 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1990 pvh->pv_gen++; 1991 break; 1992 } 1993 } 1994 return (pv); 1995 } 1996 1997 /* 1998 * First find and then destroy the pv entry for the specified pmap and virtual 1999 * address. This operation can be performed on pv lists for either 4KB or 2MB 2000 * page mappings. 2001 */ 2002 static void 2003 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2004 { 2005 pv_entry_t pv; 2006 2007 pv = pmap_pvh_remove(pvh, pmap, va); 2008 2009 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 2010 free_pv_entry(pmap, pv); 2011 } 2012 2013 /* 2014 * Conditionally create the PV entry for a 4KB page mapping if the required 2015 * memory can be allocated without resorting to reclamation. 
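 * Returns TRUE if the PV entry was created and FALSE otherwise.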
2016 */ 2017 static boolean_t 2018 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2019 struct rwlock **lockp) 2020 { 2021 pv_entry_t pv; 2022 2023 rw_assert(&pvh_global_lock, RA_LOCKED); 2024 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2025 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2026 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2027 pv->pv_va = va; 2028 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2029 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2030 m->md.pv_gen++; 2031 return (TRUE); 2032 } else 2033 return (FALSE); 2034 } 2035 2036 /* 2037 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2038 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2039 * entries for each of the 4KB page mappings. 2040 */ 2041 static void __unused 2042 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2043 struct rwlock **lockp) 2044 { 2045 struct md_page *pvh; 2046 struct pv_chunk *pc; 2047 pv_entry_t pv; 2048 vm_page_t m; 2049 vm_offset_t va_last; 2050 int bit, field; 2051 2052 rw_assert(&pvh_global_lock, RA_LOCKED); 2053 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2054 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2055 2056 /* 2057 * Transfer the 2mpage's pv entry for this mapping to the first 2058 * page's pv list. Once this transfer begins, the pv list lock 2059 * must not be released until the last pv entry is reinstantiated. 2060 */ 2061 pvh = pa_to_pvh(pa); 2062 va &= ~L2_OFFSET; 2063 pv = pmap_pvh_remove(pvh, pmap, va); 2064 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2065 m = PHYS_TO_VM_PAGE(pa); 2066 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2067 m->md.pv_gen++; 2068 /* Instantiate the remaining 511 pv entries. */ 2069 va_last = va + L2_SIZE - PAGE_SIZE; 2070 for (;;) { 2071 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2072 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 2073 for (field = 0; field < _NPCM; field++) { 2074 while (pc->pc_map[field] != 0) { 2075 bit = ffsl(pc->pc_map[field]) - 1; 2076 pc->pc_map[field] &= ~(1ul << bit); 2077 pv = &pc->pc_pventry[field * 64 + bit]; 2078 va += PAGE_SIZE; 2079 pv->pv_va = va; 2080 m++; 2081 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2082 ("pmap_pv_demote_l2: page %p is not managed", m)); 2083 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2084 m->md.pv_gen++; 2085 if (va == va_last) 2086 goto out; 2087 } 2088 } 2089 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2090 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2091 } 2092 out: 2093 if (pc_is_free(pc)) { 2094 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2095 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2096 } 2097 /* XXX PV stats */ 2098 } 2099 2100 #if VM_NRESERVLEVEL > 0 2101 static void 2102 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2103 struct rwlock **lockp) 2104 { 2105 struct md_page *pvh; 2106 pv_entry_t pv; 2107 vm_page_t m; 2108 vm_offset_t va_last; 2109 2110 rw_assert(&pvh_global_lock, RA_LOCKED); 2111 KASSERT((pa & L2_OFFSET) == 0, 2112 ("pmap_pv_promote_l2: misaligned pa %#lx", pa)); 2113 2114 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2115 2116 m = PHYS_TO_VM_PAGE(pa); 2117 va = va & ~L2_OFFSET; 2118 pv = pmap_pvh_remove(&m->md, pmap, va); 2119 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 2120 pvh = pa_to_pvh(pa); 2121 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2122 pvh->pv_gen++; 2123 2124 va_last = va + L2_SIZE - PAGE_SIZE; 2125 do { 2126 m++; 2127 va += PAGE_SIZE; 2128 pmap_pvh_free(&m->md, pmap, va); 2129 } while (va < 
va_last); 2130 } 2131 #endif /* VM_NRESERVLEVEL > 0 */ 2132 2133 /* 2134 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2135 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2136 * false if the PV entry cannot be allocated without resorting to reclamation. 2137 */ 2138 static bool 2139 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2140 struct rwlock **lockp) 2141 { 2142 struct md_page *pvh; 2143 pv_entry_t pv; 2144 vm_paddr_t pa; 2145 2146 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2147 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2148 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 2149 NULL : lockp)) == NULL) 2150 return (false); 2151 pv->pv_va = va; 2152 pa = PTE_TO_PHYS(l2e); 2153 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2154 pvh = pa_to_pvh(pa); 2155 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2156 pvh->pv_gen++; 2157 return (true); 2158 } 2159 2160 static void 2161 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2162 { 2163 pt_entry_t newl2, oldl2 __diagused; 2164 vm_page_t ml3; 2165 vm_paddr_t ml3pa; 2166 2167 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2168 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2169 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2170 2171 ml3 = pmap_remove_pt_page(pmap, va); 2172 if (ml3 == NULL) 2173 panic("pmap_remove_kernel_l2: Missing pt page"); 2174 2175 ml3pa = VM_PAGE_TO_PHYS(ml3); 2176 newl2 = ml3pa | PTE_V; 2177 2178 /* 2179 * If this page table page was unmapped by a promotion, then it 2180 * contains valid mappings. Zero it to invalidate those mappings. 2181 */ 2182 if (vm_page_any_valid(ml3)) 2183 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2184 2185 /* 2186 * Demote the mapping. 2187 */ 2188 oldl2 = pmap_load_store(l2, newl2); 2189 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2190 __func__, l2, oldl2)); 2191 } 2192 2193 /* 2194 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2195 */ 2196 static int 2197 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2198 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2199 { 2200 struct md_page *pvh; 2201 pt_entry_t oldl2; 2202 vm_offset_t eva, va; 2203 vm_page_t m, ml3; 2204 2205 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2206 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2207 oldl2 = pmap_load_clear(l2); 2208 KASSERT((oldl2 & PTE_RWX) != 0, 2209 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2210 2211 /* 2212 * The sfence.vma documentation states that it is sufficient to specify 2213 * a single address within a superpage mapping. However, since we do 2214 * not perform any invalidation upon promotion, TLBs may still be 2215 * caching 4KB mappings within the superpage, so we must invalidate the 2216 * entire range. 
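	 * "sva" is 2MB-aligned here, so the range [sva, sva + L2_SIZE) covers
	 * exactly the superpage being removed.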
2217 */ 2218 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2219 if ((oldl2 & PTE_SW_WIRED) != 0) 2220 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2221 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2222 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2223 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2224 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2225 pmap_pvh_free(pvh, pmap, sva); 2226 eva = sva + L2_SIZE; 2227 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2228 va < eva; va += PAGE_SIZE, m++) { 2229 if ((oldl2 & PTE_D) != 0) 2230 vm_page_dirty(m); 2231 if ((oldl2 & PTE_A) != 0) 2232 vm_page_aflag_set(m, PGA_REFERENCED); 2233 if (TAILQ_EMPTY(&m->md.pv_list) && 2234 TAILQ_EMPTY(&pvh->pv_list)) 2235 vm_page_aflag_clear(m, PGA_WRITEABLE); 2236 } 2237 } 2238 if (pmap == kernel_pmap) { 2239 pmap_remove_kernel_l2(pmap, l2, sva); 2240 } else { 2241 ml3 = pmap_remove_pt_page(pmap, sva); 2242 if (ml3 != NULL) { 2243 KASSERT(vm_page_any_valid(ml3), 2244 ("pmap_remove_l2: l3 page not promoted")); 2245 pmap_resident_count_dec(pmap, 1); 2246 KASSERT(ml3->ref_count == Ln_ENTRIES, 2247 ("pmap_remove_l2: l3 page ref count error")); 2248 ml3->ref_count = 1; 2249 vm_page_unwire_noq(ml3); 2250 pmap_add_delayed_free_list(ml3, free, FALSE); 2251 } 2252 } 2253 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2254 } 2255 2256 /* 2257 * pmap_remove_l3: do the things to unmap a page in a process 2258 */ 2259 static int 2260 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2261 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2262 { 2263 struct md_page *pvh; 2264 pt_entry_t old_l3; 2265 vm_paddr_t phys; 2266 vm_page_t m; 2267 2268 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2269 old_l3 = pmap_load_clear(l3); 2270 pmap_invalidate_page(pmap, va); 2271 if (old_l3 & PTE_SW_WIRED) 2272 pmap->pm_stats.wired_count -= 1; 2273 pmap_resident_count_dec(pmap, 1); 2274 if (old_l3 & PTE_SW_MANAGED) { 2275 phys = PTE_TO_PHYS(old_l3); 2276 m = PHYS_TO_VM_PAGE(phys); 2277 if ((old_l3 & PTE_D) != 0) 2278 vm_page_dirty(m); 2279 if (old_l3 & PTE_A) 2280 vm_page_aflag_set(m, PGA_REFERENCED); 2281 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2282 pmap_pvh_free(&m->md, pmap, va); 2283 if (TAILQ_EMPTY(&m->md.pv_list) && 2284 (m->flags & PG_FICTITIOUS) == 0) { 2285 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2286 if (TAILQ_EMPTY(&pvh->pv_list)) 2287 vm_page_aflag_clear(m, PGA_WRITEABLE); 2288 } 2289 } 2290 2291 return (pmap_unuse_pt(pmap, va, l2e, free)); 2292 } 2293 2294 /* 2295 * Remove the given range of addresses from the specified map. 2296 * 2297 * It is assumed that the start and end are properly 2298 * rounded to the page size. 2299 */ 2300 void 2301 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2302 { 2303 struct spglist free; 2304 struct rwlock *lock; 2305 vm_offset_t va, va_next; 2306 pd_entry_t *l0, *l1, *l2, l2e; 2307 pt_entry_t *l3; 2308 2309 /* 2310 * Perform an unsynchronized read. This is, however, safe. 
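	 * The check below is only an optimization that lets us skip the
	 * page table walk entirely for an empty pmap.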
2311 */ 2312 if (pmap->pm_stats.resident_count == 0) 2313 return; 2314 2315 SLIST_INIT(&free); 2316 2317 rw_rlock(&pvh_global_lock); 2318 PMAP_LOCK(pmap); 2319 2320 lock = NULL; 2321 for (; sva < eva; sva = va_next) { 2322 if (pmap->pm_stats.resident_count == 0) 2323 break; 2324 2325 if (pmap_mode == PMAP_MODE_SV48) { 2326 l0 = pmap_l0(pmap, sva); 2327 if (pmap_load(l0) == 0) { 2328 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2329 if (va_next < sva) 2330 va_next = eva; 2331 continue; 2332 } 2333 l1 = pmap_l0_to_l1(l0, sva); 2334 } else { 2335 l1 = pmap_l1(pmap, sva); 2336 } 2337 2338 if (pmap_load(l1) == 0) { 2339 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2340 if (va_next < sva) 2341 va_next = eva; 2342 continue; 2343 } 2344 2345 /* 2346 * Calculate index for next page table. 2347 */ 2348 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2349 if (va_next < sva) 2350 va_next = eva; 2351 2352 l2 = pmap_l1_to_l2(l1, sva); 2353 if (l2 == NULL) 2354 continue; 2355 if ((l2e = pmap_load(l2)) == 0) 2356 continue; 2357 if ((l2e & PTE_RWX) != 0) { 2358 if (sva + L2_SIZE == va_next && eva >= va_next) { 2359 (void)pmap_remove_l2(pmap, l2, sva, 2360 pmap_load(l1), &free, &lock); 2361 continue; 2362 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2363 &lock)) { 2364 /* 2365 * The large page mapping was destroyed. 2366 */ 2367 continue; 2368 } 2369 l2e = pmap_load(l2); 2370 } 2371 2372 /* 2373 * Limit our scan to either the end of the va represented 2374 * by the current page table page, or to the end of the 2375 * range being removed. 2376 */ 2377 if (va_next > eva) 2378 va_next = eva; 2379 2380 va = va_next; 2381 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2382 sva += L3_SIZE) { 2383 if (pmap_load(l3) == 0) { 2384 if (va != va_next) { 2385 pmap_invalidate_range(pmap, va, sva); 2386 va = va_next; 2387 } 2388 continue; 2389 } 2390 if (va == va_next) 2391 va = sva; 2392 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2393 sva += L3_SIZE; 2394 break; 2395 } 2396 } 2397 if (va != va_next) 2398 pmap_invalidate_range(pmap, va, sva); 2399 } 2400 if (lock != NULL) 2401 rw_wunlock(lock); 2402 rw_runlock(&pvh_global_lock); 2403 PMAP_UNLOCK(pmap); 2404 vm_page_free_pages_toq(&free, false); 2405 } 2406 2407 /* 2408 * Routine: pmap_remove_all 2409 * Function: 2410 * Removes this physical page from 2411 * all physical maps in which it resides. 2412 * Reflects back modify bits to the pager. 2413 * 2414 * Notes: 2415 * Original versions of this routine were very 2416 * inefficient because they iteratively called 2417 * pmap_remove (slow...) 2418 */ 2419 2420 void 2421 pmap_remove_all(vm_page_t m) 2422 { 2423 struct spglist free; 2424 struct md_page *pvh; 2425 pmap_t pmap; 2426 pt_entry_t *l3, l3e; 2427 pd_entry_t *l2, l2e __diagused; 2428 pv_entry_t pv; 2429 vm_offset_t va; 2430 2431 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2432 ("pmap_remove_all: page %p is not managed", m)); 2433 SLIST_INIT(&free); 2434 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 2435 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2436 2437 rw_wlock(&pvh_global_lock); 2438 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2439 pmap = PV_PMAP(pv); 2440 PMAP_LOCK(pmap); 2441 va = pv->pv_va; 2442 l2 = pmap_l2(pmap, va); 2443 (void)pmap_demote_l2(pmap, l2, va); 2444 PMAP_UNLOCK(pmap); 2445 } 2446 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2447 pmap = PV_PMAP(pv); 2448 PMAP_LOCK(pmap); 2449 pmap_resident_count_dec(pmap, 1); 2450 l2 = pmap_l2(pmap, pv->pv_va); 2451 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2452 l2e = pmap_load(l2); 2453 2454 KASSERT((l2e & PTE_RX) == 0, 2455 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2456 2457 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2458 l3e = pmap_load_clear(l3); 2459 pmap_invalidate_page(pmap, pv->pv_va); 2460 if (l3e & PTE_SW_WIRED) 2461 pmap->pm_stats.wired_count--; 2462 if ((l3e & PTE_A) != 0) 2463 vm_page_aflag_set(m, PGA_REFERENCED); 2464 2465 /* 2466 * Update the vm_page_t clean and reference bits. 2467 */ 2468 if ((l3e & PTE_D) != 0) 2469 vm_page_dirty(m); 2470 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2471 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2472 m->md.pv_gen++; 2473 free_pv_entry(pmap, pv); 2474 PMAP_UNLOCK(pmap); 2475 } 2476 vm_page_aflag_clear(m, PGA_WRITEABLE); 2477 rw_wunlock(&pvh_global_lock); 2478 vm_page_free_pages_toq(&free, false); 2479 } 2480 2481 /* 2482 * Set the physical protection on the 2483 * specified range of this map as requested. 2484 */ 2485 void 2486 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2487 { 2488 pd_entry_t *l0, *l1, *l2, l2e; 2489 pt_entry_t *l3, l3e, mask; 2490 vm_page_t m, mt; 2491 vm_paddr_t pa; 2492 vm_offset_t va_next; 2493 bool anychanged, pv_lists_locked; 2494 2495 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2496 pmap_remove(pmap, sva, eva); 2497 return; 2498 } 2499 2500 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2501 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2502 return; 2503 2504 anychanged = false; 2505 pv_lists_locked = false; 2506 mask = 0; 2507 if ((prot & VM_PROT_WRITE) == 0) 2508 mask |= PTE_W | PTE_D; 2509 if ((prot & VM_PROT_EXECUTE) == 0) 2510 mask |= PTE_X; 2511 resume: 2512 PMAP_LOCK(pmap); 2513 for (; sva < eva; sva = va_next) { 2514 if (pmap_mode == PMAP_MODE_SV48) { 2515 l0 = pmap_l0(pmap, sva); 2516 if (pmap_load(l0) == 0) { 2517 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2518 if (va_next < sva) 2519 va_next = eva; 2520 continue; 2521 } 2522 l1 = pmap_l0_to_l1(l0, sva); 2523 } else { 2524 l1 = pmap_l1(pmap, sva); 2525 } 2526 2527 if (pmap_load(l1) == 0) { 2528 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2529 if (va_next < sva) 2530 va_next = eva; 2531 continue; 2532 } 2533 2534 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2535 if (va_next < sva) 2536 va_next = eva; 2537 2538 l2 = pmap_l1_to_l2(l1, sva); 2539 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2540 continue; 2541 if ((l2e & PTE_RWX) != 0) { 2542 if (sva + L2_SIZE == va_next && eva >= va_next) { 2543 retryl2: 2544 if ((prot & VM_PROT_WRITE) == 0 && 2545 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2546 (PTE_SW_MANAGED | PTE_D)) { 2547 pa = PTE_TO_PHYS(l2e); 2548 m = PHYS_TO_VM_PAGE(pa); 2549 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2550 vm_page_dirty(mt); 2551 } 2552 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2553 goto retryl2; 2554 anychanged = true; 2555 continue; 2556 } else { 2557 if (!pv_lists_locked) { 2558 pv_lists_locked = true; 2559 if (!rw_try_rlock(&pvh_global_lock)) { 2560 if (anychanged) 2561 pmap_invalidate_all( 2562 pmap); 
2563 PMAP_UNLOCK(pmap); 2564 rw_rlock(&pvh_global_lock); 2565 goto resume; 2566 } 2567 } 2568 if (!pmap_demote_l2(pmap, l2, sva)) { 2569 /* 2570 * The large page mapping was destroyed. 2571 */ 2572 continue; 2573 } 2574 } 2575 } 2576 2577 if (va_next > eva) 2578 va_next = eva; 2579 2580 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2581 sva += L3_SIZE) { 2582 l3e = pmap_load(l3); 2583 retryl3: 2584 if ((l3e & PTE_V) == 0) 2585 continue; 2586 if ((prot & VM_PROT_WRITE) == 0 && 2587 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2588 (PTE_SW_MANAGED | PTE_D)) { 2589 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2590 vm_page_dirty(m); 2591 } 2592 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2593 goto retryl3; 2594 anychanged = true; 2595 } 2596 } 2597 if (anychanged) 2598 pmap_invalidate_all(pmap); 2599 if (pv_lists_locked) 2600 rw_runlock(&pvh_global_lock); 2601 PMAP_UNLOCK(pmap); 2602 } 2603 2604 int 2605 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2606 { 2607 pd_entry_t *l2, l2e; 2608 pt_entry_t bits, *pte, oldpte; 2609 int rv; 2610 2611 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va)); 2612 2613 rv = 0; 2614 PMAP_LOCK(pmap); 2615 l2 = pmap_l2(pmap, va); 2616 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2617 goto done; 2618 if ((l2e & PTE_RWX) == 0) { 2619 pte = pmap_l2_to_l3(l2, va); 2620 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2621 goto done; 2622 } else { 2623 pte = l2; 2624 oldpte = l2e; 2625 } 2626 2627 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2628 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2629 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2630 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2631 goto done; 2632 2633 bits = PTE_A; 2634 if (ftype == VM_PROT_WRITE) 2635 bits |= PTE_D; 2636 2637 /* 2638 * Spurious faults can occur if the implementation caches invalid 2639 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2640 * race with each other. 2641 */ 2642 if ((oldpte & bits) != bits) 2643 pmap_store_bits(pte, bits); 2644 sfence_vma(); 2645 rv = 1; 2646 done: 2647 PMAP_UNLOCK(pmap); 2648 return (rv); 2649 } 2650 2651 static bool 2652 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2653 { 2654 struct rwlock *lock; 2655 bool rv; 2656 2657 lock = NULL; 2658 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2659 if (lock != NULL) 2660 rw_wunlock(lock); 2661 return (rv); 2662 } 2663 2664 /* 2665 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2666 * mapping is invalidated. 2667 */ 2668 static bool 2669 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2670 struct rwlock **lockp) 2671 { 2672 struct spglist free; 2673 vm_page_t mpte; 2674 pd_entry_t newl2, oldl2; 2675 pt_entry_t *firstl3, newl3; 2676 vm_paddr_t mptepa; 2677 int i; 2678 2679 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2680 2681 oldl2 = pmap_load(l2); 2682 KASSERT((oldl2 & PTE_RWX) != 0, 2683 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2684 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2685 NULL) { 2686 KASSERT((oldl2 & PTE_SW_WIRED) == 0, 2687 ("pmap_demote_l2_locked: page table page for a wired mapping is missing")); 2688 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2689 (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 0) | 2690 VM_ALLOC_WIRED)) == NULL) { 2691 SLIST_INIT(&free); 2692 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2693 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2694 vm_page_free_pages_toq(&free, true); 2695 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2696 "failure for va %#lx in pmap %p", va, pmap); 2697 return (false); 2698 } 2699 mpte->pindex = pmap_l2_pindex(va); 2700 if (va < VM_MAXUSER_ADDRESS) { 2701 mpte->ref_count = Ln_ENTRIES; 2702 pmap_resident_count_inc(pmap, 1); 2703 } 2704 } 2705 mptepa = VM_PAGE_TO_PHYS(mpte); 2706 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2707 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2708 KASSERT((oldl2 & PTE_A) != 0, 2709 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2710 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2711 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2712 newl3 = oldl2; 2713 2714 /* 2715 * If the page table page is not leftover from an earlier promotion, 2716 * initialize it. 2717 */ 2718 if (!vm_page_all_valid(mpte)) { 2719 for (i = 0; i < Ln_ENTRIES; i++) 2720 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2721 } 2722 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2723 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2724 "addresses")); 2725 2726 /* 2727 * If the mapping has changed attributes, update the PTEs. 2728 */ 2729 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2730 for (i = 0; i < Ln_ENTRIES; i++) 2731 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2732 2733 /* 2734 * The spare PV entries must be reserved prior to demoting the 2735 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2736 * state of the L2 entry and the PV lists will be inconsistent, which 2737 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2738 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2739 * expected PV entry for the 2MB page mapping that is being demoted. 2740 */ 2741 if ((oldl2 & PTE_SW_MANAGED) != 0) 2742 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2743 2744 /* 2745 * Demote the mapping. 2746 */ 2747 pmap_store(l2, newl2); 2748 2749 /* 2750 * Demote the PV entry. 2751 */ 2752 if ((oldl2 & PTE_SW_MANAGED) != 0) 2753 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2754 2755 atomic_add_long(&pmap_l2_demotions, 1); 2756 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2757 va, pmap); 2758 return (true); 2759 } 2760 2761 #if VM_NRESERVLEVEL > 0 2762 static bool 2763 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3, 2764 struct rwlock **lockp) 2765 { 2766 pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e; 2767 vm_paddr_t pa; 2768 2769 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2770 if (!pmap_ps_enabled(pmap)) 2771 return (false); 2772 2773 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2774 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2775 2776 /* 2777 * Examine the first L3E in the specified PTP. Abort if this L3E is 2778 * ineligible for promotion or does not map the first 4KB physical page 2779 * within a 2MB page. 
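	 * In particular, the superpage's physical address must be 2MB-aligned,
	 * and it is the first L3E that determines that address.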
2780 */ 2781 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2782 firstl3e = pmap_load(firstl3); 2783 pa = PTE_TO_PHYS(firstl3e); 2784 if ((pa & L2_OFFSET) != 0) { 2785 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2786 va, pmap); 2787 atomic_add_long(&pmap_l2_p_failures, 1); 2788 return (false); 2789 } 2790 2791 /* 2792 * Downgrade a clean, writable mapping to read-only to ensure that the 2793 * hardware does not set PTE_D while we are comparing PTEs. 2794 * 2795 * Upon a write access to a clean mapping, the implementation will 2796 * either atomically check protections and set PTE_D, or raise a page 2797 * fault. In the latter case, the pmap lock provides atomicity. Thus, 2798 * we do not issue an sfence.vma here and instead rely on pmap_fault() 2799 * to do so lazily. 2800 */ 2801 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 2802 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 2803 firstl3e &= ~PTE_W; 2804 break; 2805 } 2806 } 2807 2808 /* 2809 * Examine each of the other PTEs in the specified PTP. Abort if this 2810 * PTE maps an unexpected 4KB physical page or does not have identical 2811 * characteristics to the first PTE. 2812 */ 2813 all_l3e_PTE_A = firstl3e & PTE_A; 2814 pa += L2_SIZE - PAGE_SIZE; 2815 for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) { 2816 l3e = pmap_load(l3); 2817 if (PTE_TO_PHYS(l3e) != pa) { 2818 CTR2(KTR_PMAP, 2819 "pmap_promote_l2: failure for va %#lx pmap %p", 2820 va, pmap); 2821 atomic_add_long(&pmap_l2_p_failures, 1); 2822 return (false); 2823 } 2824 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 2825 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 2826 l3e &= ~PTE_W; 2827 break; 2828 } 2829 } 2830 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 2831 CTR2(KTR_PMAP, 2832 "pmap_promote_l2: failure for va %#lx pmap %p", 2833 va, pmap); 2834 atomic_add_long(&pmap_l2_p_failures, 1); 2835 return (false); 2836 } 2837 all_l3e_PTE_A &= l3e; 2838 pa -= PAGE_SIZE; 2839 } 2840 2841 /* 2842 * Unless all PTEs have PTE_A set, clear it from the superpage 2843 * mapping, so that promotions triggered by speculative mappings, 2844 * such as pmap_enter_quick(), don't automatically mark the 2845 * underlying pages as referenced. 2846 */ 2847 firstl3e &= ~PTE_A | all_l3e_PTE_A; 2848 2849 /* 2850 * Save the page table page in its current state until the L2 2851 * mapping the superpage is demoted by pmap_demote_l2() or 2852 * destroyed by pmap_remove_l3(). 2853 */ 2854 if (ml3 == NULL) 2855 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2856 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2857 ("pmap_promote_l2: page table page's pindex is wrong")); 2858 if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) { 2859 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2860 va, pmap); 2861 atomic_add_long(&pmap_l2_p_failures, 1); 2862 return (false); 2863 } 2864 2865 if ((firstl3e & PTE_SW_MANAGED) != 0) 2866 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 2867 2868 pmap_store(l2, firstl3e); 2869 2870 atomic_add_long(&pmap_l2_promotions, 1); 2871 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2872 pmap); 2873 return (true); 2874 } 2875 #endif 2876 2877 /* 2878 * Insert the given physical page (p) at 2879 * the specified virtual address (v) in the 2880 * target physical map with the protection requested. 2881 * 2882 * If specified, the page will be wired down, meaning 2883 * that the related pte can not be reclaimed. 
2884 * 2885 * NB: This is the only routine which MAY NOT lazy-evaluate 2886 * or lose information. That is, this routine must actually 2887 * insert this page into the given map NOW. 2888 */ 2889 int 2890 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2891 u_int flags, int8_t psind) 2892 { 2893 struct rwlock *lock; 2894 pd_entry_t *l1, *l2, l2e; 2895 pt_entry_t new_l3, orig_l3; 2896 pt_entry_t *l3; 2897 pv_entry_t pv; 2898 vm_paddr_t opa, pa, l2_pa, l3_pa; 2899 vm_page_t mpte, om, l2_m, l3_m; 2900 pt_entry_t entry; 2901 pn_t l2_pn, l3_pn, pn; 2902 int rv; 2903 bool nosleep; 2904 2905 va = trunc_page(va); 2906 if ((m->oflags & VPO_UNMANAGED) == 0) 2907 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2908 pa = VM_PAGE_TO_PHYS(m); 2909 pn = (pa / PAGE_SIZE); 2910 2911 new_l3 = PTE_V | PTE_R | PTE_A; 2912 if (prot & VM_PROT_EXECUTE) 2913 new_l3 |= PTE_X; 2914 if (flags & VM_PROT_WRITE) 2915 new_l3 |= PTE_D; 2916 if (prot & VM_PROT_WRITE) 2917 new_l3 |= PTE_W; 2918 if (va < VM_MAX_USER_ADDRESS) 2919 new_l3 |= PTE_U; 2920 2921 new_l3 |= (pn << PTE_PPN0_S); 2922 if ((flags & PMAP_ENTER_WIRED) != 0) 2923 new_l3 |= PTE_SW_WIRED; 2924 2925 /* 2926 * Set modified bit gratuitously for writeable mappings if 2927 * the page is unmanaged. We do not want to take a fault 2928 * to do the dirty bit accounting for these mappings. 2929 */ 2930 if ((m->oflags & VPO_UNMANAGED) != 0) { 2931 if (prot & VM_PROT_WRITE) 2932 new_l3 |= PTE_D; 2933 } else 2934 new_l3 |= PTE_SW_MANAGED; 2935 2936 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2937 2938 lock = NULL; 2939 mpte = NULL; 2940 rw_rlock(&pvh_global_lock); 2941 PMAP_LOCK(pmap); 2942 if (psind == 1) { 2943 /* Assert the required virtual and physical alignment. */ 2944 KASSERT((va & L2_OFFSET) == 0, 2945 ("pmap_enter: va %#lx unaligned", va)); 2946 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2947 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2948 goto out; 2949 } 2950 2951 l2 = pmap_l2(pmap, va); 2952 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2953 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2954 va, &lock))) { 2955 l3 = pmap_l2_to_l3(l2, va); 2956 if (va < VM_MAXUSER_ADDRESS) { 2957 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2958 mpte->ref_count++; 2959 } 2960 } else if (va < VM_MAXUSER_ADDRESS) { 2961 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2962 mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); 2963 if (mpte == NULL && nosleep) { 2964 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2965 if (lock != NULL) 2966 rw_wunlock(lock); 2967 rw_runlock(&pvh_global_lock); 2968 PMAP_UNLOCK(pmap); 2969 return (KERN_RESOURCE_SHORTAGE); 2970 } 2971 l3 = pmap_l3(pmap, va); 2972 } else { 2973 l3 = pmap_l3(pmap, va); 2974 /* TODO: This is not optimal, but should mostly work */ 2975 if (l3 == NULL) { 2976 if (l2 == NULL) { 2977 l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2978 VM_ALLOC_ZERO); 2979 if (l2_m == NULL) 2980 panic("pmap_enter: l2 pte_m == NULL"); 2981 2982 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2983 l2_pn = (l2_pa / PAGE_SIZE); 2984 2985 l1 = pmap_l1(pmap, va); 2986 entry = (PTE_V); 2987 entry |= (l2_pn << PTE_PPN0_S); 2988 pmap_store(l1, entry); 2989 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2990 l2 = pmap_l1_to_l2(l1, va); 2991 } 2992 2993 l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2994 VM_ALLOC_ZERO); 2995 if (l3_m == NULL) 2996 panic("pmap_enter: l3 pte_m == NULL"); 2997 2998 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2999 l3_pn = (l3_pa / PAGE_SIZE); 3000 entry = (PTE_V); 3001 entry |= (l3_pn << PTE_PPN0_S); 3002 pmap_store(l2, entry); 3003 l3 = pmap_l2_to_l3(l2, va); 3004 } 3005 pmap_invalidate_page(pmap, va); 3006 } 3007 3008 orig_l3 = pmap_load(l3); 3009 opa = PTE_TO_PHYS(orig_l3); 3010 pv = NULL; 3011 3012 /* 3013 * Is the specified virtual address already mapped? 3014 */ 3015 if ((orig_l3 & PTE_V) != 0) { 3016 /* 3017 * Wiring change, just update stats. We don't worry about 3018 * wiring PT pages as they remain resident as long as there 3019 * are valid mappings in them. Hence, if a user page is wired, 3020 * the PT page will be also. 3021 */ 3022 if ((flags & PMAP_ENTER_WIRED) != 0 && 3023 (orig_l3 & PTE_SW_WIRED) == 0) 3024 pmap->pm_stats.wired_count++; 3025 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3026 (orig_l3 & PTE_SW_WIRED) != 0) 3027 pmap->pm_stats.wired_count--; 3028 3029 /* 3030 * Remove the extra PT page reference. 3031 */ 3032 if (mpte != NULL) { 3033 mpte->ref_count--; 3034 KASSERT(mpte->ref_count > 0, 3035 ("pmap_enter: missing reference to page table page," 3036 " va: 0x%lx", va)); 3037 } 3038 3039 /* 3040 * Has the physical page changed? 3041 */ 3042 if (opa == pa) { 3043 /* 3044 * No, might be a protection or wiring change. 3045 */ 3046 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 3047 (new_l3 & PTE_W) != 0) 3048 vm_page_aflag_set(m, PGA_WRITEABLE); 3049 goto validate; 3050 } 3051 3052 /* 3053 * The physical page has changed. Temporarily invalidate 3054 * the mapping. This ensures that all threads sharing the 3055 * pmap keep a consistent view of the mapping, which is 3056 * necessary for the correct handling of COW faults. It 3057 * also permits reuse of the old mapping's PV entry, 3058 * avoiding an allocation. 3059 * 3060 * For consistency, handle unmanaged mappings the same way. 3061 */ 3062 orig_l3 = pmap_load_clear(l3); 3063 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 3064 ("pmap_enter: unexpected pa update for %#lx", va)); 3065 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 3066 om = PHYS_TO_VM_PAGE(opa); 3067 3068 /* 3069 * The pmap lock is sufficient to synchronize with 3070 * concurrent calls to pmap_page_test_mappings() and 3071 * pmap_ts_referenced(). 
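			 * Transfer the old mapping's dirty and referenced
			 * state to the old page before its PV entry is moved
			 * or freed.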
3072 */ 3073 if ((orig_l3 & PTE_D) != 0) 3074 vm_page_dirty(om); 3075 if ((orig_l3 & PTE_A) != 0) 3076 vm_page_aflag_set(om, PGA_REFERENCED); 3077 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3078 pv = pmap_pvh_remove(&om->md, pmap, va); 3079 KASSERT(pv != NULL, 3080 ("pmap_enter: no PV entry for %#lx", va)); 3081 if ((new_l3 & PTE_SW_MANAGED) == 0) 3082 free_pv_entry(pmap, pv); 3083 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3084 TAILQ_EMPTY(&om->md.pv_list) && 3085 ((om->flags & PG_FICTITIOUS) != 0 || 3086 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3087 vm_page_aflag_clear(om, PGA_WRITEABLE); 3088 } 3089 pmap_invalidate_page(pmap, va); 3090 orig_l3 = 0; 3091 } else { 3092 /* 3093 * Increment the counters. 3094 */ 3095 if ((new_l3 & PTE_SW_WIRED) != 0) 3096 pmap->pm_stats.wired_count++; 3097 pmap_resident_count_inc(pmap, 1); 3098 } 3099 /* 3100 * Enter on the PV list if part of our managed memory. 3101 */ 3102 if ((new_l3 & PTE_SW_MANAGED) != 0) { 3103 if (pv == NULL) { 3104 pv = get_pv_entry(pmap, &lock); 3105 pv->pv_va = va; 3106 } 3107 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3108 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3109 m->md.pv_gen++; 3110 if ((new_l3 & PTE_W) != 0) 3111 vm_page_aflag_set(m, PGA_WRITEABLE); 3112 } 3113 3114 validate: 3115 /* 3116 * Sync the i-cache on all harts before updating the PTE 3117 * if the new PTE is executable. 3118 */ 3119 if (prot & VM_PROT_EXECUTE) 3120 pmap_sync_icache(pmap, va, PAGE_SIZE); 3121 3122 /* 3123 * Update the L3 entry. 3124 */ 3125 if (orig_l3 != 0) { 3126 orig_l3 = pmap_load_store(l3, new_l3); 3127 pmap_invalidate_page(pmap, va); 3128 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 3129 ("pmap_enter: invalid update")); 3130 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 3131 (PTE_D | PTE_SW_MANAGED)) 3132 vm_page_dirty(m); 3133 } else { 3134 pmap_store(l3, new_l3); 3135 } 3136 3137 #if VM_NRESERVLEVEL > 0 3138 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 3139 (m->flags & PG_FICTITIOUS) == 0 && 3140 vm_reserv_level_iffullpop(m) == 0) 3141 (void)pmap_promote_l2(pmap, l2, va, mpte, &lock); 3142 #endif 3143 3144 rv = KERN_SUCCESS; 3145 out: 3146 if (lock != NULL) 3147 rw_wunlock(lock); 3148 rw_runlock(&pvh_global_lock); 3149 PMAP_UNLOCK(pmap); 3150 return (rv); 3151 } 3152 3153 /* 3154 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 3155 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 3156 * value. See pmap_enter_l2() for the possible error values when "no sleep", 3157 * "no replace", and "no reclaim" are specified. 3158 */ 3159 static int 3160 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3161 struct rwlock **lockp) 3162 { 3163 pd_entry_t new_l2; 3164 pn_t pn; 3165 3166 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3167 3168 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 3169 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 3170 if ((m->oflags & VPO_UNMANAGED) == 0) 3171 new_l2 |= PTE_SW_MANAGED; 3172 if ((prot & VM_PROT_EXECUTE) != 0) 3173 new_l2 |= PTE_X; 3174 if (va < VM_MAXUSER_ADDRESS) 3175 new_l2 |= PTE_U; 3176 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3177 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 3178 } 3179 3180 /* 3181 * Returns true if every page table entry in the specified page table is 3182 * zero. 
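 * "pa" is the physical address of the page table page, which is scanned
 * through the direct map.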
3183 */ 3184 static bool 3185 pmap_every_pte_zero(vm_paddr_t pa) 3186 { 3187 pt_entry_t *pt_end, *pte; 3188 3189 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 3190 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 3191 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 3192 if (*pte != 0) 3193 return (false); 3194 } 3195 return (true); 3196 } 3197 3198 /* 3199 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3200 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 3201 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3202 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 3203 * within the 2MB virtual address range starting at the specified virtual 3204 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 3205 * 2MB page mapping already exists at the specified virtual address. Returns 3206 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 3207 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 3208 * and a PV entry allocation failed. 3209 * 3210 * The parameter "m" is only used when creating a managed, writeable mapping. 3211 */ 3212 static int 3213 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3214 vm_page_t m, struct rwlock **lockp) 3215 { 3216 struct spglist free; 3217 pd_entry_t *l2, *l3, oldl2; 3218 vm_offset_t sva; 3219 vm_page_t l2pg, mt; 3220 vm_page_t uwptpg; 3221 3222 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3223 3224 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3225 NULL : lockp)) == NULL) { 3226 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page" 3227 " for va %#lx in pmap %p", va, pmap); 3228 return (KERN_RESOURCE_SHORTAGE); 3229 } 3230 3231 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 3232 l2 = &l2[pmap_l2_index(va)]; 3233 if ((oldl2 = pmap_load(l2)) != 0) { 3234 KASSERT(l2pg->ref_count > 1, 3235 ("pmap_enter_l2: l2pg's ref count is too low")); 3236 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3237 if ((oldl2 & PTE_RWX) != 0) { 3238 l2pg->ref_count--; 3239 CTR2(KTR_PMAP, 3240 "pmap_enter_l2: no space for va %#lx" 3241 " in pmap %p", va, pmap); 3242 return (KERN_NO_SPACE); 3243 } else if (va < VM_MAXUSER_ADDRESS || 3244 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) { 3245 l2pg->ref_count--; 3246 CTR2(KTR_PMAP, "pmap_enter_l2:" 3247 " failed to replace existing mapping" 3248 " for va %#lx in pmap %p", va, pmap); 3249 return (KERN_FAILURE); 3250 } 3251 } 3252 SLIST_INIT(&free); 3253 if ((oldl2 & PTE_RWX) != 0) 3254 (void)pmap_remove_l2(pmap, l2, va, 3255 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3256 else 3257 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3258 l3 = pmap_l2_to_l3(l2, sva); 3259 if ((pmap_load(l3) & PTE_V) != 0 && 3260 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3261 lockp) != 0) 3262 break; 3263 } 3264 vm_page_free_pages_toq(&free, true); 3265 if (va >= VM_MAXUSER_ADDRESS) { 3266 /* 3267 * Both pmap_remove_l2() and pmap_remove_l3() will 3268 * leave the kernel page table page zero filled. 3269 */ 3270 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 3271 if (pmap_insert_pt_page(pmap, mt, false, false)) 3272 panic("pmap_enter_l2: trie insert failed"); 3273 } else 3274 KASSERT(pmap_load(l2) == 0, 3275 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3276 } 3277 3278 /* 3279 * Allocate leaf ptpage for wired userspace pages. 
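	 * Keeping a page table page on hand ensures that a later demotion of
	 * this wired superpage mapping cannot fail for lack of a page table
	 * page.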
3280 */ 3281 uwptpg = NULL; 3282 if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) { 3283 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3284 if (uwptpg == NULL) { 3285 return (KERN_RESOURCE_SHORTAGE); 3286 } 3287 uwptpg->pindex = pmap_l2_pindex(va); 3288 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 3289 vm_page_unwire_noq(uwptpg); 3290 vm_page_free(uwptpg); 3291 return (KERN_RESOURCE_SHORTAGE); 3292 } 3293 pmap_resident_count_inc(pmap, 1); 3294 uwptpg->ref_count = Ln_ENTRIES; 3295 } 3296 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3297 /* 3298 * Abort this mapping if its PV entry could not be created. 3299 */ 3300 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3301 SLIST_INIT(&free); 3302 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3303 /* 3304 * Although "va" is not mapped, paging-structure 3305 * caches could nonetheless have entries that 3306 * refer to the freed page table pages. 3307 * Invalidate those entries. 3308 */ 3309 pmap_invalidate_page(pmap, va); 3310 vm_page_free_pages_toq(&free, true); 3311 } 3312 if (uwptpg != NULL) { 3313 mt = pmap_remove_pt_page(pmap, va); 3314 KASSERT(mt == uwptpg, 3315 ("removed pt page %p, expected %p", mt, 3316 uwptpg)); 3317 pmap_resident_count_dec(pmap, 1); 3318 uwptpg->ref_count = 1; 3319 vm_page_unwire_noq(uwptpg); 3320 vm_page_free(uwptpg); 3321 } 3322 CTR2(KTR_PMAP, 3323 "pmap_enter_l2: failed to create PV entry" 3324 " for va %#lx in pmap %p", va, pmap); 3325 return (KERN_RESOURCE_SHORTAGE); 3326 } 3327 if ((new_l2 & PTE_W) != 0) 3328 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3329 vm_page_aflag_set(mt, PGA_WRITEABLE); 3330 } 3331 3332 /* 3333 * Increment counters. 3334 */ 3335 if ((new_l2 & PTE_SW_WIRED) != 0) 3336 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3337 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3338 3339 /* 3340 * Map the superpage. 3341 */ 3342 pmap_store(l2, new_l2); 3343 3344 atomic_add_long(&pmap_l2_mappings, 1); 3345 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3346 va, pmap); 3347 3348 return (KERN_SUCCESS); 3349 } 3350 3351 /* 3352 * Maps a sequence of resident pages belonging to the same object. 3353 * The sequence begins with the given page m_start. This page is 3354 * mapped at the given virtual address start. Each subsequent page is 3355 * mapped at a virtual address that is offset from start by the same 3356 * amount as the page is offset from m_start within the object. The 3357 * last page in the sequence is the page with the largest offset from 3358 * m_start that can be mapped at a virtual address less than the given 3359 * virtual address end. Not every virtual page between start and end 3360 * is mapped; only those for which a resident page exists with the 3361 * corresponding offset from m_start are mapped. 
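 * Aligned 2MB runs that are backed by fully populated superpage reservations
 * are mapped with 2MB page mappings when the pmap allows it.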
3362 */ 3363 void 3364 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3365 vm_page_t m_start, vm_prot_t prot) 3366 { 3367 struct rwlock *lock; 3368 vm_offset_t va; 3369 vm_page_t m, mpte; 3370 vm_pindex_t diff, psize; 3371 int rv; 3372 3373 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3374 3375 psize = atop(end - start); 3376 mpte = NULL; 3377 m = m_start; 3378 lock = NULL; 3379 rw_rlock(&pvh_global_lock); 3380 PMAP_LOCK(pmap); 3381 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3382 va = start + ptoa(diff); 3383 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3384 m->psind == 1 && pmap_ps_enabled(pmap) && 3385 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 3386 KERN_SUCCESS || rv == KERN_NO_SPACE)) 3387 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3388 else 3389 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3390 &lock); 3391 m = TAILQ_NEXT(m, listq); 3392 } 3393 if (lock != NULL) 3394 rw_wunlock(lock); 3395 rw_runlock(&pvh_global_lock); 3396 PMAP_UNLOCK(pmap); 3397 } 3398 3399 /* 3400 * this code makes some *MAJOR* assumptions: 3401 * 1. Current pmap & pmap exists. 3402 * 2. Not wired. 3403 * 3. Read access. 3404 * 4. No page table pages. 3405 * but is *MUCH* faster than pmap_enter... 3406 */ 3407 3408 void 3409 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3410 { 3411 struct rwlock *lock; 3412 3413 lock = NULL; 3414 rw_rlock(&pvh_global_lock); 3415 PMAP_LOCK(pmap); 3416 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3417 if (lock != NULL) 3418 rw_wunlock(lock); 3419 rw_runlock(&pvh_global_lock); 3420 PMAP_UNLOCK(pmap); 3421 } 3422 3423 static vm_page_t 3424 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3425 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3426 { 3427 struct spglist free; 3428 vm_paddr_t phys; 3429 pd_entry_t *l2; 3430 pt_entry_t *l3, newl3; 3431 3432 KASSERT(!VA_IS_CLEANMAP(va) || 3433 (m->oflags & VPO_UNMANAGED) != 0, 3434 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3435 rw_assert(&pvh_global_lock, RA_LOCKED); 3436 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3437 l2 = NULL; 3438 3439 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3440 /* 3441 * In the case that a page table page is not 3442 * resident, we are creating it here. 3443 */ 3444 if (va < VM_MAXUSER_ADDRESS) { 3445 vm_pindex_t l2pindex; 3446 3447 /* 3448 * Calculate pagetable page index 3449 */ 3450 l2pindex = pmap_l2_pindex(va); 3451 if (mpte && (mpte->pindex == l2pindex)) { 3452 mpte->ref_count++; 3453 } else { 3454 /* 3455 * Get the l2 entry 3456 */ 3457 l2 = pmap_l2(pmap, va); 3458 3459 /* 3460 * If the page table page is mapped, we just increment 3461 * the hold count, and activate it. Otherwise, we 3462 * attempt to allocate a page table page. If this 3463 * attempt fails, we don't retry. Instead, we give up. 3464 */ 3465 if (l2 != NULL && pmap_load(l2) != 0) { 3466 if ((pmap_load(l2) & PTE_RWX) != 0) 3467 return (NULL); 3468 phys = PTE_TO_PHYS(pmap_load(l2)); 3469 mpte = PHYS_TO_VM_PAGE(phys); 3470 mpte->ref_count++; 3471 } else { 3472 /* 3473 * Pass NULL instead of the PV list lock 3474 * pointer, because we don't intend to sleep. 
3475 */ 3476 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3477 if (mpte == NULL) 3478 return (mpte); 3479 } 3480 } 3481 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3482 l3 = &l3[pmap_l3_index(va)]; 3483 } else { 3484 mpte = NULL; 3485 l3 = pmap_l3(kernel_pmap, va); 3486 } 3487 if (l3 == NULL) 3488 panic("pmap_enter_quick_locked: No l3"); 3489 if (pmap_load(l3) != 0) { 3490 if (mpte != NULL) 3491 mpte->ref_count--; 3492 return (NULL); 3493 } 3494 3495 /* 3496 * Enter on the PV list if part of our managed memory. 3497 */ 3498 if ((m->oflags & VPO_UNMANAGED) == 0 && 3499 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3500 if (mpte != NULL) { 3501 SLIST_INIT(&free); 3502 if (pmap_unwire_ptp(pmap, va, mpte, &free)) 3503 vm_page_free_pages_toq(&free, false); 3504 } 3505 return (NULL); 3506 } 3507 3508 /* 3509 * Increment counters 3510 */ 3511 pmap_resident_count_inc(pmap, 1); 3512 3513 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3514 PTE_V | PTE_R; 3515 if ((prot & VM_PROT_EXECUTE) != 0) 3516 newl3 |= PTE_X; 3517 if ((m->oflags & VPO_UNMANAGED) == 0) 3518 newl3 |= PTE_SW_MANAGED; 3519 if (va < VM_MAX_USER_ADDRESS) 3520 newl3 |= PTE_U; 3521 3522 /* 3523 * Sync the i-cache on all harts before updating the PTE 3524 * if the new PTE is executable. 3525 */ 3526 if (prot & VM_PROT_EXECUTE) 3527 pmap_sync_icache(pmap, va, PAGE_SIZE); 3528 3529 pmap_store(l3, newl3); 3530 3531 #if VM_NRESERVLEVEL > 0 3532 /* 3533 * If both the PTP and the reservation are fully populated, then attempt 3534 * promotion. 3535 */ 3536 if ((mpte == NULL || mpte->ref_count == Ln_ENTRIES) && 3537 (m->flags & PG_FICTITIOUS) == 0 && 3538 vm_reserv_level_iffullpop(m) == 0) { 3539 if (l2 == NULL) 3540 l2 = pmap_l2(pmap, va); 3541 3542 /* 3543 * If promotion succeeds, then the next call to this function 3544 * should not be given the unmapped PTP as a hint. 3545 */ 3546 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 3547 mpte = NULL; 3548 } 3549 #endif 3550 3551 return (mpte); 3552 } 3553 3554 /* 3555 * This code maps large physical mmap regions into the 3556 * processor address space. Note that some shortcuts 3557 * are taken, but the code works. 3558 */ 3559 void 3560 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3561 vm_pindex_t pindex, vm_size_t size) 3562 { 3563 3564 VM_OBJECT_ASSERT_WLOCKED(object); 3565 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3566 ("pmap_object_init_pt: non-device object")); 3567 } 3568 3569 /* 3570 * Clear the wired attribute from the mappings for the specified range of 3571 * addresses in the given pmap. Every valid mapping within that range 3572 * must have the wired attribute set. In contrast, invalid mappings 3573 * cannot have the wired attribute set, so they are ignored. 3574 * 3575 * The wired attribute of the page table entry is not a hardware feature, 3576 * so there is no need to invalidate any TLB entries. 
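 * Wiring is recorded in the software-defined PTE_SW_WIRED bit, which the
 * hardware ignores.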
3577 */ 3578 void 3579 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3580 { 3581 vm_offset_t va_next; 3582 pd_entry_t *l0, *l1, *l2, l2e; 3583 pt_entry_t *l3, l3e; 3584 bool pv_lists_locked; 3585 3586 pv_lists_locked = false; 3587 retry: 3588 PMAP_LOCK(pmap); 3589 for (; sva < eva; sva = va_next) { 3590 if (pmap_mode == PMAP_MODE_SV48) { 3591 l0 = pmap_l0(pmap, sva); 3592 if (pmap_load(l0) == 0) { 3593 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3594 if (va_next < sva) 3595 va_next = eva; 3596 continue; 3597 } 3598 l1 = pmap_l0_to_l1(l0, sva); 3599 } else { 3600 l1 = pmap_l1(pmap, sva); 3601 } 3602 3603 if (pmap_load(l1) == 0) { 3604 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3605 if (va_next < sva) 3606 va_next = eva; 3607 continue; 3608 } 3609 3610 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3611 if (va_next < sva) 3612 va_next = eva; 3613 3614 l2 = pmap_l1_to_l2(l1, sva); 3615 if ((l2e = pmap_load(l2)) == 0) 3616 continue; 3617 if ((l2e & PTE_RWX) != 0) { 3618 if (sva + L2_SIZE == va_next && eva >= va_next) { 3619 if ((l2e & PTE_SW_WIRED) == 0) 3620 panic("pmap_unwire: l2 %#jx is missing " 3621 "PTE_SW_WIRED", (uintmax_t)l2e); 3622 pmap_clear_bits(l2, PTE_SW_WIRED); 3623 continue; 3624 } else { 3625 if (!pv_lists_locked) { 3626 pv_lists_locked = true; 3627 if (!rw_try_rlock(&pvh_global_lock)) { 3628 PMAP_UNLOCK(pmap); 3629 rw_rlock(&pvh_global_lock); 3630 /* Repeat sva. */ 3631 goto retry; 3632 } 3633 } 3634 if (!pmap_demote_l2(pmap, l2, sva)) 3635 panic("pmap_unwire: demotion failed"); 3636 } 3637 } 3638 3639 if (va_next > eva) 3640 va_next = eva; 3641 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3642 sva += L3_SIZE) { 3643 if ((l3e = pmap_load(l3)) == 0) 3644 continue; 3645 if ((l3e & PTE_SW_WIRED) == 0) 3646 panic("pmap_unwire: l3 %#jx is missing " 3647 "PTE_SW_WIRED", (uintmax_t)l3e); 3648 3649 /* 3650 * PG_W must be cleared atomically. Although the pmap 3651 * lock synchronizes access to PG_W, another processor 3652 * could be setting PG_M and/or PG_A concurrently. 3653 */ 3654 pmap_clear_bits(l3, PTE_SW_WIRED); 3655 pmap->pm_stats.wired_count--; 3656 } 3657 } 3658 if (pv_lists_locked) 3659 rw_runlock(&pvh_global_lock); 3660 PMAP_UNLOCK(pmap); 3661 } 3662 3663 /* 3664 * Copy the range specified by src_addr/len 3665 * from the source map to the range dst_addr/len 3666 * in the destination map. 3667 * 3668 * This routine is only advisory and need not do anything. 3669 */ 3670 3671 void 3672 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3673 vm_offset_t src_addr) 3674 { 3675 3676 } 3677 3678 /* 3679 * pmap_zero_page zeros the specified hardware page by mapping 3680 * the page into KVM and using bzero to clear its contents. 3681 */ 3682 void 3683 pmap_zero_page(vm_page_t m) 3684 { 3685 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3686 3687 pagezero((void *)va); 3688 } 3689 3690 /* 3691 * pmap_zero_page_area zeros the specified hardware page by mapping 3692 * the page into KVM and using bzero to clear its contents. 3693 * 3694 * off and size may not cover an area beyond a single hardware page. 
3695 */ 3696 void 3697 pmap_zero_page_area(vm_page_t m, int off, int size) 3698 { 3699 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3700 3701 if (off == 0 && size == PAGE_SIZE) 3702 pagezero((void *)va); 3703 else 3704 bzero((char *)va + off, size); 3705 } 3706 3707 /* 3708 * pmap_copy_page copies the specified (machine independent) 3709 * page by mapping the page into virtual memory and using 3710 * bcopy to copy the page, one machine dependent page at a 3711 * time. 3712 */ 3713 void 3714 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3715 { 3716 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3717 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3718 3719 pagecopy((void *)src, (void *)dst); 3720 } 3721 3722 int unmapped_buf_allowed = 1; 3723 3724 void 3725 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3726 vm_offset_t b_offset, int xfersize) 3727 { 3728 void *a_cp, *b_cp; 3729 vm_page_t m_a, m_b; 3730 vm_paddr_t p_a, p_b; 3731 vm_offset_t a_pg_offset, b_pg_offset; 3732 int cnt; 3733 3734 while (xfersize > 0) { 3735 a_pg_offset = a_offset & PAGE_MASK; 3736 m_a = ma[a_offset >> PAGE_SHIFT]; 3737 p_a = m_a->phys_addr; 3738 b_pg_offset = b_offset & PAGE_MASK; 3739 m_b = mb[b_offset >> PAGE_SHIFT]; 3740 p_b = m_b->phys_addr; 3741 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3742 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3743 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3744 panic("!DMAP a %lx", p_a); 3745 } else { 3746 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3747 } 3748 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3749 panic("!DMAP b %lx", p_b); 3750 } else { 3751 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3752 } 3753 bcopy(a_cp, b_cp, cnt); 3754 a_offset += cnt; 3755 b_offset += cnt; 3756 xfersize -= cnt; 3757 } 3758 } 3759 3760 vm_offset_t 3761 pmap_quick_enter_page(vm_page_t m) 3762 { 3763 3764 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3765 } 3766 3767 void 3768 pmap_quick_remove_page(vm_offset_t addr) 3769 { 3770 } 3771 3772 /* 3773 * Returns true if the pmap's pv is one of the first 3774 * 16 pvs linked to from this page. This count may 3775 * be changed upwards or downwards in the future; it 3776 * is only necessary that true be returned for a small 3777 * subset of pmaps for proper page aging. 3778 */ 3779 boolean_t 3780 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3781 { 3782 struct md_page *pvh; 3783 struct rwlock *lock; 3784 pv_entry_t pv; 3785 int loops = 0; 3786 boolean_t rv; 3787 3788 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3789 ("pmap_page_exists_quick: page %p is not managed", m)); 3790 rv = FALSE; 3791 rw_rlock(&pvh_global_lock); 3792 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3793 rw_rlock(lock); 3794 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3795 if (PV_PMAP(pv) == pmap) { 3796 rv = TRUE; 3797 break; 3798 } 3799 loops++; 3800 if (loops >= 16) 3801 break; 3802 } 3803 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3804 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3805 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3806 if (PV_PMAP(pv) == pmap) { 3807 rv = TRUE; 3808 break; 3809 } 3810 loops++; 3811 if (loops >= 16) 3812 break; 3813 } 3814 } 3815 rw_runlock(lock); 3816 rw_runlock(&pvh_global_lock); 3817 return (rv); 3818 } 3819 3820 /* 3821 * pmap_page_wired_mappings: 3822 * 3823 * Return the number of managed mappings to the given physical page 3824 * that are wired. 
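 *	Both 4KB mappings and 2MB superpage mappings of the page are
 *	counted.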
3825 */ 3826 int 3827 pmap_page_wired_mappings(vm_page_t m) 3828 { 3829 struct md_page *pvh; 3830 struct rwlock *lock; 3831 pmap_t pmap; 3832 pd_entry_t *l2; 3833 pt_entry_t *l3; 3834 pv_entry_t pv; 3835 int count, md_gen, pvh_gen; 3836 3837 if ((m->oflags & VPO_UNMANAGED) != 0) 3838 return (0); 3839 rw_rlock(&pvh_global_lock); 3840 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3841 rw_rlock(lock); 3842 restart: 3843 count = 0; 3844 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3845 pmap = PV_PMAP(pv); 3846 if (!PMAP_TRYLOCK(pmap)) { 3847 md_gen = m->md.pv_gen; 3848 rw_runlock(lock); 3849 PMAP_LOCK(pmap); 3850 rw_rlock(lock); 3851 if (md_gen != m->md.pv_gen) { 3852 PMAP_UNLOCK(pmap); 3853 goto restart; 3854 } 3855 } 3856 l2 = pmap_l2(pmap, pv->pv_va); 3857 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3858 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3859 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3860 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3861 count++; 3862 PMAP_UNLOCK(pmap); 3863 } 3864 if ((m->flags & PG_FICTITIOUS) == 0) { 3865 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3866 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3867 pmap = PV_PMAP(pv); 3868 if (!PMAP_TRYLOCK(pmap)) { 3869 md_gen = m->md.pv_gen; 3870 pvh_gen = pvh->pv_gen; 3871 rw_runlock(lock); 3872 PMAP_LOCK(pmap); 3873 rw_rlock(lock); 3874 if (md_gen != m->md.pv_gen || 3875 pvh_gen != pvh->pv_gen) { 3876 PMAP_UNLOCK(pmap); 3877 goto restart; 3878 } 3879 } 3880 l2 = pmap_l2(pmap, pv->pv_va); 3881 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3882 count++; 3883 PMAP_UNLOCK(pmap); 3884 } 3885 } 3886 rw_runlock(lock); 3887 rw_runlock(&pvh_global_lock); 3888 return (count); 3889 } 3890 3891 /* 3892 * Returns true if the given page is mapped individually or as part of 3893 * a 2mpage. Otherwise, returns false. 3894 */ 3895 bool 3896 pmap_page_is_mapped(vm_page_t m) 3897 { 3898 struct rwlock *lock; 3899 bool rv; 3900 3901 if ((m->oflags & VPO_UNMANAGED) != 0) 3902 return (false); 3903 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3904 rw_rlock(lock); 3905 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3906 ((m->flags & PG_FICTITIOUS) == 0 && 3907 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3908 rw_runlock(lock); 3909 return (rv); 3910 } 3911 3912 static void 3913 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3914 struct spglist *free, bool superpage) 3915 { 3916 struct md_page *pvh; 3917 vm_page_t mpte, mt; 3918 3919 if (superpage) { 3920 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3921 pvh = pa_to_pvh(m->phys_addr); 3922 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3923 pvh->pv_gen++; 3924 if (TAILQ_EMPTY(&pvh->pv_list)) { 3925 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3926 if (TAILQ_EMPTY(&mt->md.pv_list) && 3927 (mt->a.flags & PGA_WRITEABLE) != 0) 3928 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3929 } 3930 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3931 if (mpte != NULL) { 3932 KASSERT(vm_page_any_valid(mpte), 3933 ("pmap_remove_pages: pte page not promoted")); 3934 pmap_resident_count_dec(pmap, 1); 3935 KASSERT(mpte->ref_count == Ln_ENTRIES, 3936 ("pmap_remove_pages: pte page ref count error")); 3937 mpte->ref_count = 0; 3938 pmap_add_delayed_free_list(mpte, free, FALSE); 3939 } 3940 } else { 3941 pmap_resident_count_dec(pmap, 1); 3942 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3943 m->md.pv_gen++; 3944 if (TAILQ_EMPTY(&m->md.pv_list) && 3945 (m->a.flags & PGA_WRITEABLE) != 0) { 3946 pvh = pa_to_pvh(m->phys_addr); 3947 if (TAILQ_EMPTY(&pvh->pv_list)) 3948 vm_page_aflag_clear(m, PGA_WRITEABLE); 3949 } 3950 } 3951 } 3952 3953 /* 3954 * Destroy all 
managed, non-wired mappings in the given user-space 3955 * pmap. This pmap cannot be active on any processor besides the 3956 * caller. 3957 * 3958 * This function cannot be applied to the kernel pmap. Moreover, it 3959 * is not intended for general use. It is only to be used during 3960 * process termination. Consequently, it can be implemented in ways 3961 * that make it faster than pmap_remove(). First, it can more quickly 3962 * destroy mappings by iterating over the pmap's collection of PV 3963 * entries, rather than searching the page table. Second, it doesn't 3964 * have to test and clear the page table entries atomically, because 3965 * no processor is currently accessing the user address space. In 3966 * particular, a page table entry's dirty bit won't change state once 3967 * this function starts. 3968 */ 3969 void 3970 pmap_remove_pages(pmap_t pmap) 3971 { 3972 struct spglist free; 3973 pd_entry_t ptepde; 3974 pt_entry_t *pte, tpte; 3975 vm_page_t m, mt; 3976 pv_entry_t pv; 3977 struct pv_chunk *pc, *npc; 3978 struct rwlock *lock; 3979 int64_t bit; 3980 uint64_t inuse, bitmask; 3981 int allfree, field, freed __pv_stat_used, idx; 3982 bool superpage; 3983 3984 lock = NULL; 3985 3986 SLIST_INIT(&free); 3987 rw_rlock(&pvh_global_lock); 3988 PMAP_LOCK(pmap); 3989 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3990 allfree = 1; 3991 freed = 0; 3992 for (field = 0; field < _NPCM; field++) { 3993 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3994 while (inuse != 0) { 3995 bit = ffsl(inuse) - 1; 3996 bitmask = 1UL << bit; 3997 idx = field * 64 + bit; 3998 pv = &pc->pc_pventry[idx]; 3999 inuse &= ~bitmask; 4000 4001 pte = pmap_l1(pmap, pv->pv_va); 4002 ptepde = pmap_load(pte); 4003 pte = pmap_l1_to_l2(pte, pv->pv_va); 4004 tpte = pmap_load(pte); 4005 4006 KASSERT((tpte & PTE_V) != 0, 4007 ("L2 PTE is invalid... bogus PV entry? " 4008 "va=%#lx, pte=%#lx", pv->pv_va, tpte)); 4009 if ((tpte & PTE_RWX) != 0) { 4010 superpage = true; 4011 } else { 4012 ptepde = tpte; 4013 pte = pmap_l2_to_l3(pte, pv->pv_va); 4014 tpte = pmap_load(pte); 4015 superpage = false; 4016 } 4017 4018 /* 4019 * We cannot remove wired pages from a 4020 * process' mapping at this time. 4021 */ 4022 if (tpte & PTE_SW_WIRED) { 4023 allfree = 0; 4024 continue; 4025 } 4026 4027 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 4028 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4029 m < &vm_page_array[vm_page_array_size], 4030 ("pmap_remove_pages: bad pte %#jx", 4031 (uintmax_t)tpte)); 4032 4033 pmap_clear(pte); 4034 4035 /* 4036 * Update the vm_page_t clean/reference bits. 
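 * A mapping contributes to a page's dirty state only when it is both
 * writable (PTE_W) and dirty (PTE_D); for a 2MB superpage, each of the
 * Ln_ENTRIES constituent 4KB pages is dirtied.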
4037 */ 4038 if ((tpte & (PTE_D | PTE_W)) == 4039 (PTE_D | PTE_W)) { 4040 if (superpage) 4041 for (mt = m; 4042 mt < &m[Ln_ENTRIES]; mt++) 4043 vm_page_dirty(mt); 4044 else 4045 vm_page_dirty(m); 4046 } 4047 4048 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4049 4050 /* Mark free */ 4051 pc->pc_map[field] |= bitmask; 4052 4053 pmap_remove_pages_pv(pmap, m, pv, &free, 4054 superpage); 4055 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 4056 freed++; 4057 } 4058 } 4059 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4060 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4061 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4062 if (allfree) { 4063 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4064 free_pv_chunk(pc); 4065 } 4066 } 4067 if (lock != NULL) 4068 rw_wunlock(lock); 4069 pmap_invalidate_all(pmap); 4070 rw_runlock(&pvh_global_lock); 4071 PMAP_UNLOCK(pmap); 4072 vm_page_free_pages_toq(&free, false); 4073 } 4074 4075 static bool 4076 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 4077 { 4078 struct md_page *pvh; 4079 struct rwlock *lock; 4080 pd_entry_t *l2; 4081 pt_entry_t *l3, mask; 4082 pv_entry_t pv; 4083 pmap_t pmap; 4084 int md_gen, pvh_gen; 4085 bool rv; 4086 4087 mask = 0; 4088 if (modified) 4089 mask |= PTE_D; 4090 if (accessed) 4091 mask |= PTE_A; 4092 4093 rv = FALSE; 4094 rw_rlock(&pvh_global_lock); 4095 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4096 rw_rlock(lock); 4097 restart: 4098 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4099 pmap = PV_PMAP(pv); 4100 if (!PMAP_TRYLOCK(pmap)) { 4101 md_gen = m->md.pv_gen; 4102 rw_runlock(lock); 4103 PMAP_LOCK(pmap); 4104 rw_rlock(lock); 4105 if (md_gen != m->md.pv_gen) { 4106 PMAP_UNLOCK(pmap); 4107 goto restart; 4108 } 4109 } 4110 l2 = pmap_l2(pmap, pv->pv_va); 4111 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4112 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4113 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4114 rv = (pmap_load(l3) & mask) == mask; 4115 PMAP_UNLOCK(pmap); 4116 if (rv) 4117 goto out; 4118 } 4119 if ((m->flags & PG_FICTITIOUS) == 0) { 4120 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4121 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4122 pmap = PV_PMAP(pv); 4123 if (!PMAP_TRYLOCK(pmap)) { 4124 md_gen = m->md.pv_gen; 4125 pvh_gen = pvh->pv_gen; 4126 rw_runlock(lock); 4127 PMAP_LOCK(pmap); 4128 rw_rlock(lock); 4129 if (md_gen != m->md.pv_gen || 4130 pvh_gen != pvh->pv_gen) { 4131 PMAP_UNLOCK(pmap); 4132 goto restart; 4133 } 4134 } 4135 l2 = pmap_l2(pmap, pv->pv_va); 4136 rv = (pmap_load(l2) & mask) == mask; 4137 PMAP_UNLOCK(pmap); 4138 if (rv) 4139 goto out; 4140 } 4141 } 4142 out: 4143 rw_runlock(lock); 4144 rw_runlock(&pvh_global_lock); 4145 return (rv); 4146 } 4147 4148 /* 4149 * pmap_is_modified: 4150 * 4151 * Return whether or not the specified physical page was modified 4152 * in any physical maps. 4153 */ 4154 boolean_t 4155 pmap_is_modified(vm_page_t m) 4156 { 4157 4158 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4159 ("pmap_is_modified: page %p is not managed", m)); 4160 4161 /* 4162 * If the page is not busied then this check is racy. 4163 */ 4164 if (!pmap_page_is_write_mapped(m)) 4165 return (FALSE); 4166 return (pmap_page_test_mappings(m, FALSE, TRUE)); 4167 } 4168 4169 /* 4170 * pmap_is_prefaultable: 4171 * 4172 * Return whether or not the specified virtual address is eligible 4173 * for prefault. 
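 *
 * Concretely, an address is prefaultable only when the L3 page table
 * page backing it already exists but the L3 entry itself is invalid,
 * i.e. pmap_l3(pmap, addr) != NULL && pmap_load(l3) == 0, which is
 * exactly the test performed below.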
4174 */ 4175 boolean_t 4176 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4177 { 4178 pt_entry_t *l3; 4179 boolean_t rv; 4180 4181 /* 4182 * Return TRUE if and only if the L3 entry for the specified virtual 4183 * address is allocated but invalid. 4184 */ 4185 rv = FALSE; 4186 PMAP_LOCK(pmap); 4187 l3 = pmap_l3(pmap, addr); 4188 if (l3 != NULL && pmap_load(l3) == 0) { 4189 rv = TRUE; 4190 } 4191 PMAP_UNLOCK(pmap); 4192 return (rv); 4193 } 4194 4195 /* 4196 * pmap_is_referenced: 4197 * 4198 * Return whether or not the specified physical page was referenced 4199 * in any physical maps. 4200 */ 4201 boolean_t 4202 pmap_is_referenced(vm_page_t m) 4203 { 4204 4205 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4206 ("pmap_is_referenced: page %p is not managed", m)); 4207 return (pmap_page_test_mappings(m, TRUE, FALSE)); 4208 } 4209 4210 /* 4211 * Clear the write and modified bits in each of the given page's mappings. 4212 */ 4213 void 4214 pmap_remove_write(vm_page_t m) 4215 { 4216 struct md_page *pvh; 4217 struct rwlock *lock; 4218 pmap_t pmap; 4219 pd_entry_t *l2; 4220 pt_entry_t *l3, oldl3, newl3; 4221 pv_entry_t next_pv, pv; 4222 vm_offset_t va; 4223 int md_gen, pvh_gen; 4224 4225 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4226 ("pmap_remove_write: page %p is not managed", m)); 4227 vm_page_assert_busied(m); 4228 4229 if (!pmap_page_is_write_mapped(m)) 4230 return; 4231 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4232 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4233 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4234 rw_rlock(&pvh_global_lock); 4235 retry_pv_loop: 4236 rw_wlock(lock); 4237 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4238 pmap = PV_PMAP(pv); 4239 if (!PMAP_TRYLOCK(pmap)) { 4240 pvh_gen = pvh->pv_gen; 4241 rw_wunlock(lock); 4242 PMAP_LOCK(pmap); 4243 rw_wlock(lock); 4244 if (pvh_gen != pvh->pv_gen) { 4245 PMAP_UNLOCK(pmap); 4246 rw_wunlock(lock); 4247 goto retry_pv_loop; 4248 } 4249 } 4250 va = pv->pv_va; 4251 l2 = pmap_l2(pmap, va); 4252 if ((pmap_load(l2) & PTE_W) != 0) 4253 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 4254 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4255 ("inconsistent pv lock %p %p for page %p", 4256 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4257 PMAP_UNLOCK(pmap); 4258 } 4259 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4260 pmap = PV_PMAP(pv); 4261 if (!PMAP_TRYLOCK(pmap)) { 4262 pvh_gen = pvh->pv_gen; 4263 md_gen = m->md.pv_gen; 4264 rw_wunlock(lock); 4265 PMAP_LOCK(pmap); 4266 rw_wlock(lock); 4267 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4268 PMAP_UNLOCK(pmap); 4269 rw_wunlock(lock); 4270 goto retry_pv_loop; 4271 } 4272 } 4273 l2 = pmap_l2(pmap, pv->pv_va); 4274 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4275 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4276 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4277 oldl3 = pmap_load(l3); 4278 retry: 4279 if ((oldl3 & PTE_W) != 0) { 4280 newl3 = oldl3 & ~(PTE_D | PTE_W); 4281 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 4282 goto retry; 4283 if ((oldl3 & PTE_D) != 0) 4284 vm_page_dirty(m); 4285 pmap_invalidate_page(pmap, pv->pv_va); 4286 } 4287 PMAP_UNLOCK(pmap); 4288 } 4289 rw_wunlock(lock); 4290 vm_page_aflag_clear(m, PGA_WRITEABLE); 4291 rw_runlock(&pvh_global_lock); 4292 } 4293 4294 /* 4295 * pmap_ts_referenced: 4296 * 4297 * Return a count of reference bits for a page, clearing those bits. 4298 * It is not necessary for every reference bit to be cleared, but it 4299 * is necessary that 0 only be returned when there are truly no 4300 * reference bits set. 
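 *
 * On RISC-V the reference bit is PTE_A; both the page's 4KB mappings
 * and any 2MB superpage mapping that contains it are examined.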
4301 * 4302 * As an optimization, update the page's dirty field if a modified bit is 4303 * found while counting reference bits. This opportunistic update can be 4304 * performed at low cost and can eliminate the need for some future calls 4305 * to pmap_is_modified(). However, since this function stops after 4306 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4307 * dirty pages. Those dirty pages will only be detected by a future call 4308 * to pmap_is_modified(). 4309 */ 4310 int 4311 pmap_ts_referenced(vm_page_t m) 4312 { 4313 struct spglist free; 4314 struct md_page *pvh; 4315 struct rwlock *lock; 4316 pv_entry_t pv, pvf; 4317 pmap_t pmap; 4318 pd_entry_t *l2, l2e; 4319 pt_entry_t *l3, l3e; 4320 vm_paddr_t pa; 4321 vm_offset_t va; 4322 int cleared, md_gen, not_cleared, pvh_gen; 4323 4324 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4325 ("pmap_ts_referenced: page %p is not managed", m)); 4326 SLIST_INIT(&free); 4327 cleared = 0; 4328 pa = VM_PAGE_TO_PHYS(m); 4329 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 4330 4331 lock = PHYS_TO_PV_LIST_LOCK(pa); 4332 rw_rlock(&pvh_global_lock); 4333 rw_wlock(lock); 4334 retry: 4335 not_cleared = 0; 4336 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4337 goto small_mappings; 4338 pv = pvf; 4339 do { 4340 pmap = PV_PMAP(pv); 4341 if (!PMAP_TRYLOCK(pmap)) { 4342 pvh_gen = pvh->pv_gen; 4343 rw_wunlock(lock); 4344 PMAP_LOCK(pmap); 4345 rw_wlock(lock); 4346 if (pvh_gen != pvh->pv_gen) { 4347 PMAP_UNLOCK(pmap); 4348 goto retry; 4349 } 4350 } 4351 va = pv->pv_va; 4352 l2 = pmap_l2(pmap, va); 4353 l2e = pmap_load(l2); 4354 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4355 /* 4356 * Although l2e is mapping a 2MB page, because 4357 * this function is called at a 4KB page granularity, 4358 * we only update the 4KB page under test. 4359 */ 4360 vm_page_dirty(m); 4361 } 4362 if ((l2e & PTE_A) != 0) { 4363 /* 4364 * Since this reference bit is shared by 512 4KB 4365 * pages, it should not be cleared every time it is 4366 * tested. Apply a simple "hash" function on the 4367 * physical page number, the virtual superpage number, 4368 * and the pmap address to select one 4KB page out of 4369 * the 512 on which testing the reference bit will 4370 * result in clearing that reference bit. This 4371 * function is designed to avoid the selection of the 4372 * same 4KB page for every 2MB page mapping. 4373 * 4374 * On demotion, a mapping that hasn't been referenced 4375 * is simply destroyed. To avoid the possibility of a 4376 * subsequent page fault on a demoted wired mapping, 4377 * always leave its reference bit set. Moreover, 4378 * since the superpage is wired, the current state of 4379 * its reference bit won't affect page replacement. 4380 */ 4381 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4382 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4383 (l2e & PTE_SW_WIRED) == 0) { 4384 pmap_clear_bits(l2, PTE_A); 4385 pmap_invalidate_page(pmap, va); 4386 cleared++; 4387 } else 4388 not_cleared++; 4389 } 4390 PMAP_UNLOCK(pmap); 4391 /* Rotate the PV list if it has more than one entry. 
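 * Moving the entry just examined to the tail means that successive
 * scans do not always start with the same mapping, so reference bits
 * are sampled more evenly across all of the page's mappings.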
*/ 4392 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4393 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4394 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4395 pvh->pv_gen++; 4396 } 4397 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4398 goto out; 4399 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4400 small_mappings: 4401 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4402 goto out; 4403 pv = pvf; 4404 do { 4405 pmap = PV_PMAP(pv); 4406 if (!PMAP_TRYLOCK(pmap)) { 4407 pvh_gen = pvh->pv_gen; 4408 md_gen = m->md.pv_gen; 4409 rw_wunlock(lock); 4410 PMAP_LOCK(pmap); 4411 rw_wlock(lock); 4412 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4413 PMAP_UNLOCK(pmap); 4414 goto retry; 4415 } 4416 } 4417 l2 = pmap_l2(pmap, pv->pv_va); 4418 4419 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4420 ("pmap_ts_referenced: found an invalid l2 table")); 4421 4422 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4423 l3e = pmap_load(l3); 4424 if ((l3e & PTE_D) != 0) 4425 vm_page_dirty(m); 4426 if ((l3e & PTE_A) != 0) { 4427 if ((l3e & PTE_SW_WIRED) == 0) { 4428 /* 4429 * Wired pages cannot be paged out so 4430 * doing accessed bit emulation for 4431 * them is wasted effort. We do the 4432 * hard work for unwired pages only. 4433 */ 4434 pmap_clear_bits(l3, PTE_A); 4435 pmap_invalidate_page(pmap, pv->pv_va); 4436 cleared++; 4437 } else 4438 not_cleared++; 4439 } 4440 PMAP_UNLOCK(pmap); 4441 /* Rotate the PV list if it has more than one entry. */ 4442 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4443 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4444 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4445 m->md.pv_gen++; 4446 } 4447 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4448 not_cleared < PMAP_TS_REFERENCED_MAX); 4449 out: 4450 rw_wunlock(lock); 4451 rw_runlock(&pvh_global_lock); 4452 vm_page_free_pages_toq(&free, false); 4453 return (cleared + not_cleared); 4454 } 4455 4456 /* 4457 * Apply the given advice to the specified range of addresses within the 4458 * given pmap. Depending on the advice, clear the referenced and/or 4459 * modified flags in each mapping and set the mapped page's dirty field. 4460 */ 4461 void 4462 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4463 { 4464 } 4465 4466 /* 4467 * Clear the modify bits on the specified physical page. 4468 */ 4469 void 4470 pmap_clear_modify(vm_page_t m) 4471 { 4472 struct md_page *pvh; 4473 struct rwlock *lock; 4474 pmap_t pmap; 4475 pv_entry_t next_pv, pv; 4476 pd_entry_t *l2, oldl2; 4477 pt_entry_t *l3; 4478 vm_offset_t va; 4479 int md_gen, pvh_gen; 4480 4481 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4482 ("pmap_clear_modify: page %p is not managed", m)); 4483 vm_page_assert_busied(m); 4484 4485 if (!pmap_page_is_write_mapped(m)) 4486 return; 4487 4488 /* 4489 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4490 * If the object containing the page is locked and the page is not 4491 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4492 */ 4493 if ((m->a.flags & PGA_WRITEABLE) == 0) 4494 return; 4495 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 4496 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4497 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4498 rw_rlock(&pvh_global_lock); 4499 rw_wlock(lock); 4500 restart: 4501 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4502 pmap = PV_PMAP(pv); 4503 if (!PMAP_TRYLOCK(pmap)) { 4504 pvh_gen = pvh->pv_gen; 4505 rw_wunlock(lock); 4506 PMAP_LOCK(pmap); 4507 rw_wlock(lock); 4508 if (pvh_gen != pvh->pv_gen) { 4509 PMAP_UNLOCK(pmap); 4510 goto restart; 4511 } 4512 } 4513 va = pv->pv_va; 4514 l2 = pmap_l2(pmap, va); 4515 oldl2 = pmap_load(l2); 4516 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4517 if ((oldl2 & PTE_W) != 0 && 4518 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4519 (oldl2 & PTE_SW_WIRED) == 0) { 4520 /* 4521 * Write protect the mapping to a single page so that 4522 * a subsequent write access may repromote. 4523 */ 4524 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4525 l3 = pmap_l2_to_l3(l2, va); 4526 pmap_clear_bits(l3, PTE_D | PTE_W); 4527 vm_page_dirty(m); 4528 pmap_invalidate_page(pmap, va); 4529 } 4530 PMAP_UNLOCK(pmap); 4531 } 4532 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4533 pmap = PV_PMAP(pv); 4534 if (!PMAP_TRYLOCK(pmap)) { 4535 md_gen = m->md.pv_gen; 4536 pvh_gen = pvh->pv_gen; 4537 rw_wunlock(lock); 4538 PMAP_LOCK(pmap); 4539 rw_wlock(lock); 4540 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4541 PMAP_UNLOCK(pmap); 4542 goto restart; 4543 } 4544 } 4545 l2 = pmap_l2(pmap, pv->pv_va); 4546 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4547 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4548 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4549 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4550 pmap_clear_bits(l3, PTE_D | PTE_W); 4551 pmap_invalidate_page(pmap, pv->pv_va); 4552 } 4553 PMAP_UNLOCK(pmap); 4554 } 4555 rw_wunlock(lock); 4556 rw_runlock(&pvh_global_lock); 4557 } 4558 4559 void * 4560 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4561 { 4562 4563 return ((void *)PHYS_TO_DMAP(pa)); 4564 } 4565 4566 void 4567 pmap_unmapbios(void *p, vm_size_t size) 4568 { 4569 } 4570 4571 /* 4572 * Sets the memory attribute for the specified page. 4573 */ 4574 void 4575 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4576 { 4577 4578 m->md.pv_memattr = ma; 4579 4580 /* 4581 * If "m" is a normal page, update its direct mapping. This update 4582 * can be relied upon to perform any cache operations that are 4583 * required for data coherence. 4584 */ 4585 if ((m->flags & PG_FICTITIOUS) == 0 && 4586 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4587 m->md.pv_memattr) != 0) 4588 panic("memory attribute change on the direct map failed"); 4589 } 4590 4591 /* 4592 * Changes the specified virtual address range's memory type to that given by 4593 * the parameter "mode". The specified virtual address range must be 4594 * completely contained within either the direct map or the kernel map. 4595 * 4596 * Returns zero if the change completed successfully, and either EINVAL or 4597 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4598 * of the virtual address range was not mapped, and ENOMEM is returned if 4599 * there was insufficient memory available to complete the change. In the 4600 * latter case, the memory type may have been changed on some part of the 4601 * virtual address range. 
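 *
 * Until the Svpbmt extension is supported (see the TODOs in
 * pmap_change_attr_locked() below), the change itself is effectively a
 * no-op: the locked helper only walks the range to verify that it is
 * completely mapped, returning 0 on success and EINVAL otherwise.
 * pmap_page_set_memattr() above relies on this path when it updates a
 * page's direct map entry.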
4602 */ 4603 int 4604 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4605 { 4606 int error; 4607 4608 PMAP_LOCK(kernel_pmap); 4609 error = pmap_change_attr_locked(va, size, mode); 4610 PMAP_UNLOCK(kernel_pmap); 4611 return (error); 4612 } 4613 4614 static int 4615 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4616 { 4617 vm_offset_t base, offset, tmpva; 4618 pd_entry_t *l1, l1e; 4619 pd_entry_t *l2, l2e; 4620 pt_entry_t *l3, l3e; 4621 4622 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4623 base = trunc_page(va); 4624 offset = va & PAGE_MASK; 4625 size = round_page(offset + size); 4626 4627 if (!VIRT_IN_DMAP(base) && 4628 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4629 return (EINVAL); 4630 4631 for (tmpva = base; tmpva < base + size; ) { 4632 l1 = pmap_l1(kernel_pmap, tmpva); 4633 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4634 return (EINVAL); 4635 if ((l1e & PTE_RWX) != 0) { 4636 /* 4637 * TODO: Demote if attributes don't match and there 4638 * isn't an L1 page left in the range, and update the 4639 * L1 entry if the attributes don't match but there is 4640 * an L1 page left in the range, once we support the 4641 * upcoming Svpbmt extension. 4642 */ 4643 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4644 continue; 4645 } 4646 l2 = pmap_l1_to_l2(l1, tmpva); 4647 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 4648 return (EINVAL); 4649 if ((l2e & PTE_RWX) != 0) { 4650 /* 4651 * TODO: Demote if attributes don't match and there 4652 * isn't an L2 page left in the range, and update the 4653 * L2 entry if the attributes don't match but there is 4654 * an L2 page left in the range, once we support the 4655 * upcoming Svpbmt extension. 4656 */ 4657 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4658 continue; 4659 } 4660 l3 = pmap_l2_to_l3(l2, tmpva); 4661 if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0) 4662 return (EINVAL); 4663 /* 4664 * TODO: Update the L3 entry if the attributes don't match once 4665 * we support the upcoming Svpbmt extension. 4666 */ 4667 tmpva += PAGE_SIZE; 4668 } 4669 4670 return (0); 4671 } 4672 4673 /* 4674 * Perform the pmap work for mincore(2). If the page is not both referenced and 4675 * modified by this pmap, returns its physical address so that the caller can 4676 * find other mappings. 
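 *
 * The returned value is derived from the leaf PTE: MINCORE_INCORE for
 * any valid mapping, plus MINCORE_PSIND(1) when the address is covered
 * by a 2MB superpage, MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER when
 * PTE_D is set, and MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER when
 * PTE_A is set.  *pap is written only for managed mappings that do not
 * already have both "_OTHER" bits.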
4677 */ 4678 int 4679 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4680 { 4681 pt_entry_t *l2, *l3, tpte; 4682 vm_paddr_t pa; 4683 int val; 4684 bool managed; 4685 4686 PMAP_LOCK(pmap); 4687 l2 = pmap_l2(pmap, addr); 4688 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4689 if ((tpte & PTE_RWX) != 0) { 4690 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4691 val = MINCORE_INCORE | MINCORE_PSIND(1); 4692 } else { 4693 l3 = pmap_l2_to_l3(l2, addr); 4694 tpte = pmap_load(l3); 4695 if ((tpte & PTE_V) == 0) { 4696 PMAP_UNLOCK(pmap); 4697 return (0); 4698 } 4699 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4700 val = MINCORE_INCORE; 4701 } 4702 4703 if ((tpte & PTE_D) != 0) 4704 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4705 if ((tpte & PTE_A) != 0) 4706 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4707 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4708 } else { 4709 managed = false; 4710 val = 0; 4711 } 4712 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4713 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4714 *pap = pa; 4715 } 4716 PMAP_UNLOCK(pmap); 4717 return (val); 4718 } 4719 4720 void 4721 pmap_activate_sw(struct thread *td) 4722 { 4723 pmap_t oldpmap, pmap; 4724 u_int hart; 4725 4726 oldpmap = PCPU_GET(curpmap); 4727 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4728 if (pmap == oldpmap) 4729 return; 4730 csr_write(satp, pmap->pm_satp); 4731 4732 hart = PCPU_GET(hart); 4733 #ifdef SMP 4734 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4735 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4736 #else 4737 CPU_SET(hart, &pmap->pm_active); 4738 CPU_CLR(hart, &oldpmap->pm_active); 4739 #endif 4740 PCPU_SET(curpmap, pmap); 4741 4742 sfence_vma(); 4743 } 4744 4745 void 4746 pmap_activate(struct thread *td) 4747 { 4748 4749 critical_enter(); 4750 pmap_activate_sw(td); 4751 critical_exit(); 4752 } 4753 4754 void 4755 pmap_activate_boot(pmap_t pmap) 4756 { 4757 u_int hart; 4758 4759 hart = PCPU_GET(hart); 4760 #ifdef SMP 4761 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4762 #else 4763 CPU_SET(hart, &pmap->pm_active); 4764 #endif 4765 PCPU_SET(curpmap, pmap); 4766 } 4767 4768 void 4769 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 4770 { 4771 *res = pmap->pm_active; 4772 } 4773 4774 void 4775 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4776 { 4777 cpuset_t mask; 4778 4779 /* 4780 * From the RISC-V User-Level ISA V2.2: 4781 * 4782 * "To make a store to instruction memory visible to all 4783 * RISC-V harts, the writing hart has to execute a data FENCE 4784 * before requesting that all remote RISC-V harts execute a 4785 * FENCE.I." 4786 * 4787 * However, this is slightly misleading; we still need to 4788 * perform a FENCE.I for the local hart, as FENCE does nothing 4789 * for its icache. FENCE.I alone is also sufficient for the 4790 * local hart. 4791 */ 4792 sched_pin(); 4793 mask = all_harts; 4794 CPU_CLR(PCPU_GET(hart), &mask); 4795 fence_i(); 4796 if (!CPU_EMPTY(&mask) && smp_started) { 4797 fence(); 4798 sbi_remote_fence_i(mask.__bits); 4799 } 4800 sched_unpin(); 4801 } 4802 4803 /* 4804 * Increase the starting virtual address of the given mapping if a 4805 * different alignment might result in more superpage mappings. 
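 *
 * The adjustment makes "*addr" congruent to "offset" modulo L2_SIZE so
 * that 4KB mappings of the object can later be promoted to 2MB
 * superpages.  For example, if (offset & L2_OFFSET) == 0x1000 and *addr
 * is currently 2MB aligned, then *addr is advanced by 0x1000, assuming
 * the mapping is large enough to contain at least one full superpage
 * after the adjustment.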
4806 */ 4807 void 4808 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4809 vm_offset_t *addr, vm_size_t size) 4810 { 4811 vm_offset_t superpage_offset; 4812 4813 if (size < L2_SIZE) 4814 return; 4815 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4816 offset += ptoa(object->pg_color); 4817 superpage_offset = offset & L2_OFFSET; 4818 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4819 (*addr & L2_OFFSET) == superpage_offset) 4820 return; 4821 if ((*addr & L2_OFFSET) < superpage_offset) 4822 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4823 else 4824 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4825 } 4826 4827 /** 4828 * Get the kernel virtual address of a set of physical pages. If there are 4829 * physical addresses not covered by the DMAP perform a transient mapping 4830 * that will be removed when calling pmap_unmap_io_transient. 4831 * 4832 * \param page The pages the caller wishes to obtain the virtual 4833 * address on the kernel memory map. 4834 * \param vaddr On return contains the kernel virtual memory address 4835 * of the pages passed in the page parameter. 4836 * \param count Number of pages passed in. 4837 * \param can_fault true if the thread using the mapped pages can take 4838 * page faults, false otherwise. 4839 * 4840 * \returns true if the caller must call pmap_unmap_io_transient when 4841 * finished or false otherwise. 4842 * 4843 */ 4844 bool 4845 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4846 bool can_fault) 4847 { 4848 vm_paddr_t paddr; 4849 bool needs_mapping; 4850 int error __diagused, i; 4851 4852 /* 4853 * Allocate any KVA space that we need, this is done in a separate 4854 * loop to prevent calling vmem_alloc while pinned. 4855 */ 4856 needs_mapping = false; 4857 for (i = 0; i < count; i++) { 4858 paddr = VM_PAGE_TO_PHYS(page[i]); 4859 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4860 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4861 M_BESTFIT | M_WAITOK, &vaddr[i]); 4862 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4863 needs_mapping = true; 4864 } else { 4865 vaddr[i] = PHYS_TO_DMAP(paddr); 4866 } 4867 } 4868 4869 /* Exit early if everything is covered by the DMAP */ 4870 if (!needs_mapping) 4871 return (false); 4872 4873 if (!can_fault) 4874 sched_pin(); 4875 for (i = 0; i < count; i++) { 4876 paddr = VM_PAGE_TO_PHYS(page[i]); 4877 if (paddr >= DMAP_MAX_PHYSADDR) { 4878 panic( 4879 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4880 } 4881 } 4882 4883 return (needs_mapping); 4884 } 4885 4886 void 4887 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4888 bool can_fault) 4889 { 4890 vm_paddr_t paddr; 4891 int i; 4892 4893 if (!can_fault) 4894 sched_unpin(); 4895 for (i = 0; i < count; i++) { 4896 paddr = VM_PAGE_TO_PHYS(page[i]); 4897 if (paddr >= DMAP_MAX_PHYSADDR) { 4898 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4899 } 4900 } 4901 } 4902 4903 boolean_t 4904 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4905 { 4906 4907 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4908 } 4909 4910 bool 4911 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4912 pt_entry_t **l3) 4913 { 4914 pd_entry_t *l1p, *l2p; 4915 4916 /* Get l1 directory entry. 
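 * (A readable or executable entry at this level or the next is a leaf:
 * the walk stops there and the output pointers for the deeper levels
 * are returned as NULL.)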
*/ 4917 l1p = pmap_l1(pmap, va); 4918 *l1 = l1p; 4919 4920 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4921 return (false); 4922 4923 if ((pmap_load(l1p) & PTE_RX) != 0) { 4924 *l2 = NULL; 4925 *l3 = NULL; 4926 return (true); 4927 } 4928 4929 /* Get l2 directory entry. */ 4930 l2p = pmap_l1_to_l2(l1p, va); 4931 *l2 = l2p; 4932 4933 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4934 return (false); 4935 4936 if ((pmap_load(l2p) & PTE_RX) != 0) { 4937 *l3 = NULL; 4938 return (true); 4939 } 4940 4941 /* Get l3 page table entry. */ 4942 *l3 = pmap_l2_to_l3(l2p, va); 4943 4944 return (true); 4945 } 4946 4947 /* 4948 * Track a range of the kernel's virtual address space that is contiguous 4949 * in various mapping attributes. 4950 */ 4951 struct pmap_kernel_map_range { 4952 vm_offset_t sva; 4953 pt_entry_t attrs; 4954 int l3pages; 4955 int l2pages; 4956 int l1pages; 4957 }; 4958 4959 static void 4960 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4961 vm_offset_t eva) 4962 { 4963 4964 if (eva <= range->sva) 4965 return; 4966 4967 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4968 range->sva, eva, 4969 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4970 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4971 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4972 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4973 range->l1pages, range->l2pages, range->l3pages); 4974 4975 /* Reset to sentinel value. */ 4976 range->sva = 0xfffffffffffffffful; 4977 } 4978 4979 /* 4980 * Determine whether the attributes specified by a page table entry match those 4981 * being tracked by the current range. 4982 */ 4983 static bool 4984 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4985 { 4986 4987 return (range->attrs == attrs); 4988 } 4989 4990 static void 4991 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4992 pt_entry_t attrs) 4993 { 4994 4995 memset(range, 0, sizeof(*range)); 4996 range->sva = va; 4997 range->attrs = attrs; 4998 } 4999 5000 /* 5001 * Given a leaf PTE, derive the mapping's attributes. If they do not match 5002 * those of the current run, dump the address range and its attributes, and 5003 * begin a new run. 5004 */ 5005 static void 5006 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 5007 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 5008 { 5009 pt_entry_t attrs; 5010 5011 /* The PTE global bit is inherited by lower levels. */ 5012 attrs = l1e & PTE_G; 5013 if ((l1e & PTE_RWX) != 0) 5014 attrs |= l1e & (PTE_RWX | PTE_U); 5015 else if (l2e != 0) 5016 attrs |= l2e & PTE_G; 5017 if ((l2e & PTE_RWX) != 0) 5018 attrs |= l2e & (PTE_RWX | PTE_U); 5019 else if (l3e != 0) 5020 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 5021 5022 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 5023 sysctl_kmaps_dump(sb, range, va); 5024 sysctl_kmaps_reinit(range, va, attrs); 5025 } 5026 } 5027 5028 static int 5029 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 5030 { 5031 struct pmap_kernel_map_range range; 5032 struct sbuf sbuf, *sb; 5033 pd_entry_t l1e, *l2, l2e; 5034 pt_entry_t *l3, l3e; 5035 vm_offset_t sva; 5036 vm_paddr_t pa; 5037 int error, i, j, k; 5038 5039 error = sysctl_wire_old_buffer(req, 0); 5040 if (error != 0) 5041 return (error); 5042 sb = &sbuf; 5043 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 5044 5045 /* Sentinel value. */ 5046 range.sva = 0xfffffffffffffffful; 5047 5048 /* 5049 * Iterate over the kernel page tables without holding the kernel pmap 5050 * lock. 
Kernel page table pages are never freed, so at worst we will 5051 * observe inconsistencies in the output. 5052 */ 5053 sva = VM_MIN_KERNEL_ADDRESS; 5054 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 5055 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 5056 sbuf_printf(sb, "\nDirect map:\n"); 5057 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 5058 sbuf_printf(sb, "\nKernel map:\n"); 5059 5060 l1e = kernel_pmap->pm_top[i]; 5061 if ((l1e & PTE_V) == 0) { 5062 sysctl_kmaps_dump(sb, &range, sva); 5063 sva += L1_SIZE; 5064 continue; 5065 } 5066 if ((l1e & PTE_RWX) != 0) { 5067 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 5068 range.l1pages++; 5069 sva += L1_SIZE; 5070 continue; 5071 } 5072 pa = PTE_TO_PHYS(l1e); 5073 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5074 5075 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 5076 l2e = l2[j]; 5077 if ((l2e & PTE_V) == 0) { 5078 sysctl_kmaps_dump(sb, &range, sva); 5079 sva += L2_SIZE; 5080 continue; 5081 } 5082 if ((l2e & PTE_RWX) != 0) { 5083 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 5084 range.l2pages++; 5085 sva += L2_SIZE; 5086 continue; 5087 } 5088 pa = PTE_TO_PHYS(l2e); 5089 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5090 5091 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 5092 sva += L3_SIZE) { 5093 l3e = l3[k]; 5094 if ((l3e & PTE_V) == 0) { 5095 sysctl_kmaps_dump(sb, &range, sva); 5096 continue; 5097 } 5098 sysctl_kmaps_check(sb, &range, sva, 5099 l1e, l2e, l3e); 5100 range.l3pages++; 5101 } 5102 } 5103 } 5104 5105 error = sbuf_finish(sb); 5106 sbuf_delete(sb); 5107 return (error); 5108 } 5109 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 5110 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 5111 NULL, 0, sysctl_kmaps, "A", 5112 "Dump kernel address layout"); 5113
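
/*
 * Usage note: because the OID above is created with CTLFLAG_SKIP it is
 * omitted from sysctl listings, but it can still be queried by name, e.g.:
 *
 *	sysctl vm.pmap.kernel_maps
 *
 * The output contains one line per contiguous run of identical mapping
 * attributes under the "Direct map:" and "Kernel map:" headings, each line
 * ending with the number of 1GB (L1), 2MB (L2), and 4KB (L3) pages backing
 * that run.
 */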