1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * the Systems Programming Group of the University of Utah Computer 17 * Science Department and William Jolitz of UUNET Technologies Inc. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 3. All advertising materials mentioning features or use of this software 28 * must display the following acknowledgement: 29 * This product includes software developed by the University of 30 * California, Berkeley and its contributors. 31 * 4. Neither the name of the University nor the names of its contributors 32 * may be used to endorse or promote products derived from this software 33 * without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 45 * SUCH DAMAGE. 46 * 47 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 48 */ 49 /*- 50 * Copyright (c) 2003 Networks Associates Technology, Inc. 51 * Copyright (c) 2014-2020 The FreeBSD Foundation 52 * All rights reserved. 53 * 54 * This software was developed for the FreeBSD Project by Jake Burkholder, 55 * Safeport Network Services, and Network Associates Laboratories, the 56 * Security Research Division of Network Associates, Inc. under 57 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 58 * CHATS research program. 59 * 60 * Portions of this software were developed by 61 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from 62 * the FreeBSD Foundation. 63 * 64 * Redistribution and use in source and binary forms, with or without 65 * modification, are permitted provided that the following conditions 66 * are met: 67 * 1. Redistributions of source code must retain the above copyright 68 * notice, this list of conditions and the following disclaimer. 69 * 2. 
Redistributions in binary form must reproduce the above copyright 70 * notice, this list of conditions and the following disclaimer in the 71 * documentation and/or other materials provided with the distribution. 72 * 73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 83 * SUCH DAMAGE. 84 */ 85 86 #define AMD64_NPT_AWARE 87 88 #include <sys/cdefs.h> 89 __FBSDID("$FreeBSD$"); 90 91 /* 92 * Manages physical address maps. 93 * 94 * Since the information managed by this module is 95 * also stored by the logical address mapping module, 96 * this module may throw away valid virtual-to-physical 97 * mappings at almost any time. However, invalidations 98 * of virtual-to-physical mappings must be done as 99 * requested. 100 * 101 * In order to cope with hardware architectures which 102 * make virtual-to-physical map invalidates expensive, 103 * this module may delay invalidate or reduced protection 104 * operations until such time as they are actually 105 * necessary. This module is given full information as 106 * to which processors are currently using which maps, 107 * and to when physical maps must be made correct. 
108 */ 109 110 #include "opt_ddb.h" 111 #include "opt_pmap.h" 112 #include "opt_vm.h" 113 114 #include <sys/param.h> 115 #include <sys/asan.h> 116 #include <sys/bitstring.h> 117 #include <sys/bus.h> 118 #include <sys/systm.h> 119 #include <sys/counter.h> 120 #include <sys/kernel.h> 121 #include <sys/ktr.h> 122 #include <sys/lock.h> 123 #include <sys/malloc.h> 124 #include <sys/mman.h> 125 #include <sys/msan.h> 126 #include <sys/mutex.h> 127 #include <sys/proc.h> 128 #include <sys/rangeset.h> 129 #include <sys/rwlock.h> 130 #include <sys/sbuf.h> 131 #include <sys/smr.h> 132 #include <sys/sx.h> 133 #include <sys/turnstile.h> 134 #include <sys/vmem.h> 135 #include <sys/vmmeter.h> 136 #include <sys/sched.h> 137 #include <sys/sysctl.h> 138 #include <sys/smp.h> 139 #ifdef DDB 140 #include <sys/kdb.h> 141 #include <ddb/ddb.h> 142 #endif 143 144 #include <vm/vm.h> 145 #include <vm/vm_param.h> 146 #include <vm/vm_kern.h> 147 #include <vm/vm_page.h> 148 #include <vm/vm_map.h> 149 #include <vm/vm_object.h> 150 #include <vm/vm_extern.h> 151 #include <vm/vm_pageout.h> 152 #include <vm/vm_pager.h> 153 #include <vm/vm_phys.h> 154 #include <vm/vm_radix.h> 155 #include <vm/vm_reserv.h> 156 #include <vm/vm_dumpset.h> 157 #include <vm/uma.h> 158 159 #include <machine/asan.h> 160 #include <machine/intr_machdep.h> 161 #include <x86/apicvar.h> 162 #include <x86/ifunc.h> 163 #include <machine/cpu.h> 164 #include <machine/cputypes.h> 165 #include <machine/md_var.h> 166 #include <machine/msan.h> 167 #include <machine/pcb.h> 168 #include <machine/specialreg.h> 169 #ifdef SMP 170 #include <machine/smp.h> 171 #endif 172 #include <machine/sysarch.h> 173 #include <machine/tss.h> 174 175 #ifdef NUMA 176 #define PMAP_MEMDOM MAXMEMDOM 177 #else 178 #define PMAP_MEMDOM 1 179 #endif 180 181 static __inline boolean_t 182 pmap_type_guest(pmap_t pmap) 183 { 184 185 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 186 } 187 188 static __inline boolean_t 189 pmap_emulate_ad_bits(pmap_t pmap) 190 { 191 192 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 193 } 194 195 static __inline pt_entry_t 196 pmap_valid_bit(pmap_t pmap) 197 { 198 pt_entry_t mask; 199 200 switch (pmap->pm_type) { 201 case PT_X86: 202 case PT_RVI: 203 mask = X86_PG_V; 204 break; 205 case PT_EPT: 206 if (pmap_emulate_ad_bits(pmap)) 207 mask = EPT_PG_EMUL_V; 208 else 209 mask = EPT_PG_READ; 210 break; 211 default: 212 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 213 } 214 215 return (mask); 216 } 217 218 static __inline pt_entry_t 219 pmap_rw_bit(pmap_t pmap) 220 { 221 pt_entry_t mask; 222 223 switch (pmap->pm_type) { 224 case PT_X86: 225 case PT_RVI: 226 mask = X86_PG_RW; 227 break; 228 case PT_EPT: 229 if (pmap_emulate_ad_bits(pmap)) 230 mask = EPT_PG_EMUL_RW; 231 else 232 mask = EPT_PG_WRITE; 233 break; 234 default: 235 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 236 } 237 238 return (mask); 239 } 240 241 static pt_entry_t pg_g; 242 243 static __inline pt_entry_t 244 pmap_global_bit(pmap_t pmap) 245 { 246 pt_entry_t mask; 247 248 switch (pmap->pm_type) { 249 case PT_X86: 250 mask = pg_g; 251 break; 252 case PT_RVI: 253 case PT_EPT: 254 mask = 0; 255 break; 256 default: 257 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 258 } 259 260 return (mask); 261 } 262 263 static __inline pt_entry_t 264 pmap_accessed_bit(pmap_t pmap) 265 { 266 pt_entry_t mask; 267 268 switch (pmap->pm_type) { 269 case PT_X86: 270 case PT_RVI: 271 mask = X86_PG_A; 272 break; 273 case PT_EPT: 274 if 
(pmap_emulate_ad_bits(pmap)) 275 mask = EPT_PG_READ; 276 else 277 mask = EPT_PG_A; 278 break; 279 default: 280 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 281 } 282 283 return (mask); 284 } 285 286 static __inline pt_entry_t 287 pmap_modified_bit(pmap_t pmap) 288 { 289 pt_entry_t mask; 290 291 switch (pmap->pm_type) { 292 case PT_X86: 293 case PT_RVI: 294 mask = X86_PG_M; 295 break; 296 case PT_EPT: 297 if (pmap_emulate_ad_bits(pmap)) 298 mask = EPT_PG_WRITE; 299 else 300 mask = EPT_PG_M; 301 break; 302 default: 303 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 304 } 305 306 return (mask); 307 } 308 309 static __inline pt_entry_t 310 pmap_pku_mask_bit(pmap_t pmap) 311 { 312 313 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); 314 } 315 316 #if !defined(DIAGNOSTIC) 317 #ifdef __GNUC_GNU_INLINE__ 318 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 319 #else 320 #define PMAP_INLINE extern inline 321 #endif 322 #else 323 #define PMAP_INLINE 324 #endif 325 326 #ifdef PV_STATS 327 #define PV_STAT(x) do { x ; } while (0) 328 #else 329 #define PV_STAT(x) do { } while (0) 330 #endif 331 332 #undef pa_index 333 #ifdef NUMA 334 #define pa_index(pa) ({ \ 335 KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ 336 ("address %lx beyond the last segment", (pa))); \ 337 (pa) >> PDRSHIFT; \ 338 }) 339 #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) 340 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 341 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 342 struct rwlock *_lock; \ 343 if (__predict_false((pa) > pmap_last_pa)) \ 344 _lock = &pv_dummy_large.pv_lock; \ 345 else \ 346 _lock = &(pa_to_pmdp(pa)->pv_lock); \ 347 _lock; \ 348 }) 349 #else 350 #define pa_index(pa) ((pa) >> PDRSHIFT) 351 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 352 353 #define NPV_LIST_LOCKS MAXCPU 354 355 #define PHYS_TO_PV_LIST_LOCK(pa) \ 356 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 357 #endif 358 359 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 360 struct rwlock **_lockp = (lockp); \ 361 struct rwlock *_new_lock; \ 362 \ 363 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 364 if (_new_lock != *_lockp) { \ 365 if (*_lockp != NULL) \ 366 rw_wunlock(*_lockp); \ 367 *_lockp = _new_lock; \ 368 rw_wlock(*_lockp); \ 369 } \ 370 } while (0) 371 372 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 373 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 374 375 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 376 struct rwlock **_lockp = (lockp); \ 377 \ 378 if (*_lockp != NULL) { \ 379 rw_wunlock(*_lockp); \ 380 *_lockp = NULL; \ 381 } \ 382 } while (0) 383 384 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 385 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 386 387 struct pmap kernel_pmap_store; 388 389 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 390 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 391 392 int nkpt; 393 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 394 "Number of kernel page table pages allocated on bootup"); 395 396 static int ndmpdp; 397 vm_paddr_t dmaplimit; 398 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 399 pt_entry_t pg_nx; 400 401 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 402 "VM/pmap parameters"); 403 404 static int pg_ps_enabled = 1; 405 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 406 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 407 408 int __read_frequently la57 = 0; 409 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, 
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 410 &la57, 0, 411 "5-level paging for host is enabled"); 412 413 static bool 414 pmap_is_la57(pmap_t pmap) 415 { 416 if (pmap->pm_type == PT_X86) 417 return (la57); 418 return (false); /* XXXKIB handle EPT */ 419 } 420 421 #define PAT_INDEX_SIZE 8 422 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 423 424 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 425 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 426 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 427 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 428 u_int64_t KPML5phys; /* phys addr of kernel level 5, 429 if supported */ 430 431 #ifdef KASAN 432 static uint64_t KASANPDPphys; 433 #endif 434 #ifdef KMSAN 435 static uint64_t KMSANSHADPDPphys; 436 static uint64_t KMSANORIGPDPphys; 437 438 /* 439 * To support systems with large amounts of memory, it is necessary to extend 440 * the maximum size of the direct map. This could eat into the space reserved 441 * for the shadow map. 442 */ 443 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); 444 #endif 445 446 static pml4_entry_t *kernel_pml4; 447 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 448 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 449 static int ndmpdpphys; /* number of DMPDPphys pages */ 450 451 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ 452 vm_paddr_t KERNend; /* and the end */ 453 454 /* 455 * pmap_mapdev support pre initialization (i.e. console) 456 */ 457 #define PMAP_PREINIT_MAPPING_COUNT 8 458 static struct pmap_preinit_mapping { 459 vm_paddr_t pa; 460 vm_offset_t va; 461 vm_size_t sz; 462 int mode; 463 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 464 static int pmap_initialized; 465 466 /* 467 * Data for the pv entry allocation mechanism. 468 * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
469 */ 470 #ifdef NUMA 471 static __inline int 472 pc_to_domain(struct pv_chunk *pc) 473 { 474 475 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 476 } 477 #else 478 static __inline int 479 pc_to_domain(struct pv_chunk *pc __unused) 480 { 481 482 return (0); 483 } 484 #endif 485 486 struct pv_chunks_list { 487 struct mtx pvc_lock; 488 TAILQ_HEAD(pch, pv_chunk) pvc_list; 489 int active_reclaims; 490 } __aligned(CACHE_LINE_SIZE); 491 492 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 493 494 #ifdef NUMA 495 struct pmap_large_md_page { 496 struct rwlock pv_lock; 497 struct md_page pv_page; 498 u_long pv_invl_gen; 499 }; 500 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 501 #define pv_dummy pv_dummy_large.pv_page 502 __read_mostly static struct pmap_large_md_page *pv_table; 503 __read_mostly vm_paddr_t pmap_last_pa; 504 #else 505 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 506 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 507 static struct md_page *pv_table; 508 static struct md_page pv_dummy; 509 #endif 510 511 /* 512 * All those kernel PT submaps that BSD is so fond of 513 */ 514 pt_entry_t *CMAP1 = NULL; 515 caddr_t CADDR1 = 0; 516 static vm_offset_t qframe = 0; 517 static struct mtx qframe_mtx; 518 519 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 520 521 static vmem_t *large_vmem; 522 static u_int lm_ents; 523 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 524 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 525 526 int pmap_pcid_enabled = 1; 527 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 528 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 529 int invpcid_works = 0; 530 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 531 "Is the invpcid instruction available ?"); 532 int pmap_pcid_invlpg_workaround = 0; 533 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, 534 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 535 &pmap_pcid_invlpg_workaround, 0, 536 "Enable small core PCID/INVLPG workaround"); 537 int pmap_pcid_invlpg_workaround_uena = 1; 538 539 int __read_frequently pti = 0; 540 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 541 &pti, 0, 542 "Page Table Isolation enabled"); 543 static vm_object_t pti_obj; 544 static pml4_entry_t *pti_pml4; 545 static vm_pindex_t pti_pg_idx; 546 static bool pti_finalized; 547 548 struct pmap_pkru_range { 549 struct rs_el pkru_rs_el; 550 u_int pkru_keyidx; 551 int pkru_flags; 552 }; 553 554 static uma_zone_t pmap_pkru_ranges_zone; 555 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 556 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 557 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 558 static void *pkru_dup_range(void *ctx, void *data); 559 static void pkru_free_range(void *ctx, void *node); 560 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 561 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 562 static void pmap_pkru_deassign_all(pmap_t pmap); 563 564 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); 565 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, 566 &pcid_save_cnt, "Count of saved TLB context on switch"); 567 568 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 569 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 570 static struct mtx invl_gen_mtx; 571 /* Fake lock object to satisfy 
turnstiles interface. */ 572 static struct lock_object invl_gen_ts = { 573 .lo_name = "invlts", 574 }; 575 static struct pmap_invl_gen pmap_invl_gen_head = { 576 .gen = 1, 577 .next = NULL, 578 }; 579 static u_long pmap_invl_gen = 1; 580 static int pmap_invl_waiters; 581 static struct callout pmap_invl_callout; 582 static bool pmap_invl_callout_inited; 583 584 #define PMAP_ASSERT_NOT_IN_DI() \ 585 KASSERT(pmap_not_in_di(), ("DI already started")) 586 587 static bool 588 pmap_di_locked(void) 589 { 590 int tun; 591 592 if ((cpu_feature2 & CPUID2_CX16) == 0) 593 return (true); 594 tun = 0; 595 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 596 return (tun != 0); 597 } 598 599 static int 600 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 601 { 602 int locked; 603 604 locked = pmap_di_locked(); 605 return (sysctl_handle_int(oidp, &locked, 0, req)); 606 } 607 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 608 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 609 "Locked delayed invalidation"); 610 611 static bool pmap_not_in_di_l(void); 612 static bool pmap_not_in_di_u(void); 613 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 614 { 615 616 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 617 } 618 619 static bool 620 pmap_not_in_di_l(void) 621 { 622 struct pmap_invl_gen *invl_gen; 623 624 invl_gen = &curthread->td_md.md_invl_gen; 625 return (invl_gen->gen == 0); 626 } 627 628 static void 629 pmap_thread_init_invl_gen_l(struct thread *td) 630 { 631 struct pmap_invl_gen *invl_gen; 632 633 invl_gen = &td->td_md.md_invl_gen; 634 invl_gen->gen = 0; 635 } 636 637 static void 638 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 639 { 640 struct turnstile *ts; 641 642 ts = turnstile_trywait(&invl_gen_ts); 643 if (*m_gen > atomic_load_long(invl_gen)) 644 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 645 else 646 turnstile_cancel(ts); 647 } 648 649 static void 650 pmap_delayed_invl_finish_unblock(u_long new_gen) 651 { 652 struct turnstile *ts; 653 654 turnstile_chain_lock(&invl_gen_ts); 655 ts = turnstile_lookup(&invl_gen_ts); 656 if (new_gen != 0) 657 pmap_invl_gen = new_gen; 658 if (ts != NULL) { 659 turnstile_broadcast(ts, TS_SHARED_QUEUE); 660 turnstile_unpend(ts); 661 } 662 turnstile_chain_unlock(&invl_gen_ts); 663 } 664 665 /* 666 * Start a new Delayed Invalidation (DI) block of code, executed by 667 * the current thread. Within a DI block, the current thread may 668 * destroy both the page table and PV list entries for a mapping and 669 * then release the corresponding PV list lock before ensuring that 670 * the mapping is flushed from the TLBs of any processors with the 671 * pmap active. 672 */ 673 static void 674 pmap_delayed_invl_start_l(void) 675 { 676 struct pmap_invl_gen *invl_gen; 677 u_long currgen; 678 679 invl_gen = &curthread->td_md.md_invl_gen; 680 PMAP_ASSERT_NOT_IN_DI(); 681 mtx_lock(&invl_gen_mtx); 682 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 683 currgen = pmap_invl_gen; 684 else 685 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 686 invl_gen->gen = currgen + 1; 687 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 688 mtx_unlock(&invl_gen_mtx); 689 } 690 691 /* 692 * Finish the DI block, previously started by the current thread. All 693 * required TLB flushes for the pages marked by 694 * pmap_delayed_invl_page() must be finished before this function is 695 * called. 
696 * 697 * This function works by bumping the global DI generation number to 698 * the generation number of the current thread's DI, unless there is a 699 * pending DI that started earlier. In the latter case, bumping the 700 * global DI generation number would incorrectly signal that the 701 * earlier DI had finished. Instead, this function bumps the earlier 702 * DI's generation number to match the generation number of the 703 * current thread's DI. 704 */ 705 static void 706 pmap_delayed_invl_finish_l(void) 707 { 708 struct pmap_invl_gen *invl_gen, *next; 709 710 invl_gen = &curthread->td_md.md_invl_gen; 711 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 712 mtx_lock(&invl_gen_mtx); 713 next = LIST_NEXT(invl_gen, link); 714 if (next == NULL) 715 pmap_delayed_invl_finish_unblock(invl_gen->gen); 716 else 717 next->gen = invl_gen->gen; 718 LIST_REMOVE(invl_gen, link); 719 mtx_unlock(&invl_gen_mtx); 720 invl_gen->gen = 0; 721 } 722 723 static bool 724 pmap_not_in_di_u(void) 725 { 726 struct pmap_invl_gen *invl_gen; 727 728 invl_gen = &curthread->td_md.md_invl_gen; 729 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 730 } 731 732 static void 733 pmap_thread_init_invl_gen_u(struct thread *td) 734 { 735 struct pmap_invl_gen *invl_gen; 736 737 invl_gen = &td->td_md.md_invl_gen; 738 invl_gen->gen = 0; 739 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 740 } 741 742 static bool 743 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 744 { 745 uint64_t new_high, new_low, old_high, old_low; 746 char res; 747 748 old_low = new_low = 0; 749 old_high = new_high = (uintptr_t)0; 750 751 __asm volatile("lock;cmpxchg16b\t%1" 752 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 753 : "b"(new_low), "c" (new_high) 754 : "memory", "cc"); 755 if (res == 0) { 756 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 757 return (false); 758 out->gen = old_low; 759 out->next = (void *)old_high; 760 } else { 761 out->gen = new_low; 762 out->next = (void *)new_high; 763 } 764 return (true); 765 } 766 767 static bool 768 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 769 struct pmap_invl_gen *new_val) 770 { 771 uint64_t new_high, new_low, old_high, old_low; 772 char res; 773 774 new_low = new_val->gen; 775 new_high = (uintptr_t)new_val->next; 776 old_low = old_val->gen; 777 old_high = (uintptr_t)old_val->next; 778 779 __asm volatile("lock;cmpxchg16b\t%1" 780 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 781 : "b"(new_low), "c" (new_high) 782 : "memory", "cc"); 783 return (res); 784 } 785 786 static COUNTER_U64_DEFINE_EARLY(pv_page_count); 787 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, 788 &pv_page_count, "Current number of allocated pv pages"); 789 790 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); 791 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, 792 &user_pt_page_count, 793 "Current number of allocated page table pages for userspace"); 794 795 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); 796 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, 797 &kernel_pt_page_count, 798 "Current number of allocated page table pages for the kernel"); 799 800 #ifdef PV_STATS 801 802 static COUNTER_U64_DEFINE_EARLY(invl_start_restart); 803 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, 804 CTLFLAG_RD, &invl_start_restart, 805 "Number of delayed TLB invalidation request restarts"); 806 807 static 
COUNTER_U64_DEFINE_EARLY(invl_finish_restart); 808 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 809 &invl_finish_restart, 810 "Number of delayed TLB invalidation completion restarts"); 811 812 static int invl_max_qlen; 813 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 814 &invl_max_qlen, 0, 815 "Maximum delayed TLB invalidation request queue length"); 816 #endif 817 818 #define di_delay locks_delay 819 820 static void 821 pmap_delayed_invl_start_u(void) 822 { 823 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 824 struct thread *td; 825 struct lock_delay_arg lda; 826 uintptr_t prevl; 827 u_char pri; 828 #ifdef PV_STATS 829 int i, ii; 830 #endif 831 832 td = curthread; 833 invl_gen = &td->td_md.md_invl_gen; 834 PMAP_ASSERT_NOT_IN_DI(); 835 lock_delay_arg_init(&lda, &di_delay); 836 invl_gen->saved_pri = 0; 837 pri = td->td_base_pri; 838 if (pri > PVM) { 839 thread_lock(td); 840 pri = td->td_base_pri; 841 if (pri > PVM) { 842 invl_gen->saved_pri = pri; 843 sched_prio(td, PVM); 844 } 845 thread_unlock(td); 846 } 847 again: 848 PV_STAT(i = 0); 849 for (p = &pmap_invl_gen_head;; p = prev.next) { 850 PV_STAT(i++); 851 prevl = (uintptr_t)atomic_load_ptr(&p->next); 852 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 853 PV_STAT(counter_u64_add(invl_start_restart, 1)); 854 lock_delay(&lda); 855 goto again; 856 } 857 if (prevl == 0) 858 break; 859 prev.next = (void *)prevl; 860 } 861 #ifdef PV_STATS 862 if ((ii = invl_max_qlen) < i) 863 atomic_cmpset_int(&invl_max_qlen, ii, i); 864 #endif 865 866 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 867 PV_STAT(counter_u64_add(invl_start_restart, 1)); 868 lock_delay(&lda); 869 goto again; 870 } 871 872 new_prev.gen = prev.gen; 873 new_prev.next = invl_gen; 874 invl_gen->gen = prev.gen + 1; 875 876 /* Formal fence between store to invl->gen and updating *p. */ 877 atomic_thread_fence_rel(); 878 879 /* 880 * After inserting an invl_gen element with invalid bit set, 881 * this thread blocks any other thread trying to enter the 882 * delayed invalidation block. Do not allow to remove us from 883 * the CPU, because it causes starvation for other threads. 884 */ 885 critical_enter(); 886 887 /* 888 * ABA for *p is not possible there, since p->gen can only 889 * increase. So if the *p thread finished its di, then 890 * started a new one and got inserted into the list at the 891 * same place, its gen will appear greater than the previously 892 * read gen. 893 */ 894 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 895 critical_exit(); 896 PV_STAT(counter_u64_add(invl_start_restart, 1)); 897 lock_delay(&lda); 898 goto again; 899 } 900 901 /* 902 * There we clear PMAP_INVL_GEN_NEXT_INVALID in 903 * invl_gen->next, allowing other threads to iterate past us. 904 * pmap_di_store_invl() provides fence between the generation 905 * write and the update of next. 906 */ 907 invl_gen->next = NULL; 908 critical_exit(); 909 } 910 911 static bool 912 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 913 struct pmap_invl_gen *p) 914 { 915 struct pmap_invl_gen prev, new_prev; 916 u_long mygen; 917 918 /* 919 * Load invl_gen->gen after setting invl_gen->next 920 * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger 921 * generations to propagate to our invl_gen->gen. Lock prefix 922 * in atomic_set_ptr() worked as seq_cst fence. 
923 */ 924 mygen = atomic_load_long(&invl_gen->gen); 925 926 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 927 return (false); 928 929 KASSERT(prev.gen < mygen, 930 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 931 new_prev.gen = mygen; 932 new_prev.next = (void *)((uintptr_t)invl_gen->next & 933 ~PMAP_INVL_GEN_NEXT_INVALID); 934 935 /* Formal fence between load of prev and storing update to it. */ 936 atomic_thread_fence_rel(); 937 938 return (pmap_di_store_invl(p, &prev, &new_prev)); 939 } 940 941 static void 942 pmap_delayed_invl_finish_u(void) 943 { 944 struct pmap_invl_gen *invl_gen, *p; 945 struct thread *td; 946 struct lock_delay_arg lda; 947 uintptr_t prevl; 948 949 td = curthread; 950 invl_gen = &td->td_md.md_invl_gen; 951 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 952 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 953 ("missed invl_start: INVALID")); 954 lock_delay_arg_init(&lda, &di_delay); 955 956 again: 957 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 958 prevl = (uintptr_t)atomic_load_ptr(&p->next); 959 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 960 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 961 lock_delay(&lda); 962 goto again; 963 } 964 if ((void *)prevl == invl_gen) 965 break; 966 } 967 968 /* 969 * It is legitimate to not find ourself on the list if a 970 * thread before us finished its DI and started it again. 971 */ 972 if (__predict_false(p == NULL)) { 973 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 974 lock_delay(&lda); 975 goto again; 976 } 977 978 critical_enter(); 979 atomic_set_ptr((uintptr_t *)&invl_gen->next, 980 PMAP_INVL_GEN_NEXT_INVALID); 981 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 982 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 983 PMAP_INVL_GEN_NEXT_INVALID); 984 critical_exit(); 985 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 986 lock_delay(&lda); 987 goto again; 988 } 989 critical_exit(); 990 if (atomic_load_int(&pmap_invl_waiters) > 0) 991 pmap_delayed_invl_finish_unblock(0); 992 if (invl_gen->saved_pri != 0) { 993 thread_lock(td); 994 sched_prio(td, invl_gen->saved_pri); 995 thread_unlock(td); 996 } 997 } 998 999 #ifdef DDB 1000 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 1001 { 1002 struct pmap_invl_gen *p, *pn; 1003 struct thread *td; 1004 uintptr_t nextl; 1005 bool first; 1006 1007 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 1008 first = false) { 1009 nextl = (uintptr_t)atomic_load_ptr(&p->next); 1010 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 1011 td = first ? NULL : __containerof(p, struct thread, 1012 td_md.md_invl_gen); 1013 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 1014 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 1015 td != NULL ? 
td->td_tid : -1); 1016 } 1017 } 1018 #endif 1019 1020 #ifdef PV_STATS 1021 static COUNTER_U64_DEFINE_EARLY(invl_wait); 1022 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, 1023 CTLFLAG_RD, &invl_wait, 1024 "Number of times DI invalidation blocked pmap_remove_all/write"); 1025 1026 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); 1027 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, 1028 &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); 1029 1030 #endif 1031 1032 #ifdef NUMA 1033 static u_long * 1034 pmap_delayed_invl_genp(vm_page_t m) 1035 { 1036 vm_paddr_t pa; 1037 u_long *gen; 1038 1039 pa = VM_PAGE_TO_PHYS(m); 1040 if (__predict_false((pa) > pmap_last_pa)) 1041 gen = &pv_dummy_large.pv_invl_gen; 1042 else 1043 gen = &(pa_to_pmdp(pa)->pv_invl_gen); 1044 1045 return (gen); 1046 } 1047 #else 1048 static u_long * 1049 pmap_delayed_invl_genp(vm_page_t m) 1050 { 1051 1052 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 1053 } 1054 #endif 1055 1056 static void 1057 pmap_delayed_invl_callout_func(void *arg __unused) 1058 { 1059 1060 if (atomic_load_int(&pmap_invl_waiters) == 0) 1061 return; 1062 pmap_delayed_invl_finish_unblock(0); 1063 } 1064 1065 static void 1066 pmap_delayed_invl_callout_init(void *arg __unused) 1067 { 1068 1069 if (pmap_di_locked()) 1070 return; 1071 callout_init(&pmap_invl_callout, 1); 1072 pmap_invl_callout_inited = true; 1073 } 1074 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 1075 pmap_delayed_invl_callout_init, NULL); 1076 1077 /* 1078 * Ensure that all currently executing DI blocks, that need to flush 1079 * TLB for the given page m, actually flushed the TLB at the time the 1080 * function returned. If the page m has an empty PV list and we call 1081 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 1082 * valid mapping for the page m in either its page table or TLB. 1083 * 1084 * This function works by blocking until the global DI generation 1085 * number catches up with the generation number associated with the 1086 * given page m and its PV list. Since this function's callers 1087 * typically own an object lock and sometimes own a page lock, it 1088 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 1089 * processor. 1090 */ 1091 static void 1092 pmap_delayed_invl_wait_l(vm_page_t m) 1093 { 1094 u_long *m_gen; 1095 #ifdef PV_STATS 1096 bool accounted = false; 1097 #endif 1098 1099 m_gen = pmap_delayed_invl_genp(m); 1100 while (*m_gen > pmap_invl_gen) { 1101 #ifdef PV_STATS 1102 if (!accounted) { 1103 counter_u64_add(invl_wait, 1); 1104 accounted = true; 1105 } 1106 #endif 1107 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 1108 } 1109 } 1110 1111 static void 1112 pmap_delayed_invl_wait_u(vm_page_t m) 1113 { 1114 u_long *m_gen; 1115 struct lock_delay_arg lda; 1116 bool fast; 1117 1118 fast = true; 1119 m_gen = pmap_delayed_invl_genp(m); 1120 lock_delay_arg_init(&lda, &di_delay); 1121 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 1122 if (fast || !pmap_invl_callout_inited) { 1123 PV_STAT(counter_u64_add(invl_wait, 1)); 1124 lock_delay(&lda); 1125 fast = false; 1126 } else { 1127 /* 1128 * The page's invalidation generation number 1129 * is still below the current thread's number. 1130 * Prepare to block so that we do not waste 1131 * CPU cycles or worse, suffer livelock. 
1132 * 1133 * Since it is impossible to block without 1134 * racing with pmap_delayed_invl_finish_u(), 1135 * prepare for the race by incrementing 1136 * pmap_invl_waiters and arming a 1-tick 1137 * callout which will unblock us if we lose 1138 * the race. 1139 */ 1140 atomic_add_int(&pmap_invl_waiters, 1); 1141 1142 /* 1143 * Re-check the current thread's invalidation 1144 * generation after incrementing 1145 * pmap_invl_waiters, so that there is no race 1146 * with pmap_delayed_invl_finish_u() setting 1147 * the page generation and checking 1148 * pmap_invl_waiters. The only race allowed 1149 * is for a missed unblock, which is handled 1150 * by the callout. 1151 */ 1152 if (*m_gen > 1153 atomic_load_long(&pmap_invl_gen_head.gen)) { 1154 callout_reset(&pmap_invl_callout, 1, 1155 pmap_delayed_invl_callout_func, NULL); 1156 PV_STAT(counter_u64_add(invl_wait_slow, 1)); 1157 pmap_delayed_invl_wait_block(m_gen, 1158 &pmap_invl_gen_head.gen); 1159 } 1160 atomic_add_int(&pmap_invl_waiters, -1); 1161 } 1162 } 1163 } 1164 1165 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1166 { 1167 1168 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1169 pmap_thread_init_invl_gen_u); 1170 } 1171 1172 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1173 { 1174 1175 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1176 pmap_delayed_invl_start_u); 1177 } 1178 1179 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1180 { 1181 1182 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1183 pmap_delayed_invl_finish_u); 1184 } 1185 1186 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1187 { 1188 1189 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1190 pmap_delayed_invl_wait_u); 1191 } 1192 1193 /* 1194 * Mark the page m's PV list as participating in the current thread's 1195 * DI block. Any threads concurrently using m's PV list to remove or 1196 * restrict all mappings to m will wait for the current thread's DI 1197 * block to complete before proceeding. 1198 * 1199 * The function works by setting the DI generation number for m's PV 1200 * list to at least the DI generation number of the current thread. 1201 * This forces a caller of pmap_delayed_invl_wait() to block until 1202 * current thread calls pmap_delayed_invl_finish(). 1203 */ 1204 static void 1205 pmap_delayed_invl_page(vm_page_t m) 1206 { 1207 u_long gen, *m_gen; 1208 1209 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1210 gen = curthread->td_md.md_invl_gen.gen; 1211 if (gen == 0) 1212 return; 1213 m_gen = pmap_delayed_invl_genp(m); 1214 if (*m_gen < gen) 1215 *m_gen = gen; 1216 } 1217 1218 /* 1219 * Crashdump maps. 1220 */ 1221 static caddr_t crashdumpmap; 1222 1223 /* 1224 * Internal flags for pmap_enter()'s helper functions. 1225 */ 1226 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1227 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1228 1229 /* 1230 * Internal flags for pmap_mapdev_internal() and 1231 * pmap_change_props_locked(). 1232 */ 1233 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ 1234 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ 1235 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. 
*/ 1236 1237 TAILQ_HEAD(pv_chunklist, pv_chunk); 1238 1239 static void free_pv_chunk(struct pv_chunk *pc); 1240 static void free_pv_chunk_batch(struct pv_chunklist *batch); 1241 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1242 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1243 static int popcnt_pc_map_pq(uint64_t *map); 1244 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1245 static void reserve_pv_entries(pmap_t pmap, int needed, 1246 struct rwlock **lockp); 1247 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1248 struct rwlock **lockp); 1249 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1250 u_int flags, struct rwlock **lockp); 1251 #if VM_NRESERVLEVEL > 0 1252 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1253 struct rwlock **lockp); 1254 #endif 1255 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1256 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1257 vm_offset_t va); 1258 1259 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 1260 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 1261 vm_prot_t prot, int mode, int flags); 1262 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1263 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1264 vm_offset_t va, struct rwlock **lockp); 1265 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1266 vm_offset_t va); 1267 static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1268 vm_prot_t prot, struct rwlock **lockp); 1269 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1270 u_int flags, vm_page_t m, struct rwlock **lockp); 1271 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1272 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1273 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1274 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); 1275 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1276 vm_offset_t eva); 1277 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1278 vm_offset_t eva); 1279 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1280 pd_entry_t pde); 1281 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1282 static vm_page_t pmap_large_map_getptp_unlocked(void); 1283 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1284 #if VM_NRESERVLEVEL > 0 1285 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1286 vm_page_t mpte, struct rwlock **lockp); 1287 #endif 1288 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1289 vm_prot_t prot); 1290 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); 1291 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1292 bool exec); 1293 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1294 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1295 static void pmap_pti_wire_pte(void *pte); 1296 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1297 struct spglist *free, struct rwlock **lockp); 1298 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1299 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1300 static vm_page_t pmap_remove_pt_page(pmap_t pmap, 
vm_offset_t va); 1301 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1302 struct spglist *free); 1303 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1304 pd_entry_t *pde, struct spglist *free, 1305 struct rwlock **lockp); 1306 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1307 vm_page_t m, struct rwlock **lockp); 1308 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1309 pd_entry_t newpde); 1310 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1311 1312 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 1313 struct rwlock **lockp); 1314 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, 1315 struct rwlock **lockp, vm_offset_t va); 1316 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, 1317 struct rwlock **lockp, vm_offset_t va); 1318 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1319 struct rwlock **lockp); 1320 1321 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1322 struct spglist *free); 1323 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1324 1325 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); 1326 static void pmap_free_pt_page(pmap_t, vm_page_t, bool); 1327 1328 /********************/ 1329 /* Inline functions */ 1330 /********************/ 1331 1332 /* 1333 * Return a non-clipped indexes for a given VA, which are page table 1334 * pages indexes at the corresponding level. 1335 */ 1336 static __inline vm_pindex_t 1337 pmap_pde_pindex(vm_offset_t va) 1338 { 1339 return (va >> PDRSHIFT); 1340 } 1341 1342 static __inline vm_pindex_t 1343 pmap_pdpe_pindex(vm_offset_t va) 1344 { 1345 return (NUPDE + (va >> PDPSHIFT)); 1346 } 1347 1348 static __inline vm_pindex_t 1349 pmap_pml4e_pindex(vm_offset_t va) 1350 { 1351 return (NUPDE + NUPDPE + (va >> PML4SHIFT)); 1352 } 1353 1354 static __inline vm_pindex_t 1355 pmap_pml5e_pindex(vm_offset_t va) 1356 { 1357 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); 1358 } 1359 1360 static __inline pml4_entry_t * 1361 pmap_pml5e(pmap_t pmap, vm_offset_t va) 1362 { 1363 1364 MPASS(pmap_is_la57(pmap)); 1365 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); 1366 } 1367 1368 static __inline pml4_entry_t * 1369 pmap_pml5e_u(pmap_t pmap, vm_offset_t va) 1370 { 1371 1372 MPASS(pmap_is_la57(pmap)); 1373 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); 1374 } 1375 1376 static __inline pml4_entry_t * 1377 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) 1378 { 1379 pml4_entry_t *pml4e; 1380 1381 /* XXX MPASS(pmap_is_la57(pmap); */ 1382 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1383 return (&pml4e[pmap_pml4e_index(va)]); 1384 } 1385 1386 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1387 static __inline pml4_entry_t * 1388 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1389 { 1390 pml5_entry_t *pml5e; 1391 pml4_entry_t *pml4e; 1392 pt_entry_t PG_V; 1393 1394 if (pmap_is_la57(pmap)) { 1395 pml5e = pmap_pml5e(pmap, va); 1396 PG_V = pmap_valid_bit(pmap); 1397 if ((*pml5e & PG_V) == 0) 1398 return (NULL); 1399 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1400 } else { 1401 pml4e = pmap->pm_pmltop; 1402 } 1403 return (&pml4e[pmap_pml4e_index(va)]); 1404 } 1405 1406 static __inline pml4_entry_t * 1407 pmap_pml4e_u(pmap_t pmap, vm_offset_t va) 1408 { 1409 MPASS(!pmap_is_la57(pmap)); 1410 return 
(&pmap->pm_pmltopu[pmap_pml4e_index(va)]); 1411 } 1412 1413 /* Return a pointer to the PDP slot that corresponds to a VA */ 1414 static __inline pdp_entry_t * 1415 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1416 { 1417 pdp_entry_t *pdpe; 1418 1419 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1420 return (&pdpe[pmap_pdpe_index(va)]); 1421 } 1422 1423 /* Return a pointer to the PDP slot that corresponds to a VA */ 1424 static __inline pdp_entry_t * 1425 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1426 { 1427 pml4_entry_t *pml4e; 1428 pt_entry_t PG_V; 1429 1430 PG_V = pmap_valid_bit(pmap); 1431 pml4e = pmap_pml4e(pmap, va); 1432 if (pml4e == NULL || (*pml4e & PG_V) == 0) 1433 return (NULL); 1434 return (pmap_pml4e_to_pdpe(pml4e, va)); 1435 } 1436 1437 /* Return a pointer to the PD slot that corresponds to a VA */ 1438 static __inline pd_entry_t * 1439 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1440 { 1441 pd_entry_t *pde; 1442 1443 KASSERT((*pdpe & PG_PS) == 0, 1444 ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); 1445 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1446 return (&pde[pmap_pde_index(va)]); 1447 } 1448 1449 /* Return a pointer to the PD slot that corresponds to a VA */ 1450 static __inline pd_entry_t * 1451 pmap_pde(pmap_t pmap, vm_offset_t va) 1452 { 1453 pdp_entry_t *pdpe; 1454 pt_entry_t PG_V; 1455 1456 PG_V = pmap_valid_bit(pmap); 1457 pdpe = pmap_pdpe(pmap, va); 1458 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1459 return (NULL); 1460 KASSERT((*pdpe & PG_PS) == 0, 1461 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); 1462 return (pmap_pdpe_to_pde(pdpe, va)); 1463 } 1464 1465 /* Return a pointer to the PT slot that corresponds to a VA */ 1466 static __inline pt_entry_t * 1467 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1468 { 1469 pt_entry_t *pte; 1470 1471 KASSERT((*pde & PG_PS) == 0, 1472 ("%s: pde %#lx is a leaf", __func__, *pde)); 1473 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1474 return (&pte[pmap_pte_index(va)]); 1475 } 1476 1477 /* Return a pointer to the PT slot that corresponds to a VA */ 1478 static __inline pt_entry_t * 1479 pmap_pte(pmap_t pmap, vm_offset_t va) 1480 { 1481 pd_entry_t *pde; 1482 pt_entry_t PG_V; 1483 1484 PG_V = pmap_valid_bit(pmap); 1485 pde = pmap_pde(pmap, va); 1486 if (pde == NULL || (*pde & PG_V) == 0) 1487 return (NULL); 1488 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1489 return ((pt_entry_t *)pde); 1490 return (pmap_pde_to_pte(pde, va)); 1491 } 1492 1493 static __inline void 1494 pmap_resident_count_adj(pmap_t pmap, int count) 1495 { 1496 1497 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1498 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1499 ("pmap %p resident count underflow %ld %d", pmap, 1500 pmap->pm_stats.resident_count, count)); 1501 pmap->pm_stats.resident_count += count; 1502 } 1503 1504 static __inline void 1505 pmap_pt_page_count_pinit(pmap_t pmap, int count) 1506 { 1507 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1508 ("pmap %p resident count underflow %ld %d", pmap, 1509 pmap->pm_stats.resident_count, count)); 1510 pmap->pm_stats.resident_count += count; 1511 } 1512 1513 static __inline void 1514 pmap_pt_page_count_adj(pmap_t pmap, int count) 1515 { 1516 if (pmap == kernel_pmap) 1517 counter_u64_add(kernel_pt_page_count, count); 1518 else { 1519 if (pmap != NULL) 1520 pmap_resident_count_adj(pmap, count); 1521 counter_u64_add(user_pt_page_count, count); 1522 } 1523 } 1524 1525 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 1526 
NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3; 1527 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap; 1528 1529 PMAP_INLINE pt_entry_t * 1530 vtopte(vm_offset_t va) 1531 { 1532 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 1533 1534 return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem))); 1535 } 1536 1537 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1538 NPML4EPGSHIFT)) - 1) << 3; 1539 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap; 1540 1541 static __inline pd_entry_t * 1542 vtopde(vm_offset_t va) 1543 { 1544 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1545 1546 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem))); 1547 } 1548 1549 static u_int64_t 1550 allocpages(vm_paddr_t *firstaddr, int n) 1551 { 1552 u_int64_t ret; 1553 1554 ret = *firstaddr; 1555 bzero((void *)ret, n * PAGE_SIZE); 1556 *firstaddr += n * PAGE_SIZE; 1557 return (ret); 1558 } 1559 1560 CTASSERT(powerof2(NDMPML4E)); 1561 1562 /* number of kernel PDP slots */ 1563 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1564 1565 static void 1566 nkpt_init(vm_paddr_t addr) 1567 { 1568 int pt_pages; 1569 1570 #ifdef NKPT 1571 pt_pages = NKPT; 1572 #else 1573 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1574 pt_pages += NKPDPE(pt_pages); 1575 1576 /* 1577 * Add some slop beyond the bare minimum required for bootstrapping 1578 * the kernel. 1579 * 1580 * This is quite important when allocating KVA for kernel modules. 1581 * The modules are required to be linked in the negative 2GB of 1582 * the address space. If we run out of KVA in this region then 1583 * pmap_growkernel() will need to allocate page table pages to map 1584 * the entire 512GB of KVA space which is an unnecessary tax on 1585 * physical memory. 1586 * 1587 * Secondly, device memory mapped as part of setting up the low- 1588 * level console(s) is taken from KVA, starting at virtual_avail. 1589 * This is because cninit() is called after pmap_bootstrap() but 1590 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1591 * not uncommon. 1592 */ 1593 pt_pages += 32; /* 64MB additional slop. */ 1594 #endif 1595 nkpt = pt_pages; 1596 } 1597 1598 /* 1599 * Returns the proper write/execute permission for a physical page that is 1600 * part of the initial boot allocations. 1601 * 1602 * If the page has kernel text, it is marked as read-only. If the page has 1603 * kernel read-only data, it is marked as read-only/not-executable. If the 1604 * page has only read-write data, it is marked as read-write/not-executable. 1605 * If the page is below/above the kernel range, it is marked as read-write. 1606 * 1607 * This function operates on 2M pages, since we map the kernel space that 1608 * way. 1609 */ 1610 static inline pt_entry_t 1611 bootaddr_rwx(vm_paddr_t pa) 1612 { 1613 /* 1614 * The kernel is loaded at a 2MB-aligned address, and memory below that 1615 * need not be executable. The .bss section is padded to a 2MB 1616 * boundary, so memory following the kernel need not be executable 1617 * either. Preloaded kernel modules have their mapping permissions 1618 * fixed up by the linker. 1619 */ 1620 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1621 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1622 return (X86_PG_RW | pg_nx); 1623 1624 /* 1625 * The linker should ensure that the read-only and read-write 1626 * portions don't share the same 2M page, so this shouldn't 1627 * impact read-only data. 
However, in any case, any page with 1628 * read-write data needs to be read-write. 1629 */ 1630 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1631 return (X86_PG_RW | pg_nx); 1632 1633 /* 1634 * Mark any 2M page containing kernel text as read-only. Mark 1635 * other pages with read-only data as read-only and not executable. 1636 * (It is likely a small portion of the read-only data section will 1637 * be marked as read-only, but executable. This should be acceptable 1638 * since the read-only protection will keep the data from changing.) 1639 * Note that fixups to the .text section will still work until we 1640 * set CR0.WP. 1641 */ 1642 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1643 return (0); 1644 return (pg_nx); 1645 } 1646 1647 static void 1648 create_pagetables(vm_paddr_t *firstaddr) 1649 { 1650 pd_entry_t *pd_p; 1651 pdp_entry_t *pdp_p; 1652 pml4_entry_t *p4_p; 1653 uint64_t DMPDkernphys; 1654 vm_paddr_t pax; 1655 #ifdef KASAN 1656 pt_entry_t *pt_p; 1657 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1658 vm_offset_t kasankernbase; 1659 int kasankpdpi, kasankpdi, nkasanpte; 1660 #endif 1661 int i, j, ndm1g, nkpdpe, nkdmpde; 1662 1663 /* Allocate page table pages for the direct map */ 1664 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1665 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1666 ndmpdp = 4; 1667 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1668 if (ndmpdpphys > NDMPML4E) { 1669 /* 1670 * Each NDMPML4E allows 512 GB, so limit to that, 1671 * and then readjust ndmpdp and ndmpdpphys. 1672 */ 1673 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1674 Maxmem = atop(NDMPML4E * NBPML4); 1675 ndmpdpphys = NDMPML4E; 1676 ndmpdp = NDMPML4E * NPDEPG; 1677 } 1678 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1679 ndm1g = 0; 1680 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1681 /* 1682 * Calculate the number of 1G pages that will fully fit in 1683 * Maxmem. 1684 */ 1685 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1686 1687 /* 1688 * Allocate 2M pages for the kernel. These will be used in 1689 * place of the one or more 1G pages from ndm1g that maps 1690 * kernel memory into DMAP. 1691 */ 1692 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1693 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1694 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1695 } 1696 if (ndm1g < ndmpdp) 1697 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1698 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1699 1700 /* Allocate pages. */ 1701 KPML4phys = allocpages(firstaddr, 1); 1702 KPDPphys = allocpages(firstaddr, NKPML4E); 1703 #ifdef KASAN 1704 KASANPDPphys = allocpages(firstaddr, NKASANPML4E); 1705 KASANPDphys = allocpages(firstaddr, 1); 1706 #endif 1707 #ifdef KMSAN 1708 /* 1709 * The KMSAN shadow maps are initially left unpopulated, since there is 1710 * no need to shadow memory above KERNBASE. 1711 */ 1712 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); 1713 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); 1714 #endif 1715 1716 /* 1717 * Allocate the initial number of kernel page table pages required to 1718 * bootstrap. We defer this until after all memory-size dependent 1719 * allocations are done (e.g. direct map), so that we don't have to 1720 * build in too much slop in our estimate. 1721 * 1722 * Note that when NKPML4E > 1, we have an empty page underneath 1723 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1724 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
1725 */ 1726 nkpt_init(*firstaddr); 1727 nkpdpe = NKPDPE(nkpt); 1728 1729 KPTphys = allocpages(firstaddr, nkpt); 1730 KPDphys = allocpages(firstaddr, nkpdpe); 1731 1732 #ifdef KASAN 1733 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); 1734 KASANPTphys = allocpages(firstaddr, nkasanpte); 1735 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); 1736 #endif 1737 1738 /* 1739 * Connect the zero-filled PT pages to their PD entries. This 1740 * implicitly maps the PT pages at their correct locations within 1741 * the PTmap. 1742 */ 1743 pd_p = (pd_entry_t *)KPDphys; 1744 for (i = 0; i < nkpt; i++) 1745 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1746 1747 /* 1748 * Map from start of the kernel in physical memory (staging 1749 * area) to the end of loader preallocated memory using 2MB 1750 * pages. This replaces some of the PD entries created above. 1751 * For compatibility, identity map 2M at the start. 1752 */ 1753 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | 1754 X86_PG_RW | pg_nx; 1755 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { 1756 /* Preset PG_M and PG_A because demotion expects it. */ 1757 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1758 X86_PG_A | bootaddr_rwx(pax); 1759 } 1760 1761 /* 1762 * Because we map the physical blocks in 2M pages, adjust firstaddr 1763 * to record the physical blocks we've actually mapped into kernel 1764 * virtual address space. 1765 */ 1766 if (*firstaddr < round_2mpage(KERNend)) 1767 *firstaddr = round_2mpage(KERNend); 1768 1769 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1770 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1771 for (i = 0; i < nkpdpe; i++) 1772 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1773 1774 #ifdef KASAN 1775 kasankernbase = kasan_md_addr_to_shad(KERNBASE); 1776 kasankpdpi = pmap_pdpe_index(kasankernbase); 1777 kasankpdi = pmap_pde_index(kasankernbase); 1778 1779 pdp_p = (pdp_entry_t *)KASANPDPphys; 1780 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); 1781 1782 pd_p = (pd_entry_t *)KASANPDphys; 1783 for (i = 0; i < nkasanpte; i++) 1784 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | 1785 X86_PG_V | pg_nx; 1786 1787 pt_p = (pt_entry_t *)KASANPTphys; 1788 for (i = 0; i < nkasanpte * NPTEPG; i++) 1789 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 1790 X86_PG_M | X86_PG_A | pg_nx; 1791 #endif 1792 1793 /* 1794 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1795 * the end of physical memory is not aligned to a 1GB page boundary, 1796 * then the residual physical memory is mapped with 2MB pages. Later, 1797 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1798 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1799 * that are partially used. 1800 */ 1801 pd_p = (pd_entry_t *)DMPDphys; 1802 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1803 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1804 /* Preset PG_M and PG_A because demotion expects it. */ 1805 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1806 X86_PG_M | X86_PG_A | pg_nx; 1807 } 1808 pdp_p = (pdp_entry_t *)DMPDPphys; 1809 for (i = 0; i < ndm1g; i++) { 1810 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1811 /* Preset PG_M and PG_A because demotion expects it. 
*/ 1812 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1813 X86_PG_M | X86_PG_A | pg_nx; 1814 } 1815 for (j = 0; i < ndmpdp; i++, j++) { 1816 pdp_p[i] = DMPDphys + ptoa(j); 1817 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1818 } 1819 1820 /* 1821 * Instead of using a 1G page for the memory containing the kernel, 1822 * use 2M pages with read-only and no-execute permissions. (If using 1G 1823 * pages, this will partially overwrite the PDPEs above.) 1824 */ 1825 if (ndm1g > 0) { 1826 pd_p = (pd_entry_t *)DMPDkernphys; 1827 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1828 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1829 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1830 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1831 } 1832 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1833 for (i = 0; i < nkdmpde; i++) { 1834 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1835 X86_PG_RW | X86_PG_V | pg_nx; 1836 } 1837 } 1838 1839 /* And recursively map PML4 to itself in order to get PTmap */ 1840 p4_p = (pml4_entry_t *)KPML4phys; 1841 p4_p[PML4PML4I] = KPML4phys; 1842 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1843 1844 #ifdef KASAN 1845 /* Connect the KASAN shadow map slots up to the PML4. */ 1846 for (i = 0; i < NKASANPML4E; i++) { 1847 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1848 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1849 } 1850 #endif 1851 1852 #ifdef KMSAN 1853 /* Connect the KMSAN shadow map slots up to the PML4. */ 1854 for (i = 0; i < NKMSANSHADPML4E; i++) { 1855 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1856 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1857 } 1858 1859 /* Connect the KMSAN origin map slots up to the PML4. */ 1860 for (i = 0; i < NKMSANORIGPML4E; i++) { 1861 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1862 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1863 } 1864 #endif 1865 1866 /* Connect the Direct Map slots up to the PML4. */ 1867 for (i = 0; i < ndmpdpphys; i++) { 1868 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1869 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1870 } 1871 1872 /* Connect the KVA slots up to the PML4 */ 1873 for (i = 0; i < NKPML4E; i++) { 1874 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1875 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1876 } 1877 1878 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1879 } 1880 1881 /* 1882 * Bootstrap the system enough to run with virtual memory. 1883 * 1884 * On amd64 this is called after mapping has already been enabled 1885 * and just syncs the pmap module with what has already been done. 1886 * [We can't call it easily with mapping off since the kernel is not 1887 * mapped with PA == VA, hence we would have to relocate every address 1888 * from the linked base (virtual) address "KERNBASE" to the actual 1889 * (physical) address starting relative to 0] 1890 */ 1891 void 1892 pmap_bootstrap(vm_paddr_t *firstaddr) 1893 { 1894 vm_offset_t va; 1895 pt_entry_t *pte, *pcpu_pte; 1896 struct region_descriptor r_gdt; 1897 uint64_t cr4, pcpu_phys; 1898 u_long res; 1899 int i; 1900 1901 KERNend = *firstaddr; 1902 res = atop(KERNend - (vm_paddr_t)kernphys); 1903 1904 if (!pti) 1905 pg_g = X86_PG_G; 1906 1907 /* 1908 * Create an initial set of page tables to run the kernel in. 
1909 */ 1910 create_pagetables(firstaddr); 1911 1912 pcpu_phys = allocpages(firstaddr, MAXCPU); 1913 1914 /* 1915 * Add a physical memory segment (vm_phys_seg) corresponding to the 1916 * preallocated kernel page table pages so that vm_page structures 1917 * representing these pages will be created. The vm_page structures 1918 * are required for promotion of the corresponding kernel virtual 1919 * addresses to superpage mappings. 1920 */ 1921 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1922 1923 /* 1924 * Account for the virtual addresses mapped by create_pagetables(). 1925 */ 1926 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1927 (vm_paddr_t)kernphys); 1928 virtual_end = VM_MAX_KERNEL_ADDRESS; 1929 1930 /* 1931 * Enable PG_G global pages, then switch to the kernel page 1932 * table from the bootstrap page table. After the switch, it 1933 * is possible to enable SMEP and SMAP since PG_U bits are 1934 * correct now. 1935 */ 1936 cr4 = rcr4(); 1937 cr4 |= CR4_PGE; 1938 load_cr4(cr4); 1939 load_cr3(KPML4phys); 1940 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1941 cr4 |= CR4_SMEP; 1942 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1943 cr4 |= CR4_SMAP; 1944 load_cr4(cr4); 1945 1946 /* 1947 * Initialize the kernel pmap (which is statically allocated). 1948 * Count bootstrap data as being resident in case any of this data is 1949 * later unmapped (using pmap_remove()) and freed. 1950 */ 1951 PMAP_LOCK_INIT(kernel_pmap); 1952 kernel_pmap->pm_pmltop = kernel_pml4; 1953 kernel_pmap->pm_cr3 = KPML4phys; 1954 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1955 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1956 kernel_pmap->pm_stats.resident_count = res; 1957 kernel_pmap->pm_flags = pmap_flags; 1958 1959 /* 1960 * The kernel pmap is always active on all CPUs. Once CPUs are 1961 * enumerated, the mask will be set equal to all_cpus. 1962 */ 1963 CPU_FILL(&kernel_pmap->pm_active); 1964 1965 /* 1966 * Initialize the TLB invalidations generation number lock. 1967 */ 1968 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1969 1970 /* 1971 * Reserve some special page table entries/VA space for temporary 1972 * mapping of pages. 1973 */ 1974 #define SYSMAP(c, p, v, n) \ 1975 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1976 1977 va = virtual_avail; 1978 pte = vtopte(va); 1979 1980 /* 1981 * Crashdump maps. The first page is reused as CMAP1 for the 1982 * memory test. 1983 */ 1984 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 1985 CADDR1 = crashdumpmap; 1986 1987 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 1988 virtual_avail = va; 1989 1990 for (i = 0; i < MAXCPU; i++) { 1991 pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | 1992 pg_g | pg_nx | X86_PG_M | X86_PG_A; 1993 } 1994 1995 /* 1996 * Re-initialize PCPU area for BSP after switching. 1997 * Make hardware use gdt and common_tss from the new PCPU. 
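 * In outline, the code below points MSR_GSBASE at the new __pcpu[0], initializes it, copies the temporary boot GDT into the per-CPU GDT, rewrites the GPROC0 TSS descriptor to reference the new common TSS, reloads the GDT and GSBASE, and finally reloads the task register with ltr().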
*/ 1998 */ 1999 STAILQ_INIT(&cpuhead); 2000 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2001 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2002 amd64_bsp_pcpu_init1(&__pcpu[0]); 2003 amd64_bsp_ist_init(&__pcpu[0]); 2004 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2005 IOPERM_BITMAP_SIZE; 2006 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2007 sizeof(struct user_segment_descriptor)); 2008 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2009 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2010 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2011 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2012 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2013 lgdt(&r_gdt); 2014 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2015 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2016 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2017 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2018 2019 /* 2020 * Initialize the PAT MSR. 2021 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2022 * side-effect, invalidates stale PG_G TLB entries that might 2023 * have been created in our pre-boot environment. 2024 */ 2025 pmap_init_pat(); 2026 2027 /* Initialize TLB Context Id. */ 2028 if (pmap_pcid_enabled) { 2029 for (i = 0; i < MAXCPU; i++) { 2030 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 2031 kernel_pmap->pm_pcids[i].pm_gen = 1; 2032 } 2033 2034 /* 2035 * PMAP_PCID_KERN + 1 is used for initialization of 2036 * proc0 pmap. The pmap's PCID state might be used by 2037 * EFIRT entry before first context switch, so it 2038 * needs to be valid. 2039 */ 2040 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2041 PCPU_SET(pcid_gen, 1); 2042 2043 /* 2044 * The pcpu area for APs is zeroed during AP startup. 2045 * pc_pcid_next and pc_pcid_gen are initialized by the AP 2046 * during pcpu setup. 2047 */ 2048 load_cr4(rcr4() | CR4_PCIDE); 2049 } 2050 } 2051 2052 /* 2053 * Set up the PAT MSR. 2054 */ 2055 void 2056 pmap_init_pat(void) 2057 { 2058 uint64_t pat_msr; 2059 u_long cr0, cr4; 2060 int i; 2061 2062 /* Bail if this CPU doesn't implement PAT. */ 2063 if ((cpu_feature & CPUID_PAT) == 0) 2064 panic("no PAT??"); 2065 2066 /* Set default PAT index table. */ 2067 for (i = 0; i < PAT_INDEX_SIZE; i++) 2068 pat_index[i] = -1; 2069 pat_index[PAT_WRITE_BACK] = 0; 2070 pat_index[PAT_WRITE_THROUGH] = 1; 2071 pat_index[PAT_UNCACHEABLE] = 3; 2072 pat_index[PAT_WRITE_COMBINING] = 6; 2073 pat_index[PAT_WRITE_PROTECTED] = 5; 2074 pat_index[PAT_UNCACHED] = 2; 2075 2076 /* 2077 * Initialize default PAT entries. 2078 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2079 * Program 5 and 6 as WP and WC. 2080 * 2081 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2082 * mapping for a 2M page uses a PAT value with the bit 3 set due 2083 * to its overload with PG_PS. 2084 */ 2085 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2086 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2087 PAT_VALUE(2, PAT_UNCACHED) | 2088 PAT_VALUE(3, PAT_UNCACHEABLE) | 2089 PAT_VALUE(4, PAT_WRITE_BACK) | 2090 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2091 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2092 PAT_VALUE(7, PAT_UNCACHEABLE); 2093 2094 /* Disable PGE. */ 2095 cr4 = rcr4(); 2096 load_cr4(cr4 & ~CR4_PGE); 2097 2098 /* Disable caches (CD = 1, NW = 0). */ 2099 cr0 = rcr0(); 2100 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2101 2102 /* Flush caches and TLBs. */ 2103 wbinvd(); 2104 invltlb(); 2105 2106 /* Update PAT and index table. */ 2107 wrmsr(MSR_PAT, pat_msr); 2108 2109 /* Flush caches and TLBs again.
*/ 2110 wbinvd(); 2111 invltlb(); 2112 2113 /* Restore caches and PGE. */ 2114 load_cr0(cr0); 2115 load_cr4(cr4); 2116 } 2117 2118 vm_page_t 2119 pmap_page_alloc_below_4g(bool zeroed) 2120 { 2121 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2122 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2123 } 2124 2125 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2126 la57_trampoline_gdt[], la57_trampoline_end[]; 2127 2128 static void 2129 pmap_bootstrap_la57(void *arg __unused) 2130 { 2131 char *v_code; 2132 pml5_entry_t *v_pml5; 2133 pml4_entry_t *v_pml4; 2134 pdp_entry_t *v_pdp; 2135 pd_entry_t *v_pd; 2136 pt_entry_t *v_pt; 2137 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2138 void (*la57_tramp)(uint64_t pml5); 2139 struct region_descriptor r_gdt; 2140 2141 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2142 return; 2143 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2144 if (!la57) 2145 return; 2146 2147 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2148 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2149 2150 m_code = pmap_page_alloc_below_4g(true); 2151 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2152 m_pml5 = pmap_page_alloc_below_4g(true); 2153 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2154 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2155 m_pml4 = pmap_page_alloc_below_4g(true); 2156 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2157 m_pdp = pmap_page_alloc_below_4g(true); 2158 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2159 m_pd = pmap_page_alloc_below_4g(true); 2160 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2161 m_pt = pmap_page_alloc_below_4g(true); 2162 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2163 2164 /* 2165 * Map m_code 1:1, it appears below 4G in KVA due to physical 2166 * address being below 4G. Since kernel KVA is in upper half, 2167 * the pml4e should be zero and free for temporary use. 2168 */ 2169 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2170 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2171 X86_PG_M; 2172 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2173 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2174 X86_PG_M; 2175 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2176 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2177 X86_PG_M; 2178 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2179 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2180 X86_PG_M; 2181 2182 /* 2183 * Add pml5 entry at top of KVA pointing to existing pml4 table, 2184 * entering all existing kernel mappings into level 5 table. 2185 */ 2186 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2187 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; 2188 2189 /* 2190 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. 2191 */ 2192 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2193 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2194 X86_PG_M; 2195 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2196 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2197 X86_PG_M; 2198 2199 /* 2200 * Copy and call the 48->57 trampoline, hope we return there, alive. 
*/ 2201 */ 2202 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2203 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2204 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2205 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2206 invlpg((vm_offset_t)la57_tramp); 2207 la57_tramp(KPML5phys); 2208 2209 /* 2210 * The GDT was necessarily reset by the trampoline; switch back to our own GDT. 2211 */ 2212 lgdt(&r_gdt); 2213 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2214 load_ds(_udatasel); 2215 load_es(_udatasel); 2216 load_fs(_ufssel); 2217 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2218 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2219 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2220 2221 /* 2222 * Now unmap the trampoline, and free the pages. 2223 * Clear the pml5 entry used for the 1:1 trampoline mapping. 2224 */ 2225 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2226 invlpg((vm_offset_t)v_code); 2227 vm_page_free(m_code); 2228 vm_page_free(m_pdp); 2229 vm_page_free(m_pd); 2230 vm_page_free(m_pt); 2231 2232 /* 2233 * Recursively map PML5 to itself in order to get PTmap and 2234 * PDmap. 2235 */ 2236 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2237 2238 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 2239 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2240 PTmap = (vm_offset_t)P5Tmap; 2241 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 2242 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2243 PDmap = (vm_offset_t)P5Dmap; 2244 2245 kernel_pmap->pm_cr3 = KPML5phys; 2246 kernel_pmap->pm_pmltop = v_pml5; 2247 pmap_pt_page_count_adj(kernel_pmap, 1); 2248 } 2249 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2250 2251 /* 2252 * Initialize a vm_page's machine-dependent fields. 2253 */ 2254 void 2255 pmap_page_init(vm_page_t m) 2256 { 2257 2258 TAILQ_INIT(&m->md.pv_list); 2259 m->md.pat_mode = PAT_WRITE_BACK; 2260 } 2261 2262 static int pmap_allow_2m_x_ept; 2263 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2264 &pmap_allow_2m_x_ept, 0, 2265 "Allow executable superpage mappings in EPT"); 2266 2267 void 2268 pmap_allow_2m_x_ept_recalculate(void) 2269 { 2270 /* 2271 * SKL002, SKL012S. Since the EPT format is only used by 2272 * Intel CPUs, the vendor check is merely a formality.
2273 */ 2274 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2275 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2276 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2277 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2278 CPUID_TO_MODEL(cpu_id) == 0x27 || 2279 CPUID_TO_MODEL(cpu_id) == 0x35 || 2280 CPUID_TO_MODEL(cpu_id) == 0x36 || 2281 CPUID_TO_MODEL(cpu_id) == 0x37 || 2282 CPUID_TO_MODEL(cpu_id) == 0x86 || 2283 CPUID_TO_MODEL(cpu_id) == 0x1c || 2284 CPUID_TO_MODEL(cpu_id) == 0x4a || 2285 CPUID_TO_MODEL(cpu_id) == 0x4c || 2286 CPUID_TO_MODEL(cpu_id) == 0x4d || 2287 CPUID_TO_MODEL(cpu_id) == 0x5a || 2288 CPUID_TO_MODEL(cpu_id) == 0x5c || 2289 CPUID_TO_MODEL(cpu_id) == 0x5d || 2290 CPUID_TO_MODEL(cpu_id) == 0x5f || 2291 CPUID_TO_MODEL(cpu_id) == 0x6e || 2292 CPUID_TO_MODEL(cpu_id) == 0x7a || 2293 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2294 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2295 pmap_allow_2m_x_ept = 1; 2296 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2297 } 2298 2299 static bool 2300 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2301 { 2302 2303 return (pmap->pm_type != PT_EPT || !executable || 2304 !pmap_allow_2m_x_ept); 2305 } 2306 2307 #ifdef NUMA 2308 static void 2309 pmap_init_pv_table(void) 2310 { 2311 struct pmap_large_md_page *pvd; 2312 vm_size_t s; 2313 long start, end, highest, pv_npg; 2314 int domain, i, j, pages; 2315 2316 /* 2317 * We strongly depend on the size being a power of two, so the assert 2318 * is overzealous. However, should the struct be resized to a 2319 * different power of two, the code below needs to be revisited. 2320 */ 2321 CTASSERT((sizeof(*pvd) == 64)); 2322 2323 /* 2324 * Calculate the size of the array. 2325 */ 2326 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2327 pv_npg = howmany(pmap_last_pa, NBPDR); 2328 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2329 s = round_page(s); 2330 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2331 if (pv_table == NULL) 2332 panic("%s: kva_alloc failed\n", __func__); 2333 2334 /* 2335 * Iterate physical segments to allocate space for respective pages. 2336 */ 2337 highest = -1; 2338 s = 0; 2339 for (i = 0; i < vm_phys_nsegs; i++) { 2340 end = vm_phys_segs[i].end / NBPDR; 2341 domain = vm_phys_segs[i].domain; 2342 2343 if (highest >= end) 2344 continue; 2345 2346 start = highest + 1; 2347 pvd = &pv_table[start]; 2348 2349 pages = end - start + 1; 2350 s = round_page(pages * sizeof(*pvd)); 2351 highest = start + (s / sizeof(*pvd)) - 1; 2352 2353 for (j = 0; j < s; j += PAGE_SIZE) { 2354 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2355 if (m == NULL) 2356 panic("failed to allocate PV table page"); 2357 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2358 } 2359 2360 for (j = 0; j < s / sizeof(*pvd); j++) { 2361 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2362 TAILQ_INIT(&pvd->pv_page.pv_list); 2363 pvd->pv_page.pv_gen = 0; 2364 pvd->pv_page.pat_mode = 0; 2365 pvd->pv_invl_gen = 0; 2366 pvd++; 2367 } 2368 } 2369 pvd = &pv_dummy_large; 2370 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2371 TAILQ_INIT(&pvd->pv_page.pv_list); 2372 pvd->pv_page.pv_gen = 0; 2373 pvd->pv_page.pat_mode = 0; 2374 pvd->pv_invl_gen = 0; 2375 } 2376 #else 2377 static void 2378 pmap_init_pv_table(void) 2379 { 2380 vm_size_t s; 2381 long i, pv_npg; 2382 2383 /* 2384 * Initialize the pool of pv list locks. 
2385 */ 2386 for (i = 0; i < NPV_LIST_LOCKS; i++) 2387 rw_init(&pv_list_locks[i], "pmap pv list"); 2388 2389 /* 2390 * Calculate the size of the pv head table for superpages. 2391 */ 2392 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2393 2394 /* 2395 * Allocate memory for the pv head table for superpages. 2396 */ 2397 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2398 s = round_page(s); 2399 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 2400 for (i = 0; i < pv_npg; i++) 2401 TAILQ_INIT(&pv_table[i].pv_list); 2402 TAILQ_INIT(&pv_dummy.pv_list); 2403 } 2404 #endif 2405 2406 /* 2407 * Initialize the pmap module. 2408 * Called by vm_init, to initialize any structures that the pmap 2409 * system needs to map virtual memory. 2410 */ 2411 void 2412 pmap_init(void) 2413 { 2414 struct pmap_preinit_mapping *ppim; 2415 vm_page_t m, mpte; 2416 int error, i, ret, skz63; 2417 2418 /* L1TF, reserve page @0 unconditionally */ 2419 vm_page_blacklist_add(0, bootverbose); 2420 2421 /* Detect bare-metal Skylake Server and Skylake-X. */ 2422 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2423 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2424 /* 2425 * Skylake-X errata SKZ63. Processor May Hang When 2426 * Executing Code In an HLE Transaction Region between 2427 * 40000000H and 403FFFFFH. 2428 * 2429 * Mark the pages in the range as preallocated. It 2430 * seems to be impossible to distinguish between 2431 * Skylake Server and Skylake X. 2432 */ 2433 skz63 = 1; 2434 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2435 if (skz63 != 0) { 2436 if (bootverbose) 2437 printf("SKZ63: skipping 4M RAM starting " 2438 "at physical 1G\n"); 2439 for (i = 0; i < atop(0x400000); i++) { 2440 ret = vm_page_blacklist_add(0x40000000 + 2441 ptoa(i), FALSE); 2442 if (!ret && bootverbose) 2443 printf("page at %#lx already used\n", 2444 0x40000000 + ptoa(i)); 2445 } 2446 } 2447 } 2448 2449 /* IFU */ 2450 pmap_allow_2m_x_ept_recalculate(); 2451 2452 /* 2453 * Initialize the vm page array entries for the kernel pmap's 2454 * page table pages. 2455 */ 2456 PMAP_LOCK(kernel_pmap); 2457 for (i = 0; i < nkpt; i++) { 2458 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2459 KASSERT(mpte >= vm_page_array && 2460 mpte < &vm_page_array[vm_page_array_size], 2461 ("pmap_init: page table page is out of range")); 2462 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2463 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2464 mpte->ref_count = 1; 2465 2466 /* 2467 * Collect the page table pages that were replaced by a 2MB 2468 * page in create_pagetables(). They are zero filled. 2469 */ 2470 if ((i == 0 || 2471 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2472 pmap_insert_pt_page(kernel_pmap, mpte, false)) 2473 panic("pmap_init: pmap_insert_pt_page failed"); 2474 } 2475 PMAP_UNLOCK(kernel_pmap); 2476 vm_wire_add(nkpt); 2477 2478 /* 2479 * If the kernel is running on a virtual machine, then it must assume 2480 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2481 * be prepared for the hypervisor changing the vendor and family that 2482 * are reported by CPUID. Consequently, the workaround for AMD Family 2483 * 10h Erratum 383 is enabled if the processor's feature set does not 2484 * include at least one feature that is only supported by older Intel 2485 * or newer AMD processors. 
2486 */ 2487 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2488 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2489 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2490 AMDID2_FMA4)) == 0) 2491 workaround_erratum383 = 1; 2492 2493 /* 2494 * Are large page mappings enabled? 2495 */ 2496 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2497 if (pg_ps_enabled) { 2498 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2499 ("pmap_init: can't assign to pagesizes[1]")); 2500 pagesizes[1] = NBPDR; 2501 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2502 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2503 ("pmap_init: can't assign to pagesizes[2]")); 2504 pagesizes[2] = NBPDP; 2505 } 2506 } 2507 2508 /* 2509 * Initialize pv chunk lists. 2510 */ 2511 for (i = 0; i < PMAP_MEMDOM; i++) { 2512 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2513 TAILQ_INIT(&pv_chunks[i].pvc_list); 2514 } 2515 pmap_init_pv_table(); 2516 2517 pmap_initialized = 1; 2518 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2519 ppim = pmap_preinit_mapping + i; 2520 if (ppim->va == 0) 2521 continue; 2522 /* Make the direct map consistent */ 2523 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2524 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2525 ppim->sz, ppim->mode); 2526 } 2527 if (!bootverbose) 2528 continue; 2529 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2530 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2531 } 2532 2533 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2534 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2535 (vmem_addr_t *)&qframe); 2536 if (error != 0) 2537 panic("qframe allocation failed"); 2538 2539 lm_ents = 8; 2540 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2541 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2542 lm_ents = LMEPML4I - LMSPML4I + 1; 2543 #ifdef KMSAN 2544 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2545 printf( 2546 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2547 lm_ents, KMSANORIGPML4I - LMSPML4I); 2548 lm_ents = KMSANORIGPML4I - LMSPML4I; 2549 } 2550 #endif 2551 if (bootverbose) 2552 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2553 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2554 if (lm_ents != 0) { 2555 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2556 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2557 if (large_vmem == NULL) { 2558 printf("pmap: cannot create large map\n"); 2559 lm_ents = 0; 2560 } 2561 for (i = 0; i < lm_ents; i++) { 2562 m = pmap_large_map_getptp_unlocked(); 2563 /* XXXKIB la57 */ 2564 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2565 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2566 VM_PAGE_TO_PHYS(m); 2567 } 2568 } 2569 } 2570 2571 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2572 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2573 "Maximum number of PML4 entries for use by large map (tunable). 
" 2574 "Each entry corresponds to 512GB of address space."); 2575 2576 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2577 "2MB page mapping counters"); 2578 2579 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2580 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2581 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2582 2583 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2584 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2585 &pmap_pde_mappings, "2MB page mappings"); 2586 2587 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2588 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2589 &pmap_pde_p_failures, "2MB page promotion failures"); 2590 2591 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2592 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2593 &pmap_pde_promotions, "2MB page promotions"); 2594 2595 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2596 "1GB page mapping counters"); 2597 2598 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2599 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2600 &pmap_pdpe_demotions, "1GB page demotions"); 2601 2602 /*************************************************** 2603 * Low level helper routines..... 2604 ***************************************************/ 2605 2606 static pt_entry_t 2607 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2608 { 2609 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2610 2611 switch (pmap->pm_type) { 2612 case PT_X86: 2613 case PT_RVI: 2614 /* Verify that both PAT bits are not set at the same time */ 2615 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2616 ("Invalid PAT bits in entry %#lx", entry)); 2617 2618 /* Swap the PAT bits if one of them is set */ 2619 if ((entry & x86_pat_bits) != 0) 2620 entry ^= x86_pat_bits; 2621 break; 2622 case PT_EPT: 2623 /* 2624 * Nothing to do - the memory attributes are represented 2625 * the same way for regular pages and superpages. 2626 */ 2627 break; 2628 default: 2629 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2630 } 2631 2632 return (entry); 2633 } 2634 2635 boolean_t 2636 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2637 { 2638 2639 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2640 pat_index[(int)mode] >= 0); 2641 } 2642 2643 /* 2644 * Determine the appropriate bits to set in a PTE or PDE for a specified 2645 * caching mode. 2646 */ 2647 int 2648 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 2649 { 2650 int cache_bits, pat_flag, pat_idx; 2651 2652 if (!pmap_is_valid_memattr(pmap, mode)) 2653 panic("Unknown caching mode %d\n", mode); 2654 2655 switch (pmap->pm_type) { 2656 case PT_X86: 2657 case PT_RVI: 2658 /* The PAT bit is different for PTE's and PDE's. */ 2659 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2660 2661 /* Map the caching mode to a PAT index. */ 2662 pat_idx = pat_index[mode]; 2663 2664 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. 
*/ 2665 cache_bits = 0; 2666 if (pat_idx & 0x4) 2667 cache_bits |= pat_flag; 2668 if (pat_idx & 0x2) 2669 cache_bits |= PG_NC_PCD; 2670 if (pat_idx & 0x1) 2671 cache_bits |= PG_NC_PWT; 2672 break; 2673 2674 case PT_EPT: 2675 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2676 break; 2677 2678 default: 2679 panic("unsupported pmap type %d", pmap->pm_type); 2680 } 2681 2682 return (cache_bits); 2683 } 2684 2685 static int 2686 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 2687 { 2688 int mask; 2689 2690 switch (pmap->pm_type) { 2691 case PT_X86: 2692 case PT_RVI: 2693 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2694 break; 2695 case PT_EPT: 2696 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2697 break; 2698 default: 2699 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2700 } 2701 2702 return (mask); 2703 } 2704 2705 static int 2706 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2707 { 2708 int pat_flag, pat_idx; 2709 2710 pat_idx = 0; 2711 switch (pmap->pm_type) { 2712 case PT_X86: 2713 case PT_RVI: 2714 /* The PAT bit is different for PTE's and PDE's. */ 2715 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2716 2717 if ((pte & pat_flag) != 0) 2718 pat_idx |= 0x4; 2719 if ((pte & PG_NC_PCD) != 0) 2720 pat_idx |= 0x2; 2721 if ((pte & PG_NC_PWT) != 0) 2722 pat_idx |= 0x1; 2723 break; 2724 case PT_EPT: 2725 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2726 panic("EPT PTE %#lx has no PAT memory type", pte); 2727 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2728 break; 2729 } 2730 2731 /* See pmap_init_pat(). */ 2732 if (pat_idx == 4) 2733 pat_idx = 0; 2734 if (pat_idx == 7) 2735 pat_idx = 3; 2736 2737 return (pat_idx); 2738 } 2739 2740 bool 2741 pmap_ps_enabled(pmap_t pmap) 2742 { 2743 2744 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2745 } 2746 2747 static void 2748 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2749 { 2750 2751 switch (pmap->pm_type) { 2752 case PT_X86: 2753 break; 2754 case PT_RVI: 2755 case PT_EPT: 2756 /* 2757 * XXX 2758 * This is a little bogus since the generation number is 2759 * supposed to be bumped up when a region of the address 2760 * space is invalidated in the page tables. 2761 * 2762 * In this case the old PDE entry is valid but yet we want 2763 * to make sure that any mappings using the old entry are 2764 * invalidated in the TLB. 2765 * 2766 * The reason this works as expected is because we rendezvous 2767 * "all" host cpus and force any vcpu context to exit as a 2768 * side-effect. 2769 */ 2770 atomic_add_long(&pmap->pm_eptgen, 1); 2771 break; 2772 default: 2773 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2774 } 2775 pde_store(pde, newpde); 2776 } 2777 2778 /* 2779 * After changing the page size for the specified virtual address in the page 2780 * table, flush the corresponding entries from the processor's TLB. Only the 2781 * calling processor's TLB is affected. 2782 * 2783 * The calling thread must be pinned to a processor. 2784 */ 2785 static void 2786 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2787 { 2788 pt_entry_t PG_G; 2789 2790 if (pmap_type_guest(pmap)) 2791 return; 2792 2793 KASSERT(pmap->pm_type == PT_X86, 2794 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2795 2796 PG_G = pmap_global_bit(pmap); 2797 2798 if ((newpde & PG_PS) == 0) 2799 /* Demotion: flush a specific 2MB page mapping. 
*/ 2800 pmap_invlpg(pmap, va); 2801 else if ((newpde & PG_G) == 0) 2802 /* 2803 * Promotion: flush every 4KB page mapping from the TLB 2804 * because there are too many to flush individually. 2805 */ 2806 invltlb(); 2807 else { 2808 /* 2809 * Promotion: flush every 4KB page mapping from the TLB, 2810 * including any global (PG_G) mappings. 2811 */ 2812 invltlb_glob(); 2813 } 2814 } 2815 2816 /* 2817 * The amd64 pmap uses different approaches to TLB invalidation 2818 * depending on the kernel configuration, available hardware features, 2819 * and known hardware errata. The kernel configuration option that 2820 * has the greatest operational impact on TLB invalidation is PTI, 2821 * which is enabled automatically on affected Intel CPUs. The most 2822 * impactful hardware features are first PCID, and then INVPCID 2823 * instruction presence. PCID usage is quite different for PTI 2824 * vs. non-PTI. 2825 * 2826 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2827 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2828 * space is served by two page tables, user and kernel. The user 2829 * page table only maps user space and a kernel trampoline. The 2830 * kernel trampoline includes the entirety of the kernel text but 2831 * only the kernel data that is needed to switch from user to kernel 2832 * mode. The kernel page table maps the user and kernel address 2833 * spaces in their entirety. It is identical to the per-process 2834 * page table used in non-PTI mode. 2835 * 2836 * User page tables are only used when the CPU is in user mode. 2837 * Consequently, some TLB invalidations can be postponed until the 2838 * switch from kernel to user mode. In contrast, the user 2839 * space part of the kernel page table is used for copyout(9), so 2840 * TLB invalidations on this page table cannot be similarly postponed. 2841 * 2842 * The existence of a user mode page table for the given pmap is 2843 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2844 * which case pm_ucr3 contains the %cr3 register value for the user 2845 * mode page table's root. 2846 * 2847 * * The pm_active bitmask indicates which CPUs currently have the 2848 * pmap active. A CPU's bit is set on context switch to the pmap, and 2849 * cleared on switching off this CPU. For the kernel page table, 2850 * the pm_active field is immutable and contains all CPUs. The 2851 * kernel page table is always logically active on every processor, 2852 * but not necessarily in use by the hardware, e.g., in PTI mode. 2853 * 2854 * When requesting invalidation of virtual addresses with 2855 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2856 * all CPUs recorded as active in pm_active. Updates to and reads 2857 * from pm_active are not synchronized, and so they may race with 2858 * each other. Shootdown handlers are prepared to handle the race. 2859 * 2860 * * PCID is an optional feature of the long mode x86 MMU where TLB 2861 * entries are tagged with the 'Process ID' of the address space 2862 * they belong to. This feature provides a limited namespace for 2863 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2864 * total. 2865 * 2866 * Allocation of a PCID to a pmap is done by an algorithm described 2867 * in section 15.12, "Other TLB Consistency Algorithms", of 2868 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2869 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2870 * namespace. 
Instead, a per-CPU, per-pmap PCID is assigned when 2871 * the CPU is about to start caching TLB entries from a pmap, 2872 * i.e., on the context switch that activates the pmap on the CPU. 2873 * 2874 * The PCID allocator maintains a per-CPU, per-pmap generation 2875 * count, pm_gen, which is incremented each time a new PCID is 2876 * allocated. On TLB invalidation, the generation counters for the 2877 * pmap are zeroed, which signals the context switch code that the 2878 * previously allocated PCID is no longer valid. Effectively, 2879 * zeroing any of these counters triggers a TLB shootdown for the 2880 * given CPU/address space, due to the allocation of a new PCID. 2881 * 2882 * Zeroing can be performed remotely. Consequently, if a pmap is 2883 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2884 * be initiated by an ordinary memory access to reset the target 2885 * CPU's generation count within the pmap. The CPU initiating the 2886 * TLB shootdown does not need to send an IPI to the target CPU. 2887 * 2888 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2889 * for complete (kernel) page tables, and PCIDs for user mode page 2890 * tables. A user PCID value is obtained from the kernel PCID value 2891 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2892 * 2893 * User space page tables are activated on return to user mode, by 2894 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2895 * clearing bit 63 of the loaded ucr3, this effectively causes 2896 * complete invalidation of the user mode TLB entries for the 2897 * current pmap. In which case, local invalidations of individual 2898 * pages in the user page table are skipped. 2899 * 2900 * * Local invalidation, all modes. If the requested invalidation is 2901 * for a specific address or the total invalidation of a currently 2902 * active pmap, then the TLB is flushed using INVLPG for a kernel 2903 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a 2904 * user space page table(s). 2905 * 2906 * If the INVPCID instruction is available, it is used to flush user 2907 * entries from the kernel page table. 2908 * 2909 * When PCID is enabled, the INVLPG instruction invalidates all TLB 2910 * entries for the given page that either match the current PCID or 2911 * are global. Since TLB entries for the same page under different 2912 * PCIDs are unaffected, kernel pages which reside in all address 2913 * spaces could be problematic. We avoid the problem by creating 2914 * all kernel PTEs with the global flag (PG_G) set, when PTI is 2915 * disabled. 2916 * 2917 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its 2918 * address space, all other 4095 PCIDs are used for user mode spaces 2919 * as described above. A context switch allocates a new PCID if 2920 * the recorded PCID is zero or the recorded generation does not match 2921 * the CPU's generation, effectively flushing the TLB for this address space. 2922 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2923 * local user page: INVLPG 2924 * local kernel page: INVLPG 2925 * local user total: INVPCID(CTX) 2926 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2927 * remote user page, inactive pmap: zero pm_gen 2928 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2929 * (Both actions are required to handle the aforementioned pm_active races.) 
2930 * remote kernel page: IPI:INVLPG 2931 * remote user total, inactive pmap: zero pm_gen 2932 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2933 * reload %cr3) 2934 * (See note above about pm_active races.) 2935 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2936 * 2937 * PTI enabled, PCID present. 2938 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2939 * for upt 2940 * local kernel page: INVLPG 2941 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 2942 * on loading UCR3 into %cr3 for upt 2943 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2944 * remote user page, inactive pmap: zero pm_gen 2945 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 2946 * INVPCID(ADDR) for upt) 2947 * remote kernel page: IPI:INVLPG 2948 * remote user total, inactive pmap: zero pm_gen 2949 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 2950 * clear PCID_SAVE on loading UCR3 into $cr3 for upt) 2951 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2952 * 2953 * No PCID. 2954 * local user page: INVLPG 2955 * local kernel page: INVLPG 2956 * local user total: reload %cr3 2957 * local kernel total: invltlb_glob() 2958 * remote user page, inactive pmap: - 2959 * remote user page, active pmap: IPI:INVLPG 2960 * remote kernel page: IPI:INVLPG 2961 * remote user total, inactive pmap: - 2962 * remote user total, active pmap: IPI:(reload %cr3) 2963 * remote kernel total: IPI:invltlb_glob() 2964 * Since on return to user mode, the reload of %cr3 with ucr3 causes 2965 * TLB invalidation, no specific action is required for user page table. 2966 * 2967 * EPT. EPT pmaps do not map KVA, all mappings are userspace. 2968 * XXX TODO 2969 */ 2970 2971 #ifdef SMP 2972 /* 2973 * Interrupt the cpus that are executing in the guest context. 2974 * This will force the vcpu to exit and the cached EPT mappings 2975 * will be invalidated by the host before the next vmresume. 2976 */ 2977 static __inline void 2978 pmap_invalidate_ept(pmap_t pmap) 2979 { 2980 smr_seq_t goal; 2981 int ipinum; 2982 2983 sched_pin(); 2984 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 2985 ("pmap_invalidate_ept: absurd pm_active")); 2986 2987 /* 2988 * The TLB mappings associated with a vcpu context are not 2989 * flushed each time a different vcpu is chosen to execute. 2990 * 2991 * This is in contrast with a process's vtop mappings that 2992 * are flushed from the TLB on each context switch. 2993 * 2994 * Therefore we need to do more than just a TLB shootdown on 2995 * the active cpus in 'pmap->pm_active'. To do this we keep 2996 * track of the number of invalidations performed on this pmap. 2997 * 2998 * Each vcpu keeps a cache of this counter and compares it 2999 * just before a vmresume. If the counter is out-of-date an 3000 * invept will be done to flush stale mappings from the TLB. 3001 * 3002 * To ensure that all vCPU threads have observed the new counter 3003 * value before returning, we use SMR. Ordering is important here: 3004 * the VMM enters an SMR read section before loading the counter 3005 * and after updating the pm_active bit set. Thus, pm_active is 3006 * a superset of active readers, and any reader that has observed 3007 * the goal has observed the new counter value. 3008 */ 3009 atomic_add_long(&pmap->pm_eptgen, 1); 3010 3011 goal = smr_advance(pmap->pm_eptsmr); 3012 3013 /* 3014 * Force the vcpu to exit and trap back into the hypervisor. 
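 * The IPI vector is taken from pm_flags & PMAP_NESTED_IPIMASK, presumably programmed there by the hypervisor when the nested pmap was created, and pm_active here is the set of CPUs currently running this guest's vCPUs.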
3015 */ 3016 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3017 ipi_selected(pmap->pm_active, ipinum); 3018 sched_unpin(); 3019 3020 /* 3021 * Ensure that all active vCPUs will observe the new generation counter 3022 * value before executing any more guest instructions. 3023 */ 3024 smr_wait(pmap->pm_eptsmr, goal); 3025 } 3026 3027 static inline void 3028 pmap_invalidate_preipi_pcid(pmap_t pmap) 3029 { 3030 u_int cpuid, i; 3031 3032 sched_pin(); 3033 3034 cpuid = PCPU_GET(cpuid); 3035 if (pmap != PCPU_GET(curpmap)) 3036 cpuid = 0xffffffff; /* An impossible value */ 3037 3038 CPU_FOREACH(i) { 3039 if (cpuid != i) 3040 pmap->pm_pcids[i].pm_gen = 0; 3041 } 3042 3043 /* 3044 * The fence is between stores to pm_gen and the read of the 3045 * pm_active mask. We need to ensure that it is impossible 3046 * for us to miss the bit update in pm_active and 3047 * simultaneously observe a non-zero pm_gen in 3048 * pmap_activate_sw(), otherwise TLB update is missed. 3049 * Without the fence, IA32 allows such an outcome. Note that 3050 * pm_active is updated by a locked operation, which provides 3051 * the reciprocal fence. 3052 */ 3053 atomic_thread_fence_seq_cst(); 3054 } 3055 3056 static void 3057 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3058 { 3059 sched_pin(); 3060 } 3061 3062 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3063 { 3064 return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid : 3065 pmap_invalidate_preipi_nopcid); 3066 } 3067 3068 static inline void 3069 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3070 const bool invpcid_works1) 3071 { 3072 struct invpcid_descr d; 3073 uint64_t kcr3, ucr3; 3074 uint32_t pcid; 3075 u_int cpuid; 3076 3077 /* 3078 * Because pm_pcid is recalculated on a context switch, we 3079 * must ensure there is no preemption, not just pinning. 3080 * Otherwise, we might use a stale value below. 3081 */ 3082 CRITICAL_ASSERT(curthread); 3083 3084 /* 3085 * No need to do anything with user page tables invalidation 3086 * if there is no user page table, or invalidation is deferred 3087 * until the return to userspace. ucr3_load_mask is stable 3088 * because we have preemption disabled. 3089 */ 3090 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3091 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3092 return; 3093 3094 cpuid = PCPU_GET(cpuid); 3095 3096 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3097 if (invpcid_works1) { 3098 d.pcid = pcid | PMAP_PCID_USER_PT; 3099 d.pad = 0; 3100 d.addr = va; 3101 invpcid(&d, INVPCID_ADDR); 3102 } else { 3103 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3104 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3105 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3106 } 3107 } 3108 3109 static void 3110 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3111 { 3112 pmap_invalidate_page_pcid_cb(pmap, va, true); 3113 } 3114 3115 static void 3116 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3117 { 3118 pmap_invalidate_page_pcid_cb(pmap, va, false); 3119 } 3120 3121 static void 3122 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3123 { 3124 } 3125 3126 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3127 { 3128 if (pmap_pcid_enabled) 3129 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : 3130 pmap_invalidate_page_pcid_noinvpcid_cb); 3131 return (pmap_invalidate_page_nopcid_cb); 3132 } 3133 3134 static void 3135 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3136 vm_offset_t addr2 __unused) 3137 { 3138 if (pmap == kernel_pmap) { 3139 pmap_invlpg(kernel_pmap, va); 3140 } else if (pmap == PCPU_GET(curpmap)) { 3141 invlpg(va); 3142 pmap_invalidate_page_cb(pmap, va); 3143 } 3144 } 3145 3146 void 3147 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3148 { 3149 if (pmap_type_guest(pmap)) { 3150 pmap_invalidate_ept(pmap); 3151 return; 3152 } 3153 3154 KASSERT(pmap->pm_type == PT_X86, 3155 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3156 3157 pmap_invalidate_preipi(pmap); 3158 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb); 3159 } 3160 3161 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3162 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3163 3164 static void 3165 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3166 const bool invpcid_works1) 3167 { 3168 struct invpcid_descr d; 3169 uint64_t kcr3, ucr3; 3170 uint32_t pcid; 3171 u_int cpuid; 3172 3173 CRITICAL_ASSERT(curthread); 3174 3175 if (pmap != PCPU_GET(curpmap) || 3176 pmap->pm_ucr3 == PMAP_NO_CR3 || 3177 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3178 return; 3179 3180 cpuid = PCPU_GET(cpuid); 3181 3182 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3183 if (invpcid_works1) { 3184 d.pcid = pcid | PMAP_PCID_USER_PT; 3185 d.pad = 0; 3186 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3187 invpcid(&d, INVPCID_ADDR); 3188 } else { 3189 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3190 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3191 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3192 } 3193 } 3194 3195 static void 3196 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3197 vm_offset_t eva) 3198 { 3199 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3200 } 3201 3202 static void 3203 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3204 vm_offset_t eva) 3205 { 3206 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3207 } 3208 3209 static void 3210 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3211 vm_offset_t eva __unused) 3212 { 3213 } 3214 3215 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3216 vm_offset_t)) 3217 { 3218 if (pmap_pcid_enabled) 3219 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : 3220 pmap_invalidate_range_pcid_noinvpcid_cb); 3221 return (pmap_invalidate_range_nopcid_cb); 3222 } 3223 3224 static void 3225 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3226 { 3227 vm_offset_t addr; 3228 3229 if (pmap == kernel_pmap) { 3230 if (PCPU_GET(pcid_invlpg_workaround)) { 3231 struct invpcid_descr d = { 0 }; 3232 3233 invpcid(&d, INVPCID_CTXGLOB); 3234 } else { 3235 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3236 invlpg(addr); 3237 } 3238 } else if (pmap == PCPU_GET(curpmap)) { 3239 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3240 invlpg(addr); 3241 pmap_invalidate_range_cb(pmap, sva, eva); 3242 } 3243 } 3244 3245 void 3246 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3247 { 3248 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3249 pmap_invalidate_all(pmap); 3250 return; 3251 } 3252 3253 if (pmap_type_guest(pmap)) { 3254 pmap_invalidate_ept(pmap); 3255 return; 3256 } 3257 3258 KASSERT(pmap->pm_type == PT_X86, 3259 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3260 3261 pmap_invalidate_preipi(pmap); 3262 smp_masked_invlpg_range(sva, eva, pmap, 3263 pmap_invalidate_range_curcpu_cb); 3264 } 3265 3266 static inline void 3267 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3268 { 3269 struct invpcid_descr d; 3270 uint64_t kcr3; 3271 uint32_t pcid; 3272 u_int cpuid; 3273 3274 if (pmap == kernel_pmap) { 3275 if (invpcid_works1) { 3276 bzero(&d, sizeof(d)); 3277 invpcid(&d, INVPCID_CTXGLOB); 3278 } else { 3279 invltlb_glob(); 3280 } 3281 } else if (pmap == PCPU_GET(curpmap)) { 3282 CRITICAL_ASSERT(curthread); 3283 cpuid = PCPU_GET(cpuid); 3284 3285 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3286 if (invpcid_works1) { 3287 d.pcid = pcid; 3288 d.pad = 0; 3289 d.addr = 0; 3290 invpcid(&d, INVPCID_CTX); 3291 } else { 3292 kcr3 = pmap->pm_cr3 | pcid; 3293 load_cr3(kcr3); 3294 } 3295 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3296 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3297 } 3298 } 3299 3300 static void 3301 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3302 { 3303 pmap_invalidate_all_pcid_cb(pmap, true); 3304 } 3305 3306 static void 3307 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3308 { 3309 pmap_invalidate_all_pcid_cb(pmap, false); 3310 } 3311 3312 static void 3313 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3314 { 3315 if (pmap == kernel_pmap) 3316 invltlb_glob(); 3317 else if (pmap == PCPU_GET(curpmap)) 3318 invltlb(); 3319 } 3320 3321 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3322 { 3323 if (pmap_pcid_enabled) 3324 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3325 pmap_invalidate_all_pcid_noinvpcid_cb); 3326 return (pmap_invalidate_all_nopcid_cb); 3327 } 3328 3329 static void 3330 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3331 vm_offset_t addr2 __unused) 3332 { 3333 pmap_invalidate_all_cb(pmap); 3334 } 3335 3336 void 3337 pmap_invalidate_all(pmap_t pmap) 3338 { 3339 if (pmap_type_guest(pmap)) { 3340 pmap_invalidate_ept(pmap); 3341 return; 3342 } 3343 3344 KASSERT(pmap->pm_type == PT_X86, 3345 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3346 3347 pmap_invalidate_preipi(pmap); 3348 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb); 3349 } 3350 3351 static void 3352 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3353 vm_offset_t addr2 __unused) 3354 { 3355 wbinvd(); 3356 } 3357 3358 void 3359 pmap_invalidate_cache(void) 3360 { 3361 sched_pin(); 3362 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3363 } 3364 3365 struct pde_action { 3366 cpuset_t invalidate; /* processors that invalidate their TLB */ 3367 pmap_t pmap; 3368 vm_offset_t va; 3369 pd_entry_t *pde; 3370 pd_entry_t newpde; 3371 u_int store; /* processor that updates the PDE */ 3372 }; 3373 3374 static void 3375 pmap_update_pde_action(void *arg) 3376 { 3377 struct pde_action *act = arg; 3378 3379 if (act->store == PCPU_GET(cpuid)) 3380 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3381 } 3382 3383 static void 3384 pmap_update_pde_teardown(void *arg) 3385 { 3386 struct pde_action *act = arg; 3387 3388 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3389 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3390 } 3391 3392 /* 3393 * Change the page size for the specified virtual address in a way that 3394 * prevents any possibility of the TLB ever having two entries that map the 3395 * same virtual address using different page sizes. This is the recommended 3396 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3397 * machine check exception for a TLB state that is improperly diagnosed as a 3398 * hardware error. 3399 */ 3400 static void 3401 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3402 { 3403 struct pde_action act; 3404 cpuset_t active, other_cpus; 3405 u_int cpuid; 3406 3407 sched_pin(); 3408 cpuid = PCPU_GET(cpuid); 3409 other_cpus = all_cpus; 3410 CPU_CLR(cpuid, &other_cpus); 3411 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3412 active = all_cpus; 3413 else { 3414 active = pmap->pm_active; 3415 } 3416 if (CPU_OVERLAP(&active, &other_cpus)) { 3417 act.store = cpuid; 3418 act.invalidate = active; 3419 act.va = va; 3420 act.pmap = pmap; 3421 act.pde = pde; 3422 act.newpde = newpde; 3423 CPU_SET(cpuid, &active); 3424 smp_rendezvous_cpus(active, 3425 smp_no_rendezvous_barrier, pmap_update_pde_action, 3426 pmap_update_pde_teardown, &act); 3427 } else { 3428 pmap_update_pde_store(pmap, pde, newpde); 3429 if (CPU_ISSET(cpuid, &active)) 3430 pmap_update_pde_invalidate(pmap, va, newpde); 3431 } 3432 sched_unpin(); 3433 } 3434 #else /* !SMP */ 3435 /* 3436 * Normal, non-SMP, invalidation functions. 
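 * These mirror the SMP variants above, but on a uniprocessor kernel only the local TLB matters and only the single pm_pcids[0] slot is used to track PCID state.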
3437 */ 3438 void 3439 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3440 { 3441 struct invpcid_descr d; 3442 uint64_t kcr3, ucr3; 3443 uint32_t pcid; 3444 3445 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3446 pmap->pm_eptgen++; 3447 return; 3448 } 3449 KASSERT(pmap->pm_type == PT_X86, 3450 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3451 3452 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3453 invlpg(va); 3454 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3455 pmap->pm_ucr3 != PMAP_NO_CR3) { 3456 critical_enter(); 3457 pcid = pmap->pm_pcids[0].pm_pcid; 3458 if (invpcid_works) { 3459 d.pcid = pcid | PMAP_PCID_USER_PT; 3460 d.pad = 0; 3461 d.addr = va; 3462 invpcid(&d, INVPCID_ADDR); 3463 } else { 3464 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3465 ucr3 = pmap->pm_ucr3 | pcid | 3466 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3467 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3468 } 3469 critical_exit(); 3470 } 3471 } else if (pmap_pcid_enabled) 3472 pmap->pm_pcids[0].pm_gen = 0; 3473 } 3474 3475 void 3476 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3477 { 3478 struct invpcid_descr d; 3479 vm_offset_t addr; 3480 uint64_t kcr3, ucr3; 3481 3482 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3483 pmap->pm_eptgen++; 3484 return; 3485 } 3486 KASSERT(pmap->pm_type == PT_X86, 3487 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3488 3489 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3490 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3491 invlpg(addr); 3492 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3493 pmap->pm_ucr3 != PMAP_NO_CR3) { 3494 critical_enter(); 3495 if (invpcid_works) { 3496 d.pcid = pmap->pm_pcids[0].pm_pcid | 3497 PMAP_PCID_USER_PT; 3498 d.pad = 0; 3499 d.addr = sva; 3500 for (; d.addr < eva; d.addr += PAGE_SIZE) 3501 invpcid(&d, INVPCID_ADDR); 3502 } else { 3503 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 3504 pm_pcid | CR3_PCID_SAVE; 3505 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
3506 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3507 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3508 } 3509 critical_exit(); 3510 } 3511 } else if (pmap_pcid_enabled) { 3512 pmap->pm_pcids[0].pm_gen = 0; 3513 } 3514 } 3515 3516 void 3517 pmap_invalidate_all(pmap_t pmap) 3518 { 3519 struct invpcid_descr d; 3520 uint64_t kcr3, ucr3; 3521 3522 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3523 pmap->pm_eptgen++; 3524 return; 3525 } 3526 KASSERT(pmap->pm_type == PT_X86, 3527 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3528 3529 if (pmap == kernel_pmap) { 3530 if (pmap_pcid_enabled && invpcid_works) { 3531 bzero(&d, sizeof(d)); 3532 invpcid(&d, INVPCID_CTXGLOB); 3533 } else { 3534 invltlb_glob(); 3535 } 3536 } else if (pmap == PCPU_GET(curpmap)) { 3537 if (pmap_pcid_enabled) { 3538 critical_enter(); 3539 if (invpcid_works) { 3540 d.pcid = pmap->pm_pcids[0].pm_pcid; 3541 d.pad = 0; 3542 d.addr = 0; 3543 invpcid(&d, INVPCID_CTX); 3544 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3545 d.pcid |= PMAP_PCID_USER_PT; 3546 invpcid(&d, INVPCID_CTX); 3547 } 3548 } else { 3549 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 3550 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3551 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 3552 0].pm_pcid | PMAP_PCID_USER_PT; 3553 pmap_pti_pcid_invalidate(ucr3, kcr3); 3554 } else 3555 load_cr3(kcr3); 3556 } 3557 critical_exit(); 3558 } else { 3559 invltlb(); 3560 } 3561 } else if (pmap_pcid_enabled) { 3562 pmap->pm_pcids[0].pm_gen = 0; 3563 } 3564 } 3565 3566 PMAP_INLINE void 3567 pmap_invalidate_cache(void) 3568 { 3569 3570 wbinvd(); 3571 } 3572 3573 static void 3574 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3575 { 3576 3577 pmap_update_pde_store(pmap, pde, newpde); 3578 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3579 pmap_update_pde_invalidate(pmap, va, newpde); 3580 else 3581 pmap->pm_pcids[0].pm_gen = 0; 3582 } 3583 #endif /* !SMP */ 3584 3585 static void 3586 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3587 { 3588 3589 /* 3590 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3591 * by a promotion that did not invalidate the 512 4KB page mappings 3592 * that might exist in the TLB. Consequently, at this point, the TLB 3593 * may hold both 4KB and 2MB page mappings for the address range [va, 3594 * va + NBPDR). Therefore, the entire range must be invalidated here. 3595 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3596 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3597 * single INVLPG suffices to invalidate the 2MB page mapping from the 3598 * TLB. 
3599 */ 3600 if ((pde & PG_PROMOTED) != 0) 3601 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3602 else 3603 pmap_invalidate_page(pmap, va); 3604 } 3605 3606 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3607 (vm_offset_t sva, vm_offset_t eva)) 3608 { 3609 3610 if ((cpu_feature & CPUID_SS) != 0) 3611 return (pmap_invalidate_cache_range_selfsnoop); 3612 if ((cpu_feature & CPUID_CLFSH) != 0) 3613 return (pmap_force_invalidate_cache_range); 3614 return (pmap_invalidate_cache_range_all); 3615 } 3616 3617 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3618 3619 static void 3620 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3621 { 3622 3623 KASSERT((sva & PAGE_MASK) == 0, 3624 ("pmap_invalidate_cache_range: sva not page-aligned")); 3625 KASSERT((eva & PAGE_MASK) == 0, 3626 ("pmap_invalidate_cache_range: eva not page-aligned")); 3627 } 3628 3629 static void 3630 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3631 { 3632 3633 pmap_invalidate_cache_range_check_align(sva, eva); 3634 } 3635 3636 void 3637 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3638 { 3639 3640 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3641 3642 /* 3643 * XXX: Some CPUs fault, hang, or trash the local APIC 3644 * registers if we use CLFLUSH on the local APIC range. The 3645 * local APIC is always uncached, so we don't need to flush 3646 * for that range anyway. 3647 */ 3648 if (pmap_kextract(sva) == lapic_paddr) 3649 return; 3650 3651 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3652 /* 3653 * Do per-cache line flush. Use a locked 3654 * instruction to insure that previous stores are 3655 * included in the write-back. The processor 3656 * propagates flush to other processors in the cache 3657 * coherence domain. 3658 */ 3659 atomic_thread_fence_seq_cst(); 3660 for (; sva < eva; sva += cpu_clflush_line_size) 3661 clflushopt(sva); 3662 atomic_thread_fence_seq_cst(); 3663 } else { 3664 /* 3665 * Writes are ordered by CLFLUSH on Intel CPUs. 3666 */ 3667 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3668 mfence(); 3669 for (; sva < eva; sva += cpu_clflush_line_size) 3670 clflush(sva); 3671 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3672 mfence(); 3673 } 3674 } 3675 3676 static void 3677 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3678 { 3679 3680 pmap_invalidate_cache_range_check_align(sva, eva); 3681 pmap_invalidate_cache(); 3682 } 3683 3684 /* 3685 * Remove the specified set of pages from the data and instruction caches. 3686 * 3687 * In contrast to pmap_invalidate_cache_range(), this function does not 3688 * rely on the CPU's self-snoop feature, because it is intended for use 3689 * when moving pages into a different cache domain. 
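 *
 * Example (illustrative sketch, not from the original source): a hypothetical
 * caller handing a set of pages to a non-snooping device would flush them
 * after collecting the vm_page pointers, e.g.:
 *
 *	vm_page_t ma[NPAGES];
 *
 *	...fill ma[0..NPAGES-1] with the pages being handed off...
 *	pmap_invalidate_cache_pages(ma, NPAGES);
 *
 * where NPAGES is a caller-defined count; no self-snoop capability is assumed.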
3690 */ 3691 void 3692 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3693 { 3694 vm_offset_t daddr, eva; 3695 int i; 3696 bool useclflushopt; 3697 3698 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3699 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3700 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3701 pmap_invalidate_cache(); 3702 else { 3703 if (useclflushopt) 3704 atomic_thread_fence_seq_cst(); 3705 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3706 mfence(); 3707 for (i = 0; i < count; i++) { 3708 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3709 eva = daddr + PAGE_SIZE; 3710 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3711 if (useclflushopt) 3712 clflushopt(daddr); 3713 else 3714 clflush(daddr); 3715 } 3716 } 3717 if (useclflushopt) 3718 atomic_thread_fence_seq_cst(); 3719 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3720 mfence(); 3721 } 3722 } 3723 3724 void 3725 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3726 { 3727 3728 pmap_invalidate_cache_range_check_align(sva, eva); 3729 3730 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3731 pmap_force_invalidate_cache_range(sva, eva); 3732 return; 3733 } 3734 3735 /* See comment in pmap_force_invalidate_cache_range(). */ 3736 if (pmap_kextract(sva) == lapic_paddr) 3737 return; 3738 3739 atomic_thread_fence_seq_cst(); 3740 for (; sva < eva; sva += cpu_clflush_line_size) 3741 clwb(sva); 3742 atomic_thread_fence_seq_cst(); 3743 } 3744 3745 void 3746 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3747 { 3748 pt_entry_t *pte; 3749 vm_offset_t vaddr; 3750 int error __diagused; 3751 int pte_bits; 3752 3753 KASSERT((spa & PAGE_MASK) == 0, 3754 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3755 KASSERT((epa & PAGE_MASK) == 0, 3756 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3757 3758 if (spa < dmaplimit) { 3759 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3760 dmaplimit, epa))); 3761 if (dmaplimit >= epa) 3762 return; 3763 spa = dmaplimit; 3764 } 3765 3766 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 3767 X86_PG_V; 3768 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3769 &vaddr); 3770 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3771 pte = vtopte(vaddr); 3772 for (; spa < epa; spa += PAGE_SIZE) { 3773 sched_pin(); 3774 pte_store(pte, spa | pte_bits); 3775 pmap_invlpg(kernel_pmap, vaddr); 3776 /* XXXKIB atomic inside flush_cache_range are excessive */ 3777 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3778 sched_unpin(); 3779 } 3780 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3781 } 3782 3783 /* 3784 * Routine: pmap_extract 3785 * Function: 3786 * Extract the physical page address associated 3787 * with the given map/virtual_address pair. 
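 *
 * Example (illustrative sketch, not from the original source): pmap_extract()
 * returns 0 when no valid mapping exists, so a hypothetical caller might
 * translate and then check for a missing mapping:
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(kernel_pmap, va);
 *	if (pa == 0)
 *		...handle the unmapped virtual address...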
3788 */ 3789 vm_paddr_t 3790 pmap_extract(pmap_t pmap, vm_offset_t va) 3791 { 3792 pdp_entry_t *pdpe; 3793 pd_entry_t *pde; 3794 pt_entry_t *pte, PG_V; 3795 vm_paddr_t pa; 3796 3797 pa = 0; 3798 PG_V = pmap_valid_bit(pmap); 3799 PMAP_LOCK(pmap); 3800 pdpe = pmap_pdpe(pmap, va); 3801 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3802 if ((*pdpe & PG_PS) != 0) 3803 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3804 else { 3805 pde = pmap_pdpe_to_pde(pdpe, va); 3806 if ((*pde & PG_V) != 0) { 3807 if ((*pde & PG_PS) != 0) { 3808 pa = (*pde & PG_PS_FRAME) | 3809 (va & PDRMASK); 3810 } else { 3811 pte = pmap_pde_to_pte(pde, va); 3812 pa = (*pte & PG_FRAME) | 3813 (va & PAGE_MASK); 3814 } 3815 } 3816 } 3817 } 3818 PMAP_UNLOCK(pmap); 3819 return (pa); 3820 } 3821 3822 /* 3823 * Routine: pmap_extract_and_hold 3824 * Function: 3825 * Atomically extract and hold the physical page 3826 * with the given pmap and virtual address pair 3827 * if that mapping permits the given protection. 3828 */ 3829 vm_page_t 3830 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3831 { 3832 pdp_entry_t pdpe, *pdpep; 3833 pd_entry_t pde, *pdep; 3834 pt_entry_t pte, PG_RW, PG_V; 3835 vm_page_t m; 3836 3837 m = NULL; 3838 PG_RW = pmap_rw_bit(pmap); 3839 PG_V = pmap_valid_bit(pmap); 3840 PMAP_LOCK(pmap); 3841 3842 pdpep = pmap_pdpe(pmap, va); 3843 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3844 goto out; 3845 if ((pdpe & PG_PS) != 0) { 3846 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3847 goto out; 3848 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3849 goto check_page; 3850 } 3851 3852 pdep = pmap_pdpe_to_pde(pdpep, va); 3853 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3854 goto out; 3855 if ((pde & PG_PS) != 0) { 3856 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3857 goto out; 3858 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3859 goto check_page; 3860 } 3861 3862 pte = *pmap_pde_to_pte(pdep, va); 3863 if ((pte & PG_V) == 0 || 3864 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3865 goto out; 3866 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3867 3868 check_page: 3869 if (m != NULL && !vm_page_wire_mapped(m)) 3870 m = NULL; 3871 out: 3872 PMAP_UNLOCK(pmap); 3873 return (m); 3874 } 3875 3876 vm_paddr_t 3877 pmap_kextract(vm_offset_t va) 3878 { 3879 pd_entry_t pde; 3880 vm_paddr_t pa; 3881 3882 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3883 pa = DMAP_TO_PHYS(va); 3884 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3885 pa = pmap_large_map_kextract(va); 3886 } else { 3887 pde = *vtopde(va); 3888 if (pde & PG_PS) { 3889 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3890 } else { 3891 /* 3892 * Beware of a concurrent promotion that changes the 3893 * PDE at this point! For example, vtopte() must not 3894 * be used to access the PTE because it would use the 3895 * new PDE. It is, however, safe to use the old PDE 3896 * because the page table page is preserved by the 3897 * promotion. 3898 */ 3899 pa = *pmap_pde_to_pte(&pde, va); 3900 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3901 } 3902 } 3903 return (pa); 3904 } 3905 3906 /*************************************************** 3907 * Low level mapping routines..... 3908 ***************************************************/ 3909 3910 /* 3911 * Add a wired page to the kva. 3912 * Note: not SMP coherent. 
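 *
 * Example (illustrative sketch, not from the original source): because no TLB
 * shootdown is performed here, a hypothetical caller that rewrites a virtual
 * address that may already be cached in other TLBs can pair the call with an
 * explicit invalidation:
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);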
3913 */ 3914 PMAP_INLINE void 3915 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3916 { 3917 pt_entry_t *pte; 3918 3919 pte = vtopte(va); 3920 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3921 X86_PG_RW | X86_PG_V); 3922 } 3923 3924 static __inline void 3925 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3926 { 3927 pt_entry_t *pte; 3928 int cache_bits; 3929 3930 pte = vtopte(va); 3931 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 3932 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3933 X86_PG_RW | X86_PG_V | cache_bits); 3934 } 3935 3936 /* 3937 * Remove a page from the kernel pagetables. 3938 * Note: not SMP coherent. 3939 */ 3940 PMAP_INLINE void 3941 pmap_kremove(vm_offset_t va) 3942 { 3943 pt_entry_t *pte; 3944 3945 pte = vtopte(va); 3946 pte_clear(pte); 3947 } 3948 3949 /* 3950 * Used to map a range of physical addresses into kernel 3951 * virtual address space. 3952 * 3953 * The value passed in '*virt' is a suggested virtual address for 3954 * the mapping. Architectures which can support a direct-mapped 3955 * physical to virtual region can return the appropriate address 3956 * within that region, leaving '*virt' unchanged. Other 3957 * architectures should map the pages starting at '*virt' and 3958 * update '*virt' with the first usable address after the mapped 3959 * region. 3960 */ 3961 vm_offset_t 3962 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 3963 { 3964 return PHYS_TO_DMAP(start); 3965 } 3966 3967 /* 3968 * Add a list of wired pages to the kva 3969 * this routine is only used for temporary 3970 * kernel mappings that do not need to have 3971 * page modification or references recorded. 3972 * Note that old mappings are simply written 3973 * over. The page *must* be wired. 3974 * Note: SMP coherent. Uses a ranged shootdown IPI. 3975 */ 3976 void 3977 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 3978 { 3979 pt_entry_t *endpte, oldpte, pa, *pte; 3980 vm_page_t m; 3981 int cache_bits; 3982 3983 oldpte = 0; 3984 pte = vtopte(sva); 3985 endpte = pte + count; 3986 while (pte < endpte) { 3987 m = *ma++; 3988 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 3989 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 3990 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 3991 oldpte |= *pte; 3992 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | 3993 X86_PG_M | X86_PG_RW | X86_PG_V); 3994 } 3995 pte++; 3996 } 3997 if (__predict_false((oldpte & X86_PG_V) != 0)) 3998 pmap_invalidate_range(kernel_pmap, sva, sva + count * 3999 PAGE_SIZE); 4000 } 4001 4002 /* 4003 * This routine tears out page mappings from the 4004 * kernel -- it is meant only for temporary mappings. 4005 * Note: SMP coherent. Uses a ranged shootdown IPI. 4006 */ 4007 void 4008 pmap_qremove(vm_offset_t sva, int count) 4009 { 4010 vm_offset_t va; 4011 4012 va = sva; 4013 while (count-- > 0) { 4014 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 4015 pmap_kremove(va); 4016 va += PAGE_SIZE; 4017 } 4018 pmap_invalidate_range(kernel_pmap, sva, va); 4019 } 4020 4021 /*************************************************** 4022 * Page table page management routines..... 4023 ***************************************************/ 4024 /* 4025 * Schedule the specified unused page table page to be freed. Specifically, 4026 * add the page to the specified list of pages that will be released to the 4027 * physical memory manager after the TLB has been updated. 
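 *
 * Example (illustrative sketch, not from the original source): the usual
 * pattern, also visible in pmap_abort_ptp() below, is to gather pages on a
 * local spglist, issue the TLB invalidation, and only then free the pages:
 *
 *	struct spglist free;
 *
 *	SLIST_INIT(&free);
 *	if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 *		pmap_invalidate_page(pmap, va);
 *		vm_page_free_pages_toq(&free, true);
 *	}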
4028 */ 4029 static __inline void 4030 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4031 boolean_t set_PG_ZERO) 4032 { 4033 4034 if (set_PG_ZERO) 4035 m->flags |= PG_ZERO; 4036 else 4037 m->flags &= ~PG_ZERO; 4038 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4039 } 4040 4041 /* 4042 * Inserts the specified page table page into the specified pmap's collection 4043 * of idle page table pages. Each of a pmap's page table pages is responsible 4044 * for mapping a distinct range of virtual addresses. The pmap's collection is 4045 * ordered by this virtual address range. 4046 * 4047 * If "promoted" is false, then the page table page "mpte" must be zero filled. 4048 */ 4049 static __inline int 4050 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 4051 { 4052 4053 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4054 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 4055 return (vm_radix_insert(&pmap->pm_root, mpte)); 4056 } 4057 4058 /* 4059 * Removes the page table page mapping the specified virtual address from the 4060 * specified pmap's collection of idle page table pages, and returns it. 4061 * Otherwise, returns NULL if there is no page table page corresponding to the 4062 * specified virtual address. 4063 */ 4064 static __inline vm_page_t 4065 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4066 { 4067 4068 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4069 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4070 } 4071 4072 /* 4073 * Decrements a page table page's reference count, which is used to record the 4074 * number of valid page table entries within the page. If the reference count 4075 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4076 * page table page was unmapped and FALSE otherwise. 4077 */ 4078 static inline boolean_t 4079 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4080 { 4081 4082 --m->ref_count; 4083 if (m->ref_count == 0) { 4084 _pmap_unwire_ptp(pmap, va, m, free); 4085 return (TRUE); 4086 } else 4087 return (FALSE); 4088 } 4089 4090 static void 4091 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4092 { 4093 pml5_entry_t *pml5; 4094 pml4_entry_t *pml4; 4095 pdp_entry_t *pdp; 4096 pd_entry_t *pd; 4097 vm_page_t pdpg, pdppg, pml4pg; 4098 4099 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4100 4101 /* 4102 * unmap the page table page 4103 */ 4104 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4105 /* PML4 page */ 4106 MPASS(pmap_is_la57(pmap)); 4107 pml5 = pmap_pml5e(pmap, va); 4108 *pml5 = 0; 4109 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4110 pml5 = pmap_pml5e_u(pmap, va); 4111 *pml5 = 0; 4112 } 4113 } else if (m->pindex >= NUPDE + NUPDPE) { 4114 /* PDP page */ 4115 pml4 = pmap_pml4e(pmap, va); 4116 *pml4 = 0; 4117 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4118 va <= VM_MAXUSER_ADDRESS) { 4119 pml4 = pmap_pml4e_u(pmap, va); 4120 *pml4 = 0; 4121 } 4122 } else if (m->pindex >= NUPDE) { 4123 /* PD page */ 4124 pdp = pmap_pdpe(pmap, va); 4125 *pdp = 0; 4126 } else { 4127 /* PTE page */ 4128 pd = pmap_pde(pmap, va); 4129 *pd = 0; 4130 } 4131 if (m->pindex < NUPDE) { 4132 /* We just released a PT, unhold the matching PD */ 4133 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4134 pmap_unwire_ptp(pmap, va, pdpg, free); 4135 } else if (m->pindex < NUPDE + NUPDPE) { 4136 /* We just released a PD, unhold the matching PDP */ 4137 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4138 pmap_unwire_ptp(pmap, va, pdppg, free); 4139 } else if 
(m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4140 /* We just released a PDP, unhold the matching PML4 */ 4141 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4142 pmap_unwire_ptp(pmap, va, pml4pg, free); 4143 } 4144 4145 pmap_pt_page_count_adj(pmap, -1); 4146 4147 /* 4148 * Put page on a list so that it is released after 4149 * *ALL* TLB shootdown is done 4150 */ 4151 pmap_add_delayed_free_list(m, free, TRUE); 4152 } 4153 4154 /* 4155 * After removing a page table entry, this routine is used to 4156 * conditionally free the page, and manage the reference count. 4157 */ 4158 static int 4159 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4160 struct spglist *free) 4161 { 4162 vm_page_t mpte; 4163 4164 if (va >= VM_MAXUSER_ADDRESS) 4165 return (0); 4166 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4167 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4168 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4169 } 4170 4171 /* 4172 * Release a page table page reference after a failed attempt to create a 4173 * mapping. 4174 */ 4175 static void 4176 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4177 { 4178 struct spglist free; 4179 4180 SLIST_INIT(&free); 4181 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4182 /* 4183 * Although "va" was never mapped, paging-structure caches 4184 * could nonetheless have entries that refer to the freed 4185 * page table pages. Invalidate those entries. 4186 */ 4187 pmap_invalidate_page(pmap, va); 4188 vm_page_free_pages_toq(&free, true); 4189 } 4190 } 4191 4192 void 4193 pmap_pinit0(pmap_t pmap) 4194 { 4195 struct proc *p; 4196 struct thread *td; 4197 int i; 4198 4199 PMAP_LOCK_INIT(pmap); 4200 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4201 pmap->pm_pmltopu = NULL; 4202 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4203 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4204 pmap->pm_ucr3 = PMAP_NO_CR3; 4205 vm_radix_init(&pmap->pm_root); 4206 CPU_ZERO(&pmap->pm_active); 4207 TAILQ_INIT(&pmap->pm_pvchunk); 4208 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4209 pmap->pm_flags = pmap_flags; 4210 CPU_FOREACH(i) { 4211 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 4212 pmap->pm_pcids[i].pm_gen = 1; 4213 } 4214 pmap_activate_boot(pmap); 4215 td = curthread; 4216 if (pti) { 4217 p = td->td_proc; 4218 PROC_LOCK(p); 4219 p->p_md.md_flags |= P_MD_KPTI; 4220 PROC_UNLOCK(p); 4221 } 4222 pmap_thread_init_invl_gen(td); 4223 4224 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4225 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4226 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4227 UMA_ALIGN_PTR, 0); 4228 } 4229 } 4230 4231 void 4232 pmap_pinit_pml4(vm_page_t pml4pg) 4233 { 4234 pml4_entry_t *pm_pml4; 4235 int i; 4236 4237 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4238 4239 /* Wire in kernel global address entries. 
*/ 4240 for (i = 0; i < NKPML4E; i++) { 4241 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4242 X86_PG_V; 4243 } 4244 #ifdef KASAN 4245 for (i = 0; i < NKASANPML4E; i++) { 4246 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4247 X86_PG_V | pg_nx; 4248 } 4249 #endif 4250 #ifdef KMSAN 4251 for (i = 0; i < NKMSANSHADPML4E; i++) { 4252 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4253 X86_PG_RW | X86_PG_V | pg_nx; 4254 } 4255 for (i = 0; i < NKMSANORIGPML4E; i++) { 4256 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4257 X86_PG_RW | X86_PG_V | pg_nx; 4258 } 4259 #endif 4260 for (i = 0; i < ndmpdpphys; i++) { 4261 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4262 X86_PG_V; 4263 } 4264 4265 /* install self-referential address mapping entry(s) */ 4266 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4267 X86_PG_A | X86_PG_M; 4268 4269 /* install large map entries if configured */ 4270 for (i = 0; i < lm_ents; i++) 4271 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4272 } 4273 4274 void 4275 pmap_pinit_pml5(vm_page_t pml5pg) 4276 { 4277 pml5_entry_t *pm_pml5; 4278 4279 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4280 4281 /* 4282 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4283 * entering all existing kernel mappings into level 5 table. 4284 */ 4285 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4286 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4287 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4288 4289 /* 4290 * Install self-referential address mapping entry. 4291 */ 4292 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4293 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4294 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4295 } 4296 4297 static void 4298 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4299 { 4300 pml4_entry_t *pm_pml4u; 4301 int i; 4302 4303 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4304 for (i = 0; i < NPML4EPG; i++) 4305 pm_pml4u[i] = pti_pml4[i]; 4306 } 4307 4308 static void 4309 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4310 { 4311 pml5_entry_t *pm_pml5u; 4312 4313 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4314 pagezero(pm_pml5u); 4315 4316 /* 4317 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4318 * table, entering all kernel mappings needed for usermode 4319 * into level 5 table. 4320 */ 4321 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4322 pmap_kextract((vm_offset_t)pti_pml4) | 4323 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4324 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4325 } 4326 4327 /* Allocate a page table page and do related bookkeeping */ 4328 static vm_page_t 4329 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4330 { 4331 vm_page_t m; 4332 4333 m = vm_page_alloc_noobj(flags); 4334 if (__predict_false(m == NULL)) 4335 return (NULL); 4336 m->pindex = pindex; 4337 pmap_pt_page_count_adj(pmap, 1); 4338 return (m); 4339 } 4340 4341 static void 4342 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4343 { 4344 /* 4345 * This function assumes the page will need to be unwired, 4346 * even though the counterpart allocation in pmap_alloc_pt_page() 4347 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4348 * of pmap_free_pt_page() require unwiring. 
The case in which 4349 * a PT page doesn't require unwiring because its ref_count has 4350 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4351 */ 4352 vm_page_unwire_noq(m); 4353 if (zerofilled) 4354 vm_page_free_zero(m); 4355 else 4356 vm_page_free(m); 4357 4358 pmap_pt_page_count_adj(pmap, -1); 4359 } 4360 4361 /* 4362 * Initialize a preallocated and zeroed pmap structure, 4363 * such as one in a vmspace structure. 4364 */ 4365 int 4366 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4367 { 4368 vm_page_t pmltop_pg, pmltop_pgu; 4369 vm_paddr_t pmltop_phys; 4370 int i; 4371 4372 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4373 4374 /* 4375 * Allocate the page directory page. Pass NULL instead of a 4376 * pointer to the pmap here to avoid calling 4377 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4378 * since that requires pmap lock. Instead do the accounting 4379 * manually. 4380 * 4381 * Note that final call to pmap_remove() optimization that 4382 * checks for zero resident_count is basically disabled by 4383 * accounting for top-level page. But the optimization was 4384 * not effective since we started using non-managed mapping of 4385 * the shared page. 4386 */ 4387 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4388 VM_ALLOC_WAITOK); 4389 pmap_pt_page_count_pinit(pmap, 1); 4390 4391 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4392 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4393 4394 CPU_FOREACH(i) { 4395 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 4396 pmap->pm_pcids[i].pm_gen = 0; 4397 } 4398 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4399 pmap->pm_ucr3 = PMAP_NO_CR3; 4400 pmap->pm_pmltopu = NULL; 4401 4402 pmap->pm_type = pm_type; 4403 4404 /* 4405 * Do not install the host kernel mappings in the nested page 4406 * tables. These mappings are meaningless in the guest physical 4407 * address space. 4408 * Install minimal kernel mappings in PTI case. 4409 */ 4410 switch (pm_type) { 4411 case PT_X86: 4412 pmap->pm_cr3 = pmltop_phys; 4413 if (pmap_is_la57(pmap)) 4414 pmap_pinit_pml5(pmltop_pg); 4415 else 4416 pmap_pinit_pml4(pmltop_pg); 4417 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4418 /* 4419 * As with pmltop_pg, pass NULL instead of a 4420 * pointer to the pmap to ensure that the PTI 4421 * page counted explicitly. 
4422 */ 4423 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4424 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4425 pmap_pt_page_count_pinit(pmap, 1); 4426 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4427 VM_PAGE_TO_PHYS(pmltop_pgu)); 4428 if (pmap_is_la57(pmap)) 4429 pmap_pinit_pml5_pti(pmltop_pgu); 4430 else 4431 pmap_pinit_pml4_pti(pmltop_pgu); 4432 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4433 } 4434 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4435 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4436 pkru_free_range, pmap, M_NOWAIT); 4437 } 4438 break; 4439 case PT_EPT: 4440 case PT_RVI: 4441 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4442 break; 4443 } 4444 4445 vm_radix_init(&pmap->pm_root); 4446 CPU_ZERO(&pmap->pm_active); 4447 TAILQ_INIT(&pmap->pm_pvchunk); 4448 pmap->pm_flags = flags; 4449 pmap->pm_eptgen = 0; 4450 4451 return (1); 4452 } 4453 4454 int 4455 pmap_pinit(pmap_t pmap) 4456 { 4457 4458 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4459 } 4460 4461 static void 4462 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4463 { 4464 vm_page_t mpg; 4465 struct spglist free; 4466 4467 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4468 if (mpg->ref_count != 0) 4469 return; 4470 SLIST_INIT(&free); 4471 _pmap_unwire_ptp(pmap, va, mpg, &free); 4472 pmap_invalidate_page(pmap, va); 4473 vm_page_free_pages_toq(&free, true); 4474 } 4475 4476 static pml4_entry_t * 4477 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4478 bool addref) 4479 { 4480 vm_pindex_t pml5index; 4481 pml5_entry_t *pml5; 4482 pml4_entry_t *pml4; 4483 vm_page_t pml4pg; 4484 pt_entry_t PG_V; 4485 bool allocated; 4486 4487 if (!pmap_is_la57(pmap)) 4488 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4489 4490 PG_V = pmap_valid_bit(pmap); 4491 pml5index = pmap_pml5e_index(va); 4492 pml5 = &pmap->pm_pmltop[pml5index]; 4493 if ((*pml5 & PG_V) == 0) { 4494 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4495 va) == NULL) 4496 return (NULL); 4497 allocated = true; 4498 } else { 4499 allocated = false; 4500 } 4501 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4502 pml4 = &pml4[pmap_pml4e_index(va)]; 4503 if ((*pml4 & PG_V) == 0) { 4504 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4505 if (allocated && !addref) 4506 pml4pg->ref_count--; 4507 else if (!allocated && addref) 4508 pml4pg->ref_count++; 4509 } 4510 return (pml4); 4511 } 4512 4513 static pdp_entry_t * 4514 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4515 bool addref) 4516 { 4517 vm_page_t pdppg; 4518 pml4_entry_t *pml4; 4519 pdp_entry_t *pdp; 4520 pt_entry_t PG_V; 4521 bool allocated; 4522 4523 PG_V = pmap_valid_bit(pmap); 4524 4525 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); 4526 if (pml4 == NULL) 4527 return (NULL); 4528 4529 if ((*pml4 & PG_V) == 0) { 4530 /* Have to allocate a new pdp, recurse */ 4531 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, 4532 va) == NULL) { 4533 if (pmap_is_la57(pmap)) 4534 pmap_allocpte_free_unref(pmap, va, 4535 pmap_pml5e(pmap, va)); 4536 return (NULL); 4537 } 4538 allocated = true; 4539 } else { 4540 allocated = false; 4541 } 4542 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 4543 pdp = &pdp[pmap_pdpe_index(va)]; 4544 if ((*pdp & PG_V) == 0) { 4545 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 4546 if (allocated && !addref) 4547 pdppg->ref_count--; 4548 else if (!allocated && addref) 4549 pdppg->ref_count++; 4550 } 4551 return (pdp); 4552 } 4553 4554 /* 4555 * The ptepindexes, i.e. 
page indices, of the page table pages encountered 4556 * while translating virtual address va are defined as follows: 4557 * - for the page table page (last level), 4558 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, 4559 * in other words, it is just the index of the PDE that maps the page 4560 * table page. 4561 * - for the page directory page, 4562 * ptepindex = NUPDE (number of userland PD entries) + 4563 * (pmap_pde_pindex(va) >> NPDEPGSHIFT) 4564 * i.e. index of PDPE is put after the last index of PDE, 4565 * - for the page directory pointer page, 4566 * ptepindex = NUPDE + NUPDPE + (pmap_pde_pindex(va) >> (NPDEPGSHIFT + 4567 * NPML4EPGSHIFT)), 4568 * i.e. index of pml4e is put after the last index of PDPE, 4569 * - for the PML4 page (if LA57 mode is enabled), 4570 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_pindex(va) >> 4571 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)), 4572 * i.e. index of pml5e is put after the last index of PML4E. 4573 * 4574 * Define an order on the paging entries, where all entries of the 4575 * same height are put together, then heights are put from deepest to 4576 * root. Then ptepindex is the sequential number of the 4577 * corresponding paging entry in this order. 4578 * 4579 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of 4580 * LA57 paging structures even in LA48 paging mode. Moreover, the 4581 * ptepindexes are calculated as if the paging structures were 5-level 4582 * regardless of the actual mode of operation. 4583 * 4584 * The root page at PML4/PML5 does not participate in this indexing scheme, 4585 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). 4586 */ 4587 static vm_page_t 4588 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4589 vm_offset_t va) 4590 { 4591 vm_pindex_t pml5index, pml4index; 4592 pml5_entry_t *pml5, *pml5u; 4593 pml4_entry_t *pml4, *pml4u; 4594 pdp_entry_t *pdp; 4595 pd_entry_t *pd; 4596 vm_page_t m, pdpg; 4597 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4598 4599 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4600 4601 PG_A = pmap_accessed_bit(pmap); 4602 PG_M = pmap_modified_bit(pmap); 4603 PG_V = pmap_valid_bit(pmap); 4604 PG_RW = pmap_rw_bit(pmap); 4605 4606 /* 4607 * Allocate a page table page. 4608 */ 4609 m = pmap_alloc_pt_page(pmap, ptepindex, 4610 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4611 if (m == NULL) 4612 return (NULL); 4613 4614 /* 4615 * Map the pagetable page into the process address space, if 4616 * it isn't already there.
4617 */ 4618 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4619 MPASS(pmap_is_la57(pmap)); 4620 4621 pml5index = pmap_pml5e_index(va); 4622 pml5 = &pmap->pm_pmltop[pml5index]; 4623 KASSERT((*pml5 & PG_V) == 0, 4624 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4625 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4626 4627 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4628 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4629 *pml5 |= pg_nx; 4630 4631 pml5u = &pmap->pm_pmltopu[pml5index]; 4632 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4633 PG_A | PG_M; 4634 } 4635 } else if (ptepindex >= NUPDE + NUPDPE) { 4636 pml4index = pmap_pml4e_index(va); 4637 /* Wire up a new PDPE page */ 4638 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4639 if (pml4 == NULL) { 4640 pmap_free_pt_page(pmap, m, true); 4641 return (NULL); 4642 } 4643 KASSERT((*pml4 & PG_V) == 0, 4644 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4645 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4646 4647 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4648 pml4index < NUPML4E) { 4649 /* 4650 * PTI: Make all user-space mappings in the 4651 * kernel-mode page table no-execute so that 4652 * we detect any programming errors that leave 4653 * the kernel-mode page table active on return 4654 * to user space. 4655 */ 4656 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4657 *pml4 |= pg_nx; 4658 4659 pml4u = &pmap->pm_pmltopu[pml4index]; 4660 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4661 PG_A | PG_M; 4662 } 4663 } else if (ptepindex >= NUPDE) { 4664 /* Wire up a new PDE page */ 4665 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4666 if (pdp == NULL) { 4667 pmap_free_pt_page(pmap, m, true); 4668 return (NULL); 4669 } 4670 KASSERT((*pdp & PG_V) == 0, 4671 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4672 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4673 } else { 4674 /* Wire up a new PTE page */ 4675 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4676 if (pdp == NULL) { 4677 pmap_free_pt_page(pmap, m, true); 4678 return (NULL); 4679 } 4680 if ((*pdp & PG_V) == 0) { 4681 /* Have to allocate a new pd, recurse */ 4682 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4683 lockp, va) == NULL) { 4684 pmap_allocpte_free_unref(pmap, va, 4685 pmap_pml4e(pmap, va)); 4686 pmap_free_pt_page(pmap, m, true); 4687 return (NULL); 4688 } 4689 } else { 4690 /* Add reference to the pd page */ 4691 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4692 pdpg->ref_count++; 4693 } 4694 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4695 4696 /* Now we know where the page directory page is */ 4697 pd = &pd[pmap_pde_index(va)]; 4698 KASSERT((*pd & PG_V) == 0, 4699 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4700 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4701 } 4702 4703 return (m); 4704 } 4705 4706 /* 4707 * This routine is called if the desired page table page does not exist. 4708 * 4709 * If page table page allocation fails, this routine may sleep before 4710 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4711 * occurs right before returning to the caller. This way, we never 4712 * drop pmap lock to sleep while a page table page has ref_count == 0, 4713 * which prevents the page from being freed under us. 
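 *
 * Example (illustrative sketch, not from the original source): because the
 * pmap lock is dropped and re-acquired around the sleep, callers that passed
 * a lock pointer simply retry after a NULL return, as pmap_allocpte() and
 * pmap_alloc_pde() below do:
 *
 *	retry:
 *	...
 *	m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
 *	if (m == NULL && lockp != NULL)
 *		goto retry;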
4714 */ 4715 static vm_page_t 4716 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4717 vm_offset_t va) 4718 { 4719 vm_page_t m; 4720 4721 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4722 if (m == NULL && lockp != NULL) { 4723 RELEASE_PV_LIST_LOCK(lockp); 4724 PMAP_UNLOCK(pmap); 4725 PMAP_ASSERT_NOT_IN_DI(); 4726 vm_wait(NULL); 4727 PMAP_LOCK(pmap); 4728 } 4729 return (m); 4730 } 4731 4732 static pd_entry_t * 4733 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4734 struct rwlock **lockp) 4735 { 4736 pdp_entry_t *pdpe, PG_V; 4737 pd_entry_t *pde; 4738 vm_page_t pdpg; 4739 vm_pindex_t pdpindex; 4740 4741 PG_V = pmap_valid_bit(pmap); 4742 4743 retry: 4744 pdpe = pmap_pdpe(pmap, va); 4745 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4746 pde = pmap_pdpe_to_pde(pdpe, va); 4747 if (va < VM_MAXUSER_ADDRESS) { 4748 /* Add a reference to the pd page. */ 4749 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4750 pdpg->ref_count++; 4751 } else 4752 pdpg = NULL; 4753 } else if (va < VM_MAXUSER_ADDRESS) { 4754 /* Allocate a pd page. */ 4755 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4756 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4757 if (pdpg == NULL) { 4758 if (lockp != NULL) 4759 goto retry; 4760 else 4761 return (NULL); 4762 } 4763 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4764 pde = &pde[pmap_pde_index(va)]; 4765 } else 4766 panic("pmap_alloc_pde: missing page table page for va %#lx", 4767 va); 4768 *pdpgp = pdpg; 4769 return (pde); 4770 } 4771 4772 static vm_page_t 4773 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4774 { 4775 vm_pindex_t ptepindex; 4776 pd_entry_t *pd, PG_V; 4777 vm_page_t m; 4778 4779 PG_V = pmap_valid_bit(pmap); 4780 4781 /* 4782 * Calculate pagetable page index 4783 */ 4784 ptepindex = pmap_pde_pindex(va); 4785 retry: 4786 /* 4787 * Get the page directory entry 4788 */ 4789 pd = pmap_pde(pmap, va); 4790 4791 /* 4792 * This supports switching from a 2MB page to a 4793 * normal 4K page. 4794 */ 4795 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4796 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4797 /* 4798 * Invalidation of the 2MB page mapping may have caused 4799 * the deallocation of the underlying PD page. 4800 */ 4801 pd = NULL; 4802 } 4803 } 4804 4805 /* 4806 * If the page table page is mapped, we just increment the 4807 * hold count, and activate it. 4808 */ 4809 if (pd != NULL && (*pd & PG_V) != 0) { 4810 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4811 m->ref_count++; 4812 } else { 4813 /* 4814 * Here if the pte page isn't mapped, or if it has been 4815 * deallocated. 4816 */ 4817 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4818 if (m == NULL && lockp != NULL) 4819 goto retry; 4820 } 4821 return (m); 4822 } 4823 4824 /*************************************************** 4825 * Pmap allocation/deallocation routines. 4826 ***************************************************/ 4827 4828 /* 4829 * Release any resources held by the given physical map. 4830 * Called when a pmap initialized by pmap_pinit is being released. 4831 * Should only be called if the map contains no valid mappings. 
4832 */ 4833 void 4834 pmap_release(pmap_t pmap) 4835 { 4836 vm_page_t m; 4837 int i; 4838 4839 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4840 ("pmap_release: pmap %p has reserved page table page(s)", 4841 pmap)); 4842 KASSERT(CPU_EMPTY(&pmap->pm_active), 4843 ("releasing active pmap %p", pmap)); 4844 4845 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4846 4847 if (pmap_is_la57(pmap)) { 4848 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4849 pmap->pm_pmltop[PML5PML5I] = 0; 4850 } else { 4851 for (i = 0; i < NKPML4E; i++) /* KVA */ 4852 pmap->pm_pmltop[KPML4BASE + i] = 0; 4853 #ifdef KASAN 4854 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4855 pmap->pm_pmltop[KASANPML4I + i] = 0; 4856 #endif 4857 #ifdef KMSAN 4858 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4859 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4860 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ 4861 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4862 #endif 4863 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4864 pmap->pm_pmltop[DMPML4I + i] = 0; 4865 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4866 for (i = 0; i < lm_ents; i++) /* Large Map */ 4867 pmap->pm_pmltop[LMSPML4I + i] = 0; 4868 } 4869 4870 pmap_free_pt_page(NULL, m, true); 4871 pmap_pt_page_count_pinit(pmap, -1); 4872 4873 if (pmap->pm_pmltopu != NULL) { 4874 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4875 pm_pmltopu)); 4876 pmap_free_pt_page(NULL, m, false); 4877 pmap_pt_page_count_pinit(pmap, -1); 4878 } 4879 if (pmap->pm_type == PT_X86 && 4880 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4881 rangeset_fini(&pmap->pm_pkru); 4882 4883 KASSERT(pmap->pm_stats.resident_count == 0, 4884 ("pmap_release: pmap %p resident count %ld != 0", 4885 pmap, pmap->pm_stats.resident_count)); 4886 } 4887 4888 static int 4889 kvm_size(SYSCTL_HANDLER_ARGS) 4890 { 4891 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4892 4893 return sysctl_handle_long(oidp, &ksize, 0, req); 4894 } 4895 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4896 0, 0, kvm_size, "LU", 4897 "Size of KVM"); 4898 4899 static int 4900 kvm_free(SYSCTL_HANDLER_ARGS) 4901 { 4902 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 4903 4904 return sysctl_handle_long(oidp, &kfree, 0, req); 4905 } 4906 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4907 0, 0, kvm_free, "LU", 4908 "Amount of KVM free"); 4909 4910 #ifdef KMSAN 4911 static void 4912 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 4913 { 4914 pdp_entry_t *pdpe; 4915 pd_entry_t *pde; 4916 pt_entry_t *pte; 4917 vm_paddr_t dummypa, dummypd, dummypt; 4918 int i, npde, npdpg; 4919 4920 npdpg = howmany(size, NBPDP); 4921 npde = size / NBPDR; 4922 4923 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 4924 pagezero((void *)PHYS_TO_DMAP(dummypa)); 4925 4926 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 4927 pagezero((void *)PHYS_TO_DMAP(dummypt)); 4928 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 4929 for (i = 0; i < npdpg; i++) 4930 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 4931 4932 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 4933 for (i = 0; i < NPTEPG; i++) 4934 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 4935 X86_PG_A | X86_PG_M | pg_nx); 4936 4937 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 4938 for (i = 0; i < npde; i++) 4939 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 4940 4941 pdpe = (pdp_entry_t 
*)PHYS_TO_DMAP(pdppa); 4942 for (i = 0; i < npdpg; i++) 4943 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 4944 X86_PG_RW | pg_nx); 4945 } 4946 4947 static void 4948 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 4949 { 4950 vm_size_t size; 4951 4952 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 4953 4954 /* 4955 * The end of the page array's KVA region is 2MB aligned, see 4956 * kmem_init(). 4957 */ 4958 size = round_2mpage(end) - start; 4959 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 4960 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 4961 } 4962 #endif 4963 4964 /* 4965 * Allocate physical memory for the vm_page array and map it into KVA, 4966 * attempting to back the vm_pages with domain-local memory. 4967 */ 4968 void 4969 pmap_page_array_startup(long pages) 4970 { 4971 pdp_entry_t *pdpe; 4972 pd_entry_t *pde, newpdir; 4973 vm_offset_t va, start, end; 4974 vm_paddr_t pa; 4975 long pfn; 4976 int domain, i; 4977 4978 vm_page_array_size = pages; 4979 4980 start = VM_MIN_KERNEL_ADDRESS; 4981 end = start + pages * sizeof(struct vm_page); 4982 for (va = start; va < end; va += NBPDR) { 4983 pfn = first_page + (va - start) / sizeof(struct vm_page); 4984 domain = vm_phys_domain(ptoa(pfn)); 4985 pdpe = pmap_pdpe(kernel_pmap, va); 4986 if ((*pdpe & X86_PG_V) == 0) { 4987 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 4988 dump_add_page(pa); 4989 pagezero((void *)PHYS_TO_DMAP(pa)); 4990 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 4991 X86_PG_A | X86_PG_M); 4992 } 4993 pde = pmap_pdpe_to_pde(pdpe, va); 4994 if ((*pde & X86_PG_V) != 0) 4995 panic("Unexpected pde"); 4996 pa = vm_phys_early_alloc(domain, NBPDR); 4997 for (i = 0; i < NPDEPG; i++) 4998 dump_add_page(pa + i * PAGE_SIZE); 4999 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 5000 X86_PG_M | PG_PS | pg_g | pg_nx); 5001 pde_store(pde, newpdir); 5002 } 5003 vm_page_array = (vm_page_t)start; 5004 5005 #ifdef KMSAN 5006 pmap_kmsan_page_array_startup(start, end); 5007 #endif 5008 } 5009 5010 /* 5011 * grow the number of kernel page table entries, if needed 5012 */ 5013 void 5014 pmap_growkernel(vm_offset_t addr) 5015 { 5016 vm_paddr_t paddr; 5017 vm_page_t nkpg; 5018 pd_entry_t *pde, newpdir; 5019 pdp_entry_t *pdpe; 5020 vm_offset_t end; 5021 5022 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5023 5024 /* 5025 * The kernel map covers two distinct regions of KVA: that used 5026 * for dynamic kernel memory allocations, and the uppermost 2GB 5027 * of the virtual address space. The latter is used to map the 5028 * kernel and loadable kernel modules. This scheme enables the 5029 * use of a special code generation model for kernel code which 5030 * takes advantage of compact addressing modes in machine code. 5031 * 5032 * Both regions grow upwards; to avoid wasting memory, the gap 5033 * in between is unmapped. If "addr" is above "KERNBASE", the 5034 * kernel's region is grown, otherwise the kmem region is grown. 5035 * 5036 * The correctness of this action is based on the following 5037 * argument: vm_map_insert() allocates contiguous ranges of the 5038 * kernel virtual address space. It calls this function if a range 5039 * ends after "kernel_vm_end". If the kernel is mapped between 5040 * "kernel_vm_end" and "addr", then the range cannot begin at 5041 * "kernel_vm_end". In fact, its beginning address cannot be less 5042 * than the kernel. 
Thus, there is no immediate need to allocate 5043 * any new kernel page table pages between "kernel_vm_end" and 5044 * "KERNBASE". 5045 */ 5046 if (KERNBASE < addr) { 5047 end = KERNBASE + nkpt * NBPDR; 5048 if (end == 0) 5049 return; 5050 } else { 5051 end = kernel_vm_end; 5052 } 5053 5054 addr = roundup2(addr, NBPDR); 5055 if (addr - 1 >= vm_map_max(kernel_map)) 5056 addr = vm_map_max(kernel_map); 5057 if (addr <= end) { 5058 /* 5059 * The grown region is already mapped, so there is 5060 * nothing to do. 5061 */ 5062 return; 5063 } 5064 5065 kasan_shadow_map(end, addr - end); 5066 kmsan_shadow_map(end, addr - end); 5067 while (end < addr) { 5068 pdpe = pmap_pdpe(kernel_pmap, end); 5069 if ((*pdpe & X86_PG_V) == 0) { 5070 nkpg = pmap_alloc_pt_page(kernel_pmap, 5071 pmap_pdpe_pindex(end), VM_ALLOC_WIRED | 5072 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5073 if (nkpg == NULL) 5074 panic("pmap_growkernel: no memory to grow kernel"); 5075 paddr = VM_PAGE_TO_PHYS(nkpg); 5076 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5077 X86_PG_A | X86_PG_M); 5078 continue; /* try again */ 5079 } 5080 pde = pmap_pdpe_to_pde(pdpe, end); 5081 if ((*pde & X86_PG_V) != 0) { 5082 end = (end + NBPDR) & ~PDRMASK; 5083 if (end - 1 >= vm_map_max(kernel_map)) { 5084 end = vm_map_max(kernel_map); 5085 break; 5086 } 5087 continue; 5088 } 5089 5090 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), 5091 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5092 if (nkpg == NULL) 5093 panic("pmap_growkernel: no memory to grow kernel"); 5094 paddr = VM_PAGE_TO_PHYS(nkpg); 5095 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5096 pde_store(pde, newpdir); 5097 5098 end = (end + NBPDR) & ~PDRMASK; 5099 if (end - 1 >= vm_map_max(kernel_map)) { 5100 end = vm_map_max(kernel_map); 5101 break; 5102 } 5103 } 5104 5105 if (end <= KERNBASE) 5106 kernel_vm_end = end; 5107 else 5108 nkpt = howmany(end - KERNBASE, NBPDR); 5109 } 5110 5111 /*************************************************** 5112 * page management routines. 5113 ***************************************************/ 5114 5115 static const uint64_t pc_freemask[_NPCM] = { 5116 [0 ... 
_NPCM - 2] = PC_FREEN, 5117 [_NPCM - 1] = PC_FREEL 5118 }; 5119 5120 #ifdef PV_STATS 5121 5122 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5123 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5124 &pc_chunk_count, "Current number of pv entry chunks"); 5125 5126 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5127 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5128 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5129 5130 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5131 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5132 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5133 5134 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5135 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5136 &pc_chunk_tryfail, 5137 "Number of failed attempts to get a pv entry chunk page"); 5138 5139 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5140 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5141 &pv_entry_frees, "Total number of pv entries freed"); 5142 5143 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5144 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5145 &pv_entry_allocs, "Total number of pv entries allocated"); 5146 5147 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5148 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5149 &pv_entry_count, "Current number of pv entries"); 5150 5151 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5152 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5153 &pv_entry_spare, "Current number of spare pv entries"); 5154 #endif 5155 5156 static void 5157 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5158 { 5159 5160 if (pmap == NULL) 5161 return; 5162 pmap_invalidate_all(pmap); 5163 if (pmap != locked_pmap) 5164 PMAP_UNLOCK(pmap); 5165 if (start_di) 5166 pmap_delayed_invl_finish(); 5167 } 5168 5169 /* 5170 * We are in a serious low memory condition. Resort to 5171 * drastic measures to free some pages so we can allocate 5172 * another pv entry chunk. 5173 * 5174 * Returns NULL if PV entries were reclaimed from the specified pmap. 5175 * 5176 * We do not, however, unmap 2mpages because subsequent accesses will 5177 * allocate per-page pv entries until repromotion occurs, thereby 5178 * exacerbating the shortage of free pv entries.
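 *
 * Example (illustrative sketch, not from the original source): get_pv_entry()
 * below consumes the result either as a fresh chunk page or, on NULL, by
 * rescanning the pmap's own chunk list for the entries that were freed here:
 *
 *	m = reclaim_pv_chunk(pmap, lockp);
 *	if (m == NULL)
 *		goto retry;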
5179 */ 5180 static vm_page_t 5181 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 5182 { 5183 struct pv_chunks_list *pvc; 5184 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 5185 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 5186 struct md_page *pvh; 5187 pd_entry_t *pde; 5188 pmap_t next_pmap, pmap; 5189 pt_entry_t *pte, tpte; 5190 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 5191 pv_entry_t pv; 5192 vm_offset_t va; 5193 vm_page_t m, m_pc; 5194 struct spglist free; 5195 uint64_t inuse; 5196 int bit, field, freed; 5197 bool start_di, restart; 5198 5199 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 5200 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 5201 pmap = NULL; 5202 m_pc = NULL; 5203 PG_G = PG_A = PG_M = PG_RW = 0; 5204 SLIST_INIT(&free); 5205 bzero(&pc_marker_b, sizeof(pc_marker_b)); 5206 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 5207 pc_marker = (struct pv_chunk *)&pc_marker_b; 5208 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 5209 5210 /* 5211 * A delayed invalidation block should already be active if 5212 * pmap_advise() or pmap_remove() called this function by way 5213 * of pmap_demote_pde_locked(). 5214 */ 5215 start_di = pmap_not_in_di(); 5216 5217 pvc = &pv_chunks[domain]; 5218 mtx_lock(&pvc->pvc_lock); 5219 pvc->active_reclaims++; 5220 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 5221 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 5222 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 5223 SLIST_EMPTY(&free)) { 5224 next_pmap = pc->pc_pmap; 5225 if (next_pmap == NULL) { 5226 /* 5227 * The next chunk is a marker. However, it is 5228 * not our marker, so active_reclaims must be 5229 * > 1. Consequently, the next_chunk code 5230 * will not rotate the pv_chunks list. 5231 */ 5232 goto next_chunk; 5233 } 5234 mtx_unlock(&pvc->pvc_lock); 5235 5236 /* 5237 * A pv_chunk can only be removed from the pc_lru list 5238 * when both pc_chunks_mutex is owned and the 5239 * corresponding pmap is locked. 5240 */ 5241 if (pmap != next_pmap) { 5242 restart = false; 5243 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 5244 start_di); 5245 pmap = next_pmap; 5246 /* Avoid deadlock and lock recursion. */ 5247 if (pmap > locked_pmap) { 5248 RELEASE_PV_LIST_LOCK(lockp); 5249 PMAP_LOCK(pmap); 5250 if (start_di) 5251 pmap_delayed_invl_start(); 5252 mtx_lock(&pvc->pvc_lock); 5253 restart = true; 5254 } else if (pmap != locked_pmap) { 5255 if (PMAP_TRYLOCK(pmap)) { 5256 if (start_di) 5257 pmap_delayed_invl_start(); 5258 mtx_lock(&pvc->pvc_lock); 5259 restart = true; 5260 } else { 5261 pmap = NULL; /* pmap is not locked */ 5262 mtx_lock(&pvc->pvc_lock); 5263 pc = TAILQ_NEXT(pc_marker, pc_lru); 5264 if (pc == NULL || 5265 pc->pc_pmap != next_pmap) 5266 continue; 5267 goto next_chunk; 5268 } 5269 } else if (start_di) 5270 pmap_delayed_invl_start(); 5271 PG_G = pmap_global_bit(pmap); 5272 PG_A = pmap_accessed_bit(pmap); 5273 PG_M = pmap_modified_bit(pmap); 5274 PG_RW = pmap_rw_bit(pmap); 5275 if (restart) 5276 continue; 5277 } 5278 5279 /* 5280 * Destroy every non-wired, 4 KB page mapping in the chunk. 
5281 */ 5282 freed = 0; 5283 for (field = 0; field < _NPCM; field++) { 5284 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5285 inuse != 0; inuse &= ~(1UL << bit)) { 5286 bit = bsfq(inuse); 5287 pv = &pc->pc_pventry[field * 64 + bit]; 5288 va = pv->pv_va; 5289 pde = pmap_pde(pmap, va); 5290 if ((*pde & PG_PS) != 0) 5291 continue; 5292 pte = pmap_pde_to_pte(pde, va); 5293 if ((*pte & PG_W) != 0) 5294 continue; 5295 tpte = pte_load_clear(pte); 5296 if ((tpte & PG_G) != 0) 5297 pmap_invalidate_page(pmap, va); 5298 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5299 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5300 vm_page_dirty(m); 5301 if ((tpte & PG_A) != 0) 5302 vm_page_aflag_set(m, PGA_REFERENCED); 5303 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5304 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5305 m->md.pv_gen++; 5306 if (TAILQ_EMPTY(&m->md.pv_list) && 5307 (m->flags & PG_FICTITIOUS) == 0) { 5308 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5309 if (TAILQ_EMPTY(&pvh->pv_list)) { 5310 vm_page_aflag_clear(m, 5311 PGA_WRITEABLE); 5312 } 5313 } 5314 pmap_delayed_invl_page(m); 5315 pc->pc_map[field] |= 1UL << bit; 5316 pmap_unuse_pt(pmap, va, *pde, &free); 5317 freed++; 5318 } 5319 } 5320 if (freed == 0) { 5321 mtx_lock(&pvc->pvc_lock); 5322 goto next_chunk; 5323 } 5324 /* Every freed mapping is for a 4 KB page. */ 5325 pmap_resident_count_adj(pmap, -freed); 5326 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5327 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5328 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5329 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5330 if (pc_is_free(pc)) { 5331 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5332 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5333 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5334 /* Entire chunk is free; return it. */ 5335 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5336 dump_drop_page(m_pc->phys_addr); 5337 mtx_lock(&pvc->pvc_lock); 5338 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5339 break; 5340 } 5341 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5342 mtx_lock(&pvc->pvc_lock); 5343 /* One freed pv entry in locked_pmap is sufficient. */ 5344 if (pmap == locked_pmap) 5345 break; 5346 next_chunk: 5347 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5348 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5349 if (pvc->active_reclaims == 1 && pmap != NULL) { 5350 /* 5351 * Rotate the pv chunks list so that we do not 5352 * scan the same pv chunks that could not be 5353 * freed (because they contained a wired 5354 * and/or superpage mapping) on every 5355 * invocation of reclaim_pv_chunk(). 5356 */ 5357 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5358 MPASS(pc->pc_pmap != NULL); 5359 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5360 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5361 } 5362 } 5363 } 5364 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5365 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5366 pvc->active_reclaims--; 5367 mtx_unlock(&pvc->pvc_lock); 5368 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5369 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5370 m_pc = SLIST_FIRST(&free); 5371 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5372 /* Recycle a freed page table page. 
*/ 5373 m_pc->ref_count = 1; 5374 } 5375 vm_page_free_pages_toq(&free, true); 5376 return (m_pc); 5377 } 5378 5379 static vm_page_t 5380 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5381 { 5382 vm_page_t m; 5383 int i, domain; 5384 5385 domain = PCPU_GET(domain); 5386 for (i = 0; i < vm_ndomains; i++) { 5387 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5388 if (m != NULL) 5389 break; 5390 domain = (domain + 1) % vm_ndomains; 5391 } 5392 5393 return (m); 5394 } 5395 5396 /* 5397 * free the pv_entry back to the free list 5398 */ 5399 static void 5400 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5401 { 5402 struct pv_chunk *pc; 5403 int idx, field, bit; 5404 5405 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5406 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5407 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5408 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5409 pc = pv_to_chunk(pv); 5410 idx = pv - &pc->pc_pventry[0]; 5411 field = idx / 64; 5412 bit = idx % 64; 5413 pc->pc_map[field] |= 1ul << bit; 5414 if (!pc_is_free(pc)) { 5415 /* 98% of the time, pc is already at the head of the list. */ 5416 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5417 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5418 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5419 } 5420 return; 5421 } 5422 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5423 free_pv_chunk(pc); 5424 } 5425 5426 static void 5427 free_pv_chunk_dequeued(struct pv_chunk *pc) 5428 { 5429 vm_page_t m; 5430 5431 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5432 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5433 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5434 counter_u64_add(pv_page_count, -1); 5435 /* entire chunk is free, return it */ 5436 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5437 dump_drop_page(m->phys_addr); 5438 vm_page_unwire_noq(m); 5439 vm_page_free(m); 5440 } 5441 5442 static void 5443 free_pv_chunk(struct pv_chunk *pc) 5444 { 5445 struct pv_chunks_list *pvc; 5446 5447 pvc = &pv_chunks[pc_to_domain(pc)]; 5448 mtx_lock(&pvc->pvc_lock); 5449 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5450 mtx_unlock(&pvc->pvc_lock); 5451 free_pv_chunk_dequeued(pc); 5452 } 5453 5454 static void 5455 free_pv_chunk_batch(struct pv_chunklist *batch) 5456 { 5457 struct pv_chunks_list *pvc; 5458 struct pv_chunk *pc, *npc; 5459 int i; 5460 5461 for (i = 0; i < vm_ndomains; i++) { 5462 if (TAILQ_EMPTY(&batch[i])) 5463 continue; 5464 pvc = &pv_chunks[i]; 5465 mtx_lock(&pvc->pvc_lock); 5466 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5467 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5468 } 5469 mtx_unlock(&pvc->pvc_lock); 5470 } 5471 5472 for (i = 0; i < vm_ndomains; i++) { 5473 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5474 free_pv_chunk_dequeued(pc); 5475 } 5476 } 5477 } 5478 5479 /* 5480 * Returns a new PV entry, allocating a new PV chunk from the system when 5481 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5482 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5483 * returned. 5484 * 5485 * The given PV list lock may be released. 
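 *
 * Example (illustrative sketch, not from the original source): a hypothetical
 * caller inserting a managed 4KB mapping for page "m" would allocate the
 * entry, record the virtual address, and link it onto the page's pv list:
 *
 *	pv = get_pv_entry(pmap, &lock);
 *	pv->pv_va = va;
 *	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 *	m->md.pv_gen++;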
5486 */ 5487 static pv_entry_t 5488 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5489 { 5490 struct pv_chunks_list *pvc; 5491 int bit, field; 5492 pv_entry_t pv; 5493 struct pv_chunk *pc; 5494 vm_page_t m; 5495 5496 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5497 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5498 retry: 5499 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5500 if (pc != NULL) { 5501 for (field = 0; field < _NPCM; field++) { 5502 if (pc->pc_map[field]) { 5503 bit = bsfq(pc->pc_map[field]); 5504 break; 5505 } 5506 } 5507 if (field < _NPCM) { 5508 pv = &pc->pc_pventry[field * 64 + bit]; 5509 pc->pc_map[field] &= ~(1ul << bit); 5510 /* If this was the last item, move it to tail */ 5511 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 5512 pc->pc_map[2] == 0) { 5513 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5514 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5515 pc_list); 5516 } 5517 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5518 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5519 return (pv); 5520 } 5521 } 5522 /* No free items, allocate another chunk */ 5523 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5524 if (m == NULL) { 5525 if (lockp == NULL) { 5526 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5527 return (NULL); 5528 } 5529 m = reclaim_pv_chunk(pmap, lockp); 5530 if (m == NULL) 5531 goto retry; 5532 } else 5533 counter_u64_add(pv_page_count, 1); 5534 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5535 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5536 dump_add_page(m->phys_addr); 5537 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5538 pc->pc_pmap = pmap; 5539 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 5540 pc->pc_map[1] = PC_FREEN; 5541 pc->pc_map[2] = PC_FREEL; 5542 pvc = &pv_chunks[vm_page_domain(m)]; 5543 mtx_lock(&pvc->pvc_lock); 5544 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5545 mtx_unlock(&pvc->pvc_lock); 5546 pv = &pc->pc_pventry[0]; 5547 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5548 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5549 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5550 return (pv); 5551 } 5552 5553 /* 5554 * Returns the number of one bits within the given PV chunk map. 5555 * 5556 * The errata for Intel processors state that "POPCNT Instruction May 5557 * Take Longer to Execute Than Expected". It is believed that the 5558 * issue is the spurious dependency on the destination register. 5559 * Provide a hint to the register rename logic that the destination 5560 * value is overwritten, by clearing it, as suggested in the 5561 * optimization manual. It should be cheap for unaffected processors 5562 * as well. 5563 * 5564 * Reference numbers for these errata are 5565 * 4th Gen Core: HSD146 5566 * 5th Gen Core: BDM85 5567 * 6th Gen Core: SKL029 5568 */ 5569 static int 5570 popcnt_pc_map_pq(uint64_t *map) 5571 { 5572 u_long result, tmp; 5573 5574 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5575 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5576 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5577 : "=&r" (result), "=&r" (tmp) 5578 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5579 return (result); 5580 } 5581 5582 /* 5583 * Ensure that the number of spare PV entries in the specified pmap meets or 5584 * exceeds the given count, "needed". 5585 * 5586 * The given PV list lock may be released. 
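 *
 * For example, demoting a 2MB mapping calls this with "needed" equal to
 * NPTEPG - 1 (511).  Assuming the amd64 chunk geometry of _NPCPV (168)
 * entries per chunk, a pmap holding no spare entries may allocate up to
 * howmany(511, 168) == 4 new chunks here before the demotion proceeds.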
5587 */ 5588 static void 5589 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5590 { 5591 struct pv_chunks_list *pvc; 5592 struct pch new_tail[PMAP_MEMDOM]; 5593 struct pv_chunk *pc; 5594 vm_page_t m; 5595 int avail, free, i; 5596 bool reclaimed; 5597 5598 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5599 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5600 5601 /* 5602 * Newly allocated PV chunks must be stored in a private list until 5603 * the required number of PV chunks have been allocated. Otherwise, 5604 * reclaim_pv_chunk() could recycle one of these chunks. In 5605 * contrast, these chunks must be added to the pmap upon allocation. 5606 */ 5607 for (i = 0; i < PMAP_MEMDOM; i++) 5608 TAILQ_INIT(&new_tail[i]); 5609 retry: 5610 avail = 0; 5611 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5612 #ifndef __POPCNT__ 5613 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5614 bit_count((bitstr_t *)pc->pc_map, 0, 5615 sizeof(pc->pc_map) * NBBY, &free); 5616 else 5617 #endif 5618 free = popcnt_pc_map_pq(pc->pc_map); 5619 if (free == 0) 5620 break; 5621 avail += free; 5622 if (avail >= needed) 5623 break; 5624 } 5625 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5626 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5627 if (m == NULL) { 5628 m = reclaim_pv_chunk(pmap, lockp); 5629 if (m == NULL) 5630 goto retry; 5631 reclaimed = true; 5632 } else 5633 counter_u64_add(pv_page_count, 1); 5634 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5635 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5636 dump_add_page(m->phys_addr); 5637 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5638 pc->pc_pmap = pmap; 5639 pc->pc_map[0] = PC_FREEN; 5640 pc->pc_map[1] = PC_FREEN; 5641 pc->pc_map[2] = PC_FREEL; 5642 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5643 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5644 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5645 5646 /* 5647 * The reclaim might have freed a chunk from the current pmap. 5648 * If that chunk contained available entries, we need to 5649 * re-count the number of available entries. 5650 */ 5651 if (reclaimed) 5652 goto retry; 5653 } 5654 for (i = 0; i < vm_ndomains; i++) { 5655 if (TAILQ_EMPTY(&new_tail[i])) 5656 continue; 5657 pvc = &pv_chunks[i]; 5658 mtx_lock(&pvc->pvc_lock); 5659 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5660 mtx_unlock(&pvc->pvc_lock); 5661 } 5662 } 5663 5664 /* 5665 * First find and then remove the pv entry for the specified pmap and virtual 5666 * address from the specified pv list. Returns the pv entry if found and NULL 5667 * otherwise. This operation can be performed on pv lists for either 4KB or 5668 * 2MB page mappings. 5669 */ 5670 static __inline pv_entry_t 5671 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5672 { 5673 pv_entry_t pv; 5674 5675 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5676 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5677 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5678 pvh->pv_gen++; 5679 break; 5680 } 5681 } 5682 return (pv); 5683 } 5684 5685 /* 5686 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5687 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5688 * entries for each of the 4KB page mappings. 
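 *
 * The 2MB mapping's existing pv entry is recycled as the entry for the
 * first 4KB page, so only NPTEPG - 1 new entries are needed.  They are
 * taken from the spare entries previously set aside by
 * reserve_pv_entries(), which is why the loop below can assert that a
 * spare is always available.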
5689 */ 5690 static void 5691 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5692 struct rwlock **lockp) 5693 { 5694 struct md_page *pvh; 5695 struct pv_chunk *pc; 5696 pv_entry_t pv; 5697 vm_offset_t va_last; 5698 vm_page_t m; 5699 int bit, field; 5700 5701 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5702 KASSERT((pa & PDRMASK) == 0, 5703 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5704 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5705 5706 /* 5707 * Transfer the 2mpage's pv entry for this mapping to the first 5708 * page's pv list. Once this transfer begins, the pv list lock 5709 * must not be released until the last pv entry is reinstantiated. 5710 */ 5711 pvh = pa_to_pvh(pa); 5712 va = trunc_2mpage(va); 5713 pv = pmap_pvh_remove(pvh, pmap, va); 5714 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5715 m = PHYS_TO_VM_PAGE(pa); 5716 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5717 m->md.pv_gen++; 5718 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5719 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5720 va_last = va + NBPDR - PAGE_SIZE; 5721 for (;;) { 5722 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5723 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 5724 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 5725 for (field = 0; field < _NPCM; field++) { 5726 while (pc->pc_map[field]) { 5727 bit = bsfq(pc->pc_map[field]); 5728 pc->pc_map[field] &= ~(1ul << bit); 5729 pv = &pc->pc_pventry[field * 64 + bit]; 5730 va += PAGE_SIZE; 5731 pv->pv_va = va; 5732 m++; 5733 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5734 ("pmap_pv_demote_pde: page %p is not managed", m)); 5735 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5736 m->md.pv_gen++; 5737 if (va == va_last) 5738 goto out; 5739 } 5740 } 5741 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5742 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5743 } 5744 out: 5745 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 5746 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5747 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5748 } 5749 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5750 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5751 } 5752 5753 #if VM_NRESERVLEVEL > 0 5754 /* 5755 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5756 * replace the many pv entries for the 4KB page mappings by a single pv entry 5757 * for the 2MB page mapping. 5758 */ 5759 static void 5760 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5761 struct rwlock **lockp) 5762 { 5763 struct md_page *pvh; 5764 pv_entry_t pv; 5765 vm_offset_t va_last; 5766 vm_page_t m; 5767 5768 KASSERT((pa & PDRMASK) == 0, 5769 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5770 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5771 5772 /* 5773 * Transfer the first page's pv entry for this mapping to the 2mpage's 5774 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5775 * a transfer avoids the possibility that get_pv_entry() calls 5776 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5777 * mappings that is being promoted. 5778 */ 5779 m = PHYS_TO_VM_PAGE(pa); 5780 va = trunc_2mpage(va); 5781 pv = pmap_pvh_remove(&m->md, pmap, va); 5782 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5783 pvh = pa_to_pvh(pa); 5784 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5785 pvh->pv_gen++; 5786 /* Free the remaining NPTEPG - 1 pv entries. 
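 * Freeing these entries may empty whole PV chunks, in which case
 * free_pv_entry() returns the chunk pages to the system as a side
 * effect.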
*/ 5787 va_last = va + NBPDR - PAGE_SIZE; 5788 do { 5789 m++; 5790 va += PAGE_SIZE; 5791 pmap_pvh_free(&m->md, pmap, va); 5792 } while (va < va_last); 5793 } 5794 #endif /* VM_NRESERVLEVEL > 0 */ 5795 5796 /* 5797 * First find and then destroy the pv entry for the specified pmap and virtual 5798 * address. This operation can be performed on pv lists for either 4KB or 2MB 5799 * page mappings. 5800 */ 5801 static void 5802 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5803 { 5804 pv_entry_t pv; 5805 5806 pv = pmap_pvh_remove(pvh, pmap, va); 5807 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5808 free_pv_entry(pmap, pv); 5809 } 5810 5811 /* 5812 * Conditionally create the PV entry for a 4KB page mapping if the required 5813 * memory can be allocated without resorting to reclamation. 5814 */ 5815 static boolean_t 5816 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5817 struct rwlock **lockp) 5818 { 5819 pv_entry_t pv; 5820 5821 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5822 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5823 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5824 pv->pv_va = va; 5825 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5826 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5827 m->md.pv_gen++; 5828 return (TRUE); 5829 } else 5830 return (FALSE); 5831 } 5832 5833 /* 5834 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5835 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5836 * false if the PV entry cannot be allocated without resorting to reclamation. 5837 */ 5838 static bool 5839 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5840 struct rwlock **lockp) 5841 { 5842 struct md_page *pvh; 5843 pv_entry_t pv; 5844 vm_paddr_t pa; 5845 5846 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5847 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5848 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5849 NULL : lockp)) == NULL) 5850 return (false); 5851 pv->pv_va = va; 5852 pa = pde & PG_PS_FRAME; 5853 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5854 pvh = pa_to_pvh(pa); 5855 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5856 pvh->pv_gen++; 5857 return (true); 5858 } 5859 5860 /* 5861 * Fills a page table page with mappings to consecutive physical pages. 5862 */ 5863 static void 5864 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5865 { 5866 pt_entry_t *pte; 5867 5868 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5869 *pte = newpte; 5870 newpte += PAGE_SIZE; 5871 } 5872 } 5873 5874 /* 5875 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5876 * mapping is invalidated. 
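 *
 * This wrapper acquires and drops the PV list lock itself; callers that
 * already hold a PV list lock pointer use pmap_demote_pde_locked()
 * directly.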
5877 */ 5878 static boolean_t 5879 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5880 { 5881 struct rwlock *lock; 5882 boolean_t rv; 5883 5884 lock = NULL; 5885 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5886 if (lock != NULL) 5887 rw_wunlock(lock); 5888 return (rv); 5889 } 5890 5891 static void 5892 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 5893 { 5894 #ifdef INVARIANTS 5895 #ifdef DIAGNOSTIC 5896 pt_entry_t *xpte, *ypte; 5897 5898 for (xpte = firstpte; xpte < firstpte + NPTEPG; 5899 xpte++, newpte += PAGE_SIZE) { 5900 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 5901 printf("pmap_demote_pde: xpte %zd and newpte map " 5902 "different pages: found %#lx, expected %#lx\n", 5903 xpte - firstpte, *xpte, newpte); 5904 printf("page table dump\n"); 5905 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 5906 printf("%zd %#lx\n", ypte - firstpte, *ypte); 5907 panic("firstpte"); 5908 } 5909 } 5910 #else 5911 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 5912 ("pmap_demote_pde: firstpte and newpte map different physical" 5913 " addresses")); 5914 #endif 5915 #endif 5916 } 5917 5918 static void 5919 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 5920 pd_entry_t oldpde, struct rwlock **lockp) 5921 { 5922 struct spglist free; 5923 vm_offset_t sva; 5924 5925 SLIST_INIT(&free); 5926 sva = trunc_2mpage(va); 5927 pmap_remove_pde(pmap, pde, sva, &free, lockp); 5928 if ((oldpde & pmap_global_bit(pmap)) == 0) 5929 pmap_invalidate_pde_page(pmap, sva, oldpde); 5930 vm_page_free_pages_toq(&free, true); 5931 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 5932 va, pmap); 5933 } 5934 5935 static boolean_t 5936 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 5937 struct rwlock **lockp) 5938 { 5939 pd_entry_t newpde, oldpde; 5940 pt_entry_t *firstpte, newpte; 5941 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 5942 vm_paddr_t mptepa; 5943 vm_page_t mpte; 5944 int PG_PTE_CACHE; 5945 bool in_kernel; 5946 5947 PG_A = pmap_accessed_bit(pmap); 5948 PG_G = pmap_global_bit(pmap); 5949 PG_M = pmap_modified_bit(pmap); 5950 PG_RW = pmap_rw_bit(pmap); 5951 PG_V = pmap_valid_bit(pmap); 5952 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 5953 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 5954 5955 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5956 in_kernel = va >= VM_MAXUSER_ADDRESS; 5957 oldpde = *pde; 5958 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 5959 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 5960 5961 /* 5962 * Invalidate the 2MB page mapping and return "failure" if the 5963 * mapping was never accessed. 5964 */ 5965 if ((oldpde & PG_A) == 0) { 5966 KASSERT((oldpde & PG_W) == 0, 5967 ("pmap_demote_pde: a wired mapping is missing PG_A")); 5968 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 5969 return (FALSE); 5970 } 5971 5972 mpte = pmap_remove_pt_page(pmap, va); 5973 if (mpte == NULL) { 5974 KASSERT((oldpde & PG_W) == 0, 5975 ("pmap_demote_pde: page table page for a wired mapping" 5976 " is missing")); 5977 5978 /* 5979 * If the page table page is missing and the mapping 5980 * is for a kernel address, the mapping must belong to 5981 * the direct map. Page table pages are preallocated 5982 * for every other part of the kernel address space, 5983 * so the direct map region is the only part of the 5984 * kernel address space that must be handled here. 
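 * (The direct map is originally built from 2MB and 1GB mappings,
 * so no 4KB page table page exists for it until a demotion such
 * as this one allocates it.)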
5985 */ 5986 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 5987 va < DMAP_MAX_ADDRESS), 5988 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 5989 5990 /* 5991 * If the 2MB page mapping belongs to the direct map 5992 * region of the kernel's address space, then the page 5993 * allocation request specifies the highest possible 5994 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 5995 * priority is normal. 5996 */ 5997 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 5998 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 5999 6000 /* 6001 * If the allocation of the new page table page fails, 6002 * invalidate the 2MB page mapping and return "failure". 6003 */ 6004 if (mpte == NULL) { 6005 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6006 return (FALSE); 6007 } 6008 6009 if (!in_kernel) 6010 mpte->ref_count = NPTEPG; 6011 } 6012 mptepa = VM_PAGE_TO_PHYS(mpte); 6013 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 6014 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 6015 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 6016 ("pmap_demote_pde: oldpde is missing PG_M")); 6017 newpte = oldpde & ~PG_PS; 6018 newpte = pmap_swap_pat(pmap, newpte); 6019 6020 /* 6021 * If the page table page is not leftover from an earlier promotion, 6022 * initialize it. 6023 */ 6024 if (vm_page_none_valid(mpte)) 6025 pmap_fill_ptp(firstpte, newpte); 6026 6027 pmap_demote_pde_check(firstpte, newpte); 6028 6029 /* 6030 * If the mapping has changed attributes, update the page table 6031 * entries. 6032 */ 6033 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6034 pmap_fill_ptp(firstpte, newpte); 6035 6036 /* 6037 * The spare PV entries must be reserved prior to demoting the 6038 * mapping, that is, prior to changing the PDE. Otherwise, the state 6039 * of the PDE and the PV lists will be inconsistent, which can result 6040 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6041 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6042 * PV entry for the 2MB page mapping that is being demoted. 6043 */ 6044 if ((oldpde & PG_MANAGED) != 0) 6045 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6046 6047 /* 6048 * Demote the mapping. This pmap is locked. The old PDE has 6049 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6050 * set. Thus, there is no danger of a race with another 6051 * processor changing the setting of PG_A and/or PG_M between 6052 * the read above and the store below. 6053 */ 6054 if (workaround_erratum383) 6055 pmap_update_pde(pmap, va, pde, newpde); 6056 else 6057 pde_store(pde, newpde); 6058 6059 /* 6060 * Invalidate a stale recursive mapping of the page table page. 6061 */ 6062 if (in_kernel) 6063 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6064 6065 /* 6066 * Demote the PV entry. 6067 */ 6068 if ((oldpde & PG_MANAGED) != 0) 6069 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6070 6071 counter_u64_add(pmap_pde_demotions, 1); 6072 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6073 va, pmap); 6074 return (TRUE); 6075 } 6076 6077 /* 6078 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
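 *
 * Rather than clearing the PDE, this routine re-points it at a
 * zero-filled page table page, so the kernel's paging structures remain
 * fully populated after the 2MB mapping is gone.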
6079 */ 6080 static void 6081 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6082 { 6083 pd_entry_t newpde; 6084 vm_paddr_t mptepa; 6085 vm_page_t mpte; 6086 6087 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6088 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6089 mpte = pmap_remove_pt_page(pmap, va); 6090 if (mpte == NULL) 6091 panic("pmap_remove_kernel_pde: Missing pt page."); 6092 6093 mptepa = VM_PAGE_TO_PHYS(mpte); 6094 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6095 6096 /* 6097 * If this page table page was unmapped by a promotion, then it 6098 * contains valid mappings. Zero it to invalidate those mappings. 6099 */ 6100 if (vm_page_any_valid(mpte)) 6101 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6102 6103 /* 6104 * Demote the mapping. 6105 */ 6106 if (workaround_erratum383) 6107 pmap_update_pde(pmap, va, pde, newpde); 6108 else 6109 pde_store(pde, newpde); 6110 6111 /* 6112 * Invalidate a stale recursive mapping of the page table page. 6113 */ 6114 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6115 } 6116 6117 /* 6118 * pmap_remove_pde: do the things to unmap a superpage in a process 6119 */ 6120 static int 6121 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6122 struct spglist *free, struct rwlock **lockp) 6123 { 6124 struct md_page *pvh; 6125 pd_entry_t oldpde; 6126 vm_offset_t eva, va; 6127 vm_page_t m, mpte; 6128 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6129 6130 PG_G = pmap_global_bit(pmap); 6131 PG_A = pmap_accessed_bit(pmap); 6132 PG_M = pmap_modified_bit(pmap); 6133 PG_RW = pmap_rw_bit(pmap); 6134 6135 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6136 KASSERT((sva & PDRMASK) == 0, 6137 ("pmap_remove_pde: sva is not 2mpage aligned")); 6138 oldpde = pte_load_clear(pdq); 6139 if (oldpde & PG_W) 6140 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6141 if ((oldpde & PG_G) != 0) 6142 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6143 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6144 if (oldpde & PG_MANAGED) { 6145 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6146 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6147 pmap_pvh_free(pvh, pmap, sva); 6148 eva = sva + NBPDR; 6149 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6150 va < eva; va += PAGE_SIZE, m++) { 6151 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6152 vm_page_dirty(m); 6153 if (oldpde & PG_A) 6154 vm_page_aflag_set(m, PGA_REFERENCED); 6155 if (TAILQ_EMPTY(&m->md.pv_list) && 6156 TAILQ_EMPTY(&pvh->pv_list)) 6157 vm_page_aflag_clear(m, PGA_WRITEABLE); 6158 pmap_delayed_invl_page(m); 6159 } 6160 } 6161 if (pmap == kernel_pmap) { 6162 pmap_remove_kernel_pde(pmap, pdq, sva); 6163 } else { 6164 mpte = pmap_remove_pt_page(pmap, sva); 6165 if (mpte != NULL) { 6166 KASSERT(vm_page_all_valid(mpte), 6167 ("pmap_remove_pde: pte page not promoted")); 6168 pmap_pt_page_count_adj(pmap, -1); 6169 KASSERT(mpte->ref_count == NPTEPG, 6170 ("pmap_remove_pde: pte page ref count error")); 6171 mpte->ref_count = 0; 6172 pmap_add_delayed_free_list(mpte, free, FALSE); 6173 } 6174 } 6175 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6176 } 6177 6178 /* 6179 * pmap_remove_pte: do the things to unmap a page in a process 6180 */ 6181 static int 6182 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6183 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6184 { 6185 struct md_page *pvh; 6186 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6187 vm_page_t m; 6188 6189 PG_A = pmap_accessed_bit(pmap); 6190 PG_M = 
pmap_modified_bit(pmap); 6191 PG_RW = pmap_rw_bit(pmap); 6192 6193 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6194 oldpte = pte_load_clear(ptq); 6195 if (oldpte & PG_W) 6196 pmap->pm_stats.wired_count -= 1; 6197 pmap_resident_count_adj(pmap, -1); 6198 if (oldpte & PG_MANAGED) { 6199 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6200 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6201 vm_page_dirty(m); 6202 if (oldpte & PG_A) 6203 vm_page_aflag_set(m, PGA_REFERENCED); 6204 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6205 pmap_pvh_free(&m->md, pmap, va); 6206 if (TAILQ_EMPTY(&m->md.pv_list) && 6207 (m->flags & PG_FICTITIOUS) == 0) { 6208 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6209 if (TAILQ_EMPTY(&pvh->pv_list)) 6210 vm_page_aflag_clear(m, PGA_WRITEABLE); 6211 } 6212 pmap_delayed_invl_page(m); 6213 } 6214 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6215 } 6216 6217 /* 6218 * Remove a single page from a process address space 6219 */ 6220 static void 6221 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6222 struct spglist *free) 6223 { 6224 struct rwlock *lock; 6225 pt_entry_t *pte, PG_V; 6226 6227 PG_V = pmap_valid_bit(pmap); 6228 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6229 if ((*pde & PG_V) == 0) 6230 return; 6231 pte = pmap_pde_to_pte(pde, va); 6232 if ((*pte & PG_V) == 0) 6233 return; 6234 lock = NULL; 6235 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6236 if (lock != NULL) 6237 rw_wunlock(lock); 6238 pmap_invalidate_page(pmap, va); 6239 } 6240 6241 /* 6242 * Removes the specified range of addresses from the page table page. 6243 */ 6244 static bool 6245 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6246 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6247 { 6248 pt_entry_t PG_G, *pte; 6249 vm_offset_t va; 6250 bool anyvalid; 6251 6252 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6253 PG_G = pmap_global_bit(pmap); 6254 anyvalid = false; 6255 va = eva; 6256 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6257 sva += PAGE_SIZE) { 6258 if (*pte == 0) { 6259 if (va != eva) { 6260 pmap_invalidate_range(pmap, va, sva); 6261 va = eva; 6262 } 6263 continue; 6264 } 6265 if ((*pte & PG_G) == 0) 6266 anyvalid = true; 6267 else if (va == eva) 6268 va = sva; 6269 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6270 sva += PAGE_SIZE; 6271 break; 6272 } 6273 } 6274 if (va != eva) 6275 pmap_invalidate_range(pmap, va, sva); 6276 return (anyvalid); 6277 } 6278 6279 /* 6280 * Remove the given range of addresses from the specified map. 6281 * 6282 * It is assumed that the start and end are properly 6283 * rounded to the page size. 6284 */ 6285 void 6286 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6287 { 6288 struct rwlock *lock; 6289 vm_page_t mt; 6290 vm_offset_t va_next; 6291 pml5_entry_t *pml5e; 6292 pml4_entry_t *pml4e; 6293 pdp_entry_t *pdpe; 6294 pd_entry_t ptpaddr, *pde; 6295 pt_entry_t PG_G, PG_V; 6296 struct spglist free; 6297 int anyvalid; 6298 6299 PG_G = pmap_global_bit(pmap); 6300 PG_V = pmap_valid_bit(pmap); 6301 6302 /* 6303 * If there are no resident pages besides the top level page 6304 * table page(s), there is nothing to do. Kernel pmap always 6305 * accounts whole preloaded area as resident, which makes its 6306 * resident count > 2. 6307 * Perform an unsynchronized read. This is, however, safe. 6308 */ 6309 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 
6310 1 : 0)) 6311 return; 6312 6313 anyvalid = 0; 6314 SLIST_INIT(&free); 6315 6316 pmap_delayed_invl_start(); 6317 PMAP_LOCK(pmap); 6318 pmap_pkru_on_remove(pmap, sva, eva); 6319 6320 /* 6321 * special handling of removing one page. a very 6322 * common operation and easy to short circuit some 6323 * code. 6324 */ 6325 if (sva + PAGE_SIZE == eva) { 6326 pde = pmap_pde(pmap, sva); 6327 if (pde && (*pde & PG_PS) == 0) { 6328 pmap_remove_page(pmap, sva, pde, &free); 6329 goto out; 6330 } 6331 } 6332 6333 lock = NULL; 6334 for (; sva < eva; sva = va_next) { 6335 if (pmap->pm_stats.resident_count == 0) 6336 break; 6337 6338 if (pmap_is_la57(pmap)) { 6339 pml5e = pmap_pml5e(pmap, sva); 6340 if ((*pml5e & PG_V) == 0) { 6341 va_next = (sva + NBPML5) & ~PML5MASK; 6342 if (va_next < sva) 6343 va_next = eva; 6344 continue; 6345 } 6346 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6347 } else { 6348 pml4e = pmap_pml4e(pmap, sva); 6349 } 6350 if ((*pml4e & PG_V) == 0) { 6351 va_next = (sva + NBPML4) & ~PML4MASK; 6352 if (va_next < sva) 6353 va_next = eva; 6354 continue; 6355 } 6356 6357 va_next = (sva + NBPDP) & ~PDPMASK; 6358 if (va_next < sva) 6359 va_next = eva; 6360 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6361 if ((*pdpe & PG_V) == 0) 6362 continue; 6363 if ((*pdpe & PG_PS) != 0) { 6364 KASSERT(va_next <= eva, 6365 ("partial update of non-transparent 1G mapping " 6366 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6367 *pdpe, sva, eva, va_next)); 6368 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6369 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6370 anyvalid = 1; 6371 *pdpe = 0; 6372 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6373 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6374 pmap_unwire_ptp(pmap, sva, mt, &free); 6375 continue; 6376 } 6377 6378 /* 6379 * Calculate index for next page table. 6380 */ 6381 va_next = (sva + NBPDR) & ~PDRMASK; 6382 if (va_next < sva) 6383 va_next = eva; 6384 6385 pde = pmap_pdpe_to_pde(pdpe, sva); 6386 ptpaddr = *pde; 6387 6388 /* 6389 * Weed out invalid mappings. 6390 */ 6391 if (ptpaddr == 0) 6392 continue; 6393 6394 /* 6395 * Check for large page. 6396 */ 6397 if ((ptpaddr & PG_PS) != 0) { 6398 /* 6399 * Are we removing the entire large page? If not, 6400 * demote the mapping and fall through. 6401 */ 6402 if (sva + NBPDR == va_next && eva >= va_next) { 6403 /* 6404 * The TLB entry for a PG_G mapping is 6405 * invalidated by pmap_remove_pde(). 6406 */ 6407 if ((ptpaddr & PG_G) == 0) 6408 anyvalid = 1; 6409 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6410 continue; 6411 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6412 &lock)) { 6413 /* The large page mapping was destroyed. */ 6414 continue; 6415 } else 6416 ptpaddr = *pde; 6417 } 6418 6419 /* 6420 * Limit our scan to either the end of the va represented 6421 * by the current page table page, or to the end of the 6422 * range being removed. 6423 */ 6424 if (va_next > eva) 6425 va_next = eva; 6426 6427 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6428 anyvalid = 1; 6429 } 6430 if (lock != NULL) 6431 rw_wunlock(lock); 6432 out: 6433 if (anyvalid) 6434 pmap_invalidate_all(pmap); 6435 PMAP_UNLOCK(pmap); 6436 pmap_delayed_invl_finish(); 6437 vm_page_free_pages_toq(&free, true); 6438 } 6439 6440 /* 6441 * Routine: pmap_remove_all 6442 * Function: 6443 * Removes this physical page from 6444 * all physical maps in which it resides. 6445 * Reflects back modify bits to the pager. 
6446 * 6447 * Notes: 6448 * Original versions of this routine were very 6449 * inefficient because they iteratively called 6450 * pmap_remove (slow...) 6451 */ 6452 6453 void 6454 pmap_remove_all(vm_page_t m) 6455 { 6456 struct md_page *pvh; 6457 pv_entry_t pv; 6458 pmap_t pmap; 6459 struct rwlock *lock; 6460 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6461 pd_entry_t *pde; 6462 vm_offset_t va; 6463 struct spglist free; 6464 int pvh_gen, md_gen; 6465 6466 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6467 ("pmap_remove_all: page %p is not managed", m)); 6468 SLIST_INIT(&free); 6469 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6470 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6471 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6472 rw_wlock(lock); 6473 retry: 6474 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6475 pmap = PV_PMAP(pv); 6476 if (!PMAP_TRYLOCK(pmap)) { 6477 pvh_gen = pvh->pv_gen; 6478 rw_wunlock(lock); 6479 PMAP_LOCK(pmap); 6480 rw_wlock(lock); 6481 if (pvh_gen != pvh->pv_gen) { 6482 PMAP_UNLOCK(pmap); 6483 goto retry; 6484 } 6485 } 6486 va = pv->pv_va; 6487 pde = pmap_pde(pmap, va); 6488 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6489 PMAP_UNLOCK(pmap); 6490 } 6491 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6492 pmap = PV_PMAP(pv); 6493 if (!PMAP_TRYLOCK(pmap)) { 6494 pvh_gen = pvh->pv_gen; 6495 md_gen = m->md.pv_gen; 6496 rw_wunlock(lock); 6497 PMAP_LOCK(pmap); 6498 rw_wlock(lock); 6499 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6500 PMAP_UNLOCK(pmap); 6501 goto retry; 6502 } 6503 } 6504 PG_A = pmap_accessed_bit(pmap); 6505 PG_M = pmap_modified_bit(pmap); 6506 PG_RW = pmap_rw_bit(pmap); 6507 pmap_resident_count_adj(pmap, -1); 6508 pde = pmap_pde(pmap, pv->pv_va); 6509 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6510 " a 2mpage in page %p's pv list", m)); 6511 pte = pmap_pde_to_pte(pde, pv->pv_va); 6512 tpte = pte_load_clear(pte); 6513 if (tpte & PG_W) 6514 pmap->pm_stats.wired_count--; 6515 if (tpte & PG_A) 6516 vm_page_aflag_set(m, PGA_REFERENCED); 6517 6518 /* 6519 * Update the vm_page_t clean and reference bits. 
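 * The page is marked dirty only when the mapping was both
 * writeable and modified (PG_RW and PG_M set); a modified bit on
 * a read-only mapping is not propagated to the vm_page.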
6520 */ 6521 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6522 vm_page_dirty(m); 6523 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6524 pmap_invalidate_page(pmap, pv->pv_va); 6525 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6526 m->md.pv_gen++; 6527 free_pv_entry(pmap, pv); 6528 PMAP_UNLOCK(pmap); 6529 } 6530 vm_page_aflag_clear(m, PGA_WRITEABLE); 6531 rw_wunlock(lock); 6532 pmap_delayed_invl_wait(m); 6533 vm_page_free_pages_toq(&free, true); 6534 } 6535 6536 /* 6537 * pmap_protect_pde: do the things to protect a 2mpage in a process 6538 */ 6539 static boolean_t 6540 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6541 { 6542 pd_entry_t newpde, oldpde; 6543 vm_page_t m, mt; 6544 boolean_t anychanged; 6545 pt_entry_t PG_G, PG_M, PG_RW; 6546 6547 PG_G = pmap_global_bit(pmap); 6548 PG_M = pmap_modified_bit(pmap); 6549 PG_RW = pmap_rw_bit(pmap); 6550 6551 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6552 KASSERT((sva & PDRMASK) == 0, 6553 ("pmap_protect_pde: sva is not 2mpage aligned")); 6554 anychanged = FALSE; 6555 retry: 6556 oldpde = newpde = *pde; 6557 if ((prot & VM_PROT_WRITE) == 0) { 6558 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6559 (PG_MANAGED | PG_M | PG_RW)) { 6560 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6561 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6562 vm_page_dirty(mt); 6563 } 6564 newpde &= ~(PG_RW | PG_M); 6565 } 6566 if ((prot & VM_PROT_EXECUTE) == 0) 6567 newpde |= pg_nx; 6568 if (newpde != oldpde) { 6569 /* 6570 * As an optimization to future operations on this PDE, clear 6571 * PG_PROMOTED. The impending invalidation will remove any 6572 * lingering 4KB page mappings from the TLB. 6573 */ 6574 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6575 goto retry; 6576 if ((oldpde & PG_G) != 0) 6577 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6578 else 6579 anychanged = TRUE; 6580 } 6581 return (anychanged); 6582 } 6583 6584 /* 6585 * Set the physical protection on the 6586 * specified range of this map as requested. 6587 */ 6588 void 6589 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6590 { 6591 vm_page_t m; 6592 vm_offset_t va_next; 6593 pml4_entry_t *pml4e; 6594 pdp_entry_t *pdpe; 6595 pd_entry_t ptpaddr, *pde; 6596 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6597 pt_entry_t obits, pbits; 6598 boolean_t anychanged; 6599 6600 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6601 if (prot == VM_PROT_NONE) { 6602 pmap_remove(pmap, sva, eva); 6603 return; 6604 } 6605 6606 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6607 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6608 return; 6609 6610 PG_G = pmap_global_bit(pmap); 6611 PG_M = pmap_modified_bit(pmap); 6612 PG_V = pmap_valid_bit(pmap); 6613 PG_RW = pmap_rw_bit(pmap); 6614 anychanged = FALSE; 6615 6616 /* 6617 * Although this function delays and batches the invalidation 6618 * of stale TLB entries, it does not need to call 6619 * pmap_delayed_invl_start() and 6620 * pmap_delayed_invl_finish(), because it does not 6621 * ordinarily destroy mappings. Stale TLB entries from 6622 * protection-only changes need only be invalidated before the 6623 * pmap lock is released, because protection-only changes do 6624 * not destroy PV entries. Even operations that iterate over 6625 * a physical page's PV list of mappings, like 6626 * pmap_remove_write(), acquire the pmap lock for each 6627 * mapping. Consequently, for protection-only changes, the 6628 * pmap lock suffices to synchronize both page table and TLB 6629 * updates. 
6630 * 6631 * This function only destroys a mapping if pmap_demote_pde() 6632 * fails. In that case, stale TLB entries are immediately 6633 * invalidated. 6634 */ 6635 6636 PMAP_LOCK(pmap); 6637 for (; sva < eva; sva = va_next) { 6638 pml4e = pmap_pml4e(pmap, sva); 6639 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6640 va_next = (sva + NBPML4) & ~PML4MASK; 6641 if (va_next < sva) 6642 va_next = eva; 6643 continue; 6644 } 6645 6646 va_next = (sva + NBPDP) & ~PDPMASK; 6647 if (va_next < sva) 6648 va_next = eva; 6649 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6650 if ((*pdpe & PG_V) == 0) 6651 continue; 6652 if ((*pdpe & PG_PS) != 0) { 6653 KASSERT(va_next <= eva, 6654 ("partial update of non-transparent 1G mapping " 6655 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6656 *pdpe, sva, eva, va_next)); 6657 retry_pdpe: 6658 obits = pbits = *pdpe; 6659 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6660 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6661 if ((prot & VM_PROT_WRITE) == 0) 6662 pbits &= ~(PG_RW | PG_M); 6663 if ((prot & VM_PROT_EXECUTE) == 0) 6664 pbits |= pg_nx; 6665 6666 if (pbits != obits) { 6667 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6668 /* PG_PS cannot be cleared under us, */ 6669 goto retry_pdpe; 6670 anychanged = TRUE; 6671 } 6672 continue; 6673 } 6674 6675 va_next = (sva + NBPDR) & ~PDRMASK; 6676 if (va_next < sva) 6677 va_next = eva; 6678 6679 pde = pmap_pdpe_to_pde(pdpe, sva); 6680 ptpaddr = *pde; 6681 6682 /* 6683 * Weed out invalid mappings. 6684 */ 6685 if (ptpaddr == 0) 6686 continue; 6687 6688 /* 6689 * Check for large page. 6690 */ 6691 if ((ptpaddr & PG_PS) != 0) { 6692 /* 6693 * Are we protecting the entire large page? If not, 6694 * demote the mapping and fall through. 6695 */ 6696 if (sva + NBPDR == va_next && eva >= va_next) { 6697 /* 6698 * The TLB entry for a PG_G mapping is 6699 * invalidated by pmap_protect_pde(). 6700 */ 6701 if (pmap_protect_pde(pmap, pde, sva, prot)) 6702 anychanged = TRUE; 6703 continue; 6704 } else if (!pmap_demote_pde(pmap, pde, sva)) { 6705 /* 6706 * The large page mapping was destroyed. 6707 */ 6708 continue; 6709 } 6710 } 6711 6712 if (va_next > eva) 6713 va_next = eva; 6714 6715 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6716 sva += PAGE_SIZE) { 6717 retry: 6718 obits = pbits = *pte; 6719 if ((pbits & PG_V) == 0) 6720 continue; 6721 6722 if ((prot & VM_PROT_WRITE) == 0) { 6723 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6724 (PG_MANAGED | PG_M | PG_RW)) { 6725 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6726 vm_page_dirty(m); 6727 } 6728 pbits &= ~(PG_RW | PG_M); 6729 } 6730 if ((prot & VM_PROT_EXECUTE) == 0) 6731 pbits |= pg_nx; 6732 6733 if (pbits != obits) { 6734 if (!atomic_cmpset_long(pte, obits, pbits)) 6735 goto retry; 6736 if (obits & PG_G) 6737 pmap_invalidate_page(pmap, sva); 6738 else 6739 anychanged = TRUE; 6740 } 6741 } 6742 } 6743 if (anychanged) 6744 pmap_invalidate_all(pmap); 6745 PMAP_UNLOCK(pmap); 6746 } 6747 6748 #if VM_NRESERVLEVEL > 0 6749 static bool 6750 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6751 { 6752 6753 if (pmap->pm_type != PT_EPT) 6754 return (false); 6755 return ((pde & EPT_PG_EXECUTE) != 0); 6756 } 6757 6758 /* 6759 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6760 * single page table page (PTP) to a single 2MB page mapping. For promotion 6761 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6762 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6763 * identical characteristics. 
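 *
 * "Identical characteristics" refers to the attribute bits covered by
 * PG_PTE_PROMOTE, e.g. the protection, global, managed, and cache-mode
 * bits; all 512 PTEs must agree on them for promotion to succeed.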
6764 */ 6765 static void 6766 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, 6767 struct rwlock **lockp) 6768 { 6769 pd_entry_t newpde; 6770 pt_entry_t *firstpte, oldpte, pa, *pte; 6771 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; 6772 int PG_PTE_CACHE; 6773 6774 PG_A = pmap_accessed_bit(pmap); 6775 PG_G = pmap_global_bit(pmap); 6776 PG_M = pmap_modified_bit(pmap); 6777 PG_V = pmap_valid_bit(pmap); 6778 PG_RW = pmap_rw_bit(pmap); 6779 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6780 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6781 6782 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6783 6784 /* 6785 * Examine the first PTE in the specified PTP. Abort if this PTE is 6786 * ineligible for promotion due to hardware errata, invalid, or does 6787 * not map the first 4KB physical page within a 2MB page. 6788 */ 6789 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6790 newpde = *firstpte; 6791 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) 6792 return; 6793 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 6794 counter_u64_add(pmap_pde_p_failures, 1); 6795 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6796 " in pmap %p", va, pmap); 6797 return; 6798 } 6799 6800 /* 6801 * Both here and in the below "for" loop, to allow for repromotion 6802 * after MADV_FREE, conditionally write protect a clean PTE before 6803 * possibly aborting the promotion due to other PTE attributes. Why? 6804 * Suppose that MADV_FREE is applied to a part of a superpage, the 6805 * address range [S, E). pmap_advise() will demote the superpage 6806 * mapping, destroy the 4KB page mapping at the end of [S, E), and 6807 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 6808 * imagine that the memory in [S, E) is recycled, but the last 4KB 6809 * page in [S, E) is not the last to be rewritten, or simply accessed. 6810 * In other words, there is still a 4KB page in [S, E), call it P, 6811 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 6812 * we write protect P before aborting the promotion, if and when P is 6813 * finally rewritten, there won't be a page fault to trigger 6814 * repromotion. 6815 */ 6816 setpde: 6817 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6818 /* 6819 * When PG_M is already clear, PG_RW can be cleared without 6820 * a TLB invalidation. 6821 */ 6822 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6823 goto setpde; 6824 newpde &= ~PG_RW; 6825 } 6826 if ((newpde & PG_A) == 0) { 6827 counter_u64_add(pmap_pde_p_failures, 1); 6828 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6829 " in pmap %p", va, pmap); 6830 return; 6831 } 6832 6833 /* 6834 * Examine each of the other PTEs in the specified PTP. Abort if this 6835 * PTE maps an unexpected 4KB physical page or does not have identical 6836 * characteristics to the first PTE. 6837 */ 6838 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 6839 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6840 oldpte = *pte; 6841 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 6842 counter_u64_add(pmap_pde_p_failures, 1); 6843 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6844 " in pmap %p", va, pmap); 6845 return; 6846 } 6847 setpte: 6848 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6849 /* 6850 * When PG_M is already clear, PG_RW can be cleared 6851 * without a TLB invalidation. 
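 * (The first write through a stale, clean TLB entry forces the
 * processor to revisit the PTE in order to set PG_M, at which point
 * the cleared PG_RW is seen and a page fault is raised instead.)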
6852 */ 6853 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6854 goto setpte; 6855 oldpte &= ~PG_RW; 6856 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6857 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6858 (va & ~PDRMASK), pmap); 6859 } 6860 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 6861 counter_u64_add(pmap_pde_p_failures, 1); 6862 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6863 " in pmap %p", va, pmap); 6864 return; 6865 } 6866 pa -= PAGE_SIZE; 6867 } 6868 6869 /* 6870 * Save the page table page in its current state until the PDE 6871 * mapping the superpage is demoted by pmap_demote_pde() or 6872 * destroyed by pmap_remove_pde(). 6873 */ 6874 if (mpte == NULL) 6875 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6876 KASSERT(mpte >= vm_page_array && 6877 mpte < &vm_page_array[vm_page_array_size], 6878 ("pmap_promote_pde: page table page is out of range")); 6879 KASSERT(mpte->pindex == pmap_pde_pindex(va), 6880 ("pmap_promote_pde: page table page's pindex is wrong " 6881 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 6882 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 6883 if (pmap_insert_pt_page(pmap, mpte, true)) { 6884 counter_u64_add(pmap_pde_p_failures, 1); 6885 CTR2(KTR_PMAP, 6886 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 6887 pmap); 6888 return; 6889 } 6890 6891 /* 6892 * Promote the pv entries. 6893 */ 6894 if ((newpde & PG_MANAGED) != 0) 6895 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 6896 6897 /* 6898 * Propagate the PAT index to its proper position. 6899 */ 6900 newpde = pmap_swap_pat(pmap, newpde); 6901 6902 /* 6903 * Map the superpage. 6904 */ 6905 if (workaround_erratum383) 6906 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 6907 else 6908 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 6909 6910 counter_u64_add(pmap_pde_promotions, 1); 6911 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 6912 " in pmap %p", va, pmap); 6913 } 6914 #endif /* VM_NRESERVLEVEL > 0 */ 6915 6916 static int 6917 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 6918 int psind) 6919 { 6920 vm_page_t mp; 6921 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 6922 6923 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6924 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 6925 ("psind %d unexpected", psind)); 6926 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 6927 ("unaligned phys address %#lx newpte %#lx psind %d", 6928 newpte & PG_FRAME, newpte, psind)); 6929 KASSERT((va & (pagesizes[psind] - 1)) == 0, 6930 ("unaligned va %#lx psind %d", va, psind)); 6931 KASSERT(va < VM_MAXUSER_ADDRESS, 6932 ("kernel mode non-transparent superpage")); /* XXXKIB */ 6933 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 6934 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 6935 6936 PG_V = pmap_valid_bit(pmap); 6937 6938 restart: 6939 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) 6940 return (KERN_PROTECTION_FAILURE); 6941 pten = newpte; 6942 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 6943 pten |= pmap_pkru_get(pmap, va); 6944 6945 if (psind == 2) { /* 1G */ 6946 pml4e = pmap_pml4e(pmap, va); 6947 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6948 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 6949 NULL, va); 6950 if (mp == NULL) 6951 goto allocf; 6952 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 6953 pdpe = &pdpe[pmap_pdpe_index(va)]; 6954 origpte = *pdpe; 6955 MPASS(origpte == 0); 6956 } else { 6957 pdpe = 
pmap_pml4e_to_pdpe(pml4e, va); 6958 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 6959 origpte = *pdpe; 6960 if ((origpte & PG_V) == 0) { 6961 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 6962 mp->ref_count++; 6963 } 6964 } 6965 *pdpe = pten; 6966 } else /* (psind == 1) */ { /* 2M */ 6967 pde = pmap_pde(pmap, va); 6968 if (pde == NULL) { 6969 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 6970 NULL, va); 6971 if (mp == NULL) 6972 goto allocf; 6973 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 6974 pde = &pde[pmap_pde_index(va)]; 6975 origpte = *pde; 6976 MPASS(origpte == 0); 6977 } else { 6978 origpte = *pde; 6979 if ((origpte & PG_V) == 0) { 6980 pdpe = pmap_pdpe(pmap, va); 6981 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); 6982 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 6983 mp->ref_count++; 6984 } 6985 } 6986 *pde = pten; 6987 } 6988 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 6989 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 6990 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 6991 va, psind == 2 ? "1G" : "2M", origpte, pten)); 6992 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 6993 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 6994 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 6995 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 6996 if ((origpte & PG_V) == 0) 6997 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 6998 6999 return (KERN_SUCCESS); 7000 7001 allocf: 7002 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 7003 return (KERN_RESOURCE_SHORTAGE); 7004 PMAP_UNLOCK(pmap); 7005 vm_wait(NULL); 7006 PMAP_LOCK(pmap); 7007 goto restart; 7008 } 7009 7010 /* 7011 * Insert the given physical page (p) at 7012 * the specified virtual address (v) in the 7013 * target physical map with the protection requested. 7014 * 7015 * If specified, the page will be wired down, meaning 7016 * that the related pte can not be reclaimed. 7017 * 7018 * NB: This is the only routine which MAY NOT lazy-evaluate 7019 * or lose information. That is, this routine must actually 7020 * insert this page into the given map NOW. 7021 * 7022 * When destroying both a page table and PV entry, this function 7023 * performs the TLB invalidation before releasing the PV list 7024 * lock, so we do not need pmap_delayed_invl_page() calls here. 
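 *
 * The "psind" argument selects the mapping size: 0 requests a 4KB
 * mapping, 1 a 2MB mapping (see pmap_enter_pde()), and, in combination
 * with PMAP_ENTER_LARGEPAGE, 1 or 2 request non-transparent 2MB or 1GB
 * mappings via pmap_enter_largepage().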
7025 */ 7026 int 7027 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7028 u_int flags, int8_t psind) 7029 { 7030 struct rwlock *lock; 7031 pd_entry_t *pde; 7032 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 7033 pt_entry_t newpte, origpte; 7034 pv_entry_t pv; 7035 vm_paddr_t opa, pa; 7036 vm_page_t mpte, om; 7037 int rv; 7038 boolean_t nosleep; 7039 7040 PG_A = pmap_accessed_bit(pmap); 7041 PG_G = pmap_global_bit(pmap); 7042 PG_M = pmap_modified_bit(pmap); 7043 PG_V = pmap_valid_bit(pmap); 7044 PG_RW = pmap_rw_bit(pmap); 7045 7046 va = trunc_page(va); 7047 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7048 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7049 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7050 va)); 7051 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7052 ("pmap_enter: managed mapping within the clean submap")); 7053 if ((m->oflags & VPO_UNMANAGED) == 0) 7054 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7055 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7056 ("pmap_enter: flags %u has reserved bits set", flags)); 7057 pa = VM_PAGE_TO_PHYS(m); 7058 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7059 if ((flags & VM_PROT_WRITE) != 0) 7060 newpte |= PG_M; 7061 if ((prot & VM_PROT_WRITE) != 0) 7062 newpte |= PG_RW; 7063 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7064 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7065 if ((prot & VM_PROT_EXECUTE) == 0) 7066 newpte |= pg_nx; 7067 if ((flags & PMAP_ENTER_WIRED) != 0) 7068 newpte |= PG_W; 7069 if (va < VM_MAXUSER_ADDRESS) 7070 newpte |= PG_U; 7071 if (pmap == kernel_pmap) 7072 newpte |= PG_G; 7073 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7074 7075 /* 7076 * Set modified bit gratuitously for writeable mappings if 7077 * the page is unmanaged. We do not want to take a fault 7078 * to do the dirty bit accounting for these mappings. 7079 */ 7080 if ((m->oflags & VPO_UNMANAGED) != 0) { 7081 if ((newpte & PG_RW) != 0) 7082 newpte |= PG_M; 7083 } else 7084 newpte |= PG_MANAGED; 7085 7086 lock = NULL; 7087 PMAP_LOCK(pmap); 7088 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7089 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7090 ("managed largepage va %#lx flags %#x", va, flags)); 7091 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7092 psind); 7093 goto out; 7094 } 7095 if (psind == 1) { 7096 /* Assert the required virtual and physical alignment. */ 7097 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7098 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7099 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7100 goto out; 7101 } 7102 mpte = NULL; 7103 7104 /* 7105 * In the case that a page table page is not 7106 * resident, we are creating it here. 7107 */ 7108 retry: 7109 pde = pmap_pde(pmap, va); 7110 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7111 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7112 pte = pmap_pde_to_pte(pde, va); 7113 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7114 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7115 mpte->ref_count++; 7116 } 7117 } else if (va < VM_MAXUSER_ADDRESS) { 7118 /* 7119 * Here if the pte page isn't mapped, or if it has been 7120 * deallocated. 7121 */ 7122 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7123 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7124 nosleep ? 
NULL : &lock, va); 7125 if (mpte == NULL && nosleep) { 7126 rv = KERN_RESOURCE_SHORTAGE; 7127 goto out; 7128 } 7129 goto retry; 7130 } else 7131 panic("pmap_enter: invalid page directory va=%#lx", va); 7132 7133 origpte = *pte; 7134 pv = NULL; 7135 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7136 newpte |= pmap_pkru_get(pmap, va); 7137 7138 /* 7139 * Is the specified virtual address already mapped? 7140 */ 7141 if ((origpte & PG_V) != 0) { 7142 /* 7143 * Wiring change, just update stats. We don't worry about 7144 * wiring PT pages as they remain resident as long as there 7145 * are valid mappings in them. Hence, if a user page is wired, 7146 * the PT page will be also. 7147 */ 7148 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7149 pmap->pm_stats.wired_count++; 7150 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7151 pmap->pm_stats.wired_count--; 7152 7153 /* 7154 * Remove the extra PT page reference. 7155 */ 7156 if (mpte != NULL) { 7157 mpte->ref_count--; 7158 KASSERT(mpte->ref_count > 0, 7159 ("pmap_enter: missing reference to page table page," 7160 " va: 0x%lx", va)); 7161 } 7162 7163 /* 7164 * Has the physical page changed? 7165 */ 7166 opa = origpte & PG_FRAME; 7167 if (opa == pa) { 7168 /* 7169 * No, might be a protection or wiring change. 7170 */ 7171 if ((origpte & PG_MANAGED) != 0 && 7172 (newpte & PG_RW) != 0) 7173 vm_page_aflag_set(m, PGA_WRITEABLE); 7174 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7175 goto unchanged; 7176 goto validate; 7177 } 7178 7179 /* 7180 * The physical page has changed. Temporarily invalidate 7181 * the mapping. This ensures that all threads sharing the 7182 * pmap keep a consistent view of the mapping, which is 7183 * necessary for the correct handling of COW faults. It 7184 * also permits reuse of the old mapping's PV entry, 7185 * avoiding an allocation. 7186 * 7187 * For consistency, handle unmanaged mappings the same way. 7188 */ 7189 origpte = pte_load_clear(pte); 7190 KASSERT((origpte & PG_FRAME) == opa, 7191 ("pmap_enter: unexpected pa update for %#lx", va)); 7192 if ((origpte & PG_MANAGED) != 0) { 7193 om = PHYS_TO_VM_PAGE(opa); 7194 7195 /* 7196 * The pmap lock is sufficient to synchronize with 7197 * concurrent calls to pmap_page_test_mappings() and 7198 * pmap_ts_referenced(). 7199 */ 7200 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7201 vm_page_dirty(om); 7202 if ((origpte & PG_A) != 0) { 7203 pmap_invalidate_page(pmap, va); 7204 vm_page_aflag_set(om, PGA_REFERENCED); 7205 } 7206 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7207 pv = pmap_pvh_remove(&om->md, pmap, va); 7208 KASSERT(pv != NULL, 7209 ("pmap_enter: no PV entry for %#lx", va)); 7210 if ((newpte & PG_MANAGED) == 0) 7211 free_pv_entry(pmap, pv); 7212 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7213 TAILQ_EMPTY(&om->md.pv_list) && 7214 ((om->flags & PG_FICTITIOUS) != 0 || 7215 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7216 vm_page_aflag_clear(om, PGA_WRITEABLE); 7217 } else { 7218 /* 7219 * Since this mapping is unmanaged, assume that PG_A 7220 * is set. 7221 */ 7222 pmap_invalidate_page(pmap, va); 7223 } 7224 origpte = 0; 7225 } else { 7226 /* 7227 * Increment the counters. 7228 */ 7229 if ((newpte & PG_W) != 0) 7230 pmap->pm_stats.wired_count++; 7231 pmap_resident_count_adj(pmap, 1); 7232 } 7233 7234 /* 7235 * Enter on the PV list if part of our managed memory. 
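 * A pv entry recovered from the replaced mapping above (pv != NULL) is
 * reused here, avoiding a call to get_pv_entry().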
7236 */ 7237 if ((newpte & PG_MANAGED) != 0) { 7238 if (pv == NULL) { 7239 pv = get_pv_entry(pmap, &lock); 7240 pv->pv_va = va; 7241 } 7242 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7243 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7244 m->md.pv_gen++; 7245 if ((newpte & PG_RW) != 0) 7246 vm_page_aflag_set(m, PGA_WRITEABLE); 7247 } 7248 7249 /* 7250 * Update the PTE. 7251 */ 7252 if ((origpte & PG_V) != 0) { 7253 validate: 7254 origpte = pte_load_store(pte, newpte); 7255 KASSERT((origpte & PG_FRAME) == pa, 7256 ("pmap_enter: unexpected pa update for %#lx", va)); 7257 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7258 (PG_M | PG_RW)) { 7259 if ((origpte & PG_MANAGED) != 0) 7260 vm_page_dirty(m); 7261 7262 /* 7263 * Although the PTE may still have PG_RW set, TLB 7264 * invalidation may nonetheless be required because 7265 * the PTE no longer has PG_M set. 7266 */ 7267 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7268 /* 7269 * This PTE change does not require TLB invalidation. 7270 */ 7271 goto unchanged; 7272 } 7273 if ((origpte & PG_A) != 0) 7274 pmap_invalidate_page(pmap, va); 7275 } else 7276 pte_store(pte, newpte); 7277 7278 unchanged: 7279 7280 #if VM_NRESERVLEVEL > 0 7281 /* 7282 * If both the page table page and the reservation are fully 7283 * populated, then attempt promotion. 7284 */ 7285 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7286 pmap_ps_enabled(pmap) && 7287 (m->flags & PG_FICTITIOUS) == 0 && 7288 vm_reserv_level_iffullpop(m) == 0) 7289 pmap_promote_pde(pmap, pde, va, mpte, &lock); 7290 #endif 7291 7292 rv = KERN_SUCCESS; 7293 out: 7294 if (lock != NULL) 7295 rw_wunlock(lock); 7296 PMAP_UNLOCK(pmap); 7297 return (rv); 7298 } 7299 7300 /* 7301 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 7302 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 7303 * value. See pmap_enter_pde() for the possible error values when "no sleep", 7304 * "no replace", and "no reclaim" are specified. 7305 */ 7306 static int 7307 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7308 struct rwlock **lockp) 7309 { 7310 pd_entry_t newpde; 7311 pt_entry_t PG_V; 7312 7313 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7314 PG_V = pmap_valid_bit(pmap); 7315 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 7316 PG_PS | PG_V; 7317 if ((m->oflags & VPO_UNMANAGED) == 0) 7318 newpde |= PG_MANAGED; 7319 if ((prot & VM_PROT_EXECUTE) == 0) 7320 newpde |= pg_nx; 7321 if (va < VM_MAXUSER_ADDRESS) 7322 newpde |= PG_U; 7323 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7324 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 7325 } 7326 7327 /* 7328 * Returns true if every page table entry in the specified page table page is 7329 * zero. 7330 */ 7331 static bool 7332 pmap_every_pte_zero(vm_paddr_t pa) 7333 { 7334 pt_entry_t *pt_end, *pte; 7335 7336 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7337 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7338 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7339 if (*pte != 0) 7340 return (false); 7341 } 7342 return (true); 7343 } 7344 7345 /* 7346 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7347 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 7348 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. 
Returns 7349 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB 7350 * page mapping already exists within the 2MB virtual address range starting 7351 * at the specified virtual address or (2) the requested 2MB page mapping is 7352 * not supported due to hardware errata. Returns KERN_NO_SPACE if 7353 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at 7354 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU 7355 * settings are not the same across the 2MB virtual address range starting at 7356 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either 7357 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation 7358 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation 7359 * failed. 7360 * 7361 * The parameter "m" is only used when creating a managed, writeable mapping. 7362 */ 7363 static int 7364 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7365 vm_page_t m, struct rwlock **lockp) 7366 { 7367 struct spglist free; 7368 pd_entry_t oldpde, *pde; 7369 pt_entry_t PG_G, PG_RW, PG_V; 7370 vm_page_t mt, pdpg; 7371 7372 KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, 7373 ("pmap_enter_pde: cannot create wired user mapping")); 7374 PG_G = pmap_global_bit(pmap); 7375 PG_RW = pmap_rw_bit(pmap); 7376 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7377 ("pmap_enter_pde: newpde is missing PG_M")); 7378 PG_V = pmap_valid_bit(pmap); 7379 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7380 7381 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7382 newpde))) { 7383 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7384 " in pmap %p", va, pmap); 7385 return (KERN_FAILURE); 7386 } 7387 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7388 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7389 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7390 " in pmap %p", va, pmap); 7391 return (KERN_RESOURCE_SHORTAGE); 7392 } 7393 7394 /* 7395 * If pkru is not same for the whole pde range, return failure 7396 * and let vm_fault() cope. Check after pde allocation, since 7397 * it could sleep. 7398 */ 7399 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 7400 pmap_abort_ptp(pmap, va, pdpg); 7401 return (KERN_PROTECTION_FAILURE); 7402 } 7403 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 7404 newpde &= ~X86_PG_PKU_MASK; 7405 newpde |= pmap_pkru_get(pmap, va); 7406 } 7407 7408 /* 7409 * If there are existing mappings, either abort or remove them. 7410 */ 7411 oldpde = *pde; 7412 if ((oldpde & PG_V) != 0) { 7413 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7414 ("pmap_enter_pde: pdpg's reference count is too low")); 7415 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 7416 if ((oldpde & PG_PS) != 0) { 7417 if (pdpg != NULL) 7418 pdpg->ref_count--; 7419 CTR2(KTR_PMAP, 7420 "pmap_enter_pde: no space for va %#lx" 7421 " in pmap %p", va, pmap); 7422 return (KERN_NO_SPACE); 7423 } else if (va < VM_MAXUSER_ADDRESS || 7424 !pmap_every_pte_zero(oldpde & PG_FRAME)) { 7425 if (pdpg != NULL) 7426 pdpg->ref_count--; 7427 CTR2(KTR_PMAP, 7428 "pmap_enter_pde: failure for va %#lx" 7429 " in pmap %p", va, pmap); 7430 return (KERN_FAILURE); 7431 } 7432 } 7433 /* Break the existing mapping(s). */ 7434 SLIST_INIT(&free); 7435 if ((oldpde & PG_PS) != 0) { 7436 /* 7437 * The reference to the PD page that was acquired by 7438 * pmap_alloc_pde() ensures that it won't be freed. 
7439 * However, if the PDE resulted from a promotion, then 7440 * a reserved PT page could be freed. 7441 */ 7442 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7443 if ((oldpde & PG_G) == 0) 7444 pmap_invalidate_pde_page(pmap, va, oldpde); 7445 } else { 7446 pmap_delayed_invl_start(); 7447 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7448 lockp)) 7449 pmap_invalidate_all(pmap); 7450 pmap_delayed_invl_finish(); 7451 } 7452 if (va < VM_MAXUSER_ADDRESS) { 7453 vm_page_free_pages_toq(&free, true); 7454 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7455 pde)); 7456 } else { 7457 KASSERT(SLIST_EMPTY(&free), 7458 ("pmap_enter_pde: freed kernel page table page")); 7459 7460 /* 7461 * Both pmap_remove_pde() and pmap_remove_ptes() will 7462 * leave the kernel page table page zero filled. 7463 */ 7464 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7465 if (pmap_insert_pt_page(pmap, mt, false)) 7466 panic("pmap_enter_pde: trie insert failed"); 7467 } 7468 } 7469 7470 if ((newpde & PG_MANAGED) != 0) { 7471 /* 7472 * Abort this mapping if its PV entry could not be created. 7473 */ 7474 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7475 if (pdpg != NULL) 7476 pmap_abort_ptp(pmap, va, pdpg); 7477 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7478 " in pmap %p", va, pmap); 7479 return (KERN_RESOURCE_SHORTAGE); 7480 } 7481 if ((newpde & PG_RW) != 0) { 7482 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7483 vm_page_aflag_set(mt, PGA_WRITEABLE); 7484 } 7485 } 7486 7487 /* 7488 * Increment counters. 7489 */ 7490 if ((newpde & PG_W) != 0) 7491 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7492 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7493 7494 /* 7495 * Map the superpage. (This is not a promoted mapping; there will not 7496 * be any lingering 4KB page mappings in the TLB.) 7497 */ 7498 pde_store(pde, newpde); 7499 7500 counter_u64_add(pmap_pde_mappings, 1); 7501 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7502 va, pmap); 7503 return (KERN_SUCCESS); 7504 } 7505 7506 /* 7507 * Maps a sequence of resident pages belonging to the same object. 7508 * The sequence begins with the given page m_start. This page is 7509 * mapped at the given virtual address start. Each subsequent page is 7510 * mapped at a virtual address that is offset from start by the same 7511 * amount as the page is offset from m_start within the object. The 7512 * last page in the sequence is the page with the largest offset from 7513 * m_start that can be mapped at a virtual address less than the given 7514 * virtual address end. Not every virtual page between start and end 7515 * is mapped; only those for which a resident page exists with the 7516 * corresponding offset from m_start are mapped. 
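 *
 * [Editorial sketch, illustrative only.]  The loop below attempts a 2MB
 * mapping only when the candidate run is superpage-aligned, fits before
 * "end", is backed by a fully populated 2MB physical run (m->psind == 1),
 * and superpages are enabled.  In outline, using the names from the code
 * that follows:
 *
 *	va = start + ptoa(diff);
 *	if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 *	    m->psind == 1 && pmap_ps_enabled(pmap))
 *		rv = pmap_enter_2mpage(pmap, va, m, prot, &lock);
 *
 * When the 2MB attempt succeeds (or returns KERN_NO_SPACE), "m" is
 * advanced past the whole superpage; otherwise the page is entered through
 * the 4KB pmap_enter_quick_locked() path instead.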
7517 */ 7518 void 7519 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7520 vm_page_t m_start, vm_prot_t prot) 7521 { 7522 struct rwlock *lock; 7523 vm_offset_t va; 7524 vm_page_t m, mpte; 7525 vm_pindex_t diff, psize; 7526 int rv; 7527 7528 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7529 7530 psize = atop(end - start); 7531 mpte = NULL; 7532 m = m_start; 7533 lock = NULL; 7534 PMAP_LOCK(pmap); 7535 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7536 va = start + ptoa(diff); 7537 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7538 m->psind == 1 && pmap_ps_enabled(pmap) && 7539 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 7540 KERN_SUCCESS || rv == KERN_NO_SPACE)) 7541 m = &m[NBPDR / PAGE_SIZE - 1]; 7542 else 7543 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7544 mpte, &lock); 7545 m = TAILQ_NEXT(m, listq); 7546 } 7547 if (lock != NULL) 7548 rw_wunlock(lock); 7549 PMAP_UNLOCK(pmap); 7550 } 7551 7552 /* 7553 * this code makes some *MAJOR* assumptions: 7554 * 1. Current pmap & pmap exists. 7555 * 2. Not wired. 7556 * 3. Read access. 7557 * 4. No page table pages. 7558 * but is *MUCH* faster than pmap_enter... 7559 */ 7560 7561 void 7562 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7563 { 7564 struct rwlock *lock; 7565 7566 lock = NULL; 7567 PMAP_LOCK(pmap); 7568 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7569 if (lock != NULL) 7570 rw_wunlock(lock); 7571 PMAP_UNLOCK(pmap); 7572 } 7573 7574 static vm_page_t 7575 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7576 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7577 { 7578 pt_entry_t newpte, *pte, PG_V; 7579 7580 KASSERT(!VA_IS_CLEANMAP(va) || 7581 (m->oflags & VPO_UNMANAGED) != 0, 7582 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7583 PG_V = pmap_valid_bit(pmap); 7584 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7585 7586 /* 7587 * In the case that a page table page is not 7588 * resident, we are creating it here. 7589 */ 7590 if (va < VM_MAXUSER_ADDRESS) { 7591 pdp_entry_t *pdpe; 7592 pd_entry_t *pde; 7593 vm_pindex_t ptepindex; 7594 7595 /* 7596 * Calculate pagetable page index 7597 */ 7598 ptepindex = pmap_pde_pindex(va); 7599 if (mpte && (mpte->pindex == ptepindex)) { 7600 mpte->ref_count++; 7601 } else { 7602 /* 7603 * If the page table page is mapped, we just increment 7604 * the hold count, and activate it. Otherwise, we 7605 * attempt to allocate a page table page, passing NULL 7606 * instead of the PV list lock pointer because we don't 7607 * intend to sleep. If this attempt fails, we don't 7608 * retry. Instead, we give up. 
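 *
 * [Editorial note.]  Passing a NULL lock pointer to pmap_allocpte_alloc()
 * means "fail instead of sleeping"; it is the same convention that
 * pmap_enter() uses earlier for PMAP_ENTER_NOSLEEP:
 *
 *	mpte = pmap_allocpte_alloc(pmap, ptepindex,
 *	    nosleep ? NULL : &lock, va);			// pmap_enter()
 *	mpte = pmap_allocpte_alloc(pmap, ptepindex, NULL, va);	// here
 *
 * A NULL return therefore simply means that no mapping is created, which
 * is acceptable for this best-effort path.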
7609 */ 7610 pdpe = pmap_pdpe(pmap, va); 7611 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 7612 if ((*pdpe & PG_PS) != 0) 7613 return (NULL); 7614 pde = pmap_pdpe_to_pde(pdpe, va); 7615 if ((*pde & PG_V) != 0) { 7616 if ((*pde & PG_PS) != 0) 7617 return (NULL); 7618 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7619 mpte->ref_count++; 7620 } else { 7621 mpte = pmap_allocpte_alloc(pmap, 7622 ptepindex, NULL, va); 7623 if (mpte == NULL) 7624 return (NULL); 7625 } 7626 } else { 7627 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7628 NULL, va); 7629 if (mpte == NULL) 7630 return (NULL); 7631 } 7632 } 7633 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7634 pte = &pte[pmap_pte_index(va)]; 7635 } else { 7636 mpte = NULL; 7637 pte = vtopte(va); 7638 } 7639 if (*pte) { 7640 if (mpte != NULL) 7641 mpte->ref_count--; 7642 return (NULL); 7643 } 7644 7645 /* 7646 * Enter on the PV list if part of our managed memory. 7647 */ 7648 if ((m->oflags & VPO_UNMANAGED) == 0 && 7649 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7650 if (mpte != NULL) 7651 pmap_abort_ptp(pmap, va, mpte); 7652 return (NULL); 7653 } 7654 7655 /* 7656 * Increment counters 7657 */ 7658 pmap_resident_count_adj(pmap, 1); 7659 7660 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7661 pmap_cache_bits(pmap, m->md.pat_mode, 0); 7662 if ((m->oflags & VPO_UNMANAGED) == 0) 7663 newpte |= PG_MANAGED; 7664 if ((prot & VM_PROT_EXECUTE) == 0) 7665 newpte |= pg_nx; 7666 if (va < VM_MAXUSER_ADDRESS) 7667 newpte |= PG_U | pmap_pkru_get(pmap, va); 7668 pte_store(pte, newpte); 7669 return (mpte); 7670 } 7671 7672 /* 7673 * Make a temporary mapping for a physical address. This is only intended 7674 * to be used for panic dumps. 7675 */ 7676 void * 7677 pmap_kenter_temporary(vm_paddr_t pa, int i) 7678 { 7679 vm_offset_t va; 7680 7681 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7682 pmap_kenter(va, pa); 7683 pmap_invlpg(kernel_pmap, va); 7684 return ((void *)crashdumpmap); 7685 } 7686 7687 /* 7688 * This code maps large physical mmap regions into the 7689 * processor address space. Note that some shortcuts 7690 * are taken, but the code works. 7691 */ 7692 void 7693 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7694 vm_pindex_t pindex, vm_size_t size) 7695 { 7696 pd_entry_t *pde; 7697 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7698 vm_paddr_t pa, ptepa; 7699 vm_page_t p, pdpg; 7700 int pat_mode; 7701 7702 PG_A = pmap_accessed_bit(pmap); 7703 PG_M = pmap_modified_bit(pmap); 7704 PG_V = pmap_valid_bit(pmap); 7705 PG_RW = pmap_rw_bit(pmap); 7706 7707 VM_OBJECT_ASSERT_WLOCKED(object); 7708 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7709 ("pmap_object_init_pt: non-device object")); 7710 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7711 if (!pmap_ps_enabled(pmap)) 7712 return; 7713 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7714 return; 7715 p = vm_page_lookup(object, pindex); 7716 KASSERT(vm_page_all_valid(p), 7717 ("pmap_object_init_pt: invalid page %p", p)); 7718 pat_mode = p->md.pat_mode; 7719 7720 /* 7721 * Abort the mapping if the first page is not physically 7722 * aligned to a 2MB page boundary. 7723 */ 7724 ptepa = VM_PAGE_TO_PHYS(p); 7725 if (ptepa & (NBPDR - 1)) 7726 return; 7727 7728 /* 7729 * Skip the first page. Abort the mapping if the rest of 7730 * the pages are not physically contiguous or have differing 7731 * memory attributes. 
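 *
 * [Editorial worked example, illustrative values only.]  Suppose ptepa is
 * 0x40000000 and size is 0x400000 (4MB).  The check below walks
 * pa = ptepa + PAGE_SIZE, ..., ptepa + size - PAGE_SIZE page by page, and
 * if every page is contiguous with matching attributes, the mapping loop
 * that follows installs two 2MB PDEs:
 *
 *	pa = 0x40000000 | cache bits	-> PDE mapping physical 0x40000000-0x401fffff
 *	pa = 0x40200000 | cache bits	-> PDE mapping physical 0x40200000-0x403fffff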
7732 */ 7733 p = TAILQ_NEXT(p, listq); 7734 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7735 pa += PAGE_SIZE) { 7736 KASSERT(vm_page_all_valid(p), 7737 ("pmap_object_init_pt: invalid page %p", p)); 7738 if (pa != VM_PAGE_TO_PHYS(p) || 7739 pat_mode != p->md.pat_mode) 7740 return; 7741 p = TAILQ_NEXT(p, listq); 7742 } 7743 7744 /* 7745 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7746 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7747 * will not affect the termination of this loop. 7748 */ 7749 PMAP_LOCK(pmap); 7750 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 7751 pa < ptepa + size; pa += NBPDR) { 7752 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7753 if (pde == NULL) { 7754 /* 7755 * The creation of mappings below is only an 7756 * optimization. If a page directory page 7757 * cannot be allocated without blocking, 7758 * continue on to the next mapping rather than 7759 * blocking. 7760 */ 7761 addr += NBPDR; 7762 continue; 7763 } 7764 if ((*pde & PG_V) == 0) { 7765 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7766 PG_U | PG_RW | PG_V); 7767 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7768 counter_u64_add(pmap_pde_mappings, 1); 7769 } else { 7770 /* Continue on if the PDE is already valid. */ 7771 pdpg->ref_count--; 7772 KASSERT(pdpg->ref_count > 0, 7773 ("pmap_object_init_pt: missing reference " 7774 "to page directory page, va: 0x%lx", addr)); 7775 } 7776 addr += NBPDR; 7777 } 7778 PMAP_UNLOCK(pmap); 7779 } 7780 } 7781 7782 /* 7783 * Clear the wired attribute from the mappings for the specified range of 7784 * addresses in the given pmap. Every valid mapping within that range 7785 * must have the wired attribute set. In contrast, invalid mappings 7786 * cannot have the wired attribute set, so they are ignored. 7787 * 7788 * The wired attribute of the page table entry is not a hardware 7789 * feature, so there is no need to invalidate any TLB entries. 7790 * Since pmap_demote_pde() for the wired entry must never fail, 7791 * pmap_delayed_invl_start()/finish() calls around the 7792 * function are not needed. 
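 *
 * [Editorial sketch.]  The walk below uses the same stepping idiom as the
 * other range walkers in this file; for the 2MB level, for example:
 *
 *	va_next = (sva + NBPDR) & ~PDRMASK;	// start of the next 2MB region
 *	if (va_next < sva)			// unsigned wraparound at the
 *		va_next = eva;			//  top of the address space
 *
 * so an absent PDE, or a leaf that is handled as a whole, can be skipped
 * with a single "continue".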
7793 */ 7794 void 7795 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7796 { 7797 vm_offset_t va_next; 7798 pml4_entry_t *pml4e; 7799 pdp_entry_t *pdpe; 7800 pd_entry_t *pde; 7801 pt_entry_t *pte, PG_V, PG_G __diagused; 7802 7803 PG_V = pmap_valid_bit(pmap); 7804 PG_G = pmap_global_bit(pmap); 7805 PMAP_LOCK(pmap); 7806 for (; sva < eva; sva = va_next) { 7807 pml4e = pmap_pml4e(pmap, sva); 7808 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7809 va_next = (sva + NBPML4) & ~PML4MASK; 7810 if (va_next < sva) 7811 va_next = eva; 7812 continue; 7813 } 7814 7815 va_next = (sva + NBPDP) & ~PDPMASK; 7816 if (va_next < sva) 7817 va_next = eva; 7818 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7819 if ((*pdpe & PG_V) == 0) 7820 continue; 7821 if ((*pdpe & PG_PS) != 0) { 7822 KASSERT(va_next <= eva, 7823 ("partial update of non-transparent 1G mapping " 7824 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7825 *pdpe, sva, eva, va_next)); 7826 MPASS(pmap != kernel_pmap); /* XXXKIB */ 7827 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 7828 atomic_clear_long(pdpe, PG_W); 7829 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 7830 continue; 7831 } 7832 7833 va_next = (sva + NBPDR) & ~PDRMASK; 7834 if (va_next < sva) 7835 va_next = eva; 7836 pde = pmap_pdpe_to_pde(pdpe, sva); 7837 if ((*pde & PG_V) == 0) 7838 continue; 7839 if ((*pde & PG_PS) != 0) { 7840 if ((*pde & PG_W) == 0) 7841 panic("pmap_unwire: pde %#jx is missing PG_W", 7842 (uintmax_t)*pde); 7843 7844 /* 7845 * Are we unwiring the entire large page? If not, 7846 * demote the mapping and fall through. 7847 */ 7848 if (sva + NBPDR == va_next && eva >= va_next) { 7849 atomic_clear_long(pde, PG_W); 7850 pmap->pm_stats.wired_count -= NBPDR / 7851 PAGE_SIZE; 7852 continue; 7853 } else if (!pmap_demote_pde(pmap, pde, sva)) 7854 panic("pmap_unwire: demotion failed"); 7855 } 7856 if (va_next > eva) 7857 va_next = eva; 7858 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 7859 sva += PAGE_SIZE) { 7860 if ((*pte & PG_V) == 0) 7861 continue; 7862 if ((*pte & PG_W) == 0) 7863 panic("pmap_unwire: pte %#jx is missing PG_W", 7864 (uintmax_t)*pte); 7865 7866 /* 7867 * PG_W must be cleared atomically. Although the pmap 7868 * lock synchronizes access to PG_W, another processor 7869 * could be setting PG_M and/or PG_A concurrently. 7870 */ 7871 atomic_clear_long(pte, PG_W); 7872 pmap->pm_stats.wired_count--; 7873 } 7874 } 7875 PMAP_UNLOCK(pmap); 7876 } 7877 7878 /* 7879 * Copy the range specified by src_addr/len 7880 * from the source map to the range dst_addr/len 7881 * in the destination map. 7882 * 7883 * This routine is only advisory and need not do anything. 7884 */ 7885 void 7886 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 7887 vm_offset_t src_addr) 7888 { 7889 struct rwlock *lock; 7890 pml4_entry_t *pml4e; 7891 pdp_entry_t *pdpe; 7892 pd_entry_t *pde, srcptepaddr; 7893 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 7894 vm_offset_t addr, end_addr, va_next; 7895 vm_page_t dst_pdpg, dstmpte, srcmpte; 7896 7897 if (dst_addr != src_addr) 7898 return; 7899 7900 if (dst_pmap->pm_type != src_pmap->pm_type) 7901 return; 7902 7903 /* 7904 * EPT page table entries that require emulation of A/D bits are 7905 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 7906 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 7907 * (aka EPT_PG_EXECUTE) could still be set. 
Since some EPT 7908 * implementations flag an EPT misconfiguration for exec-only 7909 * mappings we skip this function entirely for emulated pmaps. 7910 */ 7911 if (pmap_emulate_ad_bits(dst_pmap)) 7912 return; 7913 7914 end_addr = src_addr + len; 7915 lock = NULL; 7916 if (dst_pmap < src_pmap) { 7917 PMAP_LOCK(dst_pmap); 7918 PMAP_LOCK(src_pmap); 7919 } else { 7920 PMAP_LOCK(src_pmap); 7921 PMAP_LOCK(dst_pmap); 7922 } 7923 7924 PG_A = pmap_accessed_bit(dst_pmap); 7925 PG_M = pmap_modified_bit(dst_pmap); 7926 PG_V = pmap_valid_bit(dst_pmap); 7927 7928 for (addr = src_addr; addr < end_addr; addr = va_next) { 7929 KASSERT(addr < UPT_MIN_ADDRESS, 7930 ("pmap_copy: invalid to pmap_copy page tables")); 7931 7932 pml4e = pmap_pml4e(src_pmap, addr); 7933 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7934 va_next = (addr + NBPML4) & ~PML4MASK; 7935 if (va_next < addr) 7936 va_next = end_addr; 7937 continue; 7938 } 7939 7940 va_next = (addr + NBPDP) & ~PDPMASK; 7941 if (va_next < addr) 7942 va_next = end_addr; 7943 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 7944 if ((*pdpe & PG_V) == 0) 7945 continue; 7946 if ((*pdpe & PG_PS) != 0) { 7947 KASSERT(va_next <= end_addr, 7948 ("partial update of non-transparent 1G mapping " 7949 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7950 *pdpe, addr, end_addr, va_next)); 7951 MPASS((addr & PDPMASK) == 0); 7952 MPASS((*pdpe & PG_MANAGED) == 0); 7953 srcptepaddr = *pdpe; 7954 pdpe = pmap_pdpe(dst_pmap, addr); 7955 if (pdpe == NULL) { 7956 if (pmap_allocpte_alloc(dst_pmap, 7957 pmap_pml4e_pindex(addr), NULL, addr) == 7958 NULL) 7959 break; 7960 pdpe = pmap_pdpe(dst_pmap, addr); 7961 } else { 7962 pml4e = pmap_pml4e(dst_pmap, addr); 7963 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7964 dst_pdpg->ref_count++; 7965 } 7966 KASSERT(*pdpe == 0, 7967 ("1G mapping present in dst pmap " 7968 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7969 *pdpe, addr, end_addr, va_next)); 7970 *pdpe = srcptepaddr & ~PG_W; 7971 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 7972 continue; 7973 } 7974 7975 va_next = (addr + NBPDR) & ~PDRMASK; 7976 if (va_next < addr) 7977 va_next = end_addr; 7978 7979 pde = pmap_pdpe_to_pde(pdpe, addr); 7980 srcptepaddr = *pde; 7981 if (srcptepaddr == 0) 7982 continue; 7983 7984 if (srcptepaddr & PG_PS) { 7985 /* 7986 * We can only virtual copy whole superpages. 7987 */ 7988 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 7989 continue; 7990 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 7991 if (pde == NULL) 7992 break; 7993 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 7994 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 7995 PMAP_ENTER_NORECLAIM, &lock))) { 7996 /* 7997 * We leave the dirty bit unchanged because 7998 * managed read/write superpage mappings are 7999 * required to be dirty. However, managed 8000 * superpage mappings are not required to 8001 * have their accessed bit set, so we clear 8002 * it because we don't know if this mapping 8003 * will be used. 
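 *
 * [Editorial contrast, illustration only.]  Compare the two copy paths in
 * this function:
 *
 *	*dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);	// 4KB path, below
 *	*pde = srcptepaddr & ~PG_W;			// 2MB path, here
 *							//  (PG_A cleared only
 *							//   when managed)
 *
 * The copied superpage keeps PG_M because a managed, writeable 2MB mapping
 * is required to be dirty for as long as it exists.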
8004 */ 8005 srcptepaddr &= ~PG_W; 8006 if ((srcptepaddr & PG_MANAGED) != 0) 8007 srcptepaddr &= ~PG_A; 8008 *pde = srcptepaddr; 8009 pmap_resident_count_adj(dst_pmap, NBPDR / 8010 PAGE_SIZE); 8011 counter_u64_add(pmap_pde_mappings, 1); 8012 } else 8013 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 8014 continue; 8015 } 8016 8017 srcptepaddr &= PG_FRAME; 8018 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 8019 KASSERT(srcmpte->ref_count > 0, 8020 ("pmap_copy: source page table page is unused")); 8021 8022 if (va_next > end_addr) 8023 va_next = end_addr; 8024 8025 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 8026 src_pte = &src_pte[pmap_pte_index(addr)]; 8027 dstmpte = NULL; 8028 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 8029 ptetemp = *src_pte; 8030 8031 /* 8032 * We only virtual copy managed pages. 8033 */ 8034 if ((ptetemp & PG_MANAGED) == 0) 8035 continue; 8036 8037 if (dstmpte != NULL) { 8038 KASSERT(dstmpte->pindex == 8039 pmap_pde_pindex(addr), 8040 ("dstmpte pindex/addr mismatch")); 8041 dstmpte->ref_count++; 8042 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 8043 NULL)) == NULL) 8044 goto out; 8045 dst_pte = (pt_entry_t *) 8046 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 8047 dst_pte = &dst_pte[pmap_pte_index(addr)]; 8048 if (*dst_pte == 0 && 8049 pmap_try_insert_pv_entry(dst_pmap, addr, 8050 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 8051 /* 8052 * Clear the wired, modified, and accessed 8053 * (referenced) bits during the copy. 8054 */ 8055 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 8056 pmap_resident_count_adj(dst_pmap, 1); 8057 } else { 8058 pmap_abort_ptp(dst_pmap, addr, dstmpte); 8059 goto out; 8060 } 8061 /* Have we copied all of the valid mappings? */ 8062 if (dstmpte->ref_count >= srcmpte->ref_count) 8063 break; 8064 } 8065 } 8066 out: 8067 if (lock != NULL) 8068 rw_wunlock(lock); 8069 PMAP_UNLOCK(src_pmap); 8070 PMAP_UNLOCK(dst_pmap); 8071 } 8072 8073 int 8074 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8075 { 8076 int error; 8077 8078 if (dst_pmap->pm_type != src_pmap->pm_type || 8079 dst_pmap->pm_type != PT_X86 || 8080 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8081 return (0); 8082 for (;;) { 8083 if (dst_pmap < src_pmap) { 8084 PMAP_LOCK(dst_pmap); 8085 PMAP_LOCK(src_pmap); 8086 } else { 8087 PMAP_LOCK(src_pmap); 8088 PMAP_LOCK(dst_pmap); 8089 } 8090 error = pmap_pkru_copy(dst_pmap, src_pmap); 8091 /* Clean up partial copy on failure due to no memory. */ 8092 if (error == ENOMEM) 8093 pmap_pkru_deassign_all(dst_pmap); 8094 PMAP_UNLOCK(src_pmap); 8095 PMAP_UNLOCK(dst_pmap); 8096 if (error != ENOMEM) 8097 break; 8098 vm_wait(NULL); 8099 } 8100 return (error); 8101 } 8102 8103 /* 8104 * Zero the specified hardware page. 8105 */ 8106 void 8107 pmap_zero_page(vm_page_t m) 8108 { 8109 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8110 8111 pagezero((void *)va); 8112 } 8113 8114 /* 8115 * Zero an area within a single hardware page. off and size must not 8116 * cover an area beyond a single hardware page. 8117 */ 8118 void 8119 pmap_zero_page_area(vm_page_t m, int off, int size) 8120 { 8121 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8122 8123 if (off == 0 && size == PAGE_SIZE) 8124 pagezero((void *)va); 8125 else 8126 bzero((char *)va + off, size); 8127 } 8128 8129 /* 8130 * Copy 1 specified hardware page to another. 
8131 */ 8132 void 8133 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8134 { 8135 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8136 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8137 8138 pagecopy((void *)src, (void *)dst); 8139 } 8140 8141 int unmapped_buf_allowed = 1; 8142 8143 void 8144 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8145 vm_offset_t b_offset, int xfersize) 8146 { 8147 void *a_cp, *b_cp; 8148 vm_page_t pages[2]; 8149 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8150 int cnt; 8151 boolean_t mapped; 8152 8153 while (xfersize > 0) { 8154 a_pg_offset = a_offset & PAGE_MASK; 8155 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8156 b_pg_offset = b_offset & PAGE_MASK; 8157 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8158 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8159 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8160 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 8161 a_cp = (char *)vaddr[0] + a_pg_offset; 8162 b_cp = (char *)vaddr[1] + b_pg_offset; 8163 bcopy(a_cp, b_cp, cnt); 8164 if (__predict_false(mapped)) 8165 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 8166 a_offset += cnt; 8167 b_offset += cnt; 8168 xfersize -= cnt; 8169 } 8170 } 8171 8172 /* 8173 * Returns true if the pmap's pv is one of the first 8174 * 16 pvs linked to from this page. This count may 8175 * be changed upwards or downwards in the future; it 8176 * is only necessary that true be returned for a small 8177 * subset of pmaps for proper page aging. 8178 */ 8179 boolean_t 8180 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8181 { 8182 struct md_page *pvh; 8183 struct rwlock *lock; 8184 pv_entry_t pv; 8185 int loops = 0; 8186 boolean_t rv; 8187 8188 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8189 ("pmap_page_exists_quick: page %p is not managed", m)); 8190 rv = FALSE; 8191 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8192 rw_rlock(lock); 8193 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8194 if (PV_PMAP(pv) == pmap) { 8195 rv = TRUE; 8196 break; 8197 } 8198 loops++; 8199 if (loops >= 16) 8200 break; 8201 } 8202 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8203 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8204 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8205 if (PV_PMAP(pv) == pmap) { 8206 rv = TRUE; 8207 break; 8208 } 8209 loops++; 8210 if (loops >= 16) 8211 break; 8212 } 8213 } 8214 rw_runlock(lock); 8215 return (rv); 8216 } 8217 8218 /* 8219 * pmap_page_wired_mappings: 8220 * 8221 * Return the number of managed mappings to the given physical page 8222 * that are wired. 
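 *
 * [Editorial sketch of the locking pattern used below and by several other
 * per-page walkers in this file.]  Each pmap on the PV list is try-locked;
 * if that fails, the PV list lock is dropped, the pmap lock is taken the
 * slow way, and the list's generation count reveals whether the list may
 * have changed in the meantime:
 *
 *	if (!PMAP_TRYLOCK(pmap)) {
 *		md_gen = m->md.pv_gen;
 *		rw_runlock(lock);
 *		PMAP_LOCK(pmap);
 *		rw_rlock(lock);
 *		if (md_gen != m->md.pv_gen) {
 *			PMAP_UNLOCK(pmap);
 *			goto restart;		// list changed; start over
 *		}
 *	}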
8223 */ 8224 int 8225 pmap_page_wired_mappings(vm_page_t m) 8226 { 8227 struct rwlock *lock; 8228 struct md_page *pvh; 8229 pmap_t pmap; 8230 pt_entry_t *pte; 8231 pv_entry_t pv; 8232 int count, md_gen, pvh_gen; 8233 8234 if ((m->oflags & VPO_UNMANAGED) != 0) 8235 return (0); 8236 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8237 rw_rlock(lock); 8238 restart: 8239 count = 0; 8240 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8241 pmap = PV_PMAP(pv); 8242 if (!PMAP_TRYLOCK(pmap)) { 8243 md_gen = m->md.pv_gen; 8244 rw_runlock(lock); 8245 PMAP_LOCK(pmap); 8246 rw_rlock(lock); 8247 if (md_gen != m->md.pv_gen) { 8248 PMAP_UNLOCK(pmap); 8249 goto restart; 8250 } 8251 } 8252 pte = pmap_pte(pmap, pv->pv_va); 8253 if ((*pte & PG_W) != 0) 8254 count++; 8255 PMAP_UNLOCK(pmap); 8256 } 8257 if ((m->flags & PG_FICTITIOUS) == 0) { 8258 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8259 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8260 pmap = PV_PMAP(pv); 8261 if (!PMAP_TRYLOCK(pmap)) { 8262 md_gen = m->md.pv_gen; 8263 pvh_gen = pvh->pv_gen; 8264 rw_runlock(lock); 8265 PMAP_LOCK(pmap); 8266 rw_rlock(lock); 8267 if (md_gen != m->md.pv_gen || 8268 pvh_gen != pvh->pv_gen) { 8269 PMAP_UNLOCK(pmap); 8270 goto restart; 8271 } 8272 } 8273 pte = pmap_pde(pmap, pv->pv_va); 8274 if ((*pte & PG_W) != 0) 8275 count++; 8276 PMAP_UNLOCK(pmap); 8277 } 8278 } 8279 rw_runlock(lock); 8280 return (count); 8281 } 8282 8283 /* 8284 * Returns TRUE if the given page is mapped individually or as part of 8285 * a 2mpage. Otherwise, returns FALSE. 8286 */ 8287 boolean_t 8288 pmap_page_is_mapped(vm_page_t m) 8289 { 8290 struct rwlock *lock; 8291 boolean_t rv; 8292 8293 if ((m->oflags & VPO_UNMANAGED) != 0) 8294 return (FALSE); 8295 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8296 rw_rlock(lock); 8297 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8298 ((m->flags & PG_FICTITIOUS) == 0 && 8299 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8300 rw_runlock(lock); 8301 return (rv); 8302 } 8303 8304 /* 8305 * Destroy all managed, non-wired mappings in the given user-space 8306 * pmap. This pmap cannot be active on any processor besides the 8307 * caller. 8308 * 8309 * This function cannot be applied to the kernel pmap. Moreover, it 8310 * is not intended for general use. It is only to be used during 8311 * process termination. Consequently, it can be implemented in ways 8312 * that make it faster than pmap_remove(). First, it can more quickly 8313 * destroy mappings by iterating over the pmap's collection of PV 8314 * entries, rather than searching the page table. Second, it doesn't 8315 * have to test and clear the page table entries atomically, because 8316 * no processor is currently accessing the user address space. In 8317 * particular, a page table entry's dirty bit won't change state once 8318 * this function starts. 8319 * 8320 * Although this function destroys all of the pmap's managed, 8321 * non-wired mappings, it can delay and batch the invalidation of TLB 8322 * entries without calling pmap_delayed_invl_start() and 8323 * pmap_delayed_invl_finish(). Because the pmap is not active on 8324 * any other processor, none of these TLB entries will ever be used 8325 * before their eventual invalidation. Consequently, there is no need 8326 * for either pmap_remove_all() or pmap_remove_write() to wait for 8327 * that eventual TLB invalidation. 
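 *
 * [Editorial sketch.]  Each pv_chunk records its live PV entries in a small
 * bitmap, so the walk below visits only allocated entries by repeatedly
 * taking the lowest set bit of the inverted free mask:
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	while (inuse != 0) {
 *		bit = bsfq(inuse);		// index of the lowest set bit
 *		pv = &pc->pc_pventry[field * 64 + bit];
 *		inuse &= ~(1UL << bit);
 *		...				// tear down the mapping named by pv
 *	}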
8328 */ 8329 void 8330 pmap_remove_pages(pmap_t pmap) 8331 { 8332 pd_entry_t ptepde; 8333 pt_entry_t *pte, tpte; 8334 pt_entry_t PG_M, PG_RW, PG_V; 8335 struct spglist free; 8336 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8337 vm_page_t m, mpte, mt; 8338 pv_entry_t pv; 8339 struct md_page *pvh; 8340 struct pv_chunk *pc, *npc; 8341 struct rwlock *lock; 8342 int64_t bit; 8343 uint64_t inuse, bitmask; 8344 int allfree, field, i, idx; 8345 #ifdef PV_STATS 8346 int freed; 8347 #endif 8348 boolean_t superpage; 8349 vm_paddr_t pa; 8350 8351 /* 8352 * Assert that the given pmap is only active on the current 8353 * CPU. Unfortunately, we cannot block another CPU from 8354 * activating the pmap while this function is executing. 8355 */ 8356 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8357 #ifdef INVARIANTS 8358 { 8359 cpuset_t other_cpus; 8360 8361 other_cpus = all_cpus; 8362 critical_enter(); 8363 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8364 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 8365 critical_exit(); 8366 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8367 } 8368 #endif 8369 8370 lock = NULL; 8371 PG_M = pmap_modified_bit(pmap); 8372 PG_V = pmap_valid_bit(pmap); 8373 PG_RW = pmap_rw_bit(pmap); 8374 8375 for (i = 0; i < PMAP_MEMDOM; i++) 8376 TAILQ_INIT(&free_chunks[i]); 8377 SLIST_INIT(&free); 8378 PMAP_LOCK(pmap); 8379 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8380 allfree = 1; 8381 #ifdef PV_STATS 8382 freed = 0; 8383 #endif 8384 for (field = 0; field < _NPCM; field++) { 8385 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8386 while (inuse != 0) { 8387 bit = bsfq(inuse); 8388 bitmask = 1UL << bit; 8389 idx = field * 64 + bit; 8390 pv = &pc->pc_pventry[idx]; 8391 inuse &= ~bitmask; 8392 8393 pte = pmap_pdpe(pmap, pv->pv_va); 8394 ptepde = *pte; 8395 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8396 tpte = *pte; 8397 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8398 superpage = FALSE; 8399 ptepde = tpte; 8400 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8401 PG_FRAME); 8402 pte = &pte[pmap_pte_index(pv->pv_va)]; 8403 tpte = *pte; 8404 } else { 8405 /* 8406 * Keep track whether 'tpte' is a 8407 * superpage explicitly instead of 8408 * relying on PG_PS being set. 8409 * 8410 * This is because PG_PS is numerically 8411 * identical to PG_PTE_PAT and thus a 8412 * regular page could be mistaken for 8413 * a superpage. 8414 */ 8415 superpage = TRUE; 8416 } 8417 8418 if ((tpte & PG_V) == 0) { 8419 panic("bad pte va %lx pte %lx", 8420 pv->pv_va, tpte); 8421 } 8422 8423 /* 8424 * We cannot remove wired pages from a process' mapping at this time 8425 */ 8426 if (tpte & PG_W) { 8427 allfree = 0; 8428 continue; 8429 } 8430 8431 /* Mark free */ 8432 pc->pc_map[field] |= bitmask; 8433 8434 /* 8435 * Because this pmap is not active on other 8436 * processors, the dirty bit cannot have 8437 * changed state since we last loaded pte. 8438 */ 8439 pte_clear(pte); 8440 8441 if (superpage) 8442 pa = tpte & PG_PS_FRAME; 8443 else 8444 pa = tpte & PG_FRAME; 8445 8446 m = PHYS_TO_VM_PAGE(pa); 8447 KASSERT(m->phys_addr == pa, 8448 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8449 m, (uintmax_t)m->phys_addr, 8450 (uintmax_t)tpte)); 8451 8452 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8453 m < &vm_page_array[vm_page_array_size], 8454 ("pmap_remove_pages: bad tpte %#jx", 8455 (uintmax_t)tpte)); 8456 8457 /* 8458 * Update the vm_page_t clean/reference bits. 
8459 */ 8460 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8461 if (superpage) { 8462 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8463 vm_page_dirty(mt); 8464 } else 8465 vm_page_dirty(m); 8466 } 8467 8468 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8469 8470 if (superpage) { 8471 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8472 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8473 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8474 pvh->pv_gen++; 8475 if (TAILQ_EMPTY(&pvh->pv_list)) { 8476 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8477 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8478 TAILQ_EMPTY(&mt->md.pv_list)) 8479 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8480 } 8481 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8482 if (mpte != NULL) { 8483 KASSERT(vm_page_all_valid(mpte), 8484 ("pmap_remove_pages: pte page not promoted")); 8485 pmap_pt_page_count_adj(pmap, -1); 8486 KASSERT(mpte->ref_count == NPTEPG, 8487 ("pmap_remove_pages: pte page reference count error")); 8488 mpte->ref_count = 0; 8489 pmap_add_delayed_free_list(mpte, &free, FALSE); 8490 } 8491 } else { 8492 pmap_resident_count_adj(pmap, -1); 8493 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8494 m->md.pv_gen++; 8495 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8496 TAILQ_EMPTY(&m->md.pv_list) && 8497 (m->flags & PG_FICTITIOUS) == 0) { 8498 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8499 if (TAILQ_EMPTY(&pvh->pv_list)) 8500 vm_page_aflag_clear(m, PGA_WRITEABLE); 8501 } 8502 } 8503 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8504 #ifdef PV_STATS 8505 freed++; 8506 #endif 8507 } 8508 } 8509 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8510 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8511 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8512 if (allfree) { 8513 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8514 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8515 } 8516 } 8517 if (lock != NULL) 8518 rw_wunlock(lock); 8519 pmap_invalidate_all(pmap); 8520 pmap_pkru_deassign_all(pmap); 8521 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8522 PMAP_UNLOCK(pmap); 8523 vm_page_free_pages_toq(&free, true); 8524 } 8525 8526 static boolean_t 8527 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 8528 { 8529 struct rwlock *lock; 8530 pv_entry_t pv; 8531 struct md_page *pvh; 8532 pt_entry_t *pte, mask; 8533 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8534 pmap_t pmap; 8535 int md_gen, pvh_gen; 8536 boolean_t rv; 8537 8538 rv = FALSE; 8539 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8540 rw_rlock(lock); 8541 restart: 8542 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8543 pmap = PV_PMAP(pv); 8544 if (!PMAP_TRYLOCK(pmap)) { 8545 md_gen = m->md.pv_gen; 8546 rw_runlock(lock); 8547 PMAP_LOCK(pmap); 8548 rw_rlock(lock); 8549 if (md_gen != m->md.pv_gen) { 8550 PMAP_UNLOCK(pmap); 8551 goto restart; 8552 } 8553 } 8554 pte = pmap_pte(pmap, pv->pv_va); 8555 mask = 0; 8556 if (modified) { 8557 PG_M = pmap_modified_bit(pmap); 8558 PG_RW = pmap_rw_bit(pmap); 8559 mask |= PG_RW | PG_M; 8560 } 8561 if (accessed) { 8562 PG_A = pmap_accessed_bit(pmap); 8563 PG_V = pmap_valid_bit(pmap); 8564 mask |= PG_V | PG_A; 8565 } 8566 rv = (*pte & mask) == mask; 8567 PMAP_UNLOCK(pmap); 8568 if (rv) 8569 goto out; 8570 } 8571 if ((m->flags & PG_FICTITIOUS) == 0) { 8572 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8573 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8574 pmap = PV_PMAP(pv); 8575 if (!PMAP_TRYLOCK(pmap)) { 8576 md_gen = m->md.pv_gen; 8577 pvh_gen = pvh->pv_gen; 8578 rw_runlock(lock); 8579 PMAP_LOCK(pmap); 8580 rw_rlock(lock); 8581 if 
(md_gen != m->md.pv_gen || 8582 pvh_gen != pvh->pv_gen) { 8583 PMAP_UNLOCK(pmap); 8584 goto restart; 8585 } 8586 } 8587 pte = pmap_pde(pmap, pv->pv_va); 8588 mask = 0; 8589 if (modified) { 8590 PG_M = pmap_modified_bit(pmap); 8591 PG_RW = pmap_rw_bit(pmap); 8592 mask |= PG_RW | PG_M; 8593 } 8594 if (accessed) { 8595 PG_A = pmap_accessed_bit(pmap); 8596 PG_V = pmap_valid_bit(pmap); 8597 mask |= PG_V | PG_A; 8598 } 8599 rv = (*pte & mask) == mask; 8600 PMAP_UNLOCK(pmap); 8601 if (rv) 8602 goto out; 8603 } 8604 } 8605 out: 8606 rw_runlock(lock); 8607 return (rv); 8608 } 8609 8610 /* 8611 * pmap_is_modified: 8612 * 8613 * Return whether or not the specified physical page was modified 8614 * in any physical maps. 8615 */ 8616 boolean_t 8617 pmap_is_modified(vm_page_t m) 8618 { 8619 8620 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8621 ("pmap_is_modified: page %p is not managed", m)); 8622 8623 /* 8624 * If the page is not busied then this check is racy. 8625 */ 8626 if (!pmap_page_is_write_mapped(m)) 8627 return (FALSE); 8628 return (pmap_page_test_mappings(m, FALSE, TRUE)); 8629 } 8630 8631 /* 8632 * pmap_is_prefaultable: 8633 * 8634 * Return whether or not the specified virtual address is eligible 8635 * for prefault. 8636 */ 8637 boolean_t 8638 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8639 { 8640 pd_entry_t *pde; 8641 pt_entry_t *pte, PG_V; 8642 boolean_t rv; 8643 8644 PG_V = pmap_valid_bit(pmap); 8645 8646 /* 8647 * Return TRUE if and only if the PTE for the specified virtual 8648 * address is allocated but invalid. 8649 */ 8650 rv = FALSE; 8651 PMAP_LOCK(pmap); 8652 pde = pmap_pde(pmap, addr); 8653 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8654 pte = pmap_pde_to_pte(pde, addr); 8655 rv = (*pte & PG_V) == 0; 8656 } 8657 PMAP_UNLOCK(pmap); 8658 return (rv); 8659 } 8660 8661 /* 8662 * pmap_is_referenced: 8663 * 8664 * Return whether or not the specified physical page was referenced 8665 * in any physical maps. 8666 */ 8667 boolean_t 8668 pmap_is_referenced(vm_page_t m) 8669 { 8670 8671 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8672 ("pmap_is_referenced: page %p is not managed", m)); 8673 return (pmap_page_test_mappings(m, TRUE, FALSE)); 8674 } 8675 8676 /* 8677 * Clear the write and modified bits in each of the given page's mappings. 8678 */ 8679 void 8680 pmap_remove_write(vm_page_t m) 8681 { 8682 struct md_page *pvh; 8683 pmap_t pmap; 8684 struct rwlock *lock; 8685 pv_entry_t next_pv, pv; 8686 pd_entry_t *pde; 8687 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8688 vm_offset_t va; 8689 int pvh_gen, md_gen; 8690 8691 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8692 ("pmap_remove_write: page %p is not managed", m)); 8693 8694 vm_page_assert_busied(m); 8695 if (!pmap_page_is_write_mapped(m)) 8696 return; 8697 8698 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8699 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8700 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8701 rw_wlock(lock); 8702 retry: 8703 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8704 pmap = PV_PMAP(pv); 8705 if (!PMAP_TRYLOCK(pmap)) { 8706 pvh_gen = pvh->pv_gen; 8707 rw_wunlock(lock); 8708 PMAP_LOCK(pmap); 8709 rw_wlock(lock); 8710 if (pvh_gen != pvh->pv_gen) { 8711 PMAP_UNLOCK(pmap); 8712 goto retry; 8713 } 8714 } 8715 PG_RW = pmap_rw_bit(pmap); 8716 va = pv->pv_va; 8717 pde = pmap_pde(pmap, va); 8718 if ((*pde & PG_RW) != 0) 8719 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8720 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8721 ("inconsistent pv lock %p %p for page %p", 8722 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8723 PMAP_UNLOCK(pmap); 8724 } 8725 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8726 pmap = PV_PMAP(pv); 8727 if (!PMAP_TRYLOCK(pmap)) { 8728 pvh_gen = pvh->pv_gen; 8729 md_gen = m->md.pv_gen; 8730 rw_wunlock(lock); 8731 PMAP_LOCK(pmap); 8732 rw_wlock(lock); 8733 if (pvh_gen != pvh->pv_gen || 8734 md_gen != m->md.pv_gen) { 8735 PMAP_UNLOCK(pmap); 8736 goto retry; 8737 } 8738 } 8739 PG_M = pmap_modified_bit(pmap); 8740 PG_RW = pmap_rw_bit(pmap); 8741 pde = pmap_pde(pmap, pv->pv_va); 8742 KASSERT((*pde & PG_PS) == 0, 8743 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8744 m)); 8745 pte = pmap_pde_to_pte(pde, pv->pv_va); 8746 oldpte = *pte; 8747 if (oldpte & PG_RW) { 8748 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8749 ~(PG_RW | PG_M))) 8750 cpu_spinwait(); 8751 if ((oldpte & PG_M) != 0) 8752 vm_page_dirty(m); 8753 pmap_invalidate_page(pmap, pv->pv_va); 8754 } 8755 PMAP_UNLOCK(pmap); 8756 } 8757 rw_wunlock(lock); 8758 vm_page_aflag_clear(m, PGA_WRITEABLE); 8759 pmap_delayed_invl_wait(m); 8760 } 8761 8762 static __inline boolean_t 8763 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 8764 { 8765 8766 if (!pmap_emulate_ad_bits(pmap)) 8767 return (TRUE); 8768 8769 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 8770 8771 /* 8772 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 8773 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 8774 * if the EPT_PG_WRITE bit is set. 8775 */ 8776 if ((pte & EPT_PG_WRITE) != 0) 8777 return (FALSE); 8778 8779 /* 8780 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 8781 */ 8782 if ((pte & EPT_PG_EXECUTE) == 0 || 8783 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 8784 return (TRUE); 8785 else 8786 return (FALSE); 8787 } 8788 8789 /* 8790 * pmap_ts_referenced: 8791 * 8792 * Return a count of reference bits for a page, clearing those bits. 8793 * It is not necessary for every reference bit to be cleared, but it 8794 * is necessary that 0 only be returned when there are truly no 8795 * reference bits set. 8796 * 8797 * As an optimization, update the page's dirty field if a modified bit is 8798 * found while counting reference bits. This opportunistic update can be 8799 * performed at low cost and can eliminate the need for some future calls 8800 * to pmap_is_modified(). However, since this function stops after 8801 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8802 * dirty pages. Those dirty pages will only be detected by a future call 8803 * to pmap_is_modified(). 8804 * 8805 * A DI block is not needed within this function, because 8806 * invalidations are performed before the PV list lock is 8807 * released. 
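 *
 * [Editorial illustration of the 2MB sampling below.]  A superpage's
 * reference bit is shared by 512 4KB pages, so it is only cleared when the
 * page under test is the one selected by a cheap hash of the physical
 * page, the virtual superpage, and the pmap:
 *
 *	if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 *	    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 *	    (oldpde & PG_W) == 0)
 *		...			// clear PG_A, or demote and remove
 *
 * Roughly one 4KB page in every NPTEPG (512) therefore clears a given
 * superpage's reference bit, which is sufficient for page aging.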
8808 */ 8809 int 8810 pmap_ts_referenced(vm_page_t m) 8811 { 8812 struct md_page *pvh; 8813 pv_entry_t pv, pvf; 8814 pmap_t pmap; 8815 struct rwlock *lock; 8816 pd_entry_t oldpde, *pde; 8817 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8818 vm_offset_t va; 8819 vm_paddr_t pa; 8820 int cleared, md_gen, not_cleared, pvh_gen; 8821 struct spglist free; 8822 boolean_t demoted; 8823 8824 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8825 ("pmap_ts_referenced: page %p is not managed", m)); 8826 SLIST_INIT(&free); 8827 cleared = 0; 8828 pa = VM_PAGE_TO_PHYS(m); 8829 lock = PHYS_TO_PV_LIST_LOCK(pa); 8830 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 8831 rw_wlock(lock); 8832 retry: 8833 not_cleared = 0; 8834 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 8835 goto small_mappings; 8836 pv = pvf; 8837 do { 8838 if (pvf == NULL) 8839 pvf = pv; 8840 pmap = PV_PMAP(pv); 8841 if (!PMAP_TRYLOCK(pmap)) { 8842 pvh_gen = pvh->pv_gen; 8843 rw_wunlock(lock); 8844 PMAP_LOCK(pmap); 8845 rw_wlock(lock); 8846 if (pvh_gen != pvh->pv_gen) { 8847 PMAP_UNLOCK(pmap); 8848 goto retry; 8849 } 8850 } 8851 PG_A = pmap_accessed_bit(pmap); 8852 PG_M = pmap_modified_bit(pmap); 8853 PG_RW = pmap_rw_bit(pmap); 8854 va = pv->pv_va; 8855 pde = pmap_pde(pmap, pv->pv_va); 8856 oldpde = *pde; 8857 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8858 /* 8859 * Although "oldpde" is mapping a 2MB page, because 8860 * this function is called at a 4KB page granularity, 8861 * we only update the 4KB page under test. 8862 */ 8863 vm_page_dirty(m); 8864 } 8865 if ((oldpde & PG_A) != 0) { 8866 /* 8867 * Since this reference bit is shared by 512 4KB 8868 * pages, it should not be cleared every time it is 8869 * tested. Apply a simple "hash" function on the 8870 * physical page number, the virtual superpage number, 8871 * and the pmap address to select one 4KB page out of 8872 * the 512 on which testing the reference bit will 8873 * result in clearing that reference bit. This 8874 * function is designed to avoid the selection of the 8875 * same 4KB page for every 2MB page mapping. 8876 * 8877 * On demotion, a mapping that hasn't been referenced 8878 * is simply destroyed. To avoid the possibility of a 8879 * subsequent page fault on a demoted wired mapping, 8880 * always leave its reference bit set. Moreover, 8881 * since the superpage is wired, the current state of 8882 * its reference bit won't affect page replacement. 8883 */ 8884 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 8885 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 8886 (oldpde & PG_W) == 0) { 8887 if (safe_to_clear_referenced(pmap, oldpde)) { 8888 atomic_clear_long(pde, PG_A); 8889 pmap_invalidate_page(pmap, pv->pv_va); 8890 demoted = FALSE; 8891 } else if (pmap_demote_pde_locked(pmap, pde, 8892 pv->pv_va, &lock)) { 8893 /* 8894 * Remove the mapping to a single page 8895 * so that a subsequent access may 8896 * repromote. Since the underlying 8897 * page table page is fully populated, 8898 * this removal never frees a page 8899 * table page. 8900 */ 8901 demoted = TRUE; 8902 va += VM_PAGE_TO_PHYS(m) - (oldpde & 8903 PG_PS_FRAME); 8904 pte = pmap_pde_to_pte(pde, va); 8905 pmap_remove_pte(pmap, pte, va, *pde, 8906 NULL, &lock); 8907 pmap_invalidate_page(pmap, va); 8908 } else 8909 demoted = TRUE; 8910 8911 if (demoted) { 8912 /* 8913 * The superpage mapping was removed 8914 * entirely and therefore 'pv' is no 8915 * longer valid. 
8916 */ 8917 if (pvf == pv) 8918 pvf = NULL; 8919 pv = NULL; 8920 } 8921 cleared++; 8922 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8923 ("inconsistent pv lock %p %p for page %p", 8924 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8925 } else 8926 not_cleared++; 8927 } 8928 PMAP_UNLOCK(pmap); 8929 /* Rotate the PV list if it has more than one entry. */ 8930 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 8931 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8932 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 8933 pvh->pv_gen++; 8934 } 8935 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 8936 goto out; 8937 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 8938 small_mappings: 8939 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 8940 goto out; 8941 pv = pvf; 8942 do { 8943 if (pvf == NULL) 8944 pvf = pv; 8945 pmap = PV_PMAP(pv); 8946 if (!PMAP_TRYLOCK(pmap)) { 8947 pvh_gen = pvh->pv_gen; 8948 md_gen = m->md.pv_gen; 8949 rw_wunlock(lock); 8950 PMAP_LOCK(pmap); 8951 rw_wlock(lock); 8952 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 8953 PMAP_UNLOCK(pmap); 8954 goto retry; 8955 } 8956 } 8957 PG_A = pmap_accessed_bit(pmap); 8958 PG_M = pmap_modified_bit(pmap); 8959 PG_RW = pmap_rw_bit(pmap); 8960 pde = pmap_pde(pmap, pv->pv_va); 8961 KASSERT((*pde & PG_PS) == 0, 8962 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 8963 m)); 8964 pte = pmap_pde_to_pte(pde, pv->pv_va); 8965 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 8966 vm_page_dirty(m); 8967 if ((*pte & PG_A) != 0) { 8968 if (safe_to_clear_referenced(pmap, *pte)) { 8969 atomic_clear_long(pte, PG_A); 8970 pmap_invalidate_page(pmap, pv->pv_va); 8971 cleared++; 8972 } else if ((*pte & PG_W) == 0) { 8973 /* 8974 * Wired pages cannot be paged out so 8975 * doing accessed bit emulation for 8976 * them is wasted effort. We do the 8977 * hard work for unwired pages only. 8978 */ 8979 pmap_remove_pte(pmap, pte, pv->pv_va, 8980 *pde, &free, &lock); 8981 pmap_invalidate_page(pmap, pv->pv_va); 8982 cleared++; 8983 if (pvf == pv) 8984 pvf = NULL; 8985 pv = NULL; 8986 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8987 ("inconsistent pv lock %p %p for page %p", 8988 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8989 } else 8990 not_cleared++; 8991 } 8992 PMAP_UNLOCK(pmap); 8993 /* Rotate the PV list if it has more than one entry. */ 8994 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 8995 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8996 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 8997 m->md.pv_gen++; 8998 } 8999 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 9000 not_cleared < PMAP_TS_REFERENCED_MAX); 9001 out: 9002 rw_wunlock(lock); 9003 vm_page_free_pages_toq(&free, true); 9004 return (cleared + not_cleared); 9005 } 9006 9007 /* 9008 * Apply the given advice to the specified range of addresses within the 9009 * given pmap. Depending on the advice, clear the referenced and/or 9010 * modified flags in each mapping and set the mapped page's dirty field. 9011 */ 9012 void 9013 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 9014 { 9015 struct rwlock *lock; 9016 pml4_entry_t *pml4e; 9017 pdp_entry_t *pdpe; 9018 pd_entry_t oldpde, *pde; 9019 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 9020 vm_offset_t va, va_next; 9021 vm_page_t m; 9022 bool anychanged; 9023 9024 if (advice != MADV_DONTNEED && advice != MADV_FREE) 9025 return; 9026 9027 /* 9028 * A/D bit emulation requires an alternate code path when clearing 9029 * the modified and accessed bits below. 
Since this function is 9030 * advisory in nature we skip it entirely for pmaps that require 9031 * A/D bit emulation. 9032 */ 9033 if (pmap_emulate_ad_bits(pmap)) 9034 return; 9035 9036 PG_A = pmap_accessed_bit(pmap); 9037 PG_G = pmap_global_bit(pmap); 9038 PG_M = pmap_modified_bit(pmap); 9039 PG_V = pmap_valid_bit(pmap); 9040 PG_RW = pmap_rw_bit(pmap); 9041 anychanged = false; 9042 pmap_delayed_invl_start(); 9043 PMAP_LOCK(pmap); 9044 for (; sva < eva; sva = va_next) { 9045 pml4e = pmap_pml4e(pmap, sva); 9046 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 9047 va_next = (sva + NBPML4) & ~PML4MASK; 9048 if (va_next < sva) 9049 va_next = eva; 9050 continue; 9051 } 9052 9053 va_next = (sva + NBPDP) & ~PDPMASK; 9054 if (va_next < sva) 9055 va_next = eva; 9056 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 9057 if ((*pdpe & PG_V) == 0) 9058 continue; 9059 if ((*pdpe & PG_PS) != 0) 9060 continue; 9061 9062 va_next = (sva + NBPDR) & ~PDRMASK; 9063 if (va_next < sva) 9064 va_next = eva; 9065 pde = pmap_pdpe_to_pde(pdpe, sva); 9066 oldpde = *pde; 9067 if ((oldpde & PG_V) == 0) 9068 continue; 9069 else if ((oldpde & PG_PS) != 0) { 9070 if ((oldpde & PG_MANAGED) == 0) 9071 continue; 9072 lock = NULL; 9073 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9074 if (lock != NULL) 9075 rw_wunlock(lock); 9076 9077 /* 9078 * The large page mapping was destroyed. 9079 */ 9080 continue; 9081 } 9082 9083 /* 9084 * Unless the page mappings are wired, remove the 9085 * mapping to a single page so that a subsequent 9086 * access may repromote. Choosing the last page 9087 * within the address range [sva, min(va_next, eva)) 9088 * generally results in more repromotions. Since the 9089 * underlying page table page is fully populated, this 9090 * removal never frees a page table page. 9091 */ 9092 if ((oldpde & PG_W) == 0) { 9093 va = eva; 9094 if (va > va_next) 9095 va = va_next; 9096 va -= PAGE_SIZE; 9097 KASSERT(va >= sva, 9098 ("pmap_advise: no address gap")); 9099 pte = pmap_pde_to_pte(pde, va); 9100 KASSERT((*pte & PG_V) != 0, 9101 ("pmap_advise: invalid PTE")); 9102 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9103 &lock); 9104 anychanged = true; 9105 } 9106 if (lock != NULL) 9107 rw_wunlock(lock); 9108 } 9109 if (va_next > eva) 9110 va_next = eva; 9111 va = va_next; 9112 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9113 sva += PAGE_SIZE) { 9114 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9115 goto maybe_invlrng; 9116 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9117 if (advice == MADV_DONTNEED) { 9118 /* 9119 * Future calls to pmap_is_modified() 9120 * can be avoided by making the page 9121 * dirty now. 9122 */ 9123 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9124 vm_page_dirty(m); 9125 } 9126 atomic_clear_long(pte, PG_M | PG_A); 9127 } else if ((*pte & PG_A) != 0) 9128 atomic_clear_long(pte, PG_A); 9129 else 9130 goto maybe_invlrng; 9131 9132 if ((*pte & PG_G) != 0) { 9133 if (va == va_next) 9134 va = sva; 9135 } else 9136 anychanged = true; 9137 continue; 9138 maybe_invlrng: 9139 if (va != va_next) { 9140 pmap_invalidate_range(pmap, va, sva); 9141 va = va_next; 9142 } 9143 } 9144 if (va != va_next) 9145 pmap_invalidate_range(pmap, va, sva); 9146 } 9147 if (anychanged) 9148 pmap_invalidate_all(pmap); 9149 PMAP_UNLOCK(pmap); 9150 pmap_delayed_invl_finish(); 9151 } 9152 9153 /* 9154 * Clear the modify bits on the specified physical page. 
9155 */ 9156 void 9157 pmap_clear_modify(vm_page_t m) 9158 { 9159 struct md_page *pvh; 9160 pmap_t pmap; 9161 pv_entry_t next_pv, pv; 9162 pd_entry_t oldpde, *pde; 9163 pt_entry_t *pte, PG_M, PG_RW; 9164 struct rwlock *lock; 9165 vm_offset_t va; 9166 int md_gen, pvh_gen; 9167 9168 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9169 ("pmap_clear_modify: page %p is not managed", m)); 9170 vm_page_assert_busied(m); 9171 9172 if (!pmap_page_is_write_mapped(m)) 9173 return; 9174 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9175 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9176 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9177 rw_wlock(lock); 9178 restart: 9179 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9180 pmap = PV_PMAP(pv); 9181 if (!PMAP_TRYLOCK(pmap)) { 9182 pvh_gen = pvh->pv_gen; 9183 rw_wunlock(lock); 9184 PMAP_LOCK(pmap); 9185 rw_wlock(lock); 9186 if (pvh_gen != pvh->pv_gen) { 9187 PMAP_UNLOCK(pmap); 9188 goto restart; 9189 } 9190 } 9191 PG_M = pmap_modified_bit(pmap); 9192 PG_RW = pmap_rw_bit(pmap); 9193 va = pv->pv_va; 9194 pde = pmap_pde(pmap, va); 9195 oldpde = *pde; 9196 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9197 if ((oldpde & PG_RW) != 0 && 9198 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9199 (oldpde & PG_W) == 0) { 9200 /* 9201 * Write protect the mapping to a single page so that 9202 * a subsequent write access may repromote. 9203 */ 9204 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9205 pte = pmap_pde_to_pte(pde, va); 9206 atomic_clear_long(pte, PG_M | PG_RW); 9207 vm_page_dirty(m); 9208 pmap_invalidate_page(pmap, va); 9209 } 9210 PMAP_UNLOCK(pmap); 9211 } 9212 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9213 pmap = PV_PMAP(pv); 9214 if (!PMAP_TRYLOCK(pmap)) { 9215 md_gen = m->md.pv_gen; 9216 pvh_gen = pvh->pv_gen; 9217 rw_wunlock(lock); 9218 PMAP_LOCK(pmap); 9219 rw_wlock(lock); 9220 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9221 PMAP_UNLOCK(pmap); 9222 goto restart; 9223 } 9224 } 9225 PG_M = pmap_modified_bit(pmap); 9226 PG_RW = pmap_rw_bit(pmap); 9227 pde = pmap_pde(pmap, pv->pv_va); 9228 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9229 " a 2mpage in page %p's pv list", m)); 9230 pte = pmap_pde_to_pte(pde, pv->pv_va); 9231 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9232 atomic_clear_long(pte, PG_M); 9233 pmap_invalidate_page(pmap, pv->pv_va); 9234 } 9235 PMAP_UNLOCK(pmap); 9236 } 9237 rw_wunlock(lock); 9238 } 9239 9240 /* 9241 * Miscellaneous support routines follow 9242 */ 9243 9244 /* Adjust the properties for a leaf page table entry. */ 9245 static __inline void 9246 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9247 { 9248 u_long opte, npte; 9249 9250 opte = *(u_long *)pte; 9251 do { 9252 npte = opte & ~mask; 9253 npte |= bits; 9254 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9255 npte)); 9256 } 9257 9258 /* 9259 * Map a set of physical memory pages into the kernel virtual 9260 * address space. Return a pointer to where it is mapped. This 9261 * routine is intended to be used for mapping device memory, 9262 * NOT real memory. 
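 *
 * [Editorial usage sketch; "bar_pa" is a hypothetical value.]  A driver
 * with a 4KB register window at physical address bar_pa could map and
 * unmap it with:
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(bar_pa, PAGE_SIZE);	// uncacheable by default
 *	...					// register access via "regs"
 *	pmap_unmapdev(regs, PAGE_SIZE);
 *
 * Most drivers reach these routines indirectly through bus_space(9) and
 * the resource code rather than calling them directly.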
9263 */ 9264 static void * 9265 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9266 { 9267 struct pmap_preinit_mapping *ppim; 9268 vm_offset_t va, offset; 9269 vm_size_t tmpsize; 9270 int i; 9271 9272 offset = pa & PAGE_MASK; 9273 size = round_page(offset + size); 9274 pa = trunc_page(pa); 9275 9276 if (!pmap_initialized) { 9277 va = 0; 9278 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9279 ppim = pmap_preinit_mapping + i; 9280 if (ppim->va == 0) { 9281 ppim->pa = pa; 9282 ppim->sz = size; 9283 ppim->mode = mode; 9284 ppim->va = virtual_avail; 9285 virtual_avail += size; 9286 va = ppim->va; 9287 break; 9288 } 9289 } 9290 if (va == 0) 9291 panic("%s: too many preinit mappings", __func__); 9292 } else { 9293 /* 9294 * If we have a preinit mapping, re-use it. 9295 */ 9296 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9297 ppim = pmap_preinit_mapping + i; 9298 if (ppim->pa == pa && ppim->sz == size && 9299 (ppim->mode == mode || 9300 (flags & MAPDEV_SETATTR) == 0)) 9301 return ((void *)(ppim->va + offset)); 9302 } 9303 /* 9304 * If the specified range of physical addresses fits within 9305 * the direct map window, use the direct map. 9306 */ 9307 if (pa < dmaplimit && pa + size <= dmaplimit) { 9308 va = PHYS_TO_DMAP(pa); 9309 if ((flags & MAPDEV_SETATTR) != 0) { 9310 PMAP_LOCK(kernel_pmap); 9311 i = pmap_change_props_locked(va, size, 9312 PROT_NONE, mode, flags); 9313 PMAP_UNLOCK(kernel_pmap); 9314 } else 9315 i = 0; 9316 if (!i) 9317 return ((void *)(va + offset)); 9318 } 9319 va = kva_alloc(size); 9320 if (va == 0) 9321 panic("%s: Couldn't allocate KVA", __func__); 9322 } 9323 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9324 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9325 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9326 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9327 pmap_invalidate_cache_range(va, va + tmpsize); 9328 return ((void *)(va + offset)); 9329 } 9330 9331 void * 9332 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9333 { 9334 9335 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9336 MAPDEV_SETATTR)); 9337 } 9338 9339 void * 9340 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9341 { 9342 9343 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9344 } 9345 9346 void * 9347 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9348 { 9349 9350 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9351 MAPDEV_SETATTR)); 9352 } 9353 9354 void * 9355 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9356 { 9357 9358 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9359 MAPDEV_FLUSHCACHE)); 9360 } 9361 9362 void 9363 pmap_unmapdev(void *p, vm_size_t size) 9364 { 9365 struct pmap_preinit_mapping *ppim; 9366 vm_offset_t offset, va; 9367 int i; 9368 9369 va = (vm_offset_t)p; 9370 9371 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9372 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9373 return; 9374 offset = va & PAGE_MASK; 9375 size = round_page(offset + size); 9376 va = trunc_page(va); 9377 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9378 ppim = pmap_preinit_mapping + i; 9379 if (ppim->va == va && ppim->sz == size) { 9380 if (pmap_initialized) 9381 return; 9382 ppim->pa = 0; 9383 ppim->va = 0; 9384 ppim->sz = 0; 9385 ppim->mode = 0; 9386 if (va + size == virtual_avail) 9387 virtual_avail = va; 9388 return; 9389 } 9390 } 9391 if (pmap_initialized) { 9392 pmap_qremove(va, atop(size)); 9393 kva_free(va, size); 9394 } 9395 } 9396 9397 /* 9398 * Tries to demote a 1GB page 
mapping. 9399 */ 9400 static boolean_t 9401 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9402 { 9403 pdp_entry_t newpdpe, oldpdpe; 9404 pd_entry_t *firstpde, newpde, *pde; 9405 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9406 vm_paddr_t pdpgpa; 9407 vm_page_t pdpg; 9408 9409 PG_A = pmap_accessed_bit(pmap); 9410 PG_M = pmap_modified_bit(pmap); 9411 PG_V = pmap_valid_bit(pmap); 9412 PG_RW = pmap_rw_bit(pmap); 9413 9414 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9415 oldpdpe = *pdpe; 9416 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9417 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9418 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9419 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9420 if (pdpg == NULL) { 9421 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9422 " in pmap %p", va, pmap); 9423 return (FALSE); 9424 } 9425 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9426 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9427 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9428 KASSERT((oldpdpe & PG_A) != 0, 9429 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9430 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9431 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9432 newpde = oldpdpe; 9433 9434 /* 9435 * Initialize the page directory page. 9436 */ 9437 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9438 *pde = newpde; 9439 newpde += NBPDR; 9440 } 9441 9442 /* 9443 * Demote the mapping. 9444 */ 9445 *pdpe = newpdpe; 9446 9447 /* 9448 * Invalidate a stale recursive mapping of the page directory page. 9449 */ 9450 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9451 9452 counter_u64_add(pmap_pdpe_demotions, 1); 9453 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9454 " in pmap %p", va, pmap); 9455 return (TRUE); 9456 } 9457 9458 /* 9459 * Sets the memory attribute for the specified page. 9460 */ 9461 void 9462 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9463 { 9464 9465 m->md.pat_mode = ma; 9466 9467 /* 9468 * If "m" is a normal page, update its direct mapping. This update 9469 * can be relied upon to perform any cache operations that are 9470 * required for data coherence. 9471 */ 9472 if ((m->flags & PG_FICTITIOUS) == 0 && 9473 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9474 m->md.pat_mode)) 9475 panic("memory attribute change on the direct map failed"); 9476 } 9477 9478 void 9479 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9480 { 9481 int error; 9482 9483 m->md.pat_mode = ma; 9484 9485 if ((m->flags & PG_FICTITIOUS) != 0) 9486 return; 9487 PMAP_LOCK(kernel_pmap); 9488 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9489 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9490 PMAP_UNLOCK(kernel_pmap); 9491 if (error != 0) 9492 panic("memory attribute change on the direct map failed"); 9493 } 9494 9495 /* 9496 * Changes the specified virtual address range's memory type to that given by 9497 * the parameter "mode". The specified virtual address range must be 9498 * completely contained within either the direct map or the kernel map. If 9499 * the virtual address range is contained within the kernel map, then the 9500 * memory type for each of the corresponding ranges of the direct map is also 9501 * changed. (The corresponding ranges of the direct map are those ranges that 9502 * map the same physical pages as the specified virtual address range.) 
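 * (For instance, and purely as an illustration, a call such as
 * pmap_change_attr(va, PAGE_SIZE, PAT_WRITE_COMBINING) on a kernel-map
 * address "va" also switches the direct map alias of the underlying
 * physical page to write-combining.)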
These 9503 * changes to the direct map are necessary because Intel describes the 9504 * behavior of their processors as "undefined" if two or more mappings to the 9505 * same physical page have different memory types. 9506 * 9507 * Returns zero if the change completed successfully, and either EINVAL or 9508 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9509 * of the virtual address range was not mapped, and ENOMEM is returned if 9510 * there was insufficient memory available to complete the change. In the 9511 * latter case, the memory type may have been changed on some part of the 9512 * virtual address range or the direct map. 9513 */ 9514 int 9515 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9516 { 9517 int error; 9518 9519 PMAP_LOCK(kernel_pmap); 9520 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9521 MAPDEV_FLUSHCACHE); 9522 PMAP_UNLOCK(kernel_pmap); 9523 return (error); 9524 } 9525 9526 /* 9527 * Changes the specified virtual address range's protections to those 9528 * specified by "prot". Like pmap_change_attr(), protections for aliases 9529 * in the direct map are updated as well. Protections on aliasing mappings may 9530 * be a subset of the requested protections; for example, mappings in the direct 9531 * map are never executable. 9532 */ 9533 int 9534 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9535 { 9536 int error; 9537 9538 /* Only supported within the kernel map. */ 9539 if (va < VM_MIN_KERNEL_ADDRESS) 9540 return (EINVAL); 9541 9542 PMAP_LOCK(kernel_pmap); 9543 error = pmap_change_props_locked(va, size, prot, -1, 9544 MAPDEV_ASSERTVALID); 9545 PMAP_UNLOCK(kernel_pmap); 9546 return (error); 9547 } 9548 9549 static int 9550 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9551 int mode, int flags) 9552 { 9553 vm_offset_t base, offset, tmpva; 9554 vm_paddr_t pa_start, pa_end, pa_end1; 9555 pdp_entry_t *pdpe; 9556 pd_entry_t *pde, pde_bits, pde_mask; 9557 pt_entry_t *pte, pte_bits, pte_mask; 9558 int error; 9559 bool changed; 9560 9561 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9562 base = trunc_page(va); 9563 offset = va & PAGE_MASK; 9564 size = round_page(offset + size); 9565 9566 /* 9567 * Only supported on kernel virtual addresses, including the direct 9568 * map but excluding the recursive map. 9569 */ 9570 if (base < DMAP_MIN_ADDRESS) 9571 return (EINVAL); 9572 9573 /* 9574 * Construct our flag sets and masks. "bits" is the subset of 9575 * "mask" that will be set in each modified PTE. 9576 * 9577 * Mappings in the direct map are never allowed to be executable. 9578 */ 9579 pde_bits = pte_bits = 0; 9580 pde_mask = pte_mask = 0; 9581 if (mode != -1) { 9582 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9583 pde_mask |= X86_PG_PDE_CACHE; 9584 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9585 pte_mask |= X86_PG_PTE_CACHE; 9586 } 9587 if (prot != VM_PROT_NONE) { 9588 if ((prot & VM_PROT_WRITE) != 0) { 9589 pde_bits |= X86_PG_RW; 9590 pte_bits |= X86_PG_RW; 9591 } 9592 if ((prot & VM_PROT_EXECUTE) == 0 || 9593 va < VM_MIN_KERNEL_ADDRESS) { 9594 pde_bits |= pg_nx; 9595 pte_bits |= pg_nx; 9596 } 9597 pde_mask |= X86_PG_RW | pg_nx; 9598 pte_mask |= X86_PG_RW | pg_nx; 9599 } 9600 9601 /* 9602 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9603 * into 4KB pages if required. 
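 *
 * For example (illustrative only): changing the properties of a single
 * 4KB page that currently lies inside a 2MB mapping forces that
 * mapping to be demoted, whereas a 2MB mapping whose properties
 * already match, or one that is covered in its entirety by the
 * requested range, is handled as a whole and left undemoted.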
9604 */ 9605 for (tmpva = base; tmpva < base + size; ) { 9606 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9607 if (pdpe == NULL || *pdpe == 0) { 9608 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9609 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9610 return (EINVAL); 9611 } 9612 if (*pdpe & PG_PS) { 9613 /* 9614 * If the current 1GB page already has the required 9615 * properties, then we need not demote this page. Just 9616 * increment tmpva to the next 1GB page frame. 9617 */ 9618 if ((*pdpe & pde_mask) == pde_bits) { 9619 tmpva = trunc_1gpage(tmpva) + NBPDP; 9620 continue; 9621 } 9622 9623 /* 9624 * If the current offset aligns with a 1GB page frame 9625 * and there is at least 1GB left within the range, then 9626 * we need not break down this page into 2MB pages. 9627 */ 9628 if ((tmpva & PDPMASK) == 0 && 9629 tmpva + PDPMASK < base + size) { 9630 tmpva += NBPDP; 9631 continue; 9632 } 9633 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9634 return (ENOMEM); 9635 } 9636 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9637 if (*pde == 0) { 9638 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9639 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9640 return (EINVAL); 9641 } 9642 if (*pde & PG_PS) { 9643 /* 9644 * If the current 2MB page already has the required 9645 * properties, then we need not demote this page. Just 9646 * increment tmpva to the next 2MB page frame. 9647 */ 9648 if ((*pde & pde_mask) == pde_bits) { 9649 tmpva = trunc_2mpage(tmpva) + NBPDR; 9650 continue; 9651 } 9652 9653 /* 9654 * If the current offset aligns with a 2MB page frame 9655 * and there is at least 2MB left within the range, then 9656 * we need not break down this page into 4KB pages. 9657 */ 9658 if ((tmpva & PDRMASK) == 0 && 9659 tmpva + PDRMASK < base + size) { 9660 tmpva += NBPDR; 9661 continue; 9662 } 9663 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9664 return (ENOMEM); 9665 } 9666 pte = pmap_pde_to_pte(pde, tmpva); 9667 if (*pte == 0) { 9668 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9669 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9670 return (EINVAL); 9671 } 9672 tmpva += PAGE_SIZE; 9673 } 9674 error = 0; 9675 9676 /* 9677 * Ok, all the pages exist, so run through them updating their 9678 * properties if required. 9679 */ 9680 changed = false; 9681 pa_start = pa_end = 0; 9682 for (tmpva = base; tmpva < base + size; ) { 9683 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9684 if (*pdpe & PG_PS) { 9685 if ((*pdpe & pde_mask) != pde_bits) { 9686 pmap_pte_props(pdpe, pde_bits, pde_mask); 9687 changed = true; 9688 } 9689 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9690 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9691 if (pa_start == pa_end) { 9692 /* Start physical address run. */ 9693 pa_start = *pdpe & PG_PS_FRAME; 9694 pa_end = pa_start + NBPDP; 9695 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9696 pa_end += NBPDP; 9697 else { 9698 /* Run ended, update direct map. */ 9699 error = pmap_change_props_locked( 9700 PHYS_TO_DMAP(pa_start), 9701 pa_end - pa_start, prot, mode, 9702 flags); 9703 if (error != 0) 9704 break; 9705 /* Start physical address run. */ 9706 pa_start = *pdpe & PG_PS_FRAME; 9707 pa_end = pa_start + NBPDP; 9708 } 9709 } 9710 tmpva = trunc_1gpage(tmpva) + NBPDP; 9711 continue; 9712 } 9713 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9714 if (*pde & PG_PS) { 9715 if ((*pde & pde_mask) != pde_bits) { 9716 pmap_pte_props(pde, pde_bits, pde_mask); 9717 changed = true; 9718 } 9719 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9720 (*pde & PG_PS_FRAME) < dmaplimit) { 9721 if (pa_start == pa_end) { 9722 /* Start physical address run. 
*/ 9723 pa_start = *pde & PG_PS_FRAME; 9724 pa_end = pa_start + NBPDR; 9725 } else if (pa_end == (*pde & PG_PS_FRAME)) 9726 pa_end += NBPDR; 9727 else { 9728 /* Run ended, update direct map. */ 9729 error = pmap_change_props_locked( 9730 PHYS_TO_DMAP(pa_start), 9731 pa_end - pa_start, prot, mode, 9732 flags); 9733 if (error != 0) 9734 break; 9735 /* Start physical address run. */ 9736 pa_start = *pde & PG_PS_FRAME; 9737 pa_end = pa_start + NBPDR; 9738 } 9739 } 9740 tmpva = trunc_2mpage(tmpva) + NBPDR; 9741 } else { 9742 pte = pmap_pde_to_pte(pde, tmpva); 9743 if ((*pte & pte_mask) != pte_bits) { 9744 pmap_pte_props(pte, pte_bits, pte_mask); 9745 changed = true; 9746 } 9747 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9748 (*pte & PG_FRAME) < dmaplimit) { 9749 if (pa_start == pa_end) { 9750 /* Start physical address run. */ 9751 pa_start = *pte & PG_FRAME; 9752 pa_end = pa_start + PAGE_SIZE; 9753 } else if (pa_end == (*pte & PG_FRAME)) 9754 pa_end += PAGE_SIZE; 9755 else { 9756 /* Run ended, update direct map. */ 9757 error = pmap_change_props_locked( 9758 PHYS_TO_DMAP(pa_start), 9759 pa_end - pa_start, prot, mode, 9760 flags); 9761 if (error != 0) 9762 break; 9763 /* Start physical address run. */ 9764 pa_start = *pte & PG_FRAME; 9765 pa_end = pa_start + PAGE_SIZE; 9766 } 9767 } 9768 tmpva += PAGE_SIZE; 9769 } 9770 } 9771 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9772 pa_end1 = MIN(pa_end, dmaplimit); 9773 if (pa_start != pa_end1) 9774 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9775 pa_end1 - pa_start, prot, mode, flags); 9776 } 9777 9778 /* 9779 * Flush CPU caches if required to make sure any data isn't cached that 9780 * shouldn't be, etc. 9781 */ 9782 if (changed) { 9783 pmap_invalidate_range(kernel_pmap, base, tmpva); 9784 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9785 pmap_invalidate_cache_range(base, tmpva); 9786 } 9787 return (error); 9788 } 9789 9790 /* 9791 * Demotes any mapping within the direct map region that covers more than the 9792 * specified range of physical addresses. This range's size must be a power 9793 * of two and its starting address must be a multiple of its size. Since the 9794 * demotion does not change any attributes of the mapping, a TLB invalidation 9795 * is not mandatory. The caller may, however, request a TLB invalidation. 
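 *
 * An illustrative call (a sketch, not taken from this file) that
 * ensures no mapping wider than 4KB covers the page-aligned physical
 * address "pa" would be:
 *
 *	pmap_demote_DMAP(pa, PAGE_SIZE, FALSE);
 *
 * with FALSE selecting the no-invalidation behavior described above.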
9796 */ 9797 void 9798 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 9799 { 9800 pdp_entry_t *pdpe; 9801 pd_entry_t *pde; 9802 vm_offset_t va; 9803 boolean_t changed; 9804 9805 if (len == 0) 9806 return; 9807 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9808 KASSERT((base & (len - 1)) == 0, 9809 ("pmap_demote_DMAP: base is not a multiple of len")); 9810 if (len < NBPDP && base < dmaplimit) { 9811 va = PHYS_TO_DMAP(base); 9812 changed = FALSE; 9813 PMAP_LOCK(kernel_pmap); 9814 pdpe = pmap_pdpe(kernel_pmap, va); 9815 if ((*pdpe & X86_PG_V) == 0) 9816 panic("pmap_demote_DMAP: invalid PDPE"); 9817 if ((*pdpe & PG_PS) != 0) { 9818 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9819 panic("pmap_demote_DMAP: PDPE failed"); 9820 changed = TRUE; 9821 } 9822 if (len < NBPDR) { 9823 pde = pmap_pdpe_to_pde(pdpe, va); 9824 if ((*pde & X86_PG_V) == 0) 9825 panic("pmap_demote_DMAP: invalid PDE"); 9826 if ((*pde & PG_PS) != 0) { 9827 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9828 panic("pmap_demote_DMAP: PDE failed"); 9829 changed = TRUE; 9830 } 9831 } 9832 if (changed && invalidate) 9833 pmap_invalidate_page(kernel_pmap, va); 9834 PMAP_UNLOCK(kernel_pmap); 9835 } 9836 } 9837 9838 /* 9839 * Perform the pmap work for mincore(2). If the page is not both referenced and 9840 * modified by this pmap, returns its physical address so that the caller can 9841 * find other mappings. 9842 */ 9843 int 9844 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9845 { 9846 pdp_entry_t *pdpe; 9847 pd_entry_t *pdep; 9848 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 9849 vm_paddr_t pa; 9850 int val; 9851 9852 PG_A = pmap_accessed_bit(pmap); 9853 PG_M = pmap_modified_bit(pmap); 9854 PG_V = pmap_valid_bit(pmap); 9855 PG_RW = pmap_rw_bit(pmap); 9856 9857 PMAP_LOCK(pmap); 9858 pte = 0; 9859 pa = 0; 9860 val = 0; 9861 pdpe = pmap_pdpe(pmap, addr); 9862 if (pdpe == NULL) 9863 goto out; 9864 if ((*pdpe & PG_V) != 0) { 9865 if ((*pdpe & PG_PS) != 0) { 9866 pte = *pdpe; 9867 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 9868 PG_FRAME; 9869 val = MINCORE_PSIND(2); 9870 } else { 9871 pdep = pmap_pde(pmap, addr); 9872 if (pdep != NULL && (*pdep & PG_V) != 0) { 9873 if ((*pdep & PG_PS) != 0) { 9874 pte = *pdep; 9875 /* Compute the physical address of the 4KB page. */ 9876 pa = ((pte & PG_PS_FRAME) | (addr & 9877 PDRMASK)) & PG_FRAME; 9878 val = MINCORE_PSIND(1); 9879 } else { 9880 pte = *pmap_pde_to_pte(pdep, addr); 9881 pa = pte & PG_FRAME; 9882 val = 0; 9883 } 9884 } 9885 } 9886 } 9887 if ((pte & PG_V) != 0) { 9888 val |= MINCORE_INCORE; 9889 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9890 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 9891 if ((pte & PG_A) != 0) 9892 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 9893 } 9894 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 9895 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 9896 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 9897 *pap = pa; 9898 } 9899 out: 9900 PMAP_UNLOCK(pmap); 9901 return (val); 9902 } 9903 9904 static uint64_t 9905 pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 9906 { 9907 uint32_t gen, new_gen, pcid_next; 9908 9909 CRITICAL_ASSERT(curthread); 9910 gen = PCPU_GET(pcid_gen); 9911 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) 9912 return (pti ? 
0 : CR3_PCID_SAVE); 9913 if (pmap->pm_pcids[cpuid].pm_gen == gen) 9914 return (CR3_PCID_SAVE); 9915 pcid_next = PCPU_GET(pcid_next); 9916 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 9917 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 9918 ("cpu %d pcid_next %#x", cpuid, pcid_next)); 9919 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 9920 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 9921 new_gen = gen + 1; 9922 if (new_gen == 0) 9923 new_gen = 1; 9924 PCPU_SET(pcid_gen, new_gen); 9925 pcid_next = PMAP_PCID_KERN + 1; 9926 } else { 9927 new_gen = gen; 9928 } 9929 pmap->pm_pcids[cpuid].pm_pcid = pcid_next; 9930 pmap->pm_pcids[cpuid].pm_gen = new_gen; 9931 PCPU_SET(pcid_next, pcid_next + 1); 9932 return (0); 9933 } 9934 9935 static uint64_t 9936 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) 9937 { 9938 uint64_t cached; 9939 9940 cached = pmap_pcid_alloc(pmap, cpuid); 9941 KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, 9942 ("pmap %p cpu %d pcid %#x", pmap, cpuid, 9943 pmap->pm_pcids[cpuid].pm_pcid)); 9944 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || 9945 pmap == kernel_pmap, 9946 ("non-kernel pmap pmap %p cpu %d pcid %#x", 9947 pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); 9948 return (cached); 9949 } 9950 9951 static void 9952 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 9953 { 9954 9955 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 9956 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; 9957 } 9958 9959 static void 9960 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 9961 { 9962 pmap_t old_pmap; 9963 uint64_t cached, cr3, kcr3, ucr3; 9964 9965 KASSERT((read_rflags() & PSL_I) == 0, 9966 ("PCID needs interrupts disabled in pmap_activate_sw()")); 9967 9968 /* See the comment in pmap_invalidate_page_pcid(). 
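	 * In short (a summary for orientation only; the referenced comment
	 * is authoritative): a cleared CR3_PCID_SAVE bit in ucr3_load_mask
	 * records a pending full flush of the user page-table PCID on the
	 * next return to user mode.  Switching pmaps would lose that
	 * request, so the mask is reset here and the old pmap's per-CPU
	 * PCID generation is zeroed, forcing a fresh PCID, and thus a full
	 * flush, the next time the old pmap is activated.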
*/ 9969 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 9970 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 9971 old_pmap = PCPU_GET(curpmap); 9972 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 9973 old_pmap->pm_pcids[cpuid].pm_gen = 0; 9974 } 9975 9976 cached = pmap_pcid_alloc_checked(pmap, cpuid); 9977 cr3 = rcr3(); 9978 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 9979 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); 9980 PCPU_SET(curpmap, pmap); 9981 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; 9982 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | 9983 PMAP_PCID_USER_PT; 9984 9985 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 9986 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 9987 9988 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 9989 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 9990 if (cached) 9991 counter_u64_add(pcid_save_cnt, 1); 9992 9993 pmap_activate_sw_pti_post(td, pmap); 9994 } 9995 9996 static void 9997 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 9998 u_int cpuid) 9999 { 10000 uint64_t cached, cr3; 10001 10002 KASSERT((read_rflags() & PSL_I) == 0, 10003 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10004 10005 cached = pmap_pcid_alloc_checked(pmap, cpuid); 10006 cr3 = rcr3(); 10007 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10008 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 10009 cached); 10010 PCPU_SET(curpmap, pmap); 10011 if (cached) 10012 counter_u64_add(pcid_save_cnt, 1); 10013 } 10014 10015 static void 10016 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 10017 u_int cpuid __unused) 10018 { 10019 10020 load_cr3(pmap->pm_cr3); 10021 PCPU_SET(curpmap, pmap); 10022 } 10023 10024 static void 10025 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 10026 u_int cpuid __unused) 10027 { 10028 10029 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 10030 PCPU_SET(kcr3, pmap->pm_cr3); 10031 PCPU_SET(ucr3, pmap->pm_ucr3); 10032 pmap_activate_sw_pti_post(td, pmap); 10033 } 10034 10035 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 10036 u_int)) 10037 { 10038 10039 if (pmap_pcid_enabled && pti) 10040 return (pmap_activate_sw_pcid_pti); 10041 else if (pmap_pcid_enabled && !pti) 10042 return (pmap_activate_sw_pcid_nopti); 10043 else if (!pmap_pcid_enabled && pti) 10044 return (pmap_activate_sw_nopcid_pti); 10045 else /* if (!pmap_pcid_enabled && !pti) */ 10046 return (pmap_activate_sw_nopcid_nopti); 10047 } 10048 10049 void 10050 pmap_activate_sw(struct thread *td) 10051 { 10052 pmap_t oldpmap, pmap; 10053 u_int cpuid; 10054 10055 oldpmap = PCPU_GET(curpmap); 10056 pmap = vmspace_pmap(td->td_proc->p_vmspace); 10057 if (oldpmap == pmap) { 10058 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10059 mfence(); 10060 return; 10061 } 10062 cpuid = PCPU_GET(cpuid); 10063 #ifdef SMP 10064 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10065 #else 10066 CPU_SET(cpuid, &pmap->pm_active); 10067 #endif 10068 pmap_activate_sw_mode(td, pmap, cpuid); 10069 #ifdef SMP 10070 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 10071 #else 10072 CPU_CLR(cpuid, &oldpmap->pm_active); 10073 #endif 10074 } 10075 10076 void 10077 pmap_activate(struct thread *td) 10078 { 10079 /* 10080 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10081 * invalidate_all IPI, which checks for curpmap == 10082 * smp_tlb_pmap. The below sequence of operations has a 10083 * window where %CR3 is loaded with the new pmap's PML4 10084 * address, but the curpmap value has not yet been updated. 
10085 * This causes the invltlb IPI handler, which is called 10086 * between the updates, to execute as a NOP, which leaves 10087 * stale TLB entries. 10088 * 10089 * Note that the most common use of pmap_activate_sw(), from 10090 * a context switch, is immune to this race, because 10091 * interrupts are disabled (while the thread lock is owned), 10092 * so the IPI is delayed until after curpmap is updated. Protect 10093 * other callers in a similar way, by disabling interrupts 10094 * around the %cr3 register reload and curpmap assignment. 10095 */ 10096 spinlock_enter(); 10097 pmap_activate_sw(td); 10098 spinlock_exit(); 10099 } 10100 10101 void 10102 pmap_activate_boot(pmap_t pmap) 10103 { 10104 uint64_t kcr3; 10105 u_int cpuid; 10106 10107 /* 10108 * kernel_pmap must be never deactivated, and we ensure that 10109 * by never activating it at all. 10110 */ 10111 MPASS(pmap != kernel_pmap); 10112 10113 cpuid = PCPU_GET(cpuid); 10114 #ifdef SMP 10115 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10116 #else 10117 CPU_SET(cpuid, &pmap->pm_active); 10118 #endif 10119 PCPU_SET(curpmap, pmap); 10120 if (pti) { 10121 kcr3 = pmap->pm_cr3; 10122 if (pmap_pcid_enabled) 10123 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; 10124 } else { 10125 kcr3 = PMAP_NO_CR3; 10126 } 10127 PCPU_SET(kcr3, kcr3); 10128 PCPU_SET(ucr3, PMAP_NO_CR3); 10129 } 10130 10131 void 10132 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 10133 { 10134 } 10135 10136 /* 10137 * Increase the starting virtual address of the given mapping if a 10138 * different alignment might result in more superpage mappings. 10139 */ 10140 void 10141 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10142 vm_offset_t *addr, vm_size_t size) 10143 { 10144 vm_offset_t superpage_offset; 10145 10146 if (size < NBPDR) 10147 return; 10148 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10149 offset += ptoa(object->pg_color); 10150 superpage_offset = offset & PDRMASK; 10151 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10152 (*addr & PDRMASK) == superpage_offset) 10153 return; 10154 if ((*addr & PDRMASK) < superpage_offset) 10155 *addr = (*addr & ~PDRMASK) + superpage_offset; 10156 else 10157 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10158 } 10159 10160 #ifdef INVARIANTS 10161 static unsigned long num_dirty_emulations; 10162 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10163 &num_dirty_emulations, 0, NULL); 10164 10165 static unsigned long num_accessed_emulations; 10166 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10167 &num_accessed_emulations, 0, NULL); 10168 10169 static unsigned long num_superpage_accessed_emulations; 10170 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10171 &num_superpage_accessed_emulations, 0, NULL); 10172 10173 static unsigned long ad_emulation_superpage_promotions; 10174 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10175 &ad_emulation_superpage_promotions, 0, NULL); 10176 #endif /* INVARIANTS */ 10177 10178 int 10179 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10180 { 10181 int rv; 10182 struct rwlock *lock; 10183 #if VM_NRESERVLEVEL > 0 10184 vm_page_t m, mpte; 10185 #endif 10186 pd_entry_t *pde; 10187 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10188 10189 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10190 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10191 10192 if (!pmap_emulate_ad_bits(pmap)) 
10193 return (-1); 10194 10195 PG_A = pmap_accessed_bit(pmap); 10196 PG_M = pmap_modified_bit(pmap); 10197 PG_V = pmap_valid_bit(pmap); 10198 PG_RW = pmap_rw_bit(pmap); 10199 10200 rv = -1; 10201 lock = NULL; 10202 PMAP_LOCK(pmap); 10203 10204 pde = pmap_pde(pmap, va); 10205 if (pde == NULL || (*pde & PG_V) == 0) 10206 goto done; 10207 10208 if ((*pde & PG_PS) != 0) { 10209 if (ftype == VM_PROT_READ) { 10210 #ifdef INVARIANTS 10211 atomic_add_long(&num_superpage_accessed_emulations, 1); 10212 #endif 10213 *pde |= PG_A; 10214 rv = 0; 10215 } 10216 goto done; 10217 } 10218 10219 pte = pmap_pde_to_pte(pde, va); 10220 if ((*pte & PG_V) == 0) 10221 goto done; 10222 10223 if (ftype == VM_PROT_WRITE) { 10224 if ((*pte & PG_RW) == 0) 10225 goto done; 10226 /* 10227 * Set the modified and accessed bits simultaneously. 10228 * 10229 * Intel EPT PTEs that do software emulation of A/D bits map 10230 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10231 * An EPT misconfiguration is triggered if the PTE is writable 10232 * but not readable (WR=10). This is avoided by setting PG_A 10233 * and PG_M simultaneously. 10234 */ 10235 *pte |= PG_M | PG_A; 10236 } else { 10237 *pte |= PG_A; 10238 } 10239 10240 #if VM_NRESERVLEVEL > 0 10241 /* try to promote the mapping */ 10242 if (va < VM_MAXUSER_ADDRESS) 10243 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10244 else 10245 mpte = NULL; 10246 10247 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10248 10249 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10250 pmap_ps_enabled(pmap) && 10251 (m->flags & PG_FICTITIOUS) == 0 && 10252 vm_reserv_level_iffullpop(m) == 0) { 10253 pmap_promote_pde(pmap, pde, va, mpte, &lock); 10254 #ifdef INVARIANTS 10255 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10256 #endif 10257 } 10258 #endif 10259 10260 #ifdef INVARIANTS 10261 if (ftype == VM_PROT_WRITE) 10262 atomic_add_long(&num_dirty_emulations, 1); 10263 else 10264 atomic_add_long(&num_accessed_emulations, 1); 10265 #endif 10266 rv = 0; /* success */ 10267 done: 10268 if (lock != NULL) 10269 rw_wunlock(lock); 10270 PMAP_UNLOCK(pmap); 10271 return (rv); 10272 } 10273 10274 void 10275 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10276 { 10277 pml4_entry_t *pml4; 10278 pdp_entry_t *pdp; 10279 pd_entry_t *pde; 10280 pt_entry_t *pte, PG_V; 10281 int idx; 10282 10283 idx = 0; 10284 PG_V = pmap_valid_bit(pmap); 10285 PMAP_LOCK(pmap); 10286 10287 pml4 = pmap_pml4e(pmap, va); 10288 if (pml4 == NULL) 10289 goto done; 10290 ptr[idx++] = *pml4; 10291 if ((*pml4 & PG_V) == 0) 10292 goto done; 10293 10294 pdp = pmap_pml4e_to_pdpe(pml4, va); 10295 ptr[idx++] = *pdp; 10296 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10297 goto done; 10298 10299 pde = pmap_pdpe_to_pde(pdp, va); 10300 ptr[idx++] = *pde; 10301 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10302 goto done; 10303 10304 pte = pmap_pde_to_pte(pde, va); 10305 ptr[idx++] = *pte; 10306 10307 done: 10308 PMAP_UNLOCK(pmap); 10309 *num = idx; 10310 } 10311 10312 /** 10313 * Get the kernel virtual address of a set of physical pages. If there are 10314 * physical addresses not covered by the DMAP perform a transient mapping 10315 * that will be removed when calling pmap_unmap_io_transient. 10316 * 10317 * \param page The pages the caller wishes to obtain the virtual 10318 * address on the kernel memory map. 10319 * \param vaddr On return contains the kernel virtual memory address 10320 * of the pages passed in the page parameter. 10321 * \param count Number of pages passed in. 
10322 * \param can_fault TRUE if the thread using the mapped pages can take 10323 * page faults, FALSE otherwise. 10324 * 10325 * \returns TRUE if the caller must call pmap_unmap_io_transient when 10326 * finished or FALSE otherwise. 10327 * 10328 */ 10329 boolean_t 10330 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10331 boolean_t can_fault) 10332 { 10333 vm_paddr_t paddr; 10334 boolean_t needs_mapping; 10335 pt_entry_t *pte; 10336 int cache_bits, error __unused, i; 10337 10338 /* 10339 * Allocate any KVA space that we need, this is done in a separate 10340 * loop to prevent calling vmem_alloc while pinned. 10341 */ 10342 needs_mapping = FALSE; 10343 for (i = 0; i < count; i++) { 10344 paddr = VM_PAGE_TO_PHYS(page[i]); 10345 if (__predict_false(paddr >= dmaplimit)) { 10346 error = vmem_alloc(kernel_arena, PAGE_SIZE, 10347 M_BESTFIT | M_WAITOK, &vaddr[i]); 10348 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 10349 needs_mapping = TRUE; 10350 } else { 10351 vaddr[i] = PHYS_TO_DMAP(paddr); 10352 } 10353 } 10354 10355 /* Exit early if everything is covered by the DMAP */ 10356 if (!needs_mapping) 10357 return (FALSE); 10358 10359 /* 10360 * NB: The sequence of updating a page table followed by accesses 10361 * to the corresponding pages used in the !DMAP case is subject to 10362 * the situation described in the "AMD64 Architecture Programmer's 10363 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 10364 * Coherency Considerations". Therefore, issuing the INVLPG right 10365 * after modifying the PTE bits is crucial. 10366 */ 10367 if (!can_fault) 10368 sched_pin(); 10369 for (i = 0; i < count; i++) { 10370 paddr = VM_PAGE_TO_PHYS(page[i]); 10371 if (paddr >= dmaplimit) { 10372 if (can_fault) { 10373 /* 10374 * Slow path, since we can get page faults 10375 * while mappings are active don't pin the 10376 * thread to the CPU and instead add a global 10377 * mapping visible to all CPUs. 10378 */ 10379 pmap_qenter(vaddr[i], &page[i], 1); 10380 } else { 10381 pte = vtopte(vaddr[i]); 10382 cache_bits = pmap_cache_bits(kernel_pmap, 10383 page[i]->md.pat_mode, 0); 10384 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 10385 cache_bits); 10386 pmap_invlpg(kernel_pmap, vaddr[i]); 10387 } 10388 } 10389 } 10390 10391 return (needs_mapping); 10392 } 10393 10394 void 10395 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10396 boolean_t can_fault) 10397 { 10398 vm_paddr_t paddr; 10399 int i; 10400 10401 if (!can_fault) 10402 sched_unpin(); 10403 for (i = 0; i < count; i++) { 10404 paddr = VM_PAGE_TO_PHYS(page[i]); 10405 if (paddr >= dmaplimit) { 10406 if (can_fault) 10407 pmap_qremove(vaddr[i], 1); 10408 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 10409 } 10410 } 10411 } 10412 10413 vm_offset_t 10414 pmap_quick_enter_page(vm_page_t m) 10415 { 10416 vm_paddr_t paddr; 10417 10418 paddr = VM_PAGE_TO_PHYS(m); 10419 if (paddr < dmaplimit) 10420 return (PHYS_TO_DMAP(paddr)); 10421 mtx_lock_spin(&qframe_mtx); 10422 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 10423 10424 /* 10425 * Since qframe is exclusively mapped by us, and we do not set 10426 * PG_G, we can use INVLPG here. 
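	 *
	 * (An illustrative caller pattern, a sketch rather than a contract:
	 *
	 *	va = pmap_quick_enter_page(m);
	 *	... brief, non-sleeping access to the page through "va" ...
	 *	pmap_quick_remove_page(va);
	 *
	 * While the qframe mapping is in use the spin mutex is held, so
	 * the caller must not sleep between enter and remove.)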
10427 */ 10428 invlpg(qframe); 10429 10430 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10431 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 10432 return (qframe); 10433 } 10434 10435 void 10436 pmap_quick_remove_page(vm_offset_t addr) 10437 { 10438 10439 if (addr != qframe) 10440 return; 10441 pte_store(vtopte(qframe), 0); 10442 mtx_unlock_spin(&qframe_mtx); 10443 } 10444 10445 /* 10446 * Pdp pages from the large map are managed differently from either 10447 * kernel or user page table pages. They are permanently allocated at 10448 * initialization time, and their reference count is permanently set to 10449 * zero. The pml4 entries pointing to those pages are copied into 10450 * each allocated pmap. 10451 * 10452 * In contrast, pd and pt pages are managed like user page table 10453 * pages. They are dynamically allocated, and their reference count 10454 * represents the number of valid entries within the page. 10455 */ 10456 static vm_page_t 10457 pmap_large_map_getptp_unlocked(void) 10458 { 10459 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10460 } 10461 10462 static vm_page_t 10463 pmap_large_map_getptp(void) 10464 { 10465 vm_page_t m; 10466 10467 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10468 m = pmap_large_map_getptp_unlocked(); 10469 if (m == NULL) { 10470 PMAP_UNLOCK(kernel_pmap); 10471 vm_wait(NULL); 10472 PMAP_LOCK(kernel_pmap); 10473 /* Callers retry. */ 10474 } 10475 return (m); 10476 } 10477 10478 static pdp_entry_t * 10479 pmap_large_map_pdpe(vm_offset_t va) 10480 { 10481 vm_pindex_t pml4_idx; 10482 vm_paddr_t mphys; 10483 10484 pml4_idx = pmap_pml4e_index(va); 10485 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10486 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10487 "%#jx lm_ents %d", 10488 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10489 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10490 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10491 "LMSPML4I %#jx lm_ents %d", 10492 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10493 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10494 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10495 } 10496 10497 static pd_entry_t * 10498 pmap_large_map_pde(vm_offset_t va) 10499 { 10500 pdp_entry_t *pdpe; 10501 vm_page_t m; 10502 vm_paddr_t mphys; 10503 10504 retry: 10505 pdpe = pmap_large_map_pdpe(va); 10506 if (*pdpe == 0) { 10507 m = pmap_large_map_getptp(); 10508 if (m == NULL) 10509 goto retry; 10510 mphys = VM_PAGE_TO_PHYS(m); 10511 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10512 } else { 10513 MPASS((*pdpe & X86_PG_PS) == 0); 10514 mphys = *pdpe & PG_FRAME; 10515 } 10516 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10517 } 10518 10519 static pt_entry_t * 10520 pmap_large_map_pte(vm_offset_t va) 10521 { 10522 pd_entry_t *pde; 10523 vm_page_t m; 10524 vm_paddr_t mphys; 10525 10526 retry: 10527 pde = pmap_large_map_pde(va); 10528 if (*pde == 0) { 10529 m = pmap_large_map_getptp(); 10530 if (m == NULL) 10531 goto retry; 10532 mphys = VM_PAGE_TO_PHYS(m); 10533 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10534 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10535 } else { 10536 MPASS((*pde & X86_PG_PS) == 0); 10537 mphys = *pde & PG_FRAME; 10538 } 10539 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10540 } 10541 10542 static vm_paddr_t 10543 pmap_large_map_kextract(vm_offset_t va) 10544 { 10545 pdp_entry_t *pdpe, pdp; 10546 
pd_entry_t *pde, pd; 10547 pt_entry_t *pte, pt; 10548 10549 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10550 ("not largemap range %#lx", (u_long)va)); 10551 pdpe = pmap_large_map_pdpe(va); 10552 pdp = *pdpe; 10553 KASSERT((pdp & X86_PG_V) != 0, 10554 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10555 (u_long)pdpe, pdp)); 10556 if ((pdp & X86_PG_PS) != 0) { 10557 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10558 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10559 (u_long)pdpe, pdp)); 10560 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10561 } 10562 pde = pmap_pdpe_to_pde(pdpe, va); 10563 pd = *pde; 10564 KASSERT((pd & X86_PG_V) != 0, 10565 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10566 if ((pd & X86_PG_PS) != 0) 10567 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10568 pte = pmap_pde_to_pte(pde, va); 10569 pt = *pte; 10570 KASSERT((pt & X86_PG_V) != 0, 10571 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 10572 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 10573 } 10574 10575 static int 10576 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 10577 vmem_addr_t *vmem_res) 10578 { 10579 10580 /* 10581 * Large mappings are all but static. Consequently, there 10582 * is no point in waiting for an earlier allocation to be 10583 * freed. 10584 */ 10585 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 10586 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 10587 } 10588 10589 int 10590 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 10591 vm_memattr_t mattr) 10592 { 10593 pdp_entry_t *pdpe; 10594 pd_entry_t *pde; 10595 pt_entry_t *pte; 10596 vm_offset_t va, inc; 10597 vmem_addr_t vmem_res; 10598 vm_paddr_t pa; 10599 int error; 10600 10601 if (len == 0 || spa + len < spa) 10602 return (EINVAL); 10603 10604 /* See if DMAP can serve. */ 10605 if (spa + len <= dmaplimit) { 10606 va = PHYS_TO_DMAP(spa); 10607 *addr = (void *)va; 10608 return (pmap_change_attr(va, len, mattr)); 10609 } 10610 10611 /* 10612 * No, allocate KVA. Fit the address with best possible 10613 * alignment for superpages. Fall back to worse align if 10614 * failed. 10615 */ 10616 error = ENOMEM; 10617 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 10618 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 10619 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 10620 &vmem_res); 10621 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 10622 NBPDR) + NBPDR) 10623 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 10624 &vmem_res); 10625 if (error != 0) 10626 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 10627 if (error != 0) 10628 return (error); 10629 10630 /* 10631 * Fill pagetable. PG_M is not pre-set, we scan modified bits 10632 * in the pagetable to minimize flushing. No need to 10633 * invalidate TLB, since we only update invalid entries. 
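	 *
	 * (For orientation, an illustrative end-to-end use of this
	 * interface, a sketch rather than code taken from a real consumer:
	 *
	 *	error = pmap_large_map(spa, len, &va, VM_MEMATTR_DEFAULT);
	 *	if (error == 0) {
	 *		... write through the void * "va" ...
	 *		pmap_large_map_wb(va, len);
	 *		pmap_large_unmap(va, len);
	 *	}
	 *
	 * subject to the rules documented at pmap_large_map_wb() below.)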
10634 */ 10635 PMAP_LOCK(kernel_pmap); 10636 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10637 len -= inc) { 10638 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10639 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10640 pdpe = pmap_large_map_pdpe(va); 10641 MPASS(*pdpe == 0); 10642 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10643 X86_PG_V | X86_PG_A | pg_nx | 10644 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10645 inc = NBPDP; 10646 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10647 (va & PDRMASK) == 0) { 10648 pde = pmap_large_map_pde(va); 10649 MPASS(*pde == 0); 10650 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10651 X86_PG_V | X86_PG_A | pg_nx | 10652 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10653 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10654 ref_count++; 10655 inc = NBPDR; 10656 } else { 10657 pte = pmap_large_map_pte(va); 10658 MPASS(*pte == 0); 10659 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10660 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10661 mattr, FALSE); 10662 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10663 ref_count++; 10664 inc = PAGE_SIZE; 10665 } 10666 } 10667 PMAP_UNLOCK(kernel_pmap); 10668 MPASS(len == 0); 10669 10670 *addr = (void *)vmem_res; 10671 return (0); 10672 } 10673 10674 void 10675 pmap_large_unmap(void *svaa, vm_size_t len) 10676 { 10677 vm_offset_t sva, va; 10678 vm_size_t inc; 10679 pdp_entry_t *pdpe, pdp; 10680 pd_entry_t *pde, pd; 10681 pt_entry_t *pte; 10682 vm_page_t m; 10683 struct spglist spgf; 10684 10685 sva = (vm_offset_t)svaa; 10686 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10687 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10688 return; 10689 10690 SLIST_INIT(&spgf); 10691 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10692 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10693 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10694 PMAP_LOCK(kernel_pmap); 10695 for (va = sva; va < sva + len; va += inc) { 10696 pdpe = pmap_large_map_pdpe(va); 10697 pdp = *pdpe; 10698 KASSERT((pdp & X86_PG_V) != 0, 10699 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10700 (u_long)pdpe, pdp)); 10701 if ((pdp & X86_PG_PS) != 0) { 10702 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10703 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10704 (u_long)pdpe, pdp)); 10705 KASSERT((va & PDPMASK) == 0, 10706 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10707 (u_long)pdpe, pdp)); 10708 KASSERT(va + NBPDP <= sva + len, 10709 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10710 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10711 (u_long)pdpe, pdp, len)); 10712 *pdpe = 0; 10713 inc = NBPDP; 10714 continue; 10715 } 10716 pde = pmap_pdpe_to_pde(pdpe, va); 10717 pd = *pde; 10718 KASSERT((pd & X86_PG_V) != 0, 10719 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10720 (u_long)pde, pd)); 10721 if ((pd & X86_PG_PS) != 0) { 10722 KASSERT((va & PDRMASK) == 0, 10723 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10724 (u_long)pde, pd)); 10725 KASSERT(va + NBPDR <= sva + len, 10726 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10727 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10728 pd, len)); 10729 pde_store(pde, 0); 10730 inc = NBPDR; 10731 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10732 m->ref_count--; 10733 if (m->ref_count == 0) { 10734 *pdpe = 0; 10735 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10736 } 10737 continue; 10738 } 10739 pte = pmap_pde_to_pte(pde, va); 10740 KASSERT((*pte & X86_PG_V) != 0, 10741 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10742 (u_long)pte, *pte)); 
10743 pte_clear(pte); 10744 inc = PAGE_SIZE; 10745 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10746 m->ref_count--; 10747 if (m->ref_count == 0) { 10748 *pde = 0; 10749 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10750 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10751 m->ref_count--; 10752 if (m->ref_count == 0) { 10753 *pdpe = 0; 10754 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10755 } 10756 } 10757 } 10758 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10759 PMAP_UNLOCK(kernel_pmap); 10760 vm_page_free_pages_toq(&spgf, false); 10761 vmem_free(large_vmem, sva, len); 10762 } 10763 10764 static void 10765 pmap_large_map_wb_fence_mfence(void) 10766 { 10767 10768 mfence(); 10769 } 10770 10771 static void 10772 pmap_large_map_wb_fence_atomic(void) 10773 { 10774 10775 atomic_thread_fence_seq_cst(); 10776 } 10777 10778 static void 10779 pmap_large_map_wb_fence_nop(void) 10780 { 10781 } 10782 10783 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10784 { 10785 10786 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10787 return (pmap_large_map_wb_fence_mfence); 10788 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10789 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10790 return (pmap_large_map_wb_fence_atomic); 10791 else 10792 /* clflush is strongly enough ordered */ 10793 return (pmap_large_map_wb_fence_nop); 10794 } 10795 10796 static void 10797 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10798 { 10799 10800 for (; len > 0; len -= cpu_clflush_line_size, 10801 va += cpu_clflush_line_size) 10802 clwb(va); 10803 } 10804 10805 static void 10806 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10807 { 10808 10809 for (; len > 0; len -= cpu_clflush_line_size, 10810 va += cpu_clflush_line_size) 10811 clflushopt(va); 10812 } 10813 10814 static void 10815 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10816 { 10817 10818 for (; len > 0; len -= cpu_clflush_line_size, 10819 va += cpu_clflush_line_size) 10820 clflush(va); 10821 } 10822 10823 static void 10824 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 10825 { 10826 } 10827 10828 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 10829 { 10830 10831 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 10832 return (pmap_large_map_flush_range_clwb); 10833 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 10834 return (pmap_large_map_flush_range_clflushopt); 10835 else if ((cpu_feature & CPUID_CLFSH) != 0) 10836 return (pmap_large_map_flush_range_clflush); 10837 else 10838 return (pmap_large_map_flush_range_nop); 10839 } 10840 10841 static void 10842 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 10843 { 10844 volatile u_long *pe; 10845 u_long p; 10846 vm_offset_t va; 10847 vm_size_t inc; 10848 bool seen_other; 10849 10850 for (va = sva; va < eva; va += inc) { 10851 inc = 0; 10852 if ((amd_feature & AMDID_PAGE1GB) != 0) { 10853 pe = (volatile u_long *)pmap_large_map_pdpe(va); 10854 p = *pe; 10855 if ((p & X86_PG_PS) != 0) 10856 inc = NBPDP; 10857 } 10858 if (inc == 0) { 10859 pe = (volatile u_long *)pmap_large_map_pde(va); 10860 p = *pe; 10861 if ((p & X86_PG_PS) != 0) 10862 inc = NBPDR; 10863 } 10864 if (inc == 0) { 10865 pe = (volatile u_long *)pmap_large_map_pte(va); 10866 p = *pe; 10867 inc = PAGE_SIZE; 10868 } 10869 seen_other = false; 10870 for (;;) { 10871 if ((p & X86_PG_AVAIL1) != 0) { 10872 /* 10873 * Spin-wait for the end of a parallel 10874 * write-back. 
10875 */ 10876 cpu_spinwait(); 10877 p = *pe; 10878 10879 /* 10880 * If we saw other write-back 10881 * occuring, we cannot rely on PG_M to 10882 * indicate state of the cache. The 10883 * PG_M bit is cleared before the 10884 * flush to avoid ignoring new writes, 10885 * and writes which are relevant for 10886 * us might happen after. 10887 */ 10888 seen_other = true; 10889 continue; 10890 } 10891 10892 if ((p & X86_PG_M) != 0 || seen_other) { 10893 if (!atomic_fcmpset_long(pe, &p, 10894 (p & ~X86_PG_M) | X86_PG_AVAIL1)) 10895 /* 10896 * If we saw PG_M without 10897 * PG_AVAIL1, and then on the 10898 * next attempt we do not 10899 * observe either PG_M or 10900 * PG_AVAIL1, the other 10901 * write-back started after us 10902 * and finished before us. We 10903 * can rely on it doing our 10904 * work. 10905 */ 10906 continue; 10907 pmap_large_map_flush_range(va, inc); 10908 atomic_clear_long(pe, X86_PG_AVAIL1); 10909 } 10910 break; 10911 } 10912 maybe_yield(); 10913 } 10914 } 10915 10916 /* 10917 * Write-back cache lines for the given address range. 10918 * 10919 * Must be called only on the range or sub-range returned from 10920 * pmap_large_map(). Must not be called on the coalesced ranges. 10921 * 10922 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH 10923 * instructions support. 10924 */ 10925 void 10926 pmap_large_map_wb(void *svap, vm_size_t len) 10927 { 10928 vm_offset_t eva, sva; 10929 10930 sva = (vm_offset_t)svap; 10931 eva = sva + len; 10932 pmap_large_map_wb_fence(); 10933 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { 10934 pmap_large_map_flush_range(sva, len); 10935 } else { 10936 KASSERT(sva >= LARGEMAP_MIN_ADDRESS && 10937 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, 10938 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); 10939 pmap_large_map_wb_large(sva, eva); 10940 } 10941 pmap_large_map_wb_fence(); 10942 } 10943 10944 static vm_page_t 10945 pmap_pti_alloc_page(void) 10946 { 10947 vm_page_t m; 10948 10949 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 10950 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO); 10951 return (m); 10952 } 10953 10954 static bool 10955 pmap_pti_free_page(vm_page_t m) 10956 { 10957 if (!vm_page_unwire_noq(m)) 10958 return (false); 10959 vm_page_xbusy_claim(m); 10960 vm_page_free_zero(m); 10961 return (true); 10962 } 10963 10964 static void 10965 pmap_pti_init(void) 10966 { 10967 vm_page_t pml4_pg; 10968 pdp_entry_t *pdpe; 10969 vm_offset_t va; 10970 int i; 10971 10972 if (!pti) 10973 return; 10974 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 10975 VM_OBJECT_WLOCK(pti_obj); 10976 pml4_pg = pmap_pti_alloc_page(); 10977 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 10978 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 10979 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 10980 pdpe = pmap_pti_pdpe(va); 10981 pmap_pti_wire_pte(pdpe); 10982 } 10983 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 10984 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 10985 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 10986 sizeof(struct gate_descriptor) * NIDT, false); 10987 CPU_FOREACH(i) { 10988 /* Doublefault stack IST 1 */ 10989 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu); 10990 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false); 10991 /* NMI stack IST 2 */ 10992 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); 10993 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, 
va, false); 10994 /* MC# stack IST 3 */ 10995 va = __pcpu[i].pc_common_tss.tss_ist3 + 10996 sizeof(struct nmi_pcpu); 10997 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 10998 /* DB# stack IST 4 */ 10999 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 11000 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 11001 } 11002 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 11003 true); 11004 pti_finalized = true; 11005 VM_OBJECT_WUNLOCK(pti_obj); 11006 } 11007 11008 static void 11009 pmap_cpu_init(void *arg __unused) 11010 { 11011 CPU_COPY(&all_cpus, &kernel_pmap->pm_active); 11012 pmap_pti_init(); 11013 } 11014 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL); 11015 11016 static pdp_entry_t * 11017 pmap_pti_pdpe(vm_offset_t va) 11018 { 11019 pml4_entry_t *pml4e; 11020 pdp_entry_t *pdpe; 11021 vm_page_t m; 11022 vm_pindex_t pml4_idx; 11023 vm_paddr_t mphys; 11024 11025 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11026 11027 pml4_idx = pmap_pml4e_index(va); 11028 pml4e = &pti_pml4[pml4_idx]; 11029 m = NULL; 11030 if (*pml4e == 0) { 11031 if (pti_finalized) 11032 panic("pml4 alloc after finalization\n"); 11033 m = pmap_pti_alloc_page(); 11034 if (*pml4e != 0) { 11035 pmap_pti_free_page(m); 11036 mphys = *pml4e & ~PAGE_MASK; 11037 } else { 11038 mphys = VM_PAGE_TO_PHYS(m); 11039 *pml4e = mphys | X86_PG_RW | X86_PG_V; 11040 } 11041 } else { 11042 mphys = *pml4e & ~PAGE_MASK; 11043 } 11044 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 11045 return (pdpe); 11046 } 11047 11048 static void 11049 pmap_pti_wire_pte(void *pte) 11050 { 11051 vm_page_t m; 11052 11053 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11054 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11055 m->ref_count++; 11056 } 11057 11058 static void 11059 pmap_pti_unwire_pde(void *pde, bool only_ref) 11060 { 11061 vm_page_t m; 11062 11063 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11064 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 11065 MPASS(only_ref || m->ref_count > 1); 11066 pmap_pti_free_page(m); 11067 } 11068 11069 static void 11070 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 11071 { 11072 vm_page_t m; 11073 pd_entry_t *pde; 11074 11075 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11076 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11077 if (pmap_pti_free_page(m)) { 11078 pde = pmap_pti_pde(va); 11079 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 11080 *pde = 0; 11081 pmap_pti_unwire_pde(pde, false); 11082 } 11083 } 11084 11085 static pd_entry_t * 11086 pmap_pti_pde(vm_offset_t va) 11087 { 11088 pdp_entry_t *pdpe; 11089 pd_entry_t *pde; 11090 vm_page_t m; 11091 vm_pindex_t pd_idx; 11092 vm_paddr_t mphys; 11093 11094 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11095 11096 pdpe = pmap_pti_pdpe(va); 11097 if (*pdpe == 0) { 11098 m = pmap_pti_alloc_page(); 11099 if (*pdpe != 0) { 11100 pmap_pti_free_page(m); 11101 MPASS((*pdpe & X86_PG_PS) == 0); 11102 mphys = *pdpe & ~PAGE_MASK; 11103 } else { 11104 mphys = VM_PAGE_TO_PHYS(m); 11105 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11106 } 11107 } else { 11108 MPASS((*pdpe & X86_PG_PS) == 0); 11109 mphys = *pdpe & ~PAGE_MASK; 11110 } 11111 11112 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11113 pd_idx = pmap_pde_index(va); 11114 pde += pd_idx; 11115 return (pde); 11116 } 11117 11118 static pt_entry_t * 11119 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11120 { 11121 pd_entry_t *pde; 11122 pt_entry_t *pte; 11123 vm_page_t m; 11124 vm_paddr_t mphys; 11125 11126 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11127 11128 pde = pmap_pti_pde(va); 
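	/*
	 * A note on the reference protocol below, as read from this
	 * function and its caller pmap_pti_add_kva_locked(): when the
	 * caller passes a non-NULL "unwire_pde", a temporary reference is
	 * taken on the page directory page up front.  It is retained
	 * ("*unwire_pde" is set back to false) only if a new page table
	 * page is installed into a previously empty PD entry, so that the
	 * now-valid entry keeps its containing PD page held; otherwise the
	 * caller is expected to drop the reference via
	 * pmap_pti_unwire_pde().
	 */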
11129 if (unwire_pde != NULL) { 11130 *unwire_pde = true; 11131 pmap_pti_wire_pte(pde); 11132 } 11133 if (*pde == 0) { 11134 m = pmap_pti_alloc_page(); 11135 if (*pde != 0) { 11136 pmap_pti_free_page(m); 11137 MPASS((*pde & X86_PG_PS) == 0); 11138 mphys = *pde & ~(PAGE_MASK | pg_nx); 11139 } else { 11140 mphys = VM_PAGE_TO_PHYS(m); 11141 *pde = mphys | X86_PG_RW | X86_PG_V; 11142 if (unwire_pde != NULL) 11143 *unwire_pde = false; 11144 } 11145 } else { 11146 MPASS((*pde & X86_PG_PS) == 0); 11147 mphys = *pde & ~(PAGE_MASK | pg_nx); 11148 } 11149 11150 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11151 pte += pmap_pte_index(va); 11152 11153 return (pte); 11154 } 11155 11156 static void 11157 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11158 { 11159 vm_paddr_t pa; 11160 pd_entry_t *pde; 11161 pt_entry_t *pte, ptev; 11162 bool unwire_pde; 11163 11164 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11165 11166 sva = trunc_page(sva); 11167 MPASS(sva > VM_MAXUSER_ADDRESS); 11168 eva = round_page(eva); 11169 MPASS(sva < eva); 11170 for (; sva < eva; sva += PAGE_SIZE) { 11171 pte = pmap_pti_pte(sva, &unwire_pde); 11172 pa = pmap_kextract(sva); 11173 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11174 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11175 VM_MEMATTR_DEFAULT, FALSE); 11176 if (*pte == 0) { 11177 pte_store(pte, ptev); 11178 pmap_pti_wire_pte(pte); 11179 } else { 11180 KASSERT(!pti_finalized, 11181 ("pti overlap after fin %#lx %#lx %#lx", 11182 sva, *pte, ptev)); 11183 KASSERT(*pte == ptev, 11184 ("pti non-identical pte after fin %#lx %#lx %#lx", 11185 sva, *pte, ptev)); 11186 } 11187 if (unwire_pde) { 11188 pde = pmap_pti_pde(sva); 11189 pmap_pti_unwire_pde(pde, true); 11190 } 11191 } 11192 } 11193 11194 void 11195 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11196 { 11197 11198 if (!pti) 11199 return; 11200 VM_OBJECT_WLOCK(pti_obj); 11201 pmap_pti_add_kva_locked(sva, eva, exec); 11202 VM_OBJECT_WUNLOCK(pti_obj); 11203 } 11204 11205 void 11206 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11207 { 11208 pt_entry_t *pte; 11209 vm_offset_t va; 11210 11211 if (!pti) 11212 return; 11213 sva = rounddown2(sva, PAGE_SIZE); 11214 MPASS(sva > VM_MAXUSER_ADDRESS); 11215 eva = roundup2(eva, PAGE_SIZE); 11216 MPASS(sva < eva); 11217 VM_OBJECT_WLOCK(pti_obj); 11218 for (va = sva; va < eva; va += PAGE_SIZE) { 11219 pte = pmap_pti_pte(va, NULL); 11220 KASSERT((*pte & X86_PG_V) != 0, 11221 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11222 (u_long)pte, *pte)); 11223 pte_clear(pte); 11224 pmap_pti_unwire_pte(pte, va); 11225 } 11226 pmap_invalidate_range(kernel_pmap, sva, eva); 11227 VM_OBJECT_WUNLOCK(pti_obj); 11228 } 11229 11230 static void * 11231 pkru_dup_range(void *ctx __unused, void *data) 11232 { 11233 struct pmap_pkru_range *node, *new_node; 11234 11235 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11236 if (new_node == NULL) 11237 return (NULL); 11238 node = data; 11239 memcpy(new_node, node, sizeof(*node)); 11240 return (new_node); 11241 } 11242 11243 static void 11244 pkru_free_range(void *ctx __unused, void *node) 11245 { 11246 11247 uma_zfree(pmap_pkru_ranges_zone, node); 11248 } 11249 11250 static int 11251 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11252 int flags) 11253 { 11254 struct pmap_pkru_range *ppr; 11255 int error; 11256 11257 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11258 MPASS(pmap->pm_type == PT_X86); 11259 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11260 if ((flags & 
AMD64_PKRU_EXCL) != 0 && 11261 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11262 return (EBUSY); 11263 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11264 if (ppr == NULL) 11265 return (ENOMEM); 11266 ppr->pkru_keyidx = keyidx; 11267 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11268 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11269 if (error != 0) 11270 uma_zfree(pmap_pkru_ranges_zone, ppr); 11271 return (error); 11272 } 11273 11274 static int 11275 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11276 { 11277 11278 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11279 MPASS(pmap->pm_type == PT_X86); 11280 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11281 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11282 } 11283 11284 static void 11285 pmap_pkru_deassign_all(pmap_t pmap) 11286 { 11287 11288 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11289 if (pmap->pm_type == PT_X86 && 11290 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11291 rangeset_remove_all(&pmap->pm_pkru); 11292 } 11293 11294 static bool 11295 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11296 { 11297 struct pmap_pkru_range *ppr, *prev_ppr; 11298 vm_offset_t va; 11299 11300 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11301 if (pmap->pm_type != PT_X86 || 11302 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11303 sva >= VM_MAXUSER_ADDRESS) 11304 return (true); 11305 MPASS(eva <= VM_MAXUSER_ADDRESS); 11306 for (va = sva; va < eva; prev_ppr = ppr) { 11307 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11308 if (va == sva) 11309 prev_ppr = ppr; 11310 else if ((ppr == NULL) ^ (prev_ppr == NULL)) 11311 return (false); 11312 if (ppr == NULL) { 11313 va += PAGE_SIZE; 11314 continue; 11315 } 11316 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 11317 return (false); 11318 va = ppr->pkru_rs_el.re_end; 11319 } 11320 return (true); 11321 } 11322 11323 static pt_entry_t 11324 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11325 { 11326 struct pmap_pkru_range *ppr; 11327 11328 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11329 if (pmap->pm_type != PT_X86 || 11330 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11331 va >= VM_MAXUSER_ADDRESS) 11332 return (0); 11333 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11334 if (ppr != NULL) 11335 return (X86_PG_PKU(ppr->pkru_keyidx)); 11336 return (0); 11337 } 11338 11339 static bool 11340 pred_pkru_on_remove(void *ctx __unused, void *r) 11341 { 11342 struct pmap_pkru_range *ppr; 11343 11344 ppr = r; 11345 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11346 } 11347 11348 static void 11349 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11350 { 11351 11352 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11353 if (pmap->pm_type == PT_X86 && 11354 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11355 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11356 pred_pkru_on_remove); 11357 } 11358 } 11359 11360 static int 11361 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11362 { 11363 11364 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11365 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11366 MPASS(dst_pmap->pm_type == PT_X86); 11367 MPASS(src_pmap->pm_type == PT_X86); 11368 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11369 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11370 return (0); 11371 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11372 } 11373 11374 static void 11375 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11376 u_int keyidx) 11377 { 11378 pml4_entry_t *pml4e; 11379 pdp_entry_t *pdpe; 11380 pd_entry_t newpde, ptpaddr, *pde; 
11381 pt_entry_t newpte, *ptep, pte; 11382 vm_offset_t va, va_next; 11383 bool changed; 11384 11385 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11386 MPASS(pmap->pm_type == PT_X86); 11387 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 11388 11389 for (changed = false, va = sva; va < eva; va = va_next) { 11390 pml4e = pmap_pml4e(pmap, va); 11391 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { 11392 va_next = (va + NBPML4) & ~PML4MASK; 11393 if (va_next < va) 11394 va_next = eva; 11395 continue; 11396 } 11397 11398 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 11399 if ((*pdpe & X86_PG_V) == 0) { 11400 va_next = (va + NBPDP) & ~PDPMASK; 11401 if (va_next < va) 11402 va_next = eva; 11403 continue; 11404 } 11405 11406 va_next = (va + NBPDR) & ~PDRMASK; 11407 if (va_next < va) 11408 va_next = eva; 11409 11410 pde = pmap_pdpe_to_pde(pdpe, va); 11411 ptpaddr = *pde; 11412 if (ptpaddr == 0) 11413 continue; 11414 11415 MPASS((ptpaddr & X86_PG_V) != 0); 11416 if ((ptpaddr & PG_PS) != 0) { 11417 if (va + NBPDR == va_next && eva >= va_next) { 11418 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 11419 X86_PG_PKU(keyidx); 11420 if (newpde != ptpaddr) { 11421 *pde = newpde; 11422 changed = true; 11423 } 11424 continue; 11425 } else if (!pmap_demote_pde(pmap, pde, va)) { 11426 continue; 11427 } 11428 } 11429 11430 if (va_next > eva) 11431 va_next = eva; 11432 11433 for (ptep = pmap_pde_to_pte(pde, va); va != va_next; 11434 ptep++, va += PAGE_SIZE) { 11435 pte = *ptep; 11436 if ((pte & X86_PG_V) == 0) 11437 continue; 11438 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); 11439 if (newpte != pte) { 11440 *ptep = newpte; 11441 changed = true; 11442 } 11443 } 11444 } 11445 if (changed) 11446 pmap_invalidate_range(pmap, sva, eva); 11447 } 11448 11449 static int 11450 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11451 u_int keyidx, int flags) 11452 { 11453 11454 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || 11455 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) 11456 return (EINVAL); 11457 if (eva <= sva || eva > VM_MAXUSER_ADDRESS) 11458 return (EFAULT); 11459 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 11460 return (ENOTSUP); 11461 return (0); 11462 } 11463 11464 int 11465 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11466 int flags) 11467 { 11468 int error; 11469 11470 sva = trunc_page(sva); 11471 eva = round_page(eva); 11472 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); 11473 if (error != 0) 11474 return (error); 11475 for (;;) { 11476 PMAP_LOCK(pmap); 11477 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); 11478 if (error == 0) 11479 pmap_pkru_update_range(pmap, sva, eva, keyidx); 11480 PMAP_UNLOCK(pmap); 11481 if (error != ENOMEM) 11482 break; 11483 vm_wait(NULL); 11484 } 11485 return (error); 11486 } 11487 11488 int 11489 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11490 { 11491 int error; 11492 11493 sva = trunc_page(sva); 11494 eva = round_page(eva); 11495 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); 11496 if (error != 0) 11497 return (error); 11498 for (;;) { 11499 PMAP_LOCK(pmap); 11500 error = pmap_pkru_deassign(pmap, sva, eva); 11501 if (error == 0) 11502 pmap_pkru_update_range(pmap, sva, eva, 0); 11503 PMAP_UNLOCK(pmap); 11504 if (error != ENOMEM) 11505 break; 11506 vm_wait(NULL); 11507 } 11508 return (error); 11509 } 11510 11511 #if defined(KASAN) || defined(KMSAN) 11512 11513 /* 11514 * Reserve enough memory to: 11515 * 1) allocate PDP pages for the shadow map(s), 11516 * 2) shadow one page 
of memory, so one PD page, one PT page, and one shadow
11517 * page per shadow map.
11518 */
11519 #ifdef KASAN
11520 #define SAN_EARLY_PAGES (NKASANPML4E + 3)
11521 #else
11522 #define SAN_EARLY_PAGES (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * 3)
11523 #endif
11524
11525 static uint64_t __nosanitizeaddress __nosanitizememory
11526 pmap_san_enter_early_alloc_4k(uint64_t pabase)
11527 {
11528 static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
11529 static size_t offset = 0;
11530 uint64_t pa;
11531
11532 if (offset == sizeof(data)) {
11533 panic("%s: ran out of memory for the bootstrap shadow map",
11534 __func__);
11535 }
11536
11537 pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
11538 offset += PAGE_SIZE;
11539 return (pa);
11540 }
11541
11542 /*
11543 * Map a shadow page, before the kernel has bootstrapped its page tables. This
11544 * is currently only used to shadow the temporary boot stack set up by locore.
11545 */
11546 static void __nosanitizeaddress __nosanitizememory
11547 pmap_san_enter_early(vm_offset_t va)
11548 {
11549 static bool first = true;
11550 pml4_entry_t *pml4e;
11551 pdp_entry_t *pdpe;
11552 pd_entry_t *pde;
11553 pt_entry_t *pte;
11554 uint64_t cr3, pa, base;
11555 int i;
11556
11557 base = amd64_loadaddr();
11558 cr3 = rcr3();
11559
11560 if (first) {
11561 /*
11562 * If this is the first call, we need to allocate new PML4Es for
11563 * the bootstrap shadow map(s). We don't know how the PML4 page
11564 * was initialized by the boot loader, so we can't simply test
11565 * whether the shadow map's PML4Es are zero.
11566 */
11567 first = false;
11568 #ifdef KASAN
11569 for (i = 0; i < NKASANPML4E; i++) {
11570 pa = pmap_san_enter_early_alloc_4k(base);
11571
11572 pml4e = (pml4_entry_t *)cr3 +
11573 pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
11574 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11575 }
11576 #else
11577 for (i = 0; i < NKMSANORIGPML4E; i++) {
11578 pa = pmap_san_enter_early_alloc_4k(base);
11579
11580 pml4e = (pml4_entry_t *)cr3 +
11581 pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
11582 i * NBPML4);
11583 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11584 }
11585 for (i = 0; i < NKMSANSHADPML4E; i++) {
11586 pa = pmap_san_enter_early_alloc_4k(base);
11587
11588 pml4e = (pml4_entry_t *)cr3 +
11589 pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
11590 i * NBPML4);
11591 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11592 }
11593 #endif
11594 }
11595 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
11596 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
11597 if (*pdpe == 0) {
11598 pa = pmap_san_enter_early_alloc_4k(base);
11599 *pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
11600 }
11601 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
11602 if (*pde == 0) {
11603 pa = pmap_san_enter_early_alloc_4k(base);
11604 *pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
11605 }
11606 pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
11607 if (*pte != 0)
11608 panic("%s: PTE for %#lx is already initialized", __func__, va);
11609 pa = pmap_san_enter_early_alloc_4k(base);
11610 *pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
11611 }
11612
11613 static vm_page_t
11614 pmap_san_enter_alloc_4k(void)
11615 {
11616 vm_page_t m;
11617
11618 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
11619 VM_ALLOC_ZERO);
11620 if (m == NULL)
11621 panic("%s: no memory to grow shadow map", __func__);
11622 return (m);
11623 }
11624
11625 static vm_page_t 11626
pmap_san_enter_alloc_2m(void) 11627 { 11628 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 11629 NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT)); 11630 } 11631 11632 /* 11633 * Grow a shadow map by at least one 4KB page at the specified address. Use 2MB 11634 * pages when possible. 11635 */ 11636 void __nosanitizeaddress __nosanitizememory 11637 pmap_san_enter(vm_offset_t va) 11638 { 11639 pdp_entry_t *pdpe; 11640 pd_entry_t *pde; 11641 pt_entry_t *pte; 11642 vm_page_t m; 11643 11644 if (kernphys == 0) { 11645 /* 11646 * We're creating a temporary shadow map for the boot stack. 11647 */ 11648 pmap_san_enter_early(va); 11649 return; 11650 } 11651 11652 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 11653 11654 pdpe = pmap_pdpe(kernel_pmap, va); 11655 if ((*pdpe & X86_PG_V) == 0) { 11656 m = pmap_san_enter_alloc_4k(); 11657 *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11658 X86_PG_V | pg_nx); 11659 } 11660 pde = pmap_pdpe_to_pde(pdpe, va); 11661 if ((*pde & X86_PG_V) == 0) { 11662 m = pmap_san_enter_alloc_2m(); 11663 if (m != NULL) { 11664 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11665 X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); 11666 } else { 11667 m = pmap_san_enter_alloc_4k(); 11668 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11669 X86_PG_V | pg_nx); 11670 } 11671 } 11672 if ((*pde & X86_PG_PS) != 0) 11673 return; 11674 pte = pmap_pde_to_pte(pde, va); 11675 if ((*pte & X86_PG_V) != 0) 11676 return; 11677 m = pmap_san_enter_alloc_4k(); 11678 *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | 11679 X86_PG_M | X86_PG_A | pg_nx); 11680 } 11681 #endif 11682 11683 /* 11684 * Track a range of the kernel's virtual address space that is contiguous 11685 * in various mapping attributes. 11686 */ 11687 struct pmap_kernel_map_range { 11688 vm_offset_t sva; 11689 pt_entry_t attrs; 11690 int ptes; 11691 int pdes; 11692 int pdpes; 11693 }; 11694 11695 static void 11696 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 11697 vm_offset_t eva) 11698 { 11699 const char *mode; 11700 int i, pat_idx; 11701 11702 if (eva <= range->sva) 11703 return; 11704 11705 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); 11706 for (i = 0; i < PAT_INDEX_SIZE; i++) 11707 if (pat_index[i] == pat_idx) 11708 break; 11709 11710 switch (i) { 11711 case PAT_WRITE_BACK: 11712 mode = "WB"; 11713 break; 11714 case PAT_WRITE_THROUGH: 11715 mode = "WT"; 11716 break; 11717 case PAT_UNCACHEABLE: 11718 mode = "UC"; 11719 break; 11720 case PAT_UNCACHED: 11721 mode = "U-"; 11722 break; 11723 case PAT_WRITE_PROTECTED: 11724 mode = "WP"; 11725 break; 11726 case PAT_WRITE_COMBINING: 11727 mode = "WC"; 11728 break; 11729 default: 11730 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", 11731 __func__, pat_idx, range->sva, eva); 11732 mode = "??"; 11733 break; 11734 } 11735 11736 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 11737 range->sva, eva, 11738 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', 11739 (range->attrs & pg_nx) != 0 ? '-' : 'x', 11740 (range->attrs & X86_PG_U) != 0 ? 'u' : 's', 11741 (range->attrs & X86_PG_G) != 0 ? 'g' : '-', 11742 mode, range->pdpes, range->pdes, range->ptes); 11743 11744 /* Reset to sentinel value. */ 11745 range->sva = la57 ? 
KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11746 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11747 NPDEPG - 1, NPTEPG - 1); 11748 } 11749 11750 /* 11751 * Determine whether the attributes specified by a page table entry match those 11752 * being tracked by the current range. This is not quite as simple as a direct 11753 * flag comparison since some PAT modes have multiple representations. 11754 */ 11755 static bool 11756 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 11757 { 11758 pt_entry_t diff, mask; 11759 11760 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; 11761 diff = (range->attrs ^ attrs) & mask; 11762 if (diff == 0) 11763 return (true); 11764 if ((diff & ~X86_PG_PDE_PAT) == 0 && 11765 pmap_pat_index(kernel_pmap, range->attrs, true) == 11766 pmap_pat_index(kernel_pmap, attrs, true)) 11767 return (true); 11768 return (false); 11769 } 11770 11771 static void 11772 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 11773 pt_entry_t attrs) 11774 { 11775 11776 memset(range, 0, sizeof(*range)); 11777 range->sva = va; 11778 range->attrs = attrs; 11779 } 11780 11781 /* 11782 * Given a leaf PTE, derive the mapping's attributes. If they do not match 11783 * those of the current run, dump the address range and its attributes, and 11784 * begin a new run. 11785 */ 11786 static void 11787 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 11788 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, 11789 pt_entry_t pte) 11790 { 11791 pt_entry_t attrs; 11792 11793 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); 11794 11795 attrs |= pdpe & pg_nx; 11796 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); 11797 if ((pdpe & PG_PS) != 0) { 11798 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); 11799 } else if (pde != 0) { 11800 attrs |= pde & pg_nx; 11801 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); 11802 } 11803 if ((pde & PG_PS) != 0) { 11804 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); 11805 } else if (pte != 0) { 11806 attrs |= pte & pg_nx; 11807 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); 11808 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); 11809 11810 /* Canonicalize by always using the PDE PAT bit. */ 11811 if ((attrs & X86_PG_PTE_PAT) != 0) 11812 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; 11813 } 11814 11815 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 11816 sysctl_kmaps_dump(sb, range, va); 11817 sysctl_kmaps_reinit(range, va, attrs); 11818 } 11819 } 11820 11821 static int 11822 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 11823 { 11824 struct pmap_kernel_map_range range; 11825 struct sbuf sbuf, *sb; 11826 pml4_entry_t pml4e; 11827 pdp_entry_t *pdp, pdpe; 11828 pd_entry_t *pd, pde; 11829 pt_entry_t *pt, pte; 11830 vm_offset_t sva; 11831 vm_paddr_t pa; 11832 int error, i, j, k, l; 11833 11834 error = sysctl_wire_old_buffer(req, 0); 11835 if (error != 0) 11836 return (error); 11837 sb = &sbuf; 11838 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 11839 11840 /* Sentinel value. */ 11841 range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11842 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11843 NPDEPG - 1, NPTEPG - 1); 11844 11845 /* 11846 * Iterate over the kernel page tables without holding the kernel pmap 11847 * lock. Outside of the large map, kernel page table pages are never 11848 * freed, so at worst we will observe inconsistencies in the output. 
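 * Page table pages are read through the direct map, so a stale entry
 * outside the large map still points at a page that is safe to read.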
11849 * Within the large map, ensure that PDP and PD page addresses are 11850 * valid before descending. 11851 */ 11852 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { 11853 switch (i) { 11854 case PML4PML4I: 11855 sbuf_printf(sb, "\nRecursive map:\n"); 11856 break; 11857 case DMPML4I: 11858 sbuf_printf(sb, "\nDirect map:\n"); 11859 break; 11860 #ifdef KASAN 11861 case KASANPML4I: 11862 sbuf_printf(sb, "\nKASAN shadow map:\n"); 11863 break; 11864 #endif 11865 #ifdef KMSAN 11866 case KMSANSHADPML4I: 11867 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 11868 break; 11869 case KMSANORIGPML4I: 11870 sbuf_printf(sb, "\nKMSAN origin map:\n"); 11871 break; 11872 #endif 11873 case KPML4BASE: 11874 sbuf_printf(sb, "\nKernel map:\n"); 11875 break; 11876 case LMSPML4I: 11877 sbuf_printf(sb, "\nLarge map:\n"); 11878 break; 11879 } 11880 11881 /* Convert to canonical form. */ 11882 if (sva == 1ul << 47) 11883 sva |= -1ul << 48; 11884 11885 restart: 11886 pml4e = kernel_pml4[i]; 11887 if ((pml4e & X86_PG_V) == 0) { 11888 sva = rounddown2(sva, NBPML4); 11889 sysctl_kmaps_dump(sb, &range, sva); 11890 sva += NBPML4; 11891 continue; 11892 } 11893 pa = pml4e & PG_FRAME; 11894 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); 11895 11896 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { 11897 pdpe = pdp[j]; 11898 if ((pdpe & X86_PG_V) == 0) { 11899 sva = rounddown2(sva, NBPDP); 11900 sysctl_kmaps_dump(sb, &range, sva); 11901 sva += NBPDP; 11902 continue; 11903 } 11904 pa = pdpe & PG_FRAME; 11905 if ((pdpe & PG_PS) != 0) { 11906 sva = rounddown2(sva, NBPDP); 11907 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 11908 0, 0); 11909 range.pdpes++; 11910 sva += NBPDP; 11911 continue; 11912 } 11913 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 11914 vm_phys_paddr_to_vm_page(pa) == NULL) { 11915 /* 11916 * Page table pages for the large map may be 11917 * freed. Validate the next-level address 11918 * before descending. 11919 */ 11920 goto restart; 11921 } 11922 pd = (pd_entry_t *)PHYS_TO_DMAP(pa); 11923 11924 for (k = pmap_pde_index(sva); k < NPDEPG; k++) { 11925 pde = pd[k]; 11926 if ((pde & X86_PG_V) == 0) { 11927 sva = rounddown2(sva, NBPDR); 11928 sysctl_kmaps_dump(sb, &range, sva); 11929 sva += NBPDR; 11930 continue; 11931 } 11932 pa = pde & PG_FRAME; 11933 if ((pde & PG_PS) != 0) { 11934 sva = rounddown2(sva, NBPDR); 11935 sysctl_kmaps_check(sb, &range, sva, 11936 pml4e, pdpe, pde, 0); 11937 range.pdes++; 11938 sva += NBPDR; 11939 continue; 11940 } 11941 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 11942 vm_phys_paddr_to_vm_page(pa) == NULL) { 11943 /* 11944 * Page table pages for the large map 11945 * may be freed. Validate the 11946 * next-level address before descending. 
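 * Restarting from the top-level PML4 entry re-validates every
 * level of the walk for the current address.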
11947 */ 11948 goto restart; 11949 } 11950 pt = (pt_entry_t *)PHYS_TO_DMAP(pa); 11951 11952 for (l = pmap_pte_index(sva); l < NPTEPG; l++, 11953 sva += PAGE_SIZE) { 11954 pte = pt[l]; 11955 if ((pte & X86_PG_V) == 0) { 11956 sysctl_kmaps_dump(sb, &range, 11957 sva); 11958 continue; 11959 } 11960 sysctl_kmaps_check(sb, &range, sva, 11961 pml4e, pdpe, pde, pte); 11962 range.ptes++; 11963 } 11964 } 11965 } 11966 } 11967 11968 error = sbuf_finish(sb); 11969 sbuf_delete(sb); 11970 return (error); 11971 } 11972 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 11973 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 11974 NULL, 0, sysctl_kmaps, "A", 11975 "Dump kernel address layout"); 11976 11977 #ifdef DDB 11978 DB_SHOW_COMMAND(pte, pmap_print_pte) 11979 { 11980 pmap_t pmap; 11981 pml5_entry_t *pml5; 11982 pml4_entry_t *pml4; 11983 pdp_entry_t *pdp; 11984 pd_entry_t *pde; 11985 pt_entry_t *pte, PG_V; 11986 vm_offset_t va; 11987 11988 if (!have_addr) { 11989 db_printf("show pte addr\n"); 11990 return; 11991 } 11992 va = (vm_offset_t)addr; 11993 11994 if (kdb_thread != NULL) 11995 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 11996 else 11997 pmap = PCPU_GET(curpmap); 11998 11999 PG_V = pmap_valid_bit(pmap); 12000 db_printf("VA 0x%016lx", va); 12001 12002 if (pmap_is_la57(pmap)) { 12003 pml5 = pmap_pml5e(pmap, va); 12004 db_printf(" pml5e 0x%016lx", *pml5); 12005 if ((*pml5 & PG_V) == 0) { 12006 db_printf("\n"); 12007 return; 12008 } 12009 pml4 = pmap_pml5e_to_pml4e(pml5, va); 12010 } else { 12011 pml4 = pmap_pml4e(pmap, va); 12012 } 12013 db_printf(" pml4e 0x%016lx", *pml4); 12014 if ((*pml4 & PG_V) == 0) { 12015 db_printf("\n"); 12016 return; 12017 } 12018 pdp = pmap_pml4e_to_pdpe(pml4, va); 12019 db_printf(" pdpe 0x%016lx", *pdp); 12020 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 12021 db_printf("\n"); 12022 return; 12023 } 12024 pde = pmap_pdpe_to_pde(pdp, va); 12025 db_printf(" pde 0x%016lx", *pde); 12026 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 12027 db_printf("\n"); 12028 return; 12029 } 12030 pte = pmap_pde_to_pte(pde, va); 12031 db_printf(" pte 0x%016lx\n", *pte); 12032 } 12033 12034 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 12035 { 12036 vm_paddr_t a; 12037 12038 if (have_addr) { 12039 a = (vm_paddr_t)addr; 12040 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 12041 } else { 12042 db_printf("show phys2dmap addr\n"); 12043 } 12044 } 12045 12046 static void 12047 ptpages_show_page(int level, int idx, vm_page_t pg) 12048 { 12049 db_printf("l %d i %d pg %p phys %#lx ref %x\n", 12050 level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); 12051 } 12052 12053 static void 12054 ptpages_show_complain(int level, int idx, uint64_t pte) 12055 { 12056 db_printf("l %d i %d pte %#lx\n", level, idx, pte); 12057 } 12058 12059 static void 12060 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) 12061 { 12062 vm_page_t pg3, pg2, pg1; 12063 pml4_entry_t *pml4; 12064 pdp_entry_t *pdp; 12065 pd_entry_t *pd; 12066 int i4, i3, i2; 12067 12068 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); 12069 for (i4 = 0; i4 < num_entries; i4++) { 12070 if ((pml4[i4] & PG_V) == 0) 12071 continue; 12072 pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); 12073 if (pg3 == NULL) { 12074 ptpages_show_complain(3, i4, pml4[i4]); 12075 continue; 12076 } 12077 ptpages_show_page(3, i4, pg3); 12078 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); 12079 for (i3 = 0; i3 < NPDPEPG; i3++) { 12080 if ((pdp[i3] & PG_V) == 0) 12081 continue; 12082 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); 
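/* Entries whose frame is not backed by a vm_page are reported and skipped. */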
12083 if (pg2 == NULL) {
12084 ptpages_show_complain(2, i3, pdp[i3]);
12085 continue;
12086 }
12087 ptpages_show_page(2, i3, pg2);
12088 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
12089 for (i2 = 0; i2 < NPDEPG; i2++) {
12090 if ((pd[i2] & PG_V) == 0)
12091 continue;
12092 pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
12093 if (pg1 == NULL) {
12094 ptpages_show_complain(1, i2, pd[i2]);
12095 continue;
12096 }
12097 ptpages_show_page(1, i2, pg1);
12098 }
12099 }
12100 }
12101 }
12102
/*
 * "show ptpages [addr]": dump the valid page table pages of the pmap at
 * addr, or of the current pmap if no address is given.
 */
12103 DB_SHOW_COMMAND(ptpages, pmap_ptpages)
12104 {
12105 pmap_t pmap;
12106 vm_page_t pg;
12107 pml5_entry_t *pml5;
12108 uint64_t PG_V;
12109 int i5;
12110
12111 if (have_addr)
12112 pmap = (pmap_t)addr;
12113 else
12114 pmap = PCPU_GET(curpmap);
12115
12116 PG_V = pmap_valid_bit(pmap);
12117
12118 if (pmap_is_la57(pmap)) {
12119 pml5 = pmap->pm_pmltop;
12120 for (i5 = 0; i5 < NUPML5E; i5++) {
12121 if ((pml5[i5] & PG_V) == 0)
12122 continue;
12123 pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
12124 if (pg == NULL) {
12125 ptpages_show_complain(4, i5, pml5[i5]);
12126 continue;
12127 }
12128 ptpages_show_page(4, i5, pg);
12129 ptpages_show_pml4(pg, NPML4EPG, PG_V);
12130 }
12131 } else {
12132 ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
12133 (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
12134 }
12135 }
12136 #endif
12137