/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * Copyright (c) 2014-2020 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or reduced-protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and when physical maps must be made correct.
108 */ 109 110 #include "opt_ddb.h" 111 #include "opt_pmap.h" 112 #include "opt_vm.h" 113 114 #include <sys/param.h> 115 #include <sys/asan.h> 116 #include <sys/bitstring.h> 117 #include <sys/bus.h> 118 #include <sys/systm.h> 119 #include <sys/counter.h> 120 #include <sys/kernel.h> 121 #include <sys/ktr.h> 122 #include <sys/lock.h> 123 #include <sys/malloc.h> 124 #include <sys/mman.h> 125 #include <sys/msan.h> 126 #include <sys/mutex.h> 127 #include <sys/proc.h> 128 #include <sys/rangeset.h> 129 #include <sys/rwlock.h> 130 #include <sys/sbuf.h> 131 #include <sys/smr.h> 132 #include <sys/sx.h> 133 #include <sys/turnstile.h> 134 #include <sys/vmem.h> 135 #include <sys/vmmeter.h> 136 #include <sys/sched.h> 137 #include <sys/sysctl.h> 138 #include <sys/smp.h> 139 #ifdef DDB 140 #include <sys/kdb.h> 141 #include <ddb/ddb.h> 142 #endif 143 144 #include <vm/vm.h> 145 #include <vm/vm_param.h> 146 #include <vm/vm_kern.h> 147 #include <vm/vm_page.h> 148 #include <vm/vm_map.h> 149 #include <vm/vm_object.h> 150 #include <vm/vm_extern.h> 151 #include <vm/vm_pageout.h> 152 #include <vm/vm_pager.h> 153 #include <vm/vm_phys.h> 154 #include <vm/vm_radix.h> 155 #include <vm/vm_reserv.h> 156 #include <vm/vm_dumpset.h> 157 #include <vm/uma.h> 158 159 #include <machine/asan.h> 160 #include <machine/intr_machdep.h> 161 #include <x86/apicvar.h> 162 #include <x86/ifunc.h> 163 #include <machine/cpu.h> 164 #include <machine/cputypes.h> 165 #include <machine/md_var.h> 166 #include <machine/msan.h> 167 #include <machine/pcb.h> 168 #include <machine/specialreg.h> 169 #ifdef SMP 170 #include <machine/smp.h> 171 #endif 172 #include <machine/sysarch.h> 173 #include <machine/tss.h> 174 175 #ifdef NUMA 176 #define PMAP_MEMDOM MAXMEMDOM 177 #else 178 #define PMAP_MEMDOM 1 179 #endif 180 181 static __inline boolean_t 182 pmap_type_guest(pmap_t pmap) 183 { 184 185 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 186 } 187 188 static __inline boolean_t 189 pmap_emulate_ad_bits(pmap_t pmap) 190 { 191 192 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 193 } 194 195 static __inline pt_entry_t 196 pmap_valid_bit(pmap_t pmap) 197 { 198 pt_entry_t mask; 199 200 switch (pmap->pm_type) { 201 case PT_X86: 202 case PT_RVI: 203 mask = X86_PG_V; 204 break; 205 case PT_EPT: 206 if (pmap_emulate_ad_bits(pmap)) 207 mask = EPT_PG_EMUL_V; 208 else 209 mask = EPT_PG_READ; 210 break; 211 default: 212 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 213 } 214 215 return (mask); 216 } 217 218 static __inline pt_entry_t 219 pmap_rw_bit(pmap_t pmap) 220 { 221 pt_entry_t mask; 222 223 switch (pmap->pm_type) { 224 case PT_X86: 225 case PT_RVI: 226 mask = X86_PG_RW; 227 break; 228 case PT_EPT: 229 if (pmap_emulate_ad_bits(pmap)) 230 mask = EPT_PG_EMUL_RW; 231 else 232 mask = EPT_PG_WRITE; 233 break; 234 default: 235 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 236 } 237 238 return (mask); 239 } 240 241 static pt_entry_t pg_g; 242 243 static __inline pt_entry_t 244 pmap_global_bit(pmap_t pmap) 245 { 246 pt_entry_t mask; 247 248 switch (pmap->pm_type) { 249 case PT_X86: 250 mask = pg_g; 251 break; 252 case PT_RVI: 253 case PT_EPT: 254 mask = 0; 255 break; 256 default: 257 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 258 } 259 260 return (mask); 261 } 262 263 static __inline pt_entry_t 264 pmap_accessed_bit(pmap_t pmap) 265 { 266 pt_entry_t mask; 267 268 switch (pmap->pm_type) { 269 case PT_X86: 270 case PT_RVI: 271 mask = X86_PG_A; 272 break; 273 case PT_EPT: 274 if 
(pmap_emulate_ad_bits(pmap)) 275 mask = EPT_PG_READ; 276 else 277 mask = EPT_PG_A; 278 break; 279 default: 280 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 281 } 282 283 return (mask); 284 } 285 286 static __inline pt_entry_t 287 pmap_modified_bit(pmap_t pmap) 288 { 289 pt_entry_t mask; 290 291 switch (pmap->pm_type) { 292 case PT_X86: 293 case PT_RVI: 294 mask = X86_PG_M; 295 break; 296 case PT_EPT: 297 if (pmap_emulate_ad_bits(pmap)) 298 mask = EPT_PG_WRITE; 299 else 300 mask = EPT_PG_M; 301 break; 302 default: 303 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 304 } 305 306 return (mask); 307 } 308 309 static __inline pt_entry_t 310 pmap_pku_mask_bit(pmap_t pmap) 311 { 312 313 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); 314 } 315 316 #if !defined(DIAGNOSTIC) 317 #ifdef __GNUC_GNU_INLINE__ 318 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 319 #else 320 #define PMAP_INLINE extern inline 321 #endif 322 #else 323 #define PMAP_INLINE 324 #endif 325 326 #ifdef PV_STATS 327 #define PV_STAT(x) do { x ; } while (0) 328 #else 329 #define PV_STAT(x) do { } while (0) 330 #endif 331 332 #undef pa_index 333 #ifdef NUMA 334 #define pa_index(pa) ({ \ 335 KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ 336 ("address %lx beyond the last segment", (pa))); \ 337 (pa) >> PDRSHIFT; \ 338 }) 339 #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) 340 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 341 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 342 struct rwlock *_lock; \ 343 if (__predict_false((pa) > pmap_last_pa)) \ 344 _lock = &pv_dummy_large.pv_lock; \ 345 else \ 346 _lock = &(pa_to_pmdp(pa)->pv_lock); \ 347 _lock; \ 348 }) 349 #else 350 #define pa_index(pa) ((pa) >> PDRSHIFT) 351 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 352 353 #define NPV_LIST_LOCKS MAXCPU 354 355 #define PHYS_TO_PV_LIST_LOCK(pa) \ 356 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 357 #endif 358 359 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 360 struct rwlock **_lockp = (lockp); \ 361 struct rwlock *_new_lock; \ 362 \ 363 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 364 if (_new_lock != *_lockp) { \ 365 if (*_lockp != NULL) \ 366 rw_wunlock(*_lockp); \ 367 *_lockp = _new_lock; \ 368 rw_wlock(*_lockp); \ 369 } \ 370 } while (0) 371 372 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 373 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 374 375 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 376 struct rwlock **_lockp = (lockp); \ 377 \ 378 if (*_lockp != NULL) { \ 379 rw_wunlock(*_lockp); \ 380 *_lockp = NULL; \ 381 } \ 382 } while (0) 383 384 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 385 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 386 387 struct pmap kernel_pmap_store; 388 389 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 390 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 391 392 int nkpt; 393 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 394 "Number of kernel page table pages allocated on bootup"); 395 396 static int ndmpdp; 397 vm_paddr_t dmaplimit; 398 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 399 pt_entry_t pg_nx; 400 401 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 402 "VM/pmap parameters"); 403 404 static int pg_ps_enabled = 1; 405 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 406 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 407 408 int __read_frequently la57 = 0; 409 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, 
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 410 &la57, 0, 411 "5-level paging for host is enabled"); 412 413 static bool 414 pmap_is_la57(pmap_t pmap) 415 { 416 if (pmap->pm_type == PT_X86) 417 return (la57); 418 return (false); /* XXXKIB handle EPT */ 419 } 420 421 #define PAT_INDEX_SIZE 8 422 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 423 424 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 425 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 426 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 427 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 428 u_int64_t KPML5phys; /* phys addr of kernel level 5, 429 if supported */ 430 431 #ifdef KASAN 432 static uint64_t KASANPDPphys; 433 #endif 434 #ifdef KMSAN 435 static uint64_t KMSANSHADPDPphys; 436 static uint64_t KMSANORIGPDPphys; 437 438 /* 439 * To support systems with large amounts of memory, it is necessary to extend 440 * the maximum size of the direct map. This could eat into the space reserved 441 * for the shadow map. 442 */ 443 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); 444 #endif 445 446 static pml4_entry_t *kernel_pml4; 447 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 448 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 449 static int ndmpdpphys; /* number of DMPDPphys pages */ 450 451 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ 452 vm_paddr_t KERNend; /* and the end */ 453 454 /* 455 * pmap_mapdev support pre initialization (i.e. console) 456 */ 457 #define PMAP_PREINIT_MAPPING_COUNT 8 458 static struct pmap_preinit_mapping { 459 vm_paddr_t pa; 460 vm_offset_t va; 461 vm_size_t sz; 462 int mode; 463 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 464 static int pmap_initialized; 465 466 /* 467 * Data for the pv entry allocation mechanism. 468 * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
469 */ 470 #ifdef NUMA 471 static __inline int 472 pc_to_domain(struct pv_chunk *pc) 473 { 474 475 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 476 } 477 #else 478 static __inline int 479 pc_to_domain(struct pv_chunk *pc __unused) 480 { 481 482 return (0); 483 } 484 #endif 485 486 struct pv_chunks_list { 487 struct mtx pvc_lock; 488 TAILQ_HEAD(pch, pv_chunk) pvc_list; 489 int active_reclaims; 490 } __aligned(CACHE_LINE_SIZE); 491 492 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 493 494 #ifdef NUMA 495 struct pmap_large_md_page { 496 struct rwlock pv_lock; 497 struct md_page pv_page; 498 u_long pv_invl_gen; 499 }; 500 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 501 #define pv_dummy pv_dummy_large.pv_page 502 __read_mostly static struct pmap_large_md_page *pv_table; 503 __read_mostly vm_paddr_t pmap_last_pa; 504 #else 505 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 506 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 507 static struct md_page *pv_table; 508 static struct md_page pv_dummy; 509 #endif 510 511 /* 512 * All those kernel PT submaps that BSD is so fond of 513 */ 514 pt_entry_t *CMAP1 = NULL; 515 caddr_t CADDR1 = 0; 516 static vm_offset_t qframe = 0; 517 static struct mtx qframe_mtx; 518 519 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 520 521 static vmem_t *large_vmem; 522 static u_int lm_ents; 523 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 524 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 525 526 int pmap_pcid_enabled = 1; 527 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 528 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 529 int invpcid_works = 0; 530 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 531 "Is the invpcid instruction available ?"); 532 533 int __read_frequently pti = 0; 534 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 535 &pti, 0, 536 "Page Table Isolation enabled"); 537 static vm_object_t pti_obj; 538 static pml4_entry_t *pti_pml4; 539 static vm_pindex_t pti_pg_idx; 540 static bool pti_finalized; 541 542 struct pmap_pkru_range { 543 struct rs_el pkru_rs_el; 544 u_int pkru_keyidx; 545 int pkru_flags; 546 }; 547 548 static uma_zone_t pmap_pkru_ranges_zone; 549 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 550 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 551 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 552 static void *pkru_dup_range(void *ctx, void *data); 553 static void pkru_free_range(void *ctx, void *node); 554 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 555 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 556 static void pmap_pkru_deassign_all(pmap_t pmap); 557 558 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); 559 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, 560 &pcid_save_cnt, "Count of saved TLB context on switch"); 561 562 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 563 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 564 static struct mtx invl_gen_mtx; 565 /* Fake lock object to satisfy turnstiles interface. 
*/ 566 static struct lock_object invl_gen_ts = { 567 .lo_name = "invlts", 568 }; 569 static struct pmap_invl_gen pmap_invl_gen_head = { 570 .gen = 1, 571 .next = NULL, 572 }; 573 static u_long pmap_invl_gen = 1; 574 static int pmap_invl_waiters; 575 static struct callout pmap_invl_callout; 576 static bool pmap_invl_callout_inited; 577 578 #define PMAP_ASSERT_NOT_IN_DI() \ 579 KASSERT(pmap_not_in_di(), ("DI already started")) 580 581 static bool 582 pmap_di_locked(void) 583 { 584 int tun; 585 586 if ((cpu_feature2 & CPUID2_CX16) == 0) 587 return (true); 588 tun = 0; 589 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 590 return (tun != 0); 591 } 592 593 static int 594 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 595 { 596 int locked; 597 598 locked = pmap_di_locked(); 599 return (sysctl_handle_int(oidp, &locked, 0, req)); 600 } 601 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 602 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 603 "Locked delayed invalidation"); 604 605 static bool pmap_not_in_di_l(void); 606 static bool pmap_not_in_di_u(void); 607 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 608 { 609 610 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 611 } 612 613 static bool 614 pmap_not_in_di_l(void) 615 { 616 struct pmap_invl_gen *invl_gen; 617 618 invl_gen = &curthread->td_md.md_invl_gen; 619 return (invl_gen->gen == 0); 620 } 621 622 static void 623 pmap_thread_init_invl_gen_l(struct thread *td) 624 { 625 struct pmap_invl_gen *invl_gen; 626 627 invl_gen = &td->td_md.md_invl_gen; 628 invl_gen->gen = 0; 629 } 630 631 static void 632 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 633 { 634 struct turnstile *ts; 635 636 ts = turnstile_trywait(&invl_gen_ts); 637 if (*m_gen > atomic_load_long(invl_gen)) 638 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 639 else 640 turnstile_cancel(ts); 641 } 642 643 static void 644 pmap_delayed_invl_finish_unblock(u_long new_gen) 645 { 646 struct turnstile *ts; 647 648 turnstile_chain_lock(&invl_gen_ts); 649 ts = turnstile_lookup(&invl_gen_ts); 650 if (new_gen != 0) 651 pmap_invl_gen = new_gen; 652 if (ts != NULL) { 653 turnstile_broadcast(ts, TS_SHARED_QUEUE); 654 turnstile_unpend(ts); 655 } 656 turnstile_chain_unlock(&invl_gen_ts); 657 } 658 659 /* 660 * Start a new Delayed Invalidation (DI) block of code, executed by 661 * the current thread. Within a DI block, the current thread may 662 * destroy both the page table and PV list entries for a mapping and 663 * then release the corresponding PV list lock before ensuring that 664 * the mapping is flushed from the TLBs of any processors with the 665 * pmap active. 666 */ 667 static void 668 pmap_delayed_invl_start_l(void) 669 { 670 struct pmap_invl_gen *invl_gen; 671 u_long currgen; 672 673 invl_gen = &curthread->td_md.md_invl_gen; 674 PMAP_ASSERT_NOT_IN_DI(); 675 mtx_lock(&invl_gen_mtx); 676 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 677 currgen = pmap_invl_gen; 678 else 679 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 680 invl_gen->gen = currgen + 1; 681 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 682 mtx_unlock(&invl_gen_mtx); 683 } 684 685 /* 686 * Finish the DI block, previously started by the current thread. All 687 * required TLB flushes for the pages marked by 688 * pmap_delayed_invl_page() must be finished before this function is 689 * called. 
690 * 691 * This function works by bumping the global DI generation number to 692 * the generation number of the current thread's DI, unless there is a 693 * pending DI that started earlier. In the latter case, bumping the 694 * global DI generation number would incorrectly signal that the 695 * earlier DI had finished. Instead, this function bumps the earlier 696 * DI's generation number to match the generation number of the 697 * current thread's DI. 698 */ 699 static void 700 pmap_delayed_invl_finish_l(void) 701 { 702 struct pmap_invl_gen *invl_gen, *next; 703 704 invl_gen = &curthread->td_md.md_invl_gen; 705 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 706 mtx_lock(&invl_gen_mtx); 707 next = LIST_NEXT(invl_gen, link); 708 if (next == NULL) 709 pmap_delayed_invl_finish_unblock(invl_gen->gen); 710 else 711 next->gen = invl_gen->gen; 712 LIST_REMOVE(invl_gen, link); 713 mtx_unlock(&invl_gen_mtx); 714 invl_gen->gen = 0; 715 } 716 717 static bool 718 pmap_not_in_di_u(void) 719 { 720 struct pmap_invl_gen *invl_gen; 721 722 invl_gen = &curthread->td_md.md_invl_gen; 723 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 724 } 725 726 static void 727 pmap_thread_init_invl_gen_u(struct thread *td) 728 { 729 struct pmap_invl_gen *invl_gen; 730 731 invl_gen = &td->td_md.md_invl_gen; 732 invl_gen->gen = 0; 733 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 734 } 735 736 static bool 737 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 738 { 739 uint64_t new_high, new_low, old_high, old_low; 740 char res; 741 742 old_low = new_low = 0; 743 old_high = new_high = (uintptr_t)0; 744 745 __asm volatile("lock;cmpxchg16b\t%1" 746 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 747 : "b"(new_low), "c" (new_high) 748 : "memory", "cc"); 749 if (res == 0) { 750 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 751 return (false); 752 out->gen = old_low; 753 out->next = (void *)old_high; 754 } else { 755 out->gen = new_low; 756 out->next = (void *)new_high; 757 } 758 return (true); 759 } 760 761 static bool 762 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 763 struct pmap_invl_gen *new_val) 764 { 765 uint64_t new_high, new_low, old_high, old_low; 766 char res; 767 768 new_low = new_val->gen; 769 new_high = (uintptr_t)new_val->next; 770 old_low = old_val->gen; 771 old_high = (uintptr_t)old_val->next; 772 773 __asm volatile("lock;cmpxchg16b\t%1" 774 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 775 : "b"(new_low), "c" (new_high) 776 : "memory", "cc"); 777 return (res); 778 } 779 780 static COUNTER_U64_DEFINE_EARLY(pv_page_count); 781 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, 782 &pv_page_count, "Current number of allocated pv pages"); 783 784 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); 785 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, 786 &user_pt_page_count, 787 "Current number of allocated page table pages for userspace"); 788 789 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); 790 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, 791 &kernel_pt_page_count, 792 "Current number of allocated page table pages for the kernel"); 793 794 #ifdef PV_STATS 795 796 static COUNTER_U64_DEFINE_EARLY(invl_start_restart); 797 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, 798 CTLFLAG_RD, &invl_start_restart, 799 "Number of delayed TLB invalidation request restarts"); 800 801 static 
COUNTER_U64_DEFINE_EARLY(invl_finish_restart); 802 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 803 &invl_finish_restart, 804 "Number of delayed TLB invalidation completion restarts"); 805 806 static int invl_max_qlen; 807 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 808 &invl_max_qlen, 0, 809 "Maximum delayed TLB invalidation request queue length"); 810 #endif 811 812 #define di_delay locks_delay 813 814 static void 815 pmap_delayed_invl_start_u(void) 816 { 817 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 818 struct thread *td; 819 struct lock_delay_arg lda; 820 uintptr_t prevl; 821 u_char pri; 822 #ifdef PV_STATS 823 int i, ii; 824 #endif 825 826 td = curthread; 827 invl_gen = &td->td_md.md_invl_gen; 828 PMAP_ASSERT_NOT_IN_DI(); 829 lock_delay_arg_init(&lda, &di_delay); 830 invl_gen->saved_pri = 0; 831 pri = td->td_base_pri; 832 if (pri > PVM) { 833 thread_lock(td); 834 pri = td->td_base_pri; 835 if (pri > PVM) { 836 invl_gen->saved_pri = pri; 837 sched_prio(td, PVM); 838 } 839 thread_unlock(td); 840 } 841 again: 842 PV_STAT(i = 0); 843 for (p = &pmap_invl_gen_head;; p = prev.next) { 844 PV_STAT(i++); 845 prevl = (uintptr_t)atomic_load_ptr(&p->next); 846 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 847 PV_STAT(counter_u64_add(invl_start_restart, 1)); 848 lock_delay(&lda); 849 goto again; 850 } 851 if (prevl == 0) 852 break; 853 prev.next = (void *)prevl; 854 } 855 #ifdef PV_STATS 856 if ((ii = invl_max_qlen) < i) 857 atomic_cmpset_int(&invl_max_qlen, ii, i); 858 #endif 859 860 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 861 PV_STAT(counter_u64_add(invl_start_restart, 1)); 862 lock_delay(&lda); 863 goto again; 864 } 865 866 new_prev.gen = prev.gen; 867 new_prev.next = invl_gen; 868 invl_gen->gen = prev.gen + 1; 869 870 /* Formal fence between store to invl->gen and updating *p. */ 871 atomic_thread_fence_rel(); 872 873 /* 874 * After inserting an invl_gen element with invalid bit set, 875 * this thread blocks any other thread trying to enter the 876 * delayed invalidation block. Do not allow to remove us from 877 * the CPU, because it causes starvation for other threads. 878 */ 879 critical_enter(); 880 881 /* 882 * ABA for *p is not possible there, since p->gen can only 883 * increase. So if the *p thread finished its di, then 884 * started a new one and got inserted into the list at the 885 * same place, its gen will appear greater than the previously 886 * read gen. 887 */ 888 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 889 critical_exit(); 890 PV_STAT(counter_u64_add(invl_start_restart, 1)); 891 lock_delay(&lda); 892 goto again; 893 } 894 895 /* 896 * There we clear PMAP_INVL_GEN_NEXT_INVALID in 897 * invl_gen->next, allowing other threads to iterate past us. 898 * pmap_di_store_invl() provides fence between the generation 899 * write and the update of next. 900 */ 901 invl_gen->next = NULL; 902 critical_exit(); 903 } 904 905 static bool 906 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 907 struct pmap_invl_gen *p) 908 { 909 struct pmap_invl_gen prev, new_prev; 910 u_long mygen; 911 912 /* 913 * Load invl_gen->gen after setting invl_gen->next 914 * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger 915 * generations to propagate to our invl_gen->gen. Lock prefix 916 * in atomic_set_ptr() worked as seq_cst fence. 
917 */ 918 mygen = atomic_load_long(&invl_gen->gen); 919 920 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 921 return (false); 922 923 KASSERT(prev.gen < mygen, 924 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 925 new_prev.gen = mygen; 926 new_prev.next = (void *)((uintptr_t)invl_gen->next & 927 ~PMAP_INVL_GEN_NEXT_INVALID); 928 929 /* Formal fence between load of prev and storing update to it. */ 930 atomic_thread_fence_rel(); 931 932 return (pmap_di_store_invl(p, &prev, &new_prev)); 933 } 934 935 static void 936 pmap_delayed_invl_finish_u(void) 937 { 938 struct pmap_invl_gen *invl_gen, *p; 939 struct thread *td; 940 struct lock_delay_arg lda; 941 uintptr_t prevl; 942 943 td = curthread; 944 invl_gen = &td->td_md.md_invl_gen; 945 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 946 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 947 ("missed invl_start: INVALID")); 948 lock_delay_arg_init(&lda, &di_delay); 949 950 again: 951 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 952 prevl = (uintptr_t)atomic_load_ptr(&p->next); 953 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 954 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 955 lock_delay(&lda); 956 goto again; 957 } 958 if ((void *)prevl == invl_gen) 959 break; 960 } 961 962 /* 963 * It is legitimate to not find ourself on the list if a 964 * thread before us finished its DI and started it again. 965 */ 966 if (__predict_false(p == NULL)) { 967 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 968 lock_delay(&lda); 969 goto again; 970 } 971 972 critical_enter(); 973 atomic_set_ptr((uintptr_t *)&invl_gen->next, 974 PMAP_INVL_GEN_NEXT_INVALID); 975 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 976 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 977 PMAP_INVL_GEN_NEXT_INVALID); 978 critical_exit(); 979 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 980 lock_delay(&lda); 981 goto again; 982 } 983 critical_exit(); 984 if (atomic_load_int(&pmap_invl_waiters) > 0) 985 pmap_delayed_invl_finish_unblock(0); 986 if (invl_gen->saved_pri != 0) { 987 thread_lock(td); 988 sched_prio(td, invl_gen->saved_pri); 989 thread_unlock(td); 990 } 991 } 992 993 #ifdef DDB 994 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 995 { 996 struct pmap_invl_gen *p, *pn; 997 struct thread *td; 998 uintptr_t nextl; 999 bool first; 1000 1001 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 1002 first = false) { 1003 nextl = (uintptr_t)atomic_load_ptr(&p->next); 1004 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 1005 td = first ? NULL : __containerof(p, struct thread, 1006 td_md.md_invl_gen); 1007 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 1008 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 1009 td != NULL ? 
td->td_tid : -1); 1010 } 1011 } 1012 #endif 1013 1014 #ifdef PV_STATS 1015 static COUNTER_U64_DEFINE_EARLY(invl_wait); 1016 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, 1017 CTLFLAG_RD, &invl_wait, 1018 "Number of times DI invalidation blocked pmap_remove_all/write"); 1019 1020 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); 1021 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, 1022 &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); 1023 1024 #endif 1025 1026 #ifdef NUMA 1027 static u_long * 1028 pmap_delayed_invl_genp(vm_page_t m) 1029 { 1030 vm_paddr_t pa; 1031 u_long *gen; 1032 1033 pa = VM_PAGE_TO_PHYS(m); 1034 if (__predict_false((pa) > pmap_last_pa)) 1035 gen = &pv_dummy_large.pv_invl_gen; 1036 else 1037 gen = &(pa_to_pmdp(pa)->pv_invl_gen); 1038 1039 return (gen); 1040 } 1041 #else 1042 static u_long * 1043 pmap_delayed_invl_genp(vm_page_t m) 1044 { 1045 1046 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 1047 } 1048 #endif 1049 1050 static void 1051 pmap_delayed_invl_callout_func(void *arg __unused) 1052 { 1053 1054 if (atomic_load_int(&pmap_invl_waiters) == 0) 1055 return; 1056 pmap_delayed_invl_finish_unblock(0); 1057 } 1058 1059 static void 1060 pmap_delayed_invl_callout_init(void *arg __unused) 1061 { 1062 1063 if (pmap_di_locked()) 1064 return; 1065 callout_init(&pmap_invl_callout, 1); 1066 pmap_invl_callout_inited = true; 1067 } 1068 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 1069 pmap_delayed_invl_callout_init, NULL); 1070 1071 /* 1072 * Ensure that all currently executing DI blocks, that need to flush 1073 * TLB for the given page m, actually flushed the TLB at the time the 1074 * function returned. If the page m has an empty PV list and we call 1075 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 1076 * valid mapping for the page m in either its page table or TLB. 1077 * 1078 * This function works by blocking until the global DI generation 1079 * number catches up with the generation number associated with the 1080 * given page m and its PV list. Since this function's callers 1081 * typically own an object lock and sometimes own a page lock, it 1082 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 1083 * processor. 1084 */ 1085 static void 1086 pmap_delayed_invl_wait_l(vm_page_t m) 1087 { 1088 u_long *m_gen; 1089 #ifdef PV_STATS 1090 bool accounted = false; 1091 #endif 1092 1093 m_gen = pmap_delayed_invl_genp(m); 1094 while (*m_gen > pmap_invl_gen) { 1095 #ifdef PV_STATS 1096 if (!accounted) { 1097 counter_u64_add(invl_wait, 1); 1098 accounted = true; 1099 } 1100 #endif 1101 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 1102 } 1103 } 1104 1105 static void 1106 pmap_delayed_invl_wait_u(vm_page_t m) 1107 { 1108 u_long *m_gen; 1109 struct lock_delay_arg lda; 1110 bool fast; 1111 1112 fast = true; 1113 m_gen = pmap_delayed_invl_genp(m); 1114 lock_delay_arg_init(&lda, &di_delay); 1115 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 1116 if (fast || !pmap_invl_callout_inited) { 1117 PV_STAT(counter_u64_add(invl_wait, 1)); 1118 lock_delay(&lda); 1119 fast = false; 1120 } else { 1121 /* 1122 * The page's invalidation generation number 1123 * is still below the current thread's number. 1124 * Prepare to block so that we do not waste 1125 * CPU cycles or worse, suffer livelock. 
1126 * 1127 * Since it is impossible to block without 1128 * racing with pmap_delayed_invl_finish_u(), 1129 * prepare for the race by incrementing 1130 * pmap_invl_waiters and arming a 1-tick 1131 * callout which will unblock us if we lose 1132 * the race. 1133 */ 1134 atomic_add_int(&pmap_invl_waiters, 1); 1135 1136 /* 1137 * Re-check the current thread's invalidation 1138 * generation after incrementing 1139 * pmap_invl_waiters, so that there is no race 1140 * with pmap_delayed_invl_finish_u() setting 1141 * the page generation and checking 1142 * pmap_invl_waiters. The only race allowed 1143 * is for a missed unblock, which is handled 1144 * by the callout. 1145 */ 1146 if (*m_gen > 1147 atomic_load_long(&pmap_invl_gen_head.gen)) { 1148 callout_reset(&pmap_invl_callout, 1, 1149 pmap_delayed_invl_callout_func, NULL); 1150 PV_STAT(counter_u64_add(invl_wait_slow, 1)); 1151 pmap_delayed_invl_wait_block(m_gen, 1152 &pmap_invl_gen_head.gen); 1153 } 1154 atomic_add_int(&pmap_invl_waiters, -1); 1155 } 1156 } 1157 } 1158 1159 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1160 { 1161 1162 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1163 pmap_thread_init_invl_gen_u); 1164 } 1165 1166 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1167 { 1168 1169 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1170 pmap_delayed_invl_start_u); 1171 } 1172 1173 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1174 { 1175 1176 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1177 pmap_delayed_invl_finish_u); 1178 } 1179 1180 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1181 { 1182 1183 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1184 pmap_delayed_invl_wait_u); 1185 } 1186 1187 /* 1188 * Mark the page m's PV list as participating in the current thread's 1189 * DI block. Any threads concurrently using m's PV list to remove or 1190 * restrict all mappings to m will wait for the current thread's DI 1191 * block to complete before proceeding. 1192 * 1193 * The function works by setting the DI generation number for m's PV 1194 * list to at least the DI generation number of the current thread. 1195 * This forces a caller of pmap_delayed_invl_wait() to block until 1196 * current thread calls pmap_delayed_invl_finish(). 1197 */ 1198 static void 1199 pmap_delayed_invl_page(vm_page_t m) 1200 { 1201 u_long gen, *m_gen; 1202 1203 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1204 gen = curthread->td_md.md_invl_gen.gen; 1205 if (gen == 0) 1206 return; 1207 m_gen = pmap_delayed_invl_genp(m); 1208 if (*m_gen < gen) 1209 *m_gen = gen; 1210 } 1211 1212 /* 1213 * Crashdump maps. 1214 */ 1215 static caddr_t crashdumpmap; 1216 1217 /* 1218 * Internal flags for pmap_enter()'s helper functions. 1219 */ 1220 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1221 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1222 1223 /* 1224 * Internal flags for pmap_mapdev_internal() and 1225 * pmap_change_props_locked(). 1226 */ 1227 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ 1228 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ 1229 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. 
*/ 1230 1231 TAILQ_HEAD(pv_chunklist, pv_chunk); 1232 1233 static void free_pv_chunk(struct pv_chunk *pc); 1234 static void free_pv_chunk_batch(struct pv_chunklist *batch); 1235 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1236 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1237 static int popcnt_pc_map_pq(uint64_t *map); 1238 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1239 static void reserve_pv_entries(pmap_t pmap, int needed, 1240 struct rwlock **lockp); 1241 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1242 struct rwlock **lockp); 1243 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1244 u_int flags, struct rwlock **lockp); 1245 #if VM_NRESERVLEVEL > 0 1246 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1247 struct rwlock **lockp); 1248 #endif 1249 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1250 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1251 vm_offset_t va); 1252 1253 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 1254 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 1255 vm_prot_t prot, int mode, int flags); 1256 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1257 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1258 vm_offset_t va, struct rwlock **lockp); 1259 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1260 vm_offset_t va); 1261 static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1262 vm_prot_t prot, struct rwlock **lockp); 1263 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1264 u_int flags, vm_page_t m, struct rwlock **lockp); 1265 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1266 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1267 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1268 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); 1269 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1270 vm_offset_t eva); 1271 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1272 vm_offset_t eva); 1273 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1274 pd_entry_t pde); 1275 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1276 static vm_page_t pmap_large_map_getptp_unlocked(void); 1277 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1278 #if VM_NRESERVLEVEL > 0 1279 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1280 struct rwlock **lockp); 1281 #endif 1282 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1283 vm_prot_t prot); 1284 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); 1285 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1286 bool exec); 1287 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1288 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1289 static void pmap_pti_wire_pte(void *pte); 1290 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1291 struct spglist *free, struct rwlock **lockp); 1292 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1293 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1294 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 
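/*
 * Illustrative sketch only, kept under #if 0 and not part of the build:
 * the intended ordering of the delayed invalidation (DI) primitives
 * described earlier, as a remover of a mapping would use them.  The
 * function pmap_di_example_remove() is hypothetical; the DI entry points
 * it calls (pmap_delayed_invl_start/page/finish, pmap_invalidate_page)
 * are the real ones defined or declared above.
 */
#if 0
static void
pmap_di_example_remove(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	/*
	 * Open a DI block; this thread's generation number now exceeds
	 * the global DI generation.
	 */
	pmap_delayed_invl_start();

	/*
	 * With the page's PV list lock write-held (acquisition elided in
	 * this sketch), the real code destroys the PTE and PV entry and
	 * then tags the page's PV list with this thread's DI generation,
	 * so that pmap_delayed_invl_wait(m) blocks until the block ends.
	 */
	pmap_delayed_invl_page(m);

	/*
	 * The PV list lock may be dropped here, before the TLB shootdown;
	 * this is what makes the invalidation "delayed".
	 */

	/* Flush the stale mapping from all TLBs, then close the DI block. */
	pmap_invalidate_page(pmap, va);
	pmap_delayed_invl_finish();
}
#endif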
1295 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1296 struct spglist *free); 1297 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1298 pd_entry_t *pde, struct spglist *free, 1299 struct rwlock **lockp); 1300 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1301 vm_page_t m, struct rwlock **lockp); 1302 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1303 pd_entry_t newpde); 1304 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1305 1306 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 1307 struct rwlock **lockp); 1308 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, 1309 struct rwlock **lockp, vm_offset_t va); 1310 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, 1311 struct rwlock **lockp, vm_offset_t va); 1312 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1313 struct rwlock **lockp); 1314 1315 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1316 struct spglist *free); 1317 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1318 1319 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); 1320 static void pmap_free_pt_page(pmap_t, vm_page_t, bool); 1321 1322 /********************/ 1323 /* Inline functions */ 1324 /********************/ 1325 1326 /* 1327 * Return a non-clipped indexes for a given VA, which are page table 1328 * pages indexes at the corresponding level. 1329 */ 1330 static __inline vm_pindex_t 1331 pmap_pde_pindex(vm_offset_t va) 1332 { 1333 return (va >> PDRSHIFT); 1334 } 1335 1336 static __inline vm_pindex_t 1337 pmap_pdpe_pindex(vm_offset_t va) 1338 { 1339 return (NUPDE + (va >> PDPSHIFT)); 1340 } 1341 1342 static __inline vm_pindex_t 1343 pmap_pml4e_pindex(vm_offset_t va) 1344 { 1345 return (NUPDE + NUPDPE + (va >> PML4SHIFT)); 1346 } 1347 1348 static __inline vm_pindex_t 1349 pmap_pml5e_pindex(vm_offset_t va) 1350 { 1351 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); 1352 } 1353 1354 static __inline pml4_entry_t * 1355 pmap_pml5e(pmap_t pmap, vm_offset_t va) 1356 { 1357 1358 MPASS(pmap_is_la57(pmap)); 1359 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); 1360 } 1361 1362 static __inline pml4_entry_t * 1363 pmap_pml5e_u(pmap_t pmap, vm_offset_t va) 1364 { 1365 1366 MPASS(pmap_is_la57(pmap)); 1367 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); 1368 } 1369 1370 static __inline pml4_entry_t * 1371 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) 1372 { 1373 pml4_entry_t *pml4e; 1374 1375 /* XXX MPASS(pmap_is_la57(pmap); */ 1376 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1377 return (&pml4e[pmap_pml4e_index(va)]); 1378 } 1379 1380 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1381 static __inline pml4_entry_t * 1382 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1383 { 1384 pml5_entry_t *pml5e; 1385 pml4_entry_t *pml4e; 1386 pt_entry_t PG_V; 1387 1388 if (pmap_is_la57(pmap)) { 1389 pml5e = pmap_pml5e(pmap, va); 1390 PG_V = pmap_valid_bit(pmap); 1391 if ((*pml5e & PG_V) == 0) 1392 return (NULL); 1393 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1394 } else { 1395 pml4e = pmap->pm_pmltop; 1396 } 1397 return (&pml4e[pmap_pml4e_index(va)]); 1398 } 1399 1400 static __inline pml4_entry_t * 1401 pmap_pml4e_u(pmap_t pmap, vm_offset_t va) 1402 { 1403 MPASS(!pmap_is_la57(pmap)); 1404 return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); 1405 } 
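/*
 * Illustrative sketch only, kept under #if 0 and not part of the build:
 * how the linear page table page pindex space used by the helpers above
 * is laid out.  PT pages occupy pindexes [0, NUPDE), PD pages
 * [NUPDE, NUPDE + NUPDPE), PDP pages start at NUPDE + NUPDPE, and (for
 * LA57 pmaps) PML4 pages start at NUPDE + NUPDPE + NUPML4E, matching
 * pmap_pde_pindex(), pmap_pdpe_pindex(), pmap_pml4e_pindex() and
 * pmap_pml5e_pindex().  The function name below is hypothetical.
 */
#if 0
static void
pmap_pindex_example(vm_offset_t va)
{
	vm_pindex_t pt_page, pd_page, pdp_page;

	/* One PT page maps a 2MB (NBPDR) region of VA ... */
	pt_page = pmap_pde_pindex(va);	  /* va >> PDRSHIFT */
	/* ... one PD page maps a 1GB (NBPDP) region ... */
	pd_page = pmap_pdpe_pindex(va);	  /* NUPDE + (va >> PDPSHIFT) */
	/* ... and one PDP page maps a 512GB (NBPML4) region. */
	pdp_page = pmap_pml4e_pindex(va); /* NUPDE + NUPDPE + (va >> PML4SHIFT) */

	printf("va %#lx -> pindexes pt %lu pd %lu pdp %lu\n",
	    va, pt_page, pd_page, pdp_page);
}
#endif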
1406 1407 /* Return a pointer to the PDP slot that corresponds to a VA */ 1408 static __inline pdp_entry_t * 1409 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1410 { 1411 pdp_entry_t *pdpe; 1412 1413 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1414 return (&pdpe[pmap_pdpe_index(va)]); 1415 } 1416 1417 /* Return a pointer to the PDP slot that corresponds to a VA */ 1418 static __inline pdp_entry_t * 1419 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1420 { 1421 pml4_entry_t *pml4e; 1422 pt_entry_t PG_V; 1423 1424 PG_V = pmap_valid_bit(pmap); 1425 pml4e = pmap_pml4e(pmap, va); 1426 if (pml4e == NULL || (*pml4e & PG_V) == 0) 1427 return (NULL); 1428 return (pmap_pml4e_to_pdpe(pml4e, va)); 1429 } 1430 1431 /* Return a pointer to the PD slot that corresponds to a VA */ 1432 static __inline pd_entry_t * 1433 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1434 { 1435 pd_entry_t *pde; 1436 1437 KASSERT((*pdpe & PG_PS) == 0, 1438 ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); 1439 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1440 return (&pde[pmap_pde_index(va)]); 1441 } 1442 1443 /* Return a pointer to the PD slot that corresponds to a VA */ 1444 static __inline pd_entry_t * 1445 pmap_pde(pmap_t pmap, vm_offset_t va) 1446 { 1447 pdp_entry_t *pdpe; 1448 pt_entry_t PG_V; 1449 1450 PG_V = pmap_valid_bit(pmap); 1451 pdpe = pmap_pdpe(pmap, va); 1452 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1453 return (NULL); 1454 KASSERT((*pdpe & PG_PS) == 0, 1455 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); 1456 return (pmap_pdpe_to_pde(pdpe, va)); 1457 } 1458 1459 /* Return a pointer to the PT slot that corresponds to a VA */ 1460 static __inline pt_entry_t * 1461 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1462 { 1463 pt_entry_t *pte; 1464 1465 KASSERT((*pde & PG_PS) == 0, 1466 ("%s: pde %#lx is a leaf", __func__, *pde)); 1467 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1468 return (&pte[pmap_pte_index(va)]); 1469 } 1470 1471 /* Return a pointer to the PT slot that corresponds to a VA */ 1472 static __inline pt_entry_t * 1473 pmap_pte(pmap_t pmap, vm_offset_t va) 1474 { 1475 pd_entry_t *pde; 1476 pt_entry_t PG_V; 1477 1478 PG_V = pmap_valid_bit(pmap); 1479 pde = pmap_pde(pmap, va); 1480 if (pde == NULL || (*pde & PG_V) == 0) 1481 return (NULL); 1482 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1483 return ((pt_entry_t *)pde); 1484 return (pmap_pde_to_pte(pde, va)); 1485 } 1486 1487 static __inline void 1488 pmap_resident_count_adj(pmap_t pmap, int count) 1489 { 1490 1491 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1492 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1493 ("pmap %p resident count underflow %ld %d", pmap, 1494 pmap->pm_stats.resident_count, count)); 1495 pmap->pm_stats.resident_count += count; 1496 } 1497 1498 static __inline void 1499 pmap_pt_page_count_pinit(pmap_t pmap, int count) 1500 { 1501 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1502 ("pmap %p resident count underflow %ld %d", pmap, 1503 pmap->pm_stats.resident_count, count)); 1504 pmap->pm_stats.resident_count += count; 1505 } 1506 1507 static __inline void 1508 pmap_pt_page_count_adj(pmap_t pmap, int count) 1509 { 1510 if (pmap == kernel_pmap) 1511 counter_u64_add(kernel_pt_page_count, count); 1512 else { 1513 if (pmap != NULL) 1514 pmap_resident_count_adj(pmap, count); 1515 counter_u64_add(user_pt_page_count, count); 1516 } 1517 } 1518 1519 PMAP_INLINE pt_entry_t * 1520 vtopte(vm_offset_t va) 1521 { 1522 u_int64_t mask; 1523 1524 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on 
a uva/gpa 0x%0lx", va)); 1525 1526 if (la57) { 1527 mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 1528 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); 1529 return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); 1530 } else { 1531 mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 1532 NPML4EPGSHIFT)) - 1); 1533 return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); 1534 } 1535 } 1536 1537 static __inline pd_entry_t * 1538 vtopde(vm_offset_t va) 1539 { 1540 u_int64_t mask; 1541 1542 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1543 1544 if (la57) { 1545 mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1546 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); 1547 return (P5Dmap + ((va >> PDRSHIFT) & mask)); 1548 } else { 1549 mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1550 NPML4EPGSHIFT)) - 1); 1551 return (P4Dmap + ((va >> PDRSHIFT) & mask)); 1552 } 1553 } 1554 1555 static u_int64_t 1556 allocpages(vm_paddr_t *firstaddr, int n) 1557 { 1558 u_int64_t ret; 1559 1560 ret = *firstaddr; 1561 bzero((void *)ret, n * PAGE_SIZE); 1562 *firstaddr += n * PAGE_SIZE; 1563 return (ret); 1564 } 1565 1566 CTASSERT(powerof2(NDMPML4E)); 1567 1568 /* number of kernel PDP slots */ 1569 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1570 1571 static void 1572 nkpt_init(vm_paddr_t addr) 1573 { 1574 int pt_pages; 1575 1576 #ifdef NKPT 1577 pt_pages = NKPT; 1578 #else 1579 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1580 pt_pages += NKPDPE(pt_pages); 1581 1582 /* 1583 * Add some slop beyond the bare minimum required for bootstrapping 1584 * the kernel. 1585 * 1586 * This is quite important when allocating KVA for kernel modules. 1587 * The modules are required to be linked in the negative 2GB of 1588 * the address space. If we run out of KVA in this region then 1589 * pmap_growkernel() will need to allocate page table pages to map 1590 * the entire 512GB of KVA space which is an unnecessary tax on 1591 * physical memory. 1592 * 1593 * Secondly, device memory mapped as part of setting up the low- 1594 * level console(s) is taken from KVA, starting at virtual_avail. 1595 * This is because cninit() is called after pmap_bootstrap() but 1596 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1597 * not uncommon. 1598 */ 1599 pt_pages += 32; /* 64MB additional slop. */ 1600 #endif 1601 nkpt = pt_pages; 1602 } 1603 1604 /* 1605 * Returns the proper write/execute permission for a physical page that is 1606 * part of the initial boot allocations. 1607 * 1608 * If the page has kernel text, it is marked as read-only. If the page has 1609 * kernel read-only data, it is marked as read-only/not-executable. If the 1610 * page has only read-write data, it is marked as read-write/not-executable. 1611 * If the page is below/above the kernel range, it is marked as read-write. 1612 * 1613 * This function operates on 2M pages, since we map the kernel space that 1614 * way. 1615 */ 1616 static inline pt_entry_t 1617 bootaddr_rwx(vm_paddr_t pa) 1618 { 1619 /* 1620 * The kernel is loaded at a 2MB-aligned address, and memory below that 1621 * need not be executable. The .bss section is padded to a 2MB 1622 * boundary, so memory following the kernel need not be executable 1623 * either. Preloaded kernel modules have their mapping permissions 1624 * fixed up by the linker. 
1625 */ 1626 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1627 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1628 return (X86_PG_RW | pg_nx); 1629 1630 /* 1631 * The linker should ensure that the read-only and read-write 1632 * portions don't share the same 2M page, so this shouldn't 1633 * impact read-only data. However, in any case, any page with 1634 * read-write data needs to be read-write. 1635 */ 1636 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1637 return (X86_PG_RW | pg_nx); 1638 1639 /* 1640 * Mark any 2M page containing kernel text as read-only. Mark 1641 * other pages with read-only data as read-only and not executable. 1642 * (It is likely a small portion of the read-only data section will 1643 * be marked as read-only, but executable. This should be acceptable 1644 * since the read-only protection will keep the data from changing.) 1645 * Note that fixups to the .text section will still work until we 1646 * set CR0.WP. 1647 */ 1648 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1649 return (0); 1650 return (pg_nx); 1651 } 1652 1653 static void 1654 create_pagetables(vm_paddr_t *firstaddr) 1655 { 1656 pd_entry_t *pd_p; 1657 pdp_entry_t *pdp_p; 1658 pml4_entry_t *p4_p; 1659 uint64_t DMPDkernphys; 1660 vm_paddr_t pax; 1661 #ifdef KASAN 1662 pt_entry_t *pt_p; 1663 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1664 vm_offset_t kasankernbase; 1665 int kasankpdpi, kasankpdi, nkasanpte; 1666 #endif 1667 int i, j, ndm1g, nkpdpe, nkdmpde; 1668 1669 /* Allocate page table pages for the direct map */ 1670 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1671 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1672 ndmpdp = 4; 1673 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1674 if (ndmpdpphys > NDMPML4E) { 1675 /* 1676 * Each NDMPML4E allows 512 GB, so limit to that, 1677 * and then readjust ndmpdp and ndmpdpphys. 1678 */ 1679 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1680 Maxmem = atop(NDMPML4E * NBPML4); 1681 ndmpdpphys = NDMPML4E; 1682 ndmpdp = NDMPML4E * NPDEPG; 1683 } 1684 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1685 ndm1g = 0; 1686 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1687 /* 1688 * Calculate the number of 1G pages that will fully fit in 1689 * Maxmem. 1690 */ 1691 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1692 1693 /* 1694 * Allocate 2M pages for the kernel. These will be used in 1695 * place of the one or more 1G pages from ndm1g that maps 1696 * kernel memory into DMAP. 1697 */ 1698 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1699 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1700 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1701 } 1702 if (ndm1g < ndmpdp) 1703 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1704 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1705 1706 /* Allocate pages. */ 1707 KPML4phys = allocpages(firstaddr, 1); 1708 KPDPphys = allocpages(firstaddr, NKPML4E); 1709 #ifdef KASAN 1710 KASANPDPphys = allocpages(firstaddr, NKASANPML4E); 1711 KASANPDphys = allocpages(firstaddr, 1); 1712 #endif 1713 #ifdef KMSAN 1714 /* 1715 * The KMSAN shadow maps are initially left unpopulated, since there is 1716 * no need to shadow memory above KERNBASE. 1717 */ 1718 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); 1719 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); 1720 #endif 1721 1722 /* 1723 * Allocate the initial number of kernel page table pages required to 1724 * bootstrap. We defer this until after all memory-size dependent 1725 * allocations are done (e.g. 
direct map), so that we don't have to 1726 * build in too much slop in our estimate. 1727 * 1728 * Note that when NKPML4E > 1, we have an empty page underneath 1729 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1730 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 1731 */ 1732 nkpt_init(*firstaddr); 1733 nkpdpe = NKPDPE(nkpt); 1734 1735 KPTphys = allocpages(firstaddr, nkpt); 1736 KPDphys = allocpages(firstaddr, nkpdpe); 1737 1738 #ifdef KASAN 1739 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); 1740 KASANPTphys = allocpages(firstaddr, nkasanpte); 1741 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); 1742 #endif 1743 1744 /* 1745 * Connect the zero-filled PT pages to their PD entries. This 1746 * implicitly maps the PT pages at their correct locations within 1747 * the PTmap. 1748 */ 1749 pd_p = (pd_entry_t *)KPDphys; 1750 for (i = 0; i < nkpt; i++) 1751 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1752 1753 /* 1754 * Map from start of the kernel in physical memory (staging 1755 * area) to the end of loader preallocated memory using 2MB 1756 * pages. This replaces some of the PD entries created above. 1757 * For compatibility, identity map 2M at the start. 1758 */ 1759 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | 1760 X86_PG_RW | pg_nx; 1761 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { 1762 /* Preset PG_M and PG_A because demotion expects it. */ 1763 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1764 X86_PG_A | bootaddr_rwx(pax); 1765 } 1766 1767 /* 1768 * Because we map the physical blocks in 2M pages, adjust firstaddr 1769 * to record the physical blocks we've actually mapped into kernel 1770 * virtual address space. 1771 */ 1772 if (*firstaddr < round_2mpage(KERNend)) 1773 *firstaddr = round_2mpage(KERNend); 1774 1775 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1776 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1777 for (i = 0; i < nkpdpe; i++) 1778 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1779 1780 #ifdef KASAN 1781 kasankernbase = kasan_md_addr_to_shad(KERNBASE); 1782 kasankpdpi = pmap_pdpe_index(kasankernbase); 1783 kasankpdi = pmap_pde_index(kasankernbase); 1784 1785 pdp_p = (pdp_entry_t *)KASANPDPphys; 1786 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); 1787 1788 pd_p = (pd_entry_t *)KASANPDphys; 1789 for (i = 0; i < nkasanpte; i++) 1790 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | 1791 X86_PG_V | pg_nx; 1792 1793 pt_p = (pt_entry_t *)KASANPTphys; 1794 for (i = 0; i < nkasanpte * NPTEPG; i++) 1795 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 1796 X86_PG_M | X86_PG_A | pg_nx; 1797 #endif 1798 1799 /* 1800 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1801 * the end of physical memory is not aligned to a 1GB page boundary, 1802 * then the residual physical memory is mapped with 2MB pages. Later, 1803 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1804 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1805 * that are partially used. 1806 */ 1807 pd_p = (pd_entry_t *)DMPDphys; 1808 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1809 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1810 /* Preset PG_M and PG_A because demotion expects it. 
*/ 1811 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1812 X86_PG_M | X86_PG_A | pg_nx; 1813 } 1814 pdp_p = (pdp_entry_t *)DMPDPphys; 1815 for (i = 0; i < ndm1g; i++) { 1816 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1817 /* Preset PG_M and PG_A because demotion expects it. */ 1818 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1819 X86_PG_M | X86_PG_A | pg_nx; 1820 } 1821 for (j = 0; i < ndmpdp; i++, j++) { 1822 pdp_p[i] = DMPDphys + ptoa(j); 1823 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1824 } 1825 1826 /* 1827 * Instead of using a 1G page for the memory containing the kernel, 1828 * use 2M pages with read-only and no-execute permissions. (If using 1G 1829 * pages, this will partially overwrite the PDPEs above.) 1830 */ 1831 if (ndm1g > 0) { 1832 pd_p = (pd_entry_t *)DMPDkernphys; 1833 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1834 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1835 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1836 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1837 } 1838 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1839 for (i = 0; i < nkdmpde; i++) { 1840 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1841 X86_PG_RW | X86_PG_V | pg_nx; 1842 } 1843 } 1844 1845 /* And recursively map PML4 to itself in order to get PTmap */ 1846 p4_p = (pml4_entry_t *)KPML4phys; 1847 p4_p[PML4PML4I] = KPML4phys; 1848 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1849 1850 #ifdef KASAN 1851 /* Connect the KASAN shadow map slots up to the PML4. */ 1852 for (i = 0; i < NKASANPML4E; i++) { 1853 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1854 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1855 } 1856 #endif 1857 1858 #ifdef KMSAN 1859 /* Connect the KMSAN shadow map slots up to the PML4. */ 1860 for (i = 0; i < NKMSANSHADPML4E; i++) { 1861 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1862 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1863 } 1864 1865 /* Connect the KMSAN origin map slots up to the PML4. */ 1866 for (i = 0; i < NKMSANORIGPML4E; i++) { 1867 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1868 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1869 } 1870 #endif 1871 1872 /* Connect the Direct Map slots up to the PML4. */ 1873 for (i = 0; i < ndmpdpphys; i++) { 1874 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1875 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1876 } 1877 1878 /* Connect the KVA slots up to the PML4 */ 1879 for (i = 0; i < NKPML4E; i++) { 1880 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1881 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1882 } 1883 1884 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1885 } 1886 1887 /* 1888 * Bootstrap the system enough to run with virtual memory. 1889 * 1890 * On amd64 this is called after mapping has already been enabled 1891 * and just syncs the pmap module with what has already been done. 1892 * [We can't call it easily with mapping off since the kernel is not 1893 * mapped with PA == VA, hence we would have to relocate every address 1894 * from the linked base (virtual) address "KERNBASE" to the actual 1895 * (physical) address starting relative to 0] 1896 */ 1897 void 1898 pmap_bootstrap(vm_paddr_t *firstaddr) 1899 { 1900 vm_offset_t va; 1901 pt_entry_t *pte, *pcpu_pte; 1902 struct region_descriptor r_gdt; 1903 uint64_t cr4, pcpu_phys; 1904 u_long res; 1905 int i; 1906 1907 KERNend = *firstaddr; 1908 res = atop(KERNend - (vm_paddr_t)kernphys); 1909 1910 if (!pti) 1911 pg_g = X86_PG_G; 1912 1913 /* 1914 * Create an initial set of page tables to run the kernel in. 
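* create_pagetables() still runs on the bootstrap page tables supplied by the loader, which map the newly allocated pages one to one, so the new page table pages are written through their physical addresses.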
1915 */ 1916 create_pagetables(firstaddr); 1917 1918 pcpu_phys = allocpages(firstaddr, MAXCPU); 1919 1920 /* 1921 * Add a physical memory segment (vm_phys_seg) corresponding to the 1922 * preallocated kernel page table pages so that vm_page structures 1923 * representing these pages will be created. The vm_page structures 1924 * are required for promotion of the corresponding kernel virtual 1925 * addresses to superpage mappings. 1926 */ 1927 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1928 1929 /* 1930 * Account for the virtual addresses mapped by create_pagetables(). 1931 */ 1932 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1933 (vm_paddr_t)kernphys); 1934 virtual_end = VM_MAX_KERNEL_ADDRESS; 1935 1936 /* 1937 * Enable PG_G global pages, then switch to the kernel page 1938 * table from the bootstrap page table. After the switch, it 1939 * is possible to enable SMEP and SMAP since PG_U bits are 1940 * correct now. 1941 */ 1942 cr4 = rcr4(); 1943 cr4 |= CR4_PGE; 1944 load_cr4(cr4); 1945 load_cr3(KPML4phys); 1946 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1947 cr4 |= CR4_SMEP; 1948 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1949 cr4 |= CR4_SMAP; 1950 load_cr4(cr4); 1951 1952 /* 1953 * Initialize the kernel pmap (which is statically allocated). 1954 * Count bootstrap data as being resident in case any of this data is 1955 * later unmapped (using pmap_remove()) and freed. 1956 */ 1957 PMAP_LOCK_INIT(kernel_pmap); 1958 kernel_pmap->pm_pmltop = kernel_pml4; 1959 kernel_pmap->pm_cr3 = KPML4phys; 1960 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1961 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1962 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1963 kernel_pmap->pm_stats.resident_count = res; 1964 kernel_pmap->pm_flags = pmap_flags; 1965 1966 /* 1967 * Initialize the TLB invalidations generation number lock. 1968 */ 1969 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1970 1971 /* 1972 * Reserve some special page table entries/VA space for temporary 1973 * mapping of pages. 1974 */ 1975 #define SYSMAP(c, p, v, n) \ 1976 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1977 1978 va = virtual_avail; 1979 pte = vtopte(va); 1980 1981 /* 1982 * Crashdump maps. The first page is reused as CMAP1 for the 1983 * memory test. 1984 */ 1985 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 1986 CADDR1 = crashdumpmap; 1987 1988 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 1989 virtual_avail = va; 1990 1991 for (i = 0; i < MAXCPU; i++) { 1992 pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | 1993 pg_g | pg_nx | X86_PG_M | X86_PG_A; 1994 } 1995 1996 /* 1997 * Re-initialize PCPU area for BSP after switching. 1998 * Make hardware use gdt and common_tss from the new PCPU. 
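* Up to this point the BSP has been running on temp_bsp_pcpu; its dynamic per-CPU data and ACPI id are carried over below.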
1999 */ 2000 STAILQ_INIT(&cpuhead); 2001 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2002 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2003 amd64_bsp_pcpu_init1(&__pcpu[0]); 2004 amd64_bsp_ist_init(&__pcpu[0]); 2005 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2006 IOPERM_BITMAP_SIZE; 2007 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2008 sizeof(struct user_segment_descriptor)); 2009 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2010 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2011 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2012 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2013 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2014 lgdt(&r_gdt); 2015 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2016 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2017 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2018 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2019 2020 /* 2021 * Initialize the PAT MSR. 2022 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2023 * side-effect, invalidates stale PG_G TLB entries that might 2024 * have been created in our pre-boot environment. 2025 */ 2026 pmap_init_pat(); 2027 2028 /* Initialize TLB Context Id. */ 2029 if (pmap_pcid_enabled) { 2030 for (i = 0; i < MAXCPU; i++) { 2031 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 2032 kernel_pmap->pm_pcids[i].pm_gen = 1; 2033 } 2034 2035 /* 2036 * PMAP_PCID_KERN + 1 is used for initialization of 2037 * proc0 pmap. The pmap' pcid state might be used by 2038 * EFIRT entry before first context switch, so it 2039 * needs to be valid. 2040 */ 2041 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2042 PCPU_SET(pcid_gen, 1); 2043 2044 /* 2045 * pcpu area for APs is zeroed during AP startup. 2046 * pc_pcid_next and pc_pcid_gen are initialized by AP 2047 * during pcpu setup. 2048 */ 2049 load_cr4(rcr4() | CR4_PCIDE); 2050 } 2051 } 2052 2053 /* 2054 * Setup the PAT MSR. 2055 */ 2056 void 2057 pmap_init_pat(void) 2058 { 2059 uint64_t pat_msr; 2060 u_long cr0, cr4; 2061 int i; 2062 2063 /* Bail if this CPU doesn't implement PAT. */ 2064 if ((cpu_feature & CPUID_PAT) == 0) 2065 panic("no PAT??"); 2066 2067 /* Set default PAT index table. */ 2068 for (i = 0; i < PAT_INDEX_SIZE; i++) 2069 pat_index[i] = -1; 2070 pat_index[PAT_WRITE_BACK] = 0; 2071 pat_index[PAT_WRITE_THROUGH] = 1; 2072 pat_index[PAT_UNCACHEABLE] = 3; 2073 pat_index[PAT_WRITE_COMBINING] = 6; 2074 pat_index[PAT_WRITE_PROTECTED] = 5; 2075 pat_index[PAT_UNCACHED] = 2; 2076 2077 /* 2078 * Initialize default PAT entries. 2079 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2080 * Program 5 and 6 as WP and WC. 2081 * 2082 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2083 * mapping for a 2M page uses a PAT value with the bit 3 set due 2084 * to its overload with PG_PS. 2085 */ 2086 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2087 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2088 PAT_VALUE(2, PAT_UNCACHED) | 2089 PAT_VALUE(3, PAT_UNCACHEABLE) | 2090 PAT_VALUE(4, PAT_WRITE_BACK) | 2091 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2092 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2093 PAT_VALUE(7, PAT_UNCACHEABLE); 2094 2095 /* Disable PGE. */ 2096 cr4 = rcr4(); 2097 load_cr4(cr4 & ~CR4_PGE); 2098 2099 /* Disable caches (CD = 1, NW = 0). */ 2100 cr0 = rcr0(); 2101 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2102 2103 /* Flushes caches and TLBs. */ 2104 wbinvd(); 2105 invltlb(); 2106 2107 /* Update PAT and index table. */ 2108 wrmsr(MSR_PAT, pat_msr); 2109 2110 /* Flush caches and TLBs again. 
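This ensures nothing remains cached or translated under the old PAT settings.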
*/ 2111 wbinvd(); 2112 invltlb(); 2113 2114 /* Restore caches and PGE. */ 2115 load_cr0(cr0); 2116 load_cr4(cr4); 2117 } 2118 2119 vm_page_t 2120 pmap_page_alloc_below_4g(bool zeroed) 2121 { 2122 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2123 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2124 } 2125 2126 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2127 la57_trampoline_gdt[], la57_trampoline_end[]; 2128 2129 static void 2130 pmap_bootstrap_la57(void *arg __unused) 2131 { 2132 char *v_code; 2133 pml5_entry_t *v_pml5; 2134 pml4_entry_t *v_pml4; 2135 pdp_entry_t *v_pdp; 2136 pd_entry_t *v_pd; 2137 pt_entry_t *v_pt; 2138 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2139 void (*la57_tramp)(uint64_t pml5); 2140 struct region_descriptor r_gdt; 2141 2142 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2143 return; 2144 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2145 if (!la57) 2146 return; 2147 2148 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2149 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2150 2151 m_code = pmap_page_alloc_below_4g(true); 2152 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2153 m_pml5 = pmap_page_alloc_below_4g(true); 2154 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2155 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2156 m_pml4 = pmap_page_alloc_below_4g(true); 2157 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2158 m_pdp = pmap_page_alloc_below_4g(true); 2159 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2160 m_pd = pmap_page_alloc_below_4g(true); 2161 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2162 m_pt = pmap_page_alloc_below_4g(true); 2163 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2164 2165 /* 2166 * Map m_code 1:1, it appears below 4G in KVA due to physical 2167 * address being below 4G. Since kernel KVA is in upper half, 2168 * the pml4e should be zero and free for temporary use. 2169 */ 2170 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2171 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2172 X86_PG_M; 2173 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2174 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2175 X86_PG_M; 2176 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2177 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2178 X86_PG_M; 2179 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2180 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2181 X86_PG_M; 2182 2183 /* 2184 * Add pml5 entry at top of KVA pointing to existing pml4 table, 2185 * entering all existing kernel mappings into level 5 table. 2186 */ 2187 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2188 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; 2189 2190 /* 2191 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. 2192 */ 2193 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2194 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2195 X86_PG_M; 2196 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2197 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2198 X86_PG_M; 2199 2200 /* 2201 * Copy and call the 48->57 trampoline, hope we return there, alive. 
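* The call goes through the 1:1 mapping created above, since %cr3 is reloaded with the new PML5 root while the trampoline is executing.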
2202 */ 2203 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2204 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2205 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2206 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2207 invlpg((vm_offset_t)la57_tramp); 2208 la57_tramp(KPML5phys); 2209 2210 /* 2211 * gdt was necessary reset, switch back to our gdt. 2212 */ 2213 lgdt(&r_gdt); 2214 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2215 load_ds(_udatasel); 2216 load_es(_udatasel); 2217 load_fs(_ufssel); 2218 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2219 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2220 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2221 2222 /* 2223 * Now unmap the trampoline, and free the pages. 2224 * Clear pml5 entry used for 1:1 trampoline mapping. 2225 */ 2226 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2227 invlpg((vm_offset_t)v_code); 2228 vm_page_free(m_code); 2229 vm_page_free(m_pdp); 2230 vm_page_free(m_pd); 2231 vm_page_free(m_pt); 2232 2233 /* 2234 * Recursively map PML5 to itself in order to get PTmap and 2235 * PDmap. 2236 */ 2237 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2238 2239 kernel_pmap->pm_cr3 = KPML5phys; 2240 kernel_pmap->pm_pmltop = v_pml5; 2241 pmap_pt_page_count_adj(kernel_pmap, 1); 2242 } 2243 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2244 2245 /* 2246 * Initialize a vm_page's machine-dependent fields. 2247 */ 2248 void 2249 pmap_page_init(vm_page_t m) 2250 { 2251 2252 TAILQ_INIT(&m->md.pv_list); 2253 m->md.pat_mode = PAT_WRITE_BACK; 2254 } 2255 2256 static int pmap_allow_2m_x_ept; 2257 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2258 &pmap_allow_2m_x_ept, 0, 2259 "Allow executable superpage mappings in EPT"); 2260 2261 void 2262 pmap_allow_2m_x_ept_recalculate(void) 2263 { 2264 /* 2265 * SKL002, SKL012S. Since the EPT format is only used by 2266 * Intel CPUs, the vendor check is merely a formality. 2267 */ 2268 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2269 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2270 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2271 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2272 CPUID_TO_MODEL(cpu_id) == 0x27 || 2273 CPUID_TO_MODEL(cpu_id) == 0x35 || 2274 CPUID_TO_MODEL(cpu_id) == 0x36 || 2275 CPUID_TO_MODEL(cpu_id) == 0x37 || 2276 CPUID_TO_MODEL(cpu_id) == 0x86 || 2277 CPUID_TO_MODEL(cpu_id) == 0x1c || 2278 CPUID_TO_MODEL(cpu_id) == 0x4a || 2279 CPUID_TO_MODEL(cpu_id) == 0x4c || 2280 CPUID_TO_MODEL(cpu_id) == 0x4d || 2281 CPUID_TO_MODEL(cpu_id) == 0x5a || 2282 CPUID_TO_MODEL(cpu_id) == 0x5c || 2283 CPUID_TO_MODEL(cpu_id) == 0x5d || 2284 CPUID_TO_MODEL(cpu_id) == 0x5f || 2285 CPUID_TO_MODEL(cpu_id) == 0x6e || 2286 CPUID_TO_MODEL(cpu_id) == 0x7a || 2287 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2288 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2289 pmap_allow_2m_x_ept = 1; 2290 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2291 } 2292 2293 static bool 2294 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2295 { 2296 2297 return (pmap->pm_type != PT_EPT || !executable || 2298 !pmap_allow_2m_x_ept); 2299 } 2300 2301 #ifdef NUMA 2302 static void 2303 pmap_init_pv_table(void) 2304 { 2305 struct pmap_large_md_page *pvd; 2306 vm_size_t s; 2307 long start, end, highest, pv_npg; 2308 int domain, i, j, pages; 2309 2310 /* 2311 * We strongly depend on the size being a power of two, so the assert 2312 * is overzealous. 
However, should the struct be resized to a 2313 * different power of two, the code below needs to be revisited. 2314 */ 2315 CTASSERT((sizeof(*pvd) == 64)); 2316 2317 /* 2318 * Calculate the size of the array. 2319 */ 2320 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2321 pv_npg = howmany(pmap_last_pa, NBPDR); 2322 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2323 s = round_page(s); 2324 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2325 if (pv_table == NULL) 2326 panic("%s: kva_alloc failed\n", __func__); 2327 2328 /* 2329 * Iterate physical segments to allocate space for respective pages. 2330 */ 2331 highest = -1; 2332 s = 0; 2333 for (i = 0; i < vm_phys_nsegs; i++) { 2334 end = vm_phys_segs[i].end / NBPDR; 2335 domain = vm_phys_segs[i].domain; 2336 2337 if (highest >= end) 2338 continue; 2339 2340 start = highest + 1; 2341 pvd = &pv_table[start]; 2342 2343 pages = end - start + 1; 2344 s = round_page(pages * sizeof(*pvd)); 2345 highest = start + (s / sizeof(*pvd)) - 1; 2346 2347 for (j = 0; j < s; j += PAGE_SIZE) { 2348 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2349 if (m == NULL) 2350 panic("failed to allocate PV table page"); 2351 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2352 } 2353 2354 for (j = 0; j < s / sizeof(*pvd); j++) { 2355 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2356 TAILQ_INIT(&pvd->pv_page.pv_list); 2357 pvd->pv_page.pv_gen = 0; 2358 pvd->pv_page.pat_mode = 0; 2359 pvd->pv_invl_gen = 0; 2360 pvd++; 2361 } 2362 } 2363 pvd = &pv_dummy_large; 2364 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2365 TAILQ_INIT(&pvd->pv_page.pv_list); 2366 pvd->pv_page.pv_gen = 0; 2367 pvd->pv_page.pat_mode = 0; 2368 pvd->pv_invl_gen = 0; 2369 } 2370 #else 2371 static void 2372 pmap_init_pv_table(void) 2373 { 2374 vm_size_t s; 2375 long i, pv_npg; 2376 2377 /* 2378 * Initialize the pool of pv list locks. 2379 */ 2380 for (i = 0; i < NPV_LIST_LOCKS; i++) 2381 rw_init(&pv_list_locks[i], "pmap pv list"); 2382 2383 /* 2384 * Calculate the size of the pv head table for superpages. 2385 */ 2386 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2387 2388 /* 2389 * Allocate memory for the pv head table for superpages. 2390 */ 2391 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2392 s = round_page(s); 2393 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 2394 for (i = 0; i < pv_npg; i++) 2395 TAILQ_INIT(&pv_table[i].pv_list); 2396 TAILQ_INIT(&pv_dummy.pv_list); 2397 } 2398 #endif 2399 2400 /* 2401 * Initialize the pmap module. 2402 * Called by vm_init, to initialize any structures that the pmap 2403 * system needs to map virtual memory. 2404 */ 2405 void 2406 pmap_init(void) 2407 { 2408 struct pmap_preinit_mapping *ppim; 2409 vm_page_t m, mpte; 2410 int error, i, ret, skz63; 2411 2412 /* L1TF, reserve page @0 unconditionally */ 2413 vm_page_blacklist_add(0, bootverbose); 2414 2415 /* Detect bare-metal Skylake Server and Skylake-X. */ 2416 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2417 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2418 /* 2419 * Skylake-X errata SKZ63. Processor May Hang When 2420 * Executing Code In an HLE Transaction Region between 2421 * 40000000H and 403FFFFFH. 2422 * 2423 * Mark the pages in the range as preallocated. It 2424 * seems to be impossible to distinguish between 2425 * Skylake Server and Skylake X. 
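* Setting the loader tunable hw.skz63_enable to 0 disables this workaround.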
2426 */ 2427 skz63 = 1; 2428 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2429 if (skz63 != 0) { 2430 if (bootverbose) 2431 printf("SKZ63: skipping 4M RAM starting " 2432 "at physical 1G\n"); 2433 for (i = 0; i < atop(0x400000); i++) { 2434 ret = vm_page_blacklist_add(0x40000000 + 2435 ptoa(i), FALSE); 2436 if (!ret && bootverbose) 2437 printf("page at %#lx already used\n", 2438 0x40000000 + ptoa(i)); 2439 } 2440 } 2441 } 2442 2443 /* IFU */ 2444 pmap_allow_2m_x_ept_recalculate(); 2445 2446 /* 2447 * Initialize the vm page array entries for the kernel pmap's 2448 * page table pages. 2449 */ 2450 PMAP_LOCK(kernel_pmap); 2451 for (i = 0; i < nkpt; i++) { 2452 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2453 KASSERT(mpte >= vm_page_array && 2454 mpte < &vm_page_array[vm_page_array_size], 2455 ("pmap_init: page table page is out of range")); 2456 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2457 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2458 mpte->ref_count = 1; 2459 2460 /* 2461 * Collect the page table pages that were replaced by a 2MB 2462 * page in create_pagetables(). They are zero filled. 2463 */ 2464 if ((i == 0 || 2465 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2466 pmap_insert_pt_page(kernel_pmap, mpte, false)) 2467 panic("pmap_init: pmap_insert_pt_page failed"); 2468 } 2469 PMAP_UNLOCK(kernel_pmap); 2470 vm_wire_add(nkpt); 2471 2472 /* 2473 * If the kernel is running on a virtual machine, then it must assume 2474 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2475 * be prepared for the hypervisor changing the vendor and family that 2476 * are reported by CPUID. Consequently, the workaround for AMD Family 2477 * 10h Erratum 383 is enabled if the processor's feature set does not 2478 * include at least one feature that is only supported by older Intel 2479 * or newer AMD processors. 2480 */ 2481 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2482 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2483 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2484 AMDID2_FMA4)) == 0) 2485 workaround_erratum383 = 1; 2486 2487 /* 2488 * Are large page mappings enabled? 2489 */ 2490 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2491 if (pg_ps_enabled) { 2492 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2493 ("pmap_init: can't assign to pagesizes[1]")); 2494 pagesizes[1] = NBPDR; 2495 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2496 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2497 ("pmap_init: can't assign to pagesizes[2]")); 2498 pagesizes[2] = NBPDP; 2499 } 2500 } 2501 2502 /* 2503 * Initialize pv chunk lists. 
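* One list, with its own lock, is maintained per memory domain (PMAP_MEMDOM).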
2504 */ 2505 for (i = 0; i < PMAP_MEMDOM; i++) { 2506 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2507 TAILQ_INIT(&pv_chunks[i].pvc_list); 2508 } 2509 pmap_init_pv_table(); 2510 2511 pmap_initialized = 1; 2512 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2513 ppim = pmap_preinit_mapping + i; 2514 if (ppim->va == 0) 2515 continue; 2516 /* Make the direct map consistent */ 2517 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2518 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2519 ppim->sz, ppim->mode); 2520 } 2521 if (!bootverbose) 2522 continue; 2523 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2524 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2525 } 2526 2527 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2528 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2529 (vmem_addr_t *)&qframe); 2530 if (error != 0) 2531 panic("qframe allocation failed"); 2532 2533 lm_ents = 8; 2534 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2535 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2536 lm_ents = LMEPML4I - LMSPML4I + 1; 2537 #ifdef KMSAN 2538 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2539 printf( 2540 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2541 lm_ents, KMSANORIGPML4I - LMSPML4I); 2542 lm_ents = KMSANORIGPML4I - LMSPML4I; 2543 } 2544 #endif 2545 if (bootverbose) 2546 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2547 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2548 if (lm_ents != 0) { 2549 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2550 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2551 if (large_vmem == NULL) { 2552 printf("pmap: cannot create large map\n"); 2553 lm_ents = 0; 2554 } 2555 for (i = 0; i < lm_ents; i++) { 2556 m = pmap_large_map_getptp_unlocked(); 2557 /* XXXKIB la57 */ 2558 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2559 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2560 VM_PAGE_TO_PHYS(m); 2561 } 2562 } 2563 } 2564 2565 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2566 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2567 "Maximum number of PML4 entries for use by large map (tunable). " 2568 "Each entry corresponds to 512GB of address space."); 2569 2570 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2571 "2MB page mapping counters"); 2572 2573 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2574 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2575 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2576 2577 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2578 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2579 &pmap_pde_mappings, "2MB page mappings"); 2580 2581 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2582 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2583 &pmap_pde_p_failures, "2MB page promotion failures"); 2584 2585 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2586 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2587 &pmap_pde_promotions, "2MB page promotions"); 2588 2589 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2590 "1GB page mapping counters"); 2591 2592 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2593 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2594 &pmap_pdpe_demotions, "1GB page demotions"); 2595 2596 /*************************************************** 2597 * Low level helper routines..... 
2598 ***************************************************/ 2599 2600 static pt_entry_t 2601 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2602 { 2603 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2604 2605 switch (pmap->pm_type) { 2606 case PT_X86: 2607 case PT_RVI: 2608 /* Verify that both PAT bits are not set at the same time */ 2609 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2610 ("Invalid PAT bits in entry %#lx", entry)); 2611 2612 /* Swap the PAT bits if one of them is set */ 2613 if ((entry & x86_pat_bits) != 0) 2614 entry ^= x86_pat_bits; 2615 break; 2616 case PT_EPT: 2617 /* 2618 * Nothing to do - the memory attributes are represented 2619 * the same way for regular pages and superpages. 2620 */ 2621 break; 2622 default: 2623 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2624 } 2625 2626 return (entry); 2627 } 2628 2629 boolean_t 2630 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2631 { 2632 2633 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2634 pat_index[(int)mode] >= 0); 2635 } 2636 2637 /* 2638 * Determine the appropriate bits to set in a PTE or PDE for a specified 2639 * caching mode. 2640 */ 2641 int 2642 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 2643 { 2644 int cache_bits, pat_flag, pat_idx; 2645 2646 if (!pmap_is_valid_memattr(pmap, mode)) 2647 panic("Unknown caching mode %d\n", mode); 2648 2649 switch (pmap->pm_type) { 2650 case PT_X86: 2651 case PT_RVI: 2652 /* The PAT bit is different for PTE's and PDE's. */ 2653 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2654 2655 /* Map the caching mode to a PAT index. */ 2656 pat_idx = pat_index[mode]; 2657 2658 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2659 cache_bits = 0; 2660 if (pat_idx & 0x4) 2661 cache_bits |= pat_flag; 2662 if (pat_idx & 0x2) 2663 cache_bits |= PG_NC_PCD; 2664 if (pat_idx & 0x1) 2665 cache_bits |= PG_NC_PWT; 2666 break; 2667 2668 case PT_EPT: 2669 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2670 break; 2671 2672 default: 2673 panic("unsupported pmap type %d", pmap->pm_type); 2674 } 2675 2676 return (cache_bits); 2677 } 2678 2679 static int 2680 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 2681 { 2682 int mask; 2683 2684 switch (pmap->pm_type) { 2685 case PT_X86: 2686 case PT_RVI: 2687 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2688 break; 2689 case PT_EPT: 2690 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2691 break; 2692 default: 2693 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2694 } 2695 2696 return (mask); 2697 } 2698 2699 static int 2700 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2701 { 2702 int pat_flag, pat_idx; 2703 2704 pat_idx = 0; 2705 switch (pmap->pm_type) { 2706 case PT_X86: 2707 case PT_RVI: 2708 /* The PAT bit is different for PTE's and PDE's. */ 2709 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2710 2711 if ((pte & pat_flag) != 0) 2712 pat_idx |= 0x4; 2713 if ((pte & PG_NC_PCD) != 0) 2714 pat_idx |= 0x2; 2715 if ((pte & PG_NC_PWT) != 0) 2716 pat_idx |= 0x1; 2717 break; 2718 case PT_EPT: 2719 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2720 panic("EPT PTE %#lx has no PAT memory type", pte); 2721 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2722 break; 2723 } 2724 2725 /* See pmap_init_pat(). 
*/ 2726 if (pat_idx == 4) 2727 pat_idx = 0; 2728 if (pat_idx == 7) 2729 pat_idx = 3; 2730 2731 return (pat_idx); 2732 } 2733 2734 bool 2735 pmap_ps_enabled(pmap_t pmap) 2736 { 2737 2738 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2739 } 2740 2741 static void 2742 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2743 { 2744 2745 switch (pmap->pm_type) { 2746 case PT_X86: 2747 break; 2748 case PT_RVI: 2749 case PT_EPT: 2750 /* 2751 * XXX 2752 * This is a little bogus since the generation number is 2753 * supposed to be bumped up when a region of the address 2754 * space is invalidated in the page tables. 2755 * 2756 * In this case the old PDE entry is valid but yet we want 2757 * to make sure that any mappings using the old entry are 2758 * invalidated in the TLB. 2759 * 2760 * The reason this works as expected is because we rendezvous 2761 * "all" host cpus and force any vcpu context to exit as a 2762 * side-effect. 2763 */ 2764 atomic_add_long(&pmap->pm_eptgen, 1); 2765 break; 2766 default: 2767 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2768 } 2769 pde_store(pde, newpde); 2770 } 2771 2772 /* 2773 * After changing the page size for the specified virtual address in the page 2774 * table, flush the corresponding entries from the processor's TLB. Only the 2775 * calling processor's TLB is affected. 2776 * 2777 * The calling thread must be pinned to a processor. 2778 */ 2779 static void 2780 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2781 { 2782 pt_entry_t PG_G; 2783 2784 if (pmap_type_guest(pmap)) 2785 return; 2786 2787 KASSERT(pmap->pm_type == PT_X86, 2788 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2789 2790 PG_G = pmap_global_bit(pmap); 2791 2792 if ((newpde & PG_PS) == 0) 2793 /* Demotion: flush a specific 2MB page mapping. */ 2794 invlpg(va); 2795 else if ((newpde & PG_G) == 0) 2796 /* 2797 * Promotion: flush every 4KB page mapping from the TLB 2798 * because there are too many to flush individually. 2799 */ 2800 invltlb(); 2801 else { 2802 /* 2803 * Promotion: flush every 4KB page mapping from the TLB, 2804 * including any global (PG_G) mappings. 2805 */ 2806 invltlb_glob(); 2807 } 2808 } 2809 2810 /* 2811 * The amd64 pmap uses different approaches to TLB invalidation 2812 * depending on the kernel configuration, available hardware features, 2813 * and known hardware errata. The kernel configuration option that 2814 * has the greatest operational impact on TLB invalidation is PTI, 2815 * which is enabled automatically on affected Intel CPUs. The most 2816 * impactful hardware features are first PCID, and then INVPCID 2817 * instruction presence. PCID usage is quite different for PTI 2818 * vs. non-PTI. 2819 * 2820 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2821 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2822 * space is served by two page tables, user and kernel. The user 2823 * page table only maps user space and a kernel trampoline. The 2824 * kernel trampoline includes the entirety of the kernel text but 2825 * only the kernel data that is needed to switch from user to kernel 2826 * mode. The kernel page table maps the user and kernel address 2827 * spaces in their entirety. It is identical to the per-process 2828 * page table used in non-PTI mode. 2829 * 2830 * User page tables are only used when the CPU is in user mode. 
2831 * Consequently, some TLB invalidations can be postponed until the 2832 * switch from kernel to user mode. In contrast, the user 2833 * space part of the kernel page table is used for copyout(9), so 2834 * TLB invalidations on this page table cannot be similarly postponed. 2835 * 2836 * The existence of a user mode page table for the given pmap is 2837 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2838 * which case pm_ucr3 contains the %cr3 register value for the user 2839 * mode page table's root. 2840 * 2841 * * The pm_active bitmask indicates which CPUs currently have the 2842 * pmap active. A CPU's bit is set on context switch to the pmap, and 2843 * cleared on switching off this CPU. For the kernel page table, 2844 * the pm_active field is immutable and contains all CPUs. The 2845 * kernel page table is always logically active on every processor, 2846 * but not necessarily in use by the hardware, e.g., in PTI mode. 2847 * 2848 * When requesting invalidation of virtual addresses with 2849 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2850 * all CPUs recorded as active in pm_active. Updates to and reads 2851 * from pm_active are not synchronized, and so they may race with 2852 * each other. Shootdown handlers are prepared to handle the race. 2853 * 2854 * * PCID is an optional feature of the long mode x86 MMU where TLB 2855 * entries are tagged with the 'Process ID' of the address space 2856 * they belong to. This feature provides a limited namespace for 2857 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2858 * total. 2859 * 2860 * Allocation of a PCID to a pmap is done by an algorithm described 2861 * in section 15.12, "Other TLB Consistency Algorithms", of 2862 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2863 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2864 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when 2865 * the CPU is about to start caching TLB entries from a pmap, 2866 * i.e., on the context switch that activates the pmap on the CPU. 2867 * 2868 * The PCID allocator maintains a per-CPU, per-pmap generation 2869 * count, pm_gen, which is incremented each time a new PCID is 2870 * allocated. On TLB invalidation, the generation counters for the 2871 * pmap are zeroed, which signals the context switch code that the 2872 * previously allocated PCID is no longer valid. Effectively, 2873 * zeroing any of these counters triggers a TLB shootdown for the 2874 * given CPU/address space, due to the allocation of a new PCID. 2875 * 2876 * Zeroing can be performed remotely. Consequently, if a pmap is 2877 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2878 * be initiated by an ordinary memory access to reset the target 2879 * CPU's generation count within the pmap. The CPU initiating the 2880 * TLB shootdown does not need to send an IPI to the target CPU. 2881 * 2882 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2883 * for complete (kernel) page tables, and PCIDs for user mode page 2884 * tables. A user PCID value is obtained from the kernel PCID value 2885 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2886 * 2887 * User space page tables are activated on return to user mode, by 2888 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2889 * clearing bit 63 of the loaded ucr3, this effectively causes 2890 * complete invalidation of the user mode TLB entries for the 2891 * current pmap. 
In which case, local invalidations of individual 2892 * pages in the user page table are skipped. 2893 * 2894 * * Local invalidation, all modes. If the requested invalidation is 2895 * for a specific address or the total invalidation of a currently 2896 * active pmap, then the TLB is flushed using INVLPG for a kernel 2897 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a 2898 * user space page table(s). 2899 * 2900 * If the INVPCID instruction is available, it is used to flush entries 2901 * from the kernel page table. 2902 * 2903 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its 2904 * address space, all other 4095 PCIDs are used for user mode spaces 2905 * as described above. A context switch allocates a new PCID if 2906 * the recorded PCID is zero or the recorded generation does not match 2907 * the CPU's generation, effectively flushing the TLB for this address space. 2908 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2909 * local user page: INVLPG 2910 * local kernel page: INVLPG 2911 * local user total: INVPCID(CTX) 2912 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2913 * remote user page, inactive pmap: zero pm_gen 2914 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2915 * (Both actions are required to handle the aforementioned pm_active races.) 2916 * remote kernel page: IPI:INVLPG 2917 * remote user total, inactive pmap: zero pm_gen 2918 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2919 * reload %cr3) 2920 * (See note above about pm_active races.) 2921 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2922 * 2923 * PTI enabled, PCID present. 2924 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2925 * for upt 2926 * local kernel page: INVLPG 2927 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 2928 * on loading UCR3 into %cr3 for upt 2929 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2930 * remote user page, inactive pmap: zero pm_gen 2931 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 2932 * INVPCID(ADDR) for upt) 2933 * remote kernel page: IPI:INVLPG 2934 * remote user total, inactive pmap: zero pm_gen 2935 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 2936 * clear PCID_SAVE on loading UCR3 into $cr3 for upt) 2937 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2938 * 2939 * No PCID. 2940 * local user page: INVLPG 2941 * local kernel page: INVLPG 2942 * local user total: reload %cr3 2943 * local kernel total: invltlb_glob() 2944 * remote user page, inactive pmap: - 2945 * remote user page, active pmap: IPI:INVLPG 2946 * remote kernel page: IPI:INVLPG 2947 * remote user total, inactive pmap: - 2948 * remote user total, active pmap: IPI:(reload %cr3) 2949 * remote kernel total: IPI:invltlb_glob() 2950 * Since on return to user mode, the reload of %cr3 with ucr3 causes 2951 * TLB invalidation, no specific action is required for user page table. 2952 * 2953 * EPT. EPT pmaps do not map KVA, all mappings are userspace. 2954 * XXX TODO 2955 */ 2956 2957 #ifdef SMP 2958 /* 2959 * Interrupt the cpus that are executing in the guest context. 2960 * This will force the vcpu to exit and the cached EPT mappings 2961 * will be invalidated by the host before the next vmresume. 
2962 */ 2963 static __inline void 2964 pmap_invalidate_ept(pmap_t pmap) 2965 { 2966 smr_seq_t goal; 2967 int ipinum; 2968 2969 sched_pin(); 2970 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 2971 ("pmap_invalidate_ept: absurd pm_active")); 2972 2973 /* 2974 * The TLB mappings associated with a vcpu context are not 2975 * flushed each time a different vcpu is chosen to execute. 2976 * 2977 * This is in contrast with a process's vtop mappings that 2978 * are flushed from the TLB on each context switch. 2979 * 2980 * Therefore we need to do more than just a TLB shootdown on 2981 * the active cpus in 'pmap->pm_active'. To do this we keep 2982 * track of the number of invalidations performed on this pmap. 2983 * 2984 * Each vcpu keeps a cache of this counter and compares it 2985 * just before a vmresume. If the counter is out-of-date an 2986 * invept will be done to flush stale mappings from the TLB. 2987 * 2988 * To ensure that all vCPU threads have observed the new counter 2989 * value before returning, we use SMR. Ordering is important here: 2990 * the VMM enters an SMR read section before loading the counter 2991 * and after updating the pm_active bit set. Thus, pm_active is 2992 * a superset of active readers, and any reader that has observed 2993 * the goal has observed the new counter value. 2994 */ 2995 atomic_add_long(&pmap->pm_eptgen, 1); 2996 2997 goal = smr_advance(pmap->pm_eptsmr); 2998 2999 /* 3000 * Force the vcpu to exit and trap back into the hypervisor. 3001 */ 3002 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3003 ipi_selected(pmap->pm_active, ipinum); 3004 sched_unpin(); 3005 3006 /* 3007 * Ensure that all active vCPUs will observe the new generation counter 3008 * value before executing any more guest instructions. 3009 */ 3010 smr_wait(pmap->pm_eptsmr, goal); 3011 } 3012 3013 static cpuset_t 3014 pmap_invalidate_cpu_mask(pmap_t pmap) 3015 { 3016 return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); 3017 } 3018 3019 static inline void 3020 pmap_invalidate_preipi_pcid(pmap_t pmap) 3021 { 3022 u_int cpuid, i; 3023 3024 sched_pin(); 3025 3026 cpuid = PCPU_GET(cpuid); 3027 if (pmap != PCPU_GET(curpmap)) 3028 cpuid = 0xffffffff; /* An impossible value */ 3029 3030 CPU_FOREACH(i) { 3031 if (cpuid != i) 3032 pmap->pm_pcids[i].pm_gen = 0; 3033 } 3034 3035 /* 3036 * The fence is between stores to pm_gen and the read of the 3037 * pm_active mask. We need to ensure that it is impossible 3038 * for us to miss the bit update in pm_active and 3039 * simultaneously observe a non-zero pm_gen in 3040 * pmap_activate_sw(), otherwise TLB update is missed. 3041 * Without the fence, IA32 allows such an outcome. Note that 3042 * pm_active is updated by a locked operation, which provides 3043 * the reciprocal fence. 3044 */ 3045 atomic_thread_fence_seq_cst(); 3046 } 3047 3048 static void 3049 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3050 { 3051 sched_pin(); 3052 } 3053 3054 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3055 { 3056 return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid : 3057 pmap_invalidate_preipi_nopcid); 3058 } 3059 3060 static inline void 3061 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3062 const bool invpcid_works1) 3063 { 3064 struct invpcid_descr d; 3065 uint64_t kcr3, ucr3; 3066 uint32_t pcid; 3067 u_int cpuid; 3068 3069 /* 3070 * Because pm_pcid is recalculated on a context switch, we 3071 * must ensure there is no preemption, not just pinning. 3072 * Otherwise, we might use a stale value below. 
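* This is why a critical section, and not merely pinning, is asserted below.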
3073 */ 3074 CRITICAL_ASSERT(curthread); 3075 3076 /* 3077 * No need to do anything with user page tables invalidation 3078 * if there is no user page table, or invalidation is deferred 3079 * until the return to userspace. ucr3_load_mask is stable 3080 * because we have preemption disabled. 3081 */ 3082 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3083 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3084 return; 3085 3086 cpuid = PCPU_GET(cpuid); 3087 3088 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3089 if (invpcid_works1) { 3090 d.pcid = pcid | PMAP_PCID_USER_PT; 3091 d.pad = 0; 3092 d.addr = va; 3093 invpcid(&d, INVPCID_ADDR); 3094 } else { 3095 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3096 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3097 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3098 } 3099 } 3100 3101 static void 3102 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3103 { 3104 pmap_invalidate_page_pcid_cb(pmap, va, true); 3105 } 3106 3107 static void 3108 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3109 { 3110 pmap_invalidate_page_pcid_cb(pmap, va, false); 3111 } 3112 3113 static void 3114 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3115 { 3116 } 3117 3118 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3119 { 3120 if (pmap_pcid_enabled) 3121 return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb : 3122 pmap_invalidate_page_pcid_noinvpcid_cb); 3123 return (pmap_invalidate_page_nopcid_cb); 3124 } 3125 3126 static void 3127 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3128 vm_offset_t addr2 __unused) 3129 { 3130 if (pmap == kernel_pmap) { 3131 invlpg(va); 3132 } else if (pmap == PCPU_GET(curpmap)) { 3133 invlpg(va); 3134 pmap_invalidate_page_cb(pmap, va); 3135 } 3136 } 3137 3138 void 3139 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3140 { 3141 if (pmap_type_guest(pmap)) { 3142 pmap_invalidate_ept(pmap); 3143 return; 3144 } 3145 3146 KASSERT(pmap->pm_type == PT_X86, 3147 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3148 3149 pmap_invalidate_preipi(pmap); 3150 smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap, 3151 pmap_invalidate_page_curcpu_cb); 3152 } 3153 3154 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3155 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3156 3157 static void 3158 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3159 const bool invpcid_works1) 3160 { 3161 struct invpcid_descr d; 3162 uint64_t kcr3, ucr3; 3163 uint32_t pcid; 3164 u_int cpuid; 3165 3166 CRITICAL_ASSERT(curthread); 3167 3168 if (pmap != PCPU_GET(curpmap) || 3169 pmap->pm_ucr3 == PMAP_NO_CR3 || 3170 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3171 return; 3172 3173 cpuid = PCPU_GET(cpuid); 3174 3175 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3176 if (invpcid_works1) { 3177 d.pcid = pcid | PMAP_PCID_USER_PT; 3178 d.pad = 0; 3179 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3180 invpcid(&d, INVPCID_ADDR); 3181 } else { 3182 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3183 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3184 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3185 } 3186 } 3187 3188 static void 3189 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3190 vm_offset_t eva) 3191 { 3192 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3193 } 3194 3195 static void 3196 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3197 
vm_offset_t eva) 3198 { 3199 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3200 } 3201 3202 static void 3203 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3204 vm_offset_t eva __unused) 3205 { 3206 } 3207 3208 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3209 vm_offset_t)) 3210 { 3211 if (pmap_pcid_enabled) 3212 return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb : 3213 pmap_invalidate_range_pcid_noinvpcid_cb); 3214 return (pmap_invalidate_range_nopcid_cb); 3215 } 3216 3217 static void 3218 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3219 { 3220 vm_offset_t addr; 3221 3222 if (pmap == kernel_pmap) { 3223 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3224 invlpg(addr); 3225 } else if (pmap == PCPU_GET(curpmap)) { 3226 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3227 invlpg(addr); 3228 pmap_invalidate_range_cb(pmap, sva, eva); 3229 } 3230 } 3231 3232 void 3233 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3234 { 3235 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3236 pmap_invalidate_all(pmap); 3237 return; 3238 } 3239 3240 if (pmap_type_guest(pmap)) { 3241 pmap_invalidate_ept(pmap); 3242 return; 3243 } 3244 3245 KASSERT(pmap->pm_type == PT_X86, 3246 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3247 3248 pmap_invalidate_preipi(pmap); 3249 smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap, 3250 pmap_invalidate_range_curcpu_cb); 3251 } 3252 3253 static inline void 3254 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3255 { 3256 struct invpcid_descr d; 3257 uint64_t kcr3; 3258 uint32_t pcid; 3259 u_int cpuid; 3260 3261 if (pmap == kernel_pmap) { 3262 if (invpcid_works1) { 3263 bzero(&d, sizeof(d)); 3264 invpcid(&d, INVPCID_CTXGLOB); 3265 } else { 3266 invltlb_glob(); 3267 } 3268 } else if (pmap == PCPU_GET(curpmap)) { 3269 CRITICAL_ASSERT(curthread); 3270 cpuid = PCPU_GET(cpuid); 3271 3272 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3273 if (invpcid_works1) { 3274 d.pcid = pcid; 3275 d.pad = 0; 3276 d.addr = 0; 3277 invpcid(&d, INVPCID_CTX); 3278 } else { 3279 kcr3 = pmap->pm_cr3 | pcid; 3280 load_cr3(kcr3); 3281 } 3282 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3283 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3284 } 3285 } 3286 3287 static void 3288 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3289 { 3290 pmap_invalidate_all_pcid_cb(pmap, true); 3291 } 3292 3293 static void 3294 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3295 { 3296 pmap_invalidate_all_pcid_cb(pmap, false); 3297 } 3298 3299 static void 3300 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3301 { 3302 if (pmap == kernel_pmap) 3303 invltlb_glob(); 3304 else if (pmap == PCPU_GET(curpmap)) 3305 invltlb(); 3306 } 3307 3308 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3309 { 3310 if (pmap_pcid_enabled) 3311 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3312 pmap_invalidate_all_pcid_noinvpcid_cb); 3313 return (pmap_invalidate_all_nopcid_cb); 3314 } 3315 3316 static void 3317 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3318 vm_offset_t addr2 __unused) 3319 { 3320 pmap_invalidate_all_cb(pmap); 3321 } 3322 3323 void 3324 pmap_invalidate_all(pmap_t pmap) 3325 { 3326 if (pmap_type_guest(pmap)) { 3327 pmap_invalidate_ept(pmap); 3328 return; 3329 } 3330 3331 KASSERT(pmap->pm_type == PT_X86, 3332 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3333 3334 pmap_invalidate_preipi(pmap); 3335 smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap, 3336 pmap_invalidate_all_curcpu_cb); 3337 } 3338 3339 static void 3340 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3341 vm_offset_t addr2 __unused) 3342 { 3343 wbinvd(); 3344 } 3345 3346 void 3347 pmap_invalidate_cache(void) 3348 { 3349 sched_pin(); 3350 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3351 } 3352 3353 struct pde_action { 3354 cpuset_t invalidate; /* processors that invalidate their TLB */ 3355 pmap_t pmap; 3356 vm_offset_t va; 3357 pd_entry_t *pde; 3358 pd_entry_t newpde; 3359 u_int store; /* processor that updates the PDE */ 3360 }; 3361 3362 static void 3363 pmap_update_pde_action(void *arg) 3364 { 3365 struct pde_action *act = arg; 3366 3367 if (act->store == PCPU_GET(cpuid)) 3368 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3369 } 3370 3371 static void 3372 pmap_update_pde_teardown(void *arg) 3373 { 3374 struct pde_action *act = arg; 3375 3376 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3377 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3378 } 3379 3380 /* 3381 * Change the page size for the specified virtual address in a way that 3382 * prevents any possibility of the TLB ever having two entries that map the 3383 * same virtual address using different page sizes. This is the recommended 3384 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3385 * machine check exception for a TLB state that is improperly diagnosed as a 3386 * hardware error. 3387 */ 3388 static void 3389 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3390 { 3391 struct pde_action act; 3392 cpuset_t active, other_cpus; 3393 u_int cpuid; 3394 3395 sched_pin(); 3396 cpuid = PCPU_GET(cpuid); 3397 other_cpus = all_cpus; 3398 CPU_CLR(cpuid, &other_cpus); 3399 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3400 active = all_cpus; 3401 else { 3402 active = pmap->pm_active; 3403 } 3404 if (CPU_OVERLAP(&active, &other_cpus)) { 3405 act.store = cpuid; 3406 act.invalidate = active; 3407 act.va = va; 3408 act.pmap = pmap; 3409 act.pde = pde; 3410 act.newpde = newpde; 3411 CPU_SET(cpuid, &active); 3412 smp_rendezvous_cpus(active, 3413 smp_no_rendezvous_barrier, pmap_update_pde_action, 3414 pmap_update_pde_teardown, &act); 3415 } else { 3416 pmap_update_pde_store(pmap, pde, newpde); 3417 if (CPU_ISSET(cpuid, &active)) 3418 pmap_update_pde_invalidate(pmap, va, newpde); 3419 } 3420 sched_unpin(); 3421 } 3422 #else /* !SMP */ 3423 /* 3424 * Normal, non-SMP, invalidation functions. 
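* Only the local TLB and the single per-CPU PCID generation state (pm_pcids[0]) need to be maintained here.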
3425 */ 3426 void 3427 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3428 { 3429 struct invpcid_descr d; 3430 uint64_t kcr3, ucr3; 3431 uint32_t pcid; 3432 3433 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3434 pmap->pm_eptgen++; 3435 return; 3436 } 3437 KASSERT(pmap->pm_type == PT_X86, 3438 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3439 3440 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3441 invlpg(va); 3442 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3443 pmap->pm_ucr3 != PMAP_NO_CR3) { 3444 critical_enter(); 3445 pcid = pmap->pm_pcids[0].pm_pcid; 3446 if (invpcid_works) { 3447 d.pcid = pcid | PMAP_PCID_USER_PT; 3448 d.pad = 0; 3449 d.addr = va; 3450 invpcid(&d, INVPCID_ADDR); 3451 } else { 3452 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3453 ucr3 = pmap->pm_ucr3 | pcid | 3454 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3455 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3456 } 3457 critical_exit(); 3458 } 3459 } else if (pmap_pcid_enabled) 3460 pmap->pm_pcids[0].pm_gen = 0; 3461 } 3462 3463 void 3464 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3465 { 3466 struct invpcid_descr d; 3467 vm_offset_t addr; 3468 uint64_t kcr3, ucr3; 3469 3470 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3471 pmap->pm_eptgen++; 3472 return; 3473 } 3474 KASSERT(pmap->pm_type == PT_X86, 3475 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3476 3477 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3478 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3479 invlpg(addr); 3480 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3481 pmap->pm_ucr3 != PMAP_NO_CR3) { 3482 critical_enter(); 3483 if (invpcid_works) { 3484 d.pcid = pmap->pm_pcids[0].pm_pcid | 3485 PMAP_PCID_USER_PT; 3486 d.pad = 0; 3487 d.addr = sva; 3488 for (; d.addr < eva; d.addr += PAGE_SIZE) 3489 invpcid(&d, INVPCID_ADDR); 3490 } else { 3491 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 3492 pm_pcid | CR3_PCID_SAVE; 3493 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
3494 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3495 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3496 } 3497 critical_exit(); 3498 } 3499 } else if (pmap_pcid_enabled) { 3500 pmap->pm_pcids[0].pm_gen = 0; 3501 } 3502 } 3503 3504 void 3505 pmap_invalidate_all(pmap_t pmap) 3506 { 3507 struct invpcid_descr d; 3508 uint64_t kcr3, ucr3; 3509 3510 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3511 pmap->pm_eptgen++; 3512 return; 3513 } 3514 KASSERT(pmap->pm_type == PT_X86, 3515 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3516 3517 if (pmap == kernel_pmap) { 3518 if (pmap_pcid_enabled && invpcid_works) { 3519 bzero(&d, sizeof(d)); 3520 invpcid(&d, INVPCID_CTXGLOB); 3521 } else { 3522 invltlb_glob(); 3523 } 3524 } else if (pmap == PCPU_GET(curpmap)) { 3525 if (pmap_pcid_enabled) { 3526 critical_enter(); 3527 if (invpcid_works) { 3528 d.pcid = pmap->pm_pcids[0].pm_pcid; 3529 d.pad = 0; 3530 d.addr = 0; 3531 invpcid(&d, INVPCID_CTX); 3532 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3533 d.pcid |= PMAP_PCID_USER_PT; 3534 invpcid(&d, INVPCID_CTX); 3535 } 3536 } else { 3537 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 3538 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3539 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 3540 0].pm_pcid | PMAP_PCID_USER_PT; 3541 pmap_pti_pcid_invalidate(ucr3, kcr3); 3542 } else 3543 load_cr3(kcr3); 3544 } 3545 critical_exit(); 3546 } else { 3547 invltlb(); 3548 } 3549 } else if (pmap_pcid_enabled) { 3550 pmap->pm_pcids[0].pm_gen = 0; 3551 } 3552 } 3553 3554 PMAP_INLINE void 3555 pmap_invalidate_cache(void) 3556 { 3557 3558 wbinvd(); 3559 } 3560 3561 static void 3562 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3563 { 3564 3565 pmap_update_pde_store(pmap, pde, newpde); 3566 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3567 pmap_update_pde_invalidate(pmap, va, newpde); 3568 else 3569 pmap->pm_pcids[0].pm_gen = 0; 3570 } 3571 #endif /* !SMP */ 3572 3573 static void 3574 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3575 { 3576 3577 /* 3578 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3579 * by a promotion that did not invalidate the 512 4KB page mappings 3580 * that might exist in the TLB. Consequently, at this point, the TLB 3581 * may hold both 4KB and 2MB page mappings for the address range [va, 3582 * va + NBPDR). Therefore, the entire range must be invalidated here. 3583 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3584 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3585 * single INVLPG suffices to invalidate the 2MB page mapping from the 3586 * TLB. 
3587 */ 3588 if ((pde & PG_PROMOTED) != 0) 3589 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3590 else 3591 pmap_invalidate_page(pmap, va); 3592 } 3593 3594 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3595 (vm_offset_t sva, vm_offset_t eva)) 3596 { 3597 3598 if ((cpu_feature & CPUID_SS) != 0) 3599 return (pmap_invalidate_cache_range_selfsnoop); 3600 if ((cpu_feature & CPUID_CLFSH) != 0) 3601 return (pmap_force_invalidate_cache_range); 3602 return (pmap_invalidate_cache_range_all); 3603 } 3604 3605 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3606 3607 static void 3608 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3609 { 3610 3611 KASSERT((sva & PAGE_MASK) == 0, 3612 ("pmap_invalidate_cache_range: sva not page-aligned")); 3613 KASSERT((eva & PAGE_MASK) == 0, 3614 ("pmap_invalidate_cache_range: eva not page-aligned")); 3615 } 3616 3617 static void 3618 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3619 { 3620 3621 pmap_invalidate_cache_range_check_align(sva, eva); 3622 } 3623 3624 void 3625 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3626 { 3627 3628 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3629 3630 /* 3631 * XXX: Some CPUs fault, hang, or trash the local APIC 3632 * registers if we use CLFLUSH on the local APIC range. The 3633 * local APIC is always uncached, so we don't need to flush 3634 * for that range anyway. 3635 */ 3636 if (pmap_kextract(sva) == lapic_paddr) 3637 return; 3638 3639 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3640 /* 3641 * Do per-cache line flush. Use a locked 3642 * instruction to insure that previous stores are 3643 * included in the write-back. The processor 3644 * propagates flush to other processors in the cache 3645 * coherence domain. 3646 */ 3647 atomic_thread_fence_seq_cst(); 3648 for (; sva < eva; sva += cpu_clflush_line_size) 3649 clflushopt(sva); 3650 atomic_thread_fence_seq_cst(); 3651 } else { 3652 /* 3653 * Writes are ordered by CLFLUSH on Intel CPUs. 3654 */ 3655 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3656 mfence(); 3657 for (; sva < eva; sva += cpu_clflush_line_size) 3658 clflush(sva); 3659 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3660 mfence(); 3661 } 3662 } 3663 3664 static void 3665 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3666 { 3667 3668 pmap_invalidate_cache_range_check_align(sva, eva); 3669 pmap_invalidate_cache(); 3670 } 3671 3672 /* 3673 * Remove the specified set of pages from the data and instruction caches. 3674 * 3675 * In contrast to pmap_invalidate_cache_range(), this function does not 3676 * rely on the CPU's self-snoop feature, because it is intended for use 3677 * when moving pages into a different cache domain. 
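 * As a rough illustration of the heuristic in the body below: with
 * PMAP_CLFLUSH_THRESHOLD defined above as 2 MB and 4 KB pages, any request
 * covering 512 or more pages falls back to a full pmap_invalidate_cache()
 * (WBINVD), on the assumption that flushing that much memory one cache
 * line at a time would be slower.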
3678 */ 3679 void 3680 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3681 { 3682 vm_offset_t daddr, eva; 3683 int i; 3684 bool useclflushopt; 3685 3686 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3687 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3688 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3689 pmap_invalidate_cache(); 3690 else { 3691 if (useclflushopt) 3692 atomic_thread_fence_seq_cst(); 3693 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3694 mfence(); 3695 for (i = 0; i < count; i++) { 3696 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3697 eva = daddr + PAGE_SIZE; 3698 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3699 if (useclflushopt) 3700 clflushopt(daddr); 3701 else 3702 clflush(daddr); 3703 } 3704 } 3705 if (useclflushopt) 3706 atomic_thread_fence_seq_cst(); 3707 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3708 mfence(); 3709 } 3710 } 3711 3712 void 3713 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3714 { 3715 3716 pmap_invalidate_cache_range_check_align(sva, eva); 3717 3718 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3719 pmap_force_invalidate_cache_range(sva, eva); 3720 return; 3721 } 3722 3723 /* See comment in pmap_force_invalidate_cache_range(). */ 3724 if (pmap_kextract(sva) == lapic_paddr) 3725 return; 3726 3727 atomic_thread_fence_seq_cst(); 3728 for (; sva < eva; sva += cpu_clflush_line_size) 3729 clwb(sva); 3730 atomic_thread_fence_seq_cst(); 3731 } 3732 3733 void 3734 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3735 { 3736 pt_entry_t *pte; 3737 vm_offset_t vaddr; 3738 int error, pte_bits; 3739 3740 KASSERT((spa & PAGE_MASK) == 0, 3741 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3742 KASSERT((epa & PAGE_MASK) == 0, 3743 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3744 3745 if (spa < dmaplimit) { 3746 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3747 dmaplimit, epa))); 3748 if (dmaplimit >= epa) 3749 return; 3750 spa = dmaplimit; 3751 } 3752 3753 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 3754 X86_PG_V; 3755 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3756 &vaddr); 3757 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3758 pte = vtopte(vaddr); 3759 for (; spa < epa; spa += PAGE_SIZE) { 3760 sched_pin(); 3761 pte_store(pte, spa | pte_bits); 3762 invlpg(vaddr); 3763 /* XXXKIB atomic inside flush_cache_range are excessive */ 3764 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3765 sched_unpin(); 3766 } 3767 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3768 } 3769 3770 /* 3771 * Routine: pmap_extract 3772 * Function: 3773 * Extract the physical page address associated 3774 * with the given map/virtual_address pair. 
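 *
 *	The walk below handles 1 GB (PDPE), 2 MB (PDE), and 4 KB (PTE)
 *	mappings and returns 0 when no valid mapping exists, so a
 *	hypothetical caller can check for failure with, e.g.:
 *
 *		pa = pmap_extract(kernel_pmap, va);
 *		if (pa == 0)
 *			panic("va %#lx is not mapped", va);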
3775 */ 3776 vm_paddr_t 3777 pmap_extract(pmap_t pmap, vm_offset_t va) 3778 { 3779 pdp_entry_t *pdpe; 3780 pd_entry_t *pde; 3781 pt_entry_t *pte, PG_V; 3782 vm_paddr_t pa; 3783 3784 pa = 0; 3785 PG_V = pmap_valid_bit(pmap); 3786 PMAP_LOCK(pmap); 3787 pdpe = pmap_pdpe(pmap, va); 3788 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3789 if ((*pdpe & PG_PS) != 0) 3790 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3791 else { 3792 pde = pmap_pdpe_to_pde(pdpe, va); 3793 if ((*pde & PG_V) != 0) { 3794 if ((*pde & PG_PS) != 0) { 3795 pa = (*pde & PG_PS_FRAME) | 3796 (va & PDRMASK); 3797 } else { 3798 pte = pmap_pde_to_pte(pde, va); 3799 pa = (*pte & PG_FRAME) | 3800 (va & PAGE_MASK); 3801 } 3802 } 3803 } 3804 } 3805 PMAP_UNLOCK(pmap); 3806 return (pa); 3807 } 3808 3809 /* 3810 * Routine: pmap_extract_and_hold 3811 * Function: 3812 * Atomically extract and hold the physical page 3813 * with the given pmap and virtual address pair 3814 * if that mapping permits the given protection. 3815 */ 3816 vm_page_t 3817 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3818 { 3819 pdp_entry_t pdpe, *pdpep; 3820 pd_entry_t pde, *pdep; 3821 pt_entry_t pte, PG_RW, PG_V; 3822 vm_page_t m; 3823 3824 m = NULL; 3825 PG_RW = pmap_rw_bit(pmap); 3826 PG_V = pmap_valid_bit(pmap); 3827 PMAP_LOCK(pmap); 3828 3829 pdpep = pmap_pdpe(pmap, va); 3830 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3831 goto out; 3832 if ((pdpe & PG_PS) != 0) { 3833 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3834 goto out; 3835 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3836 goto check_page; 3837 } 3838 3839 pdep = pmap_pdpe_to_pde(pdpep, va); 3840 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3841 goto out; 3842 if ((pde & PG_PS) != 0) { 3843 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3844 goto out; 3845 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3846 goto check_page; 3847 } 3848 3849 pte = *pmap_pde_to_pte(pdep, va); 3850 if ((pte & PG_V) == 0 || 3851 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3852 goto out; 3853 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3854 3855 check_page: 3856 if (m != NULL && !vm_page_wire_mapped(m)) 3857 m = NULL; 3858 out: 3859 PMAP_UNLOCK(pmap); 3860 return (m); 3861 } 3862 3863 vm_paddr_t 3864 pmap_kextract(vm_offset_t va) 3865 { 3866 pd_entry_t pde; 3867 vm_paddr_t pa; 3868 3869 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3870 pa = DMAP_TO_PHYS(va); 3871 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3872 pa = pmap_large_map_kextract(va); 3873 } else { 3874 pde = *vtopde(va); 3875 if (pde & PG_PS) { 3876 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3877 } else { 3878 /* 3879 * Beware of a concurrent promotion that changes the 3880 * PDE at this point! For example, vtopte() must not 3881 * be used to access the PTE because it would use the 3882 * new PDE. It is, however, safe to use the old PDE 3883 * because the page table page is preserved by the 3884 * promotion. 3885 */ 3886 pa = *pmap_pde_to_pte(&pde, va); 3887 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3888 } 3889 } 3890 return (pa); 3891 } 3892 3893 /*************************************************** 3894 * Low level mapping routines..... 3895 ***************************************************/ 3896 3897 /* 3898 * Add a wired page to the kva. 3899 * Note: not SMP coherent. 
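 * Because no TLB shootdown is performed here, a caller that overwrites an
 * existing kernel mapping and needs the change visible on other CPUs must
 * invalidate explicitly; an illustrative (not prescriptive) pattern is:
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);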
3900 */ 3901 PMAP_INLINE void 3902 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3903 { 3904 pt_entry_t *pte; 3905 3906 pte = vtopte(va); 3907 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx); 3908 } 3909 3910 static __inline void 3911 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3912 { 3913 pt_entry_t *pte; 3914 int cache_bits; 3915 3916 pte = vtopte(va); 3917 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 3918 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits); 3919 } 3920 3921 /* 3922 * Remove a page from the kernel pagetables. 3923 * Note: not SMP coherent. 3924 */ 3925 PMAP_INLINE void 3926 pmap_kremove(vm_offset_t va) 3927 { 3928 pt_entry_t *pte; 3929 3930 pte = vtopte(va); 3931 pte_clear(pte); 3932 } 3933 3934 /* 3935 * Used to map a range of physical addresses into kernel 3936 * virtual address space. 3937 * 3938 * The value passed in '*virt' is a suggested virtual address for 3939 * the mapping. Architectures which can support a direct-mapped 3940 * physical to virtual region can return the appropriate address 3941 * within that region, leaving '*virt' unchanged. Other 3942 * architectures should map the pages starting at '*virt' and 3943 * update '*virt' with the first usable address after the mapped 3944 * region. 3945 */ 3946 vm_offset_t 3947 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 3948 { 3949 return PHYS_TO_DMAP(start); 3950 } 3951 3952 /* 3953 * Add a list of wired pages to the kva 3954 * this routine is only used for temporary 3955 * kernel mappings that do not need to have 3956 * page modification or references recorded. 3957 * Note that old mappings are simply written 3958 * over. The page *must* be wired. 3959 * Note: SMP coherent. Uses a ranged shootdown IPI. 3960 */ 3961 void 3962 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 3963 { 3964 pt_entry_t *endpte, oldpte, pa, *pte; 3965 vm_page_t m; 3966 int cache_bits; 3967 3968 oldpte = 0; 3969 pte = vtopte(sva); 3970 endpte = pte + count; 3971 while (pte < endpte) { 3972 m = *ma++; 3973 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 3974 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 3975 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 3976 oldpte |= *pte; 3977 pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); 3978 } 3979 pte++; 3980 } 3981 if (__predict_false((oldpte & X86_PG_V) != 0)) 3982 pmap_invalidate_range(kernel_pmap, sva, sva + count * 3983 PAGE_SIZE); 3984 } 3985 3986 /* 3987 * This routine tears out page mappings from the 3988 * kernel -- it is meant only for temporary mappings. 3989 * Note: SMP coherent. Uses a ranged shootdown IPI. 3990 */ 3991 void 3992 pmap_qremove(vm_offset_t sva, int count) 3993 { 3994 vm_offset_t va; 3995 3996 va = sva; 3997 while (count-- > 0) { 3998 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 3999 pmap_kremove(va); 4000 va += PAGE_SIZE; 4001 } 4002 pmap_invalidate_range(kernel_pmap, sva, va); 4003 } 4004 4005 /*************************************************** 4006 * Page table page management routines..... 4007 ***************************************************/ 4008 /* 4009 * Schedule the specified unused page table page to be freed. Specifically, 4010 * add the page to the specified list of pages that will be released to the 4011 * physical memory manager after the TLB has been updated. 
4012 */ 4013 static __inline void 4014 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4015 boolean_t set_PG_ZERO) 4016 { 4017 4018 if (set_PG_ZERO) 4019 m->flags |= PG_ZERO; 4020 else 4021 m->flags &= ~PG_ZERO; 4022 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4023 } 4024 4025 /* 4026 * Inserts the specified page table page into the specified pmap's collection 4027 * of idle page table pages. Each of a pmap's page table pages is responsible 4028 * for mapping a distinct range of virtual addresses. The pmap's collection is 4029 * ordered by this virtual address range. 4030 * 4031 * If "promoted" is false, then the page table page "mpte" must be zero filled. 4032 */ 4033 static __inline int 4034 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 4035 { 4036 4037 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4038 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 4039 return (vm_radix_insert(&pmap->pm_root, mpte)); 4040 } 4041 4042 /* 4043 * Removes the page table page mapping the specified virtual address from the 4044 * specified pmap's collection of idle page table pages, and returns it. 4045 * Otherwise, returns NULL if there is no page table page corresponding to the 4046 * specified virtual address. 4047 */ 4048 static __inline vm_page_t 4049 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4050 { 4051 4052 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4053 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4054 } 4055 4056 /* 4057 * Decrements a page table page's reference count, which is used to record the 4058 * number of valid page table entries within the page. If the reference count 4059 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4060 * page table page was unmapped and FALSE otherwise. 4061 */ 4062 static inline boolean_t 4063 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4064 { 4065 4066 --m->ref_count; 4067 if (m->ref_count == 0) { 4068 _pmap_unwire_ptp(pmap, va, m, free); 4069 return (TRUE); 4070 } else 4071 return (FALSE); 4072 } 4073 4074 static void 4075 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4076 { 4077 pml5_entry_t *pml5; 4078 pml4_entry_t *pml4; 4079 pdp_entry_t *pdp; 4080 pd_entry_t *pd; 4081 vm_page_t pdpg, pdppg, pml4pg; 4082 4083 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4084 4085 /* 4086 * unmap the page table page 4087 */ 4088 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4089 /* PML4 page */ 4090 MPASS(pmap_is_la57(pmap)); 4091 pml5 = pmap_pml5e(pmap, va); 4092 *pml5 = 0; 4093 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4094 pml5 = pmap_pml5e_u(pmap, va); 4095 *pml5 = 0; 4096 } 4097 } else if (m->pindex >= NUPDE + NUPDPE) { 4098 /* PDP page */ 4099 pml4 = pmap_pml4e(pmap, va); 4100 *pml4 = 0; 4101 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4102 va <= VM_MAXUSER_ADDRESS) { 4103 pml4 = pmap_pml4e_u(pmap, va); 4104 *pml4 = 0; 4105 } 4106 } else if (m->pindex >= NUPDE) { 4107 /* PD page */ 4108 pdp = pmap_pdpe(pmap, va); 4109 *pdp = 0; 4110 } else { 4111 /* PTE page */ 4112 pd = pmap_pde(pmap, va); 4113 *pd = 0; 4114 } 4115 if (m->pindex < NUPDE) { 4116 /* We just released a PT, unhold the matching PD */ 4117 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4118 pmap_unwire_ptp(pmap, va, pdpg, free); 4119 } else if (m->pindex < NUPDE + NUPDPE) { 4120 /* We just released a PD, unhold the matching PDP */ 4121 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4122 pmap_unwire_ptp(pmap, va, pdppg, free); 4123 } else if 
(m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4124 /* We just released a PDP, unhold the matching PML4 */ 4125 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4126 pmap_unwire_ptp(pmap, va, pml4pg, free); 4127 } 4128 4129 pmap_pt_page_count_adj(pmap, -1); 4130 4131 /* 4132 * Put page on a list so that it is released after 4133 * *ALL* TLB shootdown is done 4134 */ 4135 pmap_add_delayed_free_list(m, free, TRUE); 4136 } 4137 4138 /* 4139 * After removing a page table entry, this routine is used to 4140 * conditionally free the page, and manage the reference count. 4141 */ 4142 static int 4143 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4144 struct spglist *free) 4145 { 4146 vm_page_t mpte; 4147 4148 if (va >= VM_MAXUSER_ADDRESS) 4149 return (0); 4150 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4151 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4152 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4153 } 4154 4155 /* 4156 * Release a page table page reference after a failed attempt to create a 4157 * mapping. 4158 */ 4159 static void 4160 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4161 { 4162 struct spglist free; 4163 4164 SLIST_INIT(&free); 4165 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4166 /* 4167 * Although "va" was never mapped, paging-structure caches 4168 * could nonetheless have entries that refer to the freed 4169 * page table pages. Invalidate those entries. 4170 */ 4171 pmap_invalidate_page(pmap, va); 4172 vm_page_free_pages_toq(&free, true); 4173 } 4174 } 4175 4176 void 4177 pmap_pinit0(pmap_t pmap) 4178 { 4179 struct proc *p; 4180 struct thread *td; 4181 int i; 4182 4183 PMAP_LOCK_INIT(pmap); 4184 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4185 pmap->pm_pmltopu = NULL; 4186 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4187 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4188 pmap->pm_ucr3 = PMAP_NO_CR3; 4189 vm_radix_init(&pmap->pm_root); 4190 CPU_ZERO(&pmap->pm_active); 4191 TAILQ_INIT(&pmap->pm_pvchunk); 4192 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4193 pmap->pm_flags = pmap_flags; 4194 CPU_FOREACH(i) { 4195 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 4196 pmap->pm_pcids[i].pm_gen = 1; 4197 } 4198 pmap_activate_boot(pmap); 4199 td = curthread; 4200 if (pti) { 4201 p = td->td_proc; 4202 PROC_LOCK(p); 4203 p->p_md.md_flags |= P_MD_KPTI; 4204 PROC_UNLOCK(p); 4205 } 4206 pmap_thread_init_invl_gen(td); 4207 4208 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4209 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4210 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4211 UMA_ALIGN_PTR, 0); 4212 } 4213 } 4214 4215 void 4216 pmap_pinit_pml4(vm_page_t pml4pg) 4217 { 4218 pml4_entry_t *pm_pml4; 4219 int i; 4220 4221 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4222 4223 /* Wire in kernel global address entries. 
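 * The loops below make the new top-level table share the kernel's PDP
 * pages (KPDPphys, the KASAN/KMSAN shadow maps when compiled in, the
 * direct map, and any large map slots), so kernel addresses translate
 * identically through every pmap; only the self-referential slot and the
 * user entries differ per pmap.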
*/ 4224 for (i = 0; i < NKPML4E; i++) { 4225 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4226 X86_PG_V; 4227 } 4228 #ifdef KASAN 4229 for (i = 0; i < NKASANPML4E; i++) { 4230 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4231 X86_PG_V | pg_nx; 4232 } 4233 #endif 4234 #ifdef KMSAN 4235 for (i = 0; i < NKMSANSHADPML4E; i++) { 4236 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4237 X86_PG_RW | X86_PG_V | pg_nx; 4238 } 4239 for (i = 0; i < NKMSANORIGPML4E; i++) { 4240 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4241 X86_PG_RW | X86_PG_V | pg_nx; 4242 } 4243 #endif 4244 for (i = 0; i < ndmpdpphys; i++) { 4245 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4246 X86_PG_V; 4247 } 4248 4249 /* install self-referential address mapping entry(s) */ 4250 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4251 X86_PG_A | X86_PG_M; 4252 4253 /* install large map entries if configured */ 4254 for (i = 0; i < lm_ents; i++) 4255 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4256 } 4257 4258 void 4259 pmap_pinit_pml5(vm_page_t pml5pg) 4260 { 4261 pml5_entry_t *pm_pml5; 4262 4263 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4264 4265 /* 4266 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4267 * entering all existing kernel mappings into level 5 table. 4268 */ 4269 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4270 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4271 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4272 4273 /* 4274 * Install self-referential address mapping entry. 4275 */ 4276 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4277 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4278 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4279 } 4280 4281 static void 4282 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4283 { 4284 pml4_entry_t *pm_pml4u; 4285 int i; 4286 4287 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4288 for (i = 0; i < NPML4EPG; i++) 4289 pm_pml4u[i] = pti_pml4[i]; 4290 } 4291 4292 static void 4293 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4294 { 4295 pml5_entry_t *pm_pml5u; 4296 4297 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4298 pagezero(pm_pml5u); 4299 4300 /* 4301 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4302 * table, entering all kernel mappings needed for usermode 4303 * into level 5 table. 4304 */ 4305 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4306 pmap_kextract((vm_offset_t)pti_pml4) | 4307 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4308 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4309 } 4310 4311 /* Allocate a page table page and do related bookkeeping */ 4312 static vm_page_t 4313 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4314 { 4315 vm_page_t m; 4316 4317 m = vm_page_alloc_noobj(flags); 4318 if (__predict_false(m == NULL)) 4319 return (NULL); 4320 m->pindex = pindex; 4321 pmap_pt_page_count_adj(pmap, 1); 4322 return (m); 4323 } 4324 4325 static void 4326 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4327 { 4328 /* 4329 * This function assumes the page will need to be unwired, 4330 * even though the counterpart allocation in pmap_alloc_pt_page() 4331 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4332 * of pmap_free_pt_page() require unwiring. 
The case in which 4333 * a PT page doesn't require unwiring because its ref_count has 4334 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4335 */ 4336 vm_page_unwire_noq(m); 4337 if (zerofilled) 4338 vm_page_free_zero(m); 4339 else 4340 vm_page_free(m); 4341 4342 pmap_pt_page_count_adj(pmap, -1); 4343 } 4344 4345 /* 4346 * Initialize a preallocated and zeroed pmap structure, 4347 * such as one in a vmspace structure. 4348 */ 4349 int 4350 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4351 { 4352 vm_page_t pmltop_pg, pmltop_pgu; 4353 vm_paddr_t pmltop_phys; 4354 int i; 4355 4356 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4357 4358 /* 4359 * Allocate the page directory page. Pass NULL instead of a 4360 * pointer to the pmap here to avoid calling 4361 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4362 * since that requires pmap lock. Instead do the accounting 4363 * manually. 4364 * 4365 * Note that final call to pmap_remove() optimization that 4366 * checks for zero resident_count is basically disabled by 4367 * accounting for top-level page. But the optimization was 4368 * not effective since we started using non-managed mapping of 4369 * the shared page. 4370 */ 4371 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4372 VM_ALLOC_WAITOK); 4373 pmap_pt_page_count_pinit(pmap, 1); 4374 4375 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4376 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4377 4378 CPU_FOREACH(i) { 4379 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 4380 pmap->pm_pcids[i].pm_gen = 0; 4381 } 4382 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4383 pmap->pm_ucr3 = PMAP_NO_CR3; 4384 pmap->pm_pmltopu = NULL; 4385 4386 pmap->pm_type = pm_type; 4387 4388 /* 4389 * Do not install the host kernel mappings in the nested page 4390 * tables. These mappings are meaningless in the guest physical 4391 * address space. 4392 * Install minimal kernel mappings in PTI case. 4393 */ 4394 switch (pm_type) { 4395 case PT_X86: 4396 pmap->pm_cr3 = pmltop_phys; 4397 if (pmap_is_la57(pmap)) 4398 pmap_pinit_pml5(pmltop_pg); 4399 else 4400 pmap_pinit_pml4(pmltop_pg); 4401 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4402 /* 4403 * As with pmltop_pg, pass NULL instead of a 4404 * pointer to the pmap to ensure that the PTI 4405 * page counted explicitly. 
4406 */ 4407 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4408 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4409 pmap_pt_page_count_pinit(pmap, 1); 4410 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4411 VM_PAGE_TO_PHYS(pmltop_pgu)); 4412 if (pmap_is_la57(pmap)) 4413 pmap_pinit_pml5_pti(pmltop_pgu); 4414 else 4415 pmap_pinit_pml4_pti(pmltop_pgu); 4416 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4417 } 4418 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4419 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4420 pkru_free_range, pmap, M_NOWAIT); 4421 } 4422 break; 4423 case PT_EPT: 4424 case PT_RVI: 4425 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4426 break; 4427 } 4428 4429 vm_radix_init(&pmap->pm_root); 4430 CPU_ZERO(&pmap->pm_active); 4431 TAILQ_INIT(&pmap->pm_pvchunk); 4432 pmap->pm_flags = flags; 4433 pmap->pm_eptgen = 0; 4434 4435 return (1); 4436 } 4437 4438 int 4439 pmap_pinit(pmap_t pmap) 4440 { 4441 4442 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4443 } 4444 4445 static void 4446 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4447 { 4448 vm_page_t mpg; 4449 struct spglist free; 4450 4451 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4452 if (mpg->ref_count != 0) 4453 return; 4454 SLIST_INIT(&free); 4455 _pmap_unwire_ptp(pmap, va, mpg, &free); 4456 pmap_invalidate_page(pmap, va); 4457 vm_page_free_pages_toq(&free, true); 4458 } 4459 4460 static pml4_entry_t * 4461 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4462 bool addref) 4463 { 4464 vm_pindex_t pml5index; 4465 pml5_entry_t *pml5; 4466 pml4_entry_t *pml4; 4467 vm_page_t pml4pg; 4468 pt_entry_t PG_V; 4469 bool allocated; 4470 4471 if (!pmap_is_la57(pmap)) 4472 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4473 4474 PG_V = pmap_valid_bit(pmap); 4475 pml5index = pmap_pml5e_index(va); 4476 pml5 = &pmap->pm_pmltop[pml5index]; 4477 if ((*pml5 & PG_V) == 0) { 4478 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4479 va) == NULL) 4480 return (NULL); 4481 allocated = true; 4482 } else { 4483 allocated = false; 4484 } 4485 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4486 pml4 = &pml4[pmap_pml4e_index(va)]; 4487 if ((*pml4 & PG_V) == 0) { 4488 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4489 if (allocated && !addref) 4490 pml4pg->ref_count--; 4491 else if (!allocated && addref) 4492 pml4pg->ref_count++; 4493 } 4494 return (pml4); 4495 } 4496 4497 static pdp_entry_t * 4498 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4499 bool addref) 4500 { 4501 vm_page_t pdppg; 4502 pml4_entry_t *pml4; 4503 pdp_entry_t *pdp; 4504 pt_entry_t PG_V; 4505 bool allocated; 4506 4507 PG_V = pmap_valid_bit(pmap); 4508 4509 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); 4510 if (pml4 == NULL) 4511 return (NULL); 4512 4513 if ((*pml4 & PG_V) == 0) { 4514 /* Have to allocate a new pdp, recurse */ 4515 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, 4516 va) == NULL) { 4517 if (pmap_is_la57(pmap)) 4518 pmap_allocpte_free_unref(pmap, va, 4519 pmap_pml5e(pmap, va)); 4520 return (NULL); 4521 } 4522 allocated = true; 4523 } else { 4524 allocated = false; 4525 } 4526 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 4527 pdp = &pdp[pmap_pdpe_index(va)]; 4528 if ((*pdp & PG_V) == 0) { 4529 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 4530 if (allocated && !addref) 4531 pdppg->ref_count--; 4532 else if (!allocated && addref) 4533 pdppg->ref_count++; 4534 } 4535 return (pdp); 4536 } 4537 4538 /* 4539 * The ptepindexes, i.e. 
page indices, of the page table pages encountered 4540 * while translating virtual address va are defined as follows: 4541 * - for the page table page (last level), 4542 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, 4543 * in other words, it is just the index of the PDE that maps the page 4544 * table page. 4545 * - for the page directory page, 4546 * ptepindex = NUPDE (number of userland PD entries) + 4547 * (pmap_pde_index(va) >> NPDEPGSHIFT), 4548 * i.e. index of PDPE is put after the last index of PDE, 4549 * - for the page directory pointer page, 4550 * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + 4551 * NPML4EPGSHIFT)), 4552 * i.e. index of pml4e is put after the last index of PDPE, 4553 * - for the PML4 page (if LA57 mode is enabled), 4554 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> 4555 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)), 4556 * i.e. index of pml5e is put after the last index of PML4E. 4557 * 4558 * Define an order on the paging entries, where all entries of the 4559 * same height are put together, then heights are put from deepest to 4560 * root. Then ptepindex is the sequential number of the 4561 * corresponding paging entry in this order. 4562 * 4563 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of 4564 * LA57 paging structures even in LA48 paging mode. Moreover, the 4565 * ptepindexes are calculated as if the paging structures were 5-level 4566 * regardless of the actual mode of operation. 4567 * 4568 * The root page at PML4/PML5 does not participate in this indexing scheme, 4569 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). 4570 */ 4571 static vm_page_t 4572 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4573 vm_offset_t va) 4574 { 4575 vm_pindex_t pml5index, pml4index; 4576 pml5_entry_t *pml5, *pml5u; 4577 pml4_entry_t *pml4, *pml4u; 4578 pdp_entry_t *pdp; 4579 pd_entry_t *pd; 4580 vm_page_t m, pdpg; 4581 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4582 4583 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4584 4585 PG_A = pmap_accessed_bit(pmap); 4586 PG_M = pmap_modified_bit(pmap); 4587 PG_V = pmap_valid_bit(pmap); 4588 PG_RW = pmap_rw_bit(pmap); 4589 4590 /* 4591 * Allocate a page table page. 4592 */ 4593 m = pmap_alloc_pt_page(pmap, ptepindex, 4594 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4595 if (m == NULL) 4596 return (NULL); 4597 4598 /* 4599 * Map the pagetable page into the process address space, if 4600 * it isn't already there.
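 * The branches below pick the level straight from the ptepindex ranges
 * described above: [NUPDE + NUPDPE + NUPML4E, ...) installs a new PML4
 * page (LA57 only), [NUPDE + NUPDPE, NUPDE + NUPDPE + NUPML4E) a new PDP
 * page, [NUPDE, NUPDE + NUPDPE) a new PD page, and [0, NUPDE) a new page
 * table page.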
4601 */ 4602 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4603 MPASS(pmap_is_la57(pmap)); 4604 4605 pml5index = pmap_pml5e_index(va); 4606 pml5 = &pmap->pm_pmltop[pml5index]; 4607 KASSERT((*pml5 & PG_V) == 0, 4608 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4609 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4610 4611 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4612 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4613 *pml5 |= pg_nx; 4614 4615 pml5u = &pmap->pm_pmltopu[pml5index]; 4616 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4617 PG_A | PG_M; 4618 } 4619 } else if (ptepindex >= NUPDE + NUPDPE) { 4620 pml4index = pmap_pml4e_index(va); 4621 /* Wire up a new PDPE page */ 4622 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4623 if (pml4 == NULL) { 4624 pmap_free_pt_page(pmap, m, true); 4625 return (NULL); 4626 } 4627 KASSERT((*pml4 & PG_V) == 0, 4628 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4629 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4630 4631 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4632 pml4index < NUPML4E) { 4633 /* 4634 * PTI: Make all user-space mappings in the 4635 * kernel-mode page table no-execute so that 4636 * we detect any programming errors that leave 4637 * the kernel-mode page table active on return 4638 * to user space. 4639 */ 4640 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4641 *pml4 |= pg_nx; 4642 4643 pml4u = &pmap->pm_pmltopu[pml4index]; 4644 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4645 PG_A | PG_M; 4646 } 4647 } else if (ptepindex >= NUPDE) { 4648 /* Wire up a new PDE page */ 4649 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4650 if (pdp == NULL) { 4651 pmap_free_pt_page(pmap, m, true); 4652 return (NULL); 4653 } 4654 KASSERT((*pdp & PG_V) == 0, 4655 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4656 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4657 } else { 4658 /* Wire up a new PTE page */ 4659 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4660 if (pdp == NULL) { 4661 pmap_free_pt_page(pmap, m, true); 4662 return (NULL); 4663 } 4664 if ((*pdp & PG_V) == 0) { 4665 /* Have to allocate a new pd, recurse */ 4666 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4667 lockp, va) == NULL) { 4668 pmap_allocpte_free_unref(pmap, va, 4669 pmap_pml4e(pmap, va)); 4670 pmap_free_pt_page(pmap, m, true); 4671 return (NULL); 4672 } 4673 } else { 4674 /* Add reference to the pd page */ 4675 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4676 pdpg->ref_count++; 4677 } 4678 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4679 4680 /* Now we know where the page directory page is */ 4681 pd = &pd[pmap_pde_index(va)]; 4682 KASSERT((*pd & PG_V) == 0, 4683 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4684 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4685 } 4686 4687 return (m); 4688 } 4689 4690 /* 4691 * This routine is called if the desired page table page does not exist. 4692 * 4693 * If page table page allocation fails, this routine may sleep before 4694 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4695 * occurs right before returning to the caller. This way, we never 4696 * drop pmap lock to sleep while a page table page has ref_count == 0, 4697 * which prevents the page from being freed under us. 
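 * Because the pmap lock is dropped and reacquired around the sleep,
 * callers that pass a lock pointer must revalidate their state and retry,
 * as pmap_allocpte() and pmap_alloc_pde() below do; in sketch form:
 *
 *	m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
 *	if (m == NULL && lockp != NULL)
 *		goto retry;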
4698 */ 4699 static vm_page_t 4700 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4701 vm_offset_t va) 4702 { 4703 vm_page_t m; 4704 4705 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4706 if (m == NULL && lockp != NULL) { 4707 RELEASE_PV_LIST_LOCK(lockp); 4708 PMAP_UNLOCK(pmap); 4709 PMAP_ASSERT_NOT_IN_DI(); 4710 vm_wait(NULL); 4711 PMAP_LOCK(pmap); 4712 } 4713 return (m); 4714 } 4715 4716 static pd_entry_t * 4717 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4718 struct rwlock **lockp) 4719 { 4720 pdp_entry_t *pdpe, PG_V; 4721 pd_entry_t *pde; 4722 vm_page_t pdpg; 4723 vm_pindex_t pdpindex; 4724 4725 PG_V = pmap_valid_bit(pmap); 4726 4727 retry: 4728 pdpe = pmap_pdpe(pmap, va); 4729 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4730 pde = pmap_pdpe_to_pde(pdpe, va); 4731 if (va < VM_MAXUSER_ADDRESS) { 4732 /* Add a reference to the pd page. */ 4733 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4734 pdpg->ref_count++; 4735 } else 4736 pdpg = NULL; 4737 } else if (va < VM_MAXUSER_ADDRESS) { 4738 /* Allocate a pd page. */ 4739 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4740 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4741 if (pdpg == NULL) { 4742 if (lockp != NULL) 4743 goto retry; 4744 else 4745 return (NULL); 4746 } 4747 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4748 pde = &pde[pmap_pde_index(va)]; 4749 } else 4750 panic("pmap_alloc_pde: missing page table page for va %#lx", 4751 va); 4752 *pdpgp = pdpg; 4753 return (pde); 4754 } 4755 4756 static vm_page_t 4757 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4758 { 4759 vm_pindex_t ptepindex; 4760 pd_entry_t *pd, PG_V; 4761 vm_page_t m; 4762 4763 PG_V = pmap_valid_bit(pmap); 4764 4765 /* 4766 * Calculate pagetable page index 4767 */ 4768 ptepindex = pmap_pde_pindex(va); 4769 retry: 4770 /* 4771 * Get the page directory entry 4772 */ 4773 pd = pmap_pde(pmap, va); 4774 4775 /* 4776 * This supports switching from a 2MB page to a 4777 * normal 4K page. 4778 */ 4779 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4780 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4781 /* 4782 * Invalidation of the 2MB page mapping may have caused 4783 * the deallocation of the underlying PD page. 4784 */ 4785 pd = NULL; 4786 } 4787 } 4788 4789 /* 4790 * If the page table page is mapped, we just increment the 4791 * hold count, and activate it. 4792 */ 4793 if (pd != NULL && (*pd & PG_V) != 0) { 4794 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4795 m->ref_count++; 4796 } else { 4797 /* 4798 * Here if the pte page isn't mapped, or if it has been 4799 * deallocated. 4800 */ 4801 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4802 if (m == NULL && lockp != NULL) 4803 goto retry; 4804 } 4805 return (m); 4806 } 4807 4808 /*************************************************** 4809 * Pmap allocation/deallocation routines. 4810 ***************************************************/ 4811 4812 /* 4813 * Release any resources held by the given physical map. 4814 * Called when a pmap initialized by pmap_pinit is being released. 4815 * Should only be called if the map contains no valid mappings. 
4816 */ 4817 void 4818 pmap_release(pmap_t pmap) 4819 { 4820 vm_page_t m; 4821 int i; 4822 4823 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4824 ("pmap_release: pmap %p has reserved page table page(s)", 4825 pmap)); 4826 KASSERT(CPU_EMPTY(&pmap->pm_active), 4827 ("releasing active pmap %p", pmap)); 4828 4829 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4830 4831 if (pmap_is_la57(pmap)) { 4832 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4833 pmap->pm_pmltop[PML5PML5I] = 0; 4834 } else { 4835 for (i = 0; i < NKPML4E; i++) /* KVA */ 4836 pmap->pm_pmltop[KPML4BASE + i] = 0; 4837 #ifdef KASAN 4838 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4839 pmap->pm_pmltop[KASANPML4I + i] = 0; 4840 #endif 4841 #ifdef KMSAN 4842 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4843 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4844 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ 4845 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4846 #endif 4847 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4848 pmap->pm_pmltop[DMPML4I + i] = 0; 4849 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4850 for (i = 0; i < lm_ents; i++) /* Large Map */ 4851 pmap->pm_pmltop[LMSPML4I + i] = 0; 4852 } 4853 4854 pmap_free_pt_page(NULL, m, true); 4855 pmap_pt_page_count_pinit(pmap, -1); 4856 4857 if (pmap->pm_pmltopu != NULL) { 4858 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4859 pm_pmltopu)); 4860 pmap_free_pt_page(NULL, m, false); 4861 pmap_pt_page_count_pinit(pmap, -1); 4862 } 4863 if (pmap->pm_type == PT_X86 && 4864 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4865 rangeset_fini(&pmap->pm_pkru); 4866 4867 KASSERT(pmap->pm_stats.resident_count == 0, 4868 ("pmap_release: pmap %p resident count %ld != 0", 4869 pmap, pmap->pm_stats.resident_count)); 4870 } 4871 4872 static int 4873 kvm_size(SYSCTL_HANDLER_ARGS) 4874 { 4875 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4876 4877 return sysctl_handle_long(oidp, &ksize, 0, req); 4878 } 4879 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4880 0, 0, kvm_size, "LU", 4881 "Size of KVM"); 4882 4883 static int 4884 kvm_free(SYSCTL_HANDLER_ARGS) 4885 { 4886 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 4887 4888 return sysctl_handle_long(oidp, &kfree, 0, req); 4889 } 4890 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4891 0, 0, kvm_free, "LU", 4892 "Amount of KVM free"); 4893 4894 #ifdef KMSAN 4895 static void 4896 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 4897 { 4898 pdp_entry_t *pdpe; 4899 pd_entry_t *pde; 4900 pt_entry_t *pte; 4901 vm_paddr_t dummypa, dummypd, dummypt; 4902 int i, npde, npdpg; 4903 4904 npdpg = howmany(size, NBPDP); 4905 npde = size / NBPDR; 4906 4907 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 4908 pagezero((void *)PHYS_TO_DMAP(dummypa)); 4909 4910 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 4911 pagezero((void *)PHYS_TO_DMAP(dummypt)); 4912 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 4913 for (i = 0; i < npdpg; i++) 4914 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 4915 4916 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 4917 for (i = 0; i < NPTEPG; i++) 4918 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 4919 X86_PG_A | X86_PG_M | pg_nx); 4920 4921 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 4922 for (i = 0; i < npde; i++) 4923 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 4924 4925 pdpe = (pdp_entry_t 
*)PHYS_TO_DMAP(pdppa); 4926 for (i = 0; i < npdpg; i++) 4927 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 4928 X86_PG_RW | pg_nx); 4929 } 4930 4931 static void 4932 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 4933 { 4934 vm_size_t size; 4935 4936 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 4937 4938 /* 4939 * The end of the page array's KVA region is 2MB aligned, see 4940 * kmem_init(). 4941 */ 4942 size = round_2mpage(end) - start; 4943 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 4944 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 4945 } 4946 #endif 4947 4948 /* 4949 * Allocate physical memory for the vm_page array and map it into KVA, 4950 * attempting to back the vm_pages with domain-local memory. 4951 */ 4952 void 4953 pmap_page_array_startup(long pages) 4954 { 4955 pdp_entry_t *pdpe; 4956 pd_entry_t *pde, newpdir; 4957 vm_offset_t va, start, end; 4958 vm_paddr_t pa; 4959 long pfn; 4960 int domain, i; 4961 4962 vm_page_array_size = pages; 4963 4964 start = VM_MIN_KERNEL_ADDRESS; 4965 end = start + pages * sizeof(struct vm_page); 4966 for (va = start; va < end; va += NBPDR) { 4967 pfn = first_page + (va - start) / sizeof(struct vm_page); 4968 domain = vm_phys_domain(ptoa(pfn)); 4969 pdpe = pmap_pdpe(kernel_pmap, va); 4970 if ((*pdpe & X86_PG_V) == 0) { 4971 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 4972 dump_add_page(pa); 4973 pagezero((void *)PHYS_TO_DMAP(pa)); 4974 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 4975 X86_PG_A | X86_PG_M); 4976 } 4977 pde = pmap_pdpe_to_pde(pdpe, va); 4978 if ((*pde & X86_PG_V) != 0) 4979 panic("Unexpected pde"); 4980 pa = vm_phys_early_alloc(domain, NBPDR); 4981 for (i = 0; i < NPDEPG; i++) 4982 dump_add_page(pa + i * PAGE_SIZE); 4983 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 4984 X86_PG_M | PG_PS | pg_g | pg_nx); 4985 pde_store(pde, newpdir); 4986 } 4987 vm_page_array = (vm_page_t)start; 4988 4989 #ifdef KMSAN 4990 pmap_kmsan_page_array_startup(start, end); 4991 #endif 4992 } 4993 4994 /* 4995 * grow the number of kernel page table entries, if needed 4996 */ 4997 void 4998 pmap_growkernel(vm_offset_t addr) 4999 { 5000 vm_paddr_t paddr; 5001 vm_page_t nkpg; 5002 pd_entry_t *pde, newpdir; 5003 pdp_entry_t *pdpe; 5004 5005 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5006 5007 /* 5008 * Return if "addr" is within the range of kernel page table pages 5009 * that were preallocated during pmap bootstrap. Moreover, leave 5010 * "kernel_vm_end" and the kernel page table as they were. 5011 * 5012 * The correctness of this action is based on the following 5013 * argument: vm_map_insert() allocates contiguous ranges of the 5014 * kernel virtual address space. It calls this function if a range 5015 * ends after "kernel_vm_end". If the kernel is mapped between 5016 * "kernel_vm_end" and "addr", then the range cannot begin at 5017 * "kernel_vm_end". In fact, its beginning address cannot be less 5018 * than the kernel. Thus, there is no immediate need to allocate 5019 * any new kernel page table pages between "kernel_vm_end" and 5020 * "KERNBASE". 
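 * Otherwise, the loop below extends the kernel page table in 2 MB (NBPDR)
 * steps: "addr" is rounded up to a 2 MB boundary and, for each missing
 * page-directory entry, a new zero-filled page table page is installed
 * (allocating a new page directory page first when the covering PDP entry
 * is not yet valid), advancing "kernel_vm_end" until it covers "addr".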
5021 */ 5022 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 5023 return; 5024 5025 addr = roundup2(addr, NBPDR); 5026 if (addr - 1 >= vm_map_max(kernel_map)) 5027 addr = vm_map_max(kernel_map); 5028 if (kernel_vm_end < addr) 5029 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 5030 if (kernel_vm_end < addr) 5031 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 5032 while (kernel_vm_end < addr) { 5033 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 5034 if ((*pdpe & X86_PG_V) == 0) { 5035 /* We need a new PDP entry */ 5036 nkpg = pmap_alloc_pt_page(kernel_pmap, 5037 kernel_vm_end >> PDPSHIFT, VM_ALLOC_WIRED | 5038 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5039 if (nkpg == NULL) 5040 panic("pmap_growkernel: no memory to grow kernel"); 5041 paddr = VM_PAGE_TO_PHYS(nkpg); 5042 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5043 X86_PG_A | X86_PG_M); 5044 continue; /* try again */ 5045 } 5046 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 5047 if ((*pde & X86_PG_V) != 0) { 5048 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 5049 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 5050 kernel_vm_end = vm_map_max(kernel_map); 5051 break; 5052 } 5053 continue; 5054 } 5055 5056 nkpg = pmap_alloc_pt_page(kernel_pmap, 5057 pmap_pde_pindex(kernel_vm_end), VM_ALLOC_WIRED | 5058 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5059 if (nkpg == NULL) 5060 panic("pmap_growkernel: no memory to grow kernel"); 5061 paddr = VM_PAGE_TO_PHYS(nkpg); 5062 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5063 pde_store(pde, newpdir); 5064 5065 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 5066 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 5067 kernel_vm_end = vm_map_max(kernel_map); 5068 break; 5069 } 5070 } 5071 } 5072 5073 /*************************************************** 5074 * page management routines. 
5075 ***************************************************/ 5076 5077 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 5078 CTASSERT(_NPCM == 3); 5079 CTASSERT(_NPCPV == 168); 5080 5081 static __inline struct pv_chunk * 5082 pv_to_chunk(pv_entry_t pv) 5083 { 5084 5085 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 5086 } 5087 5088 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 5089 5090 #define PC_FREE0 0xfffffffffffffffful 5091 #define PC_FREE1 0xfffffffffffffffful 5092 #define PC_FREE2 0x000000fffffffffful 5093 5094 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 5095 5096 #ifdef PV_STATS 5097 5098 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5099 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5100 &pc_chunk_count, "Current number of pv entry cnunks"); 5101 5102 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5103 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5104 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5105 5106 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5107 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5108 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5109 5110 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5111 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5112 &pc_chunk_tryfail, 5113 "Number of failed attempts to get a pv entry chunk page"); 5114 5115 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5116 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5117 &pv_entry_frees, "Total number of pv entries freed"); 5118 5119 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5120 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5121 &pv_entry_allocs, "Total number of pv entries allocated"); 5122 5123 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5124 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5125 &pv_entry_count, "Current number of pv entries"); 5126 5127 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5128 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5129 &pv_entry_spare, "Current number of spare pv entries"); 5130 #endif 5131 5132 static void 5133 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5134 { 5135 5136 if (pmap == NULL) 5137 return; 5138 pmap_invalidate_all(pmap); 5139 if (pmap != locked_pmap) 5140 PMAP_UNLOCK(pmap); 5141 if (start_di) 5142 pmap_delayed_invl_finish(); 5143 } 5144 5145 /* 5146 * We are in a serious low memory condition. Resort to 5147 * drastic measures to free some pages so we can allocate 5148 * another pv entry chunk. 5149 * 5150 * Returns NULL if PV entries were reclaimed from the specified pmap. 5151 * 5152 * We do not, however, unmap 2mpages because subsequent accesses will 5153 * allocate per-page pv entries until repromotion occurs, thereby 5154 * exacerbating the shortage of free pv entries. 
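 * The scan below walks the per-domain pv_chunks LRU list between two
 * marker chunks, opportunistically locking each victim pmap (a trylock is
 * used for pmaps that sort below the locked pmap, to avoid deadlock) and
 * freeing every reclaimable 4 KB mapping in the chunks it visits.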
5155 */ 5156 static vm_page_t 5157 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 5158 { 5159 struct pv_chunks_list *pvc; 5160 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 5161 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 5162 struct md_page *pvh; 5163 pd_entry_t *pde; 5164 pmap_t next_pmap, pmap; 5165 pt_entry_t *pte, tpte; 5166 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 5167 pv_entry_t pv; 5168 vm_offset_t va; 5169 vm_page_t m, m_pc; 5170 struct spglist free; 5171 uint64_t inuse; 5172 int bit, field, freed; 5173 bool start_di, restart; 5174 5175 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 5176 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 5177 pmap = NULL; 5178 m_pc = NULL; 5179 PG_G = PG_A = PG_M = PG_RW = 0; 5180 SLIST_INIT(&free); 5181 bzero(&pc_marker_b, sizeof(pc_marker_b)); 5182 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 5183 pc_marker = (struct pv_chunk *)&pc_marker_b; 5184 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 5185 5186 /* 5187 * A delayed invalidation block should already be active if 5188 * pmap_advise() or pmap_remove() called this function by way 5189 * of pmap_demote_pde_locked(). 5190 */ 5191 start_di = pmap_not_in_di(); 5192 5193 pvc = &pv_chunks[domain]; 5194 mtx_lock(&pvc->pvc_lock); 5195 pvc->active_reclaims++; 5196 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 5197 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 5198 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 5199 SLIST_EMPTY(&free)) { 5200 next_pmap = pc->pc_pmap; 5201 if (next_pmap == NULL) { 5202 /* 5203 * The next chunk is a marker. However, it is 5204 * not our marker, so active_reclaims must be 5205 * > 1. Consequently, the next_chunk code 5206 * will not rotate the pv_chunks list. 5207 */ 5208 goto next_chunk; 5209 } 5210 mtx_unlock(&pvc->pvc_lock); 5211 5212 /* 5213 * A pv_chunk can only be removed from the pc_lru list 5214 * when both pc_chunks_mutex is owned and the 5215 * corresponding pmap is locked. 5216 */ 5217 if (pmap != next_pmap) { 5218 restart = false; 5219 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 5220 start_di); 5221 pmap = next_pmap; 5222 /* Avoid deadlock and lock recursion. */ 5223 if (pmap > locked_pmap) { 5224 RELEASE_PV_LIST_LOCK(lockp); 5225 PMAP_LOCK(pmap); 5226 if (start_di) 5227 pmap_delayed_invl_start(); 5228 mtx_lock(&pvc->pvc_lock); 5229 restart = true; 5230 } else if (pmap != locked_pmap) { 5231 if (PMAP_TRYLOCK(pmap)) { 5232 if (start_di) 5233 pmap_delayed_invl_start(); 5234 mtx_lock(&pvc->pvc_lock); 5235 restart = true; 5236 } else { 5237 pmap = NULL; /* pmap is not locked */ 5238 mtx_lock(&pvc->pvc_lock); 5239 pc = TAILQ_NEXT(pc_marker, pc_lru); 5240 if (pc == NULL || 5241 pc->pc_pmap != next_pmap) 5242 continue; 5243 goto next_chunk; 5244 } 5245 } else if (start_di) 5246 pmap_delayed_invl_start(); 5247 PG_G = pmap_global_bit(pmap); 5248 PG_A = pmap_accessed_bit(pmap); 5249 PG_M = pmap_modified_bit(pmap); 5250 PG_RW = pmap_rw_bit(pmap); 5251 if (restart) 5252 continue; 5253 } 5254 5255 /* 5256 * Destroy every non-wired, 4 KB page mapping in the chunk. 
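 * Allocated entries are found by inverting pc_map and masking with
 * pc_freemask; bsfq() then yields each in-use index in turn. For example,
 * if pc_map[0] were 0xffffffffffffff0f, entries 4 through 7 of that word
 * are in use and would be visited, and their bits are set again in
 * pc_map[0] as the entries are freed.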
5257 */ 5258 freed = 0; 5259 for (field = 0; field < _NPCM; field++) { 5260 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5261 inuse != 0; inuse &= ~(1UL << bit)) { 5262 bit = bsfq(inuse); 5263 pv = &pc->pc_pventry[field * 64 + bit]; 5264 va = pv->pv_va; 5265 pde = pmap_pde(pmap, va); 5266 if ((*pde & PG_PS) != 0) 5267 continue; 5268 pte = pmap_pde_to_pte(pde, va); 5269 if ((*pte & PG_W) != 0) 5270 continue; 5271 tpte = pte_load_clear(pte); 5272 if ((tpte & PG_G) != 0) 5273 pmap_invalidate_page(pmap, va); 5274 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5275 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5276 vm_page_dirty(m); 5277 if ((tpte & PG_A) != 0) 5278 vm_page_aflag_set(m, PGA_REFERENCED); 5279 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5280 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5281 m->md.pv_gen++; 5282 if (TAILQ_EMPTY(&m->md.pv_list) && 5283 (m->flags & PG_FICTITIOUS) == 0) { 5284 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5285 if (TAILQ_EMPTY(&pvh->pv_list)) { 5286 vm_page_aflag_clear(m, 5287 PGA_WRITEABLE); 5288 } 5289 } 5290 pmap_delayed_invl_page(m); 5291 pc->pc_map[field] |= 1UL << bit; 5292 pmap_unuse_pt(pmap, va, *pde, &free); 5293 freed++; 5294 } 5295 } 5296 if (freed == 0) { 5297 mtx_lock(&pvc->pvc_lock); 5298 goto next_chunk; 5299 } 5300 /* Every freed mapping is for a 4 KB page. */ 5301 pmap_resident_count_adj(pmap, -freed); 5302 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5303 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5304 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5305 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5306 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 5307 pc->pc_map[2] == PC_FREE2) { 5308 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5309 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5310 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5311 /* Entire chunk is free; return it. */ 5312 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5313 dump_drop_page(m_pc->phys_addr); 5314 mtx_lock(&pvc->pvc_lock); 5315 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5316 break; 5317 } 5318 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5319 mtx_lock(&pvc->pvc_lock); 5320 /* One freed pv entry in locked_pmap is sufficient. */ 5321 if (pmap == locked_pmap) 5322 break; 5323 next_chunk: 5324 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5325 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5326 if (pvc->active_reclaims == 1 && pmap != NULL) { 5327 /* 5328 * Rotate the pv chunks list so that we do not 5329 * scan the same pv chunks that could not be 5330 * freed (because they contained a wired 5331 * and/or superpage mapping) on every 5332 * invocation of reclaim_pv_chunk(). 5333 */ 5334 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5335 MPASS(pc->pc_pmap != NULL); 5336 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5337 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5338 } 5339 } 5340 } 5341 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5342 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5343 pvc->active_reclaims--; 5344 mtx_unlock(&pvc->pvc_lock); 5345 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5346 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5347 m_pc = SLIST_FIRST(&free); 5348 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5349 /* Recycle a freed page table page. 
*/ 5350 m_pc->ref_count = 1; 5351 } 5352 vm_page_free_pages_toq(&free, true); 5353 return (m_pc); 5354 } 5355 5356 static vm_page_t 5357 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5358 { 5359 vm_page_t m; 5360 int i, domain; 5361 5362 domain = PCPU_GET(domain); 5363 for (i = 0; i < vm_ndomains; i++) { 5364 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5365 if (m != NULL) 5366 break; 5367 domain = (domain + 1) % vm_ndomains; 5368 } 5369 5370 return (m); 5371 } 5372 5373 /* 5374 * free the pv_entry back to the free list 5375 */ 5376 static void 5377 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5378 { 5379 struct pv_chunk *pc; 5380 int idx, field, bit; 5381 5382 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5383 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5384 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5385 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5386 pc = pv_to_chunk(pv); 5387 idx = pv - &pc->pc_pventry[0]; 5388 field = idx / 64; 5389 bit = idx % 64; 5390 pc->pc_map[field] |= 1ul << bit; 5391 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 5392 pc->pc_map[2] != PC_FREE2) { 5393 /* 98% of the time, pc is already at the head of the list. */ 5394 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5395 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5396 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5397 } 5398 return; 5399 } 5400 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5401 free_pv_chunk(pc); 5402 } 5403 5404 static void 5405 free_pv_chunk_dequeued(struct pv_chunk *pc) 5406 { 5407 vm_page_t m; 5408 5409 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5410 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5411 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5412 counter_u64_add(pv_page_count, -1); 5413 /* entire chunk is free, return it */ 5414 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5415 dump_drop_page(m->phys_addr); 5416 vm_page_unwire_noq(m); 5417 vm_page_free(m); 5418 } 5419 5420 static void 5421 free_pv_chunk(struct pv_chunk *pc) 5422 { 5423 struct pv_chunks_list *pvc; 5424 5425 pvc = &pv_chunks[pc_to_domain(pc)]; 5426 mtx_lock(&pvc->pvc_lock); 5427 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5428 mtx_unlock(&pvc->pvc_lock); 5429 free_pv_chunk_dequeued(pc); 5430 } 5431 5432 static void 5433 free_pv_chunk_batch(struct pv_chunklist *batch) 5434 { 5435 struct pv_chunks_list *pvc; 5436 struct pv_chunk *pc, *npc; 5437 int i; 5438 5439 for (i = 0; i < vm_ndomains; i++) { 5440 if (TAILQ_EMPTY(&batch[i])) 5441 continue; 5442 pvc = &pv_chunks[i]; 5443 mtx_lock(&pvc->pvc_lock); 5444 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5445 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5446 } 5447 mtx_unlock(&pvc->pvc_lock); 5448 } 5449 5450 for (i = 0; i < vm_ndomains; i++) { 5451 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5452 free_pv_chunk_dequeued(pc); 5453 } 5454 } 5455 } 5456 5457 /* 5458 * Returns a new PV entry, allocating a new PV chunk from the system when 5459 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5460 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5461 * returned. 5462 * 5463 * The given PV list lock may be released. 
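 * Each chunk is a single page holding _NPCPV (168) pv entries, tracked by
 * the three 64-bit words of pc_map: 168 = 64 + 64 + 40, which is why
 * PC_FREE2 has only its low 40 bits set while PC_FREE0 and PC_FREE1 are
 * all ones.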
5464 */ 5465 static pv_entry_t 5466 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5467 { 5468 struct pv_chunks_list *pvc; 5469 int bit, field; 5470 pv_entry_t pv; 5471 struct pv_chunk *pc; 5472 vm_page_t m; 5473 5474 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5475 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5476 retry: 5477 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5478 if (pc != NULL) { 5479 for (field = 0; field < _NPCM; field++) { 5480 if (pc->pc_map[field]) { 5481 bit = bsfq(pc->pc_map[field]); 5482 break; 5483 } 5484 } 5485 if (field < _NPCM) { 5486 pv = &pc->pc_pventry[field * 64 + bit]; 5487 pc->pc_map[field] &= ~(1ul << bit); 5488 /* If this was the last item, move it to tail */ 5489 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 5490 pc->pc_map[2] == 0) { 5491 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5492 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5493 pc_list); 5494 } 5495 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5496 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5497 return (pv); 5498 } 5499 } 5500 /* No free items, allocate another chunk */ 5501 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5502 if (m == NULL) { 5503 if (lockp == NULL) { 5504 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5505 return (NULL); 5506 } 5507 m = reclaim_pv_chunk(pmap, lockp); 5508 if (m == NULL) 5509 goto retry; 5510 } else 5511 counter_u64_add(pv_page_count, 1); 5512 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5513 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5514 dump_add_page(m->phys_addr); 5515 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5516 pc->pc_pmap = pmap; 5517 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 5518 pc->pc_map[1] = PC_FREE1; 5519 pc->pc_map[2] = PC_FREE2; 5520 pvc = &pv_chunks[vm_page_domain(m)]; 5521 mtx_lock(&pvc->pvc_lock); 5522 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5523 mtx_unlock(&pvc->pvc_lock); 5524 pv = &pc->pc_pventry[0]; 5525 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5526 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5527 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5528 return (pv); 5529 } 5530 5531 /* 5532 * Returns the number of one bits within the given PV chunk map. 5533 * 5534 * The erratas for Intel processors state that "POPCNT Instruction May 5535 * Take Longer to Execute Than Expected". It is believed that the 5536 * issue is the spurious dependency on the destination register. 5537 * Provide a hint to the register rename logic that the destination 5538 * value is overwritten, by clearing it, as suggested in the 5539 * optimization manual. It should be cheap for unaffected processors 5540 * as well. 5541 * 5542 * Reference numbers for erratas are 5543 * 4th Gen Core: HSD146 5544 * 5th Gen Core: BDM85 5545 * 6th Gen Core: SKL029 5546 */ 5547 static int 5548 popcnt_pc_map_pq(uint64_t *map) 5549 { 5550 u_long result, tmp; 5551 5552 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5553 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5554 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5555 : "=&r" (result), "=&r" (tmp) 5556 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5557 return (result); 5558 } 5559 5560 /* 5561 * Ensure that the number of spare PV entries in the specified pmap meets or 5562 * exceeds the given count, "needed". 5563 * 5564 * The given PV list lock may be released. 
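 *
 * For example, pmap_demote_pde_locked() calls
 * reserve_pv_entries(pmap, NPTEPG - 1, lockp) for a managed mapping so
 * that all 511 pv entries needed by pmap_pv_demote_pde() exist before
 * the PDE is changed and the PV lists are in an intermediate state.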
5565 */ 5566 static void 5567 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5568 { 5569 struct pv_chunks_list *pvc; 5570 struct pch new_tail[PMAP_MEMDOM]; 5571 struct pv_chunk *pc; 5572 vm_page_t m; 5573 int avail, free, i; 5574 bool reclaimed; 5575 5576 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5577 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5578 5579 /* 5580 * Newly allocated PV chunks must be stored in a private list until 5581 * the required number of PV chunks have been allocated. Otherwise, 5582 * reclaim_pv_chunk() could recycle one of these chunks. In 5583 * contrast, these chunks must be added to the pmap upon allocation. 5584 */ 5585 for (i = 0; i < PMAP_MEMDOM; i++) 5586 TAILQ_INIT(&new_tail[i]); 5587 retry: 5588 avail = 0; 5589 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5590 #ifndef __POPCNT__ 5591 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5592 bit_count((bitstr_t *)pc->pc_map, 0, 5593 sizeof(pc->pc_map) * NBBY, &free); 5594 else 5595 #endif 5596 free = popcnt_pc_map_pq(pc->pc_map); 5597 if (free == 0) 5598 break; 5599 avail += free; 5600 if (avail >= needed) 5601 break; 5602 } 5603 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5604 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5605 if (m == NULL) { 5606 m = reclaim_pv_chunk(pmap, lockp); 5607 if (m == NULL) 5608 goto retry; 5609 reclaimed = true; 5610 } else 5611 counter_u64_add(pv_page_count, 1); 5612 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5613 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5614 dump_add_page(m->phys_addr); 5615 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5616 pc->pc_pmap = pmap; 5617 pc->pc_map[0] = PC_FREE0; 5618 pc->pc_map[1] = PC_FREE1; 5619 pc->pc_map[2] = PC_FREE2; 5620 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5621 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5622 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5623 5624 /* 5625 * The reclaim might have freed a chunk from the current pmap. 5626 * If that chunk contained available entries, we need to 5627 * re-count the number of available entries. 5628 */ 5629 if (reclaimed) 5630 goto retry; 5631 } 5632 for (i = 0; i < vm_ndomains; i++) { 5633 if (TAILQ_EMPTY(&new_tail[i])) 5634 continue; 5635 pvc = &pv_chunks[i]; 5636 mtx_lock(&pvc->pvc_lock); 5637 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5638 mtx_unlock(&pvc->pvc_lock); 5639 } 5640 } 5641 5642 /* 5643 * First find and then remove the pv entry for the specified pmap and virtual 5644 * address from the specified pv list. Returns the pv entry if found and NULL 5645 * otherwise. This operation can be performed on pv lists for either 4KB or 5646 * 2MB page mappings. 5647 */ 5648 static __inline pv_entry_t 5649 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5650 { 5651 pv_entry_t pv; 5652 5653 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5654 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5655 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5656 pvh->pv_gen++; 5657 break; 5658 } 5659 } 5660 return (pv); 5661 } 5662 5663 /* 5664 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5665 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5666 * entries for each of the 4KB page mappings. 
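 *
 * The caller must already have reserved the NPTEPG - 1 spare pv entries
 * (see reserve_pv_entries()); the loop below only takes entries from
 * chunks that are already on pm_pvchunk and asserts, rather than
 * handles, a missing spare.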
5667 */ 5668 static void 5669 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5670 struct rwlock **lockp) 5671 { 5672 struct md_page *pvh; 5673 struct pv_chunk *pc; 5674 pv_entry_t pv; 5675 vm_offset_t va_last; 5676 vm_page_t m; 5677 int bit, field; 5678 5679 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5680 KASSERT((pa & PDRMASK) == 0, 5681 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5682 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5683 5684 /* 5685 * Transfer the 2mpage's pv entry for this mapping to the first 5686 * page's pv list. Once this transfer begins, the pv list lock 5687 * must not be released until the last pv entry is reinstantiated. 5688 */ 5689 pvh = pa_to_pvh(pa); 5690 va = trunc_2mpage(va); 5691 pv = pmap_pvh_remove(pvh, pmap, va); 5692 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5693 m = PHYS_TO_VM_PAGE(pa); 5694 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5695 m->md.pv_gen++; 5696 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5697 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5698 va_last = va + NBPDR - PAGE_SIZE; 5699 for (;;) { 5700 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5701 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 5702 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 5703 for (field = 0; field < _NPCM; field++) { 5704 while (pc->pc_map[field]) { 5705 bit = bsfq(pc->pc_map[field]); 5706 pc->pc_map[field] &= ~(1ul << bit); 5707 pv = &pc->pc_pventry[field * 64 + bit]; 5708 va += PAGE_SIZE; 5709 pv->pv_va = va; 5710 m++; 5711 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5712 ("pmap_pv_demote_pde: page %p is not managed", m)); 5713 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5714 m->md.pv_gen++; 5715 if (va == va_last) 5716 goto out; 5717 } 5718 } 5719 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5720 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5721 } 5722 out: 5723 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 5724 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5725 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5726 } 5727 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5728 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5729 } 5730 5731 #if VM_NRESERVLEVEL > 0 5732 /* 5733 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5734 * replace the many pv entries for the 4KB page mappings by a single pv entry 5735 * for the 2MB page mapping. 5736 */ 5737 static void 5738 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5739 struct rwlock **lockp) 5740 { 5741 struct md_page *pvh; 5742 pv_entry_t pv; 5743 vm_offset_t va_last; 5744 vm_page_t m; 5745 5746 KASSERT((pa & PDRMASK) == 0, 5747 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5748 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5749 5750 /* 5751 * Transfer the first page's pv entry for this mapping to the 2mpage's 5752 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5753 * a transfer avoids the possibility that get_pv_entry() calls 5754 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5755 * mappings that is being promoted. 5756 */ 5757 m = PHYS_TO_VM_PAGE(pa); 5758 va = trunc_2mpage(va); 5759 pv = pmap_pvh_remove(&m->md, pmap, va); 5760 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5761 pvh = pa_to_pvh(pa); 5762 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5763 pvh->pv_gen++; 5764 /* Free the remaining NPTEPG - 1 pv entries. 
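 * The loop below walks the other NPTEPG - 1 4KB pages of the superpage
 * and releases each page's entry through pmap_pvh_free().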
*/ 5765 va_last = va + NBPDR - PAGE_SIZE; 5766 do { 5767 m++; 5768 va += PAGE_SIZE; 5769 pmap_pvh_free(&m->md, pmap, va); 5770 } while (va < va_last); 5771 } 5772 #endif /* VM_NRESERVLEVEL > 0 */ 5773 5774 /* 5775 * First find and then destroy the pv entry for the specified pmap and virtual 5776 * address. This operation can be performed on pv lists for either 4KB or 2MB 5777 * page mappings. 5778 */ 5779 static void 5780 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5781 { 5782 pv_entry_t pv; 5783 5784 pv = pmap_pvh_remove(pvh, pmap, va); 5785 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5786 free_pv_entry(pmap, pv); 5787 } 5788 5789 /* 5790 * Conditionally create the PV entry for a 4KB page mapping if the required 5791 * memory can be allocated without resorting to reclamation. 5792 */ 5793 static boolean_t 5794 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5795 struct rwlock **lockp) 5796 { 5797 pv_entry_t pv; 5798 5799 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5800 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5801 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5802 pv->pv_va = va; 5803 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5804 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5805 m->md.pv_gen++; 5806 return (TRUE); 5807 } else 5808 return (FALSE); 5809 } 5810 5811 /* 5812 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5813 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5814 * false if the PV entry cannot be allocated without resorting to reclamation. 5815 */ 5816 static bool 5817 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5818 struct rwlock **lockp) 5819 { 5820 struct md_page *pvh; 5821 pv_entry_t pv; 5822 vm_paddr_t pa; 5823 5824 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5825 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5826 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5827 NULL : lockp)) == NULL) 5828 return (false); 5829 pv->pv_va = va; 5830 pa = pde & PG_PS_FRAME; 5831 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5832 pvh = pa_to_pvh(pa); 5833 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5834 pvh->pv_gen++; 5835 return (true); 5836 } 5837 5838 /* 5839 * Fills a page table page with mappings to consecutive physical pages. 5840 */ 5841 static void 5842 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5843 { 5844 pt_entry_t *pte; 5845 5846 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5847 *pte = newpte; 5848 newpte += PAGE_SIZE; 5849 } 5850 } 5851 5852 /* 5853 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5854 * mapping is invalidated. 
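 *
 * This is the convenience wrapper: it starts with no pv list lock,
 * forwards to pmap_demote_pde_locked(), and releases whatever lock that
 * function may have acquired before returning its result.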
5855 */ 5856 static boolean_t 5857 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5858 { 5859 struct rwlock *lock; 5860 boolean_t rv; 5861 5862 lock = NULL; 5863 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5864 if (lock != NULL) 5865 rw_wunlock(lock); 5866 return (rv); 5867 } 5868 5869 static void 5870 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 5871 { 5872 #ifdef INVARIANTS 5873 #ifdef DIAGNOSTIC 5874 pt_entry_t *xpte, *ypte; 5875 5876 for (xpte = firstpte; xpte < firstpte + NPTEPG; 5877 xpte++, newpte += PAGE_SIZE) { 5878 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 5879 printf("pmap_demote_pde: xpte %zd and newpte map " 5880 "different pages: found %#lx, expected %#lx\n", 5881 xpte - firstpte, *xpte, newpte); 5882 printf("page table dump\n"); 5883 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 5884 printf("%zd %#lx\n", ypte - firstpte, *ypte); 5885 panic("firstpte"); 5886 } 5887 } 5888 #else 5889 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 5890 ("pmap_demote_pde: firstpte and newpte map different physical" 5891 " addresses")); 5892 #endif 5893 #endif 5894 } 5895 5896 static void 5897 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 5898 pd_entry_t oldpde, struct rwlock **lockp) 5899 { 5900 struct spglist free; 5901 vm_offset_t sva; 5902 5903 SLIST_INIT(&free); 5904 sva = trunc_2mpage(va); 5905 pmap_remove_pde(pmap, pde, sva, &free, lockp); 5906 if ((oldpde & pmap_global_bit(pmap)) == 0) 5907 pmap_invalidate_pde_page(pmap, sva, oldpde); 5908 vm_page_free_pages_toq(&free, true); 5909 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 5910 va, pmap); 5911 } 5912 5913 static boolean_t 5914 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 5915 struct rwlock **lockp) 5916 { 5917 pd_entry_t newpde, oldpde; 5918 pt_entry_t *firstpte, newpte; 5919 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 5920 vm_paddr_t mptepa; 5921 vm_page_t mpte; 5922 int PG_PTE_CACHE; 5923 bool in_kernel; 5924 5925 PG_A = pmap_accessed_bit(pmap); 5926 PG_G = pmap_global_bit(pmap); 5927 PG_M = pmap_modified_bit(pmap); 5928 PG_RW = pmap_rw_bit(pmap); 5929 PG_V = pmap_valid_bit(pmap); 5930 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 5931 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 5932 5933 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5934 in_kernel = va >= VM_MAXUSER_ADDRESS; 5935 oldpde = *pde; 5936 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 5937 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 5938 5939 /* 5940 * Invalidate the 2MB page mapping and return "failure" if the 5941 * mapping was never accessed. 5942 */ 5943 if ((oldpde & PG_A) == 0) { 5944 KASSERT((oldpde & PG_W) == 0, 5945 ("pmap_demote_pde: a wired mapping is missing PG_A")); 5946 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 5947 return (FALSE); 5948 } 5949 5950 mpte = pmap_remove_pt_page(pmap, va); 5951 if (mpte == NULL) { 5952 KASSERT((oldpde & PG_W) == 0, 5953 ("pmap_demote_pde: page table page for a wired mapping" 5954 " is missing")); 5955 5956 /* 5957 * If the page table page is missing and the mapping 5958 * is for a kernel address, the mapping must belong to 5959 * the direct map. Page table pages are preallocated 5960 * for every other part of the kernel address space, 5961 * so the direct map region is the only part of the 5962 * kernel address space that must be handled here. 
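 *
 * The KASSERT that follows encodes this invariant: a kernel va with no
 * saved page table page must lie within [DMAP_MIN_ADDRESS,
 * DMAP_MAX_ADDRESS).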
5963 */ 5964 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 5965 va < DMAP_MAX_ADDRESS), 5966 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 5967 5968 /* 5969 * If the 2MB page mapping belongs to the direct map 5970 * region of the kernel's address space, then the page 5971 * allocation request specifies the highest possible 5972 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 5973 * priority is normal. 5974 */ 5975 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 5976 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 5977 5978 /* 5979 * If the allocation of the new page table page fails, 5980 * invalidate the 2MB page mapping and return "failure". 5981 */ 5982 if (mpte == NULL) { 5983 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 5984 return (FALSE); 5985 } 5986 5987 if (!in_kernel) 5988 mpte->ref_count = NPTEPG; 5989 } 5990 mptepa = VM_PAGE_TO_PHYS(mpte); 5991 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 5992 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 5993 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 5994 ("pmap_demote_pde: oldpde is missing PG_M")); 5995 newpte = oldpde & ~PG_PS; 5996 newpte = pmap_swap_pat(pmap, newpte); 5997 5998 /* 5999 * If the page table page is not leftover from an earlier promotion, 6000 * initialize it. 6001 */ 6002 if (mpte->valid == 0) 6003 pmap_fill_ptp(firstpte, newpte); 6004 6005 pmap_demote_pde_check(firstpte, newpte); 6006 6007 /* 6008 * If the mapping has changed attributes, update the page table 6009 * entries. 6010 */ 6011 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6012 pmap_fill_ptp(firstpte, newpte); 6013 6014 /* 6015 * The spare PV entries must be reserved prior to demoting the 6016 * mapping, that is, prior to changing the PDE. Otherwise, the state 6017 * of the PDE and the PV lists will be inconsistent, which can result 6018 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6019 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6020 * PV entry for the 2MB page mapping that is being demoted. 6021 */ 6022 if ((oldpde & PG_MANAGED) != 0) 6023 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6024 6025 /* 6026 * Demote the mapping. This pmap is locked. The old PDE has 6027 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6028 * set. Thus, there is no danger of a race with another 6029 * processor changing the setting of PG_A and/or PG_M between 6030 * the read above and the store below. 6031 */ 6032 if (workaround_erratum383) 6033 pmap_update_pde(pmap, va, pde, newpde); 6034 else 6035 pde_store(pde, newpde); 6036 6037 /* 6038 * Invalidate a stale recursive mapping of the page table page. 6039 */ 6040 if (in_kernel) 6041 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6042 6043 /* 6044 * Demote the PV entry. 6045 */ 6046 if ((oldpde & PG_MANAGED) != 0) 6047 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6048 6049 counter_u64_add(pmap_pde_demotions, 1); 6050 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6051 va, pmap); 6052 return (TRUE); 6053 } 6054 6055 /* 6056 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
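 *
 * The PDE is not simply cleared.  It is re-pointed at the saved page
 * table page (zeroed first if that page still holds stale 4KB mappings
 * left over from a promotion), so the kernel page-table structure for
 * this range remains fully populated.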
6057 */ 6058 static void 6059 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6060 { 6061 pd_entry_t newpde; 6062 vm_paddr_t mptepa; 6063 vm_page_t mpte; 6064 6065 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6066 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6067 mpte = pmap_remove_pt_page(pmap, va); 6068 if (mpte == NULL) 6069 panic("pmap_remove_kernel_pde: Missing pt page."); 6070 6071 mptepa = VM_PAGE_TO_PHYS(mpte); 6072 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6073 6074 /* 6075 * If this page table page was unmapped by a promotion, then it 6076 * contains valid mappings. Zero it to invalidate those mappings. 6077 */ 6078 if (mpte->valid != 0) 6079 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6080 6081 /* 6082 * Demote the mapping. 6083 */ 6084 if (workaround_erratum383) 6085 pmap_update_pde(pmap, va, pde, newpde); 6086 else 6087 pde_store(pde, newpde); 6088 6089 /* 6090 * Invalidate a stale recursive mapping of the page table page. 6091 */ 6092 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6093 } 6094 6095 /* 6096 * pmap_remove_pde: do the things to unmap a superpage in a process 6097 */ 6098 static int 6099 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6100 struct spglist *free, struct rwlock **lockp) 6101 { 6102 struct md_page *pvh; 6103 pd_entry_t oldpde; 6104 vm_offset_t eva, va; 6105 vm_page_t m, mpte; 6106 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6107 6108 PG_G = pmap_global_bit(pmap); 6109 PG_A = pmap_accessed_bit(pmap); 6110 PG_M = pmap_modified_bit(pmap); 6111 PG_RW = pmap_rw_bit(pmap); 6112 6113 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6114 KASSERT((sva & PDRMASK) == 0, 6115 ("pmap_remove_pde: sva is not 2mpage aligned")); 6116 oldpde = pte_load_clear(pdq); 6117 if (oldpde & PG_W) 6118 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6119 if ((oldpde & PG_G) != 0) 6120 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6121 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6122 if (oldpde & PG_MANAGED) { 6123 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6124 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6125 pmap_pvh_free(pvh, pmap, sva); 6126 eva = sva + NBPDR; 6127 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6128 va < eva; va += PAGE_SIZE, m++) { 6129 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6130 vm_page_dirty(m); 6131 if (oldpde & PG_A) 6132 vm_page_aflag_set(m, PGA_REFERENCED); 6133 if (TAILQ_EMPTY(&m->md.pv_list) && 6134 TAILQ_EMPTY(&pvh->pv_list)) 6135 vm_page_aflag_clear(m, PGA_WRITEABLE); 6136 pmap_delayed_invl_page(m); 6137 } 6138 } 6139 if (pmap == kernel_pmap) { 6140 pmap_remove_kernel_pde(pmap, pdq, sva); 6141 } else { 6142 mpte = pmap_remove_pt_page(pmap, sva); 6143 if (mpte != NULL) { 6144 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 6145 ("pmap_remove_pde: pte page not promoted")); 6146 pmap_resident_count_adj(pmap, -1); 6147 KASSERT(mpte->ref_count == NPTEPG, 6148 ("pmap_remove_pde: pte page ref count error")); 6149 mpte->ref_count = 0; 6150 pmap_add_delayed_free_list(mpte, free, FALSE); 6151 } 6152 } 6153 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6154 } 6155 6156 /* 6157 * pmap_remove_pte: do the things to unmap a page in a process 6158 */ 6159 static int 6160 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6161 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6162 { 6163 struct md_page *pvh; 6164 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6165 vm_page_t m; 6166 6167 PG_A = pmap_accessed_bit(pmap); 6168 PG_M = 
pmap_modified_bit(pmap); 6169 PG_RW = pmap_rw_bit(pmap); 6170 6171 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6172 oldpte = pte_load_clear(ptq); 6173 if (oldpte & PG_W) 6174 pmap->pm_stats.wired_count -= 1; 6175 pmap_resident_count_adj(pmap, -1); 6176 if (oldpte & PG_MANAGED) { 6177 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6178 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6179 vm_page_dirty(m); 6180 if (oldpte & PG_A) 6181 vm_page_aflag_set(m, PGA_REFERENCED); 6182 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6183 pmap_pvh_free(&m->md, pmap, va); 6184 if (TAILQ_EMPTY(&m->md.pv_list) && 6185 (m->flags & PG_FICTITIOUS) == 0) { 6186 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6187 if (TAILQ_EMPTY(&pvh->pv_list)) 6188 vm_page_aflag_clear(m, PGA_WRITEABLE); 6189 } 6190 pmap_delayed_invl_page(m); 6191 } 6192 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6193 } 6194 6195 /* 6196 * Remove a single page from a process address space 6197 */ 6198 static void 6199 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6200 struct spglist *free) 6201 { 6202 struct rwlock *lock; 6203 pt_entry_t *pte, PG_V; 6204 6205 PG_V = pmap_valid_bit(pmap); 6206 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6207 if ((*pde & PG_V) == 0) 6208 return; 6209 pte = pmap_pde_to_pte(pde, va); 6210 if ((*pte & PG_V) == 0) 6211 return; 6212 lock = NULL; 6213 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6214 if (lock != NULL) 6215 rw_wunlock(lock); 6216 pmap_invalidate_page(pmap, va); 6217 } 6218 6219 /* 6220 * Removes the specified range of addresses from the page table page. 6221 */ 6222 static bool 6223 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6224 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6225 { 6226 pt_entry_t PG_G, *pte; 6227 vm_offset_t va; 6228 bool anyvalid; 6229 6230 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6231 PG_G = pmap_global_bit(pmap); 6232 anyvalid = false; 6233 va = eva; 6234 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6235 sva += PAGE_SIZE) { 6236 if (*pte == 0) { 6237 if (va != eva) { 6238 pmap_invalidate_range(pmap, va, sva); 6239 va = eva; 6240 } 6241 continue; 6242 } 6243 if ((*pte & PG_G) == 0) 6244 anyvalid = true; 6245 else if (va == eva) 6246 va = sva; 6247 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6248 sva += PAGE_SIZE; 6249 break; 6250 } 6251 } 6252 if (va != eva) 6253 pmap_invalidate_range(pmap, va, sva); 6254 return (anyvalid); 6255 } 6256 6257 /* 6258 * Remove the given range of addresses from the specified map. 6259 * 6260 * It is assumed that the start and end are properly 6261 * rounded to the page size. 6262 */ 6263 void 6264 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6265 { 6266 struct rwlock *lock; 6267 vm_page_t mt; 6268 vm_offset_t va_next; 6269 pml5_entry_t *pml5e; 6270 pml4_entry_t *pml4e; 6271 pdp_entry_t *pdpe; 6272 pd_entry_t ptpaddr, *pde; 6273 pt_entry_t PG_G, PG_V; 6274 struct spglist free; 6275 int anyvalid; 6276 6277 PG_G = pmap_global_bit(pmap); 6278 PG_V = pmap_valid_bit(pmap); 6279 6280 /* 6281 * If there are no resident pages besides the top level page 6282 * table page(s), there is nothing to do. Kernel pmap always 6283 * accounts whole preloaded area as resident, which makes its 6284 * resident count > 2. 6285 * Perform an unsynchronized read. This is, however, safe. 6286 */ 6287 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 
6288 1 : 0)) 6289 return; 6290 6291 anyvalid = 0; 6292 SLIST_INIT(&free); 6293 6294 pmap_delayed_invl_start(); 6295 PMAP_LOCK(pmap); 6296 pmap_pkru_on_remove(pmap, sva, eva); 6297 6298 /* 6299 * special handling of removing one page. a very 6300 * common operation and easy to short circuit some 6301 * code. 6302 */ 6303 if (sva + PAGE_SIZE == eva) { 6304 pde = pmap_pde(pmap, sva); 6305 if (pde && (*pde & PG_PS) == 0) { 6306 pmap_remove_page(pmap, sva, pde, &free); 6307 goto out; 6308 } 6309 } 6310 6311 lock = NULL; 6312 for (; sva < eva; sva = va_next) { 6313 if (pmap->pm_stats.resident_count == 0) 6314 break; 6315 6316 if (pmap_is_la57(pmap)) { 6317 pml5e = pmap_pml5e(pmap, sva); 6318 if ((*pml5e & PG_V) == 0) { 6319 va_next = (sva + NBPML5) & ~PML5MASK; 6320 if (va_next < sva) 6321 va_next = eva; 6322 continue; 6323 } 6324 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6325 } else { 6326 pml4e = pmap_pml4e(pmap, sva); 6327 } 6328 if ((*pml4e & PG_V) == 0) { 6329 va_next = (sva + NBPML4) & ~PML4MASK; 6330 if (va_next < sva) 6331 va_next = eva; 6332 continue; 6333 } 6334 6335 va_next = (sva + NBPDP) & ~PDPMASK; 6336 if (va_next < sva) 6337 va_next = eva; 6338 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6339 if ((*pdpe & PG_V) == 0) 6340 continue; 6341 if ((*pdpe & PG_PS) != 0) { 6342 KASSERT(va_next <= eva, 6343 ("partial update of non-transparent 1G mapping " 6344 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6345 *pdpe, sva, eva, va_next)); 6346 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6347 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6348 anyvalid = 1; 6349 *pdpe = 0; 6350 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6351 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6352 pmap_unwire_ptp(pmap, sva, mt, &free); 6353 continue; 6354 } 6355 6356 /* 6357 * Calculate index for next page table. 6358 */ 6359 va_next = (sva + NBPDR) & ~PDRMASK; 6360 if (va_next < sva) 6361 va_next = eva; 6362 6363 pde = pmap_pdpe_to_pde(pdpe, sva); 6364 ptpaddr = *pde; 6365 6366 /* 6367 * Weed out invalid mappings. 6368 */ 6369 if (ptpaddr == 0) 6370 continue; 6371 6372 /* 6373 * Check for large page. 6374 */ 6375 if ((ptpaddr & PG_PS) != 0) { 6376 /* 6377 * Are we removing the entire large page? If not, 6378 * demote the mapping and fall through. 6379 */ 6380 if (sva + NBPDR == va_next && eva >= va_next) { 6381 /* 6382 * The TLB entry for a PG_G mapping is 6383 * invalidated by pmap_remove_pde(). 6384 */ 6385 if ((ptpaddr & PG_G) == 0) 6386 anyvalid = 1; 6387 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6388 continue; 6389 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6390 &lock)) { 6391 /* The large page mapping was destroyed. */ 6392 continue; 6393 } else 6394 ptpaddr = *pde; 6395 } 6396 6397 /* 6398 * Limit our scan to either the end of the va represented 6399 * by the current page table page, or to the end of the 6400 * range being removed. 6401 */ 6402 if (va_next > eva) 6403 va_next = eva; 6404 6405 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6406 anyvalid = 1; 6407 } 6408 if (lock != NULL) 6409 rw_wunlock(lock); 6410 out: 6411 if (anyvalid) 6412 pmap_invalidate_all(pmap); 6413 PMAP_UNLOCK(pmap); 6414 pmap_delayed_invl_finish(); 6415 vm_page_free_pages_toq(&free, true); 6416 } 6417 6418 /* 6419 * Routine: pmap_remove_all 6420 * Function: 6421 * Removes this physical page from 6422 * all physical maps in which it resides. 6423 * Reflects back modify bits to the pager. 
6424 * 6425 * Notes: 6426 * Original versions of this routine were very 6427 * inefficient because they iteratively called 6428 * pmap_remove (slow...) 6429 */ 6430 6431 void 6432 pmap_remove_all(vm_page_t m) 6433 { 6434 struct md_page *pvh; 6435 pv_entry_t pv; 6436 pmap_t pmap; 6437 struct rwlock *lock; 6438 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6439 pd_entry_t *pde; 6440 vm_offset_t va; 6441 struct spglist free; 6442 int pvh_gen, md_gen; 6443 6444 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6445 ("pmap_remove_all: page %p is not managed", m)); 6446 SLIST_INIT(&free); 6447 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6448 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6449 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6450 rw_wlock(lock); 6451 retry: 6452 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6453 pmap = PV_PMAP(pv); 6454 if (!PMAP_TRYLOCK(pmap)) { 6455 pvh_gen = pvh->pv_gen; 6456 rw_wunlock(lock); 6457 PMAP_LOCK(pmap); 6458 rw_wlock(lock); 6459 if (pvh_gen != pvh->pv_gen) { 6460 PMAP_UNLOCK(pmap); 6461 goto retry; 6462 } 6463 } 6464 va = pv->pv_va; 6465 pde = pmap_pde(pmap, va); 6466 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6467 PMAP_UNLOCK(pmap); 6468 } 6469 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6470 pmap = PV_PMAP(pv); 6471 if (!PMAP_TRYLOCK(pmap)) { 6472 pvh_gen = pvh->pv_gen; 6473 md_gen = m->md.pv_gen; 6474 rw_wunlock(lock); 6475 PMAP_LOCK(pmap); 6476 rw_wlock(lock); 6477 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6478 PMAP_UNLOCK(pmap); 6479 goto retry; 6480 } 6481 } 6482 PG_A = pmap_accessed_bit(pmap); 6483 PG_M = pmap_modified_bit(pmap); 6484 PG_RW = pmap_rw_bit(pmap); 6485 pmap_resident_count_adj(pmap, -1); 6486 pde = pmap_pde(pmap, pv->pv_va); 6487 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6488 " a 2mpage in page %p's pv list", m)); 6489 pte = pmap_pde_to_pte(pde, pv->pv_va); 6490 tpte = pte_load_clear(pte); 6491 if (tpte & PG_W) 6492 pmap->pm_stats.wired_count--; 6493 if (tpte & PG_A) 6494 vm_page_aflag_set(m, PGA_REFERENCED); 6495 6496 /* 6497 * Update the vm_page_t clean and reference bits. 
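 * As in pmap_remove_pte() above, the page is dirtied only when both
 * PG_M and PG_RW are set in the removed PTE.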
6498 */ 6499 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6500 vm_page_dirty(m); 6501 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6502 pmap_invalidate_page(pmap, pv->pv_va); 6503 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6504 m->md.pv_gen++; 6505 free_pv_entry(pmap, pv); 6506 PMAP_UNLOCK(pmap); 6507 } 6508 vm_page_aflag_clear(m, PGA_WRITEABLE); 6509 rw_wunlock(lock); 6510 pmap_delayed_invl_wait(m); 6511 vm_page_free_pages_toq(&free, true); 6512 } 6513 6514 /* 6515 * pmap_protect_pde: do the things to protect a 2mpage in a process 6516 */ 6517 static boolean_t 6518 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6519 { 6520 pd_entry_t newpde, oldpde; 6521 vm_page_t m, mt; 6522 boolean_t anychanged; 6523 pt_entry_t PG_G, PG_M, PG_RW; 6524 6525 PG_G = pmap_global_bit(pmap); 6526 PG_M = pmap_modified_bit(pmap); 6527 PG_RW = pmap_rw_bit(pmap); 6528 6529 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6530 KASSERT((sva & PDRMASK) == 0, 6531 ("pmap_protect_pde: sva is not 2mpage aligned")); 6532 anychanged = FALSE; 6533 retry: 6534 oldpde = newpde = *pde; 6535 if ((prot & VM_PROT_WRITE) == 0) { 6536 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6537 (PG_MANAGED | PG_M | PG_RW)) { 6538 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6539 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6540 vm_page_dirty(mt); 6541 } 6542 newpde &= ~(PG_RW | PG_M); 6543 } 6544 if ((prot & VM_PROT_EXECUTE) == 0) 6545 newpde |= pg_nx; 6546 if (newpde != oldpde) { 6547 /* 6548 * As an optimization to future operations on this PDE, clear 6549 * PG_PROMOTED. The impending invalidation will remove any 6550 * lingering 4KB page mappings from the TLB. 6551 */ 6552 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6553 goto retry; 6554 if ((oldpde & PG_G) != 0) 6555 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6556 else 6557 anychanged = TRUE; 6558 } 6559 return (anychanged); 6560 } 6561 6562 /* 6563 * Set the physical protection on the 6564 * specified range of this map as requested. 6565 */ 6566 void 6567 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6568 { 6569 vm_page_t m; 6570 vm_offset_t va_next; 6571 pml4_entry_t *pml4e; 6572 pdp_entry_t *pdpe; 6573 pd_entry_t ptpaddr, *pde; 6574 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6575 pt_entry_t obits, pbits; 6576 boolean_t anychanged; 6577 6578 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6579 if (prot == VM_PROT_NONE) { 6580 pmap_remove(pmap, sva, eva); 6581 return; 6582 } 6583 6584 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6585 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6586 return; 6587 6588 PG_G = pmap_global_bit(pmap); 6589 PG_M = pmap_modified_bit(pmap); 6590 PG_V = pmap_valid_bit(pmap); 6591 PG_RW = pmap_rw_bit(pmap); 6592 anychanged = FALSE; 6593 6594 /* 6595 * Although this function delays and batches the invalidation 6596 * of stale TLB entries, it does not need to call 6597 * pmap_delayed_invl_start() and 6598 * pmap_delayed_invl_finish(), because it does not 6599 * ordinarily destroy mappings. Stale TLB entries from 6600 * protection-only changes need only be invalidated before the 6601 * pmap lock is released, because protection-only changes do 6602 * not destroy PV entries. Even operations that iterate over 6603 * a physical page's PV list of mappings, like 6604 * pmap_remove_write(), acquire the pmap lock for each 6605 * mapping. Consequently, for protection-only changes, the 6606 * pmap lock suffices to synchronize both page table and TLB 6607 * updates. 
6608 * 6609 * This function only destroys a mapping if pmap_demote_pde() 6610 * fails. In that case, stale TLB entries are immediately 6611 * invalidated. 6612 */ 6613 6614 PMAP_LOCK(pmap); 6615 for (; sva < eva; sva = va_next) { 6616 pml4e = pmap_pml4e(pmap, sva); 6617 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6618 va_next = (sva + NBPML4) & ~PML4MASK; 6619 if (va_next < sva) 6620 va_next = eva; 6621 continue; 6622 } 6623 6624 va_next = (sva + NBPDP) & ~PDPMASK; 6625 if (va_next < sva) 6626 va_next = eva; 6627 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6628 if ((*pdpe & PG_V) == 0) 6629 continue; 6630 if ((*pdpe & PG_PS) != 0) { 6631 KASSERT(va_next <= eva, 6632 ("partial update of non-transparent 1G mapping " 6633 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6634 *pdpe, sva, eva, va_next)); 6635 retry_pdpe: 6636 obits = pbits = *pdpe; 6637 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6638 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6639 if ((prot & VM_PROT_WRITE) == 0) 6640 pbits &= ~(PG_RW | PG_M); 6641 if ((prot & VM_PROT_EXECUTE) == 0) 6642 pbits |= pg_nx; 6643 6644 if (pbits != obits) { 6645 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6646 /* PG_PS cannot be cleared under us, */ 6647 goto retry_pdpe; 6648 anychanged = TRUE; 6649 } 6650 continue; 6651 } 6652 6653 va_next = (sva + NBPDR) & ~PDRMASK; 6654 if (va_next < sva) 6655 va_next = eva; 6656 6657 pde = pmap_pdpe_to_pde(pdpe, sva); 6658 ptpaddr = *pde; 6659 6660 /* 6661 * Weed out invalid mappings. 6662 */ 6663 if (ptpaddr == 0) 6664 continue; 6665 6666 /* 6667 * Check for large page. 6668 */ 6669 if ((ptpaddr & PG_PS) != 0) { 6670 /* 6671 * Are we protecting the entire large page? If not, 6672 * demote the mapping and fall through. 6673 */ 6674 if (sva + NBPDR == va_next && eva >= va_next) { 6675 /* 6676 * The TLB entry for a PG_G mapping is 6677 * invalidated by pmap_protect_pde(). 6678 */ 6679 if (pmap_protect_pde(pmap, pde, sva, prot)) 6680 anychanged = TRUE; 6681 continue; 6682 } else if (!pmap_demote_pde(pmap, pde, sva)) { 6683 /* 6684 * The large page mapping was destroyed. 6685 */ 6686 continue; 6687 } 6688 } 6689 6690 if (va_next > eva) 6691 va_next = eva; 6692 6693 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6694 sva += PAGE_SIZE) { 6695 retry: 6696 obits = pbits = *pte; 6697 if ((pbits & PG_V) == 0) 6698 continue; 6699 6700 if ((prot & VM_PROT_WRITE) == 0) { 6701 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6702 (PG_MANAGED | PG_M | PG_RW)) { 6703 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6704 vm_page_dirty(m); 6705 } 6706 pbits &= ~(PG_RW | PG_M); 6707 } 6708 if ((prot & VM_PROT_EXECUTE) == 0) 6709 pbits |= pg_nx; 6710 6711 if (pbits != obits) { 6712 if (!atomic_cmpset_long(pte, obits, pbits)) 6713 goto retry; 6714 if (obits & PG_G) 6715 pmap_invalidate_page(pmap, sva); 6716 else 6717 anychanged = TRUE; 6718 } 6719 } 6720 } 6721 if (anychanged) 6722 pmap_invalidate_all(pmap); 6723 PMAP_UNLOCK(pmap); 6724 } 6725 6726 #if VM_NRESERVLEVEL > 0 6727 static bool 6728 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6729 { 6730 6731 if (pmap->pm_type != PT_EPT) 6732 return (false); 6733 return ((pde & EPT_PG_EXECUTE) != 0); 6734 } 6735 6736 /* 6737 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6738 * single page table page (PTP) to a single 2MB page mapping. For promotion 6739 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6740 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6741 * identical characteristics. 
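 *
 * "Identical characteristics" is verified below by masking every PTE
 * with PG_PTE_PROMOTE and comparing it against the first PTE; physical
 * contiguity is verified by walking the expected PG_FRAME values
 * backwards from the last 4KB page of the 2MB range.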
6742 */ 6743 static void 6744 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 6745 struct rwlock **lockp) 6746 { 6747 pd_entry_t newpde; 6748 pt_entry_t *firstpte, oldpte, pa, *pte; 6749 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; 6750 vm_page_t mpte; 6751 int PG_PTE_CACHE; 6752 6753 PG_A = pmap_accessed_bit(pmap); 6754 PG_G = pmap_global_bit(pmap); 6755 PG_M = pmap_modified_bit(pmap); 6756 PG_V = pmap_valid_bit(pmap); 6757 PG_RW = pmap_rw_bit(pmap); 6758 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6759 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6760 6761 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6762 6763 /* 6764 * Examine the first PTE in the specified PTP. Abort if this PTE is 6765 * either invalid, unused, or does not map the first 4KB physical page 6766 * within a 2MB page. 6767 */ 6768 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6769 newpde = *firstpte; 6770 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) || 6771 !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 6772 newpde))) { 6773 counter_u64_add(pmap_pde_p_failures, 1); 6774 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6775 " in pmap %p", va, pmap); 6776 return; 6777 } 6778 setpde: 6779 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6780 /* 6781 * When PG_M is already clear, PG_RW can be cleared without 6782 * a TLB invalidation. 6783 */ 6784 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6785 goto setpde; 6786 newpde &= ~PG_RW; 6787 } 6788 6789 /* 6790 * Examine each of the other PTEs in the specified PTP. Abort if this 6791 * PTE maps an unexpected 4KB physical page or does not have identical 6792 * characteristics to the first PTE. 6793 */ 6794 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 6795 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6796 oldpte = *pte; 6797 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 6798 counter_u64_add(pmap_pde_p_failures, 1); 6799 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6800 " in pmap %p", va, pmap); 6801 return; 6802 } 6803 setpte: 6804 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6805 /* 6806 * When PG_M is already clear, PG_RW can be cleared 6807 * without a TLB invalidation. 6808 */ 6809 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6810 goto setpte; 6811 oldpte &= ~PG_RW; 6812 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6813 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6814 (va & ~PDRMASK), pmap); 6815 } 6816 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 6817 counter_u64_add(pmap_pde_p_failures, 1); 6818 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6819 " in pmap %p", va, pmap); 6820 return; 6821 } 6822 pa -= PAGE_SIZE; 6823 } 6824 6825 /* 6826 * Save the page table page in its current state until the PDE 6827 * mapping the superpage is demoted by pmap_demote_pde() or 6828 * destroyed by pmap_remove_pde(). 
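 * The page is stashed via pmap_insert_pt_page() below; a later
 * demotion retrieves it with pmap_remove_pt_page() and, because its
 * contents are still valid, does not need to re-fill the page table
 * page.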
6829 */ 6830 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6831 KASSERT(mpte >= vm_page_array && 6832 mpte < &vm_page_array[vm_page_array_size], 6833 ("pmap_promote_pde: page table page is out of range")); 6834 KASSERT(mpte->pindex == pmap_pde_pindex(va), 6835 ("pmap_promote_pde: page table page's pindex is wrong " 6836 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 6837 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 6838 if (pmap_insert_pt_page(pmap, mpte, true)) { 6839 counter_u64_add(pmap_pde_p_failures, 1); 6840 CTR2(KTR_PMAP, 6841 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 6842 pmap); 6843 return; 6844 } 6845 6846 /* 6847 * Promote the pv entries. 6848 */ 6849 if ((newpde & PG_MANAGED) != 0) 6850 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 6851 6852 /* 6853 * Propagate the PAT index to its proper position. 6854 */ 6855 newpde = pmap_swap_pat(pmap, newpde); 6856 6857 /* 6858 * Map the superpage. 6859 */ 6860 if (workaround_erratum383) 6861 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 6862 else 6863 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 6864 6865 counter_u64_add(pmap_pde_promotions, 1); 6866 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 6867 " in pmap %p", va, pmap); 6868 } 6869 #endif /* VM_NRESERVLEVEL > 0 */ 6870 6871 static int 6872 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 6873 int psind) 6874 { 6875 vm_page_t mp; 6876 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 6877 6878 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6879 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 6880 ("psind %d unexpected", psind)); 6881 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 6882 ("unaligned phys address %#lx newpte %#lx psind %d", 6883 newpte & PG_FRAME, newpte, psind)); 6884 KASSERT((va & (pagesizes[psind] - 1)) == 0, 6885 ("unaligned va %#lx psind %d", va, psind)); 6886 KASSERT(va < VM_MAXUSER_ADDRESS, 6887 ("kernel mode non-transparent superpage")); /* XXXKIB */ 6888 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 6889 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 6890 6891 PG_V = pmap_valid_bit(pmap); 6892 6893 restart: 6894 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) 6895 return (KERN_PROTECTION_FAILURE); 6896 pten = newpte; 6897 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 6898 pten |= pmap_pkru_get(pmap, va); 6899 6900 if (psind == 2) { /* 1G */ 6901 pml4e = pmap_pml4e(pmap, va); 6902 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6903 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 6904 NULL, va); 6905 if (mp == NULL) 6906 goto allocf; 6907 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 6908 pdpe = &pdpe[pmap_pdpe_index(va)]; 6909 origpte = *pdpe; 6910 MPASS(origpte == 0); 6911 } else { 6912 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 6913 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 6914 origpte = *pdpe; 6915 if ((origpte & PG_V) == 0) { 6916 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 6917 mp->ref_count++; 6918 } 6919 } 6920 *pdpe = pten; 6921 } else /* (psind == 1) */ { /* 2M */ 6922 pde = pmap_pde(pmap, va); 6923 if (pde == NULL) { 6924 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 6925 NULL, va); 6926 if (mp == NULL) 6927 goto allocf; 6928 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 6929 pde = &pde[pmap_pde_index(va)]; 6930 origpte = *pde; 6931 MPASS(origpte == 0); 6932 } else { 6933 origpte = *pde; 6934 if ((origpte & PG_V) == 0) { 6935 pdpe = pmap_pdpe(pmap, va); 6936 MPASS(pdpe != NULL && (*pdpe & 
PG_V) != 0); 6937 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 6938 mp->ref_count++; 6939 } 6940 } 6941 *pde = pten; 6942 } 6943 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 6944 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 6945 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 6946 va, psind == 2 ? "1G" : "2M", origpte, pten)); 6947 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 6948 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 6949 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 6950 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 6951 if ((origpte & PG_V) == 0) 6952 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 6953 6954 return (KERN_SUCCESS); 6955 6956 allocf: 6957 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 6958 return (KERN_RESOURCE_SHORTAGE); 6959 PMAP_UNLOCK(pmap); 6960 vm_wait(NULL); 6961 PMAP_LOCK(pmap); 6962 goto restart; 6963 } 6964 6965 /* 6966 * Insert the given physical page (p) at 6967 * the specified virtual address (v) in the 6968 * target physical map with the protection requested. 6969 * 6970 * If specified, the page will be wired down, meaning 6971 * that the related pte can not be reclaimed. 6972 * 6973 * NB: This is the only routine which MAY NOT lazy-evaluate 6974 * or lose information. That is, this routine must actually 6975 * insert this page into the given map NOW. 6976 * 6977 * When destroying both a page table and PV entry, this function 6978 * performs the TLB invalidation before releasing the PV list 6979 * lock, so we do not need pmap_delayed_invl_page() calls here. 6980 */ 6981 int 6982 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 6983 u_int flags, int8_t psind) 6984 { 6985 struct rwlock *lock; 6986 pd_entry_t *pde; 6987 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 6988 pt_entry_t newpte, origpte; 6989 pv_entry_t pv; 6990 vm_paddr_t opa, pa; 6991 vm_page_t mpte, om; 6992 int rv; 6993 boolean_t nosleep; 6994 6995 PG_A = pmap_accessed_bit(pmap); 6996 PG_G = pmap_global_bit(pmap); 6997 PG_M = pmap_modified_bit(pmap); 6998 PG_V = pmap_valid_bit(pmap); 6999 PG_RW = pmap_rw_bit(pmap); 7000 7001 va = trunc_page(va); 7002 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7003 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7004 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7005 va)); 7006 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7007 ("pmap_enter: managed mapping within the clean submap")); 7008 if ((m->oflags & VPO_UNMANAGED) == 0) 7009 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7010 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7011 ("pmap_enter: flags %u has reserved bits set", flags)); 7012 pa = VM_PAGE_TO_PHYS(m); 7013 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7014 if ((flags & VM_PROT_WRITE) != 0) 7015 newpte |= PG_M; 7016 if ((prot & VM_PROT_WRITE) != 0) 7017 newpte |= PG_RW; 7018 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7019 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7020 if ((prot & VM_PROT_EXECUTE) == 0) 7021 newpte |= pg_nx; 7022 if ((flags & PMAP_ENTER_WIRED) != 0) 7023 newpte |= PG_W; 7024 if (va < VM_MAXUSER_ADDRESS) 7025 newpte |= PG_U; 7026 if (pmap == kernel_pmap) 7027 newpte |= PG_G; 7028 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7029 7030 /* 7031 * Set modified bit gratuitously for writeable mappings if 7032 * the page is unmanaged. We do not want to take a fault 7033 * to do the dirty bit accounting for these mappings. 
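 * For managed pages, PG_M is pre-set only when the caller passed
 * VM_PROT_WRITE in "flags" (see above); otherwise it is set on the
 * first write, which is what allows dirty accounting through
 * vm_page_dirty().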
7034 */ 7035 if ((m->oflags & VPO_UNMANAGED) != 0) { 7036 if ((newpte & PG_RW) != 0) 7037 newpte |= PG_M; 7038 } else 7039 newpte |= PG_MANAGED; 7040 7041 lock = NULL; 7042 PMAP_LOCK(pmap); 7043 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7044 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7045 ("managed largepage va %#lx flags %#x", va, flags)); 7046 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7047 psind); 7048 goto out; 7049 } 7050 if (psind == 1) { 7051 /* Assert the required virtual and physical alignment. */ 7052 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7053 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7054 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7055 goto out; 7056 } 7057 mpte = NULL; 7058 7059 /* 7060 * In the case that a page table page is not 7061 * resident, we are creating it here. 7062 */ 7063 retry: 7064 pde = pmap_pde(pmap, va); 7065 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7066 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7067 pte = pmap_pde_to_pte(pde, va); 7068 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7069 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7070 mpte->ref_count++; 7071 } 7072 } else if (va < VM_MAXUSER_ADDRESS) { 7073 /* 7074 * Here if the pte page isn't mapped, or if it has been 7075 * deallocated. 7076 */ 7077 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7078 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7079 nosleep ? NULL : &lock, va); 7080 if (mpte == NULL && nosleep) { 7081 rv = KERN_RESOURCE_SHORTAGE; 7082 goto out; 7083 } 7084 goto retry; 7085 } else 7086 panic("pmap_enter: invalid page directory va=%#lx", va); 7087 7088 origpte = *pte; 7089 pv = NULL; 7090 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7091 newpte |= pmap_pkru_get(pmap, va); 7092 7093 /* 7094 * Is the specified virtual address already mapped? 7095 */ 7096 if ((origpte & PG_V) != 0) { 7097 /* 7098 * Wiring change, just update stats. We don't worry about 7099 * wiring PT pages as they remain resident as long as there 7100 * are valid mappings in them. Hence, if a user page is wired, 7101 * the PT page will be also. 7102 */ 7103 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7104 pmap->pm_stats.wired_count++; 7105 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7106 pmap->pm_stats.wired_count--; 7107 7108 /* 7109 * Remove the extra PT page reference. 7110 */ 7111 if (mpte != NULL) { 7112 mpte->ref_count--; 7113 KASSERT(mpte->ref_count > 0, 7114 ("pmap_enter: missing reference to page table page," 7115 " va: 0x%lx", va)); 7116 } 7117 7118 /* 7119 * Has the physical page changed? 7120 */ 7121 opa = origpte & PG_FRAME; 7122 if (opa == pa) { 7123 /* 7124 * No, might be a protection or wiring change. 7125 */ 7126 if ((origpte & PG_MANAGED) != 0 && 7127 (newpte & PG_RW) != 0) 7128 vm_page_aflag_set(m, PGA_WRITEABLE); 7129 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7130 goto unchanged; 7131 goto validate; 7132 } 7133 7134 /* 7135 * The physical page has changed. Temporarily invalidate 7136 * the mapping. This ensures that all threads sharing the 7137 * pmap keep a consistent view of the mapping, which is 7138 * necessary for the correct handling of COW faults. It 7139 * also permits reuse of the old mapping's PV entry, 7140 * avoiding an allocation. 7141 * 7142 * For consistency, handle unmanaged mappings the same way. 
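 *
 * The reuse happens below: the entry that pmap_pvh_remove() detaches
 * from the old page is kept in "pv" and, if the new mapping is managed,
 * re-queued on the new page's pv list instead of calling
 * get_pv_entry().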
7143 */ 7144 origpte = pte_load_clear(pte); 7145 KASSERT((origpte & PG_FRAME) == opa, 7146 ("pmap_enter: unexpected pa update for %#lx", va)); 7147 if ((origpte & PG_MANAGED) != 0) { 7148 om = PHYS_TO_VM_PAGE(opa); 7149 7150 /* 7151 * The pmap lock is sufficient to synchronize with 7152 * concurrent calls to pmap_page_test_mappings() and 7153 * pmap_ts_referenced(). 7154 */ 7155 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7156 vm_page_dirty(om); 7157 if ((origpte & PG_A) != 0) { 7158 pmap_invalidate_page(pmap, va); 7159 vm_page_aflag_set(om, PGA_REFERENCED); 7160 } 7161 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7162 pv = pmap_pvh_remove(&om->md, pmap, va); 7163 KASSERT(pv != NULL, 7164 ("pmap_enter: no PV entry for %#lx", va)); 7165 if ((newpte & PG_MANAGED) == 0) 7166 free_pv_entry(pmap, pv); 7167 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7168 TAILQ_EMPTY(&om->md.pv_list) && 7169 ((om->flags & PG_FICTITIOUS) != 0 || 7170 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7171 vm_page_aflag_clear(om, PGA_WRITEABLE); 7172 } else { 7173 /* 7174 * Since this mapping is unmanaged, assume that PG_A 7175 * is set. 7176 */ 7177 pmap_invalidate_page(pmap, va); 7178 } 7179 origpte = 0; 7180 } else { 7181 /* 7182 * Increment the counters. 7183 */ 7184 if ((newpte & PG_W) != 0) 7185 pmap->pm_stats.wired_count++; 7186 pmap_resident_count_adj(pmap, 1); 7187 } 7188 7189 /* 7190 * Enter on the PV list if part of our managed memory. 7191 */ 7192 if ((newpte & PG_MANAGED) != 0) { 7193 if (pv == NULL) { 7194 pv = get_pv_entry(pmap, &lock); 7195 pv->pv_va = va; 7196 } 7197 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7198 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7199 m->md.pv_gen++; 7200 if ((newpte & PG_RW) != 0) 7201 vm_page_aflag_set(m, PGA_WRITEABLE); 7202 } 7203 7204 /* 7205 * Update the PTE. 7206 */ 7207 if ((origpte & PG_V) != 0) { 7208 validate: 7209 origpte = pte_load_store(pte, newpte); 7210 KASSERT((origpte & PG_FRAME) == pa, 7211 ("pmap_enter: unexpected pa update for %#lx", va)); 7212 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7213 (PG_M | PG_RW)) { 7214 if ((origpte & PG_MANAGED) != 0) 7215 vm_page_dirty(m); 7216 7217 /* 7218 * Although the PTE may still have PG_RW set, TLB 7219 * invalidation may nonetheless be required because 7220 * the PTE no longer has PG_M set. 7221 */ 7222 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7223 /* 7224 * This PTE change does not require TLB invalidation. 7225 */ 7226 goto unchanged; 7227 } 7228 if ((origpte & PG_A) != 0) 7229 pmap_invalidate_page(pmap, va); 7230 } else 7231 pte_store(pte, newpte); 7232 7233 unchanged: 7234 7235 #if VM_NRESERVLEVEL > 0 7236 /* 7237 * If both the page table page and the reservation are fully 7238 * populated, then attempt promotion. 7239 */ 7240 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7241 pmap_ps_enabled(pmap) && 7242 (m->flags & PG_FICTITIOUS) == 0 && 7243 vm_reserv_level_iffullpop(m) == 0) 7244 pmap_promote_pde(pmap, pde, va, &lock); 7245 #endif 7246 7247 rv = KERN_SUCCESS; 7248 out: 7249 if (lock != NULL) 7250 rw_wunlock(lock); 7251 PMAP_UNLOCK(pmap); 7252 return (rv); 7253 } 7254 7255 /* 7256 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 7257 * if successful. Returns false if (1) a page table page cannot be allocated 7258 * without sleeping, (2) a mapping already exists at the specified virtual 7259 * address, or (3) a PV entry cannot be allocated without reclaiming another 7260 * PV entry. 
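 *
 * This is a thin wrapper around pmap_enter_pde(): it builds the 2MB PDE
 * and passes PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE |
 * PMAP_ENTER_NORECLAIM, which is what produces the three failure modes
 * listed above.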
7261 */ 7262 static bool 7263 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7264 struct rwlock **lockp) 7265 { 7266 pd_entry_t newpde; 7267 pt_entry_t PG_V; 7268 7269 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7270 PG_V = pmap_valid_bit(pmap); 7271 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 7272 PG_PS | PG_V; 7273 if ((m->oflags & VPO_UNMANAGED) == 0) 7274 newpde |= PG_MANAGED; 7275 if ((prot & VM_PROT_EXECUTE) == 0) 7276 newpde |= pg_nx; 7277 if (va < VM_MAXUSER_ADDRESS) 7278 newpde |= PG_U; 7279 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7280 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 7281 KERN_SUCCESS); 7282 } 7283 7284 /* 7285 * Returns true if every page table entry in the specified page table page is 7286 * zero. 7287 */ 7288 static bool 7289 pmap_every_pte_zero(vm_paddr_t pa) 7290 { 7291 pt_entry_t *pt_end, *pte; 7292 7293 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7294 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7295 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7296 if (*pte != 0) 7297 return (false); 7298 } 7299 return (true); 7300 } 7301 7302 /* 7303 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7304 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 7305 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 7306 * a mapping already exists at the specified virtual address. Returns 7307 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 7308 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 7309 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 7310 * 7311 * The parameter "m" is only used when creating a managed, writeable mapping. 7312 */ 7313 static int 7314 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7315 vm_page_t m, struct rwlock **lockp) 7316 { 7317 struct spglist free; 7318 pd_entry_t oldpde, *pde; 7319 pt_entry_t PG_G, PG_RW, PG_V; 7320 vm_page_t mt, pdpg; 7321 7322 KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, 7323 ("pmap_enter_pde: cannot create wired user mapping")); 7324 PG_G = pmap_global_bit(pmap); 7325 PG_RW = pmap_rw_bit(pmap); 7326 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7327 ("pmap_enter_pde: newpde is missing PG_M")); 7328 PG_V = pmap_valid_bit(pmap); 7329 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7330 7331 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7332 newpde))) { 7333 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7334 " in pmap %p", va, pmap); 7335 return (KERN_FAILURE); 7336 } 7337 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7338 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7339 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7340 " in pmap %p", va, pmap); 7341 return (KERN_RESOURCE_SHORTAGE); 7342 } 7343 7344 /* 7345 * If pkru is not same for the whole pde range, return failure 7346 * and let vm_fault() cope. Check after pde allocation, since 7347 * it could sleep. 7348 */ 7349 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 7350 pmap_abort_ptp(pmap, va, pdpg); 7351 return (KERN_PROTECTION_FAILURE); 7352 } 7353 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 7354 newpde &= ~X86_PG_PKU_MASK; 7355 newpde |= pmap_pkru_get(pmap, va); 7356 } 7357 7358 /* 7359 * If there are existing mappings, either abort or remove them. 
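 * The abort case applies only when PMAP_ENTER_NOREPLACE is specified
 * and the old mapping cannot be shown to be harmless: a user mapping,
 * an existing superpage, or a kernel page table page that is not
 * entirely zero.  In every other case the old 2MB mapping or the old
 * 4KB mappings are removed below before the new PDE is written.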
7360 */ 7361 oldpde = *pde; 7362 if ((oldpde & PG_V) != 0) { 7363 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7364 ("pmap_enter_pde: pdpg's reference count is too low")); 7365 if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < 7366 VM_MAXUSER_ADDRESS || (oldpde & PG_PS) != 0 || 7367 !pmap_every_pte_zero(oldpde & PG_FRAME))) { 7368 if (pdpg != NULL) 7369 pdpg->ref_count--; 7370 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7371 " in pmap %p", va, pmap); 7372 return (KERN_FAILURE); 7373 } 7374 /* Break the existing mapping(s). */ 7375 SLIST_INIT(&free); 7376 if ((oldpde & PG_PS) != 0) { 7377 /* 7378 * The reference to the PD page that was acquired by 7379 * pmap_alloc_pde() ensures that it won't be freed. 7380 * However, if the PDE resulted from a promotion, then 7381 * a reserved PT page could be freed. 7382 */ 7383 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7384 if ((oldpde & PG_G) == 0) 7385 pmap_invalidate_pde_page(pmap, va, oldpde); 7386 } else { 7387 pmap_delayed_invl_start(); 7388 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7389 lockp)) 7390 pmap_invalidate_all(pmap); 7391 pmap_delayed_invl_finish(); 7392 } 7393 if (va < VM_MAXUSER_ADDRESS) { 7394 vm_page_free_pages_toq(&free, true); 7395 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7396 pde)); 7397 } else { 7398 KASSERT(SLIST_EMPTY(&free), 7399 ("pmap_enter_pde: freed kernel page table page")); 7400 7401 /* 7402 * Both pmap_remove_pde() and pmap_remove_ptes() will 7403 * leave the kernel page table page zero filled. 7404 */ 7405 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7406 if (pmap_insert_pt_page(pmap, mt, false)) 7407 panic("pmap_enter_pde: trie insert failed"); 7408 } 7409 } 7410 7411 if ((newpde & PG_MANAGED) != 0) { 7412 /* 7413 * Abort this mapping if its PV entry could not be created. 7414 */ 7415 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7416 if (pdpg != NULL) 7417 pmap_abort_ptp(pmap, va, pdpg); 7418 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7419 " in pmap %p", va, pmap); 7420 return (KERN_RESOURCE_SHORTAGE); 7421 } 7422 if ((newpde & PG_RW) != 0) { 7423 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7424 vm_page_aflag_set(mt, PGA_WRITEABLE); 7425 } 7426 } 7427 7428 /* 7429 * Increment counters. 7430 */ 7431 if ((newpde & PG_W) != 0) 7432 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7433 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7434 7435 /* 7436 * Map the superpage. (This is not a promoted mapping; there will not 7437 * be any lingering 4KB page mappings in the TLB.) 7438 */ 7439 pde_store(pde, newpde); 7440 7441 counter_u64_add(pmap_pde_mappings, 1); 7442 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7443 va, pmap); 7444 return (KERN_SUCCESS); 7445 } 7446 7447 /* 7448 * Maps a sequence of resident pages belonging to the same object. 7449 * The sequence begins with the given page m_start. This page is 7450 * mapped at the given virtual address start. Each subsequent page is 7451 * mapped at a virtual address that is offset from start by the same 7452 * amount as the page is offset from m_start within the object. The 7453 * last page in the sequence is the page with the largest offset from 7454 * m_start that can be mapped at a virtual address less than the given 7455 * virtual address end. Not every virtual page between start and end 7456 * is mapped; only those for which a resident page exists with the 7457 * corresponding offset from m_start are mapped. 
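 * Whenever the virtual address is 2MB aligned, at least 2MB of the range
 * remains, the page has psind == 1, and superpage mappings are enabled, a
 * single 2MB mapping is attempted via pmap_enter_2mpage(); otherwise the
 * page is entered individually with pmap_enter_quick_locked().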
7458 */ 7459 void 7460 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7461 vm_page_t m_start, vm_prot_t prot) 7462 { 7463 struct rwlock *lock; 7464 vm_offset_t va; 7465 vm_page_t m, mpte; 7466 vm_pindex_t diff, psize; 7467 7468 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7469 7470 psize = atop(end - start); 7471 mpte = NULL; 7472 m = m_start; 7473 lock = NULL; 7474 PMAP_LOCK(pmap); 7475 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7476 va = start + ptoa(diff); 7477 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7478 m->psind == 1 && pmap_ps_enabled(pmap) && 7479 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 7480 m = &m[NBPDR / PAGE_SIZE - 1]; 7481 else 7482 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7483 mpte, &lock); 7484 m = TAILQ_NEXT(m, listq); 7485 } 7486 if (lock != NULL) 7487 rw_wunlock(lock); 7488 PMAP_UNLOCK(pmap); 7489 } 7490 7491 /* 7492 * this code makes some *MAJOR* assumptions: 7493 * 1. Current pmap & pmap exists. 7494 * 2. Not wired. 7495 * 3. Read access. 7496 * 4. No page table pages. 7497 * but is *MUCH* faster than pmap_enter... 7498 */ 7499 7500 void 7501 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7502 { 7503 struct rwlock *lock; 7504 7505 lock = NULL; 7506 PMAP_LOCK(pmap); 7507 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7508 if (lock != NULL) 7509 rw_wunlock(lock); 7510 PMAP_UNLOCK(pmap); 7511 } 7512 7513 static vm_page_t 7514 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7515 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7516 { 7517 pt_entry_t newpte, *pte, PG_V; 7518 7519 KASSERT(!VA_IS_CLEANMAP(va) || 7520 (m->oflags & VPO_UNMANAGED) != 0, 7521 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7522 PG_V = pmap_valid_bit(pmap); 7523 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7524 7525 /* 7526 * In the case that a page table page is not 7527 * resident, we are creating it here. 7528 */ 7529 if (va < VM_MAXUSER_ADDRESS) { 7530 vm_pindex_t ptepindex; 7531 pd_entry_t *ptepa; 7532 7533 /* 7534 * Calculate pagetable page index 7535 */ 7536 ptepindex = pmap_pde_pindex(va); 7537 if (mpte && (mpte->pindex == ptepindex)) { 7538 mpte->ref_count++; 7539 } else { 7540 /* 7541 * Get the page directory entry 7542 */ 7543 ptepa = pmap_pde(pmap, va); 7544 7545 /* 7546 * If the page table page is mapped, we just increment 7547 * the hold count, and activate it. Otherwise, we 7548 * attempt to allocate a page table page. If this 7549 * attempt fails, we don't retry. Instead, we give up. 7550 */ 7551 if (ptepa && (*ptepa & PG_V) != 0) { 7552 if (*ptepa & PG_PS) 7553 return (NULL); 7554 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 7555 mpte->ref_count++; 7556 } else { 7557 /* 7558 * Pass NULL instead of the PV list lock 7559 * pointer, because we don't intend to sleep. 7560 */ 7561 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7562 NULL, va); 7563 if (mpte == NULL) 7564 return (mpte); 7565 } 7566 } 7567 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7568 pte = &pte[pmap_pte_index(va)]; 7569 } else { 7570 mpte = NULL; 7571 pte = vtopte(va); 7572 } 7573 if (*pte) { 7574 if (mpte != NULL) 7575 mpte->ref_count--; 7576 return (NULL); 7577 } 7578 7579 /* 7580 * Enter on the PV list if part of our managed memory. 
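 * If the PV entry cannot be allocated, the mapping is abandoned and the
 * reference on the page table page acquired above, if any, is released
 * via pmap_abort_ptp().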
7581 */ 7582 if ((m->oflags & VPO_UNMANAGED) == 0 && 7583 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7584 if (mpte != NULL) 7585 pmap_abort_ptp(pmap, va, mpte); 7586 return (NULL); 7587 } 7588 7589 /* 7590 * Increment counters 7591 */ 7592 pmap_resident_count_adj(pmap, 1); 7593 7594 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7595 pmap_cache_bits(pmap, m->md.pat_mode, 0); 7596 if ((m->oflags & VPO_UNMANAGED) == 0) 7597 newpte |= PG_MANAGED; 7598 if ((prot & VM_PROT_EXECUTE) == 0) 7599 newpte |= pg_nx; 7600 if (va < VM_MAXUSER_ADDRESS) 7601 newpte |= PG_U | pmap_pkru_get(pmap, va); 7602 pte_store(pte, newpte); 7603 return (mpte); 7604 } 7605 7606 /* 7607 * Make a temporary mapping for a physical address. This is only intended 7608 * to be used for panic dumps. 7609 */ 7610 void * 7611 pmap_kenter_temporary(vm_paddr_t pa, int i) 7612 { 7613 vm_offset_t va; 7614 7615 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7616 pmap_kenter(va, pa); 7617 invlpg(va); 7618 return ((void *)crashdumpmap); 7619 } 7620 7621 /* 7622 * This code maps large physical mmap regions into the 7623 * processor address space. Note that some shortcuts 7624 * are taken, but the code works. 7625 */ 7626 void 7627 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7628 vm_pindex_t pindex, vm_size_t size) 7629 { 7630 pd_entry_t *pde; 7631 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7632 vm_paddr_t pa, ptepa; 7633 vm_page_t p, pdpg; 7634 int pat_mode; 7635 7636 PG_A = pmap_accessed_bit(pmap); 7637 PG_M = pmap_modified_bit(pmap); 7638 PG_V = pmap_valid_bit(pmap); 7639 PG_RW = pmap_rw_bit(pmap); 7640 7641 VM_OBJECT_ASSERT_WLOCKED(object); 7642 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7643 ("pmap_object_init_pt: non-device object")); 7644 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7645 if (!pmap_ps_enabled(pmap)) 7646 return; 7647 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7648 return; 7649 p = vm_page_lookup(object, pindex); 7650 KASSERT(p->valid == VM_PAGE_BITS_ALL, 7651 ("pmap_object_init_pt: invalid page %p", p)); 7652 pat_mode = p->md.pat_mode; 7653 7654 /* 7655 * Abort the mapping if the first page is not physically 7656 * aligned to a 2MB page boundary. 7657 */ 7658 ptepa = VM_PAGE_TO_PHYS(p); 7659 if (ptepa & (NBPDR - 1)) 7660 return; 7661 7662 /* 7663 * Skip the first page. Abort the mapping if the rest of 7664 * the pages are not physically contiguous or have differing 7665 * memory attributes. 7666 */ 7667 p = TAILQ_NEXT(p, listq); 7668 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7669 pa += PAGE_SIZE) { 7670 KASSERT(p->valid == VM_PAGE_BITS_ALL, 7671 ("pmap_object_init_pt: invalid page %p", p)); 7672 if (pa != VM_PAGE_TO_PHYS(p) || 7673 pat_mode != p->md.pat_mode) 7674 return; 7675 p = TAILQ_NEXT(p, listq); 7676 } 7677 7678 /* 7679 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7680 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7681 * will not affect the termination of this loop. 7682 */ 7683 PMAP_LOCK(pmap); 7684 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 7685 pa < ptepa + size; pa += NBPDR) { 7686 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7687 if (pde == NULL) { 7688 /* 7689 * The creation of mappings below is only an 7690 * optimization. If a page directory page 7691 * cannot be allocated without blocking, 7692 * continue on to the next mapping rather than 7693 * blocking. 
7694 */ 7695 addr += NBPDR; 7696 continue; 7697 } 7698 if ((*pde & PG_V) == 0) { 7699 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7700 PG_U | PG_RW | PG_V); 7701 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7702 counter_u64_add(pmap_pde_mappings, 1); 7703 } else { 7704 /* Continue on if the PDE is already valid. */ 7705 pdpg->ref_count--; 7706 KASSERT(pdpg->ref_count > 0, 7707 ("pmap_object_init_pt: missing reference " 7708 "to page directory page, va: 0x%lx", addr)); 7709 } 7710 addr += NBPDR; 7711 } 7712 PMAP_UNLOCK(pmap); 7713 } 7714 } 7715 7716 /* 7717 * Clear the wired attribute from the mappings for the specified range of 7718 * addresses in the given pmap. Every valid mapping within that range 7719 * must have the wired attribute set. In contrast, invalid mappings 7720 * cannot have the wired attribute set, so they are ignored. 7721 * 7722 * The wired attribute of the page table entry is not a hardware 7723 * feature, so there is no need to invalidate any TLB entries. 7724 * Since pmap_demote_pde() for the wired entry must never fail, 7725 * pmap_delayed_invl_start()/finish() calls around the 7726 * function are not needed. 7727 */ 7728 void 7729 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7730 { 7731 vm_offset_t va_next; 7732 pml4_entry_t *pml4e; 7733 pdp_entry_t *pdpe; 7734 pd_entry_t *pde; 7735 pt_entry_t *pte, PG_V, PG_G; 7736 7737 PG_V = pmap_valid_bit(pmap); 7738 PG_G = pmap_global_bit(pmap); 7739 PMAP_LOCK(pmap); 7740 for (; sva < eva; sva = va_next) { 7741 pml4e = pmap_pml4e(pmap, sva); 7742 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7743 va_next = (sva + NBPML4) & ~PML4MASK; 7744 if (va_next < sva) 7745 va_next = eva; 7746 continue; 7747 } 7748 7749 va_next = (sva + NBPDP) & ~PDPMASK; 7750 if (va_next < sva) 7751 va_next = eva; 7752 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7753 if ((*pdpe & PG_V) == 0) 7754 continue; 7755 if ((*pdpe & PG_PS) != 0) { 7756 KASSERT(va_next <= eva, 7757 ("partial update of non-transparent 1G mapping " 7758 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7759 *pdpe, sva, eva, va_next)); 7760 MPASS(pmap != kernel_pmap); /* XXXKIB */ 7761 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 7762 atomic_clear_long(pdpe, PG_W); 7763 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 7764 continue; 7765 } 7766 7767 va_next = (sva + NBPDR) & ~PDRMASK; 7768 if (va_next < sva) 7769 va_next = eva; 7770 pde = pmap_pdpe_to_pde(pdpe, sva); 7771 if ((*pde & PG_V) == 0) 7772 continue; 7773 if ((*pde & PG_PS) != 0) { 7774 if ((*pde & PG_W) == 0) 7775 panic("pmap_unwire: pde %#jx is missing PG_W", 7776 (uintmax_t)*pde); 7777 7778 /* 7779 * Are we unwiring the entire large page? If not, 7780 * demote the mapping and fall through. 7781 */ 7782 if (sva + NBPDR == va_next && eva >= va_next) { 7783 atomic_clear_long(pde, PG_W); 7784 pmap->pm_stats.wired_count -= NBPDR / 7785 PAGE_SIZE; 7786 continue; 7787 } else if (!pmap_demote_pde(pmap, pde, sva)) 7788 panic("pmap_unwire: demotion failed"); 7789 } 7790 if (va_next > eva) 7791 va_next = eva; 7792 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 7793 sva += PAGE_SIZE) { 7794 if ((*pte & PG_V) == 0) 7795 continue; 7796 if ((*pte & PG_W) == 0) 7797 panic("pmap_unwire: pte %#jx is missing PG_W", 7798 (uintmax_t)*pte); 7799 7800 /* 7801 * PG_W must be cleared atomically. Although the pmap 7802 * lock synchronizes access to PG_W, another processor 7803 * could be setting PG_M and/or PG_A concurrently. 
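 * Hence atomic_clear_long() is used below rather than a non-atomic
 * read-modify-write of the PTE.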
7804 */ 7805 atomic_clear_long(pte, PG_W); 7806 pmap->pm_stats.wired_count--; 7807 } 7808 } 7809 PMAP_UNLOCK(pmap); 7810 } 7811 7812 /* 7813 * Copy the range specified by src_addr/len 7814 * from the source map to the range dst_addr/len 7815 * in the destination map. 7816 * 7817 * This routine is only advisory and need not do anything. 7818 */ 7819 void 7820 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 7821 vm_offset_t src_addr) 7822 { 7823 struct rwlock *lock; 7824 pml4_entry_t *pml4e; 7825 pdp_entry_t *pdpe; 7826 pd_entry_t *pde, srcptepaddr; 7827 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 7828 vm_offset_t addr, end_addr, va_next; 7829 vm_page_t dst_pdpg, dstmpte, srcmpte; 7830 7831 if (dst_addr != src_addr) 7832 return; 7833 7834 if (dst_pmap->pm_type != src_pmap->pm_type) 7835 return; 7836 7837 /* 7838 * EPT page table entries that require emulation of A/D bits are 7839 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 7840 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 7841 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 7842 * implementations flag an EPT misconfiguration for exec-only 7843 * mappings we skip this function entirely for emulated pmaps. 7844 */ 7845 if (pmap_emulate_ad_bits(dst_pmap)) 7846 return; 7847 7848 end_addr = src_addr + len; 7849 lock = NULL; 7850 if (dst_pmap < src_pmap) { 7851 PMAP_LOCK(dst_pmap); 7852 PMAP_LOCK(src_pmap); 7853 } else { 7854 PMAP_LOCK(src_pmap); 7855 PMAP_LOCK(dst_pmap); 7856 } 7857 7858 PG_A = pmap_accessed_bit(dst_pmap); 7859 PG_M = pmap_modified_bit(dst_pmap); 7860 PG_V = pmap_valid_bit(dst_pmap); 7861 7862 for (addr = src_addr; addr < end_addr; addr = va_next) { 7863 KASSERT(addr < UPT_MIN_ADDRESS, 7864 ("pmap_copy: invalid to pmap_copy page tables")); 7865 7866 pml4e = pmap_pml4e(src_pmap, addr); 7867 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7868 va_next = (addr + NBPML4) & ~PML4MASK; 7869 if (va_next < addr) 7870 va_next = end_addr; 7871 continue; 7872 } 7873 7874 va_next = (addr + NBPDP) & ~PDPMASK; 7875 if (va_next < addr) 7876 va_next = end_addr; 7877 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 7878 if ((*pdpe & PG_V) == 0) 7879 continue; 7880 if ((*pdpe & PG_PS) != 0) { 7881 KASSERT(va_next <= end_addr, 7882 ("partial update of non-transparent 1G mapping " 7883 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7884 *pdpe, addr, end_addr, va_next)); 7885 MPASS((addr & PDPMASK) == 0); 7886 MPASS((*pdpe & PG_MANAGED) == 0); 7887 srcptepaddr = *pdpe; 7888 pdpe = pmap_pdpe(dst_pmap, addr); 7889 if (pdpe == NULL) { 7890 if (pmap_allocpte_alloc(dst_pmap, 7891 pmap_pml4e_pindex(addr), NULL, addr) == 7892 NULL) 7893 break; 7894 pdpe = pmap_pdpe(dst_pmap, addr); 7895 } else { 7896 pml4e = pmap_pml4e(dst_pmap, addr); 7897 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7898 dst_pdpg->ref_count++; 7899 } 7900 KASSERT(*pdpe == 0, 7901 ("1G mapping present in dst pmap " 7902 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7903 *pdpe, addr, end_addr, va_next)); 7904 *pdpe = srcptepaddr & ~PG_W; 7905 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 7906 continue; 7907 } 7908 7909 va_next = (addr + NBPDR) & ~PDRMASK; 7910 if (va_next < addr) 7911 va_next = end_addr; 7912 7913 pde = pmap_pdpe_to_pde(pdpe, addr); 7914 srcptepaddr = *pde; 7915 if (srcptepaddr == 0) 7916 continue; 7917 7918 if (srcptepaddr & PG_PS) { 7919 /* 7920 * We can only virtual copy whole superpages. 
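 * That is, the source PDE is copied only when "addr" is 2MB aligned and
 * the entire 2MB range lies below end_addr.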
7921 */ 7922 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 7923 continue; 7924 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 7925 if (pde == NULL) 7926 break; 7927 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 7928 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 7929 PMAP_ENTER_NORECLAIM, &lock))) { 7930 /* 7931 * We leave the dirty bit unchanged because 7932 * managed read/write superpage mappings are 7933 * required to be dirty. However, managed 7934 * superpage mappings are not required to 7935 * have their accessed bit set, so we clear 7936 * it because we don't know if this mapping 7937 * will be used. 7938 */ 7939 srcptepaddr &= ~PG_W; 7940 if ((srcptepaddr & PG_MANAGED) != 0) 7941 srcptepaddr &= ~PG_A; 7942 *pde = srcptepaddr; 7943 pmap_resident_count_adj(dst_pmap, NBPDR / 7944 PAGE_SIZE); 7945 counter_u64_add(pmap_pde_mappings, 1); 7946 } else 7947 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 7948 continue; 7949 } 7950 7951 srcptepaddr &= PG_FRAME; 7952 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 7953 KASSERT(srcmpte->ref_count > 0, 7954 ("pmap_copy: source page table page is unused")); 7955 7956 if (va_next > end_addr) 7957 va_next = end_addr; 7958 7959 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 7960 src_pte = &src_pte[pmap_pte_index(addr)]; 7961 dstmpte = NULL; 7962 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 7963 ptetemp = *src_pte; 7964 7965 /* 7966 * We only virtual copy managed pages. 7967 */ 7968 if ((ptetemp & PG_MANAGED) == 0) 7969 continue; 7970 7971 if (dstmpte != NULL) { 7972 KASSERT(dstmpte->pindex == 7973 pmap_pde_pindex(addr), 7974 ("dstmpte pindex/addr mismatch")); 7975 dstmpte->ref_count++; 7976 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 7977 NULL)) == NULL) 7978 goto out; 7979 dst_pte = (pt_entry_t *) 7980 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 7981 dst_pte = &dst_pte[pmap_pte_index(addr)]; 7982 if (*dst_pte == 0 && 7983 pmap_try_insert_pv_entry(dst_pmap, addr, 7984 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 7985 /* 7986 * Clear the wired, modified, and accessed 7987 * (referenced) bits during the copy. 7988 */ 7989 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 7990 pmap_resident_count_adj(dst_pmap, 1); 7991 } else { 7992 pmap_abort_ptp(dst_pmap, addr, dstmpte); 7993 goto out; 7994 } 7995 /* Have we copied all of the valid mappings? */ 7996 if (dstmpte->ref_count >= srcmpte->ref_count) 7997 break; 7998 } 7999 } 8000 out: 8001 if (lock != NULL) 8002 rw_wunlock(lock); 8003 PMAP_UNLOCK(src_pmap); 8004 PMAP_UNLOCK(dst_pmap); 8005 } 8006 8007 int 8008 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8009 { 8010 int error; 8011 8012 if (dst_pmap->pm_type != src_pmap->pm_type || 8013 dst_pmap->pm_type != PT_X86 || 8014 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8015 return (0); 8016 for (;;) { 8017 if (dst_pmap < src_pmap) { 8018 PMAP_LOCK(dst_pmap); 8019 PMAP_LOCK(src_pmap); 8020 } else { 8021 PMAP_LOCK(src_pmap); 8022 PMAP_LOCK(dst_pmap); 8023 } 8024 error = pmap_pkru_copy(dst_pmap, src_pmap); 8025 /* Clean up partial copy on failure due to no memory. */ 8026 if (error == ENOMEM) 8027 pmap_pkru_deassign_all(dst_pmap); 8028 PMAP_UNLOCK(src_pmap); 8029 PMAP_UNLOCK(dst_pmap); 8030 if (error != ENOMEM) 8031 break; 8032 vm_wait(NULL); 8033 } 8034 return (error); 8035 } 8036 8037 /* 8038 * Zero the specified hardware page. 
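 * The page is zeroed through its direct map address, so no transient
 * kernel mapping is required.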
8039 */ 8040 void 8041 pmap_zero_page(vm_page_t m) 8042 { 8043 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8044 8045 pagezero((void *)va); 8046 } 8047 8048 /* 8049 * Zero an an area within a single hardware page. off and size must not 8050 * cover an area beyond a single hardware page. 8051 */ 8052 void 8053 pmap_zero_page_area(vm_page_t m, int off, int size) 8054 { 8055 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8056 8057 if (off == 0 && size == PAGE_SIZE) 8058 pagezero((void *)va); 8059 else 8060 bzero((char *)va + off, size); 8061 } 8062 8063 /* 8064 * Copy 1 specified hardware page to another. 8065 */ 8066 void 8067 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8068 { 8069 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8070 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8071 8072 pagecopy((void *)src, (void *)dst); 8073 } 8074 8075 int unmapped_buf_allowed = 1; 8076 8077 void 8078 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8079 vm_offset_t b_offset, int xfersize) 8080 { 8081 void *a_cp, *b_cp; 8082 vm_page_t pages[2]; 8083 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8084 int cnt; 8085 boolean_t mapped; 8086 8087 while (xfersize > 0) { 8088 a_pg_offset = a_offset & PAGE_MASK; 8089 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8090 b_pg_offset = b_offset & PAGE_MASK; 8091 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8092 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8093 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8094 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 8095 a_cp = (char *)vaddr[0] + a_pg_offset; 8096 b_cp = (char *)vaddr[1] + b_pg_offset; 8097 bcopy(a_cp, b_cp, cnt); 8098 if (__predict_false(mapped)) 8099 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 8100 a_offset += cnt; 8101 b_offset += cnt; 8102 xfersize -= cnt; 8103 } 8104 } 8105 8106 /* 8107 * Returns true if the pmap's pv is one of the first 8108 * 16 pvs linked to from this page. This count may 8109 * be changed upwards or downwards in the future; it 8110 * is only necessary that true be returned for a small 8111 * subset of pmaps for proper page aging. 8112 */ 8113 boolean_t 8114 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8115 { 8116 struct md_page *pvh; 8117 struct rwlock *lock; 8118 pv_entry_t pv; 8119 int loops = 0; 8120 boolean_t rv; 8121 8122 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8123 ("pmap_page_exists_quick: page %p is not managed", m)); 8124 rv = FALSE; 8125 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8126 rw_rlock(lock); 8127 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8128 if (PV_PMAP(pv) == pmap) { 8129 rv = TRUE; 8130 break; 8131 } 8132 loops++; 8133 if (loops >= 16) 8134 break; 8135 } 8136 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8137 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8138 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8139 if (PV_PMAP(pv) == pmap) { 8140 rv = TRUE; 8141 break; 8142 } 8143 loops++; 8144 if (loops >= 16) 8145 break; 8146 } 8147 } 8148 rw_runlock(lock); 8149 return (rv); 8150 } 8151 8152 /* 8153 * pmap_page_wired_mappings: 8154 * 8155 * Return the number of managed mappings to the given physical page 8156 * that are wired. 
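 * Both 4KB mappings and 2MB superpage mappings of the page are counted.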
8157 */ 8158 int 8159 pmap_page_wired_mappings(vm_page_t m) 8160 { 8161 struct rwlock *lock; 8162 struct md_page *pvh; 8163 pmap_t pmap; 8164 pt_entry_t *pte; 8165 pv_entry_t pv; 8166 int count, md_gen, pvh_gen; 8167 8168 if ((m->oflags & VPO_UNMANAGED) != 0) 8169 return (0); 8170 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8171 rw_rlock(lock); 8172 restart: 8173 count = 0; 8174 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8175 pmap = PV_PMAP(pv); 8176 if (!PMAP_TRYLOCK(pmap)) { 8177 md_gen = m->md.pv_gen; 8178 rw_runlock(lock); 8179 PMAP_LOCK(pmap); 8180 rw_rlock(lock); 8181 if (md_gen != m->md.pv_gen) { 8182 PMAP_UNLOCK(pmap); 8183 goto restart; 8184 } 8185 } 8186 pte = pmap_pte(pmap, pv->pv_va); 8187 if ((*pte & PG_W) != 0) 8188 count++; 8189 PMAP_UNLOCK(pmap); 8190 } 8191 if ((m->flags & PG_FICTITIOUS) == 0) { 8192 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8193 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8194 pmap = PV_PMAP(pv); 8195 if (!PMAP_TRYLOCK(pmap)) { 8196 md_gen = m->md.pv_gen; 8197 pvh_gen = pvh->pv_gen; 8198 rw_runlock(lock); 8199 PMAP_LOCK(pmap); 8200 rw_rlock(lock); 8201 if (md_gen != m->md.pv_gen || 8202 pvh_gen != pvh->pv_gen) { 8203 PMAP_UNLOCK(pmap); 8204 goto restart; 8205 } 8206 } 8207 pte = pmap_pde(pmap, pv->pv_va); 8208 if ((*pte & PG_W) != 0) 8209 count++; 8210 PMAP_UNLOCK(pmap); 8211 } 8212 } 8213 rw_runlock(lock); 8214 return (count); 8215 } 8216 8217 /* 8218 * Returns TRUE if the given page is mapped individually or as part of 8219 * a 2mpage. Otherwise, returns FALSE. 8220 */ 8221 boolean_t 8222 pmap_page_is_mapped(vm_page_t m) 8223 { 8224 struct rwlock *lock; 8225 boolean_t rv; 8226 8227 if ((m->oflags & VPO_UNMANAGED) != 0) 8228 return (FALSE); 8229 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8230 rw_rlock(lock); 8231 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8232 ((m->flags & PG_FICTITIOUS) == 0 && 8233 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8234 rw_runlock(lock); 8235 return (rv); 8236 } 8237 8238 /* 8239 * Destroy all managed, non-wired mappings in the given user-space 8240 * pmap. This pmap cannot be active on any processor besides the 8241 * caller. 8242 * 8243 * This function cannot be applied to the kernel pmap. Moreover, it 8244 * is not intended for general use. It is only to be used during 8245 * process termination. Consequently, it can be implemented in ways 8246 * that make it faster than pmap_remove(). First, it can more quickly 8247 * destroy mappings by iterating over the pmap's collection of PV 8248 * entries, rather than searching the page table. Second, it doesn't 8249 * have to test and clear the page table entries atomically, because 8250 * no processor is currently accessing the user address space. In 8251 * particular, a page table entry's dirty bit won't change state once 8252 * this function starts. 8253 * 8254 * Although this function destroys all of the pmap's managed, 8255 * non-wired mappings, it can delay and batch the invalidation of TLB 8256 * entries without calling pmap_delayed_invl_start() and 8257 * pmap_delayed_invl_finish(). Because the pmap is not active on 8258 * any other processor, none of these TLB entries will ever be used 8259 * before their eventual invalidation. Consequently, there is no need 8260 * for either pmap_remove_all() or pmap_remove_write() to wait for 8261 * that eventual TLB invalidation. 
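 * Instead, the single call to pmap_invalidate_all() near the end of this
 * function suffices.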
8262 */ 8263 void 8264 pmap_remove_pages(pmap_t pmap) 8265 { 8266 pd_entry_t ptepde; 8267 pt_entry_t *pte, tpte; 8268 pt_entry_t PG_M, PG_RW, PG_V; 8269 struct spglist free; 8270 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8271 vm_page_t m, mpte, mt; 8272 pv_entry_t pv; 8273 struct md_page *pvh; 8274 struct pv_chunk *pc, *npc; 8275 struct rwlock *lock; 8276 int64_t bit; 8277 uint64_t inuse, bitmask; 8278 int allfree, field, freed, i, idx; 8279 boolean_t superpage; 8280 vm_paddr_t pa; 8281 8282 /* 8283 * Assert that the given pmap is only active on the current 8284 * CPU. Unfortunately, we cannot block another CPU from 8285 * activating the pmap while this function is executing. 8286 */ 8287 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8288 #ifdef INVARIANTS 8289 { 8290 cpuset_t other_cpus; 8291 8292 other_cpus = all_cpus; 8293 critical_enter(); 8294 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8295 CPU_AND(&other_cpus, &pmap->pm_active); 8296 critical_exit(); 8297 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8298 } 8299 #endif 8300 8301 lock = NULL; 8302 PG_M = pmap_modified_bit(pmap); 8303 PG_V = pmap_valid_bit(pmap); 8304 PG_RW = pmap_rw_bit(pmap); 8305 8306 for (i = 0; i < PMAP_MEMDOM; i++) 8307 TAILQ_INIT(&free_chunks[i]); 8308 SLIST_INIT(&free); 8309 PMAP_LOCK(pmap); 8310 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8311 allfree = 1; 8312 freed = 0; 8313 for (field = 0; field < _NPCM; field++) { 8314 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8315 while (inuse != 0) { 8316 bit = bsfq(inuse); 8317 bitmask = 1UL << bit; 8318 idx = field * 64 + bit; 8319 pv = &pc->pc_pventry[idx]; 8320 inuse &= ~bitmask; 8321 8322 pte = pmap_pdpe(pmap, pv->pv_va); 8323 ptepde = *pte; 8324 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8325 tpte = *pte; 8326 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8327 superpage = FALSE; 8328 ptepde = tpte; 8329 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8330 PG_FRAME); 8331 pte = &pte[pmap_pte_index(pv->pv_va)]; 8332 tpte = *pte; 8333 } else { 8334 /* 8335 * Keep track whether 'tpte' is a 8336 * superpage explicitly instead of 8337 * relying on PG_PS being set. 8338 * 8339 * This is because PG_PS is numerically 8340 * identical to PG_PTE_PAT and thus a 8341 * regular page could be mistaken for 8342 * a superpage. 8343 */ 8344 superpage = TRUE; 8345 } 8346 8347 if ((tpte & PG_V) == 0) { 8348 panic("bad pte va %lx pte %lx", 8349 pv->pv_va, tpte); 8350 } 8351 8352 /* 8353 * We cannot remove wired pages from a process' mapping at this time 8354 */ 8355 if (tpte & PG_W) { 8356 allfree = 0; 8357 continue; 8358 } 8359 8360 /* Mark free */ 8361 pc->pc_map[field] |= bitmask; 8362 8363 /* 8364 * Because this pmap is not active on other 8365 * processors, the dirty bit cannot have 8366 * changed state since we last loaded pte. 8367 */ 8368 pte_clear(pte); 8369 8370 if (superpage) 8371 pa = tpte & PG_PS_FRAME; 8372 else 8373 pa = tpte & PG_FRAME; 8374 8375 m = PHYS_TO_VM_PAGE(pa); 8376 KASSERT(m->phys_addr == pa, 8377 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8378 m, (uintmax_t)m->phys_addr, 8379 (uintmax_t)tpte)); 8380 8381 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8382 m < &vm_page_array[vm_page_array_size], 8383 ("pmap_remove_pages: bad tpte %#jx", 8384 (uintmax_t)tpte)); 8385 8386 /* 8387 * Update the vm_page_t clean/reference bits. 
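 * Only the page's dirty field is updated here; the accessed state is
 * simply discarded along with the mapping.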
8388 */ 8389 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8390 if (superpage) { 8391 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8392 vm_page_dirty(mt); 8393 } else 8394 vm_page_dirty(m); 8395 } 8396 8397 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8398 8399 if (superpage) { 8400 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8401 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8402 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8403 pvh->pv_gen++; 8404 if (TAILQ_EMPTY(&pvh->pv_list)) { 8405 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8406 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8407 TAILQ_EMPTY(&mt->md.pv_list)) 8408 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8409 } 8410 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8411 if (mpte != NULL) { 8412 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 8413 ("pmap_remove_pages: pte page not promoted")); 8414 pmap_resident_count_adj(pmap, -1); 8415 KASSERT(mpte->ref_count == NPTEPG, 8416 ("pmap_remove_pages: pte page reference count error")); 8417 mpte->ref_count = 0; 8418 pmap_add_delayed_free_list(mpte, &free, FALSE); 8419 } 8420 } else { 8421 pmap_resident_count_adj(pmap, -1); 8422 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8423 m->md.pv_gen++; 8424 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8425 TAILQ_EMPTY(&m->md.pv_list) && 8426 (m->flags & PG_FICTITIOUS) == 0) { 8427 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8428 if (TAILQ_EMPTY(&pvh->pv_list)) 8429 vm_page_aflag_clear(m, PGA_WRITEABLE); 8430 } 8431 } 8432 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8433 freed++; 8434 } 8435 } 8436 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8437 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8438 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8439 if (allfree) { 8440 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8441 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8442 } 8443 } 8444 if (lock != NULL) 8445 rw_wunlock(lock); 8446 pmap_invalidate_all(pmap); 8447 pmap_pkru_deassign_all(pmap); 8448 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8449 PMAP_UNLOCK(pmap); 8450 vm_page_free_pages_toq(&free, true); 8451 } 8452 8453 static boolean_t 8454 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 8455 { 8456 struct rwlock *lock; 8457 pv_entry_t pv; 8458 struct md_page *pvh; 8459 pt_entry_t *pte, mask; 8460 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8461 pmap_t pmap; 8462 int md_gen, pvh_gen; 8463 boolean_t rv; 8464 8465 rv = FALSE; 8466 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8467 rw_rlock(lock); 8468 restart: 8469 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8470 pmap = PV_PMAP(pv); 8471 if (!PMAP_TRYLOCK(pmap)) { 8472 md_gen = m->md.pv_gen; 8473 rw_runlock(lock); 8474 PMAP_LOCK(pmap); 8475 rw_rlock(lock); 8476 if (md_gen != m->md.pv_gen) { 8477 PMAP_UNLOCK(pmap); 8478 goto restart; 8479 } 8480 } 8481 pte = pmap_pte(pmap, pv->pv_va); 8482 mask = 0; 8483 if (modified) { 8484 PG_M = pmap_modified_bit(pmap); 8485 PG_RW = pmap_rw_bit(pmap); 8486 mask |= PG_RW | PG_M; 8487 } 8488 if (accessed) { 8489 PG_A = pmap_accessed_bit(pmap); 8490 PG_V = pmap_valid_bit(pmap); 8491 mask |= PG_V | PG_A; 8492 } 8493 rv = (*pte & mask) == mask; 8494 PMAP_UNLOCK(pmap); 8495 if (rv) 8496 goto out; 8497 } 8498 if ((m->flags & PG_FICTITIOUS) == 0) { 8499 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8500 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8501 pmap = PV_PMAP(pv); 8502 if (!PMAP_TRYLOCK(pmap)) { 8503 md_gen = m->md.pv_gen; 8504 pvh_gen = pvh->pv_gen; 8505 rw_runlock(lock); 8506 PMAP_LOCK(pmap); 8507 rw_rlock(lock); 8508 if (md_gen != m->md.pv_gen || 
8509 pvh_gen != pvh->pv_gen) { 8510 PMAP_UNLOCK(pmap); 8511 goto restart; 8512 } 8513 } 8514 pte = pmap_pde(pmap, pv->pv_va); 8515 mask = 0; 8516 if (modified) { 8517 PG_M = pmap_modified_bit(pmap); 8518 PG_RW = pmap_rw_bit(pmap); 8519 mask |= PG_RW | PG_M; 8520 } 8521 if (accessed) { 8522 PG_A = pmap_accessed_bit(pmap); 8523 PG_V = pmap_valid_bit(pmap); 8524 mask |= PG_V | PG_A; 8525 } 8526 rv = (*pte & mask) == mask; 8527 PMAP_UNLOCK(pmap); 8528 if (rv) 8529 goto out; 8530 } 8531 } 8532 out: 8533 rw_runlock(lock); 8534 return (rv); 8535 } 8536 8537 /* 8538 * pmap_is_modified: 8539 * 8540 * Return whether or not the specified physical page was modified 8541 * in any physical maps. 8542 */ 8543 boolean_t 8544 pmap_is_modified(vm_page_t m) 8545 { 8546 8547 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8548 ("pmap_is_modified: page %p is not managed", m)); 8549 8550 /* 8551 * If the page is not busied then this check is racy. 8552 */ 8553 if (!pmap_page_is_write_mapped(m)) 8554 return (FALSE); 8555 return (pmap_page_test_mappings(m, FALSE, TRUE)); 8556 } 8557 8558 /* 8559 * pmap_is_prefaultable: 8560 * 8561 * Return whether or not the specified virtual address is eligible 8562 * for prefault. 8563 */ 8564 boolean_t 8565 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8566 { 8567 pd_entry_t *pde; 8568 pt_entry_t *pte, PG_V; 8569 boolean_t rv; 8570 8571 PG_V = pmap_valid_bit(pmap); 8572 rv = FALSE; 8573 PMAP_LOCK(pmap); 8574 pde = pmap_pde(pmap, addr); 8575 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8576 pte = pmap_pde_to_pte(pde, addr); 8577 rv = (*pte & PG_V) == 0; 8578 } 8579 PMAP_UNLOCK(pmap); 8580 return (rv); 8581 } 8582 8583 /* 8584 * pmap_is_referenced: 8585 * 8586 * Return whether or not the specified physical page was referenced 8587 * in any physical maps. 8588 */ 8589 boolean_t 8590 pmap_is_referenced(vm_page_t m) 8591 { 8592 8593 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8594 ("pmap_is_referenced: page %p is not managed", m)); 8595 return (pmap_page_test_mappings(m, TRUE, FALSE)); 8596 } 8597 8598 /* 8599 * Clear the write and modified bits in each of the given page's mappings. 8600 */ 8601 void 8602 pmap_remove_write(vm_page_t m) 8603 { 8604 struct md_page *pvh; 8605 pmap_t pmap; 8606 struct rwlock *lock; 8607 pv_entry_t next_pv, pv; 8608 pd_entry_t *pde; 8609 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8610 vm_offset_t va; 8611 int pvh_gen, md_gen; 8612 8613 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8614 ("pmap_remove_write: page %p is not managed", m)); 8615 8616 vm_page_assert_busied(m); 8617 if (!pmap_page_is_write_mapped(m)) 8618 return; 8619 8620 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8621 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8622 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8623 rw_wlock(lock); 8624 retry: 8625 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8626 pmap = PV_PMAP(pv); 8627 if (!PMAP_TRYLOCK(pmap)) { 8628 pvh_gen = pvh->pv_gen; 8629 rw_wunlock(lock); 8630 PMAP_LOCK(pmap); 8631 rw_wlock(lock); 8632 if (pvh_gen != pvh->pv_gen) { 8633 PMAP_UNLOCK(pmap); 8634 goto retry; 8635 } 8636 } 8637 PG_RW = pmap_rw_bit(pmap); 8638 va = pv->pv_va; 8639 pde = pmap_pde(pmap, va); 8640 if ((*pde & PG_RW) != 0) 8641 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8642 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8643 ("inconsistent pv lock %p %p for page %p", 8644 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8645 PMAP_UNLOCK(pmap); 8646 } 8647 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8648 pmap = PV_PMAP(pv); 8649 if (!PMAP_TRYLOCK(pmap)) { 8650 pvh_gen = pvh->pv_gen; 8651 md_gen = m->md.pv_gen; 8652 rw_wunlock(lock); 8653 PMAP_LOCK(pmap); 8654 rw_wlock(lock); 8655 if (pvh_gen != pvh->pv_gen || 8656 md_gen != m->md.pv_gen) { 8657 PMAP_UNLOCK(pmap); 8658 goto retry; 8659 } 8660 } 8661 PG_M = pmap_modified_bit(pmap); 8662 PG_RW = pmap_rw_bit(pmap); 8663 pde = pmap_pde(pmap, pv->pv_va); 8664 KASSERT((*pde & PG_PS) == 0, 8665 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8666 m)); 8667 pte = pmap_pde_to_pte(pde, pv->pv_va); 8668 oldpte = *pte; 8669 if (oldpte & PG_RW) { 8670 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8671 ~(PG_RW | PG_M))) 8672 cpu_spinwait(); 8673 if ((oldpte & PG_M) != 0) 8674 vm_page_dirty(m); 8675 pmap_invalidate_page(pmap, pv->pv_va); 8676 } 8677 PMAP_UNLOCK(pmap); 8678 } 8679 rw_wunlock(lock); 8680 vm_page_aflag_clear(m, PGA_WRITEABLE); 8681 pmap_delayed_invl_wait(m); 8682 } 8683 8684 static __inline boolean_t 8685 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 8686 { 8687 8688 if (!pmap_emulate_ad_bits(pmap)) 8689 return (TRUE); 8690 8691 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 8692 8693 /* 8694 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 8695 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 8696 * if the EPT_PG_WRITE bit is set. 8697 */ 8698 if ((pte & EPT_PG_WRITE) != 0) 8699 return (FALSE); 8700 8701 /* 8702 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 8703 */ 8704 if ((pte & EPT_PG_EXECUTE) == 0 || 8705 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 8706 return (TRUE); 8707 else 8708 return (FALSE); 8709 } 8710 8711 /* 8712 * pmap_ts_referenced: 8713 * 8714 * Return a count of reference bits for a page, clearing those bits. 8715 * It is not necessary for every reference bit to be cleared, but it 8716 * is necessary that 0 only be returned when there are truly no 8717 * reference bits set. 8718 * 8719 * As an optimization, update the page's dirty field if a modified bit is 8720 * found while counting reference bits. This opportunistic update can be 8721 * performed at low cost and can eliminate the need for some future calls 8722 * to pmap_is_modified(). However, since this function stops after 8723 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8724 * dirty pages. Those dirty pages will only be detected by a future call 8725 * to pmap_is_modified(). 8726 * 8727 * A DI block is not needed within this function, because 8728 * invalidations are performed before the PV list lock is 8729 * released. 
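 * The PV lists are rotated as they are scanned so that successive calls
 * do not always start with the same mappings.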
8730 */ 8731 int 8732 pmap_ts_referenced(vm_page_t m) 8733 { 8734 struct md_page *pvh; 8735 pv_entry_t pv, pvf; 8736 pmap_t pmap; 8737 struct rwlock *lock; 8738 pd_entry_t oldpde, *pde; 8739 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8740 vm_offset_t va; 8741 vm_paddr_t pa; 8742 int cleared, md_gen, not_cleared, pvh_gen; 8743 struct spglist free; 8744 boolean_t demoted; 8745 8746 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8747 ("pmap_ts_referenced: page %p is not managed", m)); 8748 SLIST_INIT(&free); 8749 cleared = 0; 8750 pa = VM_PAGE_TO_PHYS(m); 8751 lock = PHYS_TO_PV_LIST_LOCK(pa); 8752 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 8753 rw_wlock(lock); 8754 retry: 8755 not_cleared = 0; 8756 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 8757 goto small_mappings; 8758 pv = pvf; 8759 do { 8760 if (pvf == NULL) 8761 pvf = pv; 8762 pmap = PV_PMAP(pv); 8763 if (!PMAP_TRYLOCK(pmap)) { 8764 pvh_gen = pvh->pv_gen; 8765 rw_wunlock(lock); 8766 PMAP_LOCK(pmap); 8767 rw_wlock(lock); 8768 if (pvh_gen != pvh->pv_gen) { 8769 PMAP_UNLOCK(pmap); 8770 goto retry; 8771 } 8772 } 8773 PG_A = pmap_accessed_bit(pmap); 8774 PG_M = pmap_modified_bit(pmap); 8775 PG_RW = pmap_rw_bit(pmap); 8776 va = pv->pv_va; 8777 pde = pmap_pde(pmap, pv->pv_va); 8778 oldpde = *pde; 8779 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8780 /* 8781 * Although "oldpde" is mapping a 2MB page, because 8782 * this function is called at a 4KB page granularity, 8783 * we only update the 4KB page under test. 8784 */ 8785 vm_page_dirty(m); 8786 } 8787 if ((oldpde & PG_A) != 0) { 8788 /* 8789 * Since this reference bit is shared by 512 4KB 8790 * pages, it should not be cleared every time it is 8791 * tested. Apply a simple "hash" function on the 8792 * physical page number, the virtual superpage number, 8793 * and the pmap address to select one 4KB page out of 8794 * the 512 on which testing the reference bit will 8795 * result in clearing that reference bit. This 8796 * function is designed to avoid the selection of the 8797 * same 4KB page for every 2MB page mapping. 8798 * 8799 * On demotion, a mapping that hasn't been referenced 8800 * is simply destroyed. To avoid the possibility of a 8801 * subsequent page fault on a demoted wired mapping, 8802 * always leave its reference bit set. Moreover, 8803 * since the superpage is wired, the current state of 8804 * its reference bit won't affect page replacement. 8805 */ 8806 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 8807 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 8808 (oldpde & PG_W) == 0) { 8809 if (safe_to_clear_referenced(pmap, oldpde)) { 8810 atomic_clear_long(pde, PG_A); 8811 pmap_invalidate_page(pmap, pv->pv_va); 8812 demoted = FALSE; 8813 } else if (pmap_demote_pde_locked(pmap, pde, 8814 pv->pv_va, &lock)) { 8815 /* 8816 * Remove the mapping to a single page 8817 * so that a subsequent access may 8818 * repromote. Since the underlying 8819 * page table page is fully populated, 8820 * this removal never frees a page 8821 * table page. 8822 */ 8823 demoted = TRUE; 8824 va += VM_PAGE_TO_PHYS(m) - (oldpde & 8825 PG_PS_FRAME); 8826 pte = pmap_pde_to_pte(pde, va); 8827 pmap_remove_pte(pmap, pte, va, *pde, 8828 NULL, &lock); 8829 pmap_invalidate_page(pmap, va); 8830 } else 8831 demoted = TRUE; 8832 8833 if (demoted) { 8834 /* 8835 * The superpage mapping was removed 8836 * entirely and therefore 'pv' is no 8837 * longer valid. 
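 * Clear "pv", and "pvf" if it refers to the same entry, so that the scan
 * resumes from the head of the PV list.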
8838 */ 8839 if (pvf == pv) 8840 pvf = NULL; 8841 pv = NULL; 8842 } 8843 cleared++; 8844 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8845 ("inconsistent pv lock %p %p for page %p", 8846 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8847 } else 8848 not_cleared++; 8849 } 8850 PMAP_UNLOCK(pmap); 8851 /* Rotate the PV list if it has more than one entry. */ 8852 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 8853 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8854 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 8855 pvh->pv_gen++; 8856 } 8857 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 8858 goto out; 8859 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 8860 small_mappings: 8861 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 8862 goto out; 8863 pv = pvf; 8864 do { 8865 if (pvf == NULL) 8866 pvf = pv; 8867 pmap = PV_PMAP(pv); 8868 if (!PMAP_TRYLOCK(pmap)) { 8869 pvh_gen = pvh->pv_gen; 8870 md_gen = m->md.pv_gen; 8871 rw_wunlock(lock); 8872 PMAP_LOCK(pmap); 8873 rw_wlock(lock); 8874 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 8875 PMAP_UNLOCK(pmap); 8876 goto retry; 8877 } 8878 } 8879 PG_A = pmap_accessed_bit(pmap); 8880 PG_M = pmap_modified_bit(pmap); 8881 PG_RW = pmap_rw_bit(pmap); 8882 pde = pmap_pde(pmap, pv->pv_va); 8883 KASSERT((*pde & PG_PS) == 0, 8884 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 8885 m)); 8886 pte = pmap_pde_to_pte(pde, pv->pv_va); 8887 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 8888 vm_page_dirty(m); 8889 if ((*pte & PG_A) != 0) { 8890 if (safe_to_clear_referenced(pmap, *pte)) { 8891 atomic_clear_long(pte, PG_A); 8892 pmap_invalidate_page(pmap, pv->pv_va); 8893 cleared++; 8894 } else if ((*pte & PG_W) == 0) { 8895 /* 8896 * Wired pages cannot be paged out so 8897 * doing accessed bit emulation for 8898 * them is wasted effort. We do the 8899 * hard work for unwired pages only. 8900 */ 8901 pmap_remove_pte(pmap, pte, pv->pv_va, 8902 *pde, &free, &lock); 8903 pmap_invalidate_page(pmap, pv->pv_va); 8904 cleared++; 8905 if (pvf == pv) 8906 pvf = NULL; 8907 pv = NULL; 8908 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8909 ("inconsistent pv lock %p %p for page %p", 8910 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8911 } else 8912 not_cleared++; 8913 } 8914 PMAP_UNLOCK(pmap); 8915 /* Rotate the PV list if it has more than one entry. */ 8916 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 8917 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8918 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 8919 m->md.pv_gen++; 8920 } 8921 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 8922 not_cleared < PMAP_TS_REFERENCED_MAX); 8923 out: 8924 rw_wunlock(lock); 8925 vm_page_free_pages_toq(&free, true); 8926 return (cleared + not_cleared); 8927 } 8928 8929 /* 8930 * Apply the given advice to the specified range of addresses within the 8931 * given pmap. Depending on the advice, clear the referenced and/or 8932 * modified flags in each mapping and set the mapped page's dirty field. 8933 */ 8934 void 8935 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 8936 { 8937 struct rwlock *lock; 8938 pml4_entry_t *pml4e; 8939 pdp_entry_t *pdpe; 8940 pd_entry_t oldpde, *pde; 8941 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 8942 vm_offset_t va, va_next; 8943 vm_page_t m; 8944 bool anychanged; 8945 8946 if (advice != MADV_DONTNEED && advice != MADV_FREE) 8947 return; 8948 8949 /* 8950 * A/D bit emulation requires an alternate code path when clearing 8951 * the modified and accessed bits below. 
Since this function is 8952 * advisory in nature we skip it entirely for pmaps that require 8953 * A/D bit emulation. 8954 */ 8955 if (pmap_emulate_ad_bits(pmap)) 8956 return; 8957 8958 PG_A = pmap_accessed_bit(pmap); 8959 PG_G = pmap_global_bit(pmap); 8960 PG_M = pmap_modified_bit(pmap); 8961 PG_V = pmap_valid_bit(pmap); 8962 PG_RW = pmap_rw_bit(pmap); 8963 anychanged = false; 8964 pmap_delayed_invl_start(); 8965 PMAP_LOCK(pmap); 8966 for (; sva < eva; sva = va_next) { 8967 pml4e = pmap_pml4e(pmap, sva); 8968 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 8969 va_next = (sva + NBPML4) & ~PML4MASK; 8970 if (va_next < sva) 8971 va_next = eva; 8972 continue; 8973 } 8974 8975 va_next = (sva + NBPDP) & ~PDPMASK; 8976 if (va_next < sva) 8977 va_next = eva; 8978 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 8979 if ((*pdpe & PG_V) == 0) 8980 continue; 8981 if ((*pdpe & PG_PS) != 0) { 8982 KASSERT(va_next <= eva, 8983 ("partial update of non-transparent 1G mapping " 8984 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8985 *pdpe, sva, eva, va_next)); 8986 continue; 8987 } 8988 8989 va_next = (sva + NBPDR) & ~PDRMASK; 8990 if (va_next < sva) 8991 va_next = eva; 8992 pde = pmap_pdpe_to_pde(pdpe, sva); 8993 oldpde = *pde; 8994 if ((oldpde & PG_V) == 0) 8995 continue; 8996 else if ((oldpde & PG_PS) != 0) { 8997 if ((oldpde & PG_MANAGED) == 0) 8998 continue; 8999 lock = NULL; 9000 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9001 if (lock != NULL) 9002 rw_wunlock(lock); 9003 9004 /* 9005 * The large page mapping was destroyed. 9006 */ 9007 continue; 9008 } 9009 9010 /* 9011 * Unless the page mappings are wired, remove the 9012 * mapping to a single page so that a subsequent 9013 * access may repromote. Choosing the last page 9014 * within the address range [sva, min(va_next, eva)) 9015 * generally results in more repromotions. Since the 9016 * underlying page table page is fully populated, this 9017 * removal never frees a page table page. 9018 */ 9019 if ((oldpde & PG_W) == 0) { 9020 va = eva; 9021 if (va > va_next) 9022 va = va_next; 9023 va -= PAGE_SIZE; 9024 KASSERT(va >= sva, 9025 ("pmap_advise: no address gap")); 9026 pte = pmap_pde_to_pte(pde, va); 9027 KASSERT((*pte & PG_V) != 0, 9028 ("pmap_advise: invalid PTE")); 9029 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9030 &lock); 9031 anychanged = true; 9032 } 9033 if (lock != NULL) 9034 rw_wunlock(lock); 9035 } 9036 if (va_next > eva) 9037 va_next = eva; 9038 va = va_next; 9039 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9040 sva += PAGE_SIZE) { 9041 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9042 goto maybe_invlrng; 9043 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9044 if (advice == MADV_DONTNEED) { 9045 /* 9046 * Future calls to pmap_is_modified() 9047 * can be avoided by making the page 9048 * dirty now. 
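 * For MADV_FREE, by contrast, the modified state is deliberately
 * discarded when PG_M is cleared below.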
9049 */ 9050 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9051 vm_page_dirty(m); 9052 } 9053 atomic_clear_long(pte, PG_M | PG_A); 9054 } else if ((*pte & PG_A) != 0) 9055 atomic_clear_long(pte, PG_A); 9056 else 9057 goto maybe_invlrng; 9058 9059 if ((*pte & PG_G) != 0) { 9060 if (va == va_next) 9061 va = sva; 9062 } else 9063 anychanged = true; 9064 continue; 9065 maybe_invlrng: 9066 if (va != va_next) { 9067 pmap_invalidate_range(pmap, va, sva); 9068 va = va_next; 9069 } 9070 } 9071 if (va != va_next) 9072 pmap_invalidate_range(pmap, va, sva); 9073 } 9074 if (anychanged) 9075 pmap_invalidate_all(pmap); 9076 PMAP_UNLOCK(pmap); 9077 pmap_delayed_invl_finish(); 9078 } 9079 9080 /* 9081 * Clear the modify bits on the specified physical page. 9082 */ 9083 void 9084 pmap_clear_modify(vm_page_t m) 9085 { 9086 struct md_page *pvh; 9087 pmap_t pmap; 9088 pv_entry_t next_pv, pv; 9089 pd_entry_t oldpde, *pde; 9090 pt_entry_t *pte, PG_M, PG_RW; 9091 struct rwlock *lock; 9092 vm_offset_t va; 9093 int md_gen, pvh_gen; 9094 9095 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9096 ("pmap_clear_modify: page %p is not managed", m)); 9097 vm_page_assert_busied(m); 9098 9099 if (!pmap_page_is_write_mapped(m)) 9100 return; 9101 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9102 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9103 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9104 rw_wlock(lock); 9105 restart: 9106 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9107 pmap = PV_PMAP(pv); 9108 if (!PMAP_TRYLOCK(pmap)) { 9109 pvh_gen = pvh->pv_gen; 9110 rw_wunlock(lock); 9111 PMAP_LOCK(pmap); 9112 rw_wlock(lock); 9113 if (pvh_gen != pvh->pv_gen) { 9114 PMAP_UNLOCK(pmap); 9115 goto restart; 9116 } 9117 } 9118 PG_M = pmap_modified_bit(pmap); 9119 PG_RW = pmap_rw_bit(pmap); 9120 va = pv->pv_va; 9121 pde = pmap_pde(pmap, va); 9122 oldpde = *pde; 9123 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9124 if ((oldpde & PG_RW) != 0 && 9125 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9126 (oldpde & PG_W) == 0) { 9127 /* 9128 * Write protect the mapping to a single page so that 9129 * a subsequent write access may repromote. 9130 */ 9131 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9132 pte = pmap_pde_to_pte(pde, va); 9133 atomic_clear_long(pte, PG_M | PG_RW); 9134 vm_page_dirty(m); 9135 pmap_invalidate_page(pmap, va); 9136 } 9137 PMAP_UNLOCK(pmap); 9138 } 9139 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9140 pmap = PV_PMAP(pv); 9141 if (!PMAP_TRYLOCK(pmap)) { 9142 md_gen = m->md.pv_gen; 9143 pvh_gen = pvh->pv_gen; 9144 rw_wunlock(lock); 9145 PMAP_LOCK(pmap); 9146 rw_wlock(lock); 9147 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9148 PMAP_UNLOCK(pmap); 9149 goto restart; 9150 } 9151 } 9152 PG_M = pmap_modified_bit(pmap); 9153 PG_RW = pmap_rw_bit(pmap); 9154 pde = pmap_pde(pmap, pv->pv_va); 9155 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9156 " a 2mpage in page %p's pv list", m)); 9157 pte = pmap_pde_to_pte(pde, pv->pv_va); 9158 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9159 atomic_clear_long(pte, PG_M); 9160 pmap_invalidate_page(pmap, pv->pv_va); 9161 } 9162 PMAP_UNLOCK(pmap); 9163 } 9164 rw_wunlock(lock); 9165 } 9166 9167 /* 9168 * Miscellaneous support routines follow 9169 */ 9170 9171 /* Adjust the properties for a leaf page table entry. 
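 * The update is performed with a compare-and-swap loop so that accessed
 * and dirty bits set concurrently by the MMU are not lost.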
*/ 9172 static __inline void 9173 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9174 { 9175 u_long opte, npte; 9176 9177 opte = *(u_long *)pte; 9178 do { 9179 npte = opte & ~mask; 9180 npte |= bits; 9181 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9182 npte)); 9183 } 9184 9185 /* 9186 * Map a set of physical memory pages into the kernel virtual 9187 * address space. Return a pointer to where it is mapped. This 9188 * routine is intended to be used for mapping device memory, 9189 * NOT real memory. 9190 */ 9191 static void * 9192 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9193 { 9194 struct pmap_preinit_mapping *ppim; 9195 vm_offset_t va, offset; 9196 vm_size_t tmpsize; 9197 int i; 9198 9199 offset = pa & PAGE_MASK; 9200 size = round_page(offset + size); 9201 pa = trunc_page(pa); 9202 9203 if (!pmap_initialized) { 9204 va = 0; 9205 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9206 ppim = pmap_preinit_mapping + i; 9207 if (ppim->va == 0) { 9208 ppim->pa = pa; 9209 ppim->sz = size; 9210 ppim->mode = mode; 9211 ppim->va = virtual_avail; 9212 virtual_avail += size; 9213 va = ppim->va; 9214 break; 9215 } 9216 } 9217 if (va == 0) 9218 panic("%s: too many preinit mappings", __func__); 9219 } else { 9220 /* 9221 * If we have a preinit mapping, re-use it. 9222 */ 9223 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9224 ppim = pmap_preinit_mapping + i; 9225 if (ppim->pa == pa && ppim->sz == size && 9226 (ppim->mode == mode || 9227 (flags & MAPDEV_SETATTR) == 0)) 9228 return ((void *)(ppim->va + offset)); 9229 } 9230 /* 9231 * If the specified range of physical addresses fits within 9232 * the direct map window, use the direct map. 9233 */ 9234 if (pa < dmaplimit && pa + size <= dmaplimit) { 9235 va = PHYS_TO_DMAP(pa); 9236 if ((flags & MAPDEV_SETATTR) != 0) { 9237 PMAP_LOCK(kernel_pmap); 9238 i = pmap_change_props_locked(va, size, 9239 PROT_NONE, mode, flags); 9240 PMAP_UNLOCK(kernel_pmap); 9241 } else 9242 i = 0; 9243 if (!i) 9244 return ((void *)(va + offset)); 9245 } 9246 va = kva_alloc(size); 9247 if (va == 0) 9248 panic("%s: Couldn't allocate KVA", __func__); 9249 } 9250 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9251 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9252 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9253 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9254 pmap_invalidate_cache_range(va, va + tmpsize); 9255 return ((void *)(va + offset)); 9256 } 9257 9258 void * 9259 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9260 { 9261 9262 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9263 MAPDEV_SETATTR)); 9264 } 9265 9266 void * 9267 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9268 { 9269 9270 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9271 } 9272 9273 void * 9274 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9275 { 9276 9277 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9278 MAPDEV_SETATTR)); 9279 } 9280 9281 void * 9282 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9283 { 9284 9285 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9286 MAPDEV_FLUSHCACHE)); 9287 } 9288 9289 void 9290 pmap_unmapdev(vm_offset_t va, vm_size_t size) 9291 { 9292 struct pmap_preinit_mapping *ppim; 9293 vm_offset_t offset; 9294 int i; 9295 9296 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9297 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9298 return; 9299 offset = va & PAGE_MASK; 9300 size = round_page(offset + size); 9301 va = 
trunc_page(va); 9302 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9303 ppim = pmap_preinit_mapping + i; 9304 if (ppim->va == va && ppim->sz == size) { 9305 if (pmap_initialized) 9306 return; 9307 ppim->pa = 0; 9308 ppim->va = 0; 9309 ppim->sz = 0; 9310 ppim->mode = 0; 9311 if (va + size == virtual_avail) 9312 virtual_avail = va; 9313 return; 9314 } 9315 } 9316 if (pmap_initialized) { 9317 pmap_qremove(va, atop(size)); 9318 kva_free(va, size); 9319 } 9320 } 9321 9322 /* 9323 * Tries to demote a 1GB page mapping. 9324 */ 9325 static boolean_t 9326 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9327 { 9328 pdp_entry_t newpdpe, oldpdpe; 9329 pd_entry_t *firstpde, newpde, *pde; 9330 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9331 vm_paddr_t pdpgpa; 9332 vm_page_t pdpg; 9333 9334 PG_A = pmap_accessed_bit(pmap); 9335 PG_M = pmap_modified_bit(pmap); 9336 PG_V = pmap_valid_bit(pmap); 9337 PG_RW = pmap_rw_bit(pmap); 9338 9339 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9340 oldpdpe = *pdpe; 9341 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9342 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9343 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9344 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9345 if (pdpg == NULL) { 9346 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9347 " in pmap %p", va, pmap); 9348 return (FALSE); 9349 } 9350 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9351 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9352 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9353 KASSERT((oldpdpe & PG_A) != 0, 9354 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9355 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9356 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9357 newpde = oldpdpe; 9358 9359 /* 9360 * Initialize the page directory page. 9361 */ 9362 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9363 *pde = newpde; 9364 newpde += NBPDR; 9365 } 9366 9367 /* 9368 * Demote the mapping. 9369 */ 9370 *pdpe = newpdpe; 9371 9372 /* 9373 * Invalidate a stale recursive mapping of the page directory page. 9374 */ 9375 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9376 9377 counter_u64_add(pmap_pdpe_demotions, 1); 9378 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9379 " in pmap %p", va, pmap); 9380 return (TRUE); 9381 } 9382 9383 /* 9384 * Sets the memory attribute for the specified page. 9385 */ 9386 void 9387 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9388 { 9389 9390 m->md.pat_mode = ma; 9391 9392 /* 9393 * If "m" is a normal page, update its direct mapping. This update 9394 * can be relied upon to perform any cache operations that are 9395 * required for data coherence. 9396 */ 9397 if ((m->flags & PG_FICTITIOUS) == 0 && 9398 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9399 m->md.pat_mode)) 9400 panic("memory attribute change on the direct map failed"); 9401 } 9402 9403 void 9404 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9405 { 9406 int error; 9407 9408 m->md.pat_mode = ma; 9409 9410 if ((m->flags & PG_FICTITIOUS) != 0) 9411 return; 9412 PMAP_LOCK(kernel_pmap); 9413 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9414 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9415 PMAP_UNLOCK(kernel_pmap); 9416 if (error != 0) 9417 panic("memory attribute change on the direct map failed"); 9418 } 9419 9420 /* 9421 * Changes the specified virtual address range's memory type to that given by 9422 * the parameter "mode". 
The specified virtual address range must be 9423 * completely contained within either the direct map or the kernel map. If 9424 * the virtual address range is contained within the kernel map, then the 9425 * memory type for each of the corresponding ranges of the direct map is also 9426 * changed. (The corresponding ranges of the direct map are those ranges that 9427 * map the same physical pages as the specified virtual address range.) These 9428 * changes to the direct map are necessary because Intel describes the 9429 * behavior of their processors as "undefined" if two or more mappings to the 9430 * same physical page have different memory types. 9431 * 9432 * Returns zero if the change completed successfully, and either EINVAL or 9433 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9434 * of the virtual address range was not mapped, and ENOMEM is returned if 9435 * there was insufficient memory available to complete the change. In the 9436 * latter case, the memory type may have been changed on some part of the 9437 * virtual address range or the direct map. 9438 */ 9439 int 9440 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9441 { 9442 int error; 9443 9444 PMAP_LOCK(kernel_pmap); 9445 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9446 MAPDEV_FLUSHCACHE); 9447 PMAP_UNLOCK(kernel_pmap); 9448 return (error); 9449 } 9450 9451 /* 9452 * Changes the specified virtual address range's protections to those 9453 * specified by "prot". Like pmap_change_attr(), protections for aliases 9454 * in the direct map are updated as well. Protections on aliasing mappings may 9455 * be a subset of the requested protections; for example, mappings in the direct 9456 * map are never executable. 9457 */ 9458 int 9459 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9460 { 9461 int error; 9462 9463 /* Only supported within the kernel map. */ 9464 if (va < VM_MIN_KERNEL_ADDRESS) 9465 return (EINVAL); 9466 9467 PMAP_LOCK(kernel_pmap); 9468 error = pmap_change_props_locked(va, size, prot, -1, 9469 MAPDEV_ASSERTVALID); 9470 PMAP_UNLOCK(kernel_pmap); 9471 return (error); 9472 } 9473 9474 static int 9475 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9476 int mode, int flags) 9477 { 9478 vm_offset_t base, offset, tmpva; 9479 vm_paddr_t pa_start, pa_end, pa_end1; 9480 pdp_entry_t *pdpe; 9481 pd_entry_t *pde, pde_bits, pde_mask; 9482 pt_entry_t *pte, pte_bits, pte_mask; 9483 int error; 9484 bool changed; 9485 9486 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9487 base = trunc_page(va); 9488 offset = va & PAGE_MASK; 9489 size = round_page(offset + size); 9490 9491 /* 9492 * Only supported on kernel virtual addresses, including the direct 9493 * map but excluding the recursive map. 9494 */ 9495 if (base < DMAP_MIN_ADDRESS) 9496 return (EINVAL); 9497 9498 /* 9499 * Construct our flag sets and masks. "bits" is the subset of 9500 * "mask" that will be set in each modified PTE. 9501 * 9502 * Mappings in the direct map are never allowed to be executable. 
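	 *
	 * For instance, a request to make a range read-only and
	 * non-executable (prot == VM_PROT_READ, mode == -1) results in
	 *
	 *	pde_bits = pte_bits = pg_nx;
	 *	pde_mask = pte_mask = X86_PG_RW | pg_nx;
	 *
	 * so X86_PG_RW is cleared and the NX bit is set in every affected
	 * entry.  Separate PDE and PTE variants are needed because the PAT
	 * bit occupies bit 7 in 4KB PTEs but bit 12 in 2MB/1GB entries
	 * (bit 7 is PS there), which is why X86_PG_PTE_CACHE and
	 * X86_PG_PDE_CACHE differ.  A direct map address also gets pg_nx
	 * even if VM_PROT_EXECUTE was requested, enforcing the rule above.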
9503 */ 9504 pde_bits = pte_bits = 0; 9505 pde_mask = pte_mask = 0; 9506 if (mode != -1) { 9507 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9508 pde_mask |= X86_PG_PDE_CACHE; 9509 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9510 pte_mask |= X86_PG_PTE_CACHE; 9511 } 9512 if (prot != VM_PROT_NONE) { 9513 if ((prot & VM_PROT_WRITE) != 0) { 9514 pde_bits |= X86_PG_RW; 9515 pte_bits |= X86_PG_RW; 9516 } 9517 if ((prot & VM_PROT_EXECUTE) == 0 || 9518 va < VM_MIN_KERNEL_ADDRESS) { 9519 pde_bits |= pg_nx; 9520 pte_bits |= pg_nx; 9521 } 9522 pde_mask |= X86_PG_RW | pg_nx; 9523 pte_mask |= X86_PG_RW | pg_nx; 9524 } 9525 9526 /* 9527 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9528 * into 4KB pages if required. 9529 */ 9530 for (tmpva = base; tmpva < base + size; ) { 9531 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9532 if (pdpe == NULL || *pdpe == 0) { 9533 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9534 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9535 return (EINVAL); 9536 } 9537 if (*pdpe & PG_PS) { 9538 /* 9539 * If the current 1GB page already has the required 9540 * properties, then we need not demote this page. Just 9541 * increment tmpva to the next 1GB page frame. 9542 */ 9543 if ((*pdpe & pde_mask) == pde_bits) { 9544 tmpva = trunc_1gpage(tmpva) + NBPDP; 9545 continue; 9546 } 9547 9548 /* 9549 * If the current offset aligns with a 1GB page frame 9550 * and there is at least 1GB left within the range, then 9551 * we need not break down this page into 2MB pages. 9552 */ 9553 if ((tmpva & PDPMASK) == 0 && 9554 tmpva + PDPMASK < base + size) { 9555 tmpva += NBPDP; 9556 continue; 9557 } 9558 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9559 return (ENOMEM); 9560 } 9561 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9562 if (*pde == 0) { 9563 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9564 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9565 return (EINVAL); 9566 } 9567 if (*pde & PG_PS) { 9568 /* 9569 * If the current 2MB page already has the required 9570 * properties, then we need not demote this page. Just 9571 * increment tmpva to the next 2MB page frame. 9572 */ 9573 if ((*pde & pde_mask) == pde_bits) { 9574 tmpva = trunc_2mpage(tmpva) + NBPDR; 9575 continue; 9576 } 9577 9578 /* 9579 * If the current offset aligns with a 2MB page frame 9580 * and there is at least 2MB left within the range, then 9581 * we need not break down this page into 4KB pages. 9582 */ 9583 if ((tmpva & PDRMASK) == 0 && 9584 tmpva + PDRMASK < base + size) { 9585 tmpva += NBPDR; 9586 continue; 9587 } 9588 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9589 return (ENOMEM); 9590 } 9591 pte = pmap_pde_to_pte(pde, tmpva); 9592 if (*pte == 0) { 9593 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9594 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9595 return (EINVAL); 9596 } 9597 tmpva += PAGE_SIZE; 9598 } 9599 error = 0; 9600 9601 /* 9602 * Ok, all the pages exist, so run through them updating their 9603 * properties if required. 9604 */ 9605 changed = false; 9606 pa_start = pa_end = 0; 9607 for (tmpva = base; tmpva < base + size; ) { 9608 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9609 if (*pdpe & PG_PS) { 9610 if ((*pdpe & pde_mask) != pde_bits) { 9611 pmap_pte_props(pdpe, pde_bits, pde_mask); 9612 changed = true; 9613 } 9614 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9615 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9616 if (pa_start == pa_end) { 9617 /* Start physical address run. 
*/ 9618 pa_start = *pdpe & PG_PS_FRAME; 9619 pa_end = pa_start + NBPDP; 9620 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9621 pa_end += NBPDP; 9622 else { 9623 /* Run ended, update direct map. */ 9624 error = pmap_change_props_locked( 9625 PHYS_TO_DMAP(pa_start), 9626 pa_end - pa_start, prot, mode, 9627 flags); 9628 if (error != 0) 9629 break; 9630 /* Start physical address run. */ 9631 pa_start = *pdpe & PG_PS_FRAME; 9632 pa_end = pa_start + NBPDP; 9633 } 9634 } 9635 tmpva = trunc_1gpage(tmpva) + NBPDP; 9636 continue; 9637 } 9638 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9639 if (*pde & PG_PS) { 9640 if ((*pde & pde_mask) != pde_bits) { 9641 pmap_pte_props(pde, pde_bits, pde_mask); 9642 changed = true; 9643 } 9644 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9645 (*pde & PG_PS_FRAME) < dmaplimit) { 9646 if (pa_start == pa_end) { 9647 /* Start physical address run. */ 9648 pa_start = *pde & PG_PS_FRAME; 9649 pa_end = pa_start + NBPDR; 9650 } else if (pa_end == (*pde & PG_PS_FRAME)) 9651 pa_end += NBPDR; 9652 else { 9653 /* Run ended, update direct map. */ 9654 error = pmap_change_props_locked( 9655 PHYS_TO_DMAP(pa_start), 9656 pa_end - pa_start, prot, mode, 9657 flags); 9658 if (error != 0) 9659 break; 9660 /* Start physical address run. */ 9661 pa_start = *pde & PG_PS_FRAME; 9662 pa_end = pa_start + NBPDR; 9663 } 9664 } 9665 tmpva = trunc_2mpage(tmpva) + NBPDR; 9666 } else { 9667 pte = pmap_pde_to_pte(pde, tmpva); 9668 if ((*pte & pte_mask) != pte_bits) { 9669 pmap_pte_props(pte, pte_bits, pte_mask); 9670 changed = true; 9671 } 9672 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9673 (*pte & PG_FRAME) < dmaplimit) { 9674 if (pa_start == pa_end) { 9675 /* Start physical address run. */ 9676 pa_start = *pte & PG_FRAME; 9677 pa_end = pa_start + PAGE_SIZE; 9678 } else if (pa_end == (*pte & PG_FRAME)) 9679 pa_end += PAGE_SIZE; 9680 else { 9681 /* Run ended, update direct map. */ 9682 error = pmap_change_props_locked( 9683 PHYS_TO_DMAP(pa_start), 9684 pa_end - pa_start, prot, mode, 9685 flags); 9686 if (error != 0) 9687 break; 9688 /* Start physical address run. */ 9689 pa_start = *pte & PG_FRAME; 9690 pa_end = pa_start + PAGE_SIZE; 9691 } 9692 } 9693 tmpva += PAGE_SIZE; 9694 } 9695 } 9696 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9697 pa_end1 = MIN(pa_end, dmaplimit); 9698 if (pa_start != pa_end1) 9699 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9700 pa_end1 - pa_start, prot, mode, flags); 9701 } 9702 9703 /* 9704 * Flush CPU caches if required to make sure any data isn't cached that 9705 * shouldn't be, etc. 9706 */ 9707 if (changed) { 9708 pmap_invalidate_range(kernel_pmap, base, tmpva); 9709 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9710 pmap_invalidate_cache_range(base, tmpva); 9711 } 9712 return (error); 9713 } 9714 9715 /* 9716 * Demotes any mapping within the direct map region that covers more than the 9717 * specified range of physical addresses. This range's size must be a power 9718 * of two and its starting address must be a multiple of its size. Since the 9719 * demotion does not change any attributes of the mapping, a TLB invalidation 9720 * is not mandatory. The caller may, however, request a TLB invalidation. 
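 *
 * As an illustrative aside (not part of the original sources), a caller
 * that wants a single 4KB physical page to be mapped by its own 4KB PTE
 * in the direct map could use the interface as follows; the helper name
 * is hypothetical, and a single page trivially satisfies the power-of-two
 * size and alignment requirements:
 */

/* Hypothetical usage sketch only; nothing in the tree calls this. */
static void __unused
example_isolate_dmap_page(vm_paddr_t pa)
{

	/* Demote any 1GB/2MB DMAP mapping covering "pa" and flush the TLB. */
	pmap_demote_DMAP(trunc_page(pa), PAGE_SIZE, TRUE);
}

/*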
9721 */ 9722 void 9723 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 9724 { 9725 pdp_entry_t *pdpe; 9726 pd_entry_t *pde; 9727 vm_offset_t va; 9728 boolean_t changed; 9729 9730 if (len == 0) 9731 return; 9732 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9733 KASSERT((base & (len - 1)) == 0, 9734 ("pmap_demote_DMAP: base is not a multiple of len")); 9735 if (len < NBPDP && base < dmaplimit) { 9736 va = PHYS_TO_DMAP(base); 9737 changed = FALSE; 9738 PMAP_LOCK(kernel_pmap); 9739 pdpe = pmap_pdpe(kernel_pmap, va); 9740 if ((*pdpe & X86_PG_V) == 0) 9741 panic("pmap_demote_DMAP: invalid PDPE"); 9742 if ((*pdpe & PG_PS) != 0) { 9743 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9744 panic("pmap_demote_DMAP: PDPE failed"); 9745 changed = TRUE; 9746 } 9747 if (len < NBPDR) { 9748 pde = pmap_pdpe_to_pde(pdpe, va); 9749 if ((*pde & X86_PG_V) == 0) 9750 panic("pmap_demote_DMAP: invalid PDE"); 9751 if ((*pde & PG_PS) != 0) { 9752 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9753 panic("pmap_demote_DMAP: PDE failed"); 9754 changed = TRUE; 9755 } 9756 } 9757 if (changed && invalidate) 9758 pmap_invalidate_page(kernel_pmap, va); 9759 PMAP_UNLOCK(kernel_pmap); 9760 } 9761 } 9762 9763 /* 9764 * Perform the pmap work for mincore(2). If the page is not both referenced and 9765 * modified by this pmap, returns its physical address so that the caller can 9766 * find other mappings. 9767 */ 9768 int 9769 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9770 { 9771 pdp_entry_t *pdpe; 9772 pd_entry_t *pdep; 9773 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 9774 vm_paddr_t pa; 9775 int val; 9776 9777 PG_A = pmap_accessed_bit(pmap); 9778 PG_M = pmap_modified_bit(pmap); 9779 PG_V = pmap_valid_bit(pmap); 9780 PG_RW = pmap_rw_bit(pmap); 9781 9782 PMAP_LOCK(pmap); 9783 pte = 0; 9784 pa = 0; 9785 val = 0; 9786 pdpe = pmap_pdpe(pmap, addr); 9787 if (pdpe == NULL) 9788 goto out; 9789 if ((*pdpe & PG_V) != 0) { 9790 if ((*pdpe & PG_PS) != 0) { 9791 pte = *pdpe; 9792 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 9793 PG_FRAME; 9794 val = MINCORE_PSIND(2); 9795 } else { 9796 pdep = pmap_pde(pmap, addr); 9797 if (pdep != NULL && (*pdep & PG_V) != 0) { 9798 if ((*pdep & PG_PS) != 0) { 9799 pte = *pdep; 9800 /* Compute the physical address of the 4KB page. */ 9801 pa = ((pte & PG_PS_FRAME) | (addr & 9802 PDRMASK)) & PG_FRAME; 9803 val = MINCORE_PSIND(1); 9804 } else { 9805 pte = *pmap_pde_to_pte(pdep, addr); 9806 pa = pte & PG_FRAME; 9807 val = 0; 9808 } 9809 } 9810 } 9811 } 9812 if ((pte & PG_V) != 0) { 9813 val |= MINCORE_INCORE; 9814 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9815 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 9816 if ((pte & PG_A) != 0) 9817 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 9818 } 9819 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 9820 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 9821 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 9822 *pap = pa; 9823 } 9824 out: 9825 PMAP_UNLOCK(pmap); 9826 return (val); 9827 } 9828 9829 static uint64_t 9830 pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 9831 { 9832 uint32_t gen, new_gen, pcid_next; 9833 9834 CRITICAL_ASSERT(curthread); 9835 gen = PCPU_GET(pcid_gen); 9836 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) 9837 return (pti ? 
0 : CR3_PCID_SAVE); 9838 if (pmap->pm_pcids[cpuid].pm_gen == gen) 9839 return (CR3_PCID_SAVE); 9840 pcid_next = PCPU_GET(pcid_next); 9841 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 9842 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 9843 ("cpu %d pcid_next %#x", cpuid, pcid_next)); 9844 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 9845 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 9846 new_gen = gen + 1; 9847 if (new_gen == 0) 9848 new_gen = 1; 9849 PCPU_SET(pcid_gen, new_gen); 9850 pcid_next = PMAP_PCID_KERN + 1; 9851 } else { 9852 new_gen = gen; 9853 } 9854 pmap->pm_pcids[cpuid].pm_pcid = pcid_next; 9855 pmap->pm_pcids[cpuid].pm_gen = new_gen; 9856 PCPU_SET(pcid_next, pcid_next + 1); 9857 return (0); 9858 } 9859 9860 static uint64_t 9861 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) 9862 { 9863 uint64_t cached; 9864 9865 cached = pmap_pcid_alloc(pmap, cpuid); 9866 KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, 9867 ("pmap %p cpu %d pcid %#x", pmap, cpuid, 9868 pmap->pm_pcids[cpuid].pm_pcid)); 9869 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || 9870 pmap == kernel_pmap, 9871 ("non-kernel pmap pmap %p cpu %d pcid %#x", 9872 pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); 9873 return (cached); 9874 } 9875 9876 static void 9877 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 9878 { 9879 9880 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 9881 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; 9882 } 9883 9884 static void 9885 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 9886 { 9887 pmap_t old_pmap; 9888 uint64_t cached, cr3, kcr3, ucr3; 9889 9890 KASSERT((read_rflags() & PSL_I) == 0, 9891 ("PCID needs interrupts disabled in pmap_activate_sw()")); 9892 9893 /* See the comment in pmap_invalidate_page_pcid(). 
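	 * Roughly: a racing invalidation may have set the per-CPU
	 * ucr3_load_mask to force a TLB flush on the next return to user
	 * mode.  That request belongs to the outgoing pmap, so it is
	 * honored by zapping that pmap's PCID generation (pm_gen = 0),
	 * and the mask is reset for the incoming pmap.  Further down the
	 * saved %cr3 values are composed roughly as
	 *
	 *	kcr3 = pm_cr3  | pcid | CR3_PCID_SAVE;
	 *	ucr3 = pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
	 *
	 * where CR3_PCID_SAVE (the "no flush" bit) preserves TLB entries
	 * tagged with the PCID across the %cr3 load, and PMAP_PCID_USER_PT
	 * gives the user page table its own PCID so kernel and user
	 * translations are tagged separately.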
*/ 9894 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 9895 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 9896 old_pmap = PCPU_GET(curpmap); 9897 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 9898 old_pmap->pm_pcids[cpuid].pm_gen = 0; 9899 } 9900 9901 cached = pmap_pcid_alloc_checked(pmap, cpuid); 9902 cr3 = rcr3(); 9903 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 9904 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); 9905 PCPU_SET(curpmap, pmap); 9906 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; 9907 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | 9908 PMAP_PCID_USER_PT; 9909 9910 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 9911 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 9912 9913 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 9914 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 9915 if (cached) 9916 counter_u64_add(pcid_save_cnt, 1); 9917 9918 pmap_activate_sw_pti_post(td, pmap); 9919 } 9920 9921 static void 9922 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 9923 u_int cpuid) 9924 { 9925 uint64_t cached, cr3; 9926 9927 KASSERT((read_rflags() & PSL_I) == 0, 9928 ("PCID needs interrupts disabled in pmap_activate_sw()")); 9929 9930 cached = pmap_pcid_alloc_checked(pmap, cpuid); 9931 cr3 = rcr3(); 9932 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 9933 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 9934 cached); 9935 PCPU_SET(curpmap, pmap); 9936 if (cached) 9937 counter_u64_add(pcid_save_cnt, 1); 9938 } 9939 9940 static void 9941 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 9942 u_int cpuid __unused) 9943 { 9944 9945 load_cr3(pmap->pm_cr3); 9946 PCPU_SET(curpmap, pmap); 9947 } 9948 9949 static void 9950 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 9951 u_int cpuid __unused) 9952 { 9953 9954 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 9955 PCPU_SET(kcr3, pmap->pm_cr3); 9956 PCPU_SET(ucr3, pmap->pm_ucr3); 9957 pmap_activate_sw_pti_post(td, pmap); 9958 } 9959 9960 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 9961 u_int)) 9962 { 9963 9964 if (pmap_pcid_enabled && pti) 9965 return (pmap_activate_sw_pcid_pti); 9966 else if (pmap_pcid_enabled && !pti) 9967 return (pmap_activate_sw_pcid_nopti); 9968 else if (!pmap_pcid_enabled && pti) 9969 return (pmap_activate_sw_nopcid_pti); 9970 else /* if (!pmap_pcid_enabled && !pti) */ 9971 return (pmap_activate_sw_nopcid_nopti); 9972 } 9973 9974 void 9975 pmap_activate_sw(struct thread *td) 9976 { 9977 pmap_t oldpmap, pmap; 9978 u_int cpuid; 9979 9980 oldpmap = PCPU_GET(curpmap); 9981 pmap = vmspace_pmap(td->td_proc->p_vmspace); 9982 if (oldpmap == pmap) { 9983 if (cpu_vendor_id != CPU_VENDOR_INTEL) 9984 mfence(); 9985 return; 9986 } 9987 cpuid = PCPU_GET(cpuid); 9988 #ifdef SMP 9989 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 9990 #else 9991 CPU_SET(cpuid, &pmap->pm_active); 9992 #endif 9993 pmap_activate_sw_mode(td, pmap, cpuid); 9994 #ifdef SMP 9995 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 9996 #else 9997 CPU_CLR(cpuid, &oldpmap->pm_active); 9998 #endif 9999 } 10000 10001 void 10002 pmap_activate(struct thread *td) 10003 { 10004 /* 10005 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10006 * invalidate_all IPI, which checks for curpmap == 10007 * smp_tlb_pmap. The below sequence of operations has a 10008 * window where %CR3 is loaded with the new pmap's PML4 10009 * address, but the curpmap value has not yet been updated. 
10010 * This causes the invltlb IPI handler, which is called 10011 * between the updates, to execute as a NOP, which leaves 10012 * stale TLB entries. 10013 * 10014 * Note that the most common use of pmap_activate_sw(), from 10015 * a context switch, is immune to this race, because 10016 * interrupts are disabled (while the thread lock is owned), 10017 * so the IPI is delayed until after curpmap is updated. Protect 10018 * other callers in a similar way, by disabling interrupts 10019 * around the %cr3 register reload and curpmap assignment. 10020 */ 10021 spinlock_enter(); 10022 pmap_activate_sw(td); 10023 spinlock_exit(); 10024 } 10025 10026 void 10027 pmap_activate_boot(pmap_t pmap) 10028 { 10029 uint64_t kcr3; 10030 u_int cpuid; 10031 10032 /* 10033 * kernel_pmap must be never deactivated, and we ensure that 10034 * by never activating it at all. 10035 */ 10036 MPASS(pmap != kernel_pmap); 10037 10038 cpuid = PCPU_GET(cpuid); 10039 #ifdef SMP 10040 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10041 #else 10042 CPU_SET(cpuid, &pmap->pm_active); 10043 #endif 10044 PCPU_SET(curpmap, pmap); 10045 if (pti) { 10046 kcr3 = pmap->pm_cr3; 10047 if (pmap_pcid_enabled) 10048 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; 10049 } else { 10050 kcr3 = PMAP_NO_CR3; 10051 } 10052 PCPU_SET(kcr3, kcr3); 10053 PCPU_SET(ucr3, PMAP_NO_CR3); 10054 } 10055 10056 void 10057 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 10058 { 10059 } 10060 10061 /* 10062 * Increase the starting virtual address of the given mapping if a 10063 * different alignment might result in more superpage mappings. 10064 */ 10065 void 10066 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10067 vm_offset_t *addr, vm_size_t size) 10068 { 10069 vm_offset_t superpage_offset; 10070 10071 if (size < NBPDR) 10072 return; 10073 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10074 offset += ptoa(object->pg_color); 10075 superpage_offset = offset & PDRMASK; 10076 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10077 (*addr & PDRMASK) == superpage_offset) 10078 return; 10079 if ((*addr & PDRMASK) < superpage_offset) 10080 *addr = (*addr & ~PDRMASK) + superpage_offset; 10081 else 10082 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10083 } 10084 10085 #ifdef INVARIANTS 10086 static unsigned long num_dirty_emulations; 10087 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10088 &num_dirty_emulations, 0, NULL); 10089 10090 static unsigned long num_accessed_emulations; 10091 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10092 &num_accessed_emulations, 0, NULL); 10093 10094 static unsigned long num_superpage_accessed_emulations; 10095 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10096 &num_superpage_accessed_emulations, 0, NULL); 10097 10098 static unsigned long ad_emulation_superpage_promotions; 10099 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10100 &ad_emulation_superpage_promotions, 0, NULL); 10101 #endif /* INVARIANTS */ 10102 10103 int 10104 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10105 { 10106 int rv; 10107 struct rwlock *lock; 10108 #if VM_NRESERVLEVEL > 0 10109 vm_page_t m, mpte; 10110 #endif 10111 pd_entry_t *pde; 10112 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10113 10114 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10115 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10116 10117 if (!pmap_emulate_ad_bits(pmap)) 
10118 return (-1); 10119 10120 PG_A = pmap_accessed_bit(pmap); 10121 PG_M = pmap_modified_bit(pmap); 10122 PG_V = pmap_valid_bit(pmap); 10123 PG_RW = pmap_rw_bit(pmap); 10124 10125 rv = -1; 10126 lock = NULL; 10127 PMAP_LOCK(pmap); 10128 10129 pde = pmap_pde(pmap, va); 10130 if (pde == NULL || (*pde & PG_V) == 0) 10131 goto done; 10132 10133 if ((*pde & PG_PS) != 0) { 10134 if (ftype == VM_PROT_READ) { 10135 #ifdef INVARIANTS 10136 atomic_add_long(&num_superpage_accessed_emulations, 1); 10137 #endif 10138 *pde |= PG_A; 10139 rv = 0; 10140 } 10141 goto done; 10142 } 10143 10144 pte = pmap_pde_to_pte(pde, va); 10145 if ((*pte & PG_V) == 0) 10146 goto done; 10147 10148 if (ftype == VM_PROT_WRITE) { 10149 if ((*pte & PG_RW) == 0) 10150 goto done; 10151 /* 10152 * Set the modified and accessed bits simultaneously. 10153 * 10154 * Intel EPT PTEs that do software emulation of A/D bits map 10155 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10156 * An EPT misconfiguration is triggered if the PTE is writable 10157 * but not readable (WR=10). This is avoided by setting PG_A 10158 * and PG_M simultaneously. 10159 */ 10160 *pte |= PG_M | PG_A; 10161 } else { 10162 *pte |= PG_A; 10163 } 10164 10165 #if VM_NRESERVLEVEL > 0 10166 /* try to promote the mapping */ 10167 if (va < VM_MAXUSER_ADDRESS) 10168 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10169 else 10170 mpte = NULL; 10171 10172 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10173 10174 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10175 pmap_ps_enabled(pmap) && 10176 (m->flags & PG_FICTITIOUS) == 0 && 10177 vm_reserv_level_iffullpop(m) == 0) { 10178 pmap_promote_pde(pmap, pde, va, &lock); 10179 #ifdef INVARIANTS 10180 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10181 #endif 10182 } 10183 #endif 10184 10185 #ifdef INVARIANTS 10186 if (ftype == VM_PROT_WRITE) 10187 atomic_add_long(&num_dirty_emulations, 1); 10188 else 10189 atomic_add_long(&num_accessed_emulations, 1); 10190 #endif 10191 rv = 0; /* success */ 10192 done: 10193 if (lock != NULL) 10194 rw_wunlock(lock); 10195 PMAP_UNLOCK(pmap); 10196 return (rv); 10197 } 10198 10199 void 10200 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10201 { 10202 pml4_entry_t *pml4; 10203 pdp_entry_t *pdp; 10204 pd_entry_t *pde; 10205 pt_entry_t *pte, PG_V; 10206 int idx; 10207 10208 idx = 0; 10209 PG_V = pmap_valid_bit(pmap); 10210 PMAP_LOCK(pmap); 10211 10212 pml4 = pmap_pml4e(pmap, va); 10213 if (pml4 == NULL) 10214 goto done; 10215 ptr[idx++] = *pml4; 10216 if ((*pml4 & PG_V) == 0) 10217 goto done; 10218 10219 pdp = pmap_pml4e_to_pdpe(pml4, va); 10220 ptr[idx++] = *pdp; 10221 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10222 goto done; 10223 10224 pde = pmap_pdpe_to_pde(pdp, va); 10225 ptr[idx++] = *pde; 10226 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10227 goto done; 10228 10229 pte = pmap_pde_to_pte(pde, va); 10230 ptr[idx++] = *pte; 10231 10232 done: 10233 PMAP_UNLOCK(pmap); 10234 *num = idx; 10235 } 10236 10237 /** 10238 * Get the kernel virtual address of a set of physical pages. If there are 10239 * physical addresses not covered by the DMAP perform a transient mapping 10240 * that will be removed when calling pmap_unmap_io_transient. 10241 * 10242 * \param page The pages the caller wishes to obtain the virtual 10243 * address on the kernel memory map. 10244 * \param vaddr On return contains the kernel virtual memory address 10245 * of the pages passed in the page parameter. 10246 * \param count Number of pages passed in. 
10247 * \param can_fault TRUE if the thread using the mapped pages can take 10248 * page faults, FALSE otherwise. 10249 * 10250 * \returns TRUE if the caller must call pmap_unmap_io_transient when 10251 * finished or FALSE otherwise. 10252 * 10253 */ 10254 boolean_t 10255 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10256 boolean_t can_fault) 10257 { 10258 vm_paddr_t paddr; 10259 boolean_t needs_mapping; 10260 pt_entry_t *pte; 10261 int cache_bits, error __unused, i; 10262 10263 /* 10264 * Allocate any KVA space that we need, this is done in a separate 10265 * loop to prevent calling vmem_alloc while pinned. 10266 */ 10267 needs_mapping = FALSE; 10268 for (i = 0; i < count; i++) { 10269 paddr = VM_PAGE_TO_PHYS(page[i]); 10270 if (__predict_false(paddr >= dmaplimit)) { 10271 error = vmem_alloc(kernel_arena, PAGE_SIZE, 10272 M_BESTFIT | M_WAITOK, &vaddr[i]); 10273 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 10274 needs_mapping = TRUE; 10275 } else { 10276 vaddr[i] = PHYS_TO_DMAP(paddr); 10277 } 10278 } 10279 10280 /* Exit early if everything is covered by the DMAP */ 10281 if (!needs_mapping) 10282 return (FALSE); 10283 10284 /* 10285 * NB: The sequence of updating a page table followed by accesses 10286 * to the corresponding pages used in the !DMAP case is subject to 10287 * the situation described in the "AMD64 Architecture Programmer's 10288 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 10289 * Coherency Considerations". Therefore, issuing the INVLPG right 10290 * after modifying the PTE bits is crucial. 10291 */ 10292 if (!can_fault) 10293 sched_pin(); 10294 for (i = 0; i < count; i++) { 10295 paddr = VM_PAGE_TO_PHYS(page[i]); 10296 if (paddr >= dmaplimit) { 10297 if (can_fault) { 10298 /* 10299 * Slow path, since we can get page faults 10300 * while mappings are active don't pin the 10301 * thread to the CPU and instead add a global 10302 * mapping visible to all CPUs. 
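				 * By contrast, the pinned path below
				 * installs the PTE with pte_store() and
				 * issues only a local invlpg(), so that
				 * mapping is valid only on the CPU the
				 * thread was pinned to by sched_pin()
				 * above.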
10303 */ 10304 pmap_qenter(vaddr[i], &page[i], 1); 10305 } else { 10306 pte = vtopte(vaddr[i]); 10307 cache_bits = pmap_cache_bits(kernel_pmap, 10308 page[i]->md.pat_mode, 0); 10309 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 10310 cache_bits); 10311 invlpg(vaddr[i]); 10312 } 10313 } 10314 } 10315 10316 return (needs_mapping); 10317 } 10318 10319 void 10320 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10321 boolean_t can_fault) 10322 { 10323 vm_paddr_t paddr; 10324 int i; 10325 10326 if (!can_fault) 10327 sched_unpin(); 10328 for (i = 0; i < count; i++) { 10329 paddr = VM_PAGE_TO_PHYS(page[i]); 10330 if (paddr >= dmaplimit) { 10331 if (can_fault) 10332 pmap_qremove(vaddr[i], 1); 10333 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 10334 } 10335 } 10336 } 10337 10338 vm_offset_t 10339 pmap_quick_enter_page(vm_page_t m) 10340 { 10341 vm_paddr_t paddr; 10342 10343 paddr = VM_PAGE_TO_PHYS(m); 10344 if (paddr < dmaplimit) 10345 return (PHYS_TO_DMAP(paddr)); 10346 mtx_lock_spin(&qframe_mtx); 10347 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 10348 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10349 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 10350 return (qframe); 10351 } 10352 10353 void 10354 pmap_quick_remove_page(vm_offset_t addr) 10355 { 10356 10357 if (addr != qframe) 10358 return; 10359 pte_store(vtopte(qframe), 0); 10360 invlpg(qframe); 10361 mtx_unlock_spin(&qframe_mtx); 10362 } 10363 10364 /* 10365 * Pdp pages from the large map are managed differently from either 10366 * kernel or user page table pages. They are permanently allocated at 10367 * initialization time, and their reference count is permanently set to 10368 * zero. The pml4 entries pointing to those pages are copied into 10369 * each allocated pmap. 10370 * 10371 * In contrast, pd and pt pages are managed like user page table 10372 * pages. They are dynamically allocated, and their reference count 10373 * represents the number of valid entries within the page. 10374 */ 10375 static vm_page_t 10376 pmap_large_map_getptp_unlocked(void) 10377 { 10378 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10379 } 10380 10381 static vm_page_t 10382 pmap_large_map_getptp(void) 10383 { 10384 vm_page_t m; 10385 10386 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10387 m = pmap_large_map_getptp_unlocked(); 10388 if (m == NULL) { 10389 PMAP_UNLOCK(kernel_pmap); 10390 vm_wait(NULL); 10391 PMAP_LOCK(kernel_pmap); 10392 /* Callers retry. 
*/ 10393 } 10394 return (m); 10395 } 10396 10397 static pdp_entry_t * 10398 pmap_large_map_pdpe(vm_offset_t va) 10399 { 10400 vm_pindex_t pml4_idx; 10401 vm_paddr_t mphys; 10402 10403 pml4_idx = pmap_pml4e_index(va); 10404 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10405 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10406 "%#jx lm_ents %d", 10407 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10408 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10409 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10410 "LMSPML4I %#jx lm_ents %d", 10411 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10412 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10413 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10414 } 10415 10416 static pd_entry_t * 10417 pmap_large_map_pde(vm_offset_t va) 10418 { 10419 pdp_entry_t *pdpe; 10420 vm_page_t m; 10421 vm_paddr_t mphys; 10422 10423 retry: 10424 pdpe = pmap_large_map_pdpe(va); 10425 if (*pdpe == 0) { 10426 m = pmap_large_map_getptp(); 10427 if (m == NULL) 10428 goto retry; 10429 mphys = VM_PAGE_TO_PHYS(m); 10430 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10431 } else { 10432 MPASS((*pdpe & X86_PG_PS) == 0); 10433 mphys = *pdpe & PG_FRAME; 10434 } 10435 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10436 } 10437 10438 static pt_entry_t * 10439 pmap_large_map_pte(vm_offset_t va) 10440 { 10441 pd_entry_t *pde; 10442 vm_page_t m; 10443 vm_paddr_t mphys; 10444 10445 retry: 10446 pde = pmap_large_map_pde(va); 10447 if (*pde == 0) { 10448 m = pmap_large_map_getptp(); 10449 if (m == NULL) 10450 goto retry; 10451 mphys = VM_PAGE_TO_PHYS(m); 10452 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10453 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10454 } else { 10455 MPASS((*pde & X86_PG_PS) == 0); 10456 mphys = *pde & PG_FRAME; 10457 } 10458 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10459 } 10460 10461 static vm_paddr_t 10462 pmap_large_map_kextract(vm_offset_t va) 10463 { 10464 pdp_entry_t *pdpe, pdp; 10465 pd_entry_t *pde, pd; 10466 pt_entry_t *pte, pt; 10467 10468 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10469 ("not largemap range %#lx", (u_long)va)); 10470 pdpe = pmap_large_map_pdpe(va); 10471 pdp = *pdpe; 10472 KASSERT((pdp & X86_PG_V) != 0, 10473 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10474 (u_long)pdpe, pdp)); 10475 if ((pdp & X86_PG_PS) != 0) { 10476 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10477 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10478 (u_long)pdpe, pdp)); 10479 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10480 } 10481 pde = pmap_pdpe_to_pde(pdpe, va); 10482 pd = *pde; 10483 KASSERT((pd & X86_PG_V) != 0, 10484 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10485 if ((pd & X86_PG_PS) != 0) 10486 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10487 pte = pmap_pde_to_pte(pde, va); 10488 pt = *pte; 10489 KASSERT((pt & X86_PG_V) != 0, 10490 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 10491 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 10492 } 10493 10494 static int 10495 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 10496 vmem_addr_t *vmem_res) 10497 { 10498 10499 /* 10500 * Large mappings are all but static. Consequently, there 10501 * is no point in waiting for an earlier allocation to be 10502 * freed. 
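	 * The allocation below is therefore attempted with M_NOWAIT |
	 * M_BESTFIT, and a failure is simply returned to pmap_large_map(),
	 * which retries with a smaller alignment or finally gives up with
	 * ENOMEM.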
10503 */ 10504 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 10505 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 10506 } 10507 10508 int 10509 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 10510 vm_memattr_t mattr) 10511 { 10512 pdp_entry_t *pdpe; 10513 pd_entry_t *pde; 10514 pt_entry_t *pte; 10515 vm_offset_t va, inc; 10516 vmem_addr_t vmem_res; 10517 vm_paddr_t pa; 10518 int error; 10519 10520 if (len == 0 || spa + len < spa) 10521 return (EINVAL); 10522 10523 /* See if DMAP can serve. */ 10524 if (spa + len <= dmaplimit) { 10525 va = PHYS_TO_DMAP(spa); 10526 *addr = (void *)va; 10527 return (pmap_change_attr(va, len, mattr)); 10528 } 10529 10530 /* 10531 * No, allocate KVA. Fit the address with best possible 10532 * alignment for superpages. Fall back to worse align if 10533 * failed. 10534 */ 10535 error = ENOMEM; 10536 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 10537 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 10538 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 10539 &vmem_res); 10540 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 10541 NBPDR) + NBPDR) 10542 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 10543 &vmem_res); 10544 if (error != 0) 10545 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 10546 if (error != 0) 10547 return (error); 10548 10549 /* 10550 * Fill pagetable. PG_M is not pre-set, we scan modified bits 10551 * in the pagetable to minimize flushing. No need to 10552 * invalidate TLB, since we only update invalid entries. 10553 */ 10554 PMAP_LOCK(kernel_pmap); 10555 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10556 len -= inc) { 10557 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10558 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10559 pdpe = pmap_large_map_pdpe(va); 10560 MPASS(*pdpe == 0); 10561 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10562 X86_PG_V | X86_PG_A | pg_nx | 10563 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10564 inc = NBPDP; 10565 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10566 (va & PDRMASK) == 0) { 10567 pde = pmap_large_map_pde(va); 10568 MPASS(*pde == 0); 10569 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10570 X86_PG_V | X86_PG_A | pg_nx | 10571 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10572 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10573 ref_count++; 10574 inc = NBPDR; 10575 } else { 10576 pte = pmap_large_map_pte(va); 10577 MPASS(*pte == 0); 10578 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10579 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10580 mattr, FALSE); 10581 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10582 ref_count++; 10583 inc = PAGE_SIZE; 10584 } 10585 } 10586 PMAP_UNLOCK(kernel_pmap); 10587 MPASS(len == 0); 10588 10589 *addr = (void *)vmem_res; 10590 return (0); 10591 } 10592 10593 void 10594 pmap_large_unmap(void *svaa, vm_size_t len) 10595 { 10596 vm_offset_t sva, va; 10597 vm_size_t inc; 10598 pdp_entry_t *pdpe, pdp; 10599 pd_entry_t *pde, pd; 10600 pt_entry_t *pte; 10601 vm_page_t m; 10602 struct spglist spgf; 10603 10604 sva = (vm_offset_t)svaa; 10605 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10606 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10607 return; 10608 10609 SLIST_INIT(&spgf); 10610 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10611 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10612 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10613 PMAP_LOCK(kernel_pmap); 10614 for (va = sva; va < sva + len; va += inc) { 10615 pdpe = 
pmap_large_map_pdpe(va); 10616 pdp = *pdpe; 10617 KASSERT((pdp & X86_PG_V) != 0, 10618 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10619 (u_long)pdpe, pdp)); 10620 if ((pdp & X86_PG_PS) != 0) { 10621 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10622 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10623 (u_long)pdpe, pdp)); 10624 KASSERT((va & PDPMASK) == 0, 10625 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10626 (u_long)pdpe, pdp)); 10627 KASSERT(va + NBPDP <= sva + len, 10628 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10629 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10630 (u_long)pdpe, pdp, len)); 10631 *pdpe = 0; 10632 inc = NBPDP; 10633 continue; 10634 } 10635 pde = pmap_pdpe_to_pde(pdpe, va); 10636 pd = *pde; 10637 KASSERT((pd & X86_PG_V) != 0, 10638 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10639 (u_long)pde, pd)); 10640 if ((pd & X86_PG_PS) != 0) { 10641 KASSERT((va & PDRMASK) == 0, 10642 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10643 (u_long)pde, pd)); 10644 KASSERT(va + NBPDR <= sva + len, 10645 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10646 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10647 pd, len)); 10648 pde_store(pde, 0); 10649 inc = NBPDR; 10650 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10651 m->ref_count--; 10652 if (m->ref_count == 0) { 10653 *pdpe = 0; 10654 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10655 } 10656 continue; 10657 } 10658 pte = pmap_pde_to_pte(pde, va); 10659 KASSERT((*pte & X86_PG_V) != 0, 10660 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10661 (u_long)pte, *pte)); 10662 pte_clear(pte); 10663 inc = PAGE_SIZE; 10664 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10665 m->ref_count--; 10666 if (m->ref_count == 0) { 10667 *pde = 0; 10668 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10669 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10670 m->ref_count--; 10671 if (m->ref_count == 0) { 10672 *pdpe = 0; 10673 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10674 } 10675 } 10676 } 10677 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10678 PMAP_UNLOCK(kernel_pmap); 10679 vm_page_free_pages_toq(&spgf, false); 10680 vmem_free(large_vmem, sva, len); 10681 } 10682 10683 static void 10684 pmap_large_map_wb_fence_mfence(void) 10685 { 10686 10687 mfence(); 10688 } 10689 10690 static void 10691 pmap_large_map_wb_fence_atomic(void) 10692 { 10693 10694 atomic_thread_fence_seq_cst(); 10695 } 10696 10697 static void 10698 pmap_large_map_wb_fence_nop(void) 10699 { 10700 } 10701 10702 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10703 { 10704 10705 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10706 return (pmap_large_map_wb_fence_mfence); 10707 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10708 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10709 return (pmap_large_map_wb_fence_atomic); 10710 else 10711 /* clflush is strongly enough ordered */ 10712 return (pmap_large_map_wb_fence_nop); 10713 } 10714 10715 static void 10716 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10717 { 10718 10719 for (; len > 0; len -= cpu_clflush_line_size, 10720 va += cpu_clflush_line_size) 10721 clwb(va); 10722 } 10723 10724 static void 10725 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10726 { 10727 10728 for (; len > 0; len -= cpu_clflush_line_size, 10729 va += cpu_clflush_line_size) 10730 clflushopt(va); 10731 } 10732 10733 static void 10734 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10735 { 10736 10737 for (; len > 0; len -= cpu_clflush_line_size, 10738 va += 
cpu_clflush_line_size) 10739 clflush(va); 10740 } 10741 10742 static void 10743 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 10744 { 10745 } 10746 10747 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 10748 { 10749 10750 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 10751 return (pmap_large_map_flush_range_clwb); 10752 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 10753 return (pmap_large_map_flush_range_clflushopt); 10754 else if ((cpu_feature & CPUID_CLFSH) != 0) 10755 return (pmap_large_map_flush_range_clflush); 10756 else 10757 return (pmap_large_map_flush_range_nop); 10758 } 10759 10760 static void 10761 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 10762 { 10763 volatile u_long *pe; 10764 u_long p; 10765 vm_offset_t va; 10766 vm_size_t inc; 10767 bool seen_other; 10768 10769 for (va = sva; va < eva; va += inc) { 10770 inc = 0; 10771 if ((amd_feature & AMDID_PAGE1GB) != 0) { 10772 pe = (volatile u_long *)pmap_large_map_pdpe(va); 10773 p = *pe; 10774 if ((p & X86_PG_PS) != 0) 10775 inc = NBPDP; 10776 } 10777 if (inc == 0) { 10778 pe = (volatile u_long *)pmap_large_map_pde(va); 10779 p = *pe; 10780 if ((p & X86_PG_PS) != 0) 10781 inc = NBPDR; 10782 } 10783 if (inc == 0) { 10784 pe = (volatile u_long *)pmap_large_map_pte(va); 10785 p = *pe; 10786 inc = PAGE_SIZE; 10787 } 10788 seen_other = false; 10789 for (;;) { 10790 if ((p & X86_PG_AVAIL1) != 0) { 10791 /* 10792 * Spin-wait for the end of a parallel 10793 * write-back. 10794 */ 10795 cpu_spinwait(); 10796 p = *pe; 10797 10798 /* 10799 * If we saw other write-back 10800 * occuring, we cannot rely on PG_M to 10801 * indicate state of the cache. The 10802 * PG_M bit is cleared before the 10803 * flush to avoid ignoring new writes, 10804 * and writes which are relevant for 10805 * us might happen after. 10806 */ 10807 seen_other = true; 10808 continue; 10809 } 10810 10811 if ((p & X86_PG_M) != 0 || seen_other) { 10812 if (!atomic_fcmpset_long(pe, &p, 10813 (p & ~X86_PG_M) | X86_PG_AVAIL1)) 10814 /* 10815 * If we saw PG_M without 10816 * PG_AVAIL1, and then on the 10817 * next attempt we do not 10818 * observe either PG_M or 10819 * PG_AVAIL1, the other 10820 * write-back started after us 10821 * and finished before us. We 10822 * can rely on it doing our 10823 * work. 10824 */ 10825 continue; 10826 pmap_large_map_flush_range(va, inc); 10827 atomic_clear_long(pe, X86_PG_AVAIL1); 10828 } 10829 break; 10830 } 10831 maybe_yield(); 10832 } 10833 } 10834 10835 /* 10836 * Write-back cache lines for the given address range. 10837 * 10838 * Must be called only on the range or sub-range returned from 10839 * pmap_large_map(). Must not be called on the coalesced ranges. 10840 * 10841 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH 10842 * instructions support. 
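 *
 * As an illustrative aside (not part of the original sources), a typical
 * sequence maps a physical range, writes to it, writes the cache back, and
 * unmaps it again; the helper below is a hypothetical sketch with minimal
 * error handling.  Note that pmap_large_map() may hand back a direct map
 * address when the range fits below dmaplimit; both pmap_large_map_wb()
 * and pmap_large_unmap() accept that case.
 */

/* Hypothetical usage sketch only; nothing in the tree calls this. */
static int __unused
example_large_map_write(vm_paddr_t pa, const void *src, vm_size_t len)
{
	void *va;
	int error;

	error = pmap_large_map(pa, len, &va, VM_MEMATTR_DEFAULT);
	if (error != 0)
		return (error);
	memcpy(va, src, len);
	/* Push the dirtied lines out of the CPU caches. */
	pmap_large_map_wb(va, len);
	pmap_large_unmap(va, len);
	return (0);
}

/*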
10843 */ 10844 void 10845 pmap_large_map_wb(void *svap, vm_size_t len) 10846 { 10847 vm_offset_t eva, sva; 10848 10849 sva = (vm_offset_t)svap; 10850 eva = sva + len; 10851 pmap_large_map_wb_fence(); 10852 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { 10853 pmap_large_map_flush_range(sva, len); 10854 } else { 10855 KASSERT(sva >= LARGEMAP_MIN_ADDRESS && 10856 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, 10857 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); 10858 pmap_large_map_wb_large(sva, eva); 10859 } 10860 pmap_large_map_wb_fence(); 10861 } 10862 10863 static vm_page_t 10864 pmap_pti_alloc_page(void) 10865 { 10866 vm_page_t m; 10867 10868 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 10869 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | 10870 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 10871 return (m); 10872 } 10873 10874 static bool 10875 pmap_pti_free_page(vm_page_t m) 10876 { 10877 10878 KASSERT(m->ref_count > 0, ("page %p not referenced", m)); 10879 if (!vm_page_unwire_noq(m)) 10880 return (false); 10881 vm_page_free_zero(m); 10882 return (true); 10883 } 10884 10885 static void 10886 pmap_pti_init(void) 10887 { 10888 vm_page_t pml4_pg; 10889 pdp_entry_t *pdpe; 10890 vm_offset_t va; 10891 int i; 10892 10893 if (!pti) 10894 return; 10895 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 10896 VM_OBJECT_WLOCK(pti_obj); 10897 pml4_pg = pmap_pti_alloc_page(); 10898 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 10899 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 10900 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 10901 pdpe = pmap_pti_pdpe(va); 10902 pmap_pti_wire_pte(pdpe); 10903 } 10904 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 10905 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 10906 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 10907 sizeof(struct gate_descriptor) * NIDT, false); 10908 CPU_FOREACH(i) { 10909 /* Doublefault stack IST 1 */ 10910 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu); 10911 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false); 10912 /* NMI stack IST 2 */ 10913 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); 10914 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false); 10915 /* MC# stack IST 3 */ 10916 va = __pcpu[i].pc_common_tss.tss_ist3 + 10917 sizeof(struct nmi_pcpu); 10918 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 10919 /* DB# stack IST 4 */ 10920 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 10921 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 10922 } 10923 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 10924 true); 10925 pti_finalized = true; 10926 VM_OBJECT_WUNLOCK(pti_obj); 10927 } 10928 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); 10929 10930 static pdp_entry_t * 10931 pmap_pti_pdpe(vm_offset_t va) 10932 { 10933 pml4_entry_t *pml4e; 10934 pdp_entry_t *pdpe; 10935 vm_page_t m; 10936 vm_pindex_t pml4_idx; 10937 vm_paddr_t mphys; 10938 10939 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 10940 10941 pml4_idx = pmap_pml4e_index(va); 10942 pml4e = &pti_pml4[pml4_idx]; 10943 m = NULL; 10944 if (*pml4e == 0) { 10945 if (pti_finalized) 10946 panic("pml4 alloc after finalization\n"); 10947 m = pmap_pti_alloc_page(); 10948 if (*pml4e != 0) { 10949 pmap_pti_free_page(m); 10950 mphys = *pml4e & ~PAGE_MASK; 10951 } else { 10952 mphys = VM_PAGE_TO_PHYS(m); 10953 *pml4e = mphys | X86_PG_RW | X86_PG_V; 10954 } 
10955 } else { 10956 mphys = *pml4e & ~PAGE_MASK; 10957 } 10958 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 10959 return (pdpe); 10960 } 10961 10962 static void 10963 pmap_pti_wire_pte(void *pte) 10964 { 10965 vm_page_t m; 10966 10967 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 10968 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 10969 m->ref_count++; 10970 } 10971 10972 static void 10973 pmap_pti_unwire_pde(void *pde, bool only_ref) 10974 { 10975 vm_page_t m; 10976 10977 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 10978 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 10979 MPASS(m->ref_count > 0); 10980 MPASS(only_ref || m->ref_count > 1); 10981 pmap_pti_free_page(m); 10982 } 10983 10984 static void 10985 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 10986 { 10987 vm_page_t m; 10988 pd_entry_t *pde; 10989 10990 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 10991 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 10992 MPASS(m->ref_count > 0); 10993 if (pmap_pti_free_page(m)) { 10994 pde = pmap_pti_pde(va); 10995 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 10996 *pde = 0; 10997 pmap_pti_unwire_pde(pde, false); 10998 } 10999 } 11000 11001 static pd_entry_t * 11002 pmap_pti_pde(vm_offset_t va) 11003 { 11004 pdp_entry_t *pdpe; 11005 pd_entry_t *pde; 11006 vm_page_t m; 11007 vm_pindex_t pd_idx; 11008 vm_paddr_t mphys; 11009 11010 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11011 11012 pdpe = pmap_pti_pdpe(va); 11013 if (*pdpe == 0) { 11014 m = pmap_pti_alloc_page(); 11015 if (*pdpe != 0) { 11016 pmap_pti_free_page(m); 11017 MPASS((*pdpe & X86_PG_PS) == 0); 11018 mphys = *pdpe & ~PAGE_MASK; 11019 } else { 11020 mphys = VM_PAGE_TO_PHYS(m); 11021 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11022 } 11023 } else { 11024 MPASS((*pdpe & X86_PG_PS) == 0); 11025 mphys = *pdpe & ~PAGE_MASK; 11026 } 11027 11028 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11029 pd_idx = pmap_pde_index(va); 11030 pde += pd_idx; 11031 return (pde); 11032 } 11033 11034 static pt_entry_t * 11035 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11036 { 11037 pd_entry_t *pde; 11038 pt_entry_t *pte; 11039 vm_page_t m; 11040 vm_paddr_t mphys; 11041 11042 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11043 11044 pde = pmap_pti_pde(va); 11045 if (unwire_pde != NULL) { 11046 *unwire_pde = true; 11047 pmap_pti_wire_pte(pde); 11048 } 11049 if (*pde == 0) { 11050 m = pmap_pti_alloc_page(); 11051 if (*pde != 0) { 11052 pmap_pti_free_page(m); 11053 MPASS((*pde & X86_PG_PS) == 0); 11054 mphys = *pde & ~(PAGE_MASK | pg_nx); 11055 } else { 11056 mphys = VM_PAGE_TO_PHYS(m); 11057 *pde = mphys | X86_PG_RW | X86_PG_V; 11058 if (unwire_pde != NULL) 11059 *unwire_pde = false; 11060 } 11061 } else { 11062 MPASS((*pde & X86_PG_PS) == 0); 11063 mphys = *pde & ~(PAGE_MASK | pg_nx); 11064 } 11065 11066 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11067 pte += pmap_pte_index(va); 11068 11069 return (pte); 11070 } 11071 11072 static void 11073 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11074 { 11075 vm_paddr_t pa; 11076 pd_entry_t *pde; 11077 pt_entry_t *pte, ptev; 11078 bool unwire_pde; 11079 11080 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11081 11082 sva = trunc_page(sva); 11083 MPASS(sva > VM_MAXUSER_ADDRESS); 11084 eva = round_page(eva); 11085 MPASS(sva < eva); 11086 for (; sva < eva; sva += PAGE_SIZE) { 11087 pte = pmap_pti_pte(sva, &unwire_pde); 11088 pa = pmap_kextract(sva); 11089 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11090 (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11091 VM_MEMATTR_DEFAULT, FALSE); 11092 if (*pte == 0) { 11093 pte_store(pte, ptev); 11094 pmap_pti_wire_pte(pte); 11095 } else { 11096 KASSERT(!pti_finalized, 11097 ("pti overlap after fin %#lx %#lx %#lx", 11098 sva, *pte, ptev)); 11099 KASSERT(*pte == ptev, 11100 ("pti non-identical pte after fin %#lx %#lx %#lx", 11101 sva, *pte, ptev)); 11102 } 11103 if (unwire_pde) { 11104 pde = pmap_pti_pde(sva); 11105 pmap_pti_unwire_pde(pde, true); 11106 } 11107 } 11108 } 11109 11110 void 11111 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11112 { 11113 11114 if (!pti) 11115 return; 11116 VM_OBJECT_WLOCK(pti_obj); 11117 pmap_pti_add_kva_locked(sva, eva, exec); 11118 VM_OBJECT_WUNLOCK(pti_obj); 11119 } 11120 11121 void 11122 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11123 { 11124 pt_entry_t *pte; 11125 vm_offset_t va; 11126 11127 if (!pti) 11128 return; 11129 sva = rounddown2(sva, PAGE_SIZE); 11130 MPASS(sva > VM_MAXUSER_ADDRESS); 11131 eva = roundup2(eva, PAGE_SIZE); 11132 MPASS(sva < eva); 11133 VM_OBJECT_WLOCK(pti_obj); 11134 for (va = sva; va < eva; va += PAGE_SIZE) { 11135 pte = pmap_pti_pte(va, NULL); 11136 KASSERT((*pte & X86_PG_V) != 0, 11137 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11138 (u_long)pte, *pte)); 11139 pte_clear(pte); 11140 pmap_pti_unwire_pte(pte, va); 11141 } 11142 pmap_invalidate_range(kernel_pmap, sva, eva); 11143 VM_OBJECT_WUNLOCK(pti_obj); 11144 } 11145 11146 static void * 11147 pkru_dup_range(void *ctx __unused, void *data) 11148 { 11149 struct pmap_pkru_range *node, *new_node; 11150 11151 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11152 if (new_node == NULL) 11153 return (NULL); 11154 node = data; 11155 memcpy(new_node, node, sizeof(*node)); 11156 return (new_node); 11157 } 11158 11159 static void 11160 pkru_free_range(void *ctx __unused, void *node) 11161 { 11162 11163 uma_zfree(pmap_pkru_ranges_zone, node); 11164 } 11165 11166 static int 11167 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11168 int flags) 11169 { 11170 struct pmap_pkru_range *ppr; 11171 int error; 11172 11173 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11174 MPASS(pmap->pm_type == PT_X86); 11175 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11176 if ((flags & AMD64_PKRU_EXCL) != 0 && 11177 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11178 return (EBUSY); 11179 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11180 if (ppr == NULL) 11181 return (ENOMEM); 11182 ppr->pkru_keyidx = keyidx; 11183 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11184 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11185 if (error != 0) 11186 uma_zfree(pmap_pkru_ranges_zone, ppr); 11187 return (error); 11188 } 11189 11190 static int 11191 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11192 { 11193 11194 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11195 MPASS(pmap->pm_type == PT_X86); 11196 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11197 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11198 } 11199 11200 static void 11201 pmap_pkru_deassign_all(pmap_t pmap) 11202 { 11203 11204 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11205 if (pmap->pm_type == PT_X86 && 11206 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11207 rangeset_remove_all(&pmap->pm_pkru); 11208 } 11209 11210 static bool 11211 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11212 { 11213 struct pmap_pkru_range *ppr, *prev_ppr; 11214 vm_offset_t va; 11215 11216 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11217 if 
(pmap->pm_type != PT_X86 || 11218 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11219 sva >= VM_MAXUSER_ADDRESS) 11220 return (true); 11221 MPASS(eva <= VM_MAXUSER_ADDRESS); 11222 for (va = sva; va < eva; prev_ppr = ppr) { 11223 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11224 if (va == sva) 11225 prev_ppr = ppr; 11226 else if ((ppr == NULL) ^ (prev_ppr == NULL)) 11227 return (false); 11228 if (ppr == NULL) { 11229 va += PAGE_SIZE; 11230 continue; 11231 } 11232 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 11233 return (false); 11234 va = ppr->pkru_rs_el.re_end; 11235 } 11236 return (true); 11237 } 11238 11239 static pt_entry_t 11240 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11241 { 11242 struct pmap_pkru_range *ppr; 11243 11244 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11245 if (pmap->pm_type != PT_X86 || 11246 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11247 va >= VM_MAXUSER_ADDRESS) 11248 return (0); 11249 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11250 if (ppr != NULL) 11251 return (X86_PG_PKU(ppr->pkru_keyidx)); 11252 return (0); 11253 } 11254 11255 static bool 11256 pred_pkru_on_remove(void *ctx __unused, void *r) 11257 { 11258 struct pmap_pkru_range *ppr; 11259 11260 ppr = r; 11261 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11262 } 11263 11264 static void 11265 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11266 { 11267 11268 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11269 if (pmap->pm_type == PT_X86 && 11270 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11271 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11272 pred_pkru_on_remove); 11273 } 11274 } 11275 11276 static int 11277 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11278 { 11279 11280 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11281 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11282 MPASS(dst_pmap->pm_type == PT_X86); 11283 MPASS(src_pmap->pm_type == PT_X86); 11284 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11285 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11286 return (0); 11287 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11288 } 11289 11290 static void 11291 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11292 u_int keyidx) 11293 { 11294 pml4_entry_t *pml4e; 11295 pdp_entry_t *pdpe; 11296 pd_entry_t newpde, ptpaddr, *pde; 11297 pt_entry_t newpte, *ptep, pte; 11298 vm_offset_t va, va_next; 11299 bool changed; 11300 11301 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11302 MPASS(pmap->pm_type == PT_X86); 11303 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 11304 11305 for (changed = false, va = sva; va < eva; va = va_next) { 11306 pml4e = pmap_pml4e(pmap, va); 11307 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { 11308 va_next = (va + NBPML4) & ~PML4MASK; 11309 if (va_next < va) 11310 va_next = eva; 11311 continue; 11312 } 11313 11314 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 11315 if ((*pdpe & X86_PG_V) == 0) { 11316 va_next = (va + NBPDP) & ~PDPMASK; 11317 if (va_next < va) 11318 va_next = eva; 11319 continue; 11320 } 11321 11322 va_next = (va + NBPDR) & ~PDRMASK; 11323 if (va_next < va) 11324 va_next = eva; 11325 11326 pde = pmap_pdpe_to_pde(pdpe, va); 11327 ptpaddr = *pde; 11328 if (ptpaddr == 0) 11329 continue; 11330 11331 MPASS((ptpaddr & X86_PG_V) != 0); 11332 if ((ptpaddr & PG_PS) != 0) { 11333 if (va + NBPDR == va_next && eva >= va_next) { 11334 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 11335 X86_PG_PKU(keyidx); 11336 if (newpde != ptpaddr) { 11337 *pde = newpde; 11338 changed = true; 11339 } 11340 continue; 11341 } else if (!pmap_demote_pde(pmap, pde, va)) { 11342 
				continue;
			}
		}

		if (va_next > eva)
			va_next = eva;

		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
		    ptep++, va += PAGE_SIZE) {
			pte = *ptep;
			if ((pte & X86_PG_V) == 0)
				continue;
			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
			if (newpte != pte) {
				*ptep = newpte;
				changed = true;
			}
		}
	}
	if (changed)
		pmap_invalidate_range(pmap, sva, eva);
}

static int
pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    u_int keyidx, int flags)
{

	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
		return (EINVAL);
	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
		return (EFAULT);
	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
		return (ENOTSUP);
	return (0);
}

int
pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
    int flags)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, keyidx);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

int
pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_deassign(pmap, sva, eva);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, 0);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}
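
/*
 * Illustrative sketch, not part of the original source: one way an
 * in-kernel consumer might use the pmap_pkru_set()/pmap_pkru_clear()
 * interface above to tag a user address range with a protection key.
 * The caller, its pmap, addresses and key index are hypothetical; the
 * sketch relies only on what is visible in this file (a user-space
 * range, keyidx <= PMAP_MAX_PKRU_IDX, flags restricted to
 * AMD64_PKRU_PERSIST and AMD64_PKRU_EXCL, and page rounding done by
 * pmap_pkru_set() itself).  Kept under #if 0 so it does not affect the
 * build.
 */
#if 0
static int
example_pkru_protect_range(pmap_t upmap, vm_offset_t start, vm_offset_t end,
    u_int keyidx)
{
	int error;

	/*
	 * Install the key over the range; transient ENOMEM from the
	 * rangeset allocation is retried inside pmap_pkru_set().
	 */
	error = pmap_pkru_set(upmap, start, end, keyidx, 0);
	if (error != 0)
		return (error);

	/* ... the range is now subject to the PKRU policy for keyidx ... */

	/* Drop the key; the underlying PTEs revert to key index 0. */
	return (pmap_pkru_clear(upmap, start, end));
}
#endif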

#if defined(KASAN) || defined(KMSAN)
static vm_page_t
pmap_san_enter_alloc_4k(void)
{
	vm_page_t m;

	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		panic("%s: no memory to grow shadow map", __func__);
	return (m);
}

static vm_page_t
pmap_san_enter_alloc_2m(void)
{
	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
	    NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Grow a shadow map by at least one 4KB page at the specified address.
 * Use 2MB pages when possible.
 */
void
pmap_san_enter(vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_page_t m;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	pdpe = pmap_pdpe(kernel_pmap, va);
	if ((*pdpe & X86_PG_V) == 0) {
		m = pmap_san_enter_alloc_4k();
		*pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
		    X86_PG_V | pg_nx);
	}
	pde = pmap_pdpe_to_pde(pdpe, va);
	if ((*pde & X86_PG_V) == 0) {
		m = pmap_san_enter_alloc_2m();
		if (m != NULL) {
			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
			    X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
		} else {
			m = pmap_san_enter_alloc_4k();
			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
			    X86_PG_V | pg_nx);
		}
	}
	if ((*pde & X86_PG_PS) != 0)
		return;
	pte = pmap_pde_to_pte(pde, va);
	if ((*pte & X86_PG_V) != 0)
		return;
	m = pmap_san_enter_alloc_4k();
	*pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
	    X86_PG_M | X86_PG_A | pg_nx);
}
#endif

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int ptes;
	int pdes;
	int pdpes;
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int i, pat_idx;

	if (eva <= range->sva)
		return;

	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		if (pat_index[i] == pat_idx)
			break;

	switch (i) {
	case PAT_WRITE_BACK:
		mode = "WB";
		break;
	case PAT_WRITE_THROUGH:
		mode = "WT";
		break;
	case PAT_UNCACHEABLE:
		mode = "UC";
		break;
	case PAT_UNCACHED:
		mode = "U-";
		break;
	case PAT_WRITE_PROTECTED:
		mode = "WP";
		break;
	case PAT_WRITE_COMBINING:
		mode = "WC";
		break;
	default:
		printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
		    __func__, pat_idx, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
	    mode, range->pdpes, range->pdes, range->ptes);

	/* Reset to sentinel value. */
	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1);
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.  This is not quite as simple as a direct
 * flag comparison since some PAT modes have multiple representations.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{
	pt_entry_t diff, mask;

	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
	diff = (range->attrs ^ attrs) & mask;
	if (diff == 0)
		return (true);
	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
	    pmap_pat_index(kernel_pmap, attrs, true))
		return (true);
	return (false);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
    pt_entry_t pte)
{
	pt_entry_t attrs;

	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);

	attrs |= pdpe & pg_nx;
	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
	if ((pdpe & PG_PS) != 0) {
		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pde != 0) {
		attrs |= pde & pg_nx;
		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
	}
	if ((pde & PG_PS) != 0) {
		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit. */
		if ((attrs & X86_PG_PTE_PAT) != 0)
			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
	}

	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t pml4e;
	pdp_entry_t *pdp, pdpe;
	pd_entry_t *pd, pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1);

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Outside of the large map, kernel page table pages are never
	 * freed, so at worst we will observe inconsistencies in the output.
	 * Within the large map, ensure that PDP and PD page addresses are
	 * valid before descending.
	 */
	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
#ifdef KASAN
		case KASANPML4I:
			sbuf_printf(sb, "\nKASAN shadow map:\n");
			break;
#endif
#ifdef KMSAN
		case KMSANSHADPML4I:
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
			break;
		case KMSANORIGPML4I:
			sbuf_printf(sb, "\nKMSAN origin map:\n");
			break;
#endif
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

		/* Convert to canonical form. */
		if (sva == 1ul << 47)
			sva |= -1ul << 48;

restart:
		pml4e = kernel_pml4[i];
		if ((pml4e & X86_PG_V) == 0) {
			sva = rounddown2(sva, NBPML4);
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPML4;
			continue;
		}
		pa = pml4e & PG_FRAME;
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
			pdpe = pdp[j];
			if ((pdpe & X86_PG_V) == 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_dump(sb, &range, sva);
				sva += NBPDP;
				continue;
			}
			pa = pdpe & PG_FRAME;
			if ((pdpe & PG_PS) != 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
				    0, 0);
				range.pdpes++;
				sva += NBPDP;
				continue;
			}
			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
			    vm_phys_paddr_to_vm_page(pa) == NULL) {
				/*
				 * Page table pages for the large map may be
				 * freed.  Validate the next-level address
				 * before descending.
				 */
				goto restart;
			}
			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
				pde = pd[k];
				if ((pde & X86_PG_V) == 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_dump(sb, &range, sva);
					sva += NBPDR;
					continue;
				}
				pa = pde & PG_FRAME;
				if ((pde & PG_PS) != 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, 0);
					range.pdes++;
					sva += NBPDR;
					continue;
				}
				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
				    vm_phys_paddr_to_vm_page(pa) == NULL) {
					/*
					 * Page table pages for the large map
					 * may be freed.  Validate the
					 * next-level address before descending.
					 */
					goto restart;
				}
				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
				    sva += PAGE_SIZE) {
					pte = pt[l];
					if ((pte & X86_PG_V) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, pte);
					range.ptes++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
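
/*
 * Illustrative note, not part of the original source: the OID above is
 * read with "sysctl vm.pmap.kernel_maps" (CTLFLAG_SKIP keeps it out of
 * "sysctl -a" listings).  Each line emitted by sysctl_kmaps_dump() has
 * the form
 *
 *	0x<start>-0x<end> r<w|-><x|-><u|s><g|-> <mode> <pdpes> <pdes> <ptes>
 *
 * where the flag characters are derived from X86_PG_RW, pg_nx, X86_PG_U
 * and X86_PG_G, <mode> is one of WB/WT/UC/U-/WP/WC (or "??"), and the
 * trailing counters give the number of 1GB, 2MB and 4KB mappings that
 * make up the range.  Any concrete addresses are system-dependent.
 */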

#ifdef DDB
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	pmap_t pmap;
	pml5_entry_t *pml5;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_offset_t va;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);
	db_printf("VA 0x%016lx", va);

	if (pmap_is_la57(pmap)) {
		pml5 = pmap_pml5e(pmap, va);
		db_printf(" pml5e 0x%016lx", *pml5);
		if ((*pml5 & PG_V) == 0) {
			db_printf("\n");
			return;
		}
		pml4 = pmap_pml5e_to_pml4e(pml5, va);
	} else {
		pml4 = pmap_pml4e(pmap, va);
	}
	db_printf(" pml4e 0x%016lx", *pml4);
	if ((*pml4 & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	pdp = pmap_pml4e_to_pdpe(pml4, va);
	db_printf(" pdpe 0x%016lx", *pdp);
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pde = pmap_pdpe_to_pde(pdp, va);
	db_printf(" pde 0x%016lx", *pde);
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_pde_to_pte(pde, va);
	db_printf(" pte 0x%016lx\n", *pte);
}

DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
{
	vm_paddr_t a;

	if (have_addr) {
		a = (vm_paddr_t)addr;
		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
	} else {
		db_printf("show phys2dmap addr\n");
	}
}

static void
ptpages_show_page(int level, int idx, vm_page_t pg)
{
	db_printf("l %d i %d pg %p phys %#lx ref %x\n",
	    level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
}

static void
ptpages_show_complain(int level, int idx, uint64_t pte)
{
	db_printf("l %d i %d pte %#lx\n", level, idx, pte);
}

static void
ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
{
	vm_page_t pg3, pg2, pg1;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	int i4, i3, i2;

	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
	for (i4 = 0; i4 < num_entries; i4++) {
		if ((pml4[i4] & PG_V) == 0)
			continue;
		pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
		if (pg3 == NULL) {
			ptpages_show_complain(3, i4, pml4[i4]);
			continue;
		}
		ptpages_show_page(3, i4, pg3);
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
		for (i3 = 0; i3 < NPDPEPG; i3++) {
			if ((pdp[i3] & PG_V) == 0)
				continue;
			pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
			if (pg2 == NULL) {
				ptpages_show_complain(2, i3, pdp[i3]);
				continue;
			}
			ptpages_show_page(2, i3, pg2);
			pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
			for (i2 = 0; i2 < NPDEPG; i2++) {
				if ((pd[i2] & PG_V) == 0)
					continue;
				pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
				if (pg1 == NULL) {
					ptpages_show_complain(1, i2, pd[i2]);
					continue;
				}
				ptpages_show_page(1, i2, pg1);
			}
		}
	}
}

DB_SHOW_COMMAND(ptpages, pmap_ptpages)
{
	pmap_t pmap;
	vm_page_t pg;
	pml5_entry_t *pml5;
	uint64_t PG_V;
	int i5;

	if (have_addr)
		pmap = (pmap_t)addr;
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);

	if (pmap_is_la57(pmap)) {
		pml5 = pmap->pm_pmltop;
		for (i5 = 0; i5 < NUPML5E; i5++) {
			if ((pml5[i5] & PG_V) == 0)
				continue;
			pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
			if (pg == NULL) {
				ptpages_show_complain(4, i5, pml5[i5]);
				continue;
			}
			ptpages_show_page(4, i5, pg);
			ptpages_show_pml4(pg, NPML4EPG, PG_V);
		}
	} else {
		ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
		    (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
	}
}
#endif
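
/*
 * Illustrative note, not part of the original source: the DDB commands
 * defined above are invoked from the in-kernel debugger prompt, for
 * example (the addresses are hypothetical):
 *
 *	db> show pte 0xfffff80012345000
 *	db> show phys2dmap 0x12345000
 *	db> show ptpages
 *
 * "show pte" prints one pml5e/pml4e/pdpe/pde/pte chain for the given
 * virtual address in the stopped thread's pmap (or curpmap), stopping at
 * the first non-present or large-page entry; "show phys2dmap" converts a
 * physical address to its direct-map virtual address; "show ptpages"
 * walks a pmap's page table pages level by level.
 */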