/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * Copyright (c) 2014-2020 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidation or reduced-protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
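 *
 * In practice this means that a call which removes or write-protects
 * a mapping may return before every other processor's TLB has been
 * flushed; the delayed invalidation (DI) machinery later in this file
 * tracks such pending flushes so that a physical page is not reused
 * while a stale translation for it may still be cached.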
108 */ 109 110 #include "opt_ddb.h" 111 #include "opt_pmap.h" 112 #include "opt_vm.h" 113 114 #include <sys/param.h> 115 #include <sys/asan.h> 116 #include <sys/bitstring.h> 117 #include <sys/bus.h> 118 #include <sys/systm.h> 119 #include <sys/counter.h> 120 #include <sys/kernel.h> 121 #include <sys/ktr.h> 122 #include <sys/lock.h> 123 #include <sys/malloc.h> 124 #include <sys/mman.h> 125 #include <sys/msan.h> 126 #include <sys/mutex.h> 127 #include <sys/proc.h> 128 #include <sys/rangeset.h> 129 #include <sys/rwlock.h> 130 #include <sys/sbuf.h> 131 #include <sys/smr.h> 132 #include <sys/sx.h> 133 #include <sys/turnstile.h> 134 #include <sys/vmem.h> 135 #include <sys/vmmeter.h> 136 #include <sys/sched.h> 137 #include <sys/sysctl.h> 138 #include <sys/smp.h> 139 #ifdef DDB 140 #include <sys/kdb.h> 141 #include <ddb/ddb.h> 142 #endif 143 144 #include <vm/vm.h> 145 #include <vm/vm_param.h> 146 #include <vm/vm_kern.h> 147 #include <vm/vm_page.h> 148 #include <vm/vm_map.h> 149 #include <vm/vm_object.h> 150 #include <vm/vm_extern.h> 151 #include <vm/vm_pageout.h> 152 #include <vm/vm_pager.h> 153 #include <vm/vm_phys.h> 154 #include <vm/vm_radix.h> 155 #include <vm/vm_reserv.h> 156 #include <vm/vm_dumpset.h> 157 #include <vm/uma.h> 158 159 #include <machine/asan.h> 160 #include <machine/intr_machdep.h> 161 #include <x86/apicvar.h> 162 #include <x86/ifunc.h> 163 #include <machine/cpu.h> 164 #include <machine/cputypes.h> 165 #include <machine/md_var.h> 166 #include <machine/msan.h> 167 #include <machine/pcb.h> 168 #include <machine/specialreg.h> 169 #ifdef SMP 170 #include <machine/smp.h> 171 #endif 172 #include <machine/sysarch.h> 173 #include <machine/tss.h> 174 175 #ifdef NUMA 176 #define PMAP_MEMDOM MAXMEMDOM 177 #else 178 #define PMAP_MEMDOM 1 179 #endif 180 181 static __inline boolean_t 182 pmap_type_guest(pmap_t pmap) 183 { 184 185 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 186 } 187 188 static __inline boolean_t 189 pmap_emulate_ad_bits(pmap_t pmap) 190 { 191 192 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 193 } 194 195 static __inline pt_entry_t 196 pmap_valid_bit(pmap_t pmap) 197 { 198 pt_entry_t mask; 199 200 switch (pmap->pm_type) { 201 case PT_X86: 202 case PT_RVI: 203 mask = X86_PG_V; 204 break; 205 case PT_EPT: 206 if (pmap_emulate_ad_bits(pmap)) 207 mask = EPT_PG_EMUL_V; 208 else 209 mask = EPT_PG_READ; 210 break; 211 default: 212 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 213 } 214 215 return (mask); 216 } 217 218 static __inline pt_entry_t 219 pmap_rw_bit(pmap_t pmap) 220 { 221 pt_entry_t mask; 222 223 switch (pmap->pm_type) { 224 case PT_X86: 225 case PT_RVI: 226 mask = X86_PG_RW; 227 break; 228 case PT_EPT: 229 if (pmap_emulate_ad_bits(pmap)) 230 mask = EPT_PG_EMUL_RW; 231 else 232 mask = EPT_PG_WRITE; 233 break; 234 default: 235 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 236 } 237 238 return (mask); 239 } 240 241 static pt_entry_t pg_g; 242 243 static __inline pt_entry_t 244 pmap_global_bit(pmap_t pmap) 245 { 246 pt_entry_t mask; 247 248 switch (pmap->pm_type) { 249 case PT_X86: 250 mask = pg_g; 251 break; 252 case PT_RVI: 253 case PT_EPT: 254 mask = 0; 255 break; 256 default: 257 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 258 } 259 260 return (mask); 261 } 262 263 static __inline pt_entry_t 264 pmap_accessed_bit(pmap_t pmap) 265 { 266 pt_entry_t mask; 267 268 switch (pmap->pm_type) { 269 case PT_X86: 270 case PT_RVI: 271 mask = X86_PG_A; 272 break; 273 case PT_EPT: 274 if 
(pmap_emulate_ad_bits(pmap)) 275 mask = EPT_PG_READ; 276 else 277 mask = EPT_PG_A; 278 break; 279 default: 280 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 281 } 282 283 return (mask); 284 } 285 286 static __inline pt_entry_t 287 pmap_modified_bit(pmap_t pmap) 288 { 289 pt_entry_t mask; 290 291 switch (pmap->pm_type) { 292 case PT_X86: 293 case PT_RVI: 294 mask = X86_PG_M; 295 break; 296 case PT_EPT: 297 if (pmap_emulate_ad_bits(pmap)) 298 mask = EPT_PG_WRITE; 299 else 300 mask = EPT_PG_M; 301 break; 302 default: 303 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 304 } 305 306 return (mask); 307 } 308 309 static __inline pt_entry_t 310 pmap_pku_mask_bit(pmap_t pmap) 311 { 312 313 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); 314 } 315 316 #if !defined(DIAGNOSTIC) 317 #ifdef __GNUC_GNU_INLINE__ 318 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 319 #else 320 #define PMAP_INLINE extern inline 321 #endif 322 #else 323 #define PMAP_INLINE 324 #endif 325 326 #ifdef PV_STATS 327 #define PV_STAT(x) do { x ; } while (0) 328 #else 329 #define PV_STAT(x) do { } while (0) 330 #endif 331 332 #undef pa_index 333 #ifdef NUMA 334 #define pa_index(pa) ({ \ 335 KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ 336 ("address %lx beyond the last segment", (pa))); \ 337 (pa) >> PDRSHIFT; \ 338 }) 339 #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) 340 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 341 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 342 struct rwlock *_lock; \ 343 if (__predict_false((pa) > pmap_last_pa)) \ 344 _lock = &pv_dummy_large.pv_lock; \ 345 else \ 346 _lock = &(pa_to_pmdp(pa)->pv_lock); \ 347 _lock; \ 348 }) 349 #else 350 #define pa_index(pa) ((pa) >> PDRSHIFT) 351 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 352 353 #define NPV_LIST_LOCKS MAXCPU 354 355 #define PHYS_TO_PV_LIST_LOCK(pa) \ 356 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 357 #endif 358 359 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 360 struct rwlock **_lockp = (lockp); \ 361 struct rwlock *_new_lock; \ 362 \ 363 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 364 if (_new_lock != *_lockp) { \ 365 if (*_lockp != NULL) \ 366 rw_wunlock(*_lockp); \ 367 *_lockp = _new_lock; \ 368 rw_wlock(*_lockp); \ 369 } \ 370 } while (0) 371 372 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 373 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 374 375 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 376 struct rwlock **_lockp = (lockp); \ 377 \ 378 if (*_lockp != NULL) { \ 379 rw_wunlock(*_lockp); \ 380 *_lockp = NULL; \ 381 } \ 382 } while (0) 383 384 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 385 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 386 387 struct pmap kernel_pmap_store; 388 389 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 390 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 391 392 int nkpt; 393 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 394 "Number of kernel page table pages allocated on bootup"); 395 396 static int ndmpdp; 397 vm_paddr_t dmaplimit; 398 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 399 pt_entry_t pg_nx; 400 401 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 402 "VM/pmap parameters"); 403 404 static int pg_ps_enabled = 1; 405 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 406 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 407 408 int __read_frequently la57 = 0; 409 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, 
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 410 &la57, 0, 411 "5-level paging for host is enabled"); 412 413 static bool 414 pmap_is_la57(pmap_t pmap) 415 { 416 if (pmap->pm_type == PT_X86) 417 return (la57); 418 return (false); /* XXXKIB handle EPT */ 419 } 420 421 #define PAT_INDEX_SIZE 8 422 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 423 424 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 425 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 426 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 427 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 428 u_int64_t KPML5phys; /* phys addr of kernel level 5, 429 if supported */ 430 431 #ifdef KASAN 432 static uint64_t KASANPDPphys; 433 #endif 434 #ifdef KMSAN 435 static uint64_t KMSANSHADPDPphys; 436 static uint64_t KMSANORIGPDPphys; 437 438 /* 439 * To support systems with large amounts of memory, it is necessary to extend 440 * the maximum size of the direct map. This could eat into the space reserved 441 * for the shadow map. 442 */ 443 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); 444 #endif 445 446 static pml4_entry_t *kernel_pml4; 447 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 448 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 449 static int ndmpdpphys; /* number of DMPDPphys pages */ 450 451 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ 452 vm_paddr_t KERNend; /* and the end */ 453 454 /* 455 * pmap_mapdev support pre initialization (i.e. console) 456 */ 457 #define PMAP_PREINIT_MAPPING_COUNT 8 458 static struct pmap_preinit_mapping { 459 vm_paddr_t pa; 460 vm_offset_t va; 461 vm_size_t sz; 462 int mode; 463 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 464 static int pmap_initialized; 465 466 /* 467 * Data for the pv entry allocation mechanism. 468 * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
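 *
 * For example, given pa_index(pa) = (pa >> PDRSHIFT) as defined above,
 * every physical address within one 2MB frame maps to the same pv_table
 * slot (and, in the !NUMA case, hashes to one of the NPV_LIST_LOCKS pv
 * list locks), which matches the 2MB granularity at which superpage
 * mappings are tracked.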
469 */ 470 #ifdef NUMA 471 static __inline int 472 pc_to_domain(struct pv_chunk *pc) 473 { 474 475 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 476 } 477 #else 478 static __inline int 479 pc_to_domain(struct pv_chunk *pc __unused) 480 { 481 482 return (0); 483 } 484 #endif 485 486 struct pv_chunks_list { 487 struct mtx pvc_lock; 488 TAILQ_HEAD(pch, pv_chunk) pvc_list; 489 int active_reclaims; 490 } __aligned(CACHE_LINE_SIZE); 491 492 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 493 494 #ifdef NUMA 495 struct pmap_large_md_page { 496 struct rwlock pv_lock; 497 struct md_page pv_page; 498 u_long pv_invl_gen; 499 }; 500 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 501 #define pv_dummy pv_dummy_large.pv_page 502 __read_mostly static struct pmap_large_md_page *pv_table; 503 __read_mostly vm_paddr_t pmap_last_pa; 504 #else 505 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 506 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 507 static struct md_page *pv_table; 508 static struct md_page pv_dummy; 509 #endif 510 511 /* 512 * All those kernel PT submaps that BSD is so fond of 513 */ 514 pt_entry_t *CMAP1 = NULL; 515 caddr_t CADDR1 = 0; 516 static vm_offset_t qframe = 0; 517 static struct mtx qframe_mtx; 518 519 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 520 521 static vmem_t *large_vmem; 522 static u_int lm_ents; 523 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 524 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 525 526 int pmap_pcid_enabled = 1; 527 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 528 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 529 int invpcid_works = 0; 530 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 531 "Is the invpcid instruction available ?"); 532 int pmap_pcid_invlpg_workaround = 0; 533 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, 534 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 535 &pmap_pcid_invlpg_workaround, 0, 536 "Enable small core PCID/INVLPG workaround"); 537 int pmap_pcid_invlpg_workaround_uena = 1; 538 539 int __read_frequently pti = 0; 540 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 541 &pti, 0, 542 "Page Table Isolation enabled"); 543 static vm_object_t pti_obj; 544 static pml4_entry_t *pti_pml4; 545 static vm_pindex_t pti_pg_idx; 546 static bool pti_finalized; 547 548 struct pmap_pkru_range { 549 struct rs_el pkru_rs_el; 550 u_int pkru_keyidx; 551 int pkru_flags; 552 }; 553 554 static uma_zone_t pmap_pkru_ranges_zone; 555 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 556 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 557 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 558 static void *pkru_dup_range(void *ctx, void *data); 559 static void pkru_free_range(void *ctx, void *node); 560 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 561 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 562 static void pmap_pkru_deassign_all(pmap_t pmap); 563 564 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); 565 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, 566 &pcid_save_cnt, "Count of saved TLB context on switch"); 567 568 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 569 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 570 static struct mtx invl_gen_mtx; 571 /* Fake lock object to satisfy 
turnstiles interface. */ 572 static struct lock_object invl_gen_ts = { 573 .lo_name = "invlts", 574 }; 575 static struct pmap_invl_gen pmap_invl_gen_head = { 576 .gen = 1, 577 .next = NULL, 578 }; 579 static u_long pmap_invl_gen = 1; 580 static int pmap_invl_waiters; 581 static struct callout pmap_invl_callout; 582 static bool pmap_invl_callout_inited; 583 584 #define PMAP_ASSERT_NOT_IN_DI() \ 585 KASSERT(pmap_not_in_di(), ("DI already started")) 586 587 static bool 588 pmap_di_locked(void) 589 { 590 int tun; 591 592 if ((cpu_feature2 & CPUID2_CX16) == 0) 593 return (true); 594 tun = 0; 595 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 596 return (tun != 0); 597 } 598 599 static int 600 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 601 { 602 int locked; 603 604 locked = pmap_di_locked(); 605 return (sysctl_handle_int(oidp, &locked, 0, req)); 606 } 607 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 608 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 609 "Locked delayed invalidation"); 610 611 static bool pmap_not_in_di_l(void); 612 static bool pmap_not_in_di_u(void); 613 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 614 { 615 616 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 617 } 618 619 static bool 620 pmap_not_in_di_l(void) 621 { 622 struct pmap_invl_gen *invl_gen; 623 624 invl_gen = &curthread->td_md.md_invl_gen; 625 return (invl_gen->gen == 0); 626 } 627 628 static void 629 pmap_thread_init_invl_gen_l(struct thread *td) 630 { 631 struct pmap_invl_gen *invl_gen; 632 633 invl_gen = &td->td_md.md_invl_gen; 634 invl_gen->gen = 0; 635 } 636 637 static void 638 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 639 { 640 struct turnstile *ts; 641 642 ts = turnstile_trywait(&invl_gen_ts); 643 if (*m_gen > atomic_load_long(invl_gen)) 644 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 645 else 646 turnstile_cancel(ts); 647 } 648 649 static void 650 pmap_delayed_invl_finish_unblock(u_long new_gen) 651 { 652 struct turnstile *ts; 653 654 turnstile_chain_lock(&invl_gen_ts); 655 ts = turnstile_lookup(&invl_gen_ts); 656 if (new_gen != 0) 657 pmap_invl_gen = new_gen; 658 if (ts != NULL) { 659 turnstile_broadcast(ts, TS_SHARED_QUEUE); 660 turnstile_unpend(ts); 661 } 662 turnstile_chain_unlock(&invl_gen_ts); 663 } 664 665 /* 666 * Start a new Delayed Invalidation (DI) block of code, executed by 667 * the current thread. Within a DI block, the current thread may 668 * destroy both the page table and PV list entries for a mapping and 669 * then release the corresponding PV list lock before ensuring that 670 * the mapping is flushed from the TLBs of any processors with the 671 * pmap active. 672 */ 673 static void 674 pmap_delayed_invl_start_l(void) 675 { 676 struct pmap_invl_gen *invl_gen; 677 u_long currgen; 678 679 invl_gen = &curthread->td_md.md_invl_gen; 680 PMAP_ASSERT_NOT_IN_DI(); 681 mtx_lock(&invl_gen_mtx); 682 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 683 currgen = pmap_invl_gen; 684 else 685 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 686 invl_gen->gen = currgen + 1; 687 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 688 mtx_unlock(&invl_gen_mtx); 689 } 690 691 /* 692 * Finish the DI block, previously started by the current thread. All 693 * required TLB flushes for the pages marked by 694 * pmap_delayed_invl_page() must be finished before this function is 695 * called. 
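 *
 * A typical caller is thus expected to follow a pattern roughly like
 * the following sketch (the exact call sites in this file vary):
 *
 *	pmap_delayed_invl_start();
 *	... destroy PTEs and PV entries, drop the PV list lock ...
 *	pmap_invalidate_page(pmap, va);		(or _range/_all)
 *	pmap_delayed_invl_finish();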
696 * 697 * This function works by bumping the global DI generation number to 698 * the generation number of the current thread's DI, unless there is a 699 * pending DI that started earlier. In the latter case, bumping the 700 * global DI generation number would incorrectly signal that the 701 * earlier DI had finished. Instead, this function bumps the earlier 702 * DI's generation number to match the generation number of the 703 * current thread's DI. 704 */ 705 static void 706 pmap_delayed_invl_finish_l(void) 707 { 708 struct pmap_invl_gen *invl_gen, *next; 709 710 invl_gen = &curthread->td_md.md_invl_gen; 711 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 712 mtx_lock(&invl_gen_mtx); 713 next = LIST_NEXT(invl_gen, link); 714 if (next == NULL) 715 pmap_delayed_invl_finish_unblock(invl_gen->gen); 716 else 717 next->gen = invl_gen->gen; 718 LIST_REMOVE(invl_gen, link); 719 mtx_unlock(&invl_gen_mtx); 720 invl_gen->gen = 0; 721 } 722 723 static bool 724 pmap_not_in_di_u(void) 725 { 726 struct pmap_invl_gen *invl_gen; 727 728 invl_gen = &curthread->td_md.md_invl_gen; 729 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 730 } 731 732 static void 733 pmap_thread_init_invl_gen_u(struct thread *td) 734 { 735 struct pmap_invl_gen *invl_gen; 736 737 invl_gen = &td->td_md.md_invl_gen; 738 invl_gen->gen = 0; 739 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 740 } 741 742 static bool 743 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 744 { 745 uint64_t new_high, new_low, old_high, old_low; 746 char res; 747 748 old_low = new_low = 0; 749 old_high = new_high = (uintptr_t)0; 750 751 __asm volatile("lock;cmpxchg16b\t%1" 752 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 753 : "b"(new_low), "c" (new_high) 754 : "memory", "cc"); 755 if (res == 0) { 756 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 757 return (false); 758 out->gen = old_low; 759 out->next = (void *)old_high; 760 } else { 761 out->gen = new_low; 762 out->next = (void *)new_high; 763 } 764 return (true); 765 } 766 767 static bool 768 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 769 struct pmap_invl_gen *new_val) 770 { 771 uint64_t new_high, new_low, old_high, old_low; 772 char res; 773 774 new_low = new_val->gen; 775 new_high = (uintptr_t)new_val->next; 776 old_low = old_val->gen; 777 old_high = (uintptr_t)old_val->next; 778 779 __asm volatile("lock;cmpxchg16b\t%1" 780 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 781 : "b"(new_low), "c" (new_high) 782 : "memory", "cc"); 783 return (res); 784 } 785 786 static COUNTER_U64_DEFINE_EARLY(pv_page_count); 787 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, 788 &pv_page_count, "Current number of allocated pv pages"); 789 790 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); 791 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, 792 &user_pt_page_count, 793 "Current number of allocated page table pages for userspace"); 794 795 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); 796 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, 797 &kernel_pt_page_count, 798 "Current number of allocated page table pages for the kernel"); 799 800 #ifdef PV_STATS 801 802 static COUNTER_U64_DEFINE_EARLY(invl_start_restart); 803 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, 804 CTLFLAG_RD, &invl_start_restart, 805 "Number of delayed TLB invalidation request restarts"); 806 807 static 
COUNTER_U64_DEFINE_EARLY(invl_finish_restart); 808 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 809 &invl_finish_restart, 810 "Number of delayed TLB invalidation completion restarts"); 811 812 static int invl_max_qlen; 813 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 814 &invl_max_qlen, 0, 815 "Maximum delayed TLB invalidation request queue length"); 816 #endif 817 818 #define di_delay locks_delay 819 820 static void 821 pmap_delayed_invl_start_u(void) 822 { 823 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 824 struct thread *td; 825 struct lock_delay_arg lda; 826 uintptr_t prevl; 827 u_char pri; 828 #ifdef PV_STATS 829 int i, ii; 830 #endif 831 832 td = curthread; 833 invl_gen = &td->td_md.md_invl_gen; 834 PMAP_ASSERT_NOT_IN_DI(); 835 lock_delay_arg_init(&lda, &di_delay); 836 invl_gen->saved_pri = 0; 837 pri = td->td_base_pri; 838 if (pri > PVM) { 839 thread_lock(td); 840 pri = td->td_base_pri; 841 if (pri > PVM) { 842 invl_gen->saved_pri = pri; 843 sched_prio(td, PVM); 844 } 845 thread_unlock(td); 846 } 847 again: 848 PV_STAT(i = 0); 849 for (p = &pmap_invl_gen_head;; p = prev.next) { 850 PV_STAT(i++); 851 prevl = (uintptr_t)atomic_load_ptr(&p->next); 852 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 853 PV_STAT(counter_u64_add(invl_start_restart, 1)); 854 lock_delay(&lda); 855 goto again; 856 } 857 if (prevl == 0) 858 break; 859 prev.next = (void *)prevl; 860 } 861 #ifdef PV_STATS 862 if ((ii = invl_max_qlen) < i) 863 atomic_cmpset_int(&invl_max_qlen, ii, i); 864 #endif 865 866 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 867 PV_STAT(counter_u64_add(invl_start_restart, 1)); 868 lock_delay(&lda); 869 goto again; 870 } 871 872 new_prev.gen = prev.gen; 873 new_prev.next = invl_gen; 874 invl_gen->gen = prev.gen + 1; 875 876 /* Formal fence between store to invl->gen and updating *p. */ 877 atomic_thread_fence_rel(); 878 879 /* 880 * After inserting an invl_gen element with invalid bit set, 881 * this thread blocks any other thread trying to enter the 882 * delayed invalidation block. Do not allow to remove us from 883 * the CPU, because it causes starvation for other threads. 884 */ 885 critical_enter(); 886 887 /* 888 * ABA for *p is not possible there, since p->gen can only 889 * increase. So if the *p thread finished its di, then 890 * started a new one and got inserted into the list at the 891 * same place, its gen will appear greater than the previously 892 * read gen. 893 */ 894 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 895 critical_exit(); 896 PV_STAT(counter_u64_add(invl_start_restart, 1)); 897 lock_delay(&lda); 898 goto again; 899 } 900 901 /* 902 * There we clear PMAP_INVL_GEN_NEXT_INVALID in 903 * invl_gen->next, allowing other threads to iterate past us. 904 * pmap_di_store_invl() provides fence between the generation 905 * write and the update of next. 906 */ 907 invl_gen->next = NULL; 908 critical_exit(); 909 } 910 911 static bool 912 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 913 struct pmap_invl_gen *p) 914 { 915 struct pmap_invl_gen prev, new_prev; 916 u_long mygen; 917 918 /* 919 * Load invl_gen->gen after setting invl_gen->next 920 * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger 921 * generations to propagate to our invl_gen->gen. Lock prefix 922 * in atomic_set_ptr() worked as seq_cst fence. 
923 */ 924 mygen = atomic_load_long(&invl_gen->gen); 925 926 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 927 return (false); 928 929 KASSERT(prev.gen < mygen, 930 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 931 new_prev.gen = mygen; 932 new_prev.next = (void *)((uintptr_t)invl_gen->next & 933 ~PMAP_INVL_GEN_NEXT_INVALID); 934 935 /* Formal fence between load of prev and storing update to it. */ 936 atomic_thread_fence_rel(); 937 938 return (pmap_di_store_invl(p, &prev, &new_prev)); 939 } 940 941 static void 942 pmap_delayed_invl_finish_u(void) 943 { 944 struct pmap_invl_gen *invl_gen, *p; 945 struct thread *td; 946 struct lock_delay_arg lda; 947 uintptr_t prevl; 948 949 td = curthread; 950 invl_gen = &td->td_md.md_invl_gen; 951 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 952 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 953 ("missed invl_start: INVALID")); 954 lock_delay_arg_init(&lda, &di_delay); 955 956 again: 957 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 958 prevl = (uintptr_t)atomic_load_ptr(&p->next); 959 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 960 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 961 lock_delay(&lda); 962 goto again; 963 } 964 if ((void *)prevl == invl_gen) 965 break; 966 } 967 968 /* 969 * It is legitimate to not find ourself on the list if a 970 * thread before us finished its DI and started it again. 971 */ 972 if (__predict_false(p == NULL)) { 973 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 974 lock_delay(&lda); 975 goto again; 976 } 977 978 critical_enter(); 979 atomic_set_ptr((uintptr_t *)&invl_gen->next, 980 PMAP_INVL_GEN_NEXT_INVALID); 981 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 982 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 983 PMAP_INVL_GEN_NEXT_INVALID); 984 critical_exit(); 985 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 986 lock_delay(&lda); 987 goto again; 988 } 989 critical_exit(); 990 if (atomic_load_int(&pmap_invl_waiters) > 0) 991 pmap_delayed_invl_finish_unblock(0); 992 if (invl_gen->saved_pri != 0) { 993 thread_lock(td); 994 sched_prio(td, invl_gen->saved_pri); 995 thread_unlock(td); 996 } 997 } 998 999 #ifdef DDB 1000 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 1001 { 1002 struct pmap_invl_gen *p, *pn; 1003 struct thread *td; 1004 uintptr_t nextl; 1005 bool first; 1006 1007 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 1008 first = false) { 1009 nextl = (uintptr_t)atomic_load_ptr(&p->next); 1010 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 1011 td = first ? NULL : __containerof(p, struct thread, 1012 td_md.md_invl_gen); 1013 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 1014 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 1015 td != NULL ? 
td->td_tid : -1); 1016 } 1017 } 1018 #endif 1019 1020 #ifdef PV_STATS 1021 static COUNTER_U64_DEFINE_EARLY(invl_wait); 1022 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, 1023 CTLFLAG_RD, &invl_wait, 1024 "Number of times DI invalidation blocked pmap_remove_all/write"); 1025 1026 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); 1027 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, 1028 &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); 1029 1030 #endif 1031 1032 #ifdef NUMA 1033 static u_long * 1034 pmap_delayed_invl_genp(vm_page_t m) 1035 { 1036 vm_paddr_t pa; 1037 u_long *gen; 1038 1039 pa = VM_PAGE_TO_PHYS(m); 1040 if (__predict_false((pa) > pmap_last_pa)) 1041 gen = &pv_dummy_large.pv_invl_gen; 1042 else 1043 gen = &(pa_to_pmdp(pa)->pv_invl_gen); 1044 1045 return (gen); 1046 } 1047 #else 1048 static u_long * 1049 pmap_delayed_invl_genp(vm_page_t m) 1050 { 1051 1052 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 1053 } 1054 #endif 1055 1056 static void 1057 pmap_delayed_invl_callout_func(void *arg __unused) 1058 { 1059 1060 if (atomic_load_int(&pmap_invl_waiters) == 0) 1061 return; 1062 pmap_delayed_invl_finish_unblock(0); 1063 } 1064 1065 static void 1066 pmap_delayed_invl_callout_init(void *arg __unused) 1067 { 1068 1069 if (pmap_di_locked()) 1070 return; 1071 callout_init(&pmap_invl_callout, 1); 1072 pmap_invl_callout_inited = true; 1073 } 1074 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 1075 pmap_delayed_invl_callout_init, NULL); 1076 1077 /* 1078 * Ensure that all currently executing DI blocks, that need to flush 1079 * TLB for the given page m, actually flushed the TLB at the time the 1080 * function returned. If the page m has an empty PV list and we call 1081 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 1082 * valid mapping for the page m in either its page table or TLB. 1083 * 1084 * This function works by blocking until the global DI generation 1085 * number catches up with the generation number associated with the 1086 * given page m and its PV list. Since this function's callers 1087 * typically own an object lock and sometimes own a page lock, it 1088 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 1089 * processor. 1090 */ 1091 static void 1092 pmap_delayed_invl_wait_l(vm_page_t m) 1093 { 1094 u_long *m_gen; 1095 #ifdef PV_STATS 1096 bool accounted = false; 1097 #endif 1098 1099 m_gen = pmap_delayed_invl_genp(m); 1100 while (*m_gen > pmap_invl_gen) { 1101 #ifdef PV_STATS 1102 if (!accounted) { 1103 counter_u64_add(invl_wait, 1); 1104 accounted = true; 1105 } 1106 #endif 1107 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 1108 } 1109 } 1110 1111 static void 1112 pmap_delayed_invl_wait_u(vm_page_t m) 1113 { 1114 u_long *m_gen; 1115 struct lock_delay_arg lda; 1116 bool fast; 1117 1118 fast = true; 1119 m_gen = pmap_delayed_invl_genp(m); 1120 lock_delay_arg_init(&lda, &di_delay); 1121 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 1122 if (fast || !pmap_invl_callout_inited) { 1123 PV_STAT(counter_u64_add(invl_wait, 1)); 1124 lock_delay(&lda); 1125 fast = false; 1126 } else { 1127 /* 1128 * The page's invalidation generation number 1129 * is still below the current thread's number. 1130 * Prepare to block so that we do not waste 1131 * CPU cycles or worse, suffer livelock. 
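			 *
			 * (We only reach this branch after at least one
			 * pass through the lock_delay() spin above, and
			 * only once the backstop callout has been
			 * initialized.)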
1132 * 1133 * Since it is impossible to block without 1134 * racing with pmap_delayed_invl_finish_u(), 1135 * prepare for the race by incrementing 1136 * pmap_invl_waiters and arming a 1-tick 1137 * callout which will unblock us if we lose 1138 * the race. 1139 */ 1140 atomic_add_int(&pmap_invl_waiters, 1); 1141 1142 /* 1143 * Re-check the current thread's invalidation 1144 * generation after incrementing 1145 * pmap_invl_waiters, so that there is no race 1146 * with pmap_delayed_invl_finish_u() setting 1147 * the page generation and checking 1148 * pmap_invl_waiters. The only race allowed 1149 * is for a missed unblock, which is handled 1150 * by the callout. 1151 */ 1152 if (*m_gen > 1153 atomic_load_long(&pmap_invl_gen_head.gen)) { 1154 callout_reset(&pmap_invl_callout, 1, 1155 pmap_delayed_invl_callout_func, NULL); 1156 PV_STAT(counter_u64_add(invl_wait_slow, 1)); 1157 pmap_delayed_invl_wait_block(m_gen, 1158 &pmap_invl_gen_head.gen); 1159 } 1160 atomic_add_int(&pmap_invl_waiters, -1); 1161 } 1162 } 1163 } 1164 1165 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1166 { 1167 1168 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1169 pmap_thread_init_invl_gen_u); 1170 } 1171 1172 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1173 { 1174 1175 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1176 pmap_delayed_invl_start_u); 1177 } 1178 1179 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1180 { 1181 1182 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1183 pmap_delayed_invl_finish_u); 1184 } 1185 1186 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1187 { 1188 1189 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1190 pmap_delayed_invl_wait_u); 1191 } 1192 1193 /* 1194 * Mark the page m's PV list as participating in the current thread's 1195 * DI block. Any threads concurrently using m's PV list to remove or 1196 * restrict all mappings to m will wait for the current thread's DI 1197 * block to complete before proceeding. 1198 * 1199 * The function works by setting the DI generation number for m's PV 1200 * list to at least the DI generation number of the current thread. 1201 * This forces a caller of pmap_delayed_invl_wait() to block until 1202 * current thread calls pmap_delayed_invl_finish(). 1203 */ 1204 static void 1205 pmap_delayed_invl_page(vm_page_t m) 1206 { 1207 u_long gen, *m_gen; 1208 1209 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1210 gen = curthread->td_md.md_invl_gen.gen; 1211 if (gen == 0) 1212 return; 1213 m_gen = pmap_delayed_invl_genp(m); 1214 if (*m_gen < gen) 1215 *m_gen = gen; 1216 } 1217 1218 /* 1219 * Crashdump maps. 1220 */ 1221 static caddr_t crashdumpmap; 1222 1223 /* 1224 * Internal flags for pmap_enter()'s helper functions. 1225 */ 1226 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1227 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1228 1229 /* 1230 * Internal flags for pmap_mapdev_internal() and 1231 * pmap_change_props_locked(). 1232 */ 1233 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ 1234 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ 1235 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. 
*/ 1236 1237 TAILQ_HEAD(pv_chunklist, pv_chunk); 1238 1239 static void free_pv_chunk(struct pv_chunk *pc); 1240 static void free_pv_chunk_batch(struct pv_chunklist *batch); 1241 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1242 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1243 static int popcnt_pc_map_pq(uint64_t *map); 1244 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1245 static void reserve_pv_entries(pmap_t pmap, int needed, 1246 struct rwlock **lockp); 1247 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1248 struct rwlock **lockp); 1249 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1250 u_int flags, struct rwlock **lockp); 1251 #if VM_NRESERVLEVEL > 0 1252 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1253 struct rwlock **lockp); 1254 #endif 1255 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1256 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1257 vm_offset_t va); 1258 1259 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 1260 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 1261 vm_prot_t prot, int mode, int flags); 1262 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1263 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1264 vm_offset_t va, struct rwlock **lockp); 1265 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1266 vm_offset_t va); 1267 static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1268 vm_prot_t prot, struct rwlock **lockp); 1269 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1270 u_int flags, vm_page_t m, struct rwlock **lockp); 1271 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1272 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1273 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1274 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); 1275 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1276 vm_offset_t eva); 1277 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1278 vm_offset_t eva); 1279 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1280 pd_entry_t pde); 1281 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1282 static vm_page_t pmap_large_map_getptp_unlocked(void); 1283 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1284 #if VM_NRESERVLEVEL > 0 1285 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1286 vm_page_t mpte, struct rwlock **lockp); 1287 #endif 1288 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1289 vm_prot_t prot); 1290 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); 1291 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1292 bool exec); 1293 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1294 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1295 static void pmap_pti_wire_pte(void *pte); 1296 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1297 struct spglist *free, struct rwlock **lockp); 1298 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1299 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1300 static vm_page_t pmap_remove_pt_page(pmap_t pmap, 
vm_offset_t va); 1301 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1302 struct spglist *free); 1303 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1304 pd_entry_t *pde, struct spglist *free, 1305 struct rwlock **lockp); 1306 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1307 vm_page_t m, struct rwlock **lockp); 1308 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1309 pd_entry_t newpde); 1310 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1311 1312 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 1313 struct rwlock **lockp); 1314 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, 1315 struct rwlock **lockp, vm_offset_t va); 1316 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, 1317 struct rwlock **lockp, vm_offset_t va); 1318 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1319 struct rwlock **lockp); 1320 1321 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1322 struct spglist *free); 1323 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1324 1325 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); 1326 static void pmap_free_pt_page(pmap_t, vm_page_t, bool); 1327 1328 /********************/ 1329 /* Inline functions */ 1330 /********************/ 1331 1332 /* 1333 * Return a non-clipped indexes for a given VA, which are page table 1334 * pages indexes at the corresponding level. 1335 */ 1336 static __inline vm_pindex_t 1337 pmap_pde_pindex(vm_offset_t va) 1338 { 1339 return (va >> PDRSHIFT); 1340 } 1341 1342 static __inline vm_pindex_t 1343 pmap_pdpe_pindex(vm_offset_t va) 1344 { 1345 return (NUPDE + (va >> PDPSHIFT)); 1346 } 1347 1348 static __inline vm_pindex_t 1349 pmap_pml4e_pindex(vm_offset_t va) 1350 { 1351 return (NUPDE + NUPDPE + (va >> PML4SHIFT)); 1352 } 1353 1354 static __inline vm_pindex_t 1355 pmap_pml5e_pindex(vm_offset_t va) 1356 { 1357 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); 1358 } 1359 1360 static __inline pml4_entry_t * 1361 pmap_pml5e(pmap_t pmap, vm_offset_t va) 1362 { 1363 1364 MPASS(pmap_is_la57(pmap)); 1365 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); 1366 } 1367 1368 static __inline pml4_entry_t * 1369 pmap_pml5e_u(pmap_t pmap, vm_offset_t va) 1370 { 1371 1372 MPASS(pmap_is_la57(pmap)); 1373 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); 1374 } 1375 1376 static __inline pml4_entry_t * 1377 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) 1378 { 1379 pml4_entry_t *pml4e; 1380 1381 /* XXX MPASS(pmap_is_la57(pmap); */ 1382 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1383 return (&pml4e[pmap_pml4e_index(va)]); 1384 } 1385 1386 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1387 static __inline pml4_entry_t * 1388 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1389 { 1390 pml5_entry_t *pml5e; 1391 pml4_entry_t *pml4e; 1392 pt_entry_t PG_V; 1393 1394 if (pmap_is_la57(pmap)) { 1395 pml5e = pmap_pml5e(pmap, va); 1396 PG_V = pmap_valid_bit(pmap); 1397 if ((*pml5e & PG_V) == 0) 1398 return (NULL); 1399 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1400 } else { 1401 pml4e = pmap->pm_pmltop; 1402 } 1403 return (&pml4e[pmap_pml4e_index(va)]); 1404 } 1405 1406 static __inline pml4_entry_t * 1407 pmap_pml4e_u(pmap_t pmap, vm_offset_t va) 1408 { 1409 MPASS(!pmap_is_la57(pmap)); 1410 return 
(&pmap->pm_pmltopu[pmap_pml4e_index(va)]); 1411 } 1412 1413 /* Return a pointer to the PDP slot that corresponds to a VA */ 1414 static __inline pdp_entry_t * 1415 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1416 { 1417 pdp_entry_t *pdpe; 1418 1419 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1420 return (&pdpe[pmap_pdpe_index(va)]); 1421 } 1422 1423 /* Return a pointer to the PDP slot that corresponds to a VA */ 1424 static __inline pdp_entry_t * 1425 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1426 { 1427 pml4_entry_t *pml4e; 1428 pt_entry_t PG_V; 1429 1430 PG_V = pmap_valid_bit(pmap); 1431 pml4e = pmap_pml4e(pmap, va); 1432 if (pml4e == NULL || (*pml4e & PG_V) == 0) 1433 return (NULL); 1434 return (pmap_pml4e_to_pdpe(pml4e, va)); 1435 } 1436 1437 /* Return a pointer to the PD slot that corresponds to a VA */ 1438 static __inline pd_entry_t * 1439 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1440 { 1441 pd_entry_t *pde; 1442 1443 KASSERT((*pdpe & PG_PS) == 0, 1444 ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); 1445 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1446 return (&pde[pmap_pde_index(va)]); 1447 } 1448 1449 /* Return a pointer to the PD slot that corresponds to a VA */ 1450 static __inline pd_entry_t * 1451 pmap_pde(pmap_t pmap, vm_offset_t va) 1452 { 1453 pdp_entry_t *pdpe; 1454 pt_entry_t PG_V; 1455 1456 PG_V = pmap_valid_bit(pmap); 1457 pdpe = pmap_pdpe(pmap, va); 1458 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1459 return (NULL); 1460 KASSERT((*pdpe & PG_PS) == 0, 1461 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); 1462 return (pmap_pdpe_to_pde(pdpe, va)); 1463 } 1464 1465 /* Return a pointer to the PT slot that corresponds to a VA */ 1466 static __inline pt_entry_t * 1467 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1468 { 1469 pt_entry_t *pte; 1470 1471 KASSERT((*pde & PG_PS) == 0, 1472 ("%s: pde %#lx is a leaf", __func__, *pde)); 1473 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1474 return (&pte[pmap_pte_index(va)]); 1475 } 1476 1477 /* Return a pointer to the PT slot that corresponds to a VA */ 1478 static __inline pt_entry_t * 1479 pmap_pte(pmap_t pmap, vm_offset_t va) 1480 { 1481 pd_entry_t *pde; 1482 pt_entry_t PG_V; 1483 1484 PG_V = pmap_valid_bit(pmap); 1485 pde = pmap_pde(pmap, va); 1486 if (pde == NULL || (*pde & PG_V) == 0) 1487 return (NULL); 1488 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1489 return ((pt_entry_t *)pde); 1490 return (pmap_pde_to_pte(pde, va)); 1491 } 1492 1493 static __inline void 1494 pmap_resident_count_adj(pmap_t pmap, int count) 1495 { 1496 1497 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1498 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1499 ("pmap %p resident count underflow %ld %d", pmap, 1500 pmap->pm_stats.resident_count, count)); 1501 pmap->pm_stats.resident_count += count; 1502 } 1503 1504 static __inline void 1505 pmap_pt_page_count_pinit(pmap_t pmap, int count) 1506 { 1507 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1508 ("pmap %p resident count underflow %ld %d", pmap, 1509 pmap->pm_stats.resident_count, count)); 1510 pmap->pm_stats.resident_count += count; 1511 } 1512 1513 static __inline void 1514 pmap_pt_page_count_adj(pmap_t pmap, int count) 1515 { 1516 if (pmap == kernel_pmap) 1517 counter_u64_add(kernel_pt_page_count, count); 1518 else { 1519 if (pmap != NULL) 1520 pmap_resident_count_adj(pmap, count); 1521 counter_u64_add(user_pt_page_count, count); 1522 } 1523 } 1524 1525 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 1526 
NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3; 1527 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap; 1528 1529 PMAP_INLINE pt_entry_t * 1530 vtopte(vm_offset_t va) 1531 { 1532 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 1533 1534 return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem))); 1535 } 1536 1537 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1538 NPML4EPGSHIFT)) - 1) << 3; 1539 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap; 1540 1541 static __inline pd_entry_t * 1542 vtopde(vm_offset_t va) 1543 { 1544 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1545 1546 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem))); 1547 } 1548 1549 static u_int64_t 1550 allocpages(vm_paddr_t *firstaddr, int n) 1551 { 1552 u_int64_t ret; 1553 1554 ret = *firstaddr; 1555 bzero((void *)ret, n * PAGE_SIZE); 1556 *firstaddr += n * PAGE_SIZE; 1557 return (ret); 1558 } 1559 1560 CTASSERT(powerof2(NDMPML4E)); 1561 1562 /* number of kernel PDP slots */ 1563 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1564 1565 static void 1566 nkpt_init(vm_paddr_t addr) 1567 { 1568 int pt_pages; 1569 1570 #ifdef NKPT 1571 pt_pages = NKPT; 1572 #else 1573 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1574 pt_pages += NKPDPE(pt_pages); 1575 1576 /* 1577 * Add some slop beyond the bare minimum required for bootstrapping 1578 * the kernel. 1579 * 1580 * This is quite important when allocating KVA for kernel modules. 1581 * The modules are required to be linked in the negative 2GB of 1582 * the address space. If we run out of KVA in this region then 1583 * pmap_growkernel() will need to allocate page table pages to map 1584 * the entire 512GB of KVA space which is an unnecessary tax on 1585 * physical memory. 1586 * 1587 * Secondly, device memory mapped as part of setting up the low- 1588 * level console(s) is taken from KVA, starting at virtual_avail. 1589 * This is because cninit() is called after pmap_bootstrap() but 1590 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1591 * not uncommon. 1592 */ 1593 pt_pages += 32; /* 64MB additional slop. */ 1594 #endif 1595 nkpt = pt_pages; 1596 } 1597 1598 /* 1599 * Returns the proper write/execute permission for a physical page that is 1600 * part of the initial boot allocations. 1601 * 1602 * If the page has kernel text, it is marked as read-only. If the page has 1603 * kernel read-only data, it is marked as read-only/not-executable. If the 1604 * page has only read-write data, it is marked as read-write/not-executable. 1605 * If the page is below/above the kernel range, it is marked as read-write. 1606 * 1607 * This function operates on 2M pages, since we map the kernel space that 1608 * way. 1609 */ 1610 static inline pt_entry_t 1611 bootaddr_rwx(vm_paddr_t pa) 1612 { 1613 /* 1614 * The kernel is loaded at a 2MB-aligned address, and memory below that 1615 * need not be executable. The .bss section is padded to a 2MB 1616 * boundary, so memory following the kernel need not be executable 1617 * either. Preloaded kernel modules have their mapping permissions 1618 * fixed up by the linker. 1619 */ 1620 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1621 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1622 return (X86_PG_RW | pg_nx); 1623 1624 /* 1625 * The linker should ensure that the read-only and read-write 1626 * portions don't share the same 2M page, so this shouldn't 1627 * impact read-only data. 
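	 * (For instance, if .rodata were to end partway through a 2M
	 * page that also holds the start of the read-write section,
	 * that entire page would be mapped read-write.)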
However, in any case, any page with 1628 * read-write data needs to be read-write. 1629 */ 1630 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1631 return (X86_PG_RW | pg_nx); 1632 1633 /* 1634 * Mark any 2M page containing kernel text as read-only. Mark 1635 * other pages with read-only data as read-only and not executable. 1636 * (It is likely a small portion of the read-only data section will 1637 * be marked as read-only, but executable. This should be acceptable 1638 * since the read-only protection will keep the data from changing.) 1639 * Note that fixups to the .text section will still work until we 1640 * set CR0.WP. 1641 */ 1642 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1643 return (0); 1644 return (pg_nx); 1645 } 1646 1647 static void 1648 create_pagetables(vm_paddr_t *firstaddr) 1649 { 1650 pd_entry_t *pd_p; 1651 pdp_entry_t *pdp_p; 1652 pml4_entry_t *p4_p; 1653 uint64_t DMPDkernphys; 1654 vm_paddr_t pax; 1655 #ifdef KASAN 1656 pt_entry_t *pt_p; 1657 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1658 vm_offset_t kasankernbase; 1659 int kasankpdpi, kasankpdi, nkasanpte; 1660 #endif 1661 int i, j, ndm1g, nkpdpe, nkdmpde; 1662 1663 /* Allocate page table pages for the direct map */ 1664 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1665 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1666 ndmpdp = 4; 1667 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1668 if (ndmpdpphys > NDMPML4E) { 1669 /* 1670 * Each NDMPML4E allows 512 GB, so limit to that, 1671 * and then readjust ndmpdp and ndmpdpphys. 1672 */ 1673 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1674 Maxmem = atop(NDMPML4E * NBPML4); 1675 ndmpdpphys = NDMPML4E; 1676 ndmpdp = NDMPML4E * NPDEPG; 1677 } 1678 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1679 ndm1g = 0; 1680 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1681 /* 1682 * Calculate the number of 1G pages that will fully fit in 1683 * Maxmem. 1684 */ 1685 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1686 1687 /* 1688 * Allocate 2M pages for the kernel. These will be used in 1689 * place of the one or more 1G pages from ndm1g that maps 1690 * kernel memory into DMAP. 1691 */ 1692 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1693 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1694 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1695 } 1696 if (ndm1g < ndmpdp) 1697 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1698 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1699 1700 /* Allocate pages. */ 1701 KPML4phys = allocpages(firstaddr, 1); 1702 KPDPphys = allocpages(firstaddr, NKPML4E); 1703 #ifdef KASAN 1704 KASANPDPphys = allocpages(firstaddr, NKASANPML4E); 1705 KASANPDphys = allocpages(firstaddr, 1); 1706 #endif 1707 #ifdef KMSAN 1708 /* 1709 * The KMSAN shadow maps are initially left unpopulated, since there is 1710 * no need to shadow memory above KERNBASE. 1711 */ 1712 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); 1713 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); 1714 #endif 1715 1716 /* 1717 * Allocate the initial number of kernel page table pages required to 1718 * bootstrap. We defer this until after all memory-size dependent 1719 * allocations are done (e.g. direct map), so that we don't have to 1720 * build in too much slop in our estimate. 1721 * 1722 * Note that when NKPML4E > 1, we have an empty page underneath 1723 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1724 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
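	 *
	 * Illustrative sizing only: with 16GB of RAM the direct map spans
	 * ndmpdp = 16 PDP entries, which fit in a single PDP page
	 * (ndmpdpphys = 1); those pages were allocated above, before nkpt
	 * is computed here.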
1725 */ 1726 nkpt_init(*firstaddr); 1727 nkpdpe = NKPDPE(nkpt); 1728 1729 KPTphys = allocpages(firstaddr, nkpt); 1730 KPDphys = allocpages(firstaddr, nkpdpe); 1731 1732 #ifdef KASAN 1733 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); 1734 KASANPTphys = allocpages(firstaddr, nkasanpte); 1735 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); 1736 #endif 1737 1738 /* 1739 * Connect the zero-filled PT pages to their PD entries. This 1740 * implicitly maps the PT pages at their correct locations within 1741 * the PTmap. 1742 */ 1743 pd_p = (pd_entry_t *)KPDphys; 1744 for (i = 0; i < nkpt; i++) 1745 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1746 1747 /* 1748 * Map from start of the kernel in physical memory (staging 1749 * area) to the end of loader preallocated memory using 2MB 1750 * pages. This replaces some of the PD entries created above. 1751 * For compatibility, identity map 2M at the start. 1752 */ 1753 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | 1754 X86_PG_RW | pg_nx; 1755 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { 1756 /* Preset PG_M and PG_A because demotion expects it. */ 1757 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1758 X86_PG_A | bootaddr_rwx(pax); 1759 } 1760 1761 /* 1762 * Because we map the physical blocks in 2M pages, adjust firstaddr 1763 * to record the physical blocks we've actually mapped into kernel 1764 * virtual address space. 1765 */ 1766 if (*firstaddr < round_2mpage(KERNend)) 1767 *firstaddr = round_2mpage(KERNend); 1768 1769 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1770 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1771 for (i = 0; i < nkpdpe; i++) 1772 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1773 1774 #ifdef KASAN 1775 kasankernbase = kasan_md_addr_to_shad(KERNBASE); 1776 kasankpdpi = pmap_pdpe_index(kasankernbase); 1777 kasankpdi = pmap_pde_index(kasankernbase); 1778 1779 pdp_p = (pdp_entry_t *)KASANPDPphys; 1780 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); 1781 1782 pd_p = (pd_entry_t *)KASANPDphys; 1783 for (i = 0; i < nkasanpte; i++) 1784 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | 1785 X86_PG_V | pg_nx; 1786 1787 pt_p = (pt_entry_t *)KASANPTphys; 1788 for (i = 0; i < nkasanpte * NPTEPG; i++) 1789 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 1790 X86_PG_M | X86_PG_A | pg_nx; 1791 #endif 1792 1793 /* 1794 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1795 * the end of physical memory is not aligned to a 1GB page boundary, 1796 * then the residual physical memory is mapped with 2MB pages. Later, 1797 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1798 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1799 * that are partially used. 1800 */ 1801 pd_p = (pd_entry_t *)DMPDphys; 1802 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1803 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1804 /* Preset PG_M and PG_A because demotion expects it. */ 1805 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1806 X86_PG_M | X86_PG_A | pg_nx; 1807 } 1808 pdp_p = (pdp_entry_t *)DMPDPphys; 1809 for (i = 0; i < ndm1g; i++) { 1810 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1811 /* Preset PG_M and PG_A because demotion expects it. 
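		 * Presetting them also spares the CPU the locked
		 * read-modify-write it would otherwise perform to set the
		 * accessed and dirty bits on first use of these mappings.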
*/ 1812 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1813 X86_PG_M | X86_PG_A | pg_nx; 1814 } 1815 for (j = 0; i < ndmpdp; i++, j++) { 1816 pdp_p[i] = DMPDphys + ptoa(j); 1817 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1818 } 1819 1820 /* 1821 * Instead of using a 1G page for the memory containing the kernel, 1822 * use 2M pages with read-only and no-execute permissions. (If using 1G 1823 * pages, this will partially overwrite the PDPEs above.) 1824 */ 1825 if (ndm1g > 0) { 1826 pd_p = (pd_entry_t *)DMPDkernphys; 1827 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1828 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1829 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1830 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1831 } 1832 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1833 for (i = 0; i < nkdmpde; i++) { 1834 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1835 X86_PG_RW | X86_PG_V | pg_nx; 1836 } 1837 } 1838 1839 /* And recursively map PML4 to itself in order to get PTmap */ 1840 p4_p = (pml4_entry_t *)KPML4phys; 1841 p4_p[PML4PML4I] = KPML4phys; 1842 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1843 1844 #ifdef KASAN 1845 /* Connect the KASAN shadow map slots up to the PML4. */ 1846 for (i = 0; i < NKASANPML4E; i++) { 1847 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1848 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1849 } 1850 #endif 1851 1852 #ifdef KMSAN 1853 /* Connect the KMSAN shadow map slots up to the PML4. */ 1854 for (i = 0; i < NKMSANSHADPML4E; i++) { 1855 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1856 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1857 } 1858 1859 /* Connect the KMSAN origin map slots up to the PML4. */ 1860 for (i = 0; i < NKMSANORIGPML4E; i++) { 1861 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1862 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1863 } 1864 #endif 1865 1866 /* Connect the Direct Map slots up to the PML4. */ 1867 for (i = 0; i < ndmpdpphys; i++) { 1868 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1869 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1870 } 1871 1872 /* Connect the KVA slots up to the PML4 */ 1873 for (i = 0; i < NKPML4E; i++) { 1874 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1875 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1876 } 1877 1878 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1879 } 1880 1881 /* 1882 * Bootstrap the system enough to run with virtual memory. 1883 * 1884 * On amd64 this is called after mapping has already been enabled 1885 * and just syncs the pmap module with what has already been done. 1886 * [We can't call it easily with mapping off since the kernel is not 1887 * mapped with PA == VA, hence we would have to relocate every address 1888 * from the linked base (virtual) address "KERNBASE" to the actual 1889 * (physical) address starting relative to 0] 1890 */ 1891 void 1892 pmap_bootstrap(vm_paddr_t *firstaddr) 1893 { 1894 vm_offset_t va; 1895 pt_entry_t *pte, *pcpu_pte; 1896 struct region_descriptor r_gdt; 1897 uint64_t cr4, pcpu_phys; 1898 u_long res; 1899 int i; 1900 1901 KERNend = *firstaddr; 1902 res = atop(KERNend - (vm_paddr_t)kernphys); 1903 1904 if (!pti) 1905 pg_g = X86_PG_G; 1906 1907 /* 1908 * Create an initial set of page tables to run the kernel in. 
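 * create_pagetables() builds the boot PML4 along with the kernel
 * PDP/PD/PT pages covering the kernel image and the direct map, and
 * advances *firstaddr past everything it allocates.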
1909 */ 1910 create_pagetables(firstaddr); 1911 1912 pcpu_phys = allocpages(firstaddr, MAXCPU); 1913 1914 /* 1915 * Add a physical memory segment (vm_phys_seg) corresponding to the 1916 * preallocated kernel page table pages so that vm_page structures 1917 * representing these pages will be created. The vm_page structures 1918 * are required for promotion of the corresponding kernel virtual 1919 * addresses to superpage mappings. 1920 */ 1921 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1922 1923 /* 1924 * Account for the virtual addresses mapped by create_pagetables(). 1925 */ 1926 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1927 (vm_paddr_t)kernphys); 1928 virtual_end = VM_MAX_KERNEL_ADDRESS; 1929 1930 /* 1931 * Enable PG_G global pages, then switch to the kernel page 1932 * table from the bootstrap page table. After the switch, it 1933 * is possible to enable SMEP and SMAP since PG_U bits are 1934 * correct now. 1935 */ 1936 cr4 = rcr4(); 1937 cr4 |= CR4_PGE; 1938 load_cr4(cr4); 1939 load_cr3(KPML4phys); 1940 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1941 cr4 |= CR4_SMEP; 1942 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1943 cr4 |= CR4_SMAP; 1944 load_cr4(cr4); 1945 1946 /* 1947 * Initialize the kernel pmap (which is statically allocated). 1948 * Count bootstrap data as being resident in case any of this data is 1949 * later unmapped (using pmap_remove()) and freed. 1950 */ 1951 PMAP_LOCK_INIT(kernel_pmap); 1952 kernel_pmap->pm_pmltop = kernel_pml4; 1953 kernel_pmap->pm_cr3 = KPML4phys; 1954 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1955 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1956 kernel_pmap->pm_stats.resident_count = res; 1957 kernel_pmap->pm_flags = pmap_flags; 1958 1959 /* 1960 * The kernel pmap is always active on all CPUs. Once CPUs are 1961 * enumerated, the mask will be set equal to all_cpus. 1962 */ 1963 CPU_FILL(&kernel_pmap->pm_active); 1964 1965 /* 1966 * Initialize the TLB invalidations generation number lock. 1967 */ 1968 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1969 1970 /* 1971 * Reserve some special page table entries/VA space for temporary 1972 * mapping of pages. 1973 */ 1974 #define SYSMAP(c, p, v, n) \ 1975 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1976 1977 va = virtual_avail; 1978 pte = vtopte(va); 1979 1980 /* 1981 * Crashdump maps. The first page is reused as CMAP1 for the 1982 * memory test. 1983 */ 1984 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 1985 CADDR1 = crashdumpmap; 1986 1987 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 1988 virtual_avail = va; 1989 1990 for (i = 0; i < MAXCPU; i++) { 1991 pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | 1992 pg_g | pg_nx | X86_PG_M | X86_PG_A; 1993 } 1994 1995 /* 1996 * Re-initialize PCPU area for BSP after switching. 1997 * Make hardware use gdt and common_tss from the new PCPU. 
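 * Until now the BSP has been running on temp_bsp_pcpu set up early in
 * boot; move that state into the freshly mapped __pcpu[0] and repoint
 * GSBASE, the GDT and the TSS at it.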
1998 */ 1999 STAILQ_INIT(&cpuhead); 2000 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2001 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2002 amd64_bsp_pcpu_init1(&__pcpu[0]); 2003 amd64_bsp_ist_init(&__pcpu[0]); 2004 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2005 IOPERM_BITMAP_SIZE; 2006 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2007 sizeof(struct user_segment_descriptor)); 2008 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2009 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2010 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2011 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2012 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2013 lgdt(&r_gdt); 2014 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2015 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2016 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2017 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2018 2019 /* 2020 * Initialize the PAT MSR. 2021 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2022 * side-effect, invalidates stale PG_G TLB entries that might 2023 * have been created in our pre-boot environment. 2024 */ 2025 pmap_init_pat(); 2026 2027 /* Initialize TLB Context Id. */ 2028 if (pmap_pcid_enabled) { 2029 for (i = 0; i < MAXCPU; i++) { 2030 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 2031 kernel_pmap->pm_pcids[i].pm_gen = 1; 2032 } 2033 2034 /* 2035 * PMAP_PCID_KERN + 1 is used for initialization of 2036 * proc0 pmap. The pmap' pcid state might be used by 2037 * EFIRT entry before first context switch, so it 2038 * needs to be valid. 2039 */ 2040 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2041 PCPU_SET(pcid_gen, 1); 2042 2043 /* 2044 * pcpu area for APs is zeroed during AP startup. 2045 * pc_pcid_next and pc_pcid_gen are initialized by AP 2046 * during pcpu setup. 2047 */ 2048 load_cr4(rcr4() | CR4_PCIDE); 2049 } 2050 } 2051 2052 /* 2053 * Setup the PAT MSR. 2054 */ 2055 void 2056 pmap_init_pat(void) 2057 { 2058 uint64_t pat_msr; 2059 u_long cr0, cr4; 2060 int i; 2061 2062 /* Bail if this CPU doesn't implement PAT. */ 2063 if ((cpu_feature & CPUID_PAT) == 0) 2064 panic("no PAT??"); 2065 2066 /* Set default PAT index table. */ 2067 for (i = 0; i < PAT_INDEX_SIZE; i++) 2068 pat_index[i] = -1; 2069 pat_index[PAT_WRITE_BACK] = 0; 2070 pat_index[PAT_WRITE_THROUGH] = 1; 2071 pat_index[PAT_UNCACHEABLE] = 3; 2072 pat_index[PAT_WRITE_COMBINING] = 6; 2073 pat_index[PAT_WRITE_PROTECTED] = 5; 2074 pat_index[PAT_UNCACHED] = 2; 2075 2076 /* 2077 * Initialize default PAT entries. 2078 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2079 * Program 5 and 6 as WP and WC. 2080 * 2081 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2082 * mapping for a 2M page uses a PAT value with the bit 3 set due 2083 * to its overload with PG_PS. 2084 */ 2085 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2086 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2087 PAT_VALUE(2, PAT_UNCACHED) | 2088 PAT_VALUE(3, PAT_UNCACHEABLE) | 2089 PAT_VALUE(4, PAT_WRITE_BACK) | 2090 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2091 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2092 PAT_VALUE(7, PAT_UNCACHEABLE); 2093 2094 /* Disable PGE. */ 2095 cr4 = rcr4(); 2096 load_cr4(cr4 & ~CR4_PGE); 2097 2098 /* Disable caches (CD = 1, NW = 0). */ 2099 cr0 = rcr0(); 2100 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2101 2102 /* Flushes caches and TLBs. */ 2103 wbinvd(); 2104 invltlb(); 2105 2106 /* Update PAT and index table. */ 2107 wrmsr(MSR_PAT, pat_msr); 2108 2109 /* Flush caches and TLBs again. 
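 * (Repeat the pre-update flush so that no cache or TLB contents from
 * before the PAT change survive, mirroring the documented procedure for
 * reprogramming memory-type settings.)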
*/ 2110 wbinvd(); 2111 invltlb(); 2112 2113 /* Restore caches and PGE. */ 2114 load_cr0(cr0); 2115 load_cr4(cr4); 2116 } 2117 2118 vm_page_t 2119 pmap_page_alloc_below_4g(bool zeroed) 2120 { 2121 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2122 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2123 } 2124 2125 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2126 la57_trampoline_gdt[], la57_trampoline_end[]; 2127 2128 static void 2129 pmap_bootstrap_la57(void *arg __unused) 2130 { 2131 char *v_code; 2132 pml5_entry_t *v_pml5; 2133 pml4_entry_t *v_pml4; 2134 pdp_entry_t *v_pdp; 2135 pd_entry_t *v_pd; 2136 pt_entry_t *v_pt; 2137 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2138 void (*la57_tramp)(uint64_t pml5); 2139 struct region_descriptor r_gdt; 2140 2141 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2142 return; 2143 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2144 if (!la57) 2145 return; 2146 2147 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2148 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2149 2150 m_code = pmap_page_alloc_below_4g(true); 2151 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2152 m_pml5 = pmap_page_alloc_below_4g(true); 2153 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2154 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2155 m_pml4 = pmap_page_alloc_below_4g(true); 2156 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2157 m_pdp = pmap_page_alloc_below_4g(true); 2158 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2159 m_pd = pmap_page_alloc_below_4g(true); 2160 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2161 m_pt = pmap_page_alloc_below_4g(true); 2162 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2163 2164 /* 2165 * Map m_code 1:1, it appears below 4G in KVA due to physical 2166 * address being below 4G. Since kernel KVA is in upper half, 2167 * the pml4e should be zero and free for temporary use. 2168 */ 2169 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2170 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2171 X86_PG_M; 2172 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2173 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2174 X86_PG_M; 2175 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2176 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2177 X86_PG_M; 2178 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2179 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2180 X86_PG_M; 2181 2182 /* 2183 * Add pml5 entry at top of KVA pointing to existing pml4 table, 2184 * entering all existing kernel mappings into level 5 table. 2185 */ 2186 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2187 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; 2188 2189 /* 2190 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. 2191 */ 2192 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2193 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2194 X86_PG_M; 2195 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2196 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2197 X86_PG_M; 2198 2199 /* 2200 * Copy and call the 48->57 trampoline, hope we return there, alive. 
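 * Roughly: the trampoline runs from the identity-mapped page, loads its
 * private GDT, drops paging so that CR4.LA57 may be toggled, installs
 * the new PML5 root in %cr3, and re-enables paging before returning
 * here with 5-level translation active.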
*/ 2202 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2203 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2204 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2205 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2206 invlpg((vm_offset_t)la57_tramp); 2207 la57_tramp(KPML5phys); 2208 2209 /* 2210 * The trampoline loaded its own GDT; switch back to ours. 2211 */ 2212 lgdt(&r_gdt); 2213 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2214 load_ds(_udatasel); 2215 load_es(_udatasel); 2216 load_fs(_ufssel); 2217 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2218 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2219 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2220 2221 /* 2222 * Now unmap the trampoline, and free the pages. 2223 * Clear pml5 entry used for 1:1 trampoline mapping. 2224 */ 2225 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2226 invlpg((vm_offset_t)v_code); 2227 vm_page_free(m_code); 2228 vm_page_free(m_pdp); 2229 vm_page_free(m_pd); 2230 vm_page_free(m_pt); 2231 2232 /* 2233 * Recursively map PML5 to itself in order to get PTmap and 2234 * PDmap. 2235 */ 2236 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2237 2238 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 2239 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2240 PTmap = (vm_offset_t)P5Tmap; 2241 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 2242 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2243 PDmap = (vm_offset_t)P5Dmap; 2244 2245 kernel_pmap->pm_cr3 = KPML5phys; 2246 kernel_pmap->pm_pmltop = v_pml5; 2247 pmap_pt_page_count_adj(kernel_pmap, 1); 2248 } 2249 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2250 2251 /* 2252 * Initialize a vm_page's machine-dependent fields. 2253 */ 2254 void 2255 pmap_page_init(vm_page_t m) 2256 { 2257 2258 TAILQ_INIT(&m->md.pv_list); 2259 m->md.pat_mode = PAT_WRITE_BACK; 2260 } 2261 2262 static int pmap_allow_2m_x_ept; 2263 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2264 &pmap_allow_2m_x_ept, 0, 2265 "Allow executable superpage mappings in EPT"); 2266 2267 void 2268 pmap_allow_2m_x_ept_recalculate(void) 2269 { 2270 /* 2271 * SKL002, SKL012S. Since the EPT format is only used by 2272 * Intel CPUs, the vendor check is merely a formality.
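 * The errata concern a machine check that the instruction fetch unit
 * can raise when the page size of an executable mapping changes under
 * it; CPUs reporting IF_PSCHANGE_MC_NO, and the Atom/Knights models
 * listed below, are understood not to be affected.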
2273 */ 2274 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2275 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2276 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2277 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2278 CPUID_TO_MODEL(cpu_id) == 0x27 || 2279 CPUID_TO_MODEL(cpu_id) == 0x35 || 2280 CPUID_TO_MODEL(cpu_id) == 0x36 || 2281 CPUID_TO_MODEL(cpu_id) == 0x37 || 2282 CPUID_TO_MODEL(cpu_id) == 0x86 || 2283 CPUID_TO_MODEL(cpu_id) == 0x1c || 2284 CPUID_TO_MODEL(cpu_id) == 0x4a || 2285 CPUID_TO_MODEL(cpu_id) == 0x4c || 2286 CPUID_TO_MODEL(cpu_id) == 0x4d || 2287 CPUID_TO_MODEL(cpu_id) == 0x5a || 2288 CPUID_TO_MODEL(cpu_id) == 0x5c || 2289 CPUID_TO_MODEL(cpu_id) == 0x5d || 2290 CPUID_TO_MODEL(cpu_id) == 0x5f || 2291 CPUID_TO_MODEL(cpu_id) == 0x6e || 2292 CPUID_TO_MODEL(cpu_id) == 0x7a || 2293 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2294 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2295 pmap_allow_2m_x_ept = 1; 2296 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2297 } 2298 2299 static bool 2300 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2301 { 2302 2303 return (pmap->pm_type != PT_EPT || !executable || 2304 !pmap_allow_2m_x_ept); 2305 } 2306 2307 #ifdef NUMA 2308 static void 2309 pmap_init_pv_table(void) 2310 { 2311 struct pmap_large_md_page *pvd; 2312 vm_size_t s; 2313 long start, end, highest, pv_npg; 2314 int domain, i, j, pages; 2315 2316 /* 2317 * For correctness we depend on the size being evenly divisible into a 2318 * page. As a tradeoff between performance and total memory use, the 2319 * entry is 64 bytes (aka one cacheline) in size. Not being smaller 2320 * avoids false-sharing, but not being 128 bytes potentially allows for 2321 * avoidable traffic due to adjacent cacheline prefetcher. 2322 * 2323 * Assert the size so that accidental changes fail to compile. 2324 */ 2325 CTASSERT((sizeof(*pvd) == 64)); 2326 2327 /* 2328 * Calculate the size of the array. 2329 */ 2330 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2331 pv_npg = howmany(pmap_last_pa, NBPDR); 2332 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2333 s = round_page(s); 2334 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2335 if (pv_table == NULL) 2336 panic("%s: kva_alloc failed\n", __func__); 2337 2338 /* 2339 * Iterate physical segments to allocate space for respective pages. 
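 * Backing pages for each segment's slice of pv_table are allocated from
 * that segment's own domain, keeping the pv data for a 2MB range local
 * to the memory it describes.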
2340 */ 2341 highest = -1; 2342 s = 0; 2343 for (i = 0; i < vm_phys_nsegs; i++) { 2344 end = vm_phys_segs[i].end / NBPDR; 2345 domain = vm_phys_segs[i].domain; 2346 2347 if (highest >= end) 2348 continue; 2349 2350 start = highest + 1; 2351 pvd = &pv_table[start]; 2352 2353 pages = end - start + 1; 2354 s = round_page(pages * sizeof(*pvd)); 2355 highest = start + (s / sizeof(*pvd)) - 1; 2356 2357 for (j = 0; j < s; j += PAGE_SIZE) { 2358 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2359 if (m == NULL) 2360 panic("failed to allocate PV table page"); 2361 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2362 } 2363 2364 for (j = 0; j < s / sizeof(*pvd); j++) { 2365 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2366 TAILQ_INIT(&pvd->pv_page.pv_list); 2367 pvd->pv_page.pv_gen = 0; 2368 pvd->pv_page.pat_mode = 0; 2369 pvd->pv_invl_gen = 0; 2370 pvd++; 2371 } 2372 } 2373 pvd = &pv_dummy_large; 2374 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2375 TAILQ_INIT(&pvd->pv_page.pv_list); 2376 pvd->pv_page.pv_gen = 0; 2377 pvd->pv_page.pat_mode = 0; 2378 pvd->pv_invl_gen = 0; 2379 } 2380 #else 2381 static void 2382 pmap_init_pv_table(void) 2383 { 2384 vm_size_t s; 2385 long i, pv_npg; 2386 2387 /* 2388 * Initialize the pool of pv list locks. 2389 */ 2390 for (i = 0; i < NPV_LIST_LOCKS; i++) 2391 rw_init(&pv_list_locks[i], "pmap pv list"); 2392 2393 /* 2394 * Calculate the size of the pv head table for superpages. 2395 */ 2396 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2397 2398 /* 2399 * Allocate memory for the pv head table for superpages. 2400 */ 2401 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2402 s = round_page(s); 2403 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 2404 for (i = 0; i < pv_npg; i++) 2405 TAILQ_INIT(&pv_table[i].pv_list); 2406 TAILQ_INIT(&pv_dummy.pv_list); 2407 } 2408 #endif 2409 2410 /* 2411 * Initialize the pmap module. 2412 * Called by vm_init, to initialize any structures that the pmap 2413 * system needs to map virtual memory. 2414 */ 2415 void 2416 pmap_init(void) 2417 { 2418 struct pmap_preinit_mapping *ppim; 2419 vm_page_t m, mpte; 2420 int error, i, ret, skz63; 2421 2422 /* L1TF, reserve page @0 unconditionally */ 2423 vm_page_blacklist_add(0, bootverbose); 2424 2425 /* Detect bare-metal Skylake Server and Skylake-X. */ 2426 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2427 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2428 /* 2429 * Skylake-X errata SKZ63. Processor May Hang When 2430 * Executing Code In an HLE Transaction Region between 2431 * 40000000H and 403FFFFFH. 2432 * 2433 * Mark the pages in the range as preallocated. It 2434 * seems to be impossible to distinguish between 2435 * Skylake Server and Skylake X. 2436 */ 2437 skz63 = 1; 2438 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2439 if (skz63 != 0) { 2440 if (bootverbose) 2441 printf("SKZ63: skipping 4M RAM starting " 2442 "at physical 1G\n"); 2443 for (i = 0; i < atop(0x400000); i++) { 2444 ret = vm_page_blacklist_add(0x40000000 + 2445 ptoa(i), FALSE); 2446 if (!ret && bootverbose) 2447 printf("page at %#lx already used\n", 2448 0x40000000 + ptoa(i)); 2449 } 2450 } 2451 } 2452 2453 /* IFU */ 2454 pmap_allow_2m_x_ept_recalculate(); 2455 2456 /* 2457 * Initialize the vm page array entries for the kernel pmap's 2458 * page table pages. 
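 * These pages came from allocpages() during bootstrap, before the
 * vm_page array was set up, so their pindex, physical address and
 * reference count are filled in by hand below.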
2459 */ 2460 PMAP_LOCK(kernel_pmap); 2461 for (i = 0; i < nkpt; i++) { 2462 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2463 KASSERT(mpte >= vm_page_array && 2464 mpte < &vm_page_array[vm_page_array_size], 2465 ("pmap_init: page table page is out of range")); 2466 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2467 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2468 mpte->ref_count = 1; 2469 2470 /* 2471 * Collect the page table pages that were replaced by a 2MB 2472 * page in create_pagetables(). They are zero filled. 2473 */ 2474 if ((i == 0 || 2475 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2476 pmap_insert_pt_page(kernel_pmap, mpte, false)) 2477 panic("pmap_init: pmap_insert_pt_page failed"); 2478 } 2479 PMAP_UNLOCK(kernel_pmap); 2480 vm_wire_add(nkpt); 2481 2482 /* 2483 * If the kernel is running on a virtual machine, then it must assume 2484 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2485 * be prepared for the hypervisor changing the vendor and family that 2486 * are reported by CPUID. Consequently, the workaround for AMD Family 2487 * 10h Erratum 383 is enabled if the processor's feature set does not 2488 * include at least one feature that is only supported by older Intel 2489 * or newer AMD processors. 2490 */ 2491 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2492 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2493 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2494 AMDID2_FMA4)) == 0) 2495 workaround_erratum383 = 1; 2496 2497 /* 2498 * Are large page mappings enabled? 2499 */ 2500 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2501 if (pg_ps_enabled) { 2502 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2503 ("pmap_init: can't assign to pagesizes[1]")); 2504 pagesizes[1] = NBPDR; 2505 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2506 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2507 ("pmap_init: can't assign to pagesizes[2]")); 2508 pagesizes[2] = NBPDP; 2509 } 2510 } 2511 2512 /* 2513 * Initialize pv chunk lists. 
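 * (One list and mutex per memory domain, PMAP_MEMDOM in total, to
 * spread lock contention.)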
2514 */ 2515 for (i = 0; i < PMAP_MEMDOM; i++) { 2516 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2517 TAILQ_INIT(&pv_chunks[i].pvc_list); 2518 } 2519 pmap_init_pv_table(); 2520 2521 pmap_initialized = 1; 2522 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2523 ppim = pmap_preinit_mapping + i; 2524 if (ppim->va == 0) 2525 continue; 2526 /* Make the direct map consistent */ 2527 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2528 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2529 ppim->sz, ppim->mode); 2530 } 2531 if (!bootverbose) 2532 continue; 2533 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2534 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2535 } 2536 2537 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2538 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2539 (vmem_addr_t *)&qframe); 2540 if (error != 0) 2541 panic("qframe allocation failed"); 2542 2543 lm_ents = 8; 2544 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2545 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2546 lm_ents = LMEPML4I - LMSPML4I + 1; 2547 #ifdef KMSAN 2548 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2549 printf( 2550 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2551 lm_ents, KMSANORIGPML4I - LMSPML4I); 2552 lm_ents = KMSANORIGPML4I - LMSPML4I; 2553 } 2554 #endif 2555 if (bootverbose) 2556 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2557 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2558 if (lm_ents != 0) { 2559 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2560 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2561 if (large_vmem == NULL) { 2562 printf("pmap: cannot create large map\n"); 2563 lm_ents = 0; 2564 } 2565 for (i = 0; i < lm_ents; i++) { 2566 m = pmap_large_map_getptp_unlocked(); 2567 /* XXXKIB la57 */ 2568 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2569 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2570 VM_PAGE_TO_PHYS(m); 2571 } 2572 } 2573 } 2574 2575 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2576 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2577 "Maximum number of PML4 entries for use by large map (tunable). " 2578 "Each entry corresponds to 512GB of address space."); 2579 2580 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2581 "2MB page mapping counters"); 2582 2583 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2584 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2585 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2586 2587 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2588 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2589 &pmap_pde_mappings, "2MB page mappings"); 2590 2591 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2592 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2593 &pmap_pde_p_failures, "2MB page promotion failures"); 2594 2595 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2596 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2597 &pmap_pde_promotions, "2MB page promotions"); 2598 2599 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2600 "1GB page mapping counters"); 2601 2602 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2603 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2604 &pmap_pdpe_demotions, "1GB page demotions"); 2605 2606 /*************************************************** 2607 * Low level helper routines..... 
2608 ***************************************************/ 2609 2610 static pt_entry_t 2611 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2612 { 2613 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2614 2615 switch (pmap->pm_type) { 2616 case PT_X86: 2617 case PT_RVI: 2618 /* Verify that both PAT bits are not set at the same time */ 2619 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2620 ("Invalid PAT bits in entry %#lx", entry)); 2621 2622 /* Swap the PAT bits if one of them is set */ 2623 if ((entry & x86_pat_bits) != 0) 2624 entry ^= x86_pat_bits; 2625 break; 2626 case PT_EPT: 2627 /* 2628 * Nothing to do - the memory attributes are represented 2629 * the same way for regular pages and superpages. 2630 */ 2631 break; 2632 default: 2633 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2634 } 2635 2636 return (entry); 2637 } 2638 2639 boolean_t 2640 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2641 { 2642 2643 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2644 pat_index[(int)mode] >= 0); 2645 } 2646 2647 /* 2648 * Determine the appropriate bits to set in a PTE or PDE for a specified 2649 * caching mode. 2650 */ 2651 int 2652 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 2653 { 2654 int cache_bits, pat_flag, pat_idx; 2655 2656 if (!pmap_is_valid_memattr(pmap, mode)) 2657 panic("Unknown caching mode %d\n", mode); 2658 2659 switch (pmap->pm_type) { 2660 case PT_X86: 2661 case PT_RVI: 2662 /* The PAT bit is different for PTE's and PDE's. */ 2663 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2664 2665 /* Map the caching mode to a PAT index. */ 2666 pat_idx = pat_index[mode]; 2667 2668 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2669 cache_bits = 0; 2670 if (pat_idx & 0x4) 2671 cache_bits |= pat_flag; 2672 if (pat_idx & 0x2) 2673 cache_bits |= PG_NC_PCD; 2674 if (pat_idx & 0x1) 2675 cache_bits |= PG_NC_PWT; 2676 break; 2677 2678 case PT_EPT: 2679 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2680 break; 2681 2682 default: 2683 panic("unsupported pmap type %d", pmap->pm_type); 2684 } 2685 2686 return (cache_bits); 2687 } 2688 2689 static int 2690 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 2691 { 2692 int mask; 2693 2694 switch (pmap->pm_type) { 2695 case PT_X86: 2696 case PT_RVI: 2697 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2698 break; 2699 case PT_EPT: 2700 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2701 break; 2702 default: 2703 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2704 } 2705 2706 return (mask); 2707 } 2708 2709 static int 2710 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2711 { 2712 int pat_flag, pat_idx; 2713 2714 pat_idx = 0; 2715 switch (pmap->pm_type) { 2716 case PT_X86: 2717 case PT_RVI: 2718 /* The PAT bit is different for PTE's and PDE's. */ 2719 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2720 2721 if ((pte & pat_flag) != 0) 2722 pat_idx |= 0x4; 2723 if ((pte & PG_NC_PCD) != 0) 2724 pat_idx |= 0x2; 2725 if ((pte & PG_NC_PWT) != 0) 2726 pat_idx |= 0x1; 2727 break; 2728 case PT_EPT: 2729 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2730 panic("EPT PTE %#lx has no PAT memory type", pte); 2731 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2732 break; 2733 } 2734 2735 /* See pmap_init_pat(). 
*/ 2736 if (pat_idx == 4) 2737 pat_idx = 0; 2738 if (pat_idx == 7) 2739 pat_idx = 3; 2740 2741 return (pat_idx); 2742 } 2743 2744 bool 2745 pmap_ps_enabled(pmap_t pmap) 2746 { 2747 2748 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2749 } 2750 2751 static void 2752 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2753 { 2754 2755 switch (pmap->pm_type) { 2756 case PT_X86: 2757 break; 2758 case PT_RVI: 2759 case PT_EPT: 2760 /* 2761 * XXX 2762 * This is a little bogus since the generation number is 2763 * supposed to be bumped up when a region of the address 2764 * space is invalidated in the page tables. 2765 * 2766 * In this case the old PDE entry is valid but yet we want 2767 * to make sure that any mappings using the old entry are 2768 * invalidated in the TLB. 2769 * 2770 * The reason this works as expected is because we rendezvous 2771 * "all" host cpus and force any vcpu context to exit as a 2772 * side-effect. 2773 */ 2774 atomic_add_long(&pmap->pm_eptgen, 1); 2775 break; 2776 default: 2777 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2778 } 2779 pde_store(pde, newpde); 2780 } 2781 2782 /* 2783 * After changing the page size for the specified virtual address in the page 2784 * table, flush the corresponding entries from the processor's TLB. Only the 2785 * calling processor's TLB is affected. 2786 * 2787 * The calling thread must be pinned to a processor. 2788 */ 2789 static void 2790 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2791 { 2792 pt_entry_t PG_G; 2793 2794 if (pmap_type_guest(pmap)) 2795 return; 2796 2797 KASSERT(pmap->pm_type == PT_X86, 2798 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2799 2800 PG_G = pmap_global_bit(pmap); 2801 2802 if ((newpde & PG_PS) == 0) 2803 /* Demotion: flush a specific 2MB page mapping. */ 2804 pmap_invlpg(pmap, va); 2805 else if ((newpde & PG_G) == 0) 2806 /* 2807 * Promotion: flush every 4KB page mapping from the TLB 2808 * because there are too many to flush individually. 2809 */ 2810 invltlb(); 2811 else { 2812 /* 2813 * Promotion: flush every 4KB page mapping from the TLB, 2814 * including any global (PG_G) mappings. 2815 */ 2816 invltlb_glob(); 2817 } 2818 } 2819 2820 /* 2821 * The amd64 pmap uses different approaches to TLB invalidation 2822 * depending on the kernel configuration, available hardware features, 2823 * and known hardware errata. The kernel configuration option that 2824 * has the greatest operational impact on TLB invalidation is PTI, 2825 * which is enabled automatically on affected Intel CPUs. The most 2826 * impactful hardware features are first PCID, and then INVPCID 2827 * instruction presence. PCID usage is quite different for PTI 2828 * vs. non-PTI. 2829 * 2830 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2831 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2832 * space is served by two page tables, user and kernel. The user 2833 * page table only maps user space and a kernel trampoline. The 2834 * kernel trampoline includes the entirety of the kernel text but 2835 * only the kernel data that is needed to switch from user to kernel 2836 * mode. The kernel page table maps the user and kernel address 2837 * spaces in their entirety. It is identical to the per-process 2838 * page table used in non-PTI mode. 2839 * 2840 * User page tables are only used when the CPU is in user mode. 
2841 * Consequently, some TLB invalidations can be postponed until the 2842 * switch from kernel to user mode. In contrast, the user 2843 * space part of the kernel page table is used for copyout(9), so 2844 * TLB invalidations on this page table cannot be similarly postponed. 2845 * 2846 * The existence of a user mode page table for the given pmap is 2847 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2848 * which case pm_ucr3 contains the %cr3 register value for the user 2849 * mode page table's root. 2850 * 2851 * * The pm_active bitmask indicates which CPUs currently have the 2852 * pmap active. A CPU's bit is set on context switch to the pmap, and 2853 * cleared on switching off this CPU. For the kernel page table, 2854 * the pm_active field is immutable and contains all CPUs. The 2855 * kernel page table is always logically active on every processor, 2856 * but not necessarily in use by the hardware, e.g., in PTI mode. 2857 * 2858 * When requesting invalidation of virtual addresses with 2859 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2860 * all CPUs recorded as active in pm_active. Updates to and reads 2861 * from pm_active are not synchronized, and so they may race with 2862 * each other. Shootdown handlers are prepared to handle the race. 2863 * 2864 * * PCID is an optional feature of the long mode x86 MMU where TLB 2865 * entries are tagged with the 'Process ID' of the address space 2866 * they belong to. This feature provides a limited namespace for 2867 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2868 * total. 2869 * 2870 * Allocation of a PCID to a pmap is done by an algorithm described 2871 * in section 15.12, "Other TLB Consistency Algorithms", of 2872 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2873 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2874 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when 2875 * the CPU is about to start caching TLB entries from a pmap, 2876 * i.e., on the context switch that activates the pmap on the CPU. 2877 * 2878 * The PCID allocator maintains a per-CPU, per-pmap generation 2879 * count, pm_gen, which is incremented each time a new PCID is 2880 * allocated. On TLB invalidation, the generation counters for the 2881 * pmap are zeroed, which signals the context switch code that the 2882 * previously allocated PCID is no longer valid. Effectively, 2883 * zeroing any of these counters triggers a TLB shootdown for the 2884 * given CPU/address space, due to the allocation of a new PCID. 2885 * 2886 * Zeroing can be performed remotely. Consequently, if a pmap is 2887 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2888 * be initiated by an ordinary memory access to reset the target 2889 * CPU's generation count within the pmap. The CPU initiating the 2890 * TLB shootdown does not need to send an IPI to the target CPU. 2891 * 2892 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2893 * for complete (kernel) page tables, and PCIDs for user mode page 2894 * tables. A user PCID value is obtained from the kernel PCID value 2895 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2896 * 2897 * User space page tables are activated on return to user mode, by 2898 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2899 * clearing bit 63 of the loaded ucr3, this effectively causes 2900 * complete invalidation of the user mode TLB entries for the 2901 * current pmap. 
In which case, local invalidations of individual 2902 * pages in the user page table are skipped. 2903 * 2904 * * Local invalidation, all modes. If the requested invalidation is 2905 * for a specific address or the total invalidation of a currently 2906 * active pmap, then the TLB is flushed using INVLPG for a kernel 2907 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a 2908 * user space page table(s). 2909 * 2910 * If the INVPCID instruction is available, it is used to flush user 2911 * entries from the kernel page table. 2912 * 2913 * When PCID is enabled, the INVLPG instruction invalidates all TLB 2914 * entries for the given page that either match the current PCID or 2915 * are global. Since TLB entries for the same page under different 2916 * PCIDs are unaffected, kernel pages which reside in all address 2917 * spaces could be problematic. We avoid the problem by creating 2918 * all kernel PTEs with the global flag (PG_G) set, when PTI is 2919 * disabled. 2920 * 2921 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its 2922 * address space, all other 4095 PCIDs are used for user mode spaces 2923 * as described above. A context switch allocates a new PCID if 2924 * the recorded PCID is zero or the recorded generation does not match 2925 * the CPU's generation, effectively flushing the TLB for this address space. 2926 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2927 * local user page: INVLPG 2928 * local kernel page: INVLPG 2929 * local user total: INVPCID(CTX) 2930 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2931 * remote user page, inactive pmap: zero pm_gen 2932 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2933 * (Both actions are required to handle the aforementioned pm_active races.) 2934 * remote kernel page: IPI:INVLPG 2935 * remote user total, inactive pmap: zero pm_gen 2936 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2937 * reload %cr3) 2938 * (See note above about pm_active races.) 2939 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2940 * 2941 * PTI enabled, PCID present. 2942 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2943 * for upt 2944 * local kernel page: INVLPG 2945 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 2946 * on loading UCR3 into %cr3 for upt 2947 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2948 * remote user page, inactive pmap: zero pm_gen 2949 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 2950 * INVPCID(ADDR) for upt) 2951 * remote kernel page: IPI:INVLPG 2952 * remote user total, inactive pmap: zero pm_gen 2953 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 2954 * clear PCID_SAVE on loading UCR3 into $cr3 for upt) 2955 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2956 * 2957 * No PCID. 2958 * local user page: INVLPG 2959 * local kernel page: INVLPG 2960 * local user total: reload %cr3 2961 * local kernel total: invltlb_glob() 2962 * remote user page, inactive pmap: - 2963 * remote user page, active pmap: IPI:INVLPG 2964 * remote kernel page: IPI:INVLPG 2965 * remote user total, inactive pmap: - 2966 * remote user total, active pmap: IPI:(reload %cr3) 2967 * remote kernel total: IPI:invltlb_glob() 2968 * Since on return to user mode, the reload of %cr3 with ucr3 causes 2969 * TLB invalidation, no specific action is required for user page table. 2970 * 2971 * EPT. 
EPT pmaps do not map KVA, all mappings are userspace. 2972 * XXX TODO 2973 */ 2974 2975 #ifdef SMP 2976 /* 2977 * Interrupt the cpus that are executing in the guest context. 2978 * This will force the vcpu to exit and the cached EPT mappings 2979 * will be invalidated by the host before the next vmresume. 2980 */ 2981 static __inline void 2982 pmap_invalidate_ept(pmap_t pmap) 2983 { 2984 smr_seq_t goal; 2985 int ipinum; 2986 2987 sched_pin(); 2988 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 2989 ("pmap_invalidate_ept: absurd pm_active")); 2990 2991 /* 2992 * The TLB mappings associated with a vcpu context are not 2993 * flushed each time a different vcpu is chosen to execute. 2994 * 2995 * This is in contrast with a process's vtop mappings that 2996 * are flushed from the TLB on each context switch. 2997 * 2998 * Therefore we need to do more than just a TLB shootdown on 2999 * the active cpus in 'pmap->pm_active'. To do this we keep 3000 * track of the number of invalidations performed on this pmap. 3001 * 3002 * Each vcpu keeps a cache of this counter and compares it 3003 * just before a vmresume. If the counter is out-of-date an 3004 * invept will be done to flush stale mappings from the TLB. 3005 * 3006 * To ensure that all vCPU threads have observed the new counter 3007 * value before returning, we use SMR. Ordering is important here: 3008 * the VMM enters an SMR read section before loading the counter 3009 * and after updating the pm_active bit set. Thus, pm_active is 3010 * a superset of active readers, and any reader that has observed 3011 * the goal has observed the new counter value. 3012 */ 3013 atomic_add_long(&pmap->pm_eptgen, 1); 3014 3015 goal = smr_advance(pmap->pm_eptsmr); 3016 3017 /* 3018 * Force the vcpu to exit and trap back into the hypervisor. 3019 */ 3020 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3021 ipi_selected(pmap->pm_active, ipinum); 3022 sched_unpin(); 3023 3024 /* 3025 * Ensure that all active vCPUs will observe the new generation counter 3026 * value before executing any more guest instructions. 3027 */ 3028 smr_wait(pmap->pm_eptsmr, goal); 3029 } 3030 3031 static inline void 3032 pmap_invalidate_preipi_pcid(pmap_t pmap) 3033 { 3034 u_int cpuid, i; 3035 3036 sched_pin(); 3037 3038 cpuid = PCPU_GET(cpuid); 3039 if (pmap != PCPU_GET(curpmap)) 3040 cpuid = 0xffffffff; /* An impossible value */ 3041 3042 CPU_FOREACH(i) { 3043 if (cpuid != i) 3044 pmap->pm_pcids[i].pm_gen = 0; 3045 } 3046 3047 /* 3048 * The fence is between stores to pm_gen and the read of the 3049 * pm_active mask. We need to ensure that it is impossible 3050 * for us to miss the bit update in pm_active and 3051 * simultaneously observe a non-zero pm_gen in 3052 * pmap_activate_sw(), otherwise TLB update is missed. 3053 * Without the fence, IA32 allows such an outcome. Note that 3054 * pm_active is updated by a locked operation, which provides 3055 * the reciprocal fence. 3056 */ 3057 atomic_thread_fence_seq_cst(); 3058 } 3059 3060 static void 3061 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3062 { 3063 sched_pin(); 3064 } 3065 3066 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3067 { 3068 return (pmap_pcid_enabled ? 
pmap_invalidate_preipi_pcid : 3069 pmap_invalidate_preipi_nopcid); 3070 } 3071 3072 static inline void 3073 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3074 const bool invpcid_works1) 3075 { 3076 struct invpcid_descr d; 3077 uint64_t kcr3, ucr3; 3078 uint32_t pcid; 3079 u_int cpuid; 3080 3081 /* 3082 * Because pm_pcid is recalculated on a context switch, we 3083 * must ensure there is no preemption, not just pinning. 3084 * Otherwise, we might use a stale value below. 3085 */ 3086 CRITICAL_ASSERT(curthread); 3087 3088 /* 3089 * No need to do anything with user page tables invalidation 3090 * if there is no user page table, or invalidation is deferred 3091 * until the return to userspace. ucr3_load_mask is stable 3092 * because we have preemption disabled. 3093 */ 3094 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3095 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3096 return; 3097 3098 cpuid = PCPU_GET(cpuid); 3099 3100 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3101 if (invpcid_works1) { 3102 d.pcid = pcid | PMAP_PCID_USER_PT; 3103 d.pad = 0; 3104 d.addr = va; 3105 invpcid(&d, INVPCID_ADDR); 3106 } else { 3107 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3108 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3109 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3110 } 3111 } 3112 3113 static void 3114 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3115 { 3116 pmap_invalidate_page_pcid_cb(pmap, va, true); 3117 } 3118 3119 static void 3120 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3121 { 3122 pmap_invalidate_page_pcid_cb(pmap, va, false); 3123 } 3124 3125 static void 3126 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3127 { 3128 } 3129 3130 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3131 { 3132 if (pmap_pcid_enabled) 3133 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : 3134 pmap_invalidate_page_pcid_noinvpcid_cb); 3135 return (pmap_invalidate_page_nopcid_cb); 3136 } 3137 3138 static void 3139 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3140 vm_offset_t addr2 __unused) 3141 { 3142 if (pmap == kernel_pmap) { 3143 pmap_invlpg(kernel_pmap, va); 3144 } else if (pmap == PCPU_GET(curpmap)) { 3145 invlpg(va); 3146 pmap_invalidate_page_cb(pmap, va); 3147 } 3148 } 3149 3150 void 3151 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3152 { 3153 if (pmap_type_guest(pmap)) { 3154 pmap_invalidate_ept(pmap); 3155 return; 3156 } 3157 3158 KASSERT(pmap->pm_type == PT_X86, 3159 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3160 3161 pmap_invalidate_preipi(pmap); 3162 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb); 3163 } 3164 3165 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3166 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3167 3168 static void 3169 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3170 const bool invpcid_works1) 3171 { 3172 struct invpcid_descr d; 3173 uint64_t kcr3, ucr3; 3174 uint32_t pcid; 3175 u_int cpuid; 3176 3177 CRITICAL_ASSERT(curthread); 3178 3179 if (pmap != PCPU_GET(curpmap) || 3180 pmap->pm_ucr3 == PMAP_NO_CR3 || 3181 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3182 return; 3183 3184 cpuid = PCPU_GET(cpuid); 3185 3186 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3187 if (invpcid_works1) { 3188 d.pcid = pcid | PMAP_PCID_USER_PT; 3189 d.pad = 0; 3190 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3191 invpcid(&d, INVPCID_ADDR); 3192 } else { 3193 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3194 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3195 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3196 } 3197 } 3198 3199 static void 3200 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3201 vm_offset_t eva) 3202 { 3203 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3204 } 3205 3206 static void 3207 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3208 vm_offset_t eva) 3209 { 3210 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3211 } 3212 3213 static void 3214 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3215 vm_offset_t eva __unused) 3216 { 3217 } 3218 3219 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3220 vm_offset_t)) 3221 { 3222 if (pmap_pcid_enabled) 3223 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : 3224 pmap_invalidate_range_pcid_noinvpcid_cb); 3225 return (pmap_invalidate_range_nopcid_cb); 3226 } 3227 3228 static void 3229 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3230 { 3231 vm_offset_t addr; 3232 3233 if (pmap == kernel_pmap) { 3234 if (PCPU_GET(pcid_invlpg_workaround)) { 3235 struct invpcid_descr d = { 0 }; 3236 3237 invpcid(&d, INVPCID_CTXGLOB); 3238 } else { 3239 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3240 invlpg(addr); 3241 } 3242 } else if (pmap == PCPU_GET(curpmap)) { 3243 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3244 invlpg(addr); 3245 pmap_invalidate_range_cb(pmap, sva, eva); 3246 } 3247 } 3248 3249 void 3250 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3251 { 3252 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3253 pmap_invalidate_all(pmap); 3254 return; 3255 } 3256 3257 if (pmap_type_guest(pmap)) { 3258 pmap_invalidate_ept(pmap); 3259 return; 3260 } 3261 3262 KASSERT(pmap->pm_type == PT_X86, 3263 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3264 3265 pmap_invalidate_preipi(pmap); 3266 smp_masked_invlpg_range(sva, eva, pmap, 3267 pmap_invalidate_range_curcpu_cb); 3268 } 3269 3270 static inline void 3271 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3272 { 3273 struct invpcid_descr d; 3274 uint64_t kcr3; 3275 uint32_t pcid; 3276 u_int cpuid; 3277 3278 if (pmap == kernel_pmap) { 3279 if (invpcid_works1) { 3280 bzero(&d, sizeof(d)); 3281 invpcid(&d, INVPCID_CTXGLOB); 3282 } else { 3283 invltlb_glob(); 3284 } 3285 } else if (pmap == PCPU_GET(curpmap)) { 3286 CRITICAL_ASSERT(curthread); 3287 cpuid = PCPU_GET(cpuid); 3288 3289 pcid = pmap->pm_pcids[cpuid].pm_pcid; 3290 if (invpcid_works1) { 3291 d.pcid = pcid; 3292 d.pad = 0; 3293 d.addr = 0; 3294 invpcid(&d, INVPCID_CTX); 3295 } else { 3296 kcr3 = pmap->pm_cr3 | pcid; 3297 load_cr3(kcr3); 3298 } 3299 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3300 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3301 } 3302 } 3303 3304 static void 3305 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3306 { 3307 pmap_invalidate_all_pcid_cb(pmap, true); 3308 } 3309 3310 static void 3311 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3312 { 3313 pmap_invalidate_all_pcid_cb(pmap, false); 3314 } 3315 3316 static void 3317 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3318 { 3319 if (pmap == kernel_pmap) 3320 invltlb_glob(); 3321 else if (pmap == PCPU_GET(curpmap)) 3322 invltlb(); 3323 } 3324 3325 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3326 { 3327 if (pmap_pcid_enabled) 3328 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3329 pmap_invalidate_all_pcid_noinvpcid_cb); 3330 return (pmap_invalidate_all_nopcid_cb); 3331 } 3332 3333 static void 3334 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3335 vm_offset_t addr2 __unused) 3336 { 3337 pmap_invalidate_all_cb(pmap); 3338 } 3339 3340 void 3341 pmap_invalidate_all(pmap_t pmap) 3342 { 3343 if (pmap_type_guest(pmap)) { 3344 pmap_invalidate_ept(pmap); 3345 return; 3346 } 3347 3348 KASSERT(pmap->pm_type == PT_X86, 3349 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3350 3351 pmap_invalidate_preipi(pmap); 3352 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb); 3353 } 3354 3355 static void 3356 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3357 vm_offset_t addr2 __unused) 3358 { 3359 wbinvd(); 3360 } 3361 3362 void 3363 pmap_invalidate_cache(void) 3364 { 3365 sched_pin(); 3366 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3367 } 3368 3369 struct pde_action { 3370 cpuset_t invalidate; /* processors that invalidate their TLB */ 3371 pmap_t pmap; 3372 vm_offset_t va; 3373 pd_entry_t *pde; 3374 pd_entry_t newpde; 3375 u_int store; /* processor that updates the PDE */ 3376 }; 3377 3378 static void 3379 pmap_update_pde_action(void *arg) 3380 { 3381 struct pde_action *act = arg; 3382 3383 if (act->store == PCPU_GET(cpuid)) 3384 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3385 } 3386 3387 static void 3388 pmap_update_pde_teardown(void *arg) 3389 { 3390 struct pde_action *act = arg; 3391 3392 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3393 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3394 } 3395 3396 /* 3397 * Change the page size for the specified virtual address in a way that 3398 * prevents any possibility of the TLB ever having two entries that map the 3399 * same virtual address using different page sizes. This is the recommended 3400 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3401 * machine check exception for a TLB state that is improperly diagnosed as a 3402 * hardware error. 3403 */ 3404 static void 3405 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3406 { 3407 struct pde_action act; 3408 cpuset_t active, other_cpus; 3409 u_int cpuid; 3410 3411 sched_pin(); 3412 cpuid = PCPU_GET(cpuid); 3413 other_cpus = all_cpus; 3414 CPU_CLR(cpuid, &other_cpus); 3415 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3416 active = all_cpus; 3417 else { 3418 active = pmap->pm_active; 3419 } 3420 if (CPU_OVERLAP(&active, &other_cpus)) { 3421 act.store = cpuid; 3422 act.invalidate = active; 3423 act.va = va; 3424 act.pmap = pmap; 3425 act.pde = pde; 3426 act.newpde = newpde; 3427 CPU_SET(cpuid, &active); 3428 smp_rendezvous_cpus(active, 3429 smp_no_rendezvous_barrier, pmap_update_pde_action, 3430 pmap_update_pde_teardown, &act); 3431 } else { 3432 pmap_update_pde_store(pmap, pde, newpde); 3433 if (CPU_ISSET(cpuid, &active)) 3434 pmap_update_pde_invalidate(pmap, va, newpde); 3435 } 3436 sched_unpin(); 3437 } 3438 #else /* !SMP */ 3439 /* 3440 * Normal, non-SMP, invalidation functions. 
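 * With a single CPU there are no shootdown IPIs to send; only the local
 * TLB needs flushing, and, when PCID is enabled, pm_gen is zeroed for
 * pmaps that are not currently loaded.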
3441 */ 3442 void 3443 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3444 { 3445 struct invpcid_descr d; 3446 uint64_t kcr3, ucr3; 3447 uint32_t pcid; 3448 3449 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3450 pmap->pm_eptgen++; 3451 return; 3452 } 3453 KASSERT(pmap->pm_type == PT_X86, 3454 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3455 3456 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3457 invlpg(va); 3458 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3459 pmap->pm_ucr3 != PMAP_NO_CR3) { 3460 critical_enter(); 3461 pcid = pmap->pm_pcids[0].pm_pcid; 3462 if (invpcid_works) { 3463 d.pcid = pcid | PMAP_PCID_USER_PT; 3464 d.pad = 0; 3465 d.addr = va; 3466 invpcid(&d, INVPCID_ADDR); 3467 } else { 3468 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3469 ucr3 = pmap->pm_ucr3 | pcid | 3470 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3471 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3472 } 3473 critical_exit(); 3474 } 3475 } else if (pmap_pcid_enabled) 3476 pmap->pm_pcids[0].pm_gen = 0; 3477 } 3478 3479 void 3480 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3481 { 3482 struct invpcid_descr d; 3483 vm_offset_t addr; 3484 uint64_t kcr3, ucr3; 3485 3486 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3487 pmap->pm_eptgen++; 3488 return; 3489 } 3490 KASSERT(pmap->pm_type == PT_X86, 3491 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3492 3493 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3494 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3495 invlpg(addr); 3496 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3497 pmap->pm_ucr3 != PMAP_NO_CR3) { 3498 critical_enter(); 3499 if (invpcid_works) { 3500 d.pcid = pmap->pm_pcids[0].pm_pcid | 3501 PMAP_PCID_USER_PT; 3502 d.pad = 0; 3503 d.addr = sva; 3504 for (; d.addr < eva; d.addr += PAGE_SIZE) 3505 invpcid(&d, INVPCID_ADDR); 3506 } else { 3507 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 3508 pm_pcid | CR3_PCID_SAVE; 3509 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
3510 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3511 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3512 } 3513 critical_exit(); 3514 } 3515 } else if (pmap_pcid_enabled) { 3516 pmap->pm_pcids[0].pm_gen = 0; 3517 } 3518 } 3519 3520 void 3521 pmap_invalidate_all(pmap_t pmap) 3522 { 3523 struct invpcid_descr d; 3524 uint64_t kcr3, ucr3; 3525 3526 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3527 pmap->pm_eptgen++; 3528 return; 3529 } 3530 KASSERT(pmap->pm_type == PT_X86, 3531 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3532 3533 if (pmap == kernel_pmap) { 3534 if (pmap_pcid_enabled && invpcid_works) { 3535 bzero(&d, sizeof(d)); 3536 invpcid(&d, INVPCID_CTXGLOB); 3537 } else { 3538 invltlb_glob(); 3539 } 3540 } else if (pmap == PCPU_GET(curpmap)) { 3541 if (pmap_pcid_enabled) { 3542 critical_enter(); 3543 if (invpcid_works) { 3544 d.pcid = pmap->pm_pcids[0].pm_pcid; 3545 d.pad = 0; 3546 d.addr = 0; 3547 invpcid(&d, INVPCID_CTX); 3548 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3549 d.pcid |= PMAP_PCID_USER_PT; 3550 invpcid(&d, INVPCID_CTX); 3551 } 3552 } else { 3553 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 3554 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3555 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 3556 0].pm_pcid | PMAP_PCID_USER_PT; 3557 pmap_pti_pcid_invalidate(ucr3, kcr3); 3558 } else 3559 load_cr3(kcr3); 3560 } 3561 critical_exit(); 3562 } else { 3563 invltlb(); 3564 } 3565 } else if (pmap_pcid_enabled) { 3566 pmap->pm_pcids[0].pm_gen = 0; 3567 } 3568 } 3569 3570 PMAP_INLINE void 3571 pmap_invalidate_cache(void) 3572 { 3573 3574 wbinvd(); 3575 } 3576 3577 static void 3578 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3579 { 3580 3581 pmap_update_pde_store(pmap, pde, newpde); 3582 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3583 pmap_update_pde_invalidate(pmap, va, newpde); 3584 else 3585 pmap->pm_pcids[0].pm_gen = 0; 3586 } 3587 #endif /* !SMP */ 3588 3589 static void 3590 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3591 { 3592 3593 /* 3594 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3595 * by a promotion that did not invalidate the 512 4KB page mappings 3596 * that might exist in the TLB. Consequently, at this point, the TLB 3597 * may hold both 4KB and 2MB page mappings for the address range [va, 3598 * va + NBPDR). Therefore, the entire range must be invalidated here. 3599 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3600 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3601 * single INVLPG suffices to invalidate the 2MB page mapping from the 3602 * TLB. 
3603 */ 3604 if ((pde & PG_PROMOTED) != 0) 3605 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3606 else 3607 pmap_invalidate_page(pmap, va); 3608 } 3609 3610 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3611 (vm_offset_t sva, vm_offset_t eva)) 3612 { 3613 3614 if ((cpu_feature & CPUID_SS) != 0) 3615 return (pmap_invalidate_cache_range_selfsnoop); 3616 if ((cpu_feature & CPUID_CLFSH) != 0) 3617 return (pmap_force_invalidate_cache_range); 3618 return (pmap_invalidate_cache_range_all); 3619 } 3620 3621 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3622 3623 static void 3624 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3625 { 3626 3627 KASSERT((sva & PAGE_MASK) == 0, 3628 ("pmap_invalidate_cache_range: sva not page-aligned")); 3629 KASSERT((eva & PAGE_MASK) == 0, 3630 ("pmap_invalidate_cache_range: eva not page-aligned")); 3631 } 3632 3633 static void 3634 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3635 { 3636 3637 pmap_invalidate_cache_range_check_align(sva, eva); 3638 } 3639 3640 void 3641 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3642 { 3643 3644 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3645 3646 /* 3647 * XXX: Some CPUs fault, hang, or trash the local APIC 3648 * registers if we use CLFLUSH on the local APIC range. The 3649 * local APIC is always uncached, so we don't need to flush 3650 * for that range anyway. 3651 */ 3652 if (pmap_kextract(sva) == lapic_paddr) 3653 return; 3654 3655 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3656 /* 3657 * Do per-cache line flush. Use a locked 3658 * instruction to insure that previous stores are 3659 * included in the write-back. The processor 3660 * propagates flush to other processors in the cache 3661 * coherence domain. 3662 */ 3663 atomic_thread_fence_seq_cst(); 3664 for (; sva < eva; sva += cpu_clflush_line_size) 3665 clflushopt(sva); 3666 atomic_thread_fence_seq_cst(); 3667 } else { 3668 /* 3669 * Writes are ordered by CLFLUSH on Intel CPUs. 3670 */ 3671 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3672 mfence(); 3673 for (; sva < eva; sva += cpu_clflush_line_size) 3674 clflush(sva); 3675 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3676 mfence(); 3677 } 3678 } 3679 3680 static void 3681 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3682 { 3683 3684 pmap_invalidate_cache_range_check_align(sva, eva); 3685 pmap_invalidate_cache(); 3686 } 3687 3688 /* 3689 * Remove the specified set of pages from the data and instruction caches. 3690 * 3691 * In contrast to pmap_invalidate_cache_range(), this function does not 3692 * rely on the CPU's self-snoop feature, because it is intended for use 3693 * when moving pages into a different cache domain. 
3694 */ 3695 void 3696 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3697 { 3698 vm_offset_t daddr, eva; 3699 int i; 3700 bool useclflushopt; 3701 3702 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3703 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3704 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3705 pmap_invalidate_cache(); 3706 else { 3707 if (useclflushopt) 3708 atomic_thread_fence_seq_cst(); 3709 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3710 mfence(); 3711 for (i = 0; i < count; i++) { 3712 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3713 eva = daddr + PAGE_SIZE; 3714 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3715 if (useclflushopt) 3716 clflushopt(daddr); 3717 else 3718 clflush(daddr); 3719 } 3720 } 3721 if (useclflushopt) 3722 atomic_thread_fence_seq_cst(); 3723 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3724 mfence(); 3725 } 3726 } 3727 3728 void 3729 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3730 { 3731 3732 pmap_invalidate_cache_range_check_align(sva, eva); 3733 3734 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3735 pmap_force_invalidate_cache_range(sva, eva); 3736 return; 3737 } 3738 3739 /* See comment in pmap_force_invalidate_cache_range(). */ 3740 if (pmap_kextract(sva) == lapic_paddr) 3741 return; 3742 3743 atomic_thread_fence_seq_cst(); 3744 for (; sva < eva; sva += cpu_clflush_line_size) 3745 clwb(sva); 3746 atomic_thread_fence_seq_cst(); 3747 } 3748 3749 void 3750 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3751 { 3752 pt_entry_t *pte; 3753 vm_offset_t vaddr; 3754 int error __diagused; 3755 int pte_bits; 3756 3757 KASSERT((spa & PAGE_MASK) == 0, 3758 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3759 KASSERT((epa & PAGE_MASK) == 0, 3760 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3761 3762 if (spa < dmaplimit) { 3763 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3764 dmaplimit, epa))); 3765 if (dmaplimit >= epa) 3766 return; 3767 spa = dmaplimit; 3768 } 3769 3770 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 3771 X86_PG_V; 3772 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3773 &vaddr); 3774 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3775 pte = vtopte(vaddr); 3776 for (; spa < epa; spa += PAGE_SIZE) { 3777 sched_pin(); 3778 pte_store(pte, spa | pte_bits); 3779 pmap_invlpg(kernel_pmap, vaddr); 3780 /* XXXKIB atomic inside flush_cache_range are excessive */ 3781 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3782 sched_unpin(); 3783 } 3784 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3785 } 3786 3787 /* 3788 * Routine: pmap_extract 3789 * Function: 3790 * Extract the physical page address associated 3791 * with the given map/virtual_address pair. 
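 * For example, a caller holding a valid kernel virtual address can
 * recover the backing physical address with something like:
 *
 *	pa = pmap_extract(kernel_pmap, va);
 *	if (pa == 0)
 *		... no valid mapping exists for va ...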
3792 */ 3793 vm_paddr_t 3794 pmap_extract(pmap_t pmap, vm_offset_t va) 3795 { 3796 pdp_entry_t *pdpe; 3797 pd_entry_t *pde; 3798 pt_entry_t *pte, PG_V; 3799 vm_paddr_t pa; 3800 3801 pa = 0; 3802 PG_V = pmap_valid_bit(pmap); 3803 PMAP_LOCK(pmap); 3804 pdpe = pmap_pdpe(pmap, va); 3805 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3806 if ((*pdpe & PG_PS) != 0) 3807 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3808 else { 3809 pde = pmap_pdpe_to_pde(pdpe, va); 3810 if ((*pde & PG_V) != 0) { 3811 if ((*pde & PG_PS) != 0) { 3812 pa = (*pde & PG_PS_FRAME) | 3813 (va & PDRMASK); 3814 } else { 3815 pte = pmap_pde_to_pte(pde, va); 3816 pa = (*pte & PG_FRAME) | 3817 (va & PAGE_MASK); 3818 } 3819 } 3820 } 3821 } 3822 PMAP_UNLOCK(pmap); 3823 return (pa); 3824 } 3825 3826 /* 3827 * Routine: pmap_extract_and_hold 3828 * Function: 3829 * Atomically extract and hold the physical page 3830 * with the given pmap and virtual address pair 3831 * if that mapping permits the given protection. 3832 */ 3833 vm_page_t 3834 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3835 { 3836 pdp_entry_t pdpe, *pdpep; 3837 pd_entry_t pde, *pdep; 3838 pt_entry_t pte, PG_RW, PG_V; 3839 vm_page_t m; 3840 3841 m = NULL; 3842 PG_RW = pmap_rw_bit(pmap); 3843 PG_V = pmap_valid_bit(pmap); 3844 PMAP_LOCK(pmap); 3845 3846 pdpep = pmap_pdpe(pmap, va); 3847 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3848 goto out; 3849 if ((pdpe & PG_PS) != 0) { 3850 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3851 goto out; 3852 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3853 goto check_page; 3854 } 3855 3856 pdep = pmap_pdpe_to_pde(pdpep, va); 3857 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3858 goto out; 3859 if ((pde & PG_PS) != 0) { 3860 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3861 goto out; 3862 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3863 goto check_page; 3864 } 3865 3866 pte = *pmap_pde_to_pte(pdep, va); 3867 if ((pte & PG_V) == 0 || 3868 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3869 goto out; 3870 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3871 3872 check_page: 3873 if (m != NULL && !vm_page_wire_mapped(m)) 3874 m = NULL; 3875 out: 3876 PMAP_UNLOCK(pmap); 3877 return (m); 3878 } 3879 3880 vm_paddr_t 3881 pmap_kextract(vm_offset_t va) 3882 { 3883 pd_entry_t pde; 3884 vm_paddr_t pa; 3885 3886 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3887 pa = DMAP_TO_PHYS(va); 3888 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3889 pa = pmap_large_map_kextract(va); 3890 } else { 3891 pde = *vtopde(va); 3892 if (pde & PG_PS) { 3893 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3894 } else { 3895 /* 3896 * Beware of a concurrent promotion that changes the 3897 * PDE at this point! For example, vtopte() must not 3898 * be used to access the PTE because it would use the 3899 * new PDE. It is, however, safe to use the old PDE 3900 * because the page table page is preserved by the 3901 * promotion. 3902 */ 3903 pa = *pmap_pde_to_pte(&pde, va); 3904 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3905 } 3906 } 3907 return (pa); 3908 } 3909 3910 /*************************************************** 3911 * Low level mapping routines..... 3912 ***************************************************/ 3913 3914 /* 3915 * Add a wired page to the kva. 3916 * Note: not SMP coherent. 
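 * For example, a transient mapping might be created and later torn
 * down as follows, with the caller performing whatever TLB
 * invalidation is required:
 *
 *	pmap_kenter(va, pa);
 *	... access the page through va ...
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);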
3917 */ 3918 PMAP_INLINE void 3919 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3920 { 3921 pt_entry_t *pte; 3922 3923 pte = vtopte(va); 3924 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3925 X86_PG_RW | X86_PG_V); 3926 } 3927 3928 static __inline void 3929 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3930 { 3931 pt_entry_t *pte; 3932 int cache_bits; 3933 3934 pte = vtopte(va); 3935 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 3936 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3937 X86_PG_RW | X86_PG_V | cache_bits); 3938 } 3939 3940 /* 3941 * Remove a page from the kernel pagetables. 3942 * Note: not SMP coherent. 3943 */ 3944 PMAP_INLINE void 3945 pmap_kremove(vm_offset_t va) 3946 { 3947 pt_entry_t *pte; 3948 3949 pte = vtopte(va); 3950 pte_clear(pte); 3951 } 3952 3953 /* 3954 * Used to map a range of physical addresses into kernel 3955 * virtual address space. 3956 * 3957 * The value passed in '*virt' is a suggested virtual address for 3958 * the mapping. Architectures which can support a direct-mapped 3959 * physical to virtual region can return the appropriate address 3960 * within that region, leaving '*virt' unchanged. Other 3961 * architectures should map the pages starting at '*virt' and 3962 * update '*virt' with the first usable address after the mapped 3963 * region. 3964 */ 3965 vm_offset_t 3966 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 3967 { 3968 return PHYS_TO_DMAP(start); 3969 } 3970 3971 /* 3972 * Add a list of wired pages to the kva 3973 * this routine is only used for temporary 3974 * kernel mappings that do not need to have 3975 * page modification or references recorded. 3976 * Note that old mappings are simply written 3977 * over. The page *must* be wired. 3978 * Note: SMP coherent. Uses a ranged shootdown IPI. 3979 */ 3980 void 3981 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 3982 { 3983 pt_entry_t *endpte, oldpte, pa, *pte; 3984 vm_page_t m; 3985 int cache_bits; 3986 3987 oldpte = 0; 3988 pte = vtopte(sva); 3989 endpte = pte + count; 3990 while (pte < endpte) { 3991 m = *ma++; 3992 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 3993 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 3994 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 3995 oldpte |= *pte; 3996 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | 3997 X86_PG_M | X86_PG_RW | X86_PG_V); 3998 } 3999 pte++; 4000 } 4001 if (__predict_false((oldpte & X86_PG_V) != 0)) 4002 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4003 PAGE_SIZE); 4004 } 4005 4006 /* 4007 * This routine tears out page mappings from the 4008 * kernel -- it is meant only for temporary mappings. 4009 * Note: SMP coherent. Uses a ranged shootdown IPI. 4010 */ 4011 void 4012 pmap_qremove(vm_offset_t sva, int count) 4013 { 4014 vm_offset_t va; 4015 4016 va = sva; 4017 while (count-- > 0) { 4018 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 4019 pmap_kremove(va); 4020 va += PAGE_SIZE; 4021 } 4022 pmap_invalidate_range(kernel_pmap, sva, va); 4023 } 4024 4025 /*************************************************** 4026 * Page table page management routines..... 4027 ***************************************************/ 4028 /* 4029 * Schedule the specified unused page table page to be freed. Specifically, 4030 * add the page to the specified list of pages that will be released to the 4031 * physical memory manager after the TLB has been updated. 
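 * A typical caller gathers the pages, performs the invalidation, and
 * only then frees them, roughly as:
 *
 *	SLIST_INIT(&free);
 *	... pmap_unwire_ptp(pmap, va, mpte, &free) ...
 *	pmap_invalidate_page(pmap, va);
 *	vm_page_free_pages_toq(&free, true);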
4032 */ 4033 static __inline void 4034 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4035 boolean_t set_PG_ZERO) 4036 { 4037 4038 if (set_PG_ZERO) 4039 m->flags |= PG_ZERO; 4040 else 4041 m->flags &= ~PG_ZERO; 4042 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4043 } 4044 4045 /* 4046 * Inserts the specified page table page into the specified pmap's collection 4047 * of idle page table pages. Each of a pmap's page table pages is responsible 4048 * for mapping a distinct range of virtual addresses. The pmap's collection is 4049 * ordered by this virtual address range. 4050 * 4051 * If "promoted" is false, then the page table page "mpte" must be zero filled. 4052 */ 4053 static __inline int 4054 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 4055 { 4056 4057 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4058 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 4059 return (vm_radix_insert(&pmap->pm_root, mpte)); 4060 } 4061 4062 /* 4063 * Removes the page table page mapping the specified virtual address from the 4064 * specified pmap's collection of idle page table pages, and returns it. 4065 * Otherwise, returns NULL if there is no page table page corresponding to the 4066 * specified virtual address. 4067 */ 4068 static __inline vm_page_t 4069 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4070 { 4071 4072 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4073 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4074 } 4075 4076 /* 4077 * Decrements a page table page's reference count, which is used to record the 4078 * number of valid page table entries within the page. If the reference count 4079 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4080 * page table page was unmapped and FALSE otherwise. 4081 */ 4082 static inline boolean_t 4083 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4084 { 4085 4086 --m->ref_count; 4087 if (m->ref_count == 0) { 4088 _pmap_unwire_ptp(pmap, va, m, free); 4089 return (TRUE); 4090 } else 4091 return (FALSE); 4092 } 4093 4094 static void 4095 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4096 { 4097 pml5_entry_t *pml5; 4098 pml4_entry_t *pml4; 4099 pdp_entry_t *pdp; 4100 pd_entry_t *pd; 4101 vm_page_t pdpg, pdppg, pml4pg; 4102 4103 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4104 4105 /* 4106 * unmap the page table page 4107 */ 4108 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4109 /* PML4 page */ 4110 MPASS(pmap_is_la57(pmap)); 4111 pml5 = pmap_pml5e(pmap, va); 4112 *pml5 = 0; 4113 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4114 pml5 = pmap_pml5e_u(pmap, va); 4115 *pml5 = 0; 4116 } 4117 } else if (m->pindex >= NUPDE + NUPDPE) { 4118 /* PDP page */ 4119 pml4 = pmap_pml4e(pmap, va); 4120 *pml4 = 0; 4121 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4122 va <= VM_MAXUSER_ADDRESS) { 4123 pml4 = pmap_pml4e_u(pmap, va); 4124 *pml4 = 0; 4125 } 4126 } else if (m->pindex >= NUPDE) { 4127 /* PD page */ 4128 pdp = pmap_pdpe(pmap, va); 4129 *pdp = 0; 4130 } else { 4131 /* PTE page */ 4132 pd = pmap_pde(pmap, va); 4133 *pd = 0; 4134 } 4135 if (m->pindex < NUPDE) { 4136 /* We just released a PT, unhold the matching PD */ 4137 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4138 pmap_unwire_ptp(pmap, va, pdpg, free); 4139 } else if (m->pindex < NUPDE + NUPDPE) { 4140 /* We just released a PD, unhold the matching PDP */ 4141 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4142 pmap_unwire_ptp(pmap, va, pdppg, free); 4143 } else if 
(m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4144 /* We just released a PDP, unhold the matching PML4 */ 4145 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4146 pmap_unwire_ptp(pmap, va, pml4pg, free); 4147 } 4148 4149 pmap_pt_page_count_adj(pmap, -1); 4150 4151 /* 4152 * Put page on a list so that it is released after 4153 * *ALL* TLB shootdown is done 4154 */ 4155 pmap_add_delayed_free_list(m, free, TRUE); 4156 } 4157 4158 /* 4159 * After removing a page table entry, this routine is used to 4160 * conditionally free the page, and manage the reference count. 4161 */ 4162 static int 4163 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4164 struct spglist *free) 4165 { 4166 vm_page_t mpte; 4167 4168 if (va >= VM_MAXUSER_ADDRESS) 4169 return (0); 4170 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4171 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4172 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4173 } 4174 4175 /* 4176 * Release a page table page reference after a failed attempt to create a 4177 * mapping. 4178 */ 4179 static void 4180 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4181 { 4182 struct spglist free; 4183 4184 SLIST_INIT(&free); 4185 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4186 /* 4187 * Although "va" was never mapped, paging-structure caches 4188 * could nonetheless have entries that refer to the freed 4189 * page table pages. Invalidate those entries. 4190 */ 4191 pmap_invalidate_page(pmap, va); 4192 vm_page_free_pages_toq(&free, true); 4193 } 4194 } 4195 4196 void 4197 pmap_pinit0(pmap_t pmap) 4198 { 4199 struct proc *p; 4200 struct thread *td; 4201 int i; 4202 4203 PMAP_LOCK_INIT(pmap); 4204 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4205 pmap->pm_pmltopu = NULL; 4206 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4207 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4208 pmap->pm_ucr3 = PMAP_NO_CR3; 4209 vm_radix_init(&pmap->pm_root); 4210 CPU_ZERO(&pmap->pm_active); 4211 TAILQ_INIT(&pmap->pm_pvchunk); 4212 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4213 pmap->pm_flags = pmap_flags; 4214 CPU_FOREACH(i) { 4215 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 4216 pmap->pm_pcids[i].pm_gen = 1; 4217 } 4218 pmap_activate_boot(pmap); 4219 td = curthread; 4220 if (pti) { 4221 p = td->td_proc; 4222 PROC_LOCK(p); 4223 p->p_md.md_flags |= P_MD_KPTI; 4224 PROC_UNLOCK(p); 4225 } 4226 pmap_thread_init_invl_gen(td); 4227 4228 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4229 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4230 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4231 UMA_ALIGN_PTR, 0); 4232 } 4233 } 4234 4235 void 4236 pmap_pinit_pml4(vm_page_t pml4pg) 4237 { 4238 pml4_entry_t *pm_pml4; 4239 int i; 4240 4241 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4242 4243 /* Wire in kernel global address entries. 
*/ 4244 for (i = 0; i < NKPML4E; i++) { 4245 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4246 X86_PG_V; 4247 } 4248 #ifdef KASAN 4249 for (i = 0; i < NKASANPML4E; i++) { 4250 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4251 X86_PG_V | pg_nx; 4252 } 4253 #endif 4254 #ifdef KMSAN 4255 for (i = 0; i < NKMSANSHADPML4E; i++) { 4256 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4257 X86_PG_RW | X86_PG_V | pg_nx; 4258 } 4259 for (i = 0; i < NKMSANORIGPML4E; i++) { 4260 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4261 X86_PG_RW | X86_PG_V | pg_nx; 4262 } 4263 #endif 4264 for (i = 0; i < ndmpdpphys; i++) { 4265 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4266 X86_PG_V; 4267 } 4268 4269 /* install self-referential address mapping entry(s) */ 4270 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4271 X86_PG_A | X86_PG_M; 4272 4273 /* install large map entries if configured */ 4274 for (i = 0; i < lm_ents; i++) 4275 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4276 } 4277 4278 void 4279 pmap_pinit_pml5(vm_page_t pml5pg) 4280 { 4281 pml5_entry_t *pm_pml5; 4282 4283 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4284 4285 /* 4286 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4287 * entering all existing kernel mappings into level 5 table. 4288 */ 4289 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4290 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4291 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4292 4293 /* 4294 * Install self-referential address mapping entry. 4295 */ 4296 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4297 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4298 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4299 } 4300 4301 static void 4302 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4303 { 4304 pml4_entry_t *pm_pml4u; 4305 int i; 4306 4307 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4308 for (i = 0; i < NPML4EPG; i++) 4309 pm_pml4u[i] = pti_pml4[i]; 4310 } 4311 4312 static void 4313 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4314 { 4315 pml5_entry_t *pm_pml5u; 4316 4317 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4318 pagezero(pm_pml5u); 4319 4320 /* 4321 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4322 * table, entering all kernel mappings needed for usermode 4323 * into level 5 table. 4324 */ 4325 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4326 pmap_kextract((vm_offset_t)pti_pml4) | 4327 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4328 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4329 } 4330 4331 /* Allocate a page table page and do related bookkeeping */ 4332 static vm_page_t 4333 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4334 { 4335 vm_page_t m; 4336 4337 m = vm_page_alloc_noobj(flags); 4338 if (__predict_false(m == NULL)) 4339 return (NULL); 4340 m->pindex = pindex; 4341 pmap_pt_page_count_adj(pmap, 1); 4342 return (m); 4343 } 4344 4345 static void 4346 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4347 { 4348 /* 4349 * This function assumes the page will need to be unwired, 4350 * even though the counterpart allocation in pmap_alloc_pt_page() 4351 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4352 * of pmap_free_pt_page() require unwiring. 
The case in which 4353 * a PT page doesn't require unwiring because its ref_count has 4354 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4355 */ 4356 vm_page_unwire_noq(m); 4357 if (zerofilled) 4358 vm_page_free_zero(m); 4359 else 4360 vm_page_free(m); 4361 4362 pmap_pt_page_count_adj(pmap, -1); 4363 } 4364 4365 /* 4366 * Initialize a preallocated and zeroed pmap structure, 4367 * such as one in a vmspace structure. 4368 */ 4369 int 4370 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4371 { 4372 vm_page_t pmltop_pg, pmltop_pgu; 4373 vm_paddr_t pmltop_phys; 4374 int i; 4375 4376 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4377 4378 /* 4379 * Allocate the page directory page. Pass NULL instead of a 4380 * pointer to the pmap here to avoid calling 4381 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4382 * since that requires the pmap lock. Instead do the accounting 4383 * manually. 4384 * 4385 * Note that the optimization in the final call to pmap_remove(), 4386 * which checks for a zero resident_count, is effectively disabled 4387 * by accounting for the top-level page. The optimization was 4388 * already ineffective once we started using a non-managed mapping 4389 * of the shared page. 4390 */ 4391 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4392 VM_ALLOC_WAITOK); 4393 pmap_pt_page_count_pinit(pmap, 1); 4394 4395 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4396 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4397 4398 CPU_FOREACH(i) { 4399 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 4400 pmap->pm_pcids[i].pm_gen = 0; 4401 } 4402 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4403 pmap->pm_ucr3 = PMAP_NO_CR3; 4404 pmap->pm_pmltopu = NULL; 4405 4406 pmap->pm_type = pm_type; 4407 4408 /* 4409 * Do not install the host kernel mappings in the nested page 4410 * tables. These mappings are meaningless in the guest physical 4411 * address space. 4412 * Install minimal kernel mappings in PTI case. 4413 */ 4414 switch (pm_type) { 4415 case PT_X86: 4416 pmap->pm_cr3 = pmltop_phys; 4417 if (pmap_is_la57(pmap)) 4418 pmap_pinit_pml5(pmltop_pg); 4419 else 4420 pmap_pinit_pml4(pmltop_pg); 4421 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4422 /* 4423 * As with pmltop_pg, pass NULL instead of a 4424 * pointer to the pmap to ensure that the PTI 4425 * page is counted explicitly.
4426 */ 4427 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4428 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4429 pmap_pt_page_count_pinit(pmap, 1); 4430 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4431 VM_PAGE_TO_PHYS(pmltop_pgu)); 4432 if (pmap_is_la57(pmap)) 4433 pmap_pinit_pml5_pti(pmltop_pgu); 4434 else 4435 pmap_pinit_pml4_pti(pmltop_pgu); 4436 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4437 } 4438 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4439 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4440 pkru_free_range, pmap, M_NOWAIT); 4441 } 4442 break; 4443 case PT_EPT: 4444 case PT_RVI: 4445 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4446 break; 4447 } 4448 4449 vm_radix_init(&pmap->pm_root); 4450 CPU_ZERO(&pmap->pm_active); 4451 TAILQ_INIT(&pmap->pm_pvchunk); 4452 pmap->pm_flags = flags; 4453 pmap->pm_eptgen = 0; 4454 4455 return (1); 4456 } 4457 4458 int 4459 pmap_pinit(pmap_t pmap) 4460 { 4461 4462 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4463 } 4464 4465 static void 4466 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4467 { 4468 vm_page_t mpg; 4469 struct spglist free; 4470 4471 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4472 if (mpg->ref_count != 0) 4473 return; 4474 SLIST_INIT(&free); 4475 _pmap_unwire_ptp(pmap, va, mpg, &free); 4476 pmap_invalidate_page(pmap, va); 4477 vm_page_free_pages_toq(&free, true); 4478 } 4479 4480 static pml4_entry_t * 4481 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4482 bool addref) 4483 { 4484 vm_pindex_t pml5index; 4485 pml5_entry_t *pml5; 4486 pml4_entry_t *pml4; 4487 vm_page_t pml4pg; 4488 pt_entry_t PG_V; 4489 bool allocated; 4490 4491 if (!pmap_is_la57(pmap)) 4492 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4493 4494 PG_V = pmap_valid_bit(pmap); 4495 pml5index = pmap_pml5e_index(va); 4496 pml5 = &pmap->pm_pmltop[pml5index]; 4497 if ((*pml5 & PG_V) == 0) { 4498 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4499 va) == NULL) 4500 return (NULL); 4501 allocated = true; 4502 } else { 4503 allocated = false; 4504 } 4505 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4506 pml4 = &pml4[pmap_pml4e_index(va)]; 4507 if ((*pml4 & PG_V) == 0) { 4508 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4509 if (allocated && !addref) 4510 pml4pg->ref_count--; 4511 else if (!allocated && addref) 4512 pml4pg->ref_count++; 4513 } 4514 return (pml4); 4515 } 4516 4517 static pdp_entry_t * 4518 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4519 bool addref) 4520 { 4521 vm_page_t pdppg; 4522 pml4_entry_t *pml4; 4523 pdp_entry_t *pdp; 4524 pt_entry_t PG_V; 4525 bool allocated; 4526 4527 PG_V = pmap_valid_bit(pmap); 4528 4529 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); 4530 if (pml4 == NULL) 4531 return (NULL); 4532 4533 if ((*pml4 & PG_V) == 0) { 4534 /* Have to allocate a new pdp, recurse */ 4535 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, 4536 va) == NULL) { 4537 if (pmap_is_la57(pmap)) 4538 pmap_allocpte_free_unref(pmap, va, 4539 pmap_pml5e(pmap, va)); 4540 return (NULL); 4541 } 4542 allocated = true; 4543 } else { 4544 allocated = false; 4545 } 4546 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 4547 pdp = &pdp[pmap_pdpe_index(va)]; 4548 if ((*pdp & PG_V) == 0) { 4549 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 4550 if (allocated && !addref) 4551 pdppg->ref_count--; 4552 else if (!allocated && addref) 4553 pdppg->ref_count++; 4554 } 4555 return (pdp); 4556 } 4557 4558 /* 4559 * The ptepindexes, i.e. 
page indices, of the page table pages encountered 4560 * while translating virtual address va are defined as follows: 4561 * - for the page table page (last level), 4562 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, 4563 * in other words, it is just the index of the PDE that maps the page 4564 * table page. 4565 * - for the page directory page, 4566 * ptepindex = NUPDE (number of userland PD entries) + 4567 * (pmap_pde_pindex(va) >> NPDEPGSHIFT) 4568 * i.e. index of PDPE is put after the last index of PDE, 4569 * - for the page directory pointer page, 4570 * ptepindex = NUPDE + NUPDPE + (pmap_pde_pindex(va) >> (NPDEPGSHIFT + 4571 * NPML4EPGSHIFT)), 4572 * i.e. index of pml4e is put after the last index of PDPE, 4573 * - for the PML4 page (if LA57 mode is enabled), 4574 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_pindex(va) >> 4575 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)), 4576 * i.e. index of pml5e is put after the last index of PML4E. 4577 * 4578 * Define an order on the paging entries, where all entries of the 4579 * same height are put together, then heights are put from deepest to 4580 * root. Then ptepindex is the sequential number of the 4581 * corresponding paging entry in this order. 4582 * 4583 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of 4584 * LA57 paging structures even in LA48 paging mode. Moreover, the 4585 * ptepindexes are calculated as if the paging structures were 5-level 4586 * regardless of the actual mode of operation. 4587 * 4588 * The root page at PML4/PML5 does not participate in this indexing scheme, 4589 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). 4590 */ 4591 static vm_page_t 4592 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4593 vm_offset_t va) 4594 { 4595 vm_pindex_t pml5index, pml4index; 4596 pml5_entry_t *pml5, *pml5u; 4597 pml4_entry_t *pml4, *pml4u; 4598 pdp_entry_t *pdp; 4599 pd_entry_t *pd; 4600 vm_page_t m, pdpg; 4601 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4602 4603 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4604 4605 PG_A = pmap_accessed_bit(pmap); 4606 PG_M = pmap_modified_bit(pmap); 4607 PG_V = pmap_valid_bit(pmap); 4608 PG_RW = pmap_rw_bit(pmap); 4609 4610 /* 4611 * Allocate a page table page. 4612 */ 4613 m = pmap_alloc_pt_page(pmap, ptepindex, 4614 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4615 if (m == NULL) 4616 return (NULL); 4617 4618 /* 4619 * Map the pagetable page into the process address space, if 4620 * it isn't already there.
4621 */ 4622 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4623 MPASS(pmap_is_la57(pmap)); 4624 4625 pml5index = pmap_pml5e_index(va); 4626 pml5 = &pmap->pm_pmltop[pml5index]; 4627 KASSERT((*pml5 & PG_V) == 0, 4628 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4629 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4630 4631 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4632 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4633 *pml5 |= pg_nx; 4634 4635 pml5u = &pmap->pm_pmltopu[pml5index]; 4636 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4637 PG_A | PG_M; 4638 } 4639 } else if (ptepindex >= NUPDE + NUPDPE) { 4640 pml4index = pmap_pml4e_index(va); 4641 /* Wire up a new PDPE page */ 4642 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4643 if (pml4 == NULL) { 4644 pmap_free_pt_page(pmap, m, true); 4645 return (NULL); 4646 } 4647 KASSERT((*pml4 & PG_V) == 0, 4648 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4649 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4650 4651 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4652 pml4index < NUPML4E) { 4653 /* 4654 * PTI: Make all user-space mappings in the 4655 * kernel-mode page table no-execute so that 4656 * we detect any programming errors that leave 4657 * the kernel-mode page table active on return 4658 * to user space. 4659 */ 4660 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4661 *pml4 |= pg_nx; 4662 4663 pml4u = &pmap->pm_pmltopu[pml4index]; 4664 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4665 PG_A | PG_M; 4666 } 4667 } else if (ptepindex >= NUPDE) { 4668 /* Wire up a new PDE page */ 4669 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4670 if (pdp == NULL) { 4671 pmap_free_pt_page(pmap, m, true); 4672 return (NULL); 4673 } 4674 KASSERT((*pdp & PG_V) == 0, 4675 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4676 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4677 } else { 4678 /* Wire up a new PTE page */ 4679 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4680 if (pdp == NULL) { 4681 pmap_free_pt_page(pmap, m, true); 4682 return (NULL); 4683 } 4684 if ((*pdp & PG_V) == 0) { 4685 /* Have to allocate a new pd, recurse */ 4686 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4687 lockp, va) == NULL) { 4688 pmap_allocpte_free_unref(pmap, va, 4689 pmap_pml4e(pmap, va)); 4690 pmap_free_pt_page(pmap, m, true); 4691 return (NULL); 4692 } 4693 } else { 4694 /* Add reference to the pd page */ 4695 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4696 pdpg->ref_count++; 4697 } 4698 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4699 4700 /* Now we know where the page directory page is */ 4701 pd = &pd[pmap_pde_index(va)]; 4702 KASSERT((*pd & PG_V) == 0, 4703 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4704 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4705 } 4706 4707 return (m); 4708 } 4709 4710 /* 4711 * This routine is called if the desired page table page does not exist. 4712 * 4713 * If page table page allocation fails, this routine may sleep before 4714 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4715 * occurs right before returning to the caller. This way, we never 4716 * drop pmap lock to sleep while a page table page has ref_count == 0, 4717 * which prevents the page from being freed under us. 
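 * Callers that passed a lock pointer therefore retry after the sleep,
 * along the lines of:
 *
 *	m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
 *	if (m == NULL && lockp != NULL)
 *		goto retry;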
4718 */ 4719 static vm_page_t 4720 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4721 vm_offset_t va) 4722 { 4723 vm_page_t m; 4724 4725 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4726 if (m == NULL && lockp != NULL) { 4727 RELEASE_PV_LIST_LOCK(lockp); 4728 PMAP_UNLOCK(pmap); 4729 PMAP_ASSERT_NOT_IN_DI(); 4730 vm_wait(NULL); 4731 PMAP_LOCK(pmap); 4732 } 4733 return (m); 4734 } 4735 4736 static pd_entry_t * 4737 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4738 struct rwlock **lockp) 4739 { 4740 pdp_entry_t *pdpe, PG_V; 4741 pd_entry_t *pde; 4742 vm_page_t pdpg; 4743 vm_pindex_t pdpindex; 4744 4745 PG_V = pmap_valid_bit(pmap); 4746 4747 retry: 4748 pdpe = pmap_pdpe(pmap, va); 4749 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4750 pde = pmap_pdpe_to_pde(pdpe, va); 4751 if (va < VM_MAXUSER_ADDRESS) { 4752 /* Add a reference to the pd page. */ 4753 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4754 pdpg->ref_count++; 4755 } else 4756 pdpg = NULL; 4757 } else if (va < VM_MAXUSER_ADDRESS) { 4758 /* Allocate a pd page. */ 4759 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4760 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4761 if (pdpg == NULL) { 4762 if (lockp != NULL) 4763 goto retry; 4764 else 4765 return (NULL); 4766 } 4767 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4768 pde = &pde[pmap_pde_index(va)]; 4769 } else 4770 panic("pmap_alloc_pde: missing page table page for va %#lx", 4771 va); 4772 *pdpgp = pdpg; 4773 return (pde); 4774 } 4775 4776 static vm_page_t 4777 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4778 { 4779 vm_pindex_t ptepindex; 4780 pd_entry_t *pd, PG_V; 4781 vm_page_t m; 4782 4783 PG_V = pmap_valid_bit(pmap); 4784 4785 /* 4786 * Calculate pagetable page index 4787 */ 4788 ptepindex = pmap_pde_pindex(va); 4789 retry: 4790 /* 4791 * Get the page directory entry 4792 */ 4793 pd = pmap_pde(pmap, va); 4794 4795 /* 4796 * This supports switching from a 2MB page to a 4797 * normal 4K page. 4798 */ 4799 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4800 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4801 /* 4802 * Invalidation of the 2MB page mapping may have caused 4803 * the deallocation of the underlying PD page. 4804 */ 4805 pd = NULL; 4806 } 4807 } 4808 4809 /* 4810 * If the page table page is mapped, we just increment the 4811 * hold count, and activate it. 4812 */ 4813 if (pd != NULL && (*pd & PG_V) != 0) { 4814 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4815 m->ref_count++; 4816 } else { 4817 /* 4818 * Here if the pte page isn't mapped, or if it has been 4819 * deallocated. 4820 */ 4821 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4822 if (m == NULL && lockp != NULL) 4823 goto retry; 4824 } 4825 return (m); 4826 } 4827 4828 /*************************************************** 4829 * Pmap allocation/deallocation routines. 4830 ***************************************************/ 4831 4832 /* 4833 * Release any resources held by the given physical map. 4834 * Called when a pmap initialized by pmap_pinit is being released. 4835 * Should only be called if the map contains no valid mappings. 
4836 */ 4837 void 4838 pmap_release(pmap_t pmap) 4839 { 4840 vm_page_t m; 4841 int i; 4842 4843 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4844 ("pmap_release: pmap %p has reserved page table page(s)", 4845 pmap)); 4846 KASSERT(CPU_EMPTY(&pmap->pm_active), 4847 ("releasing active pmap %p", pmap)); 4848 4849 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4850 4851 if (pmap_is_la57(pmap)) { 4852 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4853 pmap->pm_pmltop[PML5PML5I] = 0; 4854 } else { 4855 for (i = 0; i < NKPML4E; i++) /* KVA */ 4856 pmap->pm_pmltop[KPML4BASE + i] = 0; 4857 #ifdef KASAN 4858 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4859 pmap->pm_pmltop[KASANPML4I + i] = 0; 4860 #endif 4861 #ifdef KMSAN 4862 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4863 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4864 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ 4865 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4866 #endif 4867 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4868 pmap->pm_pmltop[DMPML4I + i] = 0; 4869 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4870 for (i = 0; i < lm_ents; i++) /* Large Map */ 4871 pmap->pm_pmltop[LMSPML4I + i] = 0; 4872 } 4873 4874 pmap_free_pt_page(NULL, m, true); 4875 pmap_pt_page_count_pinit(pmap, -1); 4876 4877 if (pmap->pm_pmltopu != NULL) { 4878 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4879 pm_pmltopu)); 4880 pmap_free_pt_page(NULL, m, false); 4881 pmap_pt_page_count_pinit(pmap, -1); 4882 } 4883 if (pmap->pm_type == PT_X86 && 4884 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4885 rangeset_fini(&pmap->pm_pkru); 4886 4887 KASSERT(pmap->pm_stats.resident_count == 0, 4888 ("pmap_release: pmap %p resident count %ld != 0", 4889 pmap, pmap->pm_stats.resident_count)); 4890 } 4891 4892 static int 4893 kvm_size(SYSCTL_HANDLER_ARGS) 4894 { 4895 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4896 4897 return sysctl_handle_long(oidp, &ksize, 0, req); 4898 } 4899 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4900 0, 0, kvm_size, "LU", 4901 "Size of KVM"); 4902 4903 static int 4904 kvm_free(SYSCTL_HANDLER_ARGS) 4905 { 4906 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 4907 4908 return sysctl_handle_long(oidp, &kfree, 0, req); 4909 } 4910 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4911 0, 0, kvm_free, "LU", 4912 "Amount of KVM free"); 4913 4914 #ifdef KMSAN 4915 static void 4916 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 4917 { 4918 pdp_entry_t *pdpe; 4919 pd_entry_t *pde; 4920 pt_entry_t *pte; 4921 vm_paddr_t dummypa, dummypd, dummypt; 4922 int i, npde, npdpg; 4923 4924 npdpg = howmany(size, NBPDP); 4925 npde = size / NBPDR; 4926 4927 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 4928 pagezero((void *)PHYS_TO_DMAP(dummypa)); 4929 4930 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 4931 pagezero((void *)PHYS_TO_DMAP(dummypt)); 4932 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 4933 for (i = 0; i < npdpg; i++) 4934 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 4935 4936 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 4937 for (i = 0; i < NPTEPG; i++) 4938 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 4939 X86_PG_A | X86_PG_M | pg_nx); 4940 4941 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 4942 for (i = 0; i < npde; i++) 4943 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 4944 4945 pdpe = (pdp_entry_t 
*)PHYS_TO_DMAP(pdppa); 4946 for (i = 0; i < npdpg; i++) 4947 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 4948 X86_PG_RW | pg_nx); 4949 } 4950 4951 static void 4952 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 4953 { 4954 vm_size_t size; 4955 4956 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 4957 4958 /* 4959 * The end of the page array's KVA region is 2MB aligned, see 4960 * kmem_init(). 4961 */ 4962 size = round_2mpage(end) - start; 4963 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 4964 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 4965 } 4966 #endif 4967 4968 /* 4969 * Allocate physical memory for the vm_page array and map it into KVA, 4970 * attempting to back the vm_pages with domain-local memory. 4971 */ 4972 void 4973 pmap_page_array_startup(long pages) 4974 { 4975 pdp_entry_t *pdpe; 4976 pd_entry_t *pde, newpdir; 4977 vm_offset_t va, start, end; 4978 vm_paddr_t pa; 4979 long pfn; 4980 int domain, i; 4981 4982 vm_page_array_size = pages; 4983 4984 start = VM_MIN_KERNEL_ADDRESS; 4985 end = start + pages * sizeof(struct vm_page); 4986 for (va = start; va < end; va += NBPDR) { 4987 pfn = first_page + (va - start) / sizeof(struct vm_page); 4988 domain = vm_phys_domain(ptoa(pfn)); 4989 pdpe = pmap_pdpe(kernel_pmap, va); 4990 if ((*pdpe & X86_PG_V) == 0) { 4991 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 4992 dump_add_page(pa); 4993 pagezero((void *)PHYS_TO_DMAP(pa)); 4994 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 4995 X86_PG_A | X86_PG_M); 4996 } 4997 pde = pmap_pdpe_to_pde(pdpe, va); 4998 if ((*pde & X86_PG_V) != 0) 4999 panic("Unexpected pde"); 5000 pa = vm_phys_early_alloc(domain, NBPDR); 5001 for (i = 0; i < NPDEPG; i++) 5002 dump_add_page(pa + i * PAGE_SIZE); 5003 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 5004 X86_PG_M | PG_PS | pg_g | pg_nx); 5005 pde_store(pde, newpdir); 5006 } 5007 vm_page_array = (vm_page_t)start; 5008 5009 #ifdef KMSAN 5010 pmap_kmsan_page_array_startup(start, end); 5011 #endif 5012 } 5013 5014 /* 5015 * grow the number of kernel page table entries, if needed 5016 */ 5017 void 5018 pmap_growkernel(vm_offset_t addr) 5019 { 5020 vm_paddr_t paddr; 5021 vm_page_t nkpg; 5022 pd_entry_t *pde, newpdir; 5023 pdp_entry_t *pdpe; 5024 vm_offset_t end; 5025 5026 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5027 5028 /* 5029 * The kernel map covers two distinct regions of KVA: that used 5030 * for dynamic kernel memory allocations, and the uppermost 2GB 5031 * of the virtual address space. The latter is used to map the 5032 * kernel and loadable kernel modules. This scheme enables the 5033 * use of a special code generation model for kernel code which 5034 * takes advantage of compact addressing modes in machine code. 5035 * 5036 * Both regions grow upwards; to avoid wasting memory, the gap 5037 * in between is unmapped. If "addr" is above "KERNBASE", the 5038 * kernel's region is grown, otherwise the kmem region is grown. 5039 * 5040 * The correctness of this action is based on the following 5041 * argument: vm_map_insert() allocates contiguous ranges of the 5042 * kernel virtual address space. It calls this function if a range 5043 * ends after "kernel_vm_end". If the kernel is mapped between 5044 * "kernel_vm_end" and "addr", then the range cannot begin at 5045 * "kernel_vm_end". In fact, its beginning address cannot be less 5046 * than the kernel. 
Thus, there is no immediate need to allocate 5047 * any new kernel page table pages between "kernel_vm_end" and 5048 * "KERNBASE". 5049 */ 5050 if (KERNBASE < addr) { 5051 end = KERNBASE + nkpt * NBPDR; 5052 if (end == 0) 5053 return; 5054 } else { 5055 end = kernel_vm_end; 5056 } 5057 5058 addr = roundup2(addr, NBPDR); 5059 if (addr - 1 >= vm_map_max(kernel_map)) 5060 addr = vm_map_max(kernel_map); 5061 if (addr <= end) { 5062 /* 5063 * The grown region is already mapped, so there is 5064 * nothing to do. 5065 */ 5066 return; 5067 } 5068 5069 kasan_shadow_map(end, addr - end); 5070 kmsan_shadow_map(end, addr - end); 5071 while (end < addr) { 5072 pdpe = pmap_pdpe(kernel_pmap, end); 5073 if ((*pdpe & X86_PG_V) == 0) { 5074 nkpg = pmap_alloc_pt_page(kernel_pmap, 5075 pmap_pdpe_pindex(end), VM_ALLOC_WIRED | 5076 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5077 if (nkpg == NULL) 5078 panic("pmap_growkernel: no memory to grow kernel"); 5079 paddr = VM_PAGE_TO_PHYS(nkpg); 5080 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5081 X86_PG_A | X86_PG_M); 5082 continue; /* try again */ 5083 } 5084 pde = pmap_pdpe_to_pde(pdpe, end); 5085 if ((*pde & X86_PG_V) != 0) { 5086 end = (end + NBPDR) & ~PDRMASK; 5087 if (end - 1 >= vm_map_max(kernel_map)) { 5088 end = vm_map_max(kernel_map); 5089 break; 5090 } 5091 continue; 5092 } 5093 5094 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), 5095 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5096 if (nkpg == NULL) 5097 panic("pmap_growkernel: no memory to grow kernel"); 5098 paddr = VM_PAGE_TO_PHYS(nkpg); 5099 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5100 pde_store(pde, newpdir); 5101 5102 end = (end + NBPDR) & ~PDRMASK; 5103 if (end - 1 >= vm_map_max(kernel_map)) { 5104 end = vm_map_max(kernel_map); 5105 break; 5106 } 5107 } 5108 5109 if (end <= KERNBASE) 5110 kernel_vm_end = end; 5111 else 5112 nkpt = howmany(end - KERNBASE, NBPDR); 5113 } 5114 5115 /*************************************************** 5116 * page management routines. 5117 ***************************************************/ 5118 5119 static const uint64_t pc_freemask[_NPCM] = { 5120 [0 ... 
_NPCM - 2] = PC_FREEN, 5121 [_NPCM - 1] = PC_FREEL 5122 }; 5123 5124 #ifdef PV_STATS 5125 5126 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5127 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5128 &pc_chunk_count, "Current number of pv entry chunks"); 5129 5130 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5131 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5132 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5133 5134 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5135 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5136 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5137 5138 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5139 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5140 &pc_chunk_tryfail, 5141 "Number of failed attempts to get a pv entry chunk page"); 5142 5143 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5144 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5145 &pv_entry_frees, "Total number of pv entries freed"); 5146 5147 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5148 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5149 &pv_entry_allocs, "Total number of pv entries allocated"); 5150 5151 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5152 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5153 &pv_entry_count, "Current number of pv entries"); 5154 5155 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5156 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5157 &pv_entry_spare, "Current number of spare pv entries"); 5158 #endif 5159 5160 static void 5161 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5162 { 5163 5164 if (pmap == NULL) 5165 return; 5166 pmap_invalidate_all(pmap); 5167 if (pmap != locked_pmap) 5168 PMAP_UNLOCK(pmap); 5169 if (start_di) 5170 pmap_delayed_invl_finish(); 5171 } 5172 5173 /* 5174 * We are in a serious low memory condition. Resort to 5175 * drastic measures to free some pages so we can allocate 5176 * another pv entry chunk. 5177 * 5178 * Returns NULL if PV entries were reclaimed from the specified pmap. 5179 * 5180 * We do not, however, unmap 2mpages because subsequent accesses will 5181 * allocate per-page pv entries until repromotion occurs, thereby 5182 * exacerbating the shortage of free pv entries.
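 * A typical caller falls back to reclamation only after a direct
 * chunk allocation fails, e.g.:
 *
 *	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 *	if (m == NULL)
 *		m = reclaim_pv_chunk(pmap, lockp);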
5183 */ 5184 static vm_page_t 5185 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 5186 { 5187 struct pv_chunks_list *pvc; 5188 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 5189 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 5190 struct md_page *pvh; 5191 pd_entry_t *pde; 5192 pmap_t next_pmap, pmap; 5193 pt_entry_t *pte, tpte; 5194 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 5195 pv_entry_t pv; 5196 vm_offset_t va; 5197 vm_page_t m, m_pc; 5198 struct spglist free; 5199 uint64_t inuse; 5200 int bit, field, freed; 5201 bool start_di, restart; 5202 5203 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 5204 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 5205 pmap = NULL; 5206 m_pc = NULL; 5207 PG_G = PG_A = PG_M = PG_RW = 0; 5208 SLIST_INIT(&free); 5209 bzero(&pc_marker_b, sizeof(pc_marker_b)); 5210 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 5211 pc_marker = (struct pv_chunk *)&pc_marker_b; 5212 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 5213 5214 /* 5215 * A delayed invalidation block should already be active if 5216 * pmap_advise() or pmap_remove() called this function by way 5217 * of pmap_demote_pde_locked(). 5218 */ 5219 start_di = pmap_not_in_di(); 5220 5221 pvc = &pv_chunks[domain]; 5222 mtx_lock(&pvc->pvc_lock); 5223 pvc->active_reclaims++; 5224 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 5225 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 5226 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 5227 SLIST_EMPTY(&free)) { 5228 next_pmap = pc->pc_pmap; 5229 if (next_pmap == NULL) { 5230 /* 5231 * The next chunk is a marker. However, it is 5232 * not our marker, so active_reclaims must be 5233 * > 1. Consequently, the next_chunk code 5234 * will not rotate the pv_chunks list. 5235 */ 5236 goto next_chunk; 5237 } 5238 mtx_unlock(&pvc->pvc_lock); 5239 5240 /* 5241 * A pv_chunk can only be removed from the pc_lru list 5242 * when both pc_chunks_mutex is owned and the 5243 * corresponding pmap is locked. 5244 */ 5245 if (pmap != next_pmap) { 5246 restart = false; 5247 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 5248 start_di); 5249 pmap = next_pmap; 5250 /* Avoid deadlock and lock recursion. */ 5251 if (pmap > locked_pmap) { 5252 RELEASE_PV_LIST_LOCK(lockp); 5253 PMAP_LOCK(pmap); 5254 if (start_di) 5255 pmap_delayed_invl_start(); 5256 mtx_lock(&pvc->pvc_lock); 5257 restart = true; 5258 } else if (pmap != locked_pmap) { 5259 if (PMAP_TRYLOCK(pmap)) { 5260 if (start_di) 5261 pmap_delayed_invl_start(); 5262 mtx_lock(&pvc->pvc_lock); 5263 restart = true; 5264 } else { 5265 pmap = NULL; /* pmap is not locked */ 5266 mtx_lock(&pvc->pvc_lock); 5267 pc = TAILQ_NEXT(pc_marker, pc_lru); 5268 if (pc == NULL || 5269 pc->pc_pmap != next_pmap) 5270 continue; 5271 goto next_chunk; 5272 } 5273 } else if (start_di) 5274 pmap_delayed_invl_start(); 5275 PG_G = pmap_global_bit(pmap); 5276 PG_A = pmap_accessed_bit(pmap); 5277 PG_M = pmap_modified_bit(pmap); 5278 PG_RW = pmap_rw_bit(pmap); 5279 if (restart) 5280 continue; 5281 } 5282 5283 /* 5284 * Destroy every non-wired, 4 KB page mapping in the chunk. 
5285 */ 5286 freed = 0; 5287 for (field = 0; field < _NPCM; field++) { 5288 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5289 inuse != 0; inuse &= ~(1UL << bit)) { 5290 bit = bsfq(inuse); 5291 pv = &pc->pc_pventry[field * 64 + bit]; 5292 va = pv->pv_va; 5293 pde = pmap_pde(pmap, va); 5294 if ((*pde & PG_PS) != 0) 5295 continue; 5296 pte = pmap_pde_to_pte(pde, va); 5297 if ((*pte & PG_W) != 0) 5298 continue; 5299 tpte = pte_load_clear(pte); 5300 if ((tpte & PG_G) != 0) 5301 pmap_invalidate_page(pmap, va); 5302 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5303 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5304 vm_page_dirty(m); 5305 if ((tpte & PG_A) != 0) 5306 vm_page_aflag_set(m, PGA_REFERENCED); 5307 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5308 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5309 m->md.pv_gen++; 5310 if (TAILQ_EMPTY(&m->md.pv_list) && 5311 (m->flags & PG_FICTITIOUS) == 0) { 5312 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5313 if (TAILQ_EMPTY(&pvh->pv_list)) { 5314 vm_page_aflag_clear(m, 5315 PGA_WRITEABLE); 5316 } 5317 } 5318 pmap_delayed_invl_page(m); 5319 pc->pc_map[field] |= 1UL << bit; 5320 pmap_unuse_pt(pmap, va, *pde, &free); 5321 freed++; 5322 } 5323 } 5324 if (freed == 0) { 5325 mtx_lock(&pvc->pvc_lock); 5326 goto next_chunk; 5327 } 5328 /* Every freed mapping is for a 4 KB page. */ 5329 pmap_resident_count_adj(pmap, -freed); 5330 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5331 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5332 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5333 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5334 if (pc_is_free(pc)) { 5335 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5336 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5337 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5338 /* Entire chunk is free; return it. */ 5339 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5340 dump_drop_page(m_pc->phys_addr); 5341 mtx_lock(&pvc->pvc_lock); 5342 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5343 break; 5344 } 5345 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5346 mtx_lock(&pvc->pvc_lock); 5347 /* One freed pv entry in locked_pmap is sufficient. */ 5348 if (pmap == locked_pmap) 5349 break; 5350 next_chunk: 5351 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5352 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5353 if (pvc->active_reclaims == 1 && pmap != NULL) { 5354 /* 5355 * Rotate the pv chunks list so that we do not 5356 * scan the same pv chunks that could not be 5357 * freed (because they contained a wired 5358 * and/or superpage mapping) on every 5359 * invocation of reclaim_pv_chunk(). 5360 */ 5361 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5362 MPASS(pc->pc_pmap != NULL); 5363 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5364 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5365 } 5366 } 5367 } 5368 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5369 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5370 pvc->active_reclaims--; 5371 mtx_unlock(&pvc->pvc_lock); 5372 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5373 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5374 m_pc = SLIST_FIRST(&free); 5375 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5376 /* Recycle a freed page table page. 
*/ 5377 m_pc->ref_count = 1; 5378 } 5379 vm_page_free_pages_toq(&free, true); 5380 return (m_pc); 5381 } 5382 5383 static vm_page_t 5384 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5385 { 5386 vm_page_t m; 5387 int i, domain; 5388 5389 domain = PCPU_GET(domain); 5390 for (i = 0; i < vm_ndomains; i++) { 5391 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5392 if (m != NULL) 5393 break; 5394 domain = (domain + 1) % vm_ndomains; 5395 } 5396 5397 return (m); 5398 } 5399 5400 /* 5401 * free the pv_entry back to the free list 5402 */ 5403 static void 5404 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5405 { 5406 struct pv_chunk *pc; 5407 int idx, field, bit; 5408 5409 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5410 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5411 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5412 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5413 pc = pv_to_chunk(pv); 5414 idx = pv - &pc->pc_pventry[0]; 5415 field = idx / 64; 5416 bit = idx % 64; 5417 pc->pc_map[field] |= 1ul << bit; 5418 if (!pc_is_free(pc)) { 5419 /* 98% of the time, pc is already at the head of the list. */ 5420 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5421 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5422 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5423 } 5424 return; 5425 } 5426 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5427 free_pv_chunk(pc); 5428 } 5429 5430 static void 5431 free_pv_chunk_dequeued(struct pv_chunk *pc) 5432 { 5433 vm_page_t m; 5434 5435 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5436 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5437 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5438 counter_u64_add(pv_page_count, -1); 5439 /* entire chunk is free, return it */ 5440 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5441 dump_drop_page(m->phys_addr); 5442 vm_page_unwire_noq(m); 5443 vm_page_free(m); 5444 } 5445 5446 static void 5447 free_pv_chunk(struct pv_chunk *pc) 5448 { 5449 struct pv_chunks_list *pvc; 5450 5451 pvc = &pv_chunks[pc_to_domain(pc)]; 5452 mtx_lock(&pvc->pvc_lock); 5453 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5454 mtx_unlock(&pvc->pvc_lock); 5455 free_pv_chunk_dequeued(pc); 5456 } 5457 5458 static void 5459 free_pv_chunk_batch(struct pv_chunklist *batch) 5460 { 5461 struct pv_chunks_list *pvc; 5462 struct pv_chunk *pc, *npc; 5463 int i; 5464 5465 for (i = 0; i < vm_ndomains; i++) { 5466 if (TAILQ_EMPTY(&batch[i])) 5467 continue; 5468 pvc = &pv_chunks[i]; 5469 mtx_lock(&pvc->pvc_lock); 5470 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5471 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5472 } 5473 mtx_unlock(&pvc->pvc_lock); 5474 } 5475 5476 for (i = 0; i < vm_ndomains; i++) { 5477 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5478 free_pv_chunk_dequeued(pc); 5479 } 5480 } 5481 } 5482 5483 /* 5484 * Returns a new PV entry, allocating a new PV chunk from the system when 5485 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5486 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5487 * returned. 5488 * 5489 * The given PV list lock may be released. 
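 * For example, a caller that cannot tolerate reclamation passes a
 * NULL lock pointer and handles failure itself:
 *
 *	pv = get_pv_entry(pmap, NULL);
 *	if (pv == NULL)
 *		return (false);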
5490 */ 5491 static pv_entry_t 5492 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5493 { 5494 struct pv_chunks_list *pvc; 5495 int bit, field; 5496 pv_entry_t pv; 5497 struct pv_chunk *pc; 5498 vm_page_t m; 5499 5500 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5501 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5502 retry: 5503 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5504 if (pc != NULL) { 5505 for (field = 0; field < _NPCM; field++) { 5506 if (pc->pc_map[field]) { 5507 bit = bsfq(pc->pc_map[field]); 5508 break; 5509 } 5510 } 5511 if (field < _NPCM) { 5512 pv = &pc->pc_pventry[field * 64 + bit]; 5513 pc->pc_map[field] &= ~(1ul << bit); 5514 /* If this was the last item, move it to tail */ 5515 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 5516 pc->pc_map[2] == 0) { 5517 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5518 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5519 pc_list); 5520 } 5521 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5522 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5523 return (pv); 5524 } 5525 } 5526 /* No free items, allocate another chunk */ 5527 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5528 if (m == NULL) { 5529 if (lockp == NULL) { 5530 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5531 return (NULL); 5532 } 5533 m = reclaim_pv_chunk(pmap, lockp); 5534 if (m == NULL) 5535 goto retry; 5536 } else 5537 counter_u64_add(pv_page_count, 1); 5538 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5539 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5540 dump_add_page(m->phys_addr); 5541 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5542 pc->pc_pmap = pmap; 5543 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 5544 pc->pc_map[1] = PC_FREEN; 5545 pc->pc_map[2] = PC_FREEL; 5546 pvc = &pv_chunks[vm_page_domain(m)]; 5547 mtx_lock(&pvc->pvc_lock); 5548 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5549 mtx_unlock(&pvc->pvc_lock); 5550 pv = &pc->pc_pventry[0]; 5551 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5552 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5553 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5554 return (pv); 5555 } 5556 5557 /* 5558 * Returns the number of one bits within the given PV chunk map. 5559 * 5560 * The erratas for Intel processors state that "POPCNT Instruction May 5561 * Take Longer to Execute Than Expected". It is believed that the 5562 * issue is the spurious dependency on the destination register. 5563 * Provide a hint to the register rename logic that the destination 5564 * value is overwritten, by clearing it, as suggested in the 5565 * optimization manual. It should be cheap for unaffected processors 5566 * as well. 5567 * 5568 * Reference numbers for erratas are 5569 * 4th Gen Core: HSD146 5570 * 5th Gen Core: BDM85 5571 * 6th Gen Core: SKL029 5572 */ 5573 static int 5574 popcnt_pc_map_pq(uint64_t *map) 5575 { 5576 u_long result, tmp; 5577 5578 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5579 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5580 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5581 : "=&r" (result), "=&r" (tmp) 5582 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5583 return (result); 5584 } 5585 5586 /* 5587 * Ensure that the number of spare PV entries in the specified pmap meets or 5588 * exceeds the given count, "needed". 5589 * 5590 * The given PV list lock may be released. 
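 *
 * The number of spare entries in a chunk is simply the number of set
 * bits in its pc_map[]; the loop below computes, in effect,
 *
 *	free = popcount(pc_map[0]) + popcount(pc_map[1]) + popcount(pc_map[2]);
 *
 * using either the POPCNT instruction or the bit_count() fallback.  For
 * example, pmap_demote_pde_locked() reserves NPTEPG - 1 (511) entries
 * here before demoting a managed 2MB mapping.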
5591 */ 5592 static void 5593 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5594 { 5595 struct pv_chunks_list *pvc; 5596 struct pch new_tail[PMAP_MEMDOM]; 5597 struct pv_chunk *pc; 5598 vm_page_t m; 5599 int avail, free, i; 5600 bool reclaimed; 5601 5602 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5603 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5604 5605 /* 5606 * Newly allocated PV chunks must be stored in a private list until 5607 * the required number of PV chunks have been allocated. Otherwise, 5608 * reclaim_pv_chunk() could recycle one of these chunks. In 5609 * contrast, these chunks must be added to the pmap upon allocation. 5610 */ 5611 for (i = 0; i < PMAP_MEMDOM; i++) 5612 TAILQ_INIT(&new_tail[i]); 5613 retry: 5614 avail = 0; 5615 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5616 #ifndef __POPCNT__ 5617 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5618 bit_count((bitstr_t *)pc->pc_map, 0, 5619 sizeof(pc->pc_map) * NBBY, &free); 5620 else 5621 #endif 5622 free = popcnt_pc_map_pq(pc->pc_map); 5623 if (free == 0) 5624 break; 5625 avail += free; 5626 if (avail >= needed) 5627 break; 5628 } 5629 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5630 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5631 if (m == NULL) { 5632 m = reclaim_pv_chunk(pmap, lockp); 5633 if (m == NULL) 5634 goto retry; 5635 reclaimed = true; 5636 } else 5637 counter_u64_add(pv_page_count, 1); 5638 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5639 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5640 dump_add_page(m->phys_addr); 5641 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5642 pc->pc_pmap = pmap; 5643 pc->pc_map[0] = PC_FREEN; 5644 pc->pc_map[1] = PC_FREEN; 5645 pc->pc_map[2] = PC_FREEL; 5646 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5647 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5648 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5649 5650 /* 5651 * The reclaim might have freed a chunk from the current pmap. 5652 * If that chunk contained available entries, we need to 5653 * re-count the number of available entries. 5654 */ 5655 if (reclaimed) 5656 goto retry; 5657 } 5658 for (i = 0; i < vm_ndomains; i++) { 5659 if (TAILQ_EMPTY(&new_tail[i])) 5660 continue; 5661 pvc = &pv_chunks[i]; 5662 mtx_lock(&pvc->pvc_lock); 5663 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5664 mtx_unlock(&pvc->pvc_lock); 5665 } 5666 } 5667 5668 /* 5669 * First find and then remove the pv entry for the specified pmap and virtual 5670 * address from the specified pv list. Returns the pv entry if found and NULL 5671 * otherwise. This operation can be performed on pv lists for either 4KB or 5672 * 2MB page mappings. 5673 */ 5674 static __inline pv_entry_t 5675 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5676 { 5677 pv_entry_t pv; 5678 5679 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5680 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5681 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5682 pvh->pv_gen++; 5683 break; 5684 } 5685 } 5686 return (pv); 5687 } 5688 5689 /* 5690 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5691 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5692 * entries for each of the 4KB page mappings. 
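 *
 * For example, demoting the 2MB mapping at va 0x200000 moves its pv
 * entry onto the pv list of the 4KB page at pa, then takes NPTEPG - 1
 * (511) pre-reserved entries from the pmap's pv chunks for va 0x201000
 * through 0x3ff000.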
5693 */ 5694 static void 5695 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5696 struct rwlock **lockp) 5697 { 5698 struct md_page *pvh; 5699 struct pv_chunk *pc; 5700 pv_entry_t pv; 5701 vm_offset_t va_last; 5702 vm_page_t m; 5703 int bit, field; 5704 5705 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5706 KASSERT((pa & PDRMASK) == 0, 5707 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5708 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5709 5710 /* 5711 * Transfer the 2mpage's pv entry for this mapping to the first 5712 * page's pv list. Once this transfer begins, the pv list lock 5713 * must not be released until the last pv entry is reinstantiated. 5714 */ 5715 pvh = pa_to_pvh(pa); 5716 va = trunc_2mpage(va); 5717 pv = pmap_pvh_remove(pvh, pmap, va); 5718 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5719 m = PHYS_TO_VM_PAGE(pa); 5720 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5721 m->md.pv_gen++; 5722 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5723 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5724 va_last = va + NBPDR - PAGE_SIZE; 5725 for (;;) { 5726 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5727 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 5728 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 5729 for (field = 0; field < _NPCM; field++) { 5730 while (pc->pc_map[field]) { 5731 bit = bsfq(pc->pc_map[field]); 5732 pc->pc_map[field] &= ~(1ul << bit); 5733 pv = &pc->pc_pventry[field * 64 + bit]; 5734 va += PAGE_SIZE; 5735 pv->pv_va = va; 5736 m++; 5737 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5738 ("pmap_pv_demote_pde: page %p is not managed", m)); 5739 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5740 m->md.pv_gen++; 5741 if (va == va_last) 5742 goto out; 5743 } 5744 } 5745 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5746 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5747 } 5748 out: 5749 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 5750 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5751 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5752 } 5753 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5754 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5755 } 5756 5757 #if VM_NRESERVLEVEL > 0 5758 /* 5759 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5760 * replace the many pv entries for the 4KB page mappings by a single pv entry 5761 * for the 2MB page mapping. 5762 */ 5763 static void 5764 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5765 struct rwlock **lockp) 5766 { 5767 struct md_page *pvh; 5768 pv_entry_t pv; 5769 vm_offset_t va_last; 5770 vm_page_t m; 5771 5772 KASSERT((pa & PDRMASK) == 0, 5773 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5774 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5775 5776 /* 5777 * Transfer the first page's pv entry for this mapping to the 2mpage's 5778 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5779 * a transfer avoids the possibility that get_pv_entry() calls 5780 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5781 * mappings that is being promoted. 5782 */ 5783 m = PHYS_TO_VM_PAGE(pa); 5784 va = trunc_2mpage(va); 5785 pv = pmap_pvh_remove(&m->md, pmap, va); 5786 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5787 pvh = pa_to_pvh(pa); 5788 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5789 pvh->pv_gen++; 5790 /* Free the remaining NPTEPG - 1 pv entries. 
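 * These cover the 4KB pages at va + PAGE_SIZE through
 * va + NBPDR - PAGE_SIZE; the entry for the first 4KB page was
 * transferred to the 2mpage's pv list above rather than freed.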
*/ 5791 va_last = va + NBPDR - PAGE_SIZE; 5792 do { 5793 m++; 5794 va += PAGE_SIZE; 5795 pmap_pvh_free(&m->md, pmap, va); 5796 } while (va < va_last); 5797 } 5798 #endif /* VM_NRESERVLEVEL > 0 */ 5799 5800 /* 5801 * First find and then destroy the pv entry for the specified pmap and virtual 5802 * address. This operation can be performed on pv lists for either 4KB or 2MB 5803 * page mappings. 5804 */ 5805 static void 5806 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5807 { 5808 pv_entry_t pv; 5809 5810 pv = pmap_pvh_remove(pvh, pmap, va); 5811 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5812 free_pv_entry(pmap, pv); 5813 } 5814 5815 /* 5816 * Conditionally create the PV entry for a 4KB page mapping if the required 5817 * memory can be allocated without resorting to reclamation. 5818 */ 5819 static boolean_t 5820 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5821 struct rwlock **lockp) 5822 { 5823 pv_entry_t pv; 5824 5825 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5826 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5827 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5828 pv->pv_va = va; 5829 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5830 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5831 m->md.pv_gen++; 5832 return (TRUE); 5833 } else 5834 return (FALSE); 5835 } 5836 5837 /* 5838 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5839 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5840 * false if the PV entry cannot be allocated without resorting to reclamation. 5841 */ 5842 static bool 5843 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5844 struct rwlock **lockp) 5845 { 5846 struct md_page *pvh; 5847 pv_entry_t pv; 5848 vm_paddr_t pa; 5849 5850 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5851 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5852 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5853 NULL : lockp)) == NULL) 5854 return (false); 5855 pv->pv_va = va; 5856 pa = pde & PG_PS_FRAME; 5857 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5858 pvh = pa_to_pvh(pa); 5859 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5860 pvh->pv_gen++; 5861 return (true); 5862 } 5863 5864 /* 5865 * Fills a page table page with mappings to consecutive physical pages. 5866 */ 5867 static void 5868 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5869 { 5870 pt_entry_t *pte; 5871 5872 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5873 *pte = newpte; 5874 newpte += PAGE_SIZE; 5875 } 5876 } 5877 5878 /* 5879 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5880 * mapping is invalidated. 
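 *
 * Returns TRUE if the mapping was demoted into 512 4KB mappings and
 * FALSE if it was removed instead (for example, because it was never
 * accessed or a page table page could not be allocated).  The bulk of
 * the work is done by pmap_demote_pde_locked() below.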
5881 */ 5882 static boolean_t 5883 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5884 { 5885 struct rwlock *lock; 5886 boolean_t rv; 5887 5888 lock = NULL; 5889 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5890 if (lock != NULL) 5891 rw_wunlock(lock); 5892 return (rv); 5893 } 5894 5895 static void 5896 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 5897 { 5898 #ifdef INVARIANTS 5899 #ifdef DIAGNOSTIC 5900 pt_entry_t *xpte, *ypte; 5901 5902 for (xpte = firstpte; xpte < firstpte + NPTEPG; 5903 xpte++, newpte += PAGE_SIZE) { 5904 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 5905 printf("pmap_demote_pde: xpte %zd and newpte map " 5906 "different pages: found %#lx, expected %#lx\n", 5907 xpte - firstpte, *xpte, newpte); 5908 printf("page table dump\n"); 5909 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 5910 printf("%zd %#lx\n", ypte - firstpte, *ypte); 5911 panic("firstpte"); 5912 } 5913 } 5914 #else 5915 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 5916 ("pmap_demote_pde: firstpte and newpte map different physical" 5917 " addresses")); 5918 #endif 5919 #endif 5920 } 5921 5922 static void 5923 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 5924 pd_entry_t oldpde, struct rwlock **lockp) 5925 { 5926 struct spglist free; 5927 vm_offset_t sva; 5928 5929 SLIST_INIT(&free); 5930 sva = trunc_2mpage(va); 5931 pmap_remove_pde(pmap, pde, sva, &free, lockp); 5932 if ((oldpde & pmap_global_bit(pmap)) == 0) 5933 pmap_invalidate_pde_page(pmap, sva, oldpde); 5934 vm_page_free_pages_toq(&free, true); 5935 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 5936 va, pmap); 5937 } 5938 5939 static boolean_t 5940 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 5941 struct rwlock **lockp) 5942 { 5943 pd_entry_t newpde, oldpde; 5944 pt_entry_t *firstpte, newpte; 5945 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 5946 vm_paddr_t mptepa; 5947 vm_page_t mpte; 5948 int PG_PTE_CACHE; 5949 bool in_kernel; 5950 5951 PG_A = pmap_accessed_bit(pmap); 5952 PG_G = pmap_global_bit(pmap); 5953 PG_M = pmap_modified_bit(pmap); 5954 PG_RW = pmap_rw_bit(pmap); 5955 PG_V = pmap_valid_bit(pmap); 5956 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 5957 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 5958 5959 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5960 in_kernel = va >= VM_MAXUSER_ADDRESS; 5961 oldpde = *pde; 5962 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 5963 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 5964 5965 /* 5966 * Invalidate the 2MB page mapping and return "failure" if the 5967 * mapping was never accessed. 5968 */ 5969 if ((oldpde & PG_A) == 0) { 5970 KASSERT((oldpde & PG_W) == 0, 5971 ("pmap_demote_pde: a wired mapping is missing PG_A")); 5972 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 5973 return (FALSE); 5974 } 5975 5976 mpte = pmap_remove_pt_page(pmap, va); 5977 if (mpte == NULL) { 5978 KASSERT((oldpde & PG_W) == 0, 5979 ("pmap_demote_pde: page table page for a wired mapping" 5980 " is missing")); 5981 5982 /* 5983 * If the page table page is missing and the mapping 5984 * is for a kernel address, the mapping must belong to 5985 * the direct map. Page table pages are preallocated 5986 * for every other part of the kernel address space, 5987 * so the direct map region is the only part of the 5988 * kernel address space that must be handled here. 
5989 */ 5990 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 5991 va < DMAP_MAX_ADDRESS), 5992 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 5993 5994 /* 5995 * If the 2MB page mapping belongs to the direct map 5996 * region of the kernel's address space, then the page 5997 * allocation request specifies the highest possible 5998 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 5999 * priority is normal. 6000 */ 6001 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 6002 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 6003 6004 /* 6005 * If the allocation of the new page table page fails, 6006 * invalidate the 2MB page mapping and return "failure". 6007 */ 6008 if (mpte == NULL) { 6009 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6010 return (FALSE); 6011 } 6012 6013 if (!in_kernel) 6014 mpte->ref_count = NPTEPG; 6015 } 6016 mptepa = VM_PAGE_TO_PHYS(mpte); 6017 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 6018 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 6019 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 6020 ("pmap_demote_pde: oldpde is missing PG_M")); 6021 newpte = oldpde & ~PG_PS; 6022 newpte = pmap_swap_pat(pmap, newpte); 6023 6024 /* 6025 * If the page table page is not leftover from an earlier promotion, 6026 * initialize it. 6027 */ 6028 if (vm_page_none_valid(mpte)) 6029 pmap_fill_ptp(firstpte, newpte); 6030 6031 pmap_demote_pde_check(firstpte, newpte); 6032 6033 /* 6034 * If the mapping has changed attributes, update the page table 6035 * entries. 6036 */ 6037 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6038 pmap_fill_ptp(firstpte, newpte); 6039 6040 /* 6041 * The spare PV entries must be reserved prior to demoting the 6042 * mapping, that is, prior to changing the PDE. Otherwise, the state 6043 * of the PDE and the PV lists will be inconsistent, which can result 6044 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6045 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6046 * PV entry for the 2MB page mapping that is being demoted. 6047 */ 6048 if ((oldpde & PG_MANAGED) != 0) 6049 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6050 6051 /* 6052 * Demote the mapping. This pmap is locked. The old PDE has 6053 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6054 * set. Thus, there is no danger of a race with another 6055 * processor changing the setting of PG_A and/or PG_M between 6056 * the read above and the store below. 6057 */ 6058 if (workaround_erratum383) 6059 pmap_update_pde(pmap, va, pde, newpde); 6060 else 6061 pde_store(pde, newpde); 6062 6063 /* 6064 * Invalidate a stale recursive mapping of the page table page. 6065 */ 6066 if (in_kernel) 6067 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6068 6069 /* 6070 * Demote the PV entry. 6071 */ 6072 if ((oldpde & PG_MANAGED) != 0) 6073 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6074 6075 counter_u64_add(pmap_pde_demotions, 1); 6076 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6077 va, pmap); 6078 return (TRUE); 6079 } 6080 6081 /* 6082 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
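 *
 * The kernel PDE is not simply cleared; it is repointed at a page table
 * page (zeroed below if it still holds stale entries from an earlier
 * promotion), because the kernel's page table pages are preallocated and
 * must remain present.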
6083 */ 6084 static void 6085 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6086 { 6087 pd_entry_t newpde; 6088 vm_paddr_t mptepa; 6089 vm_page_t mpte; 6090 6091 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6092 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6093 mpte = pmap_remove_pt_page(pmap, va); 6094 if (mpte == NULL) 6095 panic("pmap_remove_kernel_pde: Missing pt page."); 6096 6097 mptepa = VM_PAGE_TO_PHYS(mpte); 6098 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6099 6100 /* 6101 * If this page table page was unmapped by a promotion, then it 6102 * contains valid mappings. Zero it to invalidate those mappings. 6103 */ 6104 if (vm_page_any_valid(mpte)) 6105 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6106 6107 /* 6108 * Demote the mapping. 6109 */ 6110 if (workaround_erratum383) 6111 pmap_update_pde(pmap, va, pde, newpde); 6112 else 6113 pde_store(pde, newpde); 6114 6115 /* 6116 * Invalidate a stale recursive mapping of the page table page. 6117 */ 6118 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6119 } 6120 6121 /* 6122 * pmap_remove_pde: do the things to unmap a superpage in a process 6123 */ 6124 static int 6125 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6126 struct spglist *free, struct rwlock **lockp) 6127 { 6128 struct md_page *pvh; 6129 pd_entry_t oldpde; 6130 vm_offset_t eva, va; 6131 vm_page_t m, mpte; 6132 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6133 6134 PG_G = pmap_global_bit(pmap); 6135 PG_A = pmap_accessed_bit(pmap); 6136 PG_M = pmap_modified_bit(pmap); 6137 PG_RW = pmap_rw_bit(pmap); 6138 6139 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6140 KASSERT((sva & PDRMASK) == 0, 6141 ("pmap_remove_pde: sva is not 2mpage aligned")); 6142 oldpde = pte_load_clear(pdq); 6143 if (oldpde & PG_W) 6144 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6145 if ((oldpde & PG_G) != 0) 6146 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6147 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6148 if (oldpde & PG_MANAGED) { 6149 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6150 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6151 pmap_pvh_free(pvh, pmap, sva); 6152 eva = sva + NBPDR; 6153 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6154 va < eva; va += PAGE_SIZE, m++) { 6155 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6156 vm_page_dirty(m); 6157 if (oldpde & PG_A) 6158 vm_page_aflag_set(m, PGA_REFERENCED); 6159 if (TAILQ_EMPTY(&m->md.pv_list) && 6160 TAILQ_EMPTY(&pvh->pv_list)) 6161 vm_page_aflag_clear(m, PGA_WRITEABLE); 6162 pmap_delayed_invl_page(m); 6163 } 6164 } 6165 if (pmap == kernel_pmap) { 6166 pmap_remove_kernel_pde(pmap, pdq, sva); 6167 } else { 6168 mpte = pmap_remove_pt_page(pmap, sva); 6169 if (mpte != NULL) { 6170 KASSERT(vm_page_all_valid(mpte), 6171 ("pmap_remove_pde: pte page not promoted")); 6172 pmap_pt_page_count_adj(pmap, -1); 6173 KASSERT(mpte->ref_count == NPTEPG, 6174 ("pmap_remove_pde: pte page ref count error")); 6175 mpte->ref_count = 0; 6176 pmap_add_delayed_free_list(mpte, free, FALSE); 6177 } 6178 } 6179 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6180 } 6181 6182 /* 6183 * pmap_remove_pte: do the things to unmap a page in a process 6184 */ 6185 static int 6186 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6187 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6188 { 6189 struct md_page *pvh; 6190 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6191 vm_page_t m; 6192 6193 PG_A = pmap_accessed_bit(pmap); 6194 PG_M = 
pmap_modified_bit(pmap); 6195 PG_RW = pmap_rw_bit(pmap); 6196 6197 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6198 oldpte = pte_load_clear(ptq); 6199 if (oldpte & PG_W) 6200 pmap->pm_stats.wired_count -= 1; 6201 pmap_resident_count_adj(pmap, -1); 6202 if (oldpte & PG_MANAGED) { 6203 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6204 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6205 vm_page_dirty(m); 6206 if (oldpte & PG_A) 6207 vm_page_aflag_set(m, PGA_REFERENCED); 6208 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6209 pmap_pvh_free(&m->md, pmap, va); 6210 if (TAILQ_EMPTY(&m->md.pv_list) && 6211 (m->flags & PG_FICTITIOUS) == 0) { 6212 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6213 if (TAILQ_EMPTY(&pvh->pv_list)) 6214 vm_page_aflag_clear(m, PGA_WRITEABLE); 6215 } 6216 pmap_delayed_invl_page(m); 6217 } 6218 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6219 } 6220 6221 /* 6222 * Remove a single page from a process address space 6223 */ 6224 static void 6225 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6226 struct spglist *free) 6227 { 6228 struct rwlock *lock; 6229 pt_entry_t *pte, PG_V; 6230 6231 PG_V = pmap_valid_bit(pmap); 6232 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6233 if ((*pde & PG_V) == 0) 6234 return; 6235 pte = pmap_pde_to_pte(pde, va); 6236 if ((*pte & PG_V) == 0) 6237 return; 6238 lock = NULL; 6239 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6240 if (lock != NULL) 6241 rw_wunlock(lock); 6242 pmap_invalidate_page(pmap, va); 6243 } 6244 6245 /* 6246 * Removes the specified range of addresses from the page table page. 6247 */ 6248 static bool 6249 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6250 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6251 { 6252 pt_entry_t PG_G, *pte; 6253 vm_offset_t va; 6254 bool anyvalid; 6255 6256 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6257 PG_G = pmap_global_bit(pmap); 6258 anyvalid = false; 6259 va = eva; 6260 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6261 sva += PAGE_SIZE) { 6262 if (*pte == 0) { 6263 if (va != eva) { 6264 pmap_invalidate_range(pmap, va, sva); 6265 va = eva; 6266 } 6267 continue; 6268 } 6269 if ((*pte & PG_G) == 0) 6270 anyvalid = true; 6271 else if (va == eva) 6272 va = sva; 6273 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6274 sva += PAGE_SIZE; 6275 break; 6276 } 6277 } 6278 if (va != eva) 6279 pmap_invalidate_range(pmap, va, sva); 6280 return (anyvalid); 6281 } 6282 6283 static void 6284 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 6285 { 6286 struct rwlock *lock; 6287 vm_page_t mt; 6288 vm_offset_t va_next; 6289 pml5_entry_t *pml5e; 6290 pml4_entry_t *pml4e; 6291 pdp_entry_t *pdpe; 6292 pd_entry_t ptpaddr, *pde; 6293 pt_entry_t PG_G, PG_V; 6294 struct spglist free; 6295 int anyvalid; 6296 6297 PG_G = pmap_global_bit(pmap); 6298 PG_V = pmap_valid_bit(pmap); 6299 6300 /* 6301 * If there are no resident pages besides the top level page 6302 * table page(s), there is nothing to do. Kernel pmap always 6303 * accounts whole preloaded area as resident, which makes its 6304 * resident count > 2. 6305 * Perform an unsynchronized read. This is, however, safe. 6306 */ 6307 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 6308 1 : 0)) 6309 return; 6310 6311 anyvalid = 0; 6312 SLIST_INIT(&free); 6313 6314 pmap_delayed_invl_start(); 6315 PMAP_LOCK(pmap); 6316 if (map_delete) 6317 pmap_pkru_on_remove(pmap, sva, eva); 6318 6319 /* 6320 * special handling of removing one page. 
a very 6321 * common operation and easy to short circuit some 6322 * code. 6323 */ 6324 if (sva + PAGE_SIZE == eva) { 6325 pde = pmap_pde(pmap, sva); 6326 if (pde && (*pde & PG_PS) == 0) { 6327 pmap_remove_page(pmap, sva, pde, &free); 6328 goto out; 6329 } 6330 } 6331 6332 lock = NULL; 6333 for (; sva < eva; sva = va_next) { 6334 if (pmap->pm_stats.resident_count == 0) 6335 break; 6336 6337 if (pmap_is_la57(pmap)) { 6338 pml5e = pmap_pml5e(pmap, sva); 6339 if ((*pml5e & PG_V) == 0) { 6340 va_next = (sva + NBPML5) & ~PML5MASK; 6341 if (va_next < sva) 6342 va_next = eva; 6343 continue; 6344 } 6345 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6346 } else { 6347 pml4e = pmap_pml4e(pmap, sva); 6348 } 6349 if ((*pml4e & PG_V) == 0) { 6350 va_next = (sva + NBPML4) & ~PML4MASK; 6351 if (va_next < sva) 6352 va_next = eva; 6353 continue; 6354 } 6355 6356 va_next = (sva + NBPDP) & ~PDPMASK; 6357 if (va_next < sva) 6358 va_next = eva; 6359 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6360 if ((*pdpe & PG_V) == 0) 6361 continue; 6362 if ((*pdpe & PG_PS) != 0) { 6363 KASSERT(va_next <= eva, 6364 ("partial update of non-transparent 1G mapping " 6365 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6366 *pdpe, sva, eva, va_next)); 6367 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6368 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6369 anyvalid = 1; 6370 *pdpe = 0; 6371 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6372 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6373 pmap_unwire_ptp(pmap, sva, mt, &free); 6374 continue; 6375 } 6376 6377 /* 6378 * Calculate index for next page table. 6379 */ 6380 va_next = (sva + NBPDR) & ~PDRMASK; 6381 if (va_next < sva) 6382 va_next = eva; 6383 6384 pde = pmap_pdpe_to_pde(pdpe, sva); 6385 ptpaddr = *pde; 6386 6387 /* 6388 * Weed out invalid mappings. 6389 */ 6390 if (ptpaddr == 0) 6391 continue; 6392 6393 /* 6394 * Check for large page. 6395 */ 6396 if ((ptpaddr & PG_PS) != 0) { 6397 /* 6398 * Are we removing the entire large page? If not, 6399 * demote the mapping and fall through. 6400 */ 6401 if (sva + NBPDR == va_next && eva >= va_next) { 6402 /* 6403 * The TLB entry for a PG_G mapping is 6404 * invalidated by pmap_remove_pde(). 6405 */ 6406 if ((ptpaddr & PG_G) == 0) 6407 anyvalid = 1; 6408 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6409 continue; 6410 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6411 &lock)) { 6412 /* The large page mapping was destroyed. */ 6413 continue; 6414 } else 6415 ptpaddr = *pde; 6416 } 6417 6418 /* 6419 * Limit our scan to either the end of the va represented 6420 * by the current page table page, or to the end of the 6421 * range being removed. 6422 */ 6423 if (va_next > eva) 6424 va_next = eva; 6425 6426 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6427 anyvalid = 1; 6428 } 6429 if (lock != NULL) 6430 rw_wunlock(lock); 6431 out: 6432 if (anyvalid) 6433 pmap_invalidate_all(pmap); 6434 PMAP_UNLOCK(pmap); 6435 pmap_delayed_invl_finish(); 6436 vm_page_free_pages_toq(&free, true); 6437 } 6438 6439 /* 6440 * Remove the given range of addresses from the specified map. 6441 * 6442 * It is assumed that the start and end are properly 6443 * rounded to the page size. 6444 */ 6445 void 6446 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6447 { 6448 pmap_remove1(pmap, sva, eva, false); 6449 } 6450 6451 /* 6452 * Remove the given range of addresses as part of a logical unmap 6453 * operation. 
This has the effect of calling pmap_remove(), but 6454 * also clears any metadata that should persist for the lifetime 6455 * of a logical mapping. 6456 */ 6457 void 6458 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6459 { 6460 pmap_remove1(pmap, sva, eva, true); 6461 } 6462 6463 /* 6464 * Routine: pmap_remove_all 6465 * Function: 6466 * Removes this physical page from 6467 * all physical maps in which it resides. 6468 * Reflects back modify bits to the pager. 6469 * 6470 * Notes: 6471 * Original versions of this routine were very 6472 * inefficient because they iteratively called 6473 * pmap_remove (slow...) 6474 */ 6475 6476 void 6477 pmap_remove_all(vm_page_t m) 6478 { 6479 struct md_page *pvh; 6480 pv_entry_t pv; 6481 pmap_t pmap; 6482 struct rwlock *lock; 6483 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6484 pd_entry_t *pde; 6485 vm_offset_t va; 6486 struct spglist free; 6487 int pvh_gen, md_gen; 6488 6489 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6490 ("pmap_remove_all: page %p is not managed", m)); 6491 SLIST_INIT(&free); 6492 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6493 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6494 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6495 rw_wlock(lock); 6496 retry: 6497 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6498 pmap = PV_PMAP(pv); 6499 if (!PMAP_TRYLOCK(pmap)) { 6500 pvh_gen = pvh->pv_gen; 6501 rw_wunlock(lock); 6502 PMAP_LOCK(pmap); 6503 rw_wlock(lock); 6504 if (pvh_gen != pvh->pv_gen) { 6505 PMAP_UNLOCK(pmap); 6506 goto retry; 6507 } 6508 } 6509 va = pv->pv_va; 6510 pde = pmap_pde(pmap, va); 6511 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6512 PMAP_UNLOCK(pmap); 6513 } 6514 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6515 pmap = PV_PMAP(pv); 6516 if (!PMAP_TRYLOCK(pmap)) { 6517 pvh_gen = pvh->pv_gen; 6518 md_gen = m->md.pv_gen; 6519 rw_wunlock(lock); 6520 PMAP_LOCK(pmap); 6521 rw_wlock(lock); 6522 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6523 PMAP_UNLOCK(pmap); 6524 goto retry; 6525 } 6526 } 6527 PG_A = pmap_accessed_bit(pmap); 6528 PG_M = pmap_modified_bit(pmap); 6529 PG_RW = pmap_rw_bit(pmap); 6530 pmap_resident_count_adj(pmap, -1); 6531 pde = pmap_pde(pmap, pv->pv_va); 6532 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6533 " a 2mpage in page %p's pv list", m)); 6534 pte = pmap_pde_to_pte(pde, pv->pv_va); 6535 tpte = pte_load_clear(pte); 6536 if (tpte & PG_W) 6537 pmap->pm_stats.wired_count--; 6538 if (tpte & PG_A) 6539 vm_page_aflag_set(m, PGA_REFERENCED); 6540 6541 /* 6542 * Update the vm_page_t clean and reference bits. 
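 * The page is marked dirty only if the mapping was both modified and
 * writeable (PG_M and PG_RW set); the reference bit was already folded
 * into PGA_REFERENCED above.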
6543 */ 6544 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6545 vm_page_dirty(m); 6546 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6547 pmap_invalidate_page(pmap, pv->pv_va); 6548 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6549 m->md.pv_gen++; 6550 free_pv_entry(pmap, pv); 6551 PMAP_UNLOCK(pmap); 6552 } 6553 vm_page_aflag_clear(m, PGA_WRITEABLE); 6554 rw_wunlock(lock); 6555 pmap_delayed_invl_wait(m); 6556 vm_page_free_pages_toq(&free, true); 6557 } 6558 6559 /* 6560 * pmap_protect_pde: do the things to protect a 2mpage in a process 6561 */ 6562 static boolean_t 6563 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6564 { 6565 pd_entry_t newpde, oldpde; 6566 vm_page_t m, mt; 6567 boolean_t anychanged; 6568 pt_entry_t PG_G, PG_M, PG_RW; 6569 6570 PG_G = pmap_global_bit(pmap); 6571 PG_M = pmap_modified_bit(pmap); 6572 PG_RW = pmap_rw_bit(pmap); 6573 6574 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6575 KASSERT((sva & PDRMASK) == 0, 6576 ("pmap_protect_pde: sva is not 2mpage aligned")); 6577 anychanged = FALSE; 6578 retry: 6579 oldpde = newpde = *pde; 6580 if ((prot & VM_PROT_WRITE) == 0) { 6581 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6582 (PG_MANAGED | PG_M | PG_RW)) { 6583 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6584 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6585 vm_page_dirty(mt); 6586 } 6587 newpde &= ~(PG_RW | PG_M); 6588 } 6589 if ((prot & VM_PROT_EXECUTE) == 0) 6590 newpde |= pg_nx; 6591 if (newpde != oldpde) { 6592 /* 6593 * As an optimization to future operations on this PDE, clear 6594 * PG_PROMOTED. The impending invalidation will remove any 6595 * lingering 4KB page mappings from the TLB. 6596 */ 6597 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6598 goto retry; 6599 if ((oldpde & PG_G) != 0) 6600 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6601 else 6602 anychanged = TRUE; 6603 } 6604 return (anychanged); 6605 } 6606 6607 /* 6608 * Set the physical protection on the 6609 * specified range of this map as requested. 6610 */ 6611 void 6612 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6613 { 6614 vm_page_t m; 6615 vm_offset_t va_next; 6616 pml4_entry_t *pml4e; 6617 pdp_entry_t *pdpe; 6618 pd_entry_t ptpaddr, *pde; 6619 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6620 pt_entry_t obits, pbits; 6621 boolean_t anychanged; 6622 6623 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6624 if (prot == VM_PROT_NONE) { 6625 pmap_remove(pmap, sva, eva); 6626 return; 6627 } 6628 6629 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6630 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6631 return; 6632 6633 PG_G = pmap_global_bit(pmap); 6634 PG_M = pmap_modified_bit(pmap); 6635 PG_V = pmap_valid_bit(pmap); 6636 PG_RW = pmap_rw_bit(pmap); 6637 anychanged = FALSE; 6638 6639 /* 6640 * Although this function delays and batches the invalidation 6641 * of stale TLB entries, it does not need to call 6642 * pmap_delayed_invl_start() and 6643 * pmap_delayed_invl_finish(), because it does not 6644 * ordinarily destroy mappings. Stale TLB entries from 6645 * protection-only changes need only be invalidated before the 6646 * pmap lock is released, because protection-only changes do 6647 * not destroy PV entries. Even operations that iterate over 6648 * a physical page's PV list of mappings, like 6649 * pmap_remove_write(), acquire the pmap lock for each 6650 * mapping. Consequently, for protection-only changes, the 6651 * pmap lock suffices to synchronize both page table and TLB 6652 * updates. 
6653 * 6654 * This function only destroys a mapping if pmap_demote_pde() 6655 * fails. In that case, stale TLB entries are immediately 6656 * invalidated. 6657 */ 6658 6659 PMAP_LOCK(pmap); 6660 for (; sva < eva; sva = va_next) { 6661 pml4e = pmap_pml4e(pmap, sva); 6662 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6663 va_next = (sva + NBPML4) & ~PML4MASK; 6664 if (va_next < sva) 6665 va_next = eva; 6666 continue; 6667 } 6668 6669 va_next = (sva + NBPDP) & ~PDPMASK; 6670 if (va_next < sva) 6671 va_next = eva; 6672 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6673 if ((*pdpe & PG_V) == 0) 6674 continue; 6675 if ((*pdpe & PG_PS) != 0) { 6676 KASSERT(va_next <= eva, 6677 ("partial update of non-transparent 1G mapping " 6678 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6679 *pdpe, sva, eva, va_next)); 6680 retry_pdpe: 6681 obits = pbits = *pdpe; 6682 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6683 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6684 if ((prot & VM_PROT_WRITE) == 0) 6685 pbits &= ~(PG_RW | PG_M); 6686 if ((prot & VM_PROT_EXECUTE) == 0) 6687 pbits |= pg_nx; 6688 6689 if (pbits != obits) { 6690 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6691 /* PG_PS cannot be cleared under us, */ 6692 goto retry_pdpe; 6693 anychanged = TRUE; 6694 } 6695 continue; 6696 } 6697 6698 va_next = (sva + NBPDR) & ~PDRMASK; 6699 if (va_next < sva) 6700 va_next = eva; 6701 6702 pde = pmap_pdpe_to_pde(pdpe, sva); 6703 ptpaddr = *pde; 6704 6705 /* 6706 * Weed out invalid mappings. 6707 */ 6708 if (ptpaddr == 0) 6709 continue; 6710 6711 /* 6712 * Check for large page. 6713 */ 6714 if ((ptpaddr & PG_PS) != 0) { 6715 /* 6716 * Are we protecting the entire large page? If not, 6717 * demote the mapping and fall through. 6718 */ 6719 if (sva + NBPDR == va_next && eva >= va_next) { 6720 /* 6721 * The TLB entry for a PG_G mapping is 6722 * invalidated by pmap_protect_pde(). 6723 */ 6724 if (pmap_protect_pde(pmap, pde, sva, prot)) 6725 anychanged = TRUE; 6726 continue; 6727 } else if (!pmap_demote_pde(pmap, pde, sva)) { 6728 /* 6729 * The large page mapping was destroyed. 6730 */ 6731 continue; 6732 } 6733 } 6734 6735 if (va_next > eva) 6736 va_next = eva; 6737 6738 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6739 sva += PAGE_SIZE) { 6740 retry: 6741 obits = pbits = *pte; 6742 if ((pbits & PG_V) == 0) 6743 continue; 6744 6745 if ((prot & VM_PROT_WRITE) == 0) { 6746 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6747 (PG_MANAGED | PG_M | PG_RW)) { 6748 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6749 vm_page_dirty(m); 6750 } 6751 pbits &= ~(PG_RW | PG_M); 6752 } 6753 if ((prot & VM_PROT_EXECUTE) == 0) 6754 pbits |= pg_nx; 6755 6756 if (pbits != obits) { 6757 if (!atomic_cmpset_long(pte, obits, pbits)) 6758 goto retry; 6759 if (obits & PG_G) 6760 pmap_invalidate_page(pmap, sva); 6761 else 6762 anychanged = TRUE; 6763 } 6764 } 6765 } 6766 if (anychanged) 6767 pmap_invalidate_all(pmap); 6768 PMAP_UNLOCK(pmap); 6769 } 6770 6771 #if VM_NRESERVLEVEL > 0 6772 static bool 6773 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6774 { 6775 6776 if (pmap->pm_type != PT_EPT) 6777 return (false); 6778 return ((pde & EPT_PG_EXECUTE) != 0); 6779 } 6780 6781 /* 6782 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6783 * single page table page (PTP) to a single 2MB page mapping. For promotion 6784 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6785 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6786 * identical characteristics. 
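 *
 * Concretely, condition (1) means that the first PTE must map a
 * 2MB-aligned physical address and PTE i must map that address plus
 * i * PAGE_SIZE for i in [1, NPTEPG - 1]; condition (2) is checked by
 * comparing the PG_PTE_PROMOTE bits of every PTE against those of the
 * first PTE.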
6787 */ 6788 static void 6789 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, 6790 struct rwlock **lockp) 6791 { 6792 pd_entry_t newpde; 6793 pt_entry_t *firstpte, oldpte, pa, *pte; 6794 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; 6795 int PG_PTE_CACHE; 6796 6797 PG_A = pmap_accessed_bit(pmap); 6798 PG_G = pmap_global_bit(pmap); 6799 PG_M = pmap_modified_bit(pmap); 6800 PG_V = pmap_valid_bit(pmap); 6801 PG_RW = pmap_rw_bit(pmap); 6802 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6803 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6804 6805 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6806 6807 /* 6808 * Examine the first PTE in the specified PTP. Abort if this PTE is 6809 * ineligible for promotion due to hardware errata, invalid, or does 6810 * not map the first 4KB physical page within a 2MB page. 6811 */ 6812 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6813 newpde = *firstpte; 6814 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) 6815 return; 6816 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 6817 counter_u64_add(pmap_pde_p_failures, 1); 6818 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6819 " in pmap %p", va, pmap); 6820 return; 6821 } 6822 6823 /* 6824 * Both here and in the below "for" loop, to allow for repromotion 6825 * after MADV_FREE, conditionally write protect a clean PTE before 6826 * possibly aborting the promotion due to other PTE attributes. Why? 6827 * Suppose that MADV_FREE is applied to a part of a superpage, the 6828 * address range [S, E). pmap_advise() will demote the superpage 6829 * mapping, destroy the 4KB page mapping at the end of [S, E), and 6830 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 6831 * imagine that the memory in [S, E) is recycled, but the last 4KB 6832 * page in [S, E) is not the last to be rewritten, or simply accessed. 6833 * In other words, there is still a 4KB page in [S, E), call it P, 6834 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 6835 * we write protect P before aborting the promotion, if and when P is 6836 * finally rewritten, there won't be a page fault to trigger 6837 * repromotion. 6838 */ 6839 setpde: 6840 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6841 /* 6842 * When PG_M is already clear, PG_RW can be cleared without 6843 * a TLB invalidation. 6844 */ 6845 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6846 goto setpde; 6847 newpde &= ~PG_RW; 6848 } 6849 if ((newpde & PG_A) == 0) { 6850 counter_u64_add(pmap_pde_p_failures, 1); 6851 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6852 " in pmap %p", va, pmap); 6853 return; 6854 } 6855 6856 /* 6857 * Examine each of the other PTEs in the specified PTP. Abort if this 6858 * PTE maps an unexpected 4KB physical page or does not have identical 6859 * characteristics to the first PTE. 6860 */ 6861 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 6862 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6863 oldpte = *pte; 6864 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 6865 counter_u64_add(pmap_pde_p_failures, 1); 6866 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6867 " in pmap %p", va, pmap); 6868 return; 6869 } 6870 setpte: 6871 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6872 /* 6873 * When PG_M is already clear, PG_RW can be cleared 6874 * without a TLB invalidation. 
6875 */ 6876 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6877 goto setpte; 6878 oldpte &= ~PG_RW; 6879 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6880 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6881 (va & ~PDRMASK), pmap); 6882 } 6883 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 6884 counter_u64_add(pmap_pde_p_failures, 1); 6885 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6886 " in pmap %p", va, pmap); 6887 return; 6888 } 6889 pa -= PAGE_SIZE; 6890 } 6891 6892 /* 6893 * Save the page table page in its current state until the PDE 6894 * mapping the superpage is demoted by pmap_demote_pde() or 6895 * destroyed by pmap_remove_pde(). 6896 */ 6897 if (mpte == NULL) 6898 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6899 KASSERT(mpte >= vm_page_array && 6900 mpte < &vm_page_array[vm_page_array_size], 6901 ("pmap_promote_pde: page table page is out of range")); 6902 KASSERT(mpte->pindex == pmap_pde_pindex(va), 6903 ("pmap_promote_pde: page table page's pindex is wrong " 6904 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 6905 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 6906 if (pmap_insert_pt_page(pmap, mpte, true)) { 6907 counter_u64_add(pmap_pde_p_failures, 1); 6908 CTR2(KTR_PMAP, 6909 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 6910 pmap); 6911 return; 6912 } 6913 6914 /* 6915 * Promote the pv entries. 6916 */ 6917 if ((newpde & PG_MANAGED) != 0) 6918 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 6919 6920 /* 6921 * Propagate the PAT index to its proper position. 6922 */ 6923 newpde = pmap_swap_pat(pmap, newpde); 6924 6925 /* 6926 * Map the superpage. 6927 */ 6928 if (workaround_erratum383) 6929 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 6930 else 6931 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 6932 6933 counter_u64_add(pmap_pde_promotions, 1); 6934 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 6935 " in pmap %p", va, pmap); 6936 } 6937 #endif /* VM_NRESERVLEVEL > 0 */ 6938 6939 static int 6940 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 6941 int psind) 6942 { 6943 vm_page_t mp; 6944 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 6945 6946 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6947 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 6948 ("psind %d unexpected", psind)); 6949 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 6950 ("unaligned phys address %#lx newpte %#lx psind %d", 6951 newpte & PG_FRAME, newpte, psind)); 6952 KASSERT((va & (pagesizes[psind] - 1)) == 0, 6953 ("unaligned va %#lx psind %d", va, psind)); 6954 KASSERT(va < VM_MAXUSER_ADDRESS, 6955 ("kernel mode non-transparent superpage")); /* XXXKIB */ 6956 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 6957 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 6958 6959 PG_V = pmap_valid_bit(pmap); 6960 6961 restart: 6962 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) 6963 return (KERN_PROTECTION_FAILURE); 6964 pten = newpte; 6965 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 6966 pten |= pmap_pkru_get(pmap, va); 6967 6968 if (psind == 2) { /* 1G */ 6969 pml4e = pmap_pml4e(pmap, va); 6970 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6971 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 6972 NULL, va); 6973 if (mp == NULL) 6974 goto allocf; 6975 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 6976 pdpe = &pdpe[pmap_pdpe_index(va)]; 6977 origpte = *pdpe; 6978 MPASS(origpte == 0); 6979 } else { 6980 pdpe = 
pmap_pml4e_to_pdpe(pml4e, va); 6981 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 6982 origpte = *pdpe; 6983 if ((origpte & PG_V) == 0) { 6984 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 6985 mp->ref_count++; 6986 } 6987 } 6988 *pdpe = pten; 6989 } else /* (psind == 1) */ { /* 2M */ 6990 pde = pmap_pde(pmap, va); 6991 if (pde == NULL) { 6992 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 6993 NULL, va); 6994 if (mp == NULL) 6995 goto allocf; 6996 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 6997 pde = &pde[pmap_pde_index(va)]; 6998 origpte = *pde; 6999 MPASS(origpte == 0); 7000 } else { 7001 origpte = *pde; 7002 if ((origpte & PG_V) == 0) { 7003 pdpe = pmap_pdpe(pmap, va); 7004 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); 7005 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 7006 mp->ref_count++; 7007 } 7008 } 7009 *pde = pten; 7010 } 7011 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 7012 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 7013 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 7014 va, psind == 2 ? "1G" : "2M", origpte, pten)); 7015 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 7016 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 7017 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 7018 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 7019 if ((origpte & PG_V) == 0) 7020 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 7021 7022 return (KERN_SUCCESS); 7023 7024 allocf: 7025 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 7026 return (KERN_RESOURCE_SHORTAGE); 7027 PMAP_UNLOCK(pmap); 7028 vm_wait(NULL); 7029 PMAP_LOCK(pmap); 7030 goto restart; 7031 } 7032 7033 /* 7034 * Insert the given physical page (p) at 7035 * the specified virtual address (v) in the 7036 * target physical map with the protection requested. 7037 * 7038 * If specified, the page will be wired down, meaning 7039 * that the related pte can not be reclaimed. 7040 * 7041 * NB: This is the only routine which MAY NOT lazy-evaluate 7042 * or lose information. That is, this routine must actually 7043 * insert this page into the given map NOW. 7044 * 7045 * When destroying both a page table and PV entry, this function 7046 * performs the TLB invalidation before releasing the PV list 7047 * lock, so we do not need pmap_delayed_invl_page() calls here. 
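 *
 * A hypothetical caller sketch (for illustration only; real callers
 * live in the VM fault and wiring code and supply their own prot and
 * flags):
 *
 *	rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_ENTER_NOSLEEP, 0);
 *	if (rv == KERN_RESOURCE_SHORTAGE)
 *		(a page table page could not be allocated without sleeping)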
7048 */ 7049 int 7050 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7051 u_int flags, int8_t psind) 7052 { 7053 struct rwlock *lock; 7054 pd_entry_t *pde; 7055 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 7056 pt_entry_t newpte, origpte; 7057 pv_entry_t pv; 7058 vm_paddr_t opa, pa; 7059 vm_page_t mpte, om; 7060 int rv; 7061 boolean_t nosleep; 7062 7063 PG_A = pmap_accessed_bit(pmap); 7064 PG_G = pmap_global_bit(pmap); 7065 PG_M = pmap_modified_bit(pmap); 7066 PG_V = pmap_valid_bit(pmap); 7067 PG_RW = pmap_rw_bit(pmap); 7068 7069 va = trunc_page(va); 7070 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7071 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7072 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7073 va)); 7074 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7075 ("pmap_enter: managed mapping within the clean submap")); 7076 if ((m->oflags & VPO_UNMANAGED) == 0) 7077 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7078 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7079 ("pmap_enter: flags %u has reserved bits set", flags)); 7080 pa = VM_PAGE_TO_PHYS(m); 7081 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7082 if ((flags & VM_PROT_WRITE) != 0) 7083 newpte |= PG_M; 7084 if ((prot & VM_PROT_WRITE) != 0) 7085 newpte |= PG_RW; 7086 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7087 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7088 if ((prot & VM_PROT_EXECUTE) == 0) 7089 newpte |= pg_nx; 7090 if ((flags & PMAP_ENTER_WIRED) != 0) 7091 newpte |= PG_W; 7092 if (va < VM_MAXUSER_ADDRESS) 7093 newpte |= PG_U; 7094 if (pmap == kernel_pmap) 7095 newpte |= PG_G; 7096 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7097 7098 /* 7099 * Set modified bit gratuitously for writeable mappings if 7100 * the page is unmanaged. We do not want to take a fault 7101 * to do the dirty bit accounting for these mappings. 7102 */ 7103 if ((m->oflags & VPO_UNMANAGED) != 0) { 7104 if ((newpte & PG_RW) != 0) 7105 newpte |= PG_M; 7106 } else 7107 newpte |= PG_MANAGED; 7108 7109 lock = NULL; 7110 PMAP_LOCK(pmap); 7111 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7112 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7113 ("managed largepage va %#lx flags %#x", va, flags)); 7114 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7115 psind); 7116 goto out; 7117 } 7118 if (psind == 1) { 7119 /* Assert the required virtual and physical alignment. */ 7120 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7121 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7122 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7123 goto out; 7124 } 7125 mpte = NULL; 7126 7127 /* 7128 * In the case that a page table page is not 7129 * resident, we are creating it here. 7130 */ 7131 retry: 7132 pde = pmap_pde(pmap, va); 7133 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7134 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7135 pte = pmap_pde_to_pte(pde, va); 7136 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7137 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7138 mpte->ref_count++; 7139 } 7140 } else if (va < VM_MAXUSER_ADDRESS) { 7141 /* 7142 * Here if the pte page isn't mapped, or if it has been 7143 * deallocated. 7144 */ 7145 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7146 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7147 nosleep ? 
NULL : &lock, va); 7148 if (mpte == NULL && nosleep) { 7149 rv = KERN_RESOURCE_SHORTAGE; 7150 goto out; 7151 } 7152 goto retry; 7153 } else 7154 panic("pmap_enter: invalid page directory va=%#lx", va); 7155 7156 origpte = *pte; 7157 pv = NULL; 7158 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7159 newpte |= pmap_pkru_get(pmap, va); 7160 7161 /* 7162 * Is the specified virtual address already mapped? 7163 */ 7164 if ((origpte & PG_V) != 0) { 7165 /* 7166 * Wiring change, just update stats. We don't worry about 7167 * wiring PT pages as they remain resident as long as there 7168 * are valid mappings in them. Hence, if a user page is wired, 7169 * the PT page will be also. 7170 */ 7171 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7172 pmap->pm_stats.wired_count++; 7173 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7174 pmap->pm_stats.wired_count--; 7175 7176 /* 7177 * Remove the extra PT page reference. 7178 */ 7179 if (mpte != NULL) { 7180 mpte->ref_count--; 7181 KASSERT(mpte->ref_count > 0, 7182 ("pmap_enter: missing reference to page table page," 7183 " va: 0x%lx", va)); 7184 } 7185 7186 /* 7187 * Has the physical page changed? 7188 */ 7189 opa = origpte & PG_FRAME; 7190 if (opa == pa) { 7191 /* 7192 * No, might be a protection or wiring change. 7193 */ 7194 if ((origpte & PG_MANAGED) != 0 && 7195 (newpte & PG_RW) != 0) 7196 vm_page_aflag_set(m, PGA_WRITEABLE); 7197 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7198 goto unchanged; 7199 goto validate; 7200 } 7201 7202 /* 7203 * The physical page has changed. Temporarily invalidate 7204 * the mapping. This ensures that all threads sharing the 7205 * pmap keep a consistent view of the mapping, which is 7206 * necessary for the correct handling of COW faults. It 7207 * also permits reuse of the old mapping's PV entry, 7208 * avoiding an allocation. 7209 * 7210 * For consistency, handle unmanaged mappings the same way. 7211 */ 7212 origpte = pte_load_clear(pte); 7213 KASSERT((origpte & PG_FRAME) == opa, 7214 ("pmap_enter: unexpected pa update for %#lx", va)); 7215 if ((origpte & PG_MANAGED) != 0) { 7216 om = PHYS_TO_VM_PAGE(opa); 7217 7218 /* 7219 * The pmap lock is sufficient to synchronize with 7220 * concurrent calls to pmap_page_test_mappings() and 7221 * pmap_ts_referenced(). 7222 */ 7223 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7224 vm_page_dirty(om); 7225 if ((origpte & PG_A) != 0) { 7226 pmap_invalidate_page(pmap, va); 7227 vm_page_aflag_set(om, PGA_REFERENCED); 7228 } 7229 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7230 pv = pmap_pvh_remove(&om->md, pmap, va); 7231 KASSERT(pv != NULL, 7232 ("pmap_enter: no PV entry for %#lx", va)); 7233 if ((newpte & PG_MANAGED) == 0) 7234 free_pv_entry(pmap, pv); 7235 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7236 TAILQ_EMPTY(&om->md.pv_list) && 7237 ((om->flags & PG_FICTITIOUS) != 0 || 7238 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7239 vm_page_aflag_clear(om, PGA_WRITEABLE); 7240 } else { 7241 /* 7242 * Since this mapping is unmanaged, assume that PG_A 7243 * is set. 7244 */ 7245 pmap_invalidate_page(pmap, va); 7246 } 7247 origpte = 0; 7248 } else { 7249 /* 7250 * Increment the counters. 7251 */ 7252 if ((newpte & PG_W) != 0) 7253 pmap->pm_stats.wired_count++; 7254 pmap_resident_count_adj(pmap, 1); 7255 } 7256 7257 /* 7258 * Enter on the PV list if part of our managed memory. 
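 * If the previous mapping at this va was managed, its pv entry ("pv",
 * removed above) is reused; otherwise a new entry is allocated with
 * get_pv_entry().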
7259 */ 7260 if ((newpte & PG_MANAGED) != 0) { 7261 if (pv == NULL) { 7262 pv = get_pv_entry(pmap, &lock); 7263 pv->pv_va = va; 7264 } 7265 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7266 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7267 m->md.pv_gen++; 7268 if ((newpte & PG_RW) != 0) 7269 vm_page_aflag_set(m, PGA_WRITEABLE); 7270 } 7271 7272 /* 7273 * Update the PTE. 7274 */ 7275 if ((origpte & PG_V) != 0) { 7276 validate: 7277 origpte = pte_load_store(pte, newpte); 7278 KASSERT((origpte & PG_FRAME) == pa, 7279 ("pmap_enter: unexpected pa update for %#lx", va)); 7280 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7281 (PG_M | PG_RW)) { 7282 if ((origpte & PG_MANAGED) != 0) 7283 vm_page_dirty(m); 7284 7285 /* 7286 * Although the PTE may still have PG_RW set, TLB 7287 * invalidation may nonetheless be required because 7288 * the PTE no longer has PG_M set. 7289 */ 7290 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7291 /* 7292 * This PTE change does not require TLB invalidation. 7293 */ 7294 goto unchanged; 7295 } 7296 if ((origpte & PG_A) != 0) 7297 pmap_invalidate_page(pmap, va); 7298 } else 7299 pte_store(pte, newpte); 7300 7301 unchanged: 7302 7303 #if VM_NRESERVLEVEL > 0 7304 /* 7305 * If both the page table page and the reservation are fully 7306 * populated, then attempt promotion. 7307 */ 7308 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7309 pmap_ps_enabled(pmap) && 7310 (m->flags & PG_FICTITIOUS) == 0 && 7311 vm_reserv_level_iffullpop(m) == 0) 7312 pmap_promote_pde(pmap, pde, va, mpte, &lock); 7313 #endif 7314 7315 rv = KERN_SUCCESS; 7316 out: 7317 if (lock != NULL) 7318 rw_wunlock(lock); 7319 PMAP_UNLOCK(pmap); 7320 return (rv); 7321 } 7322 7323 /* 7324 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 7325 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 7326 * value. See pmap_enter_pde() for the possible error values when "no sleep", 7327 * "no replace", and "no reclaim" are specified. 7328 */ 7329 static int 7330 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7331 struct rwlock **lockp) 7332 { 7333 pd_entry_t newpde; 7334 pt_entry_t PG_V; 7335 7336 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7337 PG_V = pmap_valid_bit(pmap); 7338 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 7339 PG_PS | PG_V; 7340 if ((m->oflags & VPO_UNMANAGED) == 0) 7341 newpde |= PG_MANAGED; 7342 if ((prot & VM_PROT_EXECUTE) == 0) 7343 newpde |= pg_nx; 7344 if (va < VM_MAXUSER_ADDRESS) 7345 newpde |= PG_U; 7346 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7347 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 7348 } 7349 7350 /* 7351 * Returns true if every page table entry in the specified page table page is 7352 * zero. 7353 */ 7354 static bool 7355 pmap_every_pte_zero(vm_paddr_t pa) 7356 { 7357 pt_entry_t *pt_end, *pte; 7358 7359 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7360 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7361 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7362 if (*pte != 0) 7363 return (false); 7364 } 7365 return (true); 7366 } 7367 7368 /* 7369 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7370 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 7371 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. 
Returns 7372 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB 7373 * page mapping already exists within the 2MB virtual address range starting 7374 * at the specified virtual address or (2) the requested 2MB page mapping is 7375 * not supported due to hardware errata. Returns KERN_NO_SPACE if 7376 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at 7377 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU 7378 * settings are not the same across the 2MB virtual address range starting at 7379 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either 7380 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation 7381 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation 7382 * failed. 7383 * 7384 * The parameter "m" is only used when creating a managed, writeable mapping. 7385 */ 7386 static int 7387 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7388 vm_page_t m, struct rwlock **lockp) 7389 { 7390 struct spglist free; 7391 pd_entry_t oldpde, *pde; 7392 pt_entry_t PG_G, PG_RW, PG_V; 7393 vm_page_t mt, pdpg; 7394 7395 KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, 7396 ("pmap_enter_pde: cannot create wired user mapping")); 7397 PG_G = pmap_global_bit(pmap); 7398 PG_RW = pmap_rw_bit(pmap); 7399 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7400 ("pmap_enter_pde: newpde is missing PG_M")); 7401 PG_V = pmap_valid_bit(pmap); 7402 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7403 7404 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7405 newpde))) { 7406 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7407 " in pmap %p", va, pmap); 7408 return (KERN_FAILURE); 7409 } 7410 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7411 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7412 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7413 " in pmap %p", va, pmap); 7414 return (KERN_RESOURCE_SHORTAGE); 7415 } 7416 7417 /* 7418 * If pkru is not same for the whole pde range, return failure 7419 * and let vm_fault() cope. Check after pde allocation, since 7420 * it could sleep. 7421 */ 7422 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 7423 pmap_abort_ptp(pmap, va, pdpg); 7424 return (KERN_PROTECTION_FAILURE); 7425 } 7426 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 7427 newpde &= ~X86_PG_PKU_MASK; 7428 newpde |= pmap_pkru_get(pmap, va); 7429 } 7430 7431 /* 7432 * If there are existing mappings, either abort or remove them. 7433 */ 7434 oldpde = *pde; 7435 if ((oldpde & PG_V) != 0) { 7436 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7437 ("pmap_enter_pde: pdpg's reference count is too low")); 7438 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 7439 if ((oldpde & PG_PS) != 0) { 7440 if (pdpg != NULL) 7441 pdpg->ref_count--; 7442 CTR2(KTR_PMAP, 7443 "pmap_enter_pde: no space for va %#lx" 7444 " in pmap %p", va, pmap); 7445 return (KERN_NO_SPACE); 7446 } else if (va < VM_MAXUSER_ADDRESS || 7447 !pmap_every_pte_zero(oldpde & PG_FRAME)) { 7448 if (pdpg != NULL) 7449 pdpg->ref_count--; 7450 CTR2(KTR_PMAP, 7451 "pmap_enter_pde: failure for va %#lx" 7452 " in pmap %p", va, pmap); 7453 return (KERN_FAILURE); 7454 } 7455 } 7456 /* Break the existing mapping(s). */ 7457 SLIST_INIT(&free); 7458 if ((oldpde & PG_PS) != 0) { 7459 /* 7460 * The reference to the PD page that was acquired by 7461 * pmap_alloc_pde() ensures that it won't be freed. 
7462 * However, if the PDE resulted from a promotion, then 7463 * a reserved PT page could be freed. 7464 */ 7465 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7466 if ((oldpde & PG_G) == 0) 7467 pmap_invalidate_pde_page(pmap, va, oldpde); 7468 } else { 7469 pmap_delayed_invl_start(); 7470 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7471 lockp)) 7472 pmap_invalidate_all(pmap); 7473 pmap_delayed_invl_finish(); 7474 } 7475 if (va < VM_MAXUSER_ADDRESS) { 7476 vm_page_free_pages_toq(&free, true); 7477 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7478 pde)); 7479 } else { 7480 KASSERT(SLIST_EMPTY(&free), 7481 ("pmap_enter_pde: freed kernel page table page")); 7482 7483 /* 7484 * Both pmap_remove_pde() and pmap_remove_ptes() will 7485 * leave the kernel page table page zero filled. 7486 */ 7487 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7488 if (pmap_insert_pt_page(pmap, mt, false)) 7489 panic("pmap_enter_pde: trie insert failed"); 7490 } 7491 } 7492 7493 if ((newpde & PG_MANAGED) != 0) { 7494 /* 7495 * Abort this mapping if its PV entry could not be created. 7496 */ 7497 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7498 if (pdpg != NULL) 7499 pmap_abort_ptp(pmap, va, pdpg); 7500 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7501 " in pmap %p", va, pmap); 7502 return (KERN_RESOURCE_SHORTAGE); 7503 } 7504 if ((newpde & PG_RW) != 0) { 7505 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7506 vm_page_aflag_set(mt, PGA_WRITEABLE); 7507 } 7508 } 7509 7510 /* 7511 * Increment counters. 7512 */ 7513 if ((newpde & PG_W) != 0) 7514 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7515 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7516 7517 /* 7518 * Map the superpage. (This is not a promoted mapping; there will not 7519 * be any lingering 4KB page mappings in the TLB.) 7520 */ 7521 pde_store(pde, newpde); 7522 7523 counter_u64_add(pmap_pde_mappings, 1); 7524 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7525 va, pmap); 7526 return (KERN_SUCCESS); 7527 } 7528 7529 /* 7530 * Maps a sequence of resident pages belonging to the same object. 7531 * The sequence begins with the given page m_start. This page is 7532 * mapped at the given virtual address start. Each subsequent page is 7533 * mapped at a virtual address that is offset from start by the same 7534 * amount as the page is offset from m_start within the object. The 7535 * last page in the sequence is the page with the largest offset from 7536 * m_start that can be mapped at a virtual address less than the given 7537 * virtual address end. Not every virtual page between start and end 7538 * is mapped; only those for which a resident page exists with the 7539 * corresponding offset from m_start are mapped. 
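 * For example, when a run of pages starts at a 2MB-aligned virtual
 * address, fits below "end", and is backed by a fully populated 2MB
 * physical superpage (psind == 1), a single call to pmap_enter_2mpage()
 * stands in for 512 individual 4KB mappings.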
7540 */ 7541 void 7542 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7543 vm_page_t m_start, vm_prot_t prot) 7544 { 7545 struct rwlock *lock; 7546 vm_offset_t va; 7547 vm_page_t m, mpte; 7548 vm_pindex_t diff, psize; 7549 int rv; 7550 7551 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7552 7553 psize = atop(end - start); 7554 mpte = NULL; 7555 m = m_start; 7556 lock = NULL; 7557 PMAP_LOCK(pmap); 7558 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7559 va = start + ptoa(diff); 7560 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7561 m->psind == 1 && pmap_ps_enabled(pmap) && 7562 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 7563 KERN_SUCCESS || rv == KERN_NO_SPACE)) 7564 m = &m[NBPDR / PAGE_SIZE - 1]; 7565 else 7566 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7567 mpte, &lock); 7568 m = TAILQ_NEXT(m, listq); 7569 } 7570 if (lock != NULL) 7571 rw_wunlock(lock); 7572 PMAP_UNLOCK(pmap); 7573 } 7574 7575 /* 7576 * this code makes some *MAJOR* assumptions: 7577 * 1. Current pmap & pmap exists. 7578 * 2. Not wired. 7579 * 3. Read access. 7580 * 4. No page table pages. 7581 * but is *MUCH* faster than pmap_enter... 7582 */ 7583 7584 void 7585 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7586 { 7587 struct rwlock *lock; 7588 7589 lock = NULL; 7590 PMAP_LOCK(pmap); 7591 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7592 if (lock != NULL) 7593 rw_wunlock(lock); 7594 PMAP_UNLOCK(pmap); 7595 } 7596 7597 static vm_page_t 7598 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7599 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7600 { 7601 pt_entry_t newpte, *pte, PG_V; 7602 7603 KASSERT(!VA_IS_CLEANMAP(va) || 7604 (m->oflags & VPO_UNMANAGED) != 0, 7605 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7606 PG_V = pmap_valid_bit(pmap); 7607 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7608 7609 /* 7610 * In the case that a page table page is not 7611 * resident, we are creating it here. 7612 */ 7613 if (va < VM_MAXUSER_ADDRESS) { 7614 pdp_entry_t *pdpe; 7615 pd_entry_t *pde; 7616 vm_pindex_t ptepindex; 7617 7618 /* 7619 * Calculate pagetable page index 7620 */ 7621 ptepindex = pmap_pde_pindex(va); 7622 if (mpte && (mpte->pindex == ptepindex)) { 7623 mpte->ref_count++; 7624 } else { 7625 /* 7626 * If the page table page is mapped, we just increment 7627 * the hold count, and activate it. Otherwise, we 7628 * attempt to allocate a page table page, passing NULL 7629 * instead of the PV list lock pointer because we don't 7630 * intend to sleep. If this attempt fails, we don't 7631 * retry. Instead, we give up. 
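 * Giving up is acceptable because this mapping is merely a speculative
 * optimization; a later access to the page will fault and be handled by
 * pmap_enter().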
7632 */ 7633 pdpe = pmap_pdpe(pmap, va); 7634 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 7635 if ((*pdpe & PG_PS) != 0) 7636 return (NULL); 7637 pde = pmap_pdpe_to_pde(pdpe, va); 7638 if ((*pde & PG_V) != 0) { 7639 if ((*pde & PG_PS) != 0) 7640 return (NULL); 7641 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7642 mpte->ref_count++; 7643 } else { 7644 mpte = pmap_allocpte_alloc(pmap, 7645 ptepindex, NULL, va); 7646 if (mpte == NULL) 7647 return (NULL); 7648 } 7649 } else { 7650 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7651 NULL, va); 7652 if (mpte == NULL) 7653 return (NULL); 7654 } 7655 } 7656 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7657 pte = &pte[pmap_pte_index(va)]; 7658 } else { 7659 mpte = NULL; 7660 pte = vtopte(va); 7661 } 7662 if (*pte) { 7663 if (mpte != NULL) 7664 mpte->ref_count--; 7665 return (NULL); 7666 } 7667 7668 /* 7669 * Enter on the PV list if part of our managed memory. 7670 */ 7671 if ((m->oflags & VPO_UNMANAGED) == 0 && 7672 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7673 if (mpte != NULL) 7674 pmap_abort_ptp(pmap, va, mpte); 7675 return (NULL); 7676 } 7677 7678 /* 7679 * Increment counters 7680 */ 7681 pmap_resident_count_adj(pmap, 1); 7682 7683 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7684 pmap_cache_bits(pmap, m->md.pat_mode, 0); 7685 if ((m->oflags & VPO_UNMANAGED) == 0) 7686 newpte |= PG_MANAGED; 7687 if ((prot & VM_PROT_EXECUTE) == 0) 7688 newpte |= pg_nx; 7689 if (va < VM_MAXUSER_ADDRESS) 7690 newpte |= PG_U | pmap_pkru_get(pmap, va); 7691 pte_store(pte, newpte); 7692 return (mpte); 7693 } 7694 7695 /* 7696 * Make a temporary mapping for a physical address. This is only intended 7697 * to be used for panic dumps. 7698 */ 7699 void * 7700 pmap_kenter_temporary(vm_paddr_t pa, int i) 7701 { 7702 vm_offset_t va; 7703 7704 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7705 pmap_kenter(va, pa); 7706 pmap_invlpg(kernel_pmap, va); 7707 return ((void *)crashdumpmap); 7708 } 7709 7710 /* 7711 * This code maps large physical mmap regions into the 7712 * processor address space. Note that some shortcuts 7713 * are taken, but the code works. 7714 */ 7715 void 7716 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7717 vm_pindex_t pindex, vm_size_t size) 7718 { 7719 pd_entry_t *pde; 7720 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7721 vm_paddr_t pa, ptepa; 7722 vm_page_t p, pdpg; 7723 int pat_mode; 7724 7725 PG_A = pmap_accessed_bit(pmap); 7726 PG_M = pmap_modified_bit(pmap); 7727 PG_V = pmap_valid_bit(pmap); 7728 PG_RW = pmap_rw_bit(pmap); 7729 7730 VM_OBJECT_ASSERT_WLOCKED(object); 7731 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7732 ("pmap_object_init_pt: non-device object")); 7733 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7734 if (!pmap_ps_enabled(pmap)) 7735 return; 7736 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7737 return; 7738 p = vm_page_lookup(object, pindex); 7739 KASSERT(vm_page_all_valid(p), 7740 ("pmap_object_init_pt: invalid page %p", p)); 7741 pat_mode = p->md.pat_mode; 7742 7743 /* 7744 * Abort the mapping if the first page is not physically 7745 * aligned to a 2MB page boundary. 7746 */ 7747 ptepa = VM_PAGE_TO_PHYS(p); 7748 if (ptepa & (NBPDR - 1)) 7749 return; 7750 7751 /* 7752 * Skip the first page. Abort the mapping if the rest of 7753 * the pages are not physically contiguous or have differing 7754 * memory attributes. 
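 * For example, a 4MB request must be backed by 1024 physically
 * contiguous 4KB pages, all with the same pat_mode, before the two 2MB
 * mappings are created.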
7755 */ 7756 p = TAILQ_NEXT(p, listq); 7757 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7758 pa += PAGE_SIZE) { 7759 KASSERT(vm_page_all_valid(p), 7760 ("pmap_object_init_pt: invalid page %p", p)); 7761 if (pa != VM_PAGE_TO_PHYS(p) || 7762 pat_mode != p->md.pat_mode) 7763 return; 7764 p = TAILQ_NEXT(p, listq); 7765 } 7766 7767 /* 7768 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7769 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7770 * will not affect the termination of this loop. 7771 */ 7772 PMAP_LOCK(pmap); 7773 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 7774 pa < ptepa + size; pa += NBPDR) { 7775 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7776 if (pde == NULL) { 7777 /* 7778 * The creation of mappings below is only an 7779 * optimization. If a page directory page 7780 * cannot be allocated without blocking, 7781 * continue on to the next mapping rather than 7782 * blocking. 7783 */ 7784 addr += NBPDR; 7785 continue; 7786 } 7787 if ((*pde & PG_V) == 0) { 7788 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7789 PG_U | PG_RW | PG_V); 7790 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7791 counter_u64_add(pmap_pde_mappings, 1); 7792 } else { 7793 /* Continue on if the PDE is already valid. */ 7794 pdpg->ref_count--; 7795 KASSERT(pdpg->ref_count > 0, 7796 ("pmap_object_init_pt: missing reference " 7797 "to page directory page, va: 0x%lx", addr)); 7798 } 7799 addr += NBPDR; 7800 } 7801 PMAP_UNLOCK(pmap); 7802 } 7803 } 7804 7805 /* 7806 * Clear the wired attribute from the mappings for the specified range of 7807 * addresses in the given pmap. Every valid mapping within that range 7808 * must have the wired attribute set. In contrast, invalid mappings 7809 * cannot have the wired attribute set, so they are ignored. 7810 * 7811 * The wired attribute of the page table entry is not a hardware 7812 * feature, so there is no need to invalidate any TLB entries. 7813 * Since pmap_demote_pde() for the wired entry must never fail, 7814 * pmap_delayed_invl_start()/finish() calls around the 7815 * function are not needed. 
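 * Fully unwiring a 2MB mapping merely clears PG_W and subtracts
 * NBPDR / PAGE_SIZE (512) from the wired count; unwiring only part of
 * it requires that the mapping first be demoted to 4KB pages.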
7816 */ 7817 void 7818 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7819 { 7820 vm_offset_t va_next; 7821 pml4_entry_t *pml4e; 7822 pdp_entry_t *pdpe; 7823 pd_entry_t *pde; 7824 pt_entry_t *pte, PG_V, PG_G __diagused; 7825 7826 PG_V = pmap_valid_bit(pmap); 7827 PG_G = pmap_global_bit(pmap); 7828 PMAP_LOCK(pmap); 7829 for (; sva < eva; sva = va_next) { 7830 pml4e = pmap_pml4e(pmap, sva); 7831 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7832 va_next = (sva + NBPML4) & ~PML4MASK; 7833 if (va_next < sva) 7834 va_next = eva; 7835 continue; 7836 } 7837 7838 va_next = (sva + NBPDP) & ~PDPMASK; 7839 if (va_next < sva) 7840 va_next = eva; 7841 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7842 if ((*pdpe & PG_V) == 0) 7843 continue; 7844 if ((*pdpe & PG_PS) != 0) { 7845 KASSERT(va_next <= eva, 7846 ("partial update of non-transparent 1G mapping " 7847 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7848 *pdpe, sva, eva, va_next)); 7849 MPASS(pmap != kernel_pmap); /* XXXKIB */ 7850 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 7851 atomic_clear_long(pdpe, PG_W); 7852 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 7853 continue; 7854 } 7855 7856 va_next = (sva + NBPDR) & ~PDRMASK; 7857 if (va_next < sva) 7858 va_next = eva; 7859 pde = pmap_pdpe_to_pde(pdpe, sva); 7860 if ((*pde & PG_V) == 0) 7861 continue; 7862 if ((*pde & PG_PS) != 0) { 7863 if ((*pde & PG_W) == 0) 7864 panic("pmap_unwire: pde %#jx is missing PG_W", 7865 (uintmax_t)*pde); 7866 7867 /* 7868 * Are we unwiring the entire large page? If not, 7869 * demote the mapping and fall through. 7870 */ 7871 if (sva + NBPDR == va_next && eva >= va_next) { 7872 atomic_clear_long(pde, PG_W); 7873 pmap->pm_stats.wired_count -= NBPDR / 7874 PAGE_SIZE; 7875 continue; 7876 } else if (!pmap_demote_pde(pmap, pde, sva)) 7877 panic("pmap_unwire: demotion failed"); 7878 } 7879 if (va_next > eva) 7880 va_next = eva; 7881 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 7882 sva += PAGE_SIZE) { 7883 if ((*pte & PG_V) == 0) 7884 continue; 7885 if ((*pte & PG_W) == 0) 7886 panic("pmap_unwire: pte %#jx is missing PG_W", 7887 (uintmax_t)*pte); 7888 7889 /* 7890 * PG_W must be cleared atomically. Although the pmap 7891 * lock synchronizes access to PG_W, another processor 7892 * could be setting PG_M and/or PG_A concurrently. 7893 */ 7894 atomic_clear_long(pte, PG_W); 7895 pmap->pm_stats.wired_count--; 7896 } 7897 } 7898 PMAP_UNLOCK(pmap); 7899 } 7900 7901 /* 7902 * Copy the range specified by src_addr/len 7903 * from the source map to the range dst_addr/len 7904 * in the destination map. 7905 * 7906 * This routine is only advisory and need not do anything. 7907 */ 7908 void 7909 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 7910 vm_offset_t src_addr) 7911 { 7912 struct rwlock *lock; 7913 pml4_entry_t *pml4e; 7914 pdp_entry_t *pdpe; 7915 pd_entry_t *pde, srcptepaddr; 7916 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 7917 vm_offset_t addr, end_addr, va_next; 7918 vm_page_t dst_pdpg, dstmpte, srcmpte; 7919 7920 if (dst_addr != src_addr) 7921 return; 7922 7923 if (dst_pmap->pm_type != src_pmap->pm_type) 7924 return; 7925 7926 /* 7927 * EPT page table entries that require emulation of A/D bits are 7928 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 7929 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 7930 * (aka EPT_PG_EXECUTE) could still be set. 
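 * The result would be an execute-only (XWR == 100b) entry.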
Since some EPT 7931 * implementations flag an EPT misconfiguration for exec-only 7932 * mappings we skip this function entirely for emulated pmaps. 7933 */ 7934 if (pmap_emulate_ad_bits(dst_pmap)) 7935 return; 7936 7937 end_addr = src_addr + len; 7938 lock = NULL; 7939 if (dst_pmap < src_pmap) { 7940 PMAP_LOCK(dst_pmap); 7941 PMAP_LOCK(src_pmap); 7942 } else { 7943 PMAP_LOCK(src_pmap); 7944 PMAP_LOCK(dst_pmap); 7945 } 7946 7947 PG_A = pmap_accessed_bit(dst_pmap); 7948 PG_M = pmap_modified_bit(dst_pmap); 7949 PG_V = pmap_valid_bit(dst_pmap); 7950 7951 for (addr = src_addr; addr < end_addr; addr = va_next) { 7952 KASSERT(addr < UPT_MIN_ADDRESS, 7953 ("pmap_copy: invalid to pmap_copy page tables")); 7954 7955 pml4e = pmap_pml4e(src_pmap, addr); 7956 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7957 va_next = (addr + NBPML4) & ~PML4MASK; 7958 if (va_next < addr) 7959 va_next = end_addr; 7960 continue; 7961 } 7962 7963 va_next = (addr + NBPDP) & ~PDPMASK; 7964 if (va_next < addr) 7965 va_next = end_addr; 7966 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 7967 if ((*pdpe & PG_V) == 0) 7968 continue; 7969 if ((*pdpe & PG_PS) != 0) { 7970 KASSERT(va_next <= end_addr, 7971 ("partial update of non-transparent 1G mapping " 7972 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7973 *pdpe, addr, end_addr, va_next)); 7974 MPASS((addr & PDPMASK) == 0); 7975 MPASS((*pdpe & PG_MANAGED) == 0); 7976 srcptepaddr = *pdpe; 7977 pdpe = pmap_pdpe(dst_pmap, addr); 7978 if (pdpe == NULL) { 7979 if (pmap_allocpte_alloc(dst_pmap, 7980 pmap_pml4e_pindex(addr), NULL, addr) == 7981 NULL) 7982 break; 7983 pdpe = pmap_pdpe(dst_pmap, addr); 7984 } else { 7985 pml4e = pmap_pml4e(dst_pmap, addr); 7986 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7987 dst_pdpg->ref_count++; 7988 } 7989 KASSERT(*pdpe == 0, 7990 ("1G mapping present in dst pmap " 7991 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7992 *pdpe, addr, end_addr, va_next)); 7993 *pdpe = srcptepaddr & ~PG_W; 7994 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 7995 continue; 7996 } 7997 7998 va_next = (addr + NBPDR) & ~PDRMASK; 7999 if (va_next < addr) 8000 va_next = end_addr; 8001 8002 pde = pmap_pdpe_to_pde(pdpe, addr); 8003 srcptepaddr = *pde; 8004 if (srcptepaddr == 0) 8005 continue; 8006 8007 if (srcptepaddr & PG_PS) { 8008 /* 8009 * We can only virtual copy whole superpages. 8010 */ 8011 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 8012 continue; 8013 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 8014 if (pde == NULL) 8015 break; 8016 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 8017 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 8018 PMAP_ENTER_NORECLAIM, &lock))) { 8019 /* 8020 * We leave the dirty bit unchanged because 8021 * managed read/write superpage mappings are 8022 * required to be dirty. However, managed 8023 * superpage mappings are not required to 8024 * have their accessed bit set, so we clear 8025 * it because we don't know if this mapping 8026 * will be used. 
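 * The wired bit is likewise cleared, since wiring is a property of the
 * source mapping and is not carried over to the copy.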
8027 */ 8028 srcptepaddr &= ~PG_W; 8029 if ((srcptepaddr & PG_MANAGED) != 0) 8030 srcptepaddr &= ~PG_A; 8031 *pde = srcptepaddr; 8032 pmap_resident_count_adj(dst_pmap, NBPDR / 8033 PAGE_SIZE); 8034 counter_u64_add(pmap_pde_mappings, 1); 8035 } else 8036 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 8037 continue; 8038 } 8039 8040 srcptepaddr &= PG_FRAME; 8041 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 8042 KASSERT(srcmpte->ref_count > 0, 8043 ("pmap_copy: source page table page is unused")); 8044 8045 if (va_next > end_addr) 8046 va_next = end_addr; 8047 8048 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 8049 src_pte = &src_pte[pmap_pte_index(addr)]; 8050 dstmpte = NULL; 8051 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 8052 ptetemp = *src_pte; 8053 8054 /* 8055 * We only virtual copy managed pages. 8056 */ 8057 if ((ptetemp & PG_MANAGED) == 0) 8058 continue; 8059 8060 if (dstmpte != NULL) { 8061 KASSERT(dstmpte->pindex == 8062 pmap_pde_pindex(addr), 8063 ("dstmpte pindex/addr mismatch")); 8064 dstmpte->ref_count++; 8065 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 8066 NULL)) == NULL) 8067 goto out; 8068 dst_pte = (pt_entry_t *) 8069 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 8070 dst_pte = &dst_pte[pmap_pte_index(addr)]; 8071 if (*dst_pte == 0 && 8072 pmap_try_insert_pv_entry(dst_pmap, addr, 8073 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 8074 /* 8075 * Clear the wired, modified, and accessed 8076 * (referenced) bits during the copy. 8077 */ 8078 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 8079 pmap_resident_count_adj(dst_pmap, 1); 8080 } else { 8081 pmap_abort_ptp(dst_pmap, addr, dstmpte); 8082 goto out; 8083 } 8084 /* Have we copied all of the valid mappings? */ 8085 if (dstmpte->ref_count >= srcmpte->ref_count) 8086 break; 8087 } 8088 } 8089 out: 8090 if (lock != NULL) 8091 rw_wunlock(lock); 8092 PMAP_UNLOCK(src_pmap); 8093 PMAP_UNLOCK(dst_pmap); 8094 } 8095 8096 int 8097 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8098 { 8099 int error; 8100 8101 if (dst_pmap->pm_type != src_pmap->pm_type || 8102 dst_pmap->pm_type != PT_X86 || 8103 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8104 return (0); 8105 for (;;) { 8106 if (dst_pmap < src_pmap) { 8107 PMAP_LOCK(dst_pmap); 8108 PMAP_LOCK(src_pmap); 8109 } else { 8110 PMAP_LOCK(src_pmap); 8111 PMAP_LOCK(dst_pmap); 8112 } 8113 error = pmap_pkru_copy(dst_pmap, src_pmap); 8114 /* Clean up partial copy on failure due to no memory. */ 8115 if (error == ENOMEM) 8116 pmap_pkru_deassign_all(dst_pmap); 8117 PMAP_UNLOCK(src_pmap); 8118 PMAP_UNLOCK(dst_pmap); 8119 if (error != ENOMEM) 8120 break; 8121 vm_wait(NULL); 8122 } 8123 return (error); 8124 } 8125 8126 /* 8127 * Zero the specified hardware page. 8128 */ 8129 void 8130 pmap_zero_page(vm_page_t m) 8131 { 8132 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8133 8134 pagezero((void *)va); 8135 } 8136 8137 /* 8138 * Zero an area within a single hardware page. off and size must not 8139 * cover an area beyond a single hardware page. 8140 */ 8141 void 8142 pmap_zero_page_area(vm_page_t m, int off, int size) 8143 { 8144 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8145 8146 if (off == 0 && size == PAGE_SIZE) 8147 pagezero((void *)va); 8148 else 8149 bzero((char *)va + off, size); 8150 } 8151 8152 /* 8153 * Copy 1 specified hardware page to another. 
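 * Both pages are addressed through the direct map, so no temporary
 * kernel mappings are needed.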
8154 */ 8155 void 8156 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8157 { 8158 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8159 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8160 8161 pagecopy((void *)src, (void *)dst); 8162 } 8163 8164 int unmapped_buf_allowed = 1; 8165 8166 void 8167 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8168 vm_offset_t b_offset, int xfersize) 8169 { 8170 void *a_cp, *b_cp; 8171 vm_page_t pages[2]; 8172 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8173 int cnt; 8174 boolean_t mapped; 8175 8176 while (xfersize > 0) { 8177 a_pg_offset = a_offset & PAGE_MASK; 8178 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8179 b_pg_offset = b_offset & PAGE_MASK; 8180 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8181 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8182 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8183 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 8184 a_cp = (char *)vaddr[0] + a_pg_offset; 8185 b_cp = (char *)vaddr[1] + b_pg_offset; 8186 bcopy(a_cp, b_cp, cnt); 8187 if (__predict_false(mapped)) 8188 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 8189 a_offset += cnt; 8190 b_offset += cnt; 8191 xfersize -= cnt; 8192 } 8193 } 8194 8195 /* 8196 * Returns true if the pmap's pv is one of the first 8197 * 16 pvs linked to from this page. This count may 8198 * be changed upwards or downwards in the future; it 8199 * is only necessary that true be returned for a small 8200 * subset of pmaps for proper page aging. 8201 */ 8202 boolean_t 8203 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8204 { 8205 struct md_page *pvh; 8206 struct rwlock *lock; 8207 pv_entry_t pv; 8208 int loops = 0; 8209 boolean_t rv; 8210 8211 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8212 ("pmap_page_exists_quick: page %p is not managed", m)); 8213 rv = FALSE; 8214 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8215 rw_rlock(lock); 8216 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8217 if (PV_PMAP(pv) == pmap) { 8218 rv = TRUE; 8219 break; 8220 } 8221 loops++; 8222 if (loops >= 16) 8223 break; 8224 } 8225 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8226 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8227 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8228 if (PV_PMAP(pv) == pmap) { 8229 rv = TRUE; 8230 break; 8231 } 8232 loops++; 8233 if (loops >= 16) 8234 break; 8235 } 8236 } 8237 rw_runlock(lock); 8238 return (rv); 8239 } 8240 8241 /* 8242 * pmap_page_wired_mappings: 8243 * 8244 * Return the number of managed mappings to the given physical page 8245 * that are wired. 
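 * Both 4KB mappings (found via the page's own pv_list) and 2MB
 * mappings (found via the containing superpage's pv_list) are counted.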
8246 */ 8247 int 8248 pmap_page_wired_mappings(vm_page_t m) 8249 { 8250 struct rwlock *lock; 8251 struct md_page *pvh; 8252 pmap_t pmap; 8253 pt_entry_t *pte; 8254 pv_entry_t pv; 8255 int count, md_gen, pvh_gen; 8256 8257 if ((m->oflags & VPO_UNMANAGED) != 0) 8258 return (0); 8259 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8260 rw_rlock(lock); 8261 restart: 8262 count = 0; 8263 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8264 pmap = PV_PMAP(pv); 8265 if (!PMAP_TRYLOCK(pmap)) { 8266 md_gen = m->md.pv_gen; 8267 rw_runlock(lock); 8268 PMAP_LOCK(pmap); 8269 rw_rlock(lock); 8270 if (md_gen != m->md.pv_gen) { 8271 PMAP_UNLOCK(pmap); 8272 goto restart; 8273 } 8274 } 8275 pte = pmap_pte(pmap, pv->pv_va); 8276 if ((*pte & PG_W) != 0) 8277 count++; 8278 PMAP_UNLOCK(pmap); 8279 } 8280 if ((m->flags & PG_FICTITIOUS) == 0) { 8281 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8282 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8283 pmap = PV_PMAP(pv); 8284 if (!PMAP_TRYLOCK(pmap)) { 8285 md_gen = m->md.pv_gen; 8286 pvh_gen = pvh->pv_gen; 8287 rw_runlock(lock); 8288 PMAP_LOCK(pmap); 8289 rw_rlock(lock); 8290 if (md_gen != m->md.pv_gen || 8291 pvh_gen != pvh->pv_gen) { 8292 PMAP_UNLOCK(pmap); 8293 goto restart; 8294 } 8295 } 8296 pte = pmap_pde(pmap, pv->pv_va); 8297 if ((*pte & PG_W) != 0) 8298 count++; 8299 PMAP_UNLOCK(pmap); 8300 } 8301 } 8302 rw_runlock(lock); 8303 return (count); 8304 } 8305 8306 /* 8307 * Returns TRUE if the given page is mapped individually or as part of 8308 * a 2mpage. Otherwise, returns FALSE. 8309 */ 8310 boolean_t 8311 pmap_page_is_mapped(vm_page_t m) 8312 { 8313 struct rwlock *lock; 8314 boolean_t rv; 8315 8316 if ((m->oflags & VPO_UNMANAGED) != 0) 8317 return (FALSE); 8318 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8319 rw_rlock(lock); 8320 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8321 ((m->flags & PG_FICTITIOUS) == 0 && 8322 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8323 rw_runlock(lock); 8324 return (rv); 8325 } 8326 8327 /* 8328 * Destroy all managed, non-wired mappings in the given user-space 8329 * pmap. This pmap cannot be active on any processor besides the 8330 * caller. 8331 * 8332 * This function cannot be applied to the kernel pmap. Moreover, it 8333 * is not intended for general use. It is only to be used during 8334 * process termination. Consequently, it can be implemented in ways 8335 * that make it faster than pmap_remove(). First, it can more quickly 8336 * destroy mappings by iterating over the pmap's collection of PV 8337 * entries, rather than searching the page table. Second, it doesn't 8338 * have to test and clear the page table entries atomically, because 8339 * no processor is currently accessing the user address space. In 8340 * particular, a page table entry's dirty bit won't change state once 8341 * this function starts. 8342 * 8343 * Although this function destroys all of the pmap's managed, 8344 * non-wired mappings, it can delay and batch the invalidation of TLB 8345 * entries without calling pmap_delayed_invl_start() and 8346 * pmap_delayed_invl_finish(). Because the pmap is not active on 8347 * any other processor, none of these TLB entries will ever be used 8348 * before their eventual invalidation. Consequently, there is no need 8349 * for either pmap_remove_all() or pmap_remove_write() to wait for 8350 * that eventual TLB invalidation. 
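 * Instead, a single call to pmap_invalidate_all() suffices once every
 * PV chunk has been processed.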
8351 */ 8352 void 8353 pmap_remove_pages(pmap_t pmap) 8354 { 8355 pd_entry_t ptepde; 8356 pt_entry_t *pte, tpte; 8357 pt_entry_t PG_M, PG_RW, PG_V; 8358 struct spglist free; 8359 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8360 vm_page_t m, mpte, mt; 8361 pv_entry_t pv; 8362 struct md_page *pvh; 8363 struct pv_chunk *pc, *npc; 8364 struct rwlock *lock; 8365 int64_t bit; 8366 uint64_t inuse, bitmask; 8367 int allfree, field, i, idx; 8368 #ifdef PV_STATS 8369 int freed; 8370 #endif 8371 boolean_t superpage; 8372 vm_paddr_t pa; 8373 8374 /* 8375 * Assert that the given pmap is only active on the current 8376 * CPU. Unfortunately, we cannot block another CPU from 8377 * activating the pmap while this function is executing. 8378 */ 8379 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8380 #ifdef INVARIANTS 8381 { 8382 cpuset_t other_cpus; 8383 8384 other_cpus = all_cpus; 8385 critical_enter(); 8386 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8387 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 8388 critical_exit(); 8389 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8390 } 8391 #endif 8392 8393 lock = NULL; 8394 PG_M = pmap_modified_bit(pmap); 8395 PG_V = pmap_valid_bit(pmap); 8396 PG_RW = pmap_rw_bit(pmap); 8397 8398 for (i = 0; i < PMAP_MEMDOM; i++) 8399 TAILQ_INIT(&free_chunks[i]); 8400 SLIST_INIT(&free); 8401 PMAP_LOCK(pmap); 8402 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8403 allfree = 1; 8404 #ifdef PV_STATS 8405 freed = 0; 8406 #endif 8407 for (field = 0; field < _NPCM; field++) { 8408 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8409 while (inuse != 0) { 8410 bit = bsfq(inuse); 8411 bitmask = 1UL << bit; 8412 idx = field * 64 + bit; 8413 pv = &pc->pc_pventry[idx]; 8414 inuse &= ~bitmask; 8415 8416 pte = pmap_pdpe(pmap, pv->pv_va); 8417 ptepde = *pte; 8418 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8419 tpte = *pte; 8420 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8421 superpage = FALSE; 8422 ptepde = tpte; 8423 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8424 PG_FRAME); 8425 pte = &pte[pmap_pte_index(pv->pv_va)]; 8426 tpte = *pte; 8427 } else { 8428 /* 8429 * Keep track whether 'tpte' is a 8430 * superpage explicitly instead of 8431 * relying on PG_PS being set. 8432 * 8433 * This is because PG_PS is numerically 8434 * identical to PG_PTE_PAT and thus a 8435 * regular page could be mistaken for 8436 * a superpage. 8437 */ 8438 superpage = TRUE; 8439 } 8440 8441 if ((tpte & PG_V) == 0) { 8442 panic("bad pte va %lx pte %lx", 8443 pv->pv_va, tpte); 8444 } 8445 8446 /* 8447 * We cannot remove wired pages from a process' mapping at this time 8448 */ 8449 if (tpte & PG_W) { 8450 allfree = 0; 8451 continue; 8452 } 8453 8454 /* Mark free */ 8455 pc->pc_map[field] |= bitmask; 8456 8457 /* 8458 * Because this pmap is not active on other 8459 * processors, the dirty bit cannot have 8460 * changed state since we last loaded pte. 8461 */ 8462 pte_clear(pte); 8463 8464 if (superpage) 8465 pa = tpte & PG_PS_FRAME; 8466 else 8467 pa = tpte & PG_FRAME; 8468 8469 m = PHYS_TO_VM_PAGE(pa); 8470 KASSERT(m->phys_addr == pa, 8471 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8472 m, (uintmax_t)m->phys_addr, 8473 (uintmax_t)tpte)); 8474 8475 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8476 m < &vm_page_array[vm_page_array_size], 8477 ("pmap_remove_pages: bad tpte %#jx", 8478 (uintmax_t)tpte)); 8479 8480 /* 8481 * Update the vm_page_t clean/reference bits. 
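 * A mapping that is both writable (PG_RW) and modified (PG_M) implies
 * that the backing page is dirty; for a superpage, all 512 constituent
 * 4KB pages are dirtied.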
8482 */ 8483 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8484 if (superpage) { 8485 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8486 vm_page_dirty(mt); 8487 } else 8488 vm_page_dirty(m); 8489 } 8490 8491 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8492 8493 if (superpage) { 8494 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8495 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8496 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8497 pvh->pv_gen++; 8498 if (TAILQ_EMPTY(&pvh->pv_list)) { 8499 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8500 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8501 TAILQ_EMPTY(&mt->md.pv_list)) 8502 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8503 } 8504 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8505 if (mpte != NULL) { 8506 KASSERT(vm_page_all_valid(mpte), 8507 ("pmap_remove_pages: pte page not promoted")); 8508 pmap_pt_page_count_adj(pmap, -1); 8509 KASSERT(mpte->ref_count == NPTEPG, 8510 ("pmap_remove_pages: pte page reference count error")); 8511 mpte->ref_count = 0; 8512 pmap_add_delayed_free_list(mpte, &free, FALSE); 8513 } 8514 } else { 8515 pmap_resident_count_adj(pmap, -1); 8516 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8517 m->md.pv_gen++; 8518 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8519 TAILQ_EMPTY(&m->md.pv_list) && 8520 (m->flags & PG_FICTITIOUS) == 0) { 8521 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8522 if (TAILQ_EMPTY(&pvh->pv_list)) 8523 vm_page_aflag_clear(m, PGA_WRITEABLE); 8524 } 8525 } 8526 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8527 #ifdef PV_STATS 8528 freed++; 8529 #endif 8530 } 8531 } 8532 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8533 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8534 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8535 if (allfree) { 8536 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8537 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8538 } 8539 } 8540 if (lock != NULL) 8541 rw_wunlock(lock); 8542 pmap_invalidate_all(pmap); 8543 pmap_pkru_deassign_all(pmap); 8544 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8545 PMAP_UNLOCK(pmap); 8546 vm_page_free_pages_toq(&free, true); 8547 } 8548 8549 static boolean_t 8550 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 8551 { 8552 struct rwlock *lock; 8553 pv_entry_t pv; 8554 struct md_page *pvh; 8555 pt_entry_t *pte, mask; 8556 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8557 pmap_t pmap; 8558 int md_gen, pvh_gen; 8559 boolean_t rv; 8560 8561 rv = FALSE; 8562 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8563 rw_rlock(lock); 8564 restart: 8565 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8566 pmap = PV_PMAP(pv); 8567 if (!PMAP_TRYLOCK(pmap)) { 8568 md_gen = m->md.pv_gen; 8569 rw_runlock(lock); 8570 PMAP_LOCK(pmap); 8571 rw_rlock(lock); 8572 if (md_gen != m->md.pv_gen) { 8573 PMAP_UNLOCK(pmap); 8574 goto restart; 8575 } 8576 } 8577 pte = pmap_pte(pmap, pv->pv_va); 8578 mask = 0; 8579 if (modified) { 8580 PG_M = pmap_modified_bit(pmap); 8581 PG_RW = pmap_rw_bit(pmap); 8582 mask |= PG_RW | PG_M; 8583 } 8584 if (accessed) { 8585 PG_A = pmap_accessed_bit(pmap); 8586 PG_V = pmap_valid_bit(pmap); 8587 mask |= PG_V | PG_A; 8588 } 8589 rv = (*pte & mask) == mask; 8590 PMAP_UNLOCK(pmap); 8591 if (rv) 8592 goto out; 8593 } 8594 if ((m->flags & PG_FICTITIOUS) == 0) { 8595 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8596 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8597 pmap = PV_PMAP(pv); 8598 if (!PMAP_TRYLOCK(pmap)) { 8599 md_gen = m->md.pv_gen; 8600 pvh_gen = pvh->pv_gen; 8601 rw_runlock(lock); 8602 PMAP_LOCK(pmap); 8603 rw_rlock(lock); 8604 if 
(md_gen != m->md.pv_gen || 8605 pvh_gen != pvh->pv_gen) { 8606 PMAP_UNLOCK(pmap); 8607 goto restart; 8608 } 8609 } 8610 pte = pmap_pde(pmap, pv->pv_va); 8611 mask = 0; 8612 if (modified) { 8613 PG_M = pmap_modified_bit(pmap); 8614 PG_RW = pmap_rw_bit(pmap); 8615 mask |= PG_RW | PG_M; 8616 } 8617 if (accessed) { 8618 PG_A = pmap_accessed_bit(pmap); 8619 PG_V = pmap_valid_bit(pmap); 8620 mask |= PG_V | PG_A; 8621 } 8622 rv = (*pte & mask) == mask; 8623 PMAP_UNLOCK(pmap); 8624 if (rv) 8625 goto out; 8626 } 8627 } 8628 out: 8629 rw_runlock(lock); 8630 return (rv); 8631 } 8632 8633 /* 8634 * pmap_is_modified: 8635 * 8636 * Return whether or not the specified physical page was modified 8637 * in any physical maps. 8638 */ 8639 boolean_t 8640 pmap_is_modified(vm_page_t m) 8641 { 8642 8643 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8644 ("pmap_is_modified: page %p is not managed", m)); 8645 8646 /* 8647 * If the page is not busied then this check is racy. 8648 */ 8649 if (!pmap_page_is_write_mapped(m)) 8650 return (FALSE); 8651 return (pmap_page_test_mappings(m, FALSE, TRUE)); 8652 } 8653 8654 /* 8655 * pmap_is_prefaultable: 8656 * 8657 * Return whether or not the specified virtual address is eligible 8658 * for prefault. 8659 */ 8660 boolean_t 8661 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8662 { 8663 pd_entry_t *pde; 8664 pt_entry_t *pte, PG_V; 8665 boolean_t rv; 8666 8667 PG_V = pmap_valid_bit(pmap); 8668 8669 /* 8670 * Return TRUE if and only if the PTE for the specified virtual 8671 * address is allocated but invalid. 8672 */ 8673 rv = FALSE; 8674 PMAP_LOCK(pmap); 8675 pde = pmap_pde(pmap, addr); 8676 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8677 pte = pmap_pde_to_pte(pde, addr); 8678 rv = (*pte & PG_V) == 0; 8679 } 8680 PMAP_UNLOCK(pmap); 8681 return (rv); 8682 } 8683 8684 /* 8685 * pmap_is_referenced: 8686 * 8687 * Return whether or not the specified physical page was referenced 8688 * in any physical maps. 8689 */ 8690 boolean_t 8691 pmap_is_referenced(vm_page_t m) 8692 { 8693 8694 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8695 ("pmap_is_referenced: page %p is not managed", m)); 8696 return (pmap_page_test_mappings(m, TRUE, FALSE)); 8697 } 8698 8699 /* 8700 * Clear the write and modified bits in each of the given page's mappings. 8701 */ 8702 void 8703 pmap_remove_write(vm_page_t m) 8704 { 8705 struct md_page *pvh; 8706 pmap_t pmap; 8707 struct rwlock *lock; 8708 pv_entry_t next_pv, pv; 8709 pd_entry_t *pde; 8710 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8711 vm_offset_t va; 8712 int pvh_gen, md_gen; 8713 8714 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8715 ("pmap_remove_write: page %p is not managed", m)); 8716 8717 vm_page_assert_busied(m); 8718 if (!pmap_page_is_write_mapped(m)) 8719 return; 8720 8721 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8722 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8723 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8724 rw_wlock(lock); 8725 retry: 8726 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8727 pmap = PV_PMAP(pv); 8728 if (!PMAP_TRYLOCK(pmap)) { 8729 pvh_gen = pvh->pv_gen; 8730 rw_wunlock(lock); 8731 PMAP_LOCK(pmap); 8732 rw_wlock(lock); 8733 if (pvh_gen != pvh->pv_gen) { 8734 PMAP_UNLOCK(pmap); 8735 goto retry; 8736 } 8737 } 8738 PG_RW = pmap_rw_bit(pmap); 8739 va = pv->pv_va; 8740 pde = pmap_pde(pmap, va); 8741 if ((*pde & PG_RW) != 0) 8742 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8743 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8744 ("inconsistent pv lock %p %p for page %p", 8745 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8746 PMAP_UNLOCK(pmap); 8747 } 8748 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8749 pmap = PV_PMAP(pv); 8750 if (!PMAP_TRYLOCK(pmap)) { 8751 pvh_gen = pvh->pv_gen; 8752 md_gen = m->md.pv_gen; 8753 rw_wunlock(lock); 8754 PMAP_LOCK(pmap); 8755 rw_wlock(lock); 8756 if (pvh_gen != pvh->pv_gen || 8757 md_gen != m->md.pv_gen) { 8758 PMAP_UNLOCK(pmap); 8759 goto retry; 8760 } 8761 } 8762 PG_M = pmap_modified_bit(pmap); 8763 PG_RW = pmap_rw_bit(pmap); 8764 pde = pmap_pde(pmap, pv->pv_va); 8765 KASSERT((*pde & PG_PS) == 0, 8766 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8767 m)); 8768 pte = pmap_pde_to_pte(pde, pv->pv_va); 8769 oldpte = *pte; 8770 if (oldpte & PG_RW) { 8771 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8772 ~(PG_RW | PG_M))) 8773 cpu_spinwait(); 8774 if ((oldpte & PG_M) != 0) 8775 vm_page_dirty(m); 8776 pmap_invalidate_page(pmap, pv->pv_va); 8777 } 8778 PMAP_UNLOCK(pmap); 8779 } 8780 rw_wunlock(lock); 8781 vm_page_aflag_clear(m, PGA_WRITEABLE); 8782 pmap_delayed_invl_wait(m); 8783 } 8784 8785 static __inline boolean_t 8786 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 8787 { 8788 8789 if (!pmap_emulate_ad_bits(pmap)) 8790 return (TRUE); 8791 8792 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 8793 8794 /* 8795 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 8796 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 8797 * if the EPT_PG_WRITE bit is set. 8798 */ 8799 if ((pte & EPT_PG_WRITE) != 0) 8800 return (FALSE); 8801 8802 /* 8803 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 8804 */ 8805 if ((pte & EPT_PG_EXECUTE) == 0 || 8806 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 8807 return (TRUE); 8808 else 8809 return (FALSE); 8810 } 8811 8812 /* 8813 * pmap_ts_referenced: 8814 * 8815 * Return a count of reference bits for a page, clearing those bits. 8816 * It is not necessary for every reference bit to be cleared, but it 8817 * is necessary that 0 only be returned when there are truly no 8818 * reference bits set. 8819 * 8820 * As an optimization, update the page's dirty field if a modified bit is 8821 * found while counting reference bits. This opportunistic update can be 8822 * performed at low cost and can eliminate the need for some future calls 8823 * to pmap_is_modified(). However, since this function stops after 8824 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8825 * dirty pages. Those dirty pages will only be detected by a future call 8826 * to pmap_is_modified(). 8827 * 8828 * A DI block is not needed within this function, because 8829 * invalidations are performed before the PV list lock is 8830 * released. 
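 * 2MB mappings are examined before 4KB mappings so that a superpage's
 * shared reference bit is sampled first.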
8831 */ 8832 int 8833 pmap_ts_referenced(vm_page_t m) 8834 { 8835 struct md_page *pvh; 8836 pv_entry_t pv, pvf; 8837 pmap_t pmap; 8838 struct rwlock *lock; 8839 pd_entry_t oldpde, *pde; 8840 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8841 vm_offset_t va; 8842 vm_paddr_t pa; 8843 int cleared, md_gen, not_cleared, pvh_gen; 8844 struct spglist free; 8845 boolean_t demoted; 8846 8847 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8848 ("pmap_ts_referenced: page %p is not managed", m)); 8849 SLIST_INIT(&free); 8850 cleared = 0; 8851 pa = VM_PAGE_TO_PHYS(m); 8852 lock = PHYS_TO_PV_LIST_LOCK(pa); 8853 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 8854 rw_wlock(lock); 8855 retry: 8856 not_cleared = 0; 8857 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 8858 goto small_mappings; 8859 pv = pvf; 8860 do { 8861 if (pvf == NULL) 8862 pvf = pv; 8863 pmap = PV_PMAP(pv); 8864 if (!PMAP_TRYLOCK(pmap)) { 8865 pvh_gen = pvh->pv_gen; 8866 rw_wunlock(lock); 8867 PMAP_LOCK(pmap); 8868 rw_wlock(lock); 8869 if (pvh_gen != pvh->pv_gen) { 8870 PMAP_UNLOCK(pmap); 8871 goto retry; 8872 } 8873 } 8874 PG_A = pmap_accessed_bit(pmap); 8875 PG_M = pmap_modified_bit(pmap); 8876 PG_RW = pmap_rw_bit(pmap); 8877 va = pv->pv_va; 8878 pde = pmap_pde(pmap, pv->pv_va); 8879 oldpde = *pde; 8880 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8881 /* 8882 * Although "oldpde" is mapping a 2MB page, because 8883 * this function is called at a 4KB page granularity, 8884 * we only update the 4KB page under test. 8885 */ 8886 vm_page_dirty(m); 8887 } 8888 if ((oldpde & PG_A) != 0) { 8889 /* 8890 * Since this reference bit is shared by 512 4KB 8891 * pages, it should not be cleared every time it is 8892 * tested. Apply a simple "hash" function on the 8893 * physical page number, the virtual superpage number, 8894 * and the pmap address to select one 4KB page out of 8895 * the 512 on which testing the reference bit will 8896 * result in clearing that reference bit. This 8897 * function is designed to avoid the selection of the 8898 * same 4KB page for every 2MB page mapping. 8899 * 8900 * On demotion, a mapping that hasn't been referenced 8901 * is simply destroyed. To avoid the possibility of a 8902 * subsequent page fault on a demoted wired mapping, 8903 * always leave its reference bit set. Moreover, 8904 * since the superpage is wired, the current state of 8905 * its reference bit won't affect page replacement. 8906 */ 8907 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 8908 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 8909 (oldpde & PG_W) == 0) { 8910 if (safe_to_clear_referenced(pmap, oldpde)) { 8911 atomic_clear_long(pde, PG_A); 8912 pmap_invalidate_page(pmap, pv->pv_va); 8913 demoted = FALSE; 8914 } else if (pmap_demote_pde_locked(pmap, pde, 8915 pv->pv_va, &lock)) { 8916 /* 8917 * Remove the mapping to a single page 8918 * so that a subsequent access may 8919 * repromote. Since the underlying 8920 * page table page is fully populated, 8921 * this removal never frees a page 8922 * table page. 8923 */ 8924 demoted = TRUE; 8925 va += VM_PAGE_TO_PHYS(m) - (oldpde & 8926 PG_PS_FRAME); 8927 pte = pmap_pde_to_pte(pde, va); 8928 pmap_remove_pte(pmap, pte, va, *pde, 8929 NULL, &lock); 8930 pmap_invalidate_page(pmap, va); 8931 } else 8932 demoted = TRUE; 8933 8934 if (demoted) { 8935 /* 8936 * The superpage mapping was removed 8937 * entirely and therefore 'pv' is no 8938 * longer valid. 
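 * If 'pv' was also the traversal's starting point, clear 'pvf' so that
 * the termination check restarts from the list's new head.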
8939 */ 8940 if (pvf == pv) 8941 pvf = NULL; 8942 pv = NULL; 8943 } 8944 cleared++; 8945 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8946 ("inconsistent pv lock %p %p for page %p", 8947 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8948 } else 8949 not_cleared++; 8950 } 8951 PMAP_UNLOCK(pmap); 8952 /* Rotate the PV list if it has more than one entry. */ 8953 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 8954 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8955 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 8956 pvh->pv_gen++; 8957 } 8958 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 8959 goto out; 8960 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 8961 small_mappings: 8962 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 8963 goto out; 8964 pv = pvf; 8965 do { 8966 if (pvf == NULL) 8967 pvf = pv; 8968 pmap = PV_PMAP(pv); 8969 if (!PMAP_TRYLOCK(pmap)) { 8970 pvh_gen = pvh->pv_gen; 8971 md_gen = m->md.pv_gen; 8972 rw_wunlock(lock); 8973 PMAP_LOCK(pmap); 8974 rw_wlock(lock); 8975 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 8976 PMAP_UNLOCK(pmap); 8977 goto retry; 8978 } 8979 } 8980 PG_A = pmap_accessed_bit(pmap); 8981 PG_M = pmap_modified_bit(pmap); 8982 PG_RW = pmap_rw_bit(pmap); 8983 pde = pmap_pde(pmap, pv->pv_va); 8984 KASSERT((*pde & PG_PS) == 0, 8985 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 8986 m)); 8987 pte = pmap_pde_to_pte(pde, pv->pv_va); 8988 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 8989 vm_page_dirty(m); 8990 if ((*pte & PG_A) != 0) { 8991 if (safe_to_clear_referenced(pmap, *pte)) { 8992 atomic_clear_long(pte, PG_A); 8993 pmap_invalidate_page(pmap, pv->pv_va); 8994 cleared++; 8995 } else if ((*pte & PG_W) == 0) { 8996 /* 8997 * Wired pages cannot be paged out so 8998 * doing accessed bit emulation for 8999 * them is wasted effort. We do the 9000 * hard work for unwired pages only. 9001 */ 9002 pmap_remove_pte(pmap, pte, pv->pv_va, 9003 *pde, &free, &lock); 9004 pmap_invalidate_page(pmap, pv->pv_va); 9005 cleared++; 9006 if (pvf == pv) 9007 pvf = NULL; 9008 pv = NULL; 9009 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9010 ("inconsistent pv lock %p %p for page %p", 9011 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9012 } else 9013 not_cleared++; 9014 } 9015 PMAP_UNLOCK(pmap); 9016 /* Rotate the PV list if it has more than one entry. */ 9017 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9018 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 9019 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 9020 m->md.pv_gen++; 9021 } 9022 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 9023 not_cleared < PMAP_TS_REFERENCED_MAX); 9024 out: 9025 rw_wunlock(lock); 9026 vm_page_free_pages_toq(&free, true); 9027 return (cleared + not_cleared); 9028 } 9029 9030 /* 9031 * Apply the given advice to the specified range of addresses within the 9032 * given pmap. Depending on the advice, clear the referenced and/or 9033 * modified flags in each mapping and set the mapped page's dirty field. 9034 */ 9035 void 9036 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 9037 { 9038 struct rwlock *lock; 9039 pml4_entry_t *pml4e; 9040 pdp_entry_t *pdpe; 9041 pd_entry_t oldpde, *pde; 9042 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 9043 vm_offset_t va, va_next; 9044 vm_page_t m; 9045 bool anychanged; 9046 9047 if (advice != MADV_DONTNEED && advice != MADV_FREE) 9048 return; 9049 9050 /* 9051 * A/D bit emulation requires an alternate code path when clearing 9052 * the modified and accessed bits below. 
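 * (See safe_to_clear_referenced() above for the constraints on clearing
 * the referenced bit, i.e. EPT_PG_READ, in such pmaps.)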
Since this function is 9053 * advisory in nature we skip it entirely for pmaps that require 9054 * A/D bit emulation. 9055 */ 9056 if (pmap_emulate_ad_bits(pmap)) 9057 return; 9058 9059 PG_A = pmap_accessed_bit(pmap); 9060 PG_G = pmap_global_bit(pmap); 9061 PG_M = pmap_modified_bit(pmap); 9062 PG_V = pmap_valid_bit(pmap); 9063 PG_RW = pmap_rw_bit(pmap); 9064 anychanged = false; 9065 pmap_delayed_invl_start(); 9066 PMAP_LOCK(pmap); 9067 for (; sva < eva; sva = va_next) { 9068 pml4e = pmap_pml4e(pmap, sva); 9069 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 9070 va_next = (sva + NBPML4) & ~PML4MASK; 9071 if (va_next < sva) 9072 va_next = eva; 9073 continue; 9074 } 9075 9076 va_next = (sva + NBPDP) & ~PDPMASK; 9077 if (va_next < sva) 9078 va_next = eva; 9079 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 9080 if ((*pdpe & PG_V) == 0) 9081 continue; 9082 if ((*pdpe & PG_PS) != 0) 9083 continue; 9084 9085 va_next = (sva + NBPDR) & ~PDRMASK; 9086 if (va_next < sva) 9087 va_next = eva; 9088 pde = pmap_pdpe_to_pde(pdpe, sva); 9089 oldpde = *pde; 9090 if ((oldpde & PG_V) == 0) 9091 continue; 9092 else if ((oldpde & PG_PS) != 0) { 9093 if ((oldpde & PG_MANAGED) == 0) 9094 continue; 9095 lock = NULL; 9096 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9097 if (lock != NULL) 9098 rw_wunlock(lock); 9099 9100 /* 9101 * The large page mapping was destroyed. 9102 */ 9103 continue; 9104 } 9105 9106 /* 9107 * Unless the page mappings are wired, remove the 9108 * mapping to a single page so that a subsequent 9109 * access may repromote. Choosing the last page 9110 * within the address range [sva, min(va_next, eva)) 9111 * generally results in more repromotions. Since the 9112 * underlying page table page is fully populated, this 9113 * removal never frees a page table page. 9114 */ 9115 if ((oldpde & PG_W) == 0) { 9116 va = eva; 9117 if (va > va_next) 9118 va = va_next; 9119 va -= PAGE_SIZE; 9120 KASSERT(va >= sva, 9121 ("pmap_advise: no address gap")); 9122 pte = pmap_pde_to_pte(pde, va); 9123 KASSERT((*pte & PG_V) != 0, 9124 ("pmap_advise: invalid PTE")); 9125 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9126 &lock); 9127 anychanged = true; 9128 } 9129 if (lock != NULL) 9130 rw_wunlock(lock); 9131 } 9132 if (va_next > eva) 9133 va_next = eva; 9134 va = va_next; 9135 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9136 sva += PAGE_SIZE) { 9137 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9138 goto maybe_invlrng; 9139 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9140 if (advice == MADV_DONTNEED) { 9141 /* 9142 * Future calls to pmap_is_modified() 9143 * can be avoided by making the page 9144 * dirty now. 9145 */ 9146 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9147 vm_page_dirty(m); 9148 } 9149 atomic_clear_long(pte, PG_M | PG_A); 9150 } else if ((*pte & PG_A) != 0) 9151 atomic_clear_long(pte, PG_A); 9152 else 9153 goto maybe_invlrng; 9154 9155 if ((*pte & PG_G) != 0) { 9156 if (va == va_next) 9157 va = sva; 9158 } else 9159 anychanged = true; 9160 continue; 9161 maybe_invlrng: 9162 if (va != va_next) { 9163 pmap_invalidate_range(pmap, va, sva); 9164 va = va_next; 9165 } 9166 } 9167 if (va != va_next) 9168 pmap_invalidate_range(pmap, va, sva); 9169 } 9170 if (anychanged) 9171 pmap_invalidate_all(pmap); 9172 PMAP_UNLOCK(pmap); 9173 pmap_delayed_invl_finish(); 9174 } 9175 9176 /* 9177 * Clear the modify bits on the specified physical page. 
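 * Writable 2MB mappings are demoted first so that only the 4KB page
 * under test needs to be write-protected and cleaned.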
9178 */ 9179 void 9180 pmap_clear_modify(vm_page_t m) 9181 { 9182 struct md_page *pvh; 9183 pmap_t pmap; 9184 pv_entry_t next_pv, pv; 9185 pd_entry_t oldpde, *pde; 9186 pt_entry_t *pte, PG_M, PG_RW; 9187 struct rwlock *lock; 9188 vm_offset_t va; 9189 int md_gen, pvh_gen; 9190 9191 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9192 ("pmap_clear_modify: page %p is not managed", m)); 9193 vm_page_assert_busied(m); 9194 9195 if (!pmap_page_is_write_mapped(m)) 9196 return; 9197 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9198 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9199 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9200 rw_wlock(lock); 9201 restart: 9202 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9203 pmap = PV_PMAP(pv); 9204 if (!PMAP_TRYLOCK(pmap)) { 9205 pvh_gen = pvh->pv_gen; 9206 rw_wunlock(lock); 9207 PMAP_LOCK(pmap); 9208 rw_wlock(lock); 9209 if (pvh_gen != pvh->pv_gen) { 9210 PMAP_UNLOCK(pmap); 9211 goto restart; 9212 } 9213 } 9214 PG_M = pmap_modified_bit(pmap); 9215 PG_RW = pmap_rw_bit(pmap); 9216 va = pv->pv_va; 9217 pde = pmap_pde(pmap, va); 9218 oldpde = *pde; 9219 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9220 if ((oldpde & PG_RW) != 0 && 9221 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9222 (oldpde & PG_W) == 0) { 9223 /* 9224 * Write protect the mapping to a single page so that 9225 * a subsequent write access may repromote. 9226 */ 9227 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9228 pte = pmap_pde_to_pte(pde, va); 9229 atomic_clear_long(pte, PG_M | PG_RW); 9230 vm_page_dirty(m); 9231 pmap_invalidate_page(pmap, va); 9232 } 9233 PMAP_UNLOCK(pmap); 9234 } 9235 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9236 pmap = PV_PMAP(pv); 9237 if (!PMAP_TRYLOCK(pmap)) { 9238 md_gen = m->md.pv_gen; 9239 pvh_gen = pvh->pv_gen; 9240 rw_wunlock(lock); 9241 PMAP_LOCK(pmap); 9242 rw_wlock(lock); 9243 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9244 PMAP_UNLOCK(pmap); 9245 goto restart; 9246 } 9247 } 9248 PG_M = pmap_modified_bit(pmap); 9249 PG_RW = pmap_rw_bit(pmap); 9250 pde = pmap_pde(pmap, pv->pv_va); 9251 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9252 " a 2mpage in page %p's pv list", m)); 9253 pte = pmap_pde_to_pte(pde, pv->pv_va); 9254 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9255 atomic_clear_long(pte, PG_M); 9256 pmap_invalidate_page(pmap, pv->pv_va); 9257 } 9258 PMAP_UNLOCK(pmap); 9259 } 9260 rw_wunlock(lock); 9261 } 9262 9263 /* 9264 * Miscellaneous support routines follow 9265 */ 9266 9267 /* Adjust the properties for a leaf page table entry. */ 9268 static __inline void 9269 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9270 { 9271 u_long opte, npte; 9272 9273 opte = *(u_long *)pte; 9274 do { 9275 npte = opte & ~mask; 9276 npte |= bits; 9277 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9278 npte)); 9279 } 9280 9281 /* 9282 * Map a set of physical memory pages into the kernel virtual 9283 * address space. Return a pointer to where it is mapped. This 9284 * routine is intended to be used for mapping device memory, 9285 * NOT real memory. 
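 * When the requested range lies entirely below dmaplimit, the permanent
 * direct map is reused rather than allocating fresh KVA.
 *
 * Illustrative (hypothetical) usage from a driver attach routine, where
 * "pa" and "sz" stand for the device's register window:
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(pa, sz);
 *	...
 *	pmap_unmapdev(regs, sz);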
9286 */ 9287 static void * 9288 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9289 { 9290 struct pmap_preinit_mapping *ppim; 9291 vm_offset_t va, offset; 9292 vm_size_t tmpsize; 9293 int i; 9294 9295 offset = pa & PAGE_MASK; 9296 size = round_page(offset + size); 9297 pa = trunc_page(pa); 9298 9299 if (!pmap_initialized) { 9300 va = 0; 9301 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9302 ppim = pmap_preinit_mapping + i; 9303 if (ppim->va == 0) { 9304 ppim->pa = pa; 9305 ppim->sz = size; 9306 ppim->mode = mode; 9307 ppim->va = virtual_avail; 9308 virtual_avail += size; 9309 va = ppim->va; 9310 break; 9311 } 9312 } 9313 if (va == 0) 9314 panic("%s: too many preinit mappings", __func__); 9315 } else { 9316 /* 9317 * If we have a preinit mapping, re-use it. 9318 */ 9319 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9320 ppim = pmap_preinit_mapping + i; 9321 if (ppim->pa == pa && ppim->sz == size && 9322 (ppim->mode == mode || 9323 (flags & MAPDEV_SETATTR) == 0)) 9324 return ((void *)(ppim->va + offset)); 9325 } 9326 /* 9327 * If the specified range of physical addresses fits within 9328 * the direct map window, use the direct map. 9329 */ 9330 if (pa < dmaplimit && pa + size <= dmaplimit) { 9331 va = PHYS_TO_DMAP(pa); 9332 if ((flags & MAPDEV_SETATTR) != 0) { 9333 PMAP_LOCK(kernel_pmap); 9334 i = pmap_change_props_locked(va, size, 9335 PROT_NONE, mode, flags); 9336 PMAP_UNLOCK(kernel_pmap); 9337 } else 9338 i = 0; 9339 if (!i) 9340 return ((void *)(va + offset)); 9341 } 9342 va = kva_alloc(size); 9343 if (va == 0) 9344 panic("%s: Couldn't allocate KVA", __func__); 9345 } 9346 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9347 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9348 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9349 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9350 pmap_invalidate_cache_range(va, va + tmpsize); 9351 return ((void *)(va + offset)); 9352 } 9353 9354 void * 9355 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9356 { 9357 9358 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9359 MAPDEV_SETATTR)); 9360 } 9361 9362 void * 9363 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9364 { 9365 9366 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9367 } 9368 9369 void * 9370 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9371 { 9372 9373 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9374 MAPDEV_SETATTR)); 9375 } 9376 9377 void * 9378 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9379 { 9380 9381 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9382 MAPDEV_FLUSHCACHE)); 9383 } 9384 9385 void 9386 pmap_unmapdev(void *p, vm_size_t size) 9387 { 9388 struct pmap_preinit_mapping *ppim; 9389 vm_offset_t offset, va; 9390 int i; 9391 9392 va = (vm_offset_t)p; 9393 9394 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9395 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9396 return; 9397 offset = va & PAGE_MASK; 9398 size = round_page(offset + size); 9399 va = trunc_page(va); 9400 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9401 ppim = pmap_preinit_mapping + i; 9402 if (ppim->va == va && ppim->sz == size) { 9403 if (pmap_initialized) 9404 return; 9405 ppim->pa = 0; 9406 ppim->va = 0; 9407 ppim->sz = 0; 9408 ppim->mode = 0; 9409 if (va + size == virtual_avail) 9410 virtual_avail = va; 9411 return; 9412 } 9413 } 9414 if (pmap_initialized) { 9415 pmap_qremove(va, atop(size)); 9416 kva_free(va, size); 9417 } 9418 } 9419 9420 /* 9421 * Tries to demote a 1GB page 
mapping. 9422 */ 9423 static boolean_t 9424 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9425 { 9426 pdp_entry_t newpdpe, oldpdpe; 9427 pd_entry_t *firstpde, newpde, *pde; 9428 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9429 vm_paddr_t pdpgpa; 9430 vm_page_t pdpg; 9431 9432 PG_A = pmap_accessed_bit(pmap); 9433 PG_M = pmap_modified_bit(pmap); 9434 PG_V = pmap_valid_bit(pmap); 9435 PG_RW = pmap_rw_bit(pmap); 9436 9437 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9438 oldpdpe = *pdpe; 9439 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9440 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9441 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9442 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9443 if (pdpg == NULL) { 9444 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9445 " in pmap %p", va, pmap); 9446 return (FALSE); 9447 } 9448 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9449 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9450 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9451 KASSERT((oldpdpe & PG_A) != 0, 9452 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9453 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9454 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9455 newpde = oldpdpe; 9456 9457 /* 9458 * Initialize the page directory page. 9459 */ 9460 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9461 *pde = newpde; 9462 newpde += NBPDR; 9463 } 9464 9465 /* 9466 * Demote the mapping. 9467 */ 9468 *pdpe = newpdpe; 9469 9470 /* 9471 * Invalidate a stale recursive mapping of the page directory page. 9472 */ 9473 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9474 9475 counter_u64_add(pmap_pdpe_demotions, 1); 9476 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9477 " in pmap %p", va, pmap); 9478 return (TRUE); 9479 } 9480 9481 /* 9482 * Sets the memory attribute for the specified page. 9483 */ 9484 void 9485 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9486 { 9487 9488 m->md.pat_mode = ma; 9489 9490 /* 9491 * If "m" is a normal page, update its direct mapping. This update 9492 * can be relied upon to perform any cache operations that are 9493 * required for data coherence. 9494 */ 9495 if ((m->flags & PG_FICTITIOUS) == 0 && 9496 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9497 m->md.pat_mode)) 9498 panic("memory attribute change on the direct map failed"); 9499 } 9500 9501 void 9502 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9503 { 9504 int error; 9505 9506 m->md.pat_mode = ma; 9507 9508 if ((m->flags & PG_FICTITIOUS) != 0) 9509 return; 9510 PMAP_LOCK(kernel_pmap); 9511 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9512 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9513 PMAP_UNLOCK(kernel_pmap); 9514 if (error != 0) 9515 panic("memory attribute change on the direct map failed"); 9516 } 9517 9518 /* 9519 * Changes the specified virtual address range's memory type to that given by 9520 * the parameter "mode". The specified virtual address range must be 9521 * completely contained within either the direct map or the kernel map. If 9522 * the virtual address range is contained within the kernel map, then the 9523 * memory type for each of the corresponding ranges of the direct map is also 9524 * changed. (The corresponding ranges of the direct map are those ranges that 9525 * map the same physical pages as the specified virtual address range.) 
These 9526 * changes to the direct map are necessary because Intel describes the 9527 * behavior of their processors as "undefined" if two or more mappings to the 9528 * same physical page have different memory types. 9529 * 9530 * Returns zero if the change completed successfully, and either EINVAL or 9531 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9532 * of the virtual address range was not mapped, and ENOMEM is returned if 9533 * there was insufficient memory available to complete the change. In the 9534 * latter case, the memory type may have been changed on some part of the 9535 * virtual address range or the direct map. 9536 */ 9537 int 9538 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9539 { 9540 int error; 9541 9542 PMAP_LOCK(kernel_pmap); 9543 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9544 MAPDEV_FLUSHCACHE); 9545 PMAP_UNLOCK(kernel_pmap); 9546 return (error); 9547 } 9548 9549 /* 9550 * Changes the specified virtual address range's protections to those 9551 * specified by "prot". Like pmap_change_attr(), protections for aliases 9552 * in the direct map are updated as well. Protections on aliasing mappings may 9553 * be a subset of the requested protections; for example, mappings in the direct 9554 * map are never executable. 9555 */ 9556 int 9557 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9558 { 9559 int error; 9560 9561 /* Only supported within the kernel map. */ 9562 if (va < VM_MIN_KERNEL_ADDRESS) 9563 return (EINVAL); 9564 9565 PMAP_LOCK(kernel_pmap); 9566 error = pmap_change_props_locked(va, size, prot, -1, 9567 MAPDEV_ASSERTVALID); 9568 PMAP_UNLOCK(kernel_pmap); 9569 return (error); 9570 } 9571 9572 static int 9573 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9574 int mode, int flags) 9575 { 9576 vm_offset_t base, offset, tmpva; 9577 vm_paddr_t pa_start, pa_end, pa_end1; 9578 pdp_entry_t *pdpe; 9579 pd_entry_t *pde, pde_bits, pde_mask; 9580 pt_entry_t *pte, pte_bits, pte_mask; 9581 int error; 9582 bool changed; 9583 9584 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9585 base = trunc_page(va); 9586 offset = va & PAGE_MASK; 9587 size = round_page(offset + size); 9588 9589 /* 9590 * Only supported on kernel virtual addresses, including the direct 9591 * map but excluding the recursive map. 9592 */ 9593 if (base < DMAP_MIN_ADDRESS) 9594 return (EINVAL); 9595 9596 /* 9597 * Construct our flag sets and masks. "bits" is the subset of 9598 * "mask" that will be set in each modified PTE. 9599 * 9600 * Mappings in the direct map are never allowed to be executable. 9601 */ 9602 pde_bits = pte_bits = 0; 9603 pde_mask = pte_mask = 0; 9604 if (mode != -1) { 9605 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9606 pde_mask |= X86_PG_PDE_CACHE; 9607 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9608 pte_mask |= X86_PG_PTE_CACHE; 9609 } 9610 if (prot != VM_PROT_NONE) { 9611 if ((prot & VM_PROT_WRITE) != 0) { 9612 pde_bits |= X86_PG_RW; 9613 pte_bits |= X86_PG_RW; 9614 } 9615 if ((prot & VM_PROT_EXECUTE) == 0 || 9616 va < VM_MIN_KERNEL_ADDRESS) { 9617 pde_bits |= pg_nx; 9618 pte_bits |= pg_nx; 9619 } 9620 pde_mask |= X86_PG_RW | pg_nx; 9621 pte_mask |= X86_PG_RW | pg_nx; 9622 } 9623 9624 /* 9625 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9626 * into 4KB pages if required. 
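 * (A 1GB mapping whose properties must change is likewise demoted to 2MB
 * mappings first, unless the requested range covers the entire, properly
 * aligned 1GB page; see the pmap_demote_pdpe() call below.)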
9627 */ 9628 for (tmpva = base; tmpva < base + size; ) { 9629 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9630 if (pdpe == NULL || *pdpe == 0) { 9631 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9632 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9633 return (EINVAL); 9634 } 9635 if (*pdpe & PG_PS) { 9636 /* 9637 * If the current 1GB page already has the required 9638 * properties, then we need not demote this page. Just 9639 * increment tmpva to the next 1GB page frame. 9640 */ 9641 if ((*pdpe & pde_mask) == pde_bits) { 9642 tmpva = trunc_1gpage(tmpva) + NBPDP; 9643 continue; 9644 } 9645 9646 /* 9647 * If the current offset aligns with a 1GB page frame 9648 * and there is at least 1GB left within the range, then 9649 * we need not break down this page into 2MB pages. 9650 */ 9651 if ((tmpva & PDPMASK) == 0 && 9652 tmpva + PDPMASK < base + size) { 9653 tmpva += NBPDP; 9654 continue; 9655 } 9656 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9657 return (ENOMEM); 9658 } 9659 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9660 if (*pde == 0) { 9661 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9662 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9663 return (EINVAL); 9664 } 9665 if (*pde & PG_PS) { 9666 /* 9667 * If the current 2MB page already has the required 9668 * properties, then we need not demote this page. Just 9669 * increment tmpva to the next 2MB page frame. 9670 */ 9671 if ((*pde & pde_mask) == pde_bits) { 9672 tmpva = trunc_2mpage(tmpva) + NBPDR; 9673 continue; 9674 } 9675 9676 /* 9677 * If the current offset aligns with a 2MB page frame 9678 * and there is at least 2MB left within the range, then 9679 * we need not break down this page into 4KB pages. 9680 */ 9681 if ((tmpva & PDRMASK) == 0 && 9682 tmpva + PDRMASK < base + size) { 9683 tmpva += NBPDR; 9684 continue; 9685 } 9686 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9687 return (ENOMEM); 9688 } 9689 pte = pmap_pde_to_pte(pde, tmpva); 9690 if (*pte == 0) { 9691 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9692 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9693 return (EINVAL); 9694 } 9695 tmpva += PAGE_SIZE; 9696 } 9697 error = 0; 9698 9699 /* 9700 * Ok, all the pages exist, so run through them updating their 9701 * properties if required. 9702 */ 9703 changed = false; 9704 pa_start = pa_end = 0; 9705 for (tmpva = base; tmpva < base + size; ) { 9706 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9707 if (*pdpe & PG_PS) { 9708 if ((*pdpe & pde_mask) != pde_bits) { 9709 pmap_pte_props(pdpe, pde_bits, pde_mask); 9710 changed = true; 9711 } 9712 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9713 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9714 if (pa_start == pa_end) { 9715 /* Start physical address run. */ 9716 pa_start = *pdpe & PG_PS_FRAME; 9717 pa_end = pa_start + NBPDP; 9718 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9719 pa_end += NBPDP; 9720 else { 9721 /* Run ended, update direct map. */ 9722 error = pmap_change_props_locked( 9723 PHYS_TO_DMAP(pa_start), 9724 pa_end - pa_start, prot, mode, 9725 flags); 9726 if (error != 0) 9727 break; 9728 /* Start physical address run. */ 9729 pa_start = *pdpe & PG_PS_FRAME; 9730 pa_end = pa_start + NBPDP; 9731 } 9732 } 9733 tmpva = trunc_1gpage(tmpva) + NBPDP; 9734 continue; 9735 } 9736 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9737 if (*pde & PG_PS) { 9738 if ((*pde & pde_mask) != pde_bits) { 9739 pmap_pte_props(pde, pde_bits, pde_mask); 9740 changed = true; 9741 } 9742 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9743 (*pde & PG_PS_FRAME) < dmaplimit) { 9744 if (pa_start == pa_end) { 9745 /* Start physical address run. 
*/ 9746 pa_start = *pde & PG_PS_FRAME; 9747 pa_end = pa_start + NBPDR; 9748 } else if (pa_end == (*pde & PG_PS_FRAME)) 9749 pa_end += NBPDR; 9750 else { 9751 /* Run ended, update direct map. */ 9752 error = pmap_change_props_locked( 9753 PHYS_TO_DMAP(pa_start), 9754 pa_end - pa_start, prot, mode, 9755 flags); 9756 if (error != 0) 9757 break; 9758 /* Start physical address run. */ 9759 pa_start = *pde & PG_PS_FRAME; 9760 pa_end = pa_start + NBPDR; 9761 } 9762 } 9763 tmpva = trunc_2mpage(tmpva) + NBPDR; 9764 } else { 9765 pte = pmap_pde_to_pte(pde, tmpva); 9766 if ((*pte & pte_mask) != pte_bits) { 9767 pmap_pte_props(pte, pte_bits, pte_mask); 9768 changed = true; 9769 } 9770 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9771 (*pte & PG_FRAME) < dmaplimit) { 9772 if (pa_start == pa_end) { 9773 /* Start physical address run. */ 9774 pa_start = *pte & PG_FRAME; 9775 pa_end = pa_start + PAGE_SIZE; 9776 } else if (pa_end == (*pte & PG_FRAME)) 9777 pa_end += PAGE_SIZE; 9778 else { 9779 /* Run ended, update direct map. */ 9780 error = pmap_change_props_locked( 9781 PHYS_TO_DMAP(pa_start), 9782 pa_end - pa_start, prot, mode, 9783 flags); 9784 if (error != 0) 9785 break; 9786 /* Start physical address run. */ 9787 pa_start = *pte & PG_FRAME; 9788 pa_end = pa_start + PAGE_SIZE; 9789 } 9790 } 9791 tmpva += PAGE_SIZE; 9792 } 9793 } 9794 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9795 pa_end1 = MIN(pa_end, dmaplimit); 9796 if (pa_start != pa_end1) 9797 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9798 pa_end1 - pa_start, prot, mode, flags); 9799 } 9800 9801 /* 9802 * Flush CPU caches if required to make sure any data isn't cached that 9803 * shouldn't be, etc. 9804 */ 9805 if (changed) { 9806 pmap_invalidate_range(kernel_pmap, base, tmpva); 9807 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9808 pmap_invalidate_cache_range(base, tmpva); 9809 } 9810 return (error); 9811 } 9812 9813 /* 9814 * Demotes any mapping within the direct map region that covers more than the 9815 * specified range of physical addresses. This range's size must be a power 9816 * of two and its starting address must be a multiple of its size. Since the 9817 * demotion does not change any attributes of the mapping, a TLB invalidation 9818 * is not mandatory. The caller may, however, request a TLB invalidation. 
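 *
 * Illustrative call (a sketch; "pa" stands for any page-aligned physical
 * address below dmaplimit), demoting the direct map around a single 4KB
 * frame without requesting a TLB invalidation:
 *
 *	pmap_demote_DMAP(pa, PAGE_SIZE, FALSE);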
9819 */ 9820 void 9821 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 9822 { 9823 pdp_entry_t *pdpe; 9824 pd_entry_t *pde; 9825 vm_offset_t va; 9826 boolean_t changed; 9827 9828 if (len == 0) 9829 return; 9830 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9831 KASSERT((base & (len - 1)) == 0, 9832 ("pmap_demote_DMAP: base is not a multiple of len")); 9833 if (len < NBPDP && base < dmaplimit) { 9834 va = PHYS_TO_DMAP(base); 9835 changed = FALSE; 9836 PMAP_LOCK(kernel_pmap); 9837 pdpe = pmap_pdpe(kernel_pmap, va); 9838 if ((*pdpe & X86_PG_V) == 0) 9839 panic("pmap_demote_DMAP: invalid PDPE"); 9840 if ((*pdpe & PG_PS) != 0) { 9841 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9842 panic("pmap_demote_DMAP: PDPE failed"); 9843 changed = TRUE; 9844 } 9845 if (len < NBPDR) { 9846 pde = pmap_pdpe_to_pde(pdpe, va); 9847 if ((*pde & X86_PG_V) == 0) 9848 panic("pmap_demote_DMAP: invalid PDE"); 9849 if ((*pde & PG_PS) != 0) { 9850 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9851 panic("pmap_demote_DMAP: PDE failed"); 9852 changed = TRUE; 9853 } 9854 } 9855 if (changed && invalidate) 9856 pmap_invalidate_page(kernel_pmap, va); 9857 PMAP_UNLOCK(kernel_pmap); 9858 } 9859 } 9860 9861 /* 9862 * Perform the pmap work for mincore(2). If the page is not both referenced and 9863 * modified by this pmap, returns its physical address so that the caller can 9864 * find other mappings. 9865 */ 9866 int 9867 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9868 { 9869 pdp_entry_t *pdpe; 9870 pd_entry_t *pdep; 9871 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 9872 vm_paddr_t pa; 9873 int val; 9874 9875 PG_A = pmap_accessed_bit(pmap); 9876 PG_M = pmap_modified_bit(pmap); 9877 PG_V = pmap_valid_bit(pmap); 9878 PG_RW = pmap_rw_bit(pmap); 9879 9880 PMAP_LOCK(pmap); 9881 pte = 0; 9882 pa = 0; 9883 val = 0; 9884 pdpe = pmap_pdpe(pmap, addr); 9885 if (pdpe == NULL) 9886 goto out; 9887 if ((*pdpe & PG_V) != 0) { 9888 if ((*pdpe & PG_PS) != 0) { 9889 pte = *pdpe; 9890 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 9891 PG_FRAME; 9892 val = MINCORE_PSIND(2); 9893 } else { 9894 pdep = pmap_pde(pmap, addr); 9895 if (pdep != NULL && (*pdep & PG_V) != 0) { 9896 if ((*pdep & PG_PS) != 0) { 9897 pte = *pdep; 9898 /* Compute the physical address of the 4KB page. */ 9899 pa = ((pte & PG_PS_FRAME) | (addr & 9900 PDRMASK)) & PG_FRAME; 9901 val = MINCORE_PSIND(1); 9902 } else { 9903 pte = *pmap_pde_to_pte(pdep, addr); 9904 pa = pte & PG_FRAME; 9905 val = 0; 9906 } 9907 } 9908 } 9909 } 9910 if ((pte & PG_V) != 0) { 9911 val |= MINCORE_INCORE; 9912 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9913 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 9914 if ((pte & PG_A) != 0) 9915 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 9916 } 9917 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 9918 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 9919 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 9920 *pap = pa; 9921 } 9922 out: 9923 PMAP_UNLOCK(pmap); 9924 return (val); 9925 } 9926 9927 static uint64_t 9928 pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 9929 { 9930 uint32_t gen, new_gen, pcid_next; 9931 9932 CRITICAL_ASSERT(curthread); 9933 gen = PCPU_GET(pcid_gen); 9934 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) 9935 return (pti ? 
0 : CR3_PCID_SAVE); 9936 if (pmap->pm_pcids[cpuid].pm_gen == gen) 9937 return (CR3_PCID_SAVE); 9938 pcid_next = PCPU_GET(pcid_next); 9939 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 9940 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 9941 ("cpu %d pcid_next %#x", cpuid, pcid_next)); 9942 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 9943 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 9944 new_gen = gen + 1; 9945 if (new_gen == 0) 9946 new_gen = 1; 9947 PCPU_SET(pcid_gen, new_gen); 9948 pcid_next = PMAP_PCID_KERN + 1; 9949 } else { 9950 new_gen = gen; 9951 } 9952 pmap->pm_pcids[cpuid].pm_pcid = pcid_next; 9953 pmap->pm_pcids[cpuid].pm_gen = new_gen; 9954 PCPU_SET(pcid_next, pcid_next + 1); 9955 return (0); 9956 } 9957 9958 static uint64_t 9959 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) 9960 { 9961 uint64_t cached; 9962 9963 cached = pmap_pcid_alloc(pmap, cpuid); 9964 KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, 9965 ("pmap %p cpu %d pcid %#x", pmap, cpuid, 9966 pmap->pm_pcids[cpuid].pm_pcid)); 9967 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || 9968 pmap == kernel_pmap, 9969 ("non-kernel pmap pmap %p cpu %d pcid %#x", 9970 pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); 9971 return (cached); 9972 } 9973 9974 static void 9975 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 9976 { 9977 9978 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 9979 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; 9980 } 9981 9982 static void 9983 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 9984 { 9985 pmap_t old_pmap; 9986 uint64_t cached, cr3, kcr3, ucr3; 9987 9988 KASSERT((read_rflags() & PSL_I) == 0, 9989 ("PCID needs interrupts disabled in pmap_activate_sw()")); 9990 9991 /* See the comment in pmap_invalidate_page_pcid(). 
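Briefly: the ucr3_load_mask was changed there to force a flush of the outgoing pmap's user page-table TLB entries on the next return to userspace; since this CPU is switching to another pmap before that can happen, restore the mask and instead force the outgoing pmap to allocate a fresh PCID (pm_gen = 0), so that the deferred invalidation is not lost.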
*/ 9992 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 9993 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 9994 old_pmap = PCPU_GET(curpmap); 9995 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 9996 old_pmap->pm_pcids[cpuid].pm_gen = 0; 9997 } 9998 9999 cached = pmap_pcid_alloc_checked(pmap, cpuid); 10000 cr3 = rcr3(); 10001 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10002 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); 10003 PCPU_SET(curpmap, pmap); 10004 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; 10005 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | 10006 PMAP_PCID_USER_PT; 10007 10008 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 10009 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 10010 10011 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 10012 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 10013 if (cached) 10014 counter_u64_add(pcid_save_cnt, 1); 10015 10016 pmap_activate_sw_pti_post(td, pmap); 10017 } 10018 10019 static void 10020 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 10021 u_int cpuid) 10022 { 10023 uint64_t cached, cr3; 10024 10025 KASSERT((read_rflags() & PSL_I) == 0, 10026 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10027 10028 cached = pmap_pcid_alloc_checked(pmap, cpuid); 10029 cr3 = rcr3(); 10030 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10031 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 10032 cached); 10033 PCPU_SET(curpmap, pmap); 10034 if (cached) 10035 counter_u64_add(pcid_save_cnt, 1); 10036 } 10037 10038 static void 10039 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 10040 u_int cpuid __unused) 10041 { 10042 10043 load_cr3(pmap->pm_cr3); 10044 PCPU_SET(curpmap, pmap); 10045 } 10046 10047 static void 10048 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 10049 u_int cpuid __unused) 10050 { 10051 10052 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 10053 PCPU_SET(kcr3, pmap->pm_cr3); 10054 PCPU_SET(ucr3, pmap->pm_ucr3); 10055 pmap_activate_sw_pti_post(td, pmap); 10056 } 10057 10058 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 10059 u_int)) 10060 { 10061 10062 if (pmap_pcid_enabled && pti) 10063 return (pmap_activate_sw_pcid_pti); 10064 else if (pmap_pcid_enabled && !pti) 10065 return (pmap_activate_sw_pcid_nopti); 10066 else if (!pmap_pcid_enabled && pti) 10067 return (pmap_activate_sw_nopcid_pti); 10068 else /* if (!pmap_pcid_enabled && !pti) */ 10069 return (pmap_activate_sw_nopcid_nopti); 10070 } 10071 10072 void 10073 pmap_activate_sw(struct thread *td) 10074 { 10075 pmap_t oldpmap, pmap; 10076 u_int cpuid; 10077 10078 oldpmap = PCPU_GET(curpmap); 10079 pmap = vmspace_pmap(td->td_proc->p_vmspace); 10080 if (oldpmap == pmap) { 10081 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10082 mfence(); 10083 return; 10084 } 10085 cpuid = PCPU_GET(cpuid); 10086 #ifdef SMP 10087 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10088 #else 10089 CPU_SET(cpuid, &pmap->pm_active); 10090 #endif 10091 pmap_activate_sw_mode(td, pmap, cpuid); 10092 #ifdef SMP 10093 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 10094 #else 10095 CPU_CLR(cpuid, &oldpmap->pm_active); 10096 #endif 10097 } 10098 10099 void 10100 pmap_activate(struct thread *td) 10101 { 10102 /* 10103 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10104 * invalidate_all IPI, which checks for curpmap == 10105 * smp_tlb_pmap. The below sequence of operations has a 10106 * window where %CR3 is loaded with the new pmap's PML4 10107 * address, but the curpmap value has not yet been updated. 
10108 * This causes the invltlb IPI handler, which is called 10109 * between the updates, to execute as a NOP, which leaves 10110 * stale TLB entries. 10111 * 10112 * Note that the most common use of pmap_activate_sw(), from 10113 * a context switch, is immune to this race, because 10114 * interrupts are disabled (while the thread lock is owned), 10115 * so the IPI is delayed until after curpmap is updated. Protect 10116 * other callers in a similar way, by disabling interrupts 10117 * around the %cr3 register reload and curpmap assignment. 10118 */ 10119 spinlock_enter(); 10120 pmap_activate_sw(td); 10121 spinlock_exit(); 10122 } 10123 10124 void 10125 pmap_activate_boot(pmap_t pmap) 10126 { 10127 uint64_t kcr3; 10128 u_int cpuid; 10129 10130 /* 10131 * kernel_pmap must be never deactivated, and we ensure that 10132 * by never activating it at all. 10133 */ 10134 MPASS(pmap != kernel_pmap); 10135 10136 cpuid = PCPU_GET(cpuid); 10137 #ifdef SMP 10138 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10139 #else 10140 CPU_SET(cpuid, &pmap->pm_active); 10141 #endif 10142 PCPU_SET(curpmap, pmap); 10143 if (pti) { 10144 kcr3 = pmap->pm_cr3; 10145 if (pmap_pcid_enabled) 10146 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; 10147 } else { 10148 kcr3 = PMAP_NO_CR3; 10149 } 10150 PCPU_SET(kcr3, kcr3); 10151 PCPU_SET(ucr3, PMAP_NO_CR3); 10152 } 10153 10154 void 10155 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 10156 { 10157 } 10158 10159 /* 10160 * Increase the starting virtual address of the given mapping if a 10161 * different alignment might result in more superpage mappings. 10162 */ 10163 void 10164 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10165 vm_offset_t *addr, vm_size_t size) 10166 { 10167 vm_offset_t superpage_offset; 10168 10169 if (size < NBPDR) 10170 return; 10171 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10172 offset += ptoa(object->pg_color); 10173 superpage_offset = offset & PDRMASK; 10174 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10175 (*addr & PDRMASK) == superpage_offset) 10176 return; 10177 if ((*addr & PDRMASK) < superpage_offset) 10178 *addr = (*addr & ~PDRMASK) + superpage_offset; 10179 else 10180 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10181 } 10182 10183 #ifdef INVARIANTS 10184 static unsigned long num_dirty_emulations; 10185 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10186 &num_dirty_emulations, 0, NULL); 10187 10188 static unsigned long num_accessed_emulations; 10189 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10190 &num_accessed_emulations, 0, NULL); 10191 10192 static unsigned long num_superpage_accessed_emulations; 10193 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10194 &num_superpage_accessed_emulations, 0, NULL); 10195 10196 static unsigned long ad_emulation_superpage_promotions; 10197 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10198 &ad_emulation_superpage_promotions, 0, NULL); 10199 #endif /* INVARIANTS */ 10200 10201 int 10202 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10203 { 10204 int rv; 10205 struct rwlock *lock; 10206 #if VM_NRESERVLEVEL > 0 10207 vm_page_t m, mpte; 10208 #endif 10209 pd_entry_t *pde; 10210 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10211 10212 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10213 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10214 10215 if (!pmap_emulate_ad_bits(pmap)) 
10216 return (-1); 10217 10218 PG_A = pmap_accessed_bit(pmap); 10219 PG_M = pmap_modified_bit(pmap); 10220 PG_V = pmap_valid_bit(pmap); 10221 PG_RW = pmap_rw_bit(pmap); 10222 10223 rv = -1; 10224 lock = NULL; 10225 PMAP_LOCK(pmap); 10226 10227 pde = pmap_pde(pmap, va); 10228 if (pde == NULL || (*pde & PG_V) == 0) 10229 goto done; 10230 10231 if ((*pde & PG_PS) != 0) { 10232 if (ftype == VM_PROT_READ) { 10233 #ifdef INVARIANTS 10234 atomic_add_long(&num_superpage_accessed_emulations, 1); 10235 #endif 10236 *pde |= PG_A; 10237 rv = 0; 10238 } 10239 goto done; 10240 } 10241 10242 pte = pmap_pde_to_pte(pde, va); 10243 if ((*pte & PG_V) == 0) 10244 goto done; 10245 10246 if (ftype == VM_PROT_WRITE) { 10247 if ((*pte & PG_RW) == 0) 10248 goto done; 10249 /* 10250 * Set the modified and accessed bits simultaneously. 10251 * 10252 * Intel EPT PTEs that do software emulation of A/D bits map 10253 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10254 * An EPT misconfiguration is triggered if the PTE is writable 10255 * but not readable (WR=10). This is avoided by setting PG_A 10256 * and PG_M simultaneously. 10257 */ 10258 *pte |= PG_M | PG_A; 10259 } else { 10260 *pte |= PG_A; 10261 } 10262 10263 #if VM_NRESERVLEVEL > 0 10264 /* try to promote the mapping */ 10265 if (va < VM_MAXUSER_ADDRESS) 10266 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10267 else 10268 mpte = NULL; 10269 10270 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10271 10272 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10273 pmap_ps_enabled(pmap) && 10274 (m->flags & PG_FICTITIOUS) == 0 && 10275 vm_reserv_level_iffullpop(m) == 0) { 10276 pmap_promote_pde(pmap, pde, va, mpte, &lock); 10277 #ifdef INVARIANTS 10278 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10279 #endif 10280 } 10281 #endif 10282 10283 #ifdef INVARIANTS 10284 if (ftype == VM_PROT_WRITE) 10285 atomic_add_long(&num_dirty_emulations, 1); 10286 else 10287 atomic_add_long(&num_accessed_emulations, 1); 10288 #endif 10289 rv = 0; /* success */ 10290 done: 10291 if (lock != NULL) 10292 rw_wunlock(lock); 10293 PMAP_UNLOCK(pmap); 10294 return (rv); 10295 } 10296 10297 void 10298 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10299 { 10300 pml4_entry_t *pml4; 10301 pdp_entry_t *pdp; 10302 pd_entry_t *pde; 10303 pt_entry_t *pte, PG_V; 10304 int idx; 10305 10306 idx = 0; 10307 PG_V = pmap_valid_bit(pmap); 10308 PMAP_LOCK(pmap); 10309 10310 pml4 = pmap_pml4e(pmap, va); 10311 if (pml4 == NULL) 10312 goto done; 10313 ptr[idx++] = *pml4; 10314 if ((*pml4 & PG_V) == 0) 10315 goto done; 10316 10317 pdp = pmap_pml4e_to_pdpe(pml4, va); 10318 ptr[idx++] = *pdp; 10319 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10320 goto done; 10321 10322 pde = pmap_pdpe_to_pde(pdp, va); 10323 ptr[idx++] = *pde; 10324 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10325 goto done; 10326 10327 pte = pmap_pde_to_pte(pde, va); 10328 ptr[idx++] = *pte; 10329 10330 done: 10331 PMAP_UNLOCK(pmap); 10332 *num = idx; 10333 } 10334 10335 /** 10336 * Get the kernel virtual address of a set of physical pages. If there are 10337 * physical addresses not covered by the DMAP perform a transient mapping 10338 * that will be removed when calling pmap_unmap_io_transient. 10339 * 10340 * \param page The pages the caller wishes to obtain the virtual 10341 * address on the kernel memory map. 10342 * \param vaddr On return contains the kernel virtual memory address 10343 * of the pages passed in the page parameter. 10344 * \param count Number of pages passed in. 
10345 * \param can_fault TRUE if the thread using the mapped pages can take
10346 * page faults, FALSE otherwise.
10347 *
10348 * \returns TRUE if the caller must call pmap_unmap_io_transient when
10349 * finished or FALSE otherwise.
10350 *
10351 */
10352 boolean_t
10353 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
10354 boolean_t can_fault)
10355 {
10356 vm_paddr_t paddr;
10357 boolean_t needs_mapping;
10358 pt_entry_t *pte;
10359 int cache_bits, error __unused, i;
10360
10361 /*
10362 * Allocate any KVA space that we need; this is done in a separate
10363 * loop to prevent calling vmem_alloc while pinned.
10364 */
10365 needs_mapping = FALSE;
10366 for (i = 0; i < count; i++) {
10367 paddr = VM_PAGE_TO_PHYS(page[i]);
10368 if (__predict_false(paddr >= dmaplimit)) {
10369 error = vmem_alloc(kernel_arena, PAGE_SIZE,
10370 M_BESTFIT | M_WAITOK, &vaddr[i]);
10371 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
10372 needs_mapping = TRUE;
10373 } else {
10374 vaddr[i] = PHYS_TO_DMAP(paddr);
10375 }
10376 }
10377
10378 /* Exit early if everything is covered by the DMAP */
10379 if (!needs_mapping)
10380 return (FALSE);
10381
10382 /*
10383 * NB: The sequence of updating a page table followed by accesses
10384 * to the corresponding pages used in the !DMAP case is subject to
10385 * the situation described in the "AMD64 Architecture Programmer's
10386 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
10387 * Coherency Considerations". Therefore, issuing the INVLPG right
10388 * after modifying the PTE bits is crucial.
10389 */
10390 if (!can_fault)
10391 sched_pin();
10392 for (i = 0; i < count; i++) {
10393 paddr = VM_PAGE_TO_PHYS(page[i]);
10394 if (paddr >= dmaplimit) {
10395 if (can_fault) {
10396 /*
10397 * Slow path: since we can get page faults
10398 * while mappings are active, don't pin the
10399 * thread to the CPU; instead add a global
10400 * mapping visible to all CPUs.
10401 */
10402 pmap_qenter(vaddr[i], &page[i], 1);
10403 } else {
10404 pte = vtopte(vaddr[i]);
10405 cache_bits = pmap_cache_bits(kernel_pmap,
10406 page[i]->md.pat_mode, 0);
10407 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
10408 cache_bits);
10409 pmap_invlpg(kernel_pmap, vaddr[i]);
10410 }
10411 }
10412 }
10413
10414 return (needs_mapping);
10415 }
10416
10417 void
10418 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
10419 boolean_t can_fault)
10420 {
10421 vm_paddr_t paddr;
10422 int i;
10423
10424 if (!can_fault)
10425 sched_unpin();
10426 for (i = 0; i < count; i++) {
10427 paddr = VM_PAGE_TO_PHYS(page[i]);
10428 if (paddr >= dmaplimit) {
10429 if (can_fault)
10430 pmap_qremove(vaddr[i], 1);
10431 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
10432 }
10433 }
10434 }
10435
10436 vm_offset_t
10437 pmap_quick_enter_page(vm_page_t m)
10438 {
10439 vm_paddr_t paddr;
10440
10441 paddr = VM_PAGE_TO_PHYS(m);
10442 if (paddr < dmaplimit)
10443 return (PHYS_TO_DMAP(paddr));
10444 mtx_lock_spin(&qframe_mtx);
10445 KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
10446
10447 /*
10448 * Since qframe is exclusively mapped by us, and we do not set
10449 * PG_G, we can use INVLPG here.
10450 */ 10451 invlpg(qframe); 10452 10453 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10454 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 10455 return (qframe); 10456 } 10457 10458 void 10459 pmap_quick_remove_page(vm_offset_t addr) 10460 { 10461 10462 if (addr != qframe) 10463 return; 10464 pte_store(vtopte(qframe), 0); 10465 mtx_unlock_spin(&qframe_mtx); 10466 } 10467 10468 /* 10469 * Pdp pages from the large map are managed differently from either 10470 * kernel or user page table pages. They are permanently allocated at 10471 * initialization time, and their reference count is permanently set to 10472 * zero. The pml4 entries pointing to those pages are copied into 10473 * each allocated pmap. 10474 * 10475 * In contrast, pd and pt pages are managed like user page table 10476 * pages. They are dynamically allocated, and their reference count 10477 * represents the number of valid entries within the page. 10478 */ 10479 static vm_page_t 10480 pmap_large_map_getptp_unlocked(void) 10481 { 10482 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10483 } 10484 10485 static vm_page_t 10486 pmap_large_map_getptp(void) 10487 { 10488 vm_page_t m; 10489 10490 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10491 m = pmap_large_map_getptp_unlocked(); 10492 if (m == NULL) { 10493 PMAP_UNLOCK(kernel_pmap); 10494 vm_wait(NULL); 10495 PMAP_LOCK(kernel_pmap); 10496 /* Callers retry. */ 10497 } 10498 return (m); 10499 } 10500 10501 static pdp_entry_t * 10502 pmap_large_map_pdpe(vm_offset_t va) 10503 { 10504 vm_pindex_t pml4_idx; 10505 vm_paddr_t mphys; 10506 10507 pml4_idx = pmap_pml4e_index(va); 10508 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10509 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10510 "%#jx lm_ents %d", 10511 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10512 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10513 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10514 "LMSPML4I %#jx lm_ents %d", 10515 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10516 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10517 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10518 } 10519 10520 static pd_entry_t * 10521 pmap_large_map_pde(vm_offset_t va) 10522 { 10523 pdp_entry_t *pdpe; 10524 vm_page_t m; 10525 vm_paddr_t mphys; 10526 10527 retry: 10528 pdpe = pmap_large_map_pdpe(va); 10529 if (*pdpe == 0) { 10530 m = pmap_large_map_getptp(); 10531 if (m == NULL) 10532 goto retry; 10533 mphys = VM_PAGE_TO_PHYS(m); 10534 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10535 } else { 10536 MPASS((*pdpe & X86_PG_PS) == 0); 10537 mphys = *pdpe & PG_FRAME; 10538 } 10539 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10540 } 10541 10542 static pt_entry_t * 10543 pmap_large_map_pte(vm_offset_t va) 10544 { 10545 pd_entry_t *pde; 10546 vm_page_t m; 10547 vm_paddr_t mphys; 10548 10549 retry: 10550 pde = pmap_large_map_pde(va); 10551 if (*pde == 0) { 10552 m = pmap_large_map_getptp(); 10553 if (m == NULL) 10554 goto retry; 10555 mphys = VM_PAGE_TO_PHYS(m); 10556 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10557 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10558 } else { 10559 MPASS((*pde & X86_PG_PS) == 0); 10560 mphys = *pde & PG_FRAME; 10561 } 10562 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10563 } 10564 10565 static vm_paddr_t 10566 pmap_large_map_kextract(vm_offset_t va) 10567 { 10568 pdp_entry_t *pdpe, pdp; 10569 
pd_entry_t *pde, pd; 10570 pt_entry_t *pte, pt; 10571 10572 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10573 ("not largemap range %#lx", (u_long)va)); 10574 pdpe = pmap_large_map_pdpe(va); 10575 pdp = *pdpe; 10576 KASSERT((pdp & X86_PG_V) != 0, 10577 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10578 (u_long)pdpe, pdp)); 10579 if ((pdp & X86_PG_PS) != 0) { 10580 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10581 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10582 (u_long)pdpe, pdp)); 10583 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10584 } 10585 pde = pmap_pdpe_to_pde(pdpe, va); 10586 pd = *pde; 10587 KASSERT((pd & X86_PG_V) != 0, 10588 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10589 if ((pd & X86_PG_PS) != 0) 10590 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10591 pte = pmap_pde_to_pte(pde, va); 10592 pt = *pte; 10593 KASSERT((pt & X86_PG_V) != 0, 10594 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 10595 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 10596 } 10597 10598 static int 10599 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 10600 vmem_addr_t *vmem_res) 10601 { 10602 10603 /* 10604 * Large mappings are all but static. Consequently, there 10605 * is no point in waiting for an earlier allocation to be 10606 * freed. 10607 */ 10608 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 10609 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 10610 } 10611 10612 int 10613 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 10614 vm_memattr_t mattr) 10615 { 10616 pdp_entry_t *pdpe; 10617 pd_entry_t *pde; 10618 pt_entry_t *pte; 10619 vm_offset_t va, inc; 10620 vmem_addr_t vmem_res; 10621 vm_paddr_t pa; 10622 int error; 10623 10624 if (len == 0 || spa + len < spa) 10625 return (EINVAL); 10626 10627 /* See if DMAP can serve. */ 10628 if (spa + len <= dmaplimit) { 10629 va = PHYS_TO_DMAP(spa); 10630 *addr = (void *)va; 10631 return (pmap_change_attr(va, len, mattr)); 10632 } 10633 10634 /* 10635 * No, allocate KVA. Fit the address with best possible 10636 * alignment for superpages. Fall back to worse align if 10637 * failed. 10638 */ 10639 error = ENOMEM; 10640 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 10641 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 10642 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 10643 &vmem_res); 10644 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 10645 NBPDR) + NBPDR) 10646 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 10647 &vmem_res); 10648 if (error != 0) 10649 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 10650 if (error != 0) 10651 return (error); 10652 10653 /* 10654 * Fill pagetable. PG_M is not pre-set, we scan modified bits 10655 * in the pagetable to minimize flushing. No need to 10656 * invalidate TLB, since we only update invalid entries. 
10657 */ 10658 PMAP_LOCK(kernel_pmap); 10659 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10660 len -= inc) { 10661 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10662 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10663 pdpe = pmap_large_map_pdpe(va); 10664 MPASS(*pdpe == 0); 10665 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10666 X86_PG_V | X86_PG_A | pg_nx | 10667 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10668 inc = NBPDP; 10669 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10670 (va & PDRMASK) == 0) { 10671 pde = pmap_large_map_pde(va); 10672 MPASS(*pde == 0); 10673 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10674 X86_PG_V | X86_PG_A | pg_nx | 10675 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10676 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10677 ref_count++; 10678 inc = NBPDR; 10679 } else { 10680 pte = pmap_large_map_pte(va); 10681 MPASS(*pte == 0); 10682 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10683 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10684 mattr, FALSE); 10685 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10686 ref_count++; 10687 inc = PAGE_SIZE; 10688 } 10689 } 10690 PMAP_UNLOCK(kernel_pmap); 10691 MPASS(len == 0); 10692 10693 *addr = (void *)vmem_res; 10694 return (0); 10695 } 10696 10697 void 10698 pmap_large_unmap(void *svaa, vm_size_t len) 10699 { 10700 vm_offset_t sva, va; 10701 vm_size_t inc; 10702 pdp_entry_t *pdpe, pdp; 10703 pd_entry_t *pde, pd; 10704 pt_entry_t *pte; 10705 vm_page_t m; 10706 struct spglist spgf; 10707 10708 sva = (vm_offset_t)svaa; 10709 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10710 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10711 return; 10712 10713 SLIST_INIT(&spgf); 10714 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10715 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10716 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10717 PMAP_LOCK(kernel_pmap); 10718 for (va = sva; va < sva + len; va += inc) { 10719 pdpe = pmap_large_map_pdpe(va); 10720 pdp = *pdpe; 10721 KASSERT((pdp & X86_PG_V) != 0, 10722 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10723 (u_long)pdpe, pdp)); 10724 if ((pdp & X86_PG_PS) != 0) { 10725 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10726 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10727 (u_long)pdpe, pdp)); 10728 KASSERT((va & PDPMASK) == 0, 10729 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10730 (u_long)pdpe, pdp)); 10731 KASSERT(va + NBPDP <= sva + len, 10732 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10733 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10734 (u_long)pdpe, pdp, len)); 10735 *pdpe = 0; 10736 inc = NBPDP; 10737 continue; 10738 } 10739 pde = pmap_pdpe_to_pde(pdpe, va); 10740 pd = *pde; 10741 KASSERT((pd & X86_PG_V) != 0, 10742 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10743 (u_long)pde, pd)); 10744 if ((pd & X86_PG_PS) != 0) { 10745 KASSERT((va & PDRMASK) == 0, 10746 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10747 (u_long)pde, pd)); 10748 KASSERT(va + NBPDR <= sva + len, 10749 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10750 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10751 pd, len)); 10752 pde_store(pde, 0); 10753 inc = NBPDR; 10754 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10755 m->ref_count--; 10756 if (m->ref_count == 0) { 10757 *pdpe = 0; 10758 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10759 } 10760 continue; 10761 } 10762 pte = pmap_pde_to_pte(pde, va); 10763 KASSERT((*pte & X86_PG_V) != 0, 10764 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10765 (u_long)pte, *pte)); 
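/*
 * 4KB mapping: clear the PTE and drop the reference on its page table
 * page; if the PT page, and then its PD page, becomes empty, unlink it
 * and queue it for freeing as well.
 */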
10766 pte_clear(pte); 10767 inc = PAGE_SIZE; 10768 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10769 m->ref_count--; 10770 if (m->ref_count == 0) { 10771 *pde = 0; 10772 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10773 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10774 m->ref_count--; 10775 if (m->ref_count == 0) { 10776 *pdpe = 0; 10777 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10778 } 10779 } 10780 } 10781 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10782 PMAP_UNLOCK(kernel_pmap); 10783 vm_page_free_pages_toq(&spgf, false); 10784 vmem_free(large_vmem, sva, len); 10785 } 10786 10787 static void 10788 pmap_large_map_wb_fence_mfence(void) 10789 { 10790 10791 mfence(); 10792 } 10793 10794 static void 10795 pmap_large_map_wb_fence_atomic(void) 10796 { 10797 10798 atomic_thread_fence_seq_cst(); 10799 } 10800 10801 static void 10802 pmap_large_map_wb_fence_nop(void) 10803 { 10804 } 10805 10806 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10807 { 10808 10809 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10810 return (pmap_large_map_wb_fence_mfence); 10811 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10812 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10813 return (pmap_large_map_wb_fence_atomic); 10814 else 10815 /* clflush is strongly enough ordered */ 10816 return (pmap_large_map_wb_fence_nop); 10817 } 10818 10819 static void 10820 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10821 { 10822 10823 for (; len > 0; len -= cpu_clflush_line_size, 10824 va += cpu_clflush_line_size) 10825 clwb(va); 10826 } 10827 10828 static void 10829 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10830 { 10831 10832 for (; len > 0; len -= cpu_clflush_line_size, 10833 va += cpu_clflush_line_size) 10834 clflushopt(va); 10835 } 10836 10837 static void 10838 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10839 { 10840 10841 for (; len > 0; len -= cpu_clflush_line_size, 10842 va += cpu_clflush_line_size) 10843 clflush(va); 10844 } 10845 10846 static void 10847 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 10848 { 10849 } 10850 10851 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 10852 { 10853 10854 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 10855 return (pmap_large_map_flush_range_clwb); 10856 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 10857 return (pmap_large_map_flush_range_clflushopt); 10858 else if ((cpu_feature & CPUID_CLFSH) != 0) 10859 return (pmap_large_map_flush_range_clflush); 10860 else 10861 return (pmap_large_map_flush_range_nop); 10862 } 10863 10864 static void 10865 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 10866 { 10867 volatile u_long *pe; 10868 u_long p; 10869 vm_offset_t va; 10870 vm_size_t inc; 10871 bool seen_other; 10872 10873 for (va = sva; va < eva; va += inc) { 10874 inc = 0; 10875 if ((amd_feature & AMDID_PAGE1GB) != 0) { 10876 pe = (volatile u_long *)pmap_large_map_pdpe(va); 10877 p = *pe; 10878 if ((p & X86_PG_PS) != 0) 10879 inc = NBPDP; 10880 } 10881 if (inc == 0) { 10882 pe = (volatile u_long *)pmap_large_map_pde(va); 10883 p = *pe; 10884 if ((p & X86_PG_PS) != 0) 10885 inc = NBPDR; 10886 } 10887 if (inc == 0) { 10888 pe = (volatile u_long *)pmap_large_map_pte(va); 10889 p = *pe; 10890 inc = PAGE_SIZE; 10891 } 10892 seen_other = false; 10893 for (;;) { 10894 if ((p & X86_PG_AVAIL1) != 0) { 10895 /* 10896 * Spin-wait for the end of a parallel 10897 * write-back. 
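 * Another CPU has claimed this entry with X86_PG_AVAIL1 and is flushing
 * it; once that flush finishes we can no longer rely on PG_M alone, so
 * remember the fact below via seen_other.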
10898 */
10899 cpu_spinwait();
10900 p = *pe;
10901
10902 /*
10903 * If we saw other write-back
10904 * occurring, we cannot rely on PG_M to
10905 * indicate the state of the cache. The
10906 * PG_M bit is cleared before the
10907 * flush to avoid ignoring new writes,
10908 * and writes that are relevant to
10909 * us might happen afterwards.
10910 */
10911 seen_other = true;
10912 continue;
10913 }
10914
10915 if ((p & X86_PG_M) != 0 || seen_other) {
10916 if (!atomic_fcmpset_long(pe, &p,
10917 (p & ~X86_PG_M) | X86_PG_AVAIL1))
10918 /*
10919 * If we saw PG_M without
10920 * PG_AVAIL1, and then on the
10921 * next attempt we do not
10922 * observe either PG_M or
10923 * PG_AVAIL1, the other
10924 * write-back started after us
10925 * and finished before us. We
10926 * can rely on it doing our
10927 * work.
10928 */
10929 continue;
10930 pmap_large_map_flush_range(va, inc);
10931 atomic_clear_long(pe, X86_PG_AVAIL1);
10932 }
10933 break;
10934 }
10935 maybe_yield();
10936 }
10937 }
10938
10939 /*
10940 * Write-back cache lines for the given address range.
10941 *
10942 * Must be called only on the range or sub-range returned from
10943 * pmap_large_map(). Must not be called on the coalesced ranges.
10944 *
10945 * Does nothing on CPUs that lack CLWB, CLFLUSHOPT, and CLFLUSH
10946 * instruction support.
10947 */
10948 void
10949 pmap_large_map_wb(void *svap, vm_size_t len)
10950 {
10951 vm_offset_t eva, sva;
10952
10953 sva = (vm_offset_t)svap;
10954 eva = sva + len;
10955 pmap_large_map_wb_fence();
10956 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
10957 pmap_large_map_flush_range(sva, len);
10958 } else {
10959 KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
10960 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
10961 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
10962 pmap_large_map_wb_large(sva, eva);
10963 }
10964 pmap_large_map_wb_fence();
10965 }
10966
10967 static vm_page_t
10968 pmap_pti_alloc_page(void)
10969 {
10970 vm_page_t m;
10971
10972 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
10973 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO);
10974 return (m);
10975 }
10976
10977 static bool
10978 pmap_pti_free_page(vm_page_t m)
10979 {
10980 if (!vm_page_unwire_noq(m))
10981 return (false);
10982 vm_page_xbusy_claim(m);
10983 vm_page_free_zero(m);
10984 return (true);
10985 }
10986
10987 static void
10988 pmap_pti_init(void)
10989 {
10990 vm_page_t pml4_pg;
10991 pdp_entry_t *pdpe;
10992 vm_offset_t va;
10993 int i;
10994
10995 if (!pti)
10996 return;
10997 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
10998 VM_OBJECT_WLOCK(pti_obj);
10999 pml4_pg = pmap_pti_alloc_page();
11000 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
11001 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
11002 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
11003 pdpe = pmap_pti_pdpe(va);
11004 pmap_pti_wire_pte(pdpe);
11005 }
11006 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
11007 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
11008 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
11009 sizeof(struct gate_descriptor) * NIDT, false);
11010 CPU_FOREACH(i) {
11011 /* Doublefault stack IST 1 */
11012 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
11013 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
11014 /* NMI stack IST 2 */
11015 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
11016 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE,
va, false); 11017 /* MC# stack IST 3 */ 11018 va = __pcpu[i].pc_common_tss.tss_ist3 + 11019 sizeof(struct nmi_pcpu); 11020 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 11021 /* DB# stack IST 4 */ 11022 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 11023 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 11024 } 11025 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 11026 true); 11027 pti_finalized = true; 11028 VM_OBJECT_WUNLOCK(pti_obj); 11029 } 11030 11031 static void 11032 pmap_cpu_init(void *arg __unused) 11033 { 11034 CPU_COPY(&all_cpus, &kernel_pmap->pm_active); 11035 pmap_pti_init(); 11036 } 11037 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL); 11038 11039 static pdp_entry_t * 11040 pmap_pti_pdpe(vm_offset_t va) 11041 { 11042 pml4_entry_t *pml4e; 11043 pdp_entry_t *pdpe; 11044 vm_page_t m; 11045 vm_pindex_t pml4_idx; 11046 vm_paddr_t mphys; 11047 11048 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11049 11050 pml4_idx = pmap_pml4e_index(va); 11051 pml4e = &pti_pml4[pml4_idx]; 11052 m = NULL; 11053 if (*pml4e == 0) { 11054 if (pti_finalized) 11055 panic("pml4 alloc after finalization\n"); 11056 m = pmap_pti_alloc_page(); 11057 if (*pml4e != 0) { 11058 pmap_pti_free_page(m); 11059 mphys = *pml4e & ~PAGE_MASK; 11060 } else { 11061 mphys = VM_PAGE_TO_PHYS(m); 11062 *pml4e = mphys | X86_PG_RW | X86_PG_V; 11063 } 11064 } else { 11065 mphys = *pml4e & ~PAGE_MASK; 11066 } 11067 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 11068 return (pdpe); 11069 } 11070 11071 static void 11072 pmap_pti_wire_pte(void *pte) 11073 { 11074 vm_page_t m; 11075 11076 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11077 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11078 m->ref_count++; 11079 } 11080 11081 static void 11082 pmap_pti_unwire_pde(void *pde, bool only_ref) 11083 { 11084 vm_page_t m; 11085 11086 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11087 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 11088 MPASS(only_ref || m->ref_count > 1); 11089 pmap_pti_free_page(m); 11090 } 11091 11092 static void 11093 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 11094 { 11095 vm_page_t m; 11096 pd_entry_t *pde; 11097 11098 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11099 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11100 if (pmap_pti_free_page(m)) { 11101 pde = pmap_pti_pde(va); 11102 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 11103 *pde = 0; 11104 pmap_pti_unwire_pde(pde, false); 11105 } 11106 } 11107 11108 static pd_entry_t * 11109 pmap_pti_pde(vm_offset_t va) 11110 { 11111 pdp_entry_t *pdpe; 11112 pd_entry_t *pde; 11113 vm_page_t m; 11114 vm_pindex_t pd_idx; 11115 vm_paddr_t mphys; 11116 11117 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11118 11119 pdpe = pmap_pti_pdpe(va); 11120 if (*pdpe == 0) { 11121 m = pmap_pti_alloc_page(); 11122 if (*pdpe != 0) { 11123 pmap_pti_free_page(m); 11124 MPASS((*pdpe & X86_PG_PS) == 0); 11125 mphys = *pdpe & ~PAGE_MASK; 11126 } else { 11127 mphys = VM_PAGE_TO_PHYS(m); 11128 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11129 } 11130 } else { 11131 MPASS((*pdpe & X86_PG_PS) == 0); 11132 mphys = *pdpe & ~PAGE_MASK; 11133 } 11134 11135 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11136 pd_idx = pmap_pde_index(va); 11137 pde += pd_idx; 11138 return (pde); 11139 } 11140 11141 static pt_entry_t * 11142 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11143 { 11144 pd_entry_t *pde; 11145 pt_entry_t *pte; 11146 vm_page_t m; 11147 vm_paddr_t mphys; 11148 11149 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11150 11151 pde = pmap_pti_pde(va); 
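/*
 * Optionally take an extra reference on the PD page up front; the caller
 * drops it later unless a fresh page table page is installed below, in
 * which case *unwire_pde is reset to false and the reference is kept.
 */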
11152 if (unwire_pde != NULL) { 11153 *unwire_pde = true; 11154 pmap_pti_wire_pte(pde); 11155 } 11156 if (*pde == 0) { 11157 m = pmap_pti_alloc_page(); 11158 if (*pde != 0) { 11159 pmap_pti_free_page(m); 11160 MPASS((*pde & X86_PG_PS) == 0); 11161 mphys = *pde & ~(PAGE_MASK | pg_nx); 11162 } else { 11163 mphys = VM_PAGE_TO_PHYS(m); 11164 *pde = mphys | X86_PG_RW | X86_PG_V; 11165 if (unwire_pde != NULL) 11166 *unwire_pde = false; 11167 } 11168 } else { 11169 MPASS((*pde & X86_PG_PS) == 0); 11170 mphys = *pde & ~(PAGE_MASK | pg_nx); 11171 } 11172 11173 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11174 pte += pmap_pte_index(va); 11175 11176 return (pte); 11177 } 11178 11179 static void 11180 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11181 { 11182 vm_paddr_t pa; 11183 pd_entry_t *pde; 11184 pt_entry_t *pte, ptev; 11185 bool unwire_pde; 11186 11187 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11188 11189 sva = trunc_page(sva); 11190 MPASS(sva > VM_MAXUSER_ADDRESS); 11191 eva = round_page(eva); 11192 MPASS(sva < eva); 11193 for (; sva < eva; sva += PAGE_SIZE) { 11194 pte = pmap_pti_pte(sva, &unwire_pde); 11195 pa = pmap_kextract(sva); 11196 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11197 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11198 VM_MEMATTR_DEFAULT, FALSE); 11199 if (*pte == 0) { 11200 pte_store(pte, ptev); 11201 pmap_pti_wire_pte(pte); 11202 } else { 11203 KASSERT(!pti_finalized, 11204 ("pti overlap after fin %#lx %#lx %#lx", 11205 sva, *pte, ptev)); 11206 KASSERT(*pte == ptev, 11207 ("pti non-identical pte after fin %#lx %#lx %#lx", 11208 sva, *pte, ptev)); 11209 } 11210 if (unwire_pde) { 11211 pde = pmap_pti_pde(sva); 11212 pmap_pti_unwire_pde(pde, true); 11213 } 11214 } 11215 } 11216 11217 void 11218 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11219 { 11220 11221 if (!pti) 11222 return; 11223 VM_OBJECT_WLOCK(pti_obj); 11224 pmap_pti_add_kva_locked(sva, eva, exec); 11225 VM_OBJECT_WUNLOCK(pti_obj); 11226 } 11227 11228 void 11229 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11230 { 11231 pt_entry_t *pte; 11232 vm_offset_t va; 11233 11234 if (!pti) 11235 return; 11236 sva = rounddown2(sva, PAGE_SIZE); 11237 MPASS(sva > VM_MAXUSER_ADDRESS); 11238 eva = roundup2(eva, PAGE_SIZE); 11239 MPASS(sva < eva); 11240 VM_OBJECT_WLOCK(pti_obj); 11241 for (va = sva; va < eva; va += PAGE_SIZE) { 11242 pte = pmap_pti_pte(va, NULL); 11243 KASSERT((*pte & X86_PG_V) != 0, 11244 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11245 (u_long)pte, *pte)); 11246 pte_clear(pte); 11247 pmap_pti_unwire_pte(pte, va); 11248 } 11249 pmap_invalidate_range(kernel_pmap, sva, eva); 11250 VM_OBJECT_WUNLOCK(pti_obj); 11251 } 11252 11253 static void * 11254 pkru_dup_range(void *ctx __unused, void *data) 11255 { 11256 struct pmap_pkru_range *node, *new_node; 11257 11258 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11259 if (new_node == NULL) 11260 return (NULL); 11261 node = data; 11262 memcpy(new_node, node, sizeof(*node)); 11263 return (new_node); 11264 } 11265 11266 static void 11267 pkru_free_range(void *ctx __unused, void *node) 11268 { 11269 11270 uma_zfree(pmap_pkru_ranges_zone, node); 11271 } 11272 11273 static int 11274 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11275 int flags) 11276 { 11277 struct pmap_pkru_range *ppr; 11278 int error; 11279 11280 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11281 MPASS(pmap->pm_type == PT_X86); 11282 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11283 if ((flags & 
AMD64_PKRU_EXCL) != 0 && 11284 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11285 return (EBUSY); 11286 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11287 if (ppr == NULL) 11288 return (ENOMEM); 11289 ppr->pkru_keyidx = keyidx; 11290 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11291 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11292 if (error != 0) 11293 uma_zfree(pmap_pkru_ranges_zone, ppr); 11294 return (error); 11295 } 11296 11297 static int 11298 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11299 { 11300 11301 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11302 MPASS(pmap->pm_type == PT_X86); 11303 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11304 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11305 } 11306 11307 static void 11308 pmap_pkru_deassign_all(pmap_t pmap) 11309 { 11310 11311 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11312 if (pmap->pm_type == PT_X86 && 11313 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11314 rangeset_remove_all(&pmap->pm_pkru); 11315 } 11316 11317 static bool 11318 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11319 { 11320 struct pmap_pkru_range *ppr, *prev_ppr; 11321 vm_offset_t va; 11322 11323 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11324 if (pmap->pm_type != PT_X86 || 11325 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11326 sva >= VM_MAXUSER_ADDRESS) 11327 return (true); 11328 MPASS(eva <= VM_MAXUSER_ADDRESS); 11329 for (va = sva; va < eva; prev_ppr = ppr) { 11330 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11331 if (va == sva) 11332 prev_ppr = ppr; 11333 else if ((ppr == NULL) ^ (prev_ppr == NULL)) 11334 return (false); 11335 if (ppr == NULL) { 11336 va += PAGE_SIZE; 11337 continue; 11338 } 11339 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 11340 return (false); 11341 va = ppr->pkru_rs_el.re_end; 11342 } 11343 return (true); 11344 } 11345 11346 static pt_entry_t 11347 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11348 { 11349 struct pmap_pkru_range *ppr; 11350 11351 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11352 if (pmap->pm_type != PT_X86 || 11353 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11354 va >= VM_MAXUSER_ADDRESS) 11355 return (0); 11356 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11357 if (ppr != NULL) 11358 return (X86_PG_PKU(ppr->pkru_keyidx)); 11359 return (0); 11360 } 11361 11362 static bool 11363 pred_pkru_on_remove(void *ctx __unused, void *r) 11364 { 11365 struct pmap_pkru_range *ppr; 11366 11367 ppr = r; 11368 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11369 } 11370 11371 static void 11372 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11373 { 11374 11375 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11376 if (pmap->pm_type == PT_X86 && 11377 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11378 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11379 pred_pkru_on_remove); 11380 } 11381 } 11382 11383 static int 11384 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11385 { 11386 11387 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11388 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11389 MPASS(dst_pmap->pm_type == PT_X86); 11390 MPASS(src_pmap->pm_type == PT_X86); 11391 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11392 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11393 return (0); 11394 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11395 } 11396 11397 static void 11398 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11399 u_int keyidx) 11400 { 11401 pml4_entry_t *pml4e; 11402 pdp_entry_t *pdpe; 11403 pd_entry_t newpde, ptpaddr, *pde; 
	pt_entry_t newpte, *ptep, pte;
	vm_offset_t va, va_next;
	bool changed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);

	for (changed = false, va = sva; va < eva; va = va_next) {
		pml4e = pmap_pml4e(pmap, va);
		if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) {
			va_next = (va + NBPML4) & ~PML4MASK;
			if (va_next < va)
				va_next = eva;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
		if ((*pdpe & X86_PG_V) == 0) {
			va_next = (va + NBPDP) & ~PDPMASK;
			if (va_next < va)
				va_next = eva;
			continue;
		}

		va_next = (va + NBPDR) & ~PDRMASK;
		if (va_next < va)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, va);
		ptpaddr = *pde;
		if (ptpaddr == 0)
			continue;

		MPASS((ptpaddr & X86_PG_V) != 0);
		if ((ptpaddr & PG_PS) != 0) {
			if (va + NBPDR == va_next && eva >= va_next) {
				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
				    X86_PG_PKU(keyidx);
				if (newpde != ptpaddr) {
					*pde = newpde;
					changed = true;
				}
				continue;
			} else if (!pmap_demote_pde(pmap, pde, va)) {
				continue;
			}
		}

		if (va_next > eva)
			va_next = eva;

		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
		    ptep++, va += PAGE_SIZE) {
			pte = *ptep;
			if ((pte & X86_PG_V) == 0)
				continue;
			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
			if (newpte != pte) {
				*ptep = newpte;
				changed = true;
			}
		}
	}
	if (changed)
		pmap_invalidate_range(pmap, sva, eva);
}

static int
pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    u_int keyidx, int flags)
{

	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
		return (EINVAL);
	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
		return (EFAULT);
	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
		return (ENOTSUP);
	return (0);
}

int
pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
    int flags)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, keyidx);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

int
pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_deassign(pmap, sva, eva);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, 0);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

#if defined(KASAN) || defined(KMSAN)

/*
 * Reserve enough memory to:
 * 1) allocate PDP pages for the shadow map(s),
 * 2) shadow one page of memory, so one PD page, one PT page, and one shadow
 *    page per shadow map.
 */
#ifdef KASAN
#define	SAN_EARLY_PAGES	(NKASANPML4E + 3)
#else
#define	SAN_EARLY_PAGES	(NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * 3)
#endif

static uint64_t __nosanitizeaddress __nosanitizememory
pmap_san_enter_early_alloc_4k(uint64_t pabase)
{
	static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
	static size_t offset = 0;
	uint64_t pa;

	if (offset == sizeof(data)) {
		panic("%s: ran out of memory for the bootstrap shadow map",
		    __func__);
	}

	pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
	offset += PAGE_SIZE;
	return (pa);
}

/*
 * Map a shadow page, before the kernel has bootstrapped its page tables.  This
 * is currently only used to shadow the temporary boot stack set up by locore.
 */
static void __nosanitizeaddress __nosanitizememory
pmap_san_enter_early(vm_offset_t va)
{
	static bool first = true;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	uint64_t cr3, pa, base;
	int i;

	base = amd64_loadaddr();
	cr3 = rcr3();

	if (first) {
		/*
		 * If this is the first call, we need to allocate new PML4Es
		 * for the bootstrap shadow map(s).  We don't know how the
		 * PML4 page was initialized by the boot loader, so we can't
		 * simply test whether the shadow map's PML4Es are zero.
		 */
		first = false;
#ifdef KASAN
		for (i = 0; i < NKASANPML4E; i++) {
			pa = pmap_san_enter_early_alloc_4k(base);

			pml4e = (pml4_entry_t *)cr3 +
			    pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
		}
#else
		for (i = 0; i < NKMSANORIGPML4E; i++) {
			pa = pmap_san_enter_early_alloc_4k(base);

			pml4e = (pml4_entry_t *)cr3 +
			    pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
			    i * NBPML4);
			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
		}
		for (i = 0; i < NKMSANSHADPML4E; i++) {
			pa = pmap_san_enter_early_alloc_4k(base);

			pml4e = (pml4_entry_t *)cr3 +
			    pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
			    i * NBPML4);
			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
		}
#endif
	}
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
	if (*pdpe == 0) {
		pa = pmap_san_enter_early_alloc_4k(base);
		*pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
	}
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
	if (*pde == 0) {
		pa = pmap_san_enter_early_alloc_4k(base);
		*pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
	}
	pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
	if (*pte != 0)
		panic("%s: PTE for %#lx is already initialized", __func__, va);
	pa = pmap_san_enter_early_alloc_4k(base);
	*pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
}

static vm_page_t
pmap_san_enter_alloc_4k(void)
{
	vm_page_t m;

	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		panic("%s: no memory to grow shadow map", __func__);
	return (m);
}

static vm_page_t
pmap_san_enter_alloc_2m(void)
{
	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
	    NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Grow a shadow map by at least one 4KB page at the specified address.  Use
 * 2MB pages when possible.
 */
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_page_t m;

	if (kernphys == 0) {
		/*
		 * We're creating a temporary shadow map for the boot stack.
		 */
		pmap_san_enter_early(va);
		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	pdpe = pmap_pdpe(kernel_pmap, va);
	if ((*pdpe & X86_PG_V) == 0) {
		m = pmap_san_enter_alloc_4k();
		*pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
		    X86_PG_V | pg_nx);
	}
	pde = pmap_pdpe_to_pde(pdpe, va);
	if ((*pde & X86_PG_V) == 0) {
		m = pmap_san_enter_alloc_2m();
		if (m != NULL) {
			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
			    X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
		} else {
			m = pmap_san_enter_alloc_4k();
			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
			    X86_PG_V | pg_nx);
		}
	}
	if ((*pde & X86_PG_PS) != 0)
		return;
	pte = pmap_pde_to_pte(pde, va);
	if ((*pte & X86_PG_V) != 0)
		return;
	m = pmap_san_enter_alloc_4k();
	*pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
	    X86_PG_M | X86_PG_A | pg_nx);
}
#endif

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int ptes;
	int pdes;
	int pdpes;
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int i, pat_idx;

	if (eva <= range->sva)
		return;

	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		if (pat_index[i] == pat_idx)
			break;

	switch (i) {
	case PAT_WRITE_BACK:
		mode = "WB";
		break;
	case PAT_WRITE_THROUGH:
		mode = "WT";
		break;
	case PAT_UNCACHEABLE:
		mode = "UC";
		break;
	case PAT_UNCACHED:
		mode = "U-";
		break;
	case PAT_WRITE_PROTECTED:
		mode = "WP";
		break;
	case PAT_WRITE_COMBINING:
		mode = "WC";
		break;
	default:
		printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
		    __func__, pat_idx, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
	    mode, range->pdpes, range->pdes, range->ptes);

	/* Reset to sentinel value. */
	range->sva = la57 ?
	    KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1) :
	    KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.  This is not quite as simple as a direct
 * flag comparison since some PAT modes have multiple representations.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{
	pt_entry_t diff, mask;

	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
	diff = (range->attrs ^ attrs) & mask;
	if (diff == 0)
		return (true);
	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
	    pmap_pat_index(kernel_pmap, attrs, true))
		return (true);
	return (false);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
    pt_entry_t pte)
{
	pt_entry_t attrs;

	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);

	attrs |= pdpe & pg_nx;
	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
	if ((pdpe & PG_PS) != 0) {
		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pde != 0) {
		attrs |= pde & pg_nx;
		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
	}
	if ((pde & PG_PS) != 0) {
		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit. */
		if ((attrs & X86_PG_PTE_PAT) != 0)
			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
	}

	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t pml4e;
	pdp_entry_t *pdp, pdpe;
	pd_entry_t *pd, pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1);

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Outside of the large map, kernel page table pages are never
	 * freed, so at worst we will observe inconsistencies in the output.
	 * Within the large map, ensure that PDP and PD page addresses are
	 * valid before descending.
	 */
	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
#ifdef KASAN
		case KASANPML4I:
			sbuf_printf(sb, "\nKASAN shadow map:\n");
			break;
#endif
#ifdef KMSAN
		case KMSANSHADPML4I:
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
			break;
		case KMSANORIGPML4I:
			sbuf_printf(sb, "\nKMSAN origin map:\n");
			break;
#endif
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

		/* Convert to canonical form. */
		if (sva == 1ul << 47)
			sva |= -1ul << 48;

restart:
		pml4e = kernel_pml4[i];
		if ((pml4e & X86_PG_V) == 0) {
			sva = rounddown2(sva, NBPML4);
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPML4;
			continue;
		}
		pa = pml4e & PG_FRAME;
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
			pdpe = pdp[j];
			if ((pdpe & X86_PG_V) == 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_dump(sb, &range, sva);
				sva += NBPDP;
				continue;
			}
			pa = pdpe & PG_FRAME;
			if ((pdpe & PG_PS) != 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
				    0, 0);
				range.pdpes++;
				sva += NBPDP;
				continue;
			}
			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
			    vm_phys_paddr_to_vm_page(pa) == NULL) {
				/*
				 * Page table pages for the large map may be
				 * freed.  Validate the next-level address
				 * before descending.
				 */
				goto restart;
			}
			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
				pde = pd[k];
				if ((pde & X86_PG_V) == 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_dump(sb, &range, sva);
					sva += NBPDR;
					continue;
				}
				pa = pde & PG_FRAME;
				if ((pde & PG_PS) != 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, 0);
					range.pdes++;
					sva += NBPDR;
					continue;
				}
				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
				    vm_phys_paddr_to_vm_page(pa) == NULL) {
					/*
					 * Page table pages for the large map
					 * may be freed.  Validate the
					 * next-level address before descending.
					 */
					goto restart;
				}
				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
				    sva += PAGE_SIZE) {
					pte = pt[l];
					if ((pte & X86_PG_V) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, pte);
					range.ptes++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");

#ifdef DDB
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	pmap_t pmap;
	pml5_entry_t *pml5;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_offset_t va;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);
	db_printf("VA 0x%016lx", va);

	if (pmap_is_la57(pmap)) {
		pml5 = pmap_pml5e(pmap, va);
		db_printf(" pml5e 0x%016lx", *pml5);
		if ((*pml5 & PG_V) == 0) {
			db_printf("\n");
			return;
		}
		pml4 = pmap_pml5e_to_pml4e(pml5, va);
	} else {
		pml4 = pmap_pml4e(pmap, va);
	}
	db_printf(" pml4e 0x%016lx", *pml4);
	if ((*pml4 & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	pdp = pmap_pml4e_to_pdpe(pml4, va);
	db_printf(" pdpe 0x%016lx", *pdp);
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pde = pmap_pdpe_to_pde(pdp, va);
	db_printf(" pde 0x%016lx", *pde);
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_pde_to_pte(pde, va);
	db_printf(" pte 0x%016lx\n", *pte);
}

DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
{
	vm_paddr_t a;

	if (have_addr) {
		a = (vm_paddr_t)addr;
		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
	} else {
		db_printf("show phys2dmap addr\n");
	}
}

static void
ptpages_show_page(int level, int idx, vm_page_t pg)
{
	db_printf("l %d i %d pg %p phys %#lx ref %x\n",
	    level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
}

static void
ptpages_show_complain(int level, int idx, uint64_t pte)
{
	db_printf("l %d i %d pte %#lx\n", level, idx, pte);
}

static void
ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
{
	vm_page_t pg3, pg2, pg1;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	int i4, i3, i2;

	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
	for (i4 = 0; i4 < num_entries; i4++) {
		if ((pml4[i4] & PG_V) == 0)
			continue;
		pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
		if (pg3 == NULL) {
			ptpages_show_complain(3, i4, pml4[i4]);
			continue;
		}
		ptpages_show_page(3, i4, pg3);
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
		for (i3 = 0; i3 < NPDPEPG; i3++) {
			if ((pdp[i3] & PG_V) == 0)
				continue;
			pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
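			/*
			 * The frame referenced by this PDP entry has no
			 * vm_page backing it; report the raw entry and skip
			 * it rather than descending further.
			 */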
			if (pg2 == NULL) {
				ptpages_show_complain(2, i3, pdp[i3]);
				continue;
			}
			ptpages_show_page(2, i3, pg2);
			pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
			for (i2 = 0; i2 < NPDEPG; i2++) {
				if ((pd[i2] & PG_V) == 0)
					continue;
				pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
				if (pg1 == NULL) {
					ptpages_show_complain(1, i2, pd[i2]);
					continue;
				}
				ptpages_show_page(1, i2, pg1);
			}
		}
	}
}

DB_SHOW_COMMAND(ptpages, pmap_ptpages)
{
	pmap_t pmap;
	vm_page_t pg;
	pml5_entry_t *pml5;
	uint64_t PG_V;
	int i5;

	if (have_addr)
		pmap = (pmap_t)addr;
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);

	if (pmap_is_la57(pmap)) {
		pml5 = pmap->pm_pmltop;
		for (i5 = 0; i5 < NUPML5E; i5++) {
			if ((pml5[i5] & PG_V) == 0)
				continue;
			pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
			if (pg == NULL) {
				ptpages_show_complain(4, i5, pml5[i5]);
				continue;
			}
			ptpages_show_page(4, i5, pg);
			ptpages_show_pml4(pg, NPML4EPG, PG_V);
		}
	} else {
		ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
		    (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
	}
}
#endif