1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * the Systems Programming Group of the University of Utah Computer 17 * Science Department and William Jolitz of UUNET Technologies Inc. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 3. All advertising materials mentioning features or use of this software 28 * must display the following acknowledgement: 29 * This product includes software developed by the University of 30 * California, Berkeley and its contributors. 31 * 4. Neither the name of the University nor the names of its contributors 32 * may be used to endorse or promote products derived from this software 33 * without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 45 * SUCH DAMAGE. 46 * 47 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 48 */ 49 /*- 50 * Copyright (c) 2003 Networks Associates Technology, Inc. 51 * Copyright (c) 2014-2020 The FreeBSD Foundation 52 * All rights reserved. 53 * 54 * This software was developed for the FreeBSD Project by Jake Burkholder, 55 * Safeport Network Services, and Network Associates Laboratories, the 56 * Security Research Division of Network Associates, Inc. under 57 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 58 * CHATS research program. 59 * 60 * Portions of this software were developed by 61 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from 62 * the FreeBSD Foundation. 63 * 64 * Redistribution and use in source and binary forms, with or without 65 * modification, are permitted provided that the following conditions 66 * are met: 67 * 1. Redistributions of source code must retain the above copyright 68 * notice, this list of conditions and the following disclaimer. 69 * 2. Redistributions in binary form must reproduce the above copyright 70 * notice, this list of conditions and the following disclaimer in the 71 * documentation and/or other materials provided with the distribution. 72 * 73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 76 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 83 * SUCH DAMAGE. 84 */ 85 86 #define AMD64_NPT_AWARE 87 88 #include <sys/cdefs.h> 89 __FBSDID("$FreeBSD$"); 90 91 /* 92 * Manages physical address maps. 93 * 94 * Since the information managed by this module is 95 * also stored by the logical address mapping module, 96 * this module may throw away valid virtual-to-physical 97 * mappings at almost any time. However, invalidations 98 * of virtual-to-physical mappings must be done as 99 * requested. 100 * 101 * In order to cope with hardware architectures which 102 * make virtual-to-physical map invalidates expensive, 103 * this module may delay invalidate or reduced protection 104 * operations until such time as they are actually 105 * necessary. This module is given full information as 106 * to which processors are currently using which maps, 107 * and to when physical maps must be made correct. 
 */

#include "opt_ddb.h"
#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/asan.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/smr.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/asan.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <x86/ifunc.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/msan.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/sysarch.h>
#include <machine/tss.h>

/*
 * Number of pv chunk list domains: one per NUMA memory domain, or a
 * single domain on non-NUMA kernels.
 */
#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

/*
 * Return TRUE if the pmap uses a nested page table format on behalf of
 * a guest (Intel EPT or AMD RVI), rather than the host's native format.
 */
static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

/*
 * Return TRUE if accessed/dirty bits are emulated in software for this
 * pmap (PMAP_EMULATE_AD_BITS flag set).
 */
static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

/*
 * Return the PTE bit that marks a mapping as valid for this pmap type.
 * EPT has no dedicated valid bit; unless A/D emulation supplies one,
 * the read-permission bit is used in its place.
 */
static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the PTE bit that grants write permission for this pmap type.
 */
static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/* Global-mapping PTE bit for native pmaps; zero until set up elsewhere. */
static pt_entry_t pg_g;

/*
 * Return the PTE bit marking a global mapping.  The guest page table
 * formats (RVI, EPT) have no global bit.
 */
static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = pg_g;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the PTE "accessed" bit for this pmap type.  Under EPT A/D
 * emulation, read permission stands in for the accessed bit.
 */
static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the PTE "modified" (dirty) bit for this pmap type.  Under EPT
 * A/D emulation, write permission stands in for the dirty bit.
 */
static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Return the protection-key (PKU) bit mask; only native x86 page
 * tables carry protection keys.
 */
static __inline pt_entry_t
pmap_pku_mask_bit(pmap_t pmap)
{

	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
}

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

/* Evaluate x only when PV statistics are compiled in. */
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

/*
 * pv_table indexing: one entry per 2M (PDR) frame.  The NUMA variant
 * bounds-checks the physical address against the last vm_phys segment.
 */
#undef pa_index
#ifdef NUMA
#define	pa_index(pa)	({					\
	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
	    ("address %lx beyond the last segment", (pa)));	\
	(pa) >> PDRSHIFT;					\
})
#define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct rwlock *_lock;					\
	if (__predict_false((pa) > pmap_last_pa))		\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(pa_to_pmdp(pa)->pv_lock);		\
	_lock;							\
})
#else
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#endif

/*
 * Switch *lockp to the pv list lock for physical address pa, dropping
 * any different pv list lock previously held.
 */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

/* Drop and clear the currently held pv list lock, if any. */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
		PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

int __read_frequently la57 = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &la57, 0,
    "5-level paging for host is enabled");

/*
 * Return true if the pmap uses 5-level paging.  Only native pmaps
 * follow the host's la57 setting.
 */
static bool
pmap_is_la57(pmap_t pmap)
{
	if (pmap->pm_type == PT_X86)
		return (la57);
	return (false);		/* XXXKIB handle EPT */
}

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
u_int64_t		KPML5phys;	/* phys addr of kernel level 5,
					   if supported */

#ifdef KASAN
static uint64_t		KASANPDPphys;
#endif
#ifdef KMSAN
static uint64_t		KMSANSHADPDPphys;
static uint64_t		KMSANORIGPDPphys;

/*
 * To support systems with large amounts of memory, it is necessary to extend
 * the maximum size of the direct map.  This could eat into the space reserved
 * for the shadow map.
 */
_Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
#endif

static pml4_entry_t	*kernel_pml4;
static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

vm_paddr_t	kernphys;	/* phys addr of start of bootstrap data */
vm_paddr_t	KERNend;	/* and the end */

/*
 * pmap_mapdev support pre initialization (i.e. console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
 */
#ifdef NUMA
/* Map a pv chunk to its NUMA domain through the direct map. */
static __inline int
pc_to_domain(struct pv_chunk *pc)
{

	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
}
#else
/* Non-NUMA kernels keep everything in domain 0. */
static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
{

	return (0);
}
#endif

/* Per-domain pv chunk list, cache-line aligned to avoid false sharing. */
struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

#ifdef NUMA
/*
 * Per-superpage pv metadata: the pv list lock, the pv list itself, and
 * the delayed-invalidation generation for pages in that frame.
 */
struct pmap_large_md_page {
	struct rwlock   pv_lock;
	struct md_page  pv_page;
	u_long pv_invl_gen;
};

/* Catch-all entry used for physical addresses above pmap_last_pa. */
__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;
__read_mostly vm_paddr_t pmap_last_pa;
#else
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;
#endif

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

/* Arena backing the large-mapping kernel address region. */
static vmem_t *large_vmem;
static u_int lm_ents;
#define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= LARGEMAP_MIN_ADDRESS && \
	(va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)

int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");
int pmap_pcid_invlpg_workaround = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_invlpg_workaround, 0,
    "Enable small core PCID/INVLPG workaround");
int pmap_pcid_invlpg_workaround_uena = 1;

int __read_frequently pti = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pti, 0,
    "Page Table Isolation enabled");
static vm_object_t pti_obj;
static pml4_entry_t *pti_pml4;
static vm_pindex_t pti_pg_idx;
static bool pti_finalized;

/* One protection-key (PKRU) range element in a pmap's rangeset. */
struct pmap_pkru_range {
	struct rs_el	pkru_rs_el;
	u_int		pkru_keyidx;
	int		pkru_flags;
};

static uma_zone_t pmap_pkru_ranges_zone;
static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *pkru_dup_range(void *ctx, void *data);
static void pkru_free_range(void *ctx, void *node);
static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void pmap_pkru_deassign_all(pmap_t pmap);

static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD,
    &pcid_save_cnt, "Count of saved TLB context on switch");

/* Threads currently inside a locked delayed-invalidation (DI) block. */
static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
/* Fake lock object to satisfy turnstiles interface.
 */
static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
};
/* Head sentinel of the lockless DI chain. */
static struct pmap_invl_gen pmap_invl_gen_head = {
	.gen = 1,
	.next = NULL,
};
static u_long pmap_invl_gen = 1;	/* global generation, locked DI */
static int pmap_invl_waiters;		/* threads blocked waiting for DI */
static struct callout pmap_invl_callout;
static bool pmap_invl_callout_inited;

#define	PMAP_ASSERT_NOT_IN_DI() \
    KASSERT(pmap_not_in_di(), ("DI already started"))

/*
 * Choose the locked DI implementation: forced when the CPU lacks
 * cmpxchg16b (CX16), otherwise controlled by the vm.pmap.di_locked
 * tunable.
 */
static bool
pmap_di_locked(void)
{
	int tun;

	if ((cpu_feature2 & CPUID2_CX16) == 0)
		return (true);
	tun = 0;
	TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
	return (tun != 0);
}

/* Report the selected DI flavor through the vm.pmap.di_locked sysctl. */
static int
sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
{
	int locked;

	locked = pmap_di_locked();
	return (sysctl_handle_int(oidp, &locked, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
    CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
    "Locked delayed invalidation");

static bool pmap_not_in_di_l(void);
static bool pmap_not_in_di_u(void);
/* Resolve pmap_not_in_di() at boot to the locked or lockless variant. */
DEFINE_IFUNC(, bool, pmap_not_in_di, (void))
{

	return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
}

/* Locked DI: generation 0 means the thread is outside any DI block. */
static bool
pmap_not_in_di_l(void)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &curthread->td_md.md_invl_gen;
	return (invl_gen->gen == 0);
}

static void
pmap_thread_init_invl_gen_l(struct thread *td)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &td->td_md.md_invl_gen;
	invl_gen->gen = 0;
}

/*
 * Block on the DI turnstile until the global generation *invl_gen
 * catches up with *m_gen, re-checking under the turnstile to avoid a
 * missed wakeup.
 */
static void
pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
{
	struct turnstile *ts;

	ts = turnstile_trywait(&invl_gen_ts);
	if (*m_gen > atomic_load_long(invl_gen))
		turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
	else
		turnstile_cancel(ts);
}

/*
 * Publish a new global DI generation (when new_gen != 0) and wake all
 * threads blocked on the DI turnstile.
 */
static void
pmap_delayed_invl_finish_unblock(u_long new_gen)
{
	struct turnstile *ts;

	turnstile_chain_lock(&invl_gen_ts);
	ts = turnstile_lookup(&invl_gen_ts);
	if (new_gen != 0)
		pmap_invl_gen = new_gen;
	if (ts != NULL) {
		turnstile_broadcast(ts, TS_SHARED_QUEUE);
		turnstile_unpend(ts);
	}
	turnstile_chain_unlock(&invl_gen_ts);
}

/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 */
static void
pmap_delayed_invl_start_l(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	/* Our generation is one past the newest pending DI, if any. */
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finish_l(void)
{
	struct pmap_invl_gen *invl_gen, *next;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_start"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL)
		pmap_delayed_invl_finish_unblock(invl_gen->gen);
	else
		next->gen = invl_gen->gen;
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

/*
 * Lockless DI: a thread is outside any DI block when the INVALID bit
 * is set in its next pointer.
 */
static bool
pmap_not_in_di_u(void)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &curthread->td_md.md_invl_gen;
	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
}

static void
pmap_thread_init_invl_gen_u(struct thread *td)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &td->td_md.md_invl_gen;
	invl_gen->gen = 0;
	invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
}

/*
 * Atomically read the 16-byte (gen, next) pair at *ptr into *out using
 * cmpxchg16b with an all-zero expected value.  Returns false when the
 * entry is marked PMAP_INVL_GEN_NEXT_INVALID, true otherwise.
 */
static bool
pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
{
	uint64_t new_high, new_low, old_high, old_low;
	char res;

	old_low = new_low = 0;
	old_high = new_high = (uintptr_t)0;

	__asm volatile("lock;cmpxchg16b\t%1"
	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
	    : "b"(new_low), "c" (new_high)
	    : "memory", "cc");
	if (res == 0) {
		/* Compare failed: registers now hold the current value. */
		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
			return (false);
		out->gen = old_low;
		out->next = (void *)old_high;
	} else {
		/* *ptr was all-zero and remains so. */
		out->gen = new_low;
		out->next = (void *)new_high;
	}
	return (true);
}

/*
 * Atomically replace the 16-byte pair at *ptr with *new_val if it still
 * equals *old_val; returns nonzero on success.
 */
static bool
pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
    struct pmap_invl_gen *new_val)
{
	uint64_t new_high, new_low, old_high, old_low;
	char res;

	new_low = new_val->gen;
	new_high = (uintptr_t)new_val->next;
	old_low = old_val->gen;
	old_high = (uintptr_t)old_val->next;

	__asm volatile("lock;cmpxchg16b\t%1"
	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
	    : "b"(new_low), "c" (new_high)
	    : "memory", "cc");
	return (res);
}

static COUNTER_U64_DEFINE_EARLY(pv_page_count);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD,
    &pv_page_count, "Current number of allocated pv pages");

static COUNTER_U64_DEFINE_EARLY(user_pt_page_count);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD,
    &user_pt_page_count,
    "Current number of allocated page table pages for userspace");

static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD,
    &kernel_pt_page_count,
    "Current number of allocated page table pages for the kernel");

#ifdef PV_STATS

static COUNTER_U64_DEFINE_EARLY(invl_start_restart);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart,
    CTLFLAG_RD, &invl_start_restart,
    "Number of delayed TLB invalidation request restarts");

static COUNTER_U64_DEFINE_EARLY(invl_finish_restart);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
    &invl_finish_restart,
    "Number of delayed TLB invalidation completion restarts");

static int invl_max_qlen;
SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
    &invl_max_qlen, 0,
    "Maximum delayed TLB invalidation request queue length");
#endif

#define di_delay	locks_delay

/*
 * Lockless DI start: append this thread's invl_gen element at the tail
 * of the DI chain, with a generation one greater than the tail's.
 * Restarts from scratch whenever a concurrent insert or remove is
 * observed (INVALID bit seen, or the tail CAS fails).
 */
static void
pmap_delayed_invl_start_u(void)
{
	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
	struct thread *td;
	struct lock_delay_arg lda;
	uintptr_t prevl;
	u_char pri;
#ifdef PV_STATS
	int i, ii;
#endif

	td = curthread;
	invl_gen = &td->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	lock_delay_arg_init(&lda, &di_delay);
	invl_gen->saved_pri = 0;
	pri = td->td_base_pri;
	/* Cap our priority at PVM for the DI's duration; restored at finish. */
	if (pri > PVM) {
		thread_lock(td);
		pri = td->td_base_pri;
		if (pri > PVM) {
			invl_gen->saved_pri = pri;
			sched_prio(td, PVM);
		}
		thread_unlock(td);
	}
again:
	PV_STAT(i = 0);
	/* Walk the chain to the current tail (next == NULL). */
	for (p = &pmap_invl_gen_head;; p = prev.next) {
		PV_STAT(i++);
		prevl = (uintptr_t)atomic_load_ptr(&p->next);
		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
			PV_STAT(counter_u64_add(invl_start_restart, 1));
			lock_delay(&lda);
			goto again;
		}
		if (prevl == 0)
			break;
		prev.next = (void *)prevl;
	}
#ifdef PV_STATS
	if ((ii = invl_max_qlen) < i)
		atomic_cmpset_int(&invl_max_qlen, ii, i);
#endif

	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
		PV_STAT(counter_u64_add(invl_start_restart, 1));
		lock_delay(&lda);
		goto again;
	}

	new_prev.gen = prev.gen;
	new_prev.next = invl_gen;
	invl_gen->gen = prev.gen + 1;

	/* Formal fence between store to invl->gen and updating *p. */
	atomic_thread_fence_rel();

	/*
	 * After inserting an invl_gen element with invalid bit set,
	 * this thread blocks any other thread trying to enter the
	 * delayed invalidation block.  Do not allow to remove us from
	 * the CPU, because it causes starvation for other threads.
	 */
	critical_enter();

	/*
	 * ABA for *p is not possible there, since p->gen can only
	 * increase.  So if the *p thread finished its di, then
	 * started a new one and got inserted into the list at the
	 * same place, its gen will appear greater than the previously
	 * read gen.
	 */
	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
		critical_exit();
		PV_STAT(counter_u64_add(invl_start_restart, 1));
		lock_delay(&lda);
		goto again;
	}

	/*
	 * There we clear PMAP_INVL_GEN_NEXT_INVALID in
	 * invl_gen->next, allowing other threads to iterate past us.
	 * pmap_di_store_invl() provides fence between the generation
	 * write and the update of next.
 */
	invl_gen->next = NULL;
	critical_exit();
}

/*
 * Try to unlink invl_gen from the DI chain given its predecessor p:
 * publish our generation into p and splice our element out.  Returns
 * false when p's (gen, next) snapshot changed and the caller must
 * restart.
 */
static bool
pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
    struct pmap_invl_gen *p)
{
	struct pmap_invl_gen prev, new_prev;
	u_long mygen;

	/*
	 * Load invl_gen->gen after setting invl_gen->next
	 * PMAP_INVL_GEN_NEXT_INVALID.  This prevents larger
	 * generations to propagate to our invl_gen->gen.  Lock prefix
	 * in atomic_set_ptr() worked as seq_cst fence.
	 */
	mygen = atomic_load_long(&invl_gen->gen);

	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
		return (false);

	KASSERT(prev.gen < mygen,
	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
	new_prev.gen = mygen;
	new_prev.next = (void *)((uintptr_t)invl_gen->next &
	    ~PMAP_INVL_GEN_NEXT_INVALID);

	/* Formal fence between load of prev and storing update to it. */
	atomic_thread_fence_rel();

	return (pmap_di_store_invl(p, &prev, &new_prev));
}

/*
 * Lockless DI finish: locate our predecessor in the chain, mark our
 * element INVALID, splice it out, then wake any waiters and restore
 * the priority saved at start.  Retries on every observed race.
 */
static void
pmap_delayed_invl_finish_u(void)
{
	struct pmap_invl_gen *invl_gen, *p;
	struct thread *td;
	struct lock_delay_arg lda;
	uintptr_t prevl;

	td = curthread;
	invl_gen = &td->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
	    ("missed invl_start: INVALID"));
	lock_delay_arg_init(&lda, &di_delay);

again:
	/* Find the element whose next pointer is us. */
	for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
		prevl = (uintptr_t)atomic_load_ptr(&p->next);
		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
			PV_STAT(counter_u64_add(invl_finish_restart, 1));
			lock_delay(&lda);
			goto again;
		}
		if ((void *)prevl == invl_gen)
			break;
	}

	/*
	 * It is legitimate to not find ourself on the list if a
	 * thread before us finished its DI and started it again.
	 */
	if (__predict_false(p == NULL)) {
		PV_STAT(counter_u64_add(invl_finish_restart, 1));
		lock_delay(&lda);
		goto again;
	}

	critical_enter();
	atomic_set_ptr((uintptr_t *)&invl_gen->next,
	    PMAP_INVL_GEN_NEXT_INVALID);
	if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
		/* Lost the race; clear INVALID and retry from the walk. */
		atomic_clear_ptr((uintptr_t *)&invl_gen->next,
		    PMAP_INVL_GEN_NEXT_INVALID);
		critical_exit();
		PV_STAT(counter_u64_add(invl_finish_restart, 1));
		lock_delay(&lda);
		goto again;
	}
	critical_exit();
	if (atomic_load_int(&pmap_invl_waiters) > 0)
		pmap_delayed_invl_finish_unblock(0);
	/* Restore the priority saved by pmap_delayed_invl_start_u(). */
	if (invl_gen->saved_pri != 0) {
		thread_lock(td);
		sched_prio(td, invl_gen->saved_pri);
		thread_unlock(td);
	}
}

#ifdef DDB
/* ddb "show di_queue": dump the lockless DI chain. */
DB_SHOW_COMMAND(di_queue, pmap_di_queue)
{
	struct pmap_invl_gen *p, *pn;
	struct thread *td;
	uintptr_t nextl;
	bool first;

	for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
	    first = false) {
		nextl = (uintptr_t)atomic_load_ptr(&p->next);
		pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
		td = first ? NULL : __containerof(p, struct thread,
		    td_md.md_invl_gen);
		db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
		    (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
		    td != NULL ? td->td_tid : -1);
	}
}
#endif

#ifdef PV_STATS
static COUNTER_U64_DEFINE_EARLY(invl_wait);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait,
    CTLFLAG_RD, &invl_wait,
    "Number of times DI invalidation blocked pmap_remove_all/write");

static COUNTER_U64_DEFINE_EARLY(invl_wait_slow);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD,
    &invl_wait_slow, "Number of slow invalidation waits for lockless DI");

#endif

#ifdef NUMA
/*
 * Return the DI generation counter tracking page m.  Pages beyond
 * pmap_last_pa share the dummy entry's counter.
 */
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{
	vm_paddr_t pa;
	u_long *gen;

	pa = VM_PAGE_TO_PHYS(m);
	if (__predict_false((pa) > pmap_last_pa))
		gen = &pv_dummy_large.pv_invl_gen;
	else
		gen = &(pa_to_pmdp(pa)->pv_invl_gen);

	return (gen);
}
#else
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}
#endif

/* 1-tick callout: unblock DI waiters that lost the wakeup race. */
static void
pmap_delayed_invl_callout_func(void *arg __unused)
{

	if (atomic_load_int(&pmap_invl_waiters) == 0)
		return;
	pmap_delayed_invl_finish_unblock(0);
}

/* Arm the callout machinery only when lockless DI is in use. */
static void
pmap_delayed_invl_callout_init(void *arg __unused)
{

	if (pmap_di_locked())
		return;
	callout_init(&pmap_invl_callout, 1);
	pmap_invl_callout_inited = true;
}
SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
    pmap_delayed_invl_callout_init, NULL);

/*
 * Ensure that all currently executing DI blocks, that need to flush
 * TLB for the given page m, actually flushed the TLB at the time the
 * function returned.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait_l(vm_page_t m)
{
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;	/* count at most one invl_wait per call */
#endif

	/* Locked-DI variant: spin/block until pmap_invl_gen catches up. */
	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			counter_u64_add(invl_wait, 1);
			accounted = true;
		}
#endif
		pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
	}
}

/*
 * Lockless-DI variant of pmap_delayed_invl_wait().  First tries a
 * short adaptive delay (the "fast" path), then falls back to blocking
 * with a callout as a safety net against a missed wakeup.
 */
static void
pmap_delayed_invl_wait_u(vm_page_t m)
{
	u_long *m_gen;
	struct lock_delay_arg lda;
	bool fast;

	fast = true;
	m_gen = pmap_delayed_invl_genp(m);
	lock_delay_arg_init(&lda, &di_delay);
	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
		if (fast || !pmap_invl_callout_inited) {
			PV_STAT(counter_u64_add(invl_wait, 1));
			lock_delay(&lda);
			fast = false;
		} else {
			/*
			 * The page's invalidation generation number
			 * is still below the current thread's number.
			 * Prepare to block so that we do not waste
			 * CPU cycles or worse, suffer livelock.
			 *
			 * Since it is impossible to block without
			 * racing with pmap_delayed_invl_finish_u(),
			 * prepare for the race by incrementing
			 * pmap_invl_waiters and arming a 1-tick
			 * callout which will unblock us if we lose
			 * the race.
			 */
			atomic_add_int(&pmap_invl_waiters, 1);

			/*
			 * Re-check the current thread's invalidation
			 * generation after incrementing
			 * pmap_invl_waiters, so that there is no race
			 * with pmap_delayed_invl_finish_u() setting
			 * the page generation and checking
			 * pmap_invl_waiters.  The only race allowed
			 * is for a missed unblock, which is handled
			 * by the callout.
			 */
			if (*m_gen >
			    atomic_load_long(&pmap_invl_gen_head.gen)) {
				callout_reset(&pmap_invl_callout, 1,
				    pmap_delayed_invl_callout_func, NULL);
				PV_STAT(counter_u64_add(invl_wait_slow, 1));
				pmap_delayed_invl_wait_block(m_gen,
				    &pmap_invl_gen_head.gen);
			}
			atomic_add_int(&pmap_invl_waiters, -1);
		}
	}
}

/*
 * Resolve the DI entry points at boot time to either the locked or
 * the lockless implementation, selected by pmap_di_locked().
 */
DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *))
{

	return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
	    pmap_thread_init_invl_gen_u);
}

DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void))
{

	return (pmap_di_locked() ? pmap_delayed_invl_start_l :
	    pmap_delayed_invl_start_u);
}

DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void))
{

	return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
	    pmap_delayed_invl_finish_u);
}

DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t))
{

	return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
	    pmap_delayed_invl_wait_u);
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * current thread calls pmap_delayed_invl_finish().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	/* A zero generation means the thread is not inside a DI block. */
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	/* Only ever advance the page's generation, never move it back. */
	if (*m_gen < gen)
		*m_gen = gen;
}

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

/*
 * Internal flags for pmap_mapdev_internal() and
 * pmap_change_props_locked().
 */
#define	MAPDEV_FLUSHCACHE	0x00000001	/* Flush cache after mapping. */
#define	MAPDEV_SETATTR		0x00000002	/* Modify existing attrs. */
#define	MAPDEV_ASSERTVALID	0x00000004	/* Assert mapping validity. */

TAILQ_HEAD(pv_chunklist, pv_chunk);

/* Forward declarations: PV entry and PV chunk management helpers. */
static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
		    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

/* Forward declarations: mapping creation, demotion, and removal. */
static void	pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static int	pmap_change_props_locked(vm_offset_t va, vm_size_t size,
		    vm_prot_t prot, int mode, int flags);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
		    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
		    vm_offset_t va);
static int	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    vm_prot_t prot, struct rwlock **lockp);
static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, vm_prot_t prot, vm_page_t mpte,
		    struct rwlock **lockp);
static void	pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int	pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
static void	pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
		    vm_offset_t eva);
static void	pmap_invalidate_cache_range_all(vm_offset_t sva,
		    vm_offset_t eva);
static void	pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
		    pd_entry_t pde);
static void	pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_large_map_getptp_unlocked(void);
static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
#if VM_NRESERVLEVEL > 0
static void	pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
		    vm_page_t mpte, struct rwlock **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde,
		    vm_offset_t sva, vm_prot_t prot);
static void	pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
static void	pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
		    bool exec);
static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void	pmap_pti_wire_pte(void *pte);
static int	pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
		    struct spglist *free, struct rwlock **lockp);
static int	pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
		    pd_entry_t ptepde, struct spglist *free,
		    struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void	pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
		    struct spglist *free);
static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
		    pd_entry_t *pde, struct spglist *free,
		    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, struct rwlock **lockp);
static void	pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
		    pd_entry_t newpde);
static void	pmap_update_pde_invalidate(pmap_t, vm_offset_t va,
		    pd_entry_t pde);

/* Forward declarations: page table page allocation. */
static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va,
		    vm_page_t *pdpgp, struct rwlock **lockp);
static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex,
		    struct rwlock **lockp, vm_offset_t va);
static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex,
		    struct rwlock **lockp, vm_offset_t va);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		    struct rwlock **lockp);

static void	_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    struct spglist *free);
static int	pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t,
		    struct spglist *);

static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int);
static void	pmap_free_pt_page(pmap_t, vm_page_t, bool);

/********************/
/* Inline functions */
/********************/

/*
 * Return the non-clipped index for a given VA, which is the page
 * table page's index at the corresponding level.
 */
/* Linear pindex of the PD page mapping va (PD pages come first). */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}

/* Linear pindex of the PDP page mapping va, offset past all PD pages. */
static __inline vm_pindex_t
pmap_pdpe_pindex(vm_offset_t va)
{
	return (NUPDE + (va >> PDPSHIFT));
}

/* Linear pindex of the PML4 page mapping va. */
static __inline vm_pindex_t
pmap_pml4e_pindex(vm_offset_t va)
{
	return (NUPDE + NUPDPE + (va >> PML4SHIFT));
}

/* Linear pindex of the PML5 page mapping va (LA57 only). */
static __inline vm_pindex_t
pmap_pml5e_pindex(vm_offset_t va)
{
	return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
}

/* Return a pointer to the PML5 slot for va; requires an LA57 pmap. */
static __inline pml4_entry_t *
pmap_pml5e(pmap_t pmap, vm_offset_t va)
{

	MPASS(pmap_is_la57(pmap));
	return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
}

/* Same, but in the user (PTI) copy of the top-level page table. */
static __inline pml4_entry_t *
pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
{

	MPASS(pmap_is_la57(pmap));
	return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
}

/* Descend from a PML5 entry to the PML4 slot for va via the DMAP. */
static __inline pml4_entry_t *
pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	/* XXX MPASS(pmap_is_la57(pmap); */
	pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
	return (&pml4e[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	pml5_entry_t *pml5e;
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	if (pmap_is_la57(pmap)) {
		/* With LA57 the PML4 hangs off a PML5 entry. */
		pml5e = pmap_pml5e(pmap, va);
		PG_V = pmap_valid_bit(pmap);
		if ((*pml5e & PG_V) == 0)
			return (NULL);
		pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
	} else {
		/* Without LA57 the top-level table is the PML4 itself. */
		pml4e = pmap->pm_pmltop;
	}
	return (&pml4e[pmap_pml4e_index(va)]);
}

/* PML4 slot for va in the user (PTI) top-level table; non-LA57 only. */
static __inline pml4_entry_t *
pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
{
	MPASS(!pmap_is_la57(pmap));
	return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	/* NULL if the upper-level entry is missing or not valid. */
	if (pml4e == NULL || (*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	KASSERT((*pdpe & PG_PS) == 0,
	    ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	KASSERT((*pdpe & PG_PS) == 0,
	    ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	KASSERT((*pde & PG_PS) == 0,
	    ("%s: pde %#lx is a leaf", __func__, *pde));
	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

/*
 * Adjust the pmap's resident page count by count; asserts the pmap
 * lock is held and that the count does not underflow.
 */
static __inline void
pmap_resident_count_adj(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count + count >= 0,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count += count;
}

/*
 * Same adjustment without the lock assertion; used during pmap
 * initialization (see callers) when the lock requirement differs.
 */
static __inline void
pmap_pt_page_count_pinit(pmap_t pmap, int count)
{
	KASSERT(pmap->pm_stats.resident_count + count >= 0,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count += count;
}

/*
 * Track page table page counts: kernel PT pages in one counter, user
 * PT pages in another, also updating the user pmap's resident count.
 */
static __inline void
pmap_pt_page_count_adj(pmap_t pmap, int count)
{
	if (pmap == kernel_pmap)
		counter_u64_add(kernel_pt_page_count, count);
	else {
		if (pmap != NULL)
			pmap_resident_count_adj(pmap, count);
		counter_u64_add(user_pt_page_count, count);
	}
}

pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
    NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3;
vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap;

/*
 * Return the recursive-mapping PTE address for a kernel VA; only
 * valid for kernel addresses (asserted).
 */
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem)));
}

pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
    NPML4EPGSHIFT)) - 1) << 3;
vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap;
static __inline pd_entry_t * 1542 vtopde(vm_offset_t va) 1543 { 1544 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1545 1546 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem))); 1547 } 1548 1549 static u_int64_t 1550 allocpages(vm_paddr_t *firstaddr, int n) 1551 { 1552 u_int64_t ret; 1553 1554 ret = *firstaddr; 1555 bzero((void *)ret, n * PAGE_SIZE); 1556 *firstaddr += n * PAGE_SIZE; 1557 return (ret); 1558 } 1559 1560 CTASSERT(powerof2(NDMPML4E)); 1561 1562 /* number of kernel PDP slots */ 1563 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1564 1565 static void 1566 nkpt_init(vm_paddr_t addr) 1567 { 1568 int pt_pages; 1569 1570 #ifdef NKPT 1571 pt_pages = NKPT; 1572 #else 1573 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1574 pt_pages += NKPDPE(pt_pages); 1575 1576 /* 1577 * Add some slop beyond the bare minimum required for bootstrapping 1578 * the kernel. 1579 * 1580 * This is quite important when allocating KVA for kernel modules. 1581 * The modules are required to be linked in the negative 2GB of 1582 * the address space. If we run out of KVA in this region then 1583 * pmap_growkernel() will need to allocate page table pages to map 1584 * the entire 512GB of KVA space which is an unnecessary tax on 1585 * physical memory. 1586 * 1587 * Secondly, device memory mapped as part of setting up the low- 1588 * level console(s) is taken from KVA, starting at virtual_avail. 1589 * This is because cninit() is called after pmap_bootstrap() but 1590 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1591 * not uncommon. 1592 */ 1593 pt_pages += 32; /* 64MB additional slop. */ 1594 #endif 1595 nkpt = pt_pages; 1596 } 1597 1598 /* 1599 * Returns the proper write/execute permission for a physical page that is 1600 * part of the initial boot allocations. 1601 * 1602 * If the page has kernel text, it is marked as read-only. 
If the page has 1603 * kernel read-only data, it is marked as read-only/not-executable. If the 1604 * page has only read-write data, it is marked as read-write/not-executable. 1605 * If the page is below/above the kernel range, it is marked as read-write. 1606 * 1607 * This function operates on 2M pages, since we map the kernel space that 1608 * way. 1609 */ 1610 static inline pt_entry_t 1611 bootaddr_rwx(vm_paddr_t pa) 1612 { 1613 /* 1614 * The kernel is loaded at a 2MB-aligned address, and memory below that 1615 * need not be executable. The .bss section is padded to a 2MB 1616 * boundary, so memory following the kernel need not be executable 1617 * either. Preloaded kernel modules have their mapping permissions 1618 * fixed up by the linker. 1619 */ 1620 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1621 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1622 return (X86_PG_RW | pg_nx); 1623 1624 /* 1625 * The linker should ensure that the read-only and read-write 1626 * portions don't share the same 2M page, so this shouldn't 1627 * impact read-only data. However, in any case, any page with 1628 * read-write data needs to be read-write. 1629 */ 1630 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1631 return (X86_PG_RW | pg_nx); 1632 1633 /* 1634 * Mark any 2M page containing kernel text as read-only. Mark 1635 * other pages with read-only data as read-only and not executable. 1636 * (It is likely a small portion of the read-only data section will 1637 * be marked as read-only, but executable. This should be acceptable 1638 * since the read-only protection will keep the data from changing.) 1639 * Note that fixups to the .text section will still work until we 1640 * set CR0.WP. 
1641 */ 1642 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1643 return (0); 1644 return (pg_nx); 1645 } 1646 1647 static void 1648 create_pagetables(vm_paddr_t *firstaddr) 1649 { 1650 pd_entry_t *pd_p; 1651 pdp_entry_t *pdp_p; 1652 pml4_entry_t *p4_p; 1653 uint64_t DMPDkernphys; 1654 vm_paddr_t pax; 1655 #ifdef KASAN 1656 pt_entry_t *pt_p; 1657 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1658 vm_offset_t kasankernbase; 1659 int kasankpdpi, kasankpdi, nkasanpte; 1660 #endif 1661 int i, j, ndm1g, nkpdpe, nkdmpde; 1662 1663 /* Allocate page table pages for the direct map */ 1664 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1665 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1666 ndmpdp = 4; 1667 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1668 if (ndmpdpphys > NDMPML4E) { 1669 /* 1670 * Each NDMPML4E allows 512 GB, so limit to that, 1671 * and then readjust ndmpdp and ndmpdpphys. 1672 */ 1673 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1674 Maxmem = atop(NDMPML4E * NBPML4); 1675 ndmpdpphys = NDMPML4E; 1676 ndmpdp = NDMPML4E * NPDEPG; 1677 } 1678 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1679 ndm1g = 0; 1680 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1681 /* 1682 * Calculate the number of 1G pages that will fully fit in 1683 * Maxmem. 1684 */ 1685 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1686 1687 /* 1688 * Allocate 2M pages for the kernel. These will be used in 1689 * place of the one or more 1G pages from ndm1g that maps 1690 * kernel memory into DMAP. 1691 */ 1692 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1693 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1694 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1695 } 1696 if (ndm1g < ndmpdp) 1697 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1698 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1699 1700 /* Allocate pages. 
	 */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
#ifdef KASAN
	KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
	KASANPDphys = allocpages(firstaddr, 1);
#endif
#ifdef KMSAN
	/*
	 * The KMSAN shadow maps are initially left unpopulated, since there is
	 * no need to shadow memory above KERNBASE.
	 */
	KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E);
	KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E);
#endif

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

#ifdef KASAN
	/* One shadow PT page covers KASAN_SHADOW_SCALE kernel PT pages. */
	nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE);
	KASANPTphys = allocpages(firstaddr, nkasanpte);
	KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG);
#endif

	/*
	 * Connect the zero-filled PT pages to their PD entries.  This
	 * implicitly maps the PT pages at their correct locations within
	 * the PTmap.
	 */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/*
	 * Map from start of the kernel in physical memory (staging
	 * area) to the end of loader preallocated memory using 2MB
	 * pages.  This replaces some of the PD entries created above.
	 * For compatibility, identity map 2M at the start.
	 */
	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
	    X86_PG_RW | pg_nx;
	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
		    X86_PG_A | bootaddr_rwx(pax);
	}

	/*
	 * Because we map the physical blocks in 2M pages, adjust firstaddr
	 * to record the physical blocks we've actually mapped into kernel
	 * virtual address space.
	 */
	if (*firstaddr < round_2mpage(KERNend))
		*firstaddr = round_2mpage(KERNend);

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

#ifdef KASAN
	/* Wire the KASAN shadow PD/PT/page hierarchy for the kernel base. */
	kasankernbase = kasan_md_addr_to_shad(KERNBASE);
	kasankpdpi = pmap_pdpe_index(kasankernbase);
	kasankpdi = pmap_pde_index(kasankernbase);

	pdp_p = (pdp_entry_t *)KASANPDPphys;
	pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx);

	pd_p = (pd_entry_t *)KASANPDphys;
	for (i = 0; i < nkasanpte; i++)
		pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW |
		    X86_PG_V | pg_nx;

	pt_p = (pt_entry_t *)KASANPTphys;
	for (i = 0; i < nkasanpte * NPTEPG; i++)
		pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    X86_PG_M | X86_PG_A | pg_nx;
#endif

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
		    X86_PG_M | X86_PG_A | pg_nx;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
		    X86_PG_M | X86_PG_A | pg_nx;
	}
	/* Residual (non-1G) portion of the direct map uses 2M PD pages. */
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
	}

	/*
	 * Instead of using a 1G page for the memory containing the kernel,
	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
	 * pages, this will partially overwrite the PDPEs above.)
	 */
	if (ndm1g > 0) {
		pd_p = (pd_entry_t *)DMPDkernphys;
		for (i = 0, pax = rounddown2(kernphys, NBPDP);
		    i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
			pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
			    X86_PG_A | pg_nx | bootaddr_rwx(pax);
		}
		j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
		for (i = 0; i < nkdmpde; i++) {
			pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
			    X86_PG_RW | X86_PG_V | pg_nx;
		}
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;

#ifdef KASAN
	/* Connect the KASAN shadow map slots up to the PML4. */
	for (i = 0; i < NKASANPML4E; i++) {
		p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i);
		p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
	}
#endif

#ifdef KMSAN
	/* Connect the KMSAN shadow map slots up to the PML4.
	 */
	for (i = 0; i < NKMSANSHADPML4E; i++) {
		p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i);
		p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
	}

	/* Connect the KMSAN origin map slots up to the PML4. */
	for (i = 0; i < NKMSANORIGPML4E; i++) {
		p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i);
		p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
	}
#endif

	/* Connect the Direct Map slots up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
	}

	kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On amd64 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *pcpu_pte;
	struct region_descriptor r_gdt;
	uint64_t cr4, pcpu_phys;
	u_long res;
	int i;

	KERNend = *firstaddr;
	res = atop(KERNend - (vm_paddr_t)kernphys);

	/* PTI precludes global kernel pages; otherwise enable PG_G. */
	if (!pti)
		pg_g = X86_PG_G;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	pcpu_phys = allocpages(firstaddr, MAXCPU);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	/*
	 * Account for the virtual addresses mapped by create_pagetables().
	 */
	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
	    (vm_paddr_t)kernphys);
	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Enable PG_G global pages, then switch to the kernel page
	 * table from the bootstrap page table.  After the switch, it
	 * is possible to enable SMEP and SMAP since PG_U bits are
	 * correct now.
	 */
	cr4 = rcr4();
	cr4 |= CR4_PGE;
	load_cr4(cr4);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		cr4 |= CR4_SMEP;
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cr4 |= CR4_SMAP;
	load_cr4(cr4);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 * Count bootstrap data as being resident in case any of this data is
	 * later unmapped (using pmap_remove()) and freed.
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pmltop = kernel_pml4;
	kernel_pmap->pm_cr3 = KPML4phys;
	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_stats.resident_count = res;
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * The kernel pmap is always active on all CPUs.  Once CPUs are
	 * enumerated, the mask will be set equal to all_cpus.
	 */
	CPU_FILL(&kernel_pmap->pm_active);

	/*
	 * Initialize the TLB invalidations generation number lock.
	 */
	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	CADDR1 = crashdumpmap;

	SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
	virtual_avail = va;

	/* Map the per-CPU pages writable, global, and non-executable. */
	for (i = 0; i < MAXCPU; i++) {
		pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
		    pg_g | pg_nx | X86_PG_M | X86_PG_A;
	}

	/*
	 * Re-initialize PCPU area for BSP after switching.
	 * Make hardware use gdt and common_tss from the new PCPU.
	 */
	STAILQ_INIT(&cpuhead);
	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
	pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
	amd64_bsp_pcpu_init1(&__pcpu[0]);
	amd64_bsp_ist_init(&__pcpu[0]);
	__pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;
	memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT *
	    sizeof(struct user_segment_descriptor));
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
	r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
	r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
	lgdt(&r_gdt);
	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
	ltr(GSEL(GPROC0_SEL, SEL_KPL));
	__pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
	__pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;

	/*
	 * Initialize the PAT MSR.
2021 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2022 * side-effect, invalidates stale PG_G TLB entries that might 2023 * have been created in our pre-boot environment. 2024 */ 2025 pmap_init_pat(); 2026 2027 /* Initialize TLB Context Id. */ 2028 if (pmap_pcid_enabled) { 2029 for (i = 0; i < MAXCPU; i++) { 2030 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 2031 kernel_pmap->pm_pcids[i].pm_gen = 1; 2032 } 2033 2034 /* 2035 * PMAP_PCID_KERN + 1 is used for initialization of 2036 * proc0 pmap. The pmap' pcid state might be used by 2037 * EFIRT entry before first context switch, so it 2038 * needs to be valid. 2039 */ 2040 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2041 PCPU_SET(pcid_gen, 1); 2042 2043 /* 2044 * pcpu area for APs is zeroed during AP startup. 2045 * pc_pcid_next and pc_pcid_gen are initialized by AP 2046 * during pcpu setup. 2047 */ 2048 load_cr4(rcr4() | CR4_PCIDE); 2049 } 2050 } 2051 2052 /* 2053 * Setup the PAT MSR. 2054 */ 2055 void 2056 pmap_init_pat(void) 2057 { 2058 uint64_t pat_msr; 2059 u_long cr0, cr4; 2060 int i; 2061 2062 /* Bail if this CPU doesn't implement PAT. */ 2063 if ((cpu_feature & CPUID_PAT) == 0) 2064 panic("no PAT??"); 2065 2066 /* Set default PAT index table. */ 2067 for (i = 0; i < PAT_INDEX_SIZE; i++) 2068 pat_index[i] = -1; 2069 pat_index[PAT_WRITE_BACK] = 0; 2070 pat_index[PAT_WRITE_THROUGH] = 1; 2071 pat_index[PAT_UNCACHEABLE] = 3; 2072 pat_index[PAT_WRITE_COMBINING] = 6; 2073 pat_index[PAT_WRITE_PROTECTED] = 5; 2074 pat_index[PAT_UNCACHED] = 2; 2075 2076 /* 2077 * Initialize default PAT entries. 2078 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2079 * Program 5 and 6 as WP and WC. 2080 * 2081 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2082 * mapping for a 2M page uses a PAT value with the bit 3 set due 2083 * to its overload with PG_PS. 
2084 */ 2085 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2086 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2087 PAT_VALUE(2, PAT_UNCACHED) | 2088 PAT_VALUE(3, PAT_UNCACHEABLE) | 2089 PAT_VALUE(4, PAT_WRITE_BACK) | 2090 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2091 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2092 PAT_VALUE(7, PAT_UNCACHEABLE); 2093 2094 /* Disable PGE. */ 2095 cr4 = rcr4(); 2096 load_cr4(cr4 & ~CR4_PGE); 2097 2098 /* Disable caches (CD = 1, NW = 0). */ 2099 cr0 = rcr0(); 2100 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2101 2102 /* Flushes caches and TLBs. */ 2103 wbinvd(); 2104 invltlb(); 2105 2106 /* Update PAT and index table. */ 2107 wrmsr(MSR_PAT, pat_msr); 2108 2109 /* Flush caches and TLBs again. */ 2110 wbinvd(); 2111 invltlb(); 2112 2113 /* Restore caches and PGE. */ 2114 load_cr0(cr0); 2115 load_cr4(cr4); 2116 } 2117 2118 vm_page_t 2119 pmap_page_alloc_below_4g(bool zeroed) 2120 { 2121 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2122 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2123 } 2124 2125 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2126 la57_trampoline_gdt[], la57_trampoline_end[]; 2127 2128 static void 2129 pmap_bootstrap_la57(void *arg __unused) 2130 { 2131 char *v_code; 2132 pml5_entry_t *v_pml5; 2133 pml4_entry_t *v_pml4; 2134 pdp_entry_t *v_pdp; 2135 pd_entry_t *v_pd; 2136 pt_entry_t *v_pt; 2137 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2138 void (*la57_tramp)(uint64_t pml5); 2139 struct region_descriptor r_gdt; 2140 2141 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2142 return; 2143 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2144 if (!la57) 2145 return; 2146 2147 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2148 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2149 2150 m_code = pmap_page_alloc_below_4g(true); 2151 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2152 m_pml5 = pmap_page_alloc_below_4g(true); 2153 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2154 v_pml5 = 
	    (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
	m_pml4 = pmap_page_alloc_below_4g(true);
	v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
	m_pdp = pmap_page_alloc_below_4g(true);
	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
	m_pd = pmap_page_alloc_below_4g(true);
	v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
	m_pt = pmap_page_alloc_below_4g(true);
	v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));

	/*
	 * Map m_code 1:1, it appears below 4G in KVA due to physical
	 * address being below 4G.  Since kernel KVA is in upper half,
	 * the pml4e should be zero and free for temporary use.
	 */
	kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
	    X86_PG_M;
	v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
	    VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
	    X86_PG_M;
	v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
	    VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
	    X86_PG_M;
	v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
	    VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
	    X86_PG_M;

	/*
	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
	 * entering all existing kernel mappings into level 5 table.
	 */
	v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
	    X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;

	/*
	 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
	 */
	v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
	    VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
	    X86_PG_M;
	v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
	    X86_PG_M;

	/*
	 * Copy and call the 48->57 trampoline, hope we return there, alive.
	 * The word at offset la57_trampoline_gdt_desc + 2 is patched with
	 * the physical address of the trampoline's gdt.
	 */
	bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
	*(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
	    la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
	la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
	invlpg((vm_offset_t)la57_tramp);
	la57_tramp(KPML5phys);

	/*
	 * gdt was necessarily reset by the trampoline, switch back to
	 * our gdt.
	 */
	lgdt(&r_gdt);
	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
	ltr(GSEL(GPROC0_SEL, SEL_KPL));

	/*
	 * Now unmap the trampoline, and free the pages.
	 * Clear pml5 entry used for 1:1 trampoline mapping.
	 * Note m_pml5 and m_pml4 are intentionally NOT freed: they are
	 * now the live PML5 root and the KVA-covering PML4.
	 */
	pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
	invlpg((vm_offset_t)v_code);
	vm_page_free(m_code);
	vm_page_free(m_pdp);
	vm_page_free(m_pd);
	vm_page_free(m_pt);

	/*
	 * Recursively map PML5 to itself in order to get PTmap and
	 * PDmap.
	 */
	v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;

	/* Recompute the recursive-mapping offsets for 5-level paging. */
	vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
	PTmap = (vm_offset_t)P5Tmap;
	vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
	PDmap = (vm_offset_t)P5Dmap;

	kernel_pmap->pm_cr3 = KPML5phys;
	kernel_pmap->pm_pmltop = v_pml5;
	pmap_pt_page_count_adj(kernel_pmap, 1);
}
SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

static int pmap_allow_2m_x_ept;
SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
    &pmap_allow_2m_x_ept, 0,
    "Allow executable superpage mappings in EPT");

/*
 * Decide whether executable 2M EPT mappings are safe on this CPU and
 * set pmap_allow_2m_x_ept accordingly; the hw.allow_2m_x_ept tunable
 * can override the automatic choice.
 */
void
pmap_allow_2m_x_ept_recalculate(void)
{
	/*
	 * SKL002, SKL012S.  Since the EPT format is only used by
	 * Intel CPUs, the vendor check is merely a formality.
	 * Allowed when: not Intel, or the CPU reports the
	 * IF_PSCHANGE_MC_NO mitigation, or it is one of the family 6
	 * models listed below that are unaffected by the erratum.
	 */
	if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
	    (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
	    (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	    (CPUID_TO_MODEL(cpu_id) == 0x26 ||	/* Atoms */
	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
	    CPUID_TO_MODEL(cpu_id) == 0x37 ||
	    CPUID_TO_MODEL(cpu_id) == 0x86 ||
	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
	    CPUID_TO_MODEL(cpu_id) == 0x5c ||
	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
	    CPUID_TO_MODEL(cpu_id) == 0x5f ||
	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
	    CPUID_TO_MODEL(cpu_id) == 0x7a ||
	    CPUID_TO_MODEL(cpu_id) == 0x57 ||	/* Knights */
	    CPUID_TO_MODEL(cpu_id) == 0x85))))
		pmap_allow_2m_x_ept = 1;
	TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
}

/*
 * Return true if a 2M superpage mapping is permitted: always, except
 * for executable mappings in an EPT pmap when the workaround above is
 * in effect.
 */
static bool
pmap_allow_2m_x_page(pmap_t pmap, bool executable)
{

	return (pmap->pm_type != PT_EPT || !executable ||
	    !pmap_allow_2m_x_ept);
}

#ifdef NUMA
/*
 * NUMA-aware pv table setup: back each physical segment's slice of the
 * table with pages allocated from that segment's domain.
 */
static void
pmap_init_pv_table(void)
{
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	long start, end, highest, pv_npg;
	int domain, i, j, pages;

	/*
	 * For correctness we depend on the size being evenly divisible into a
	 * page.  As a tradeoff between performance and total memory use, the
	 * entry is 64 bytes (aka one cacheline) in size.  Not being smaller
	 * avoids false-sharing, but not being 128 bytes potentially allows for
	 * avoidable traffic due to adjacent cacheline prefetcher.
	 *
	 * Assert the size so that accidental changes fail to compile.
	 */
	CTASSERT((sizeof(*pvd) == 64));

	/*
	 * Calculate the size of the array.
	 */
	pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end;
	pv_npg = howmany(pmap_last_pa, NBPDR);
	s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
	s = round_page(s);
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate space for respective pages.
	 */
	highest = -1;
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		end = vm_phys_segs[i].end / NBPDR;
		domain = vm_phys_segs[i].domain;

		/* Segments may share a 2M chunk already covered. */
		if (highest >= end)
			continue;

		start = highest + 1;
		pvd = &pv_table[start];

		pages = end - start + 1;
		s = round_page(pages * sizeof(*pvd));
		highest = start + (s / sizeof(*pvd)) - 1;

		/* Back this slice of the table with domain-local pages. */
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain, 0);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}

		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd->pv_page.pv_gen = 0;
			pvd->pv_page.pat_mode = 0;
			pvd->pv_invl_gen = 0;
			pvd++;
		}
	}
	pvd = &pv_dummy_large;
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);
	pvd->pv_page.pv_gen = 0;
	pvd->pv_page.pat_mode = 0;
	pvd->pv_invl_gen = 0;
}
#else
static void
pmap_init_pv_table(void)
{
	vm_size_t s;
	long i, pv_npg;

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)pv_npg * sizeof(struct md_page);
	s = round_page(s);
	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);
}
#endif

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t m, mpte;
	int error, i, ret, skz63;

	/* L1TF, reserve page @0 unconditionally */
	vm_page_blacklist_add(0, bootverbose);

	/* Detect bare-metal Skylake Server and Skylake-X. */
	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
		/*
		 * Skylake-X errata SKZ63.  Processor May Hang When
		 * Executing Code In an HLE Transaction Region between
		 * 40000000H and 403FFFFFH.
		 *
		 * Mark the pages in the range as preallocated.  It
		 * seems to be impossible to distinguish between
		 * Skylake Server and Skylake X.
		 */
		skz63 = 1;
		TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
		if (skz63 != 0) {
			if (bootverbose)
				printf("SKZ63: skipping 4M RAM starting "
				    "at physical 1G\n");
			for (i = 0; i < atop(0x400000); i++) {
				ret = vm_page_blacklist_add(0x40000000 +
				    ptoa(i), FALSE);
				if (!ret && bootverbose)
					printf("page at %#lx already used\n",
					    0x40000000 + ptoa(i));
			}
		}
	}

	/* IFU */
	pmap_allow_2m_x_ept_recalculate();

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	PMAP_LOCK(kernel_pmap);
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
		mpte->ref_count = 1;

		/*
		 * Collect the page table pages that were replaced by a 2MB
		 * page in create_pagetables().  They are zero filled.
		 */
		if ((i == 0 ||
		    kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
		    pmap_insert_pt_page(kernel_pmap, mpte, false))
			panic("pmap_init: pmap_insert_pt_page failed");
	}
	PMAP_UNLOCK(kernel_pmap);
	vm_wire_add(nkpt);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
		if ((amd_feature & AMDID_PAGE1GB) != 0) {
			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
			    ("pmap_init: can't assign to pagesizes[2]"));
			pagesizes[2] = NBPDP;
		}
	}

	/*
	 * Initialize pv chunk lists.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++) {
		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
		    MTX_DEF);
		TAILQ_INIT(&pv_chunks[i].pvc_list);
	}
	pmap_init_pv_table();

	pmap_initialized = 1;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		/* Make the direct map consistent */
		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
			    ppim->sz, ppim->mode);
		}
		if (!bootverbose)
			continue;
		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}

	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
	    (vmem_addr_t *)&qframe);
	if (error != 0)
		panic("qframe allocation failed");

	/* Size the large map; clamped to the reserved PML4 slot range. */
	lm_ents = 8;
	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
	if (lm_ents > LMEPML4I - LMSPML4I + 1)
		lm_ents = LMEPML4I - LMSPML4I + 1;
#ifdef KMSAN
	if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
		printf(
	    "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
		    lm_ents, KMSANORIGPML4I - LMSPML4I);
		lm_ents = KMSANORIGPML4I - LMSPML4I;
	}
#endif
	if (bootverbose)
		printf("pmap: large map %u PML4 slots (%lu GB)\n",
		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
	if (lm_ents != 0) {
		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
		if (large_vmem == NULL) {
			printf("pmap: cannot create large map\n");
			lm_ents = 0;
		}
		for (i = 0; i < lm_ents; i++) {
			m = pmap_large_map_getptp_unlocked();
			/* XXXKIB la57 */
			kernel_pml4[LMSPML4I + i] = X86_PG_V |
			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
			    VM_PAGE_TO_PHYS(m);
		}
	}
}

SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0,
    "Maximum number of PML4 entries for use by large map (tunable). "
    "Each entry corresponds to 512GB of address space.");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions,
    CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, "2MB page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, "2MB page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "1GB page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Swap the PG_PTE_PAT and PG_PDE_PAT bits in an entry, for converting
 * a PTE to a PDE or vice versa.  The two formats place the PAT bit in
 * different positions; EPT entries need no translation.
 */
static pt_entry_t
pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
{
	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* Verify that both PAT bits are not set at the same time */
		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
		    ("Invalid PAT bits in entry %#lx", entry));

		/* Swap the PAT bits if one of them is set */
		if ((entry & x86_pat_bits) != 0)
			entry ^= x86_pat_bits;
		break;
	case PT_EPT:
		/*
		 * Nothing to do - the memory attributes are represented
		 * the same way for regular pages and superpages.
		 */
		break;
	default:
		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
	}

	return (entry);
}

/*
 * Return whether the given memory attribute maps to a programmed PAT
 * index (see pmap_init_pat()).
 */
boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= 0 && mode < PAT_INDEX_SIZE &&
	    pat_index[(int)mode] >= 0);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (!pmap_is_valid_memattr(pmap, mode))
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}

/*
 * Return the mask of PTE/PDE bits that encode the memory attribute for
 * the given pmap type.
 */
static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

/*
 * Recover the 3-bit PAT index encoded in a PTE or PDE, folding the
 * duplicate entries (4 -> 0, 7 -> 3) programmed by pmap_init_pat().
 */
static int
pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
{
	int pat_flag, pat_idx;

	pat_idx = 0;
	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		if ((pte & pat_flag) != 0)
			pat_idx |= 0x4;
		if ((pte & PG_NC_PCD) != 0)
			pat_idx |= 0x2;
		if ((pte & PG_NC_PWT) != 0)
			pat_idx |= 0x1;
		break;
	case PT_EPT:
		if ((pte & EPT_PG_IGNORE_PAT) != 0)
			panic("EPT PTE %#lx has no PAT memory type", pte);
		pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
		break;
	}

	/* See pmap_init_pat(). */
	if (pat_idx == 4)
		pat_idx = 0;
	if (pat_idx == 7)
		pat_idx = 3;

	return (pat_idx);
}

/*
 * Return whether 2MB (PDE) superpage mappings are enabled for the pmap.
 */
bool
pmap_ps_enabled(pmap_t pmap)
{

	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

/*
 * Store a new PDE value; for guest (RVI/EPT) pmaps, bump the EPT
 * generation counter first so stale guest TLB entries get flushed.
 */
static void
pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
{

	switch (pmap->pm_type) {
	case PT_X86:
		break;
	case PT_RVI:
	case PT_EPT:
		/*
		 * XXX
		 * This is a little bogus since the generation number is
		 * supposed to be bumped up when a region of the address
		 * space is invalidated in the page tables.
		 *
		 * In this case the old PDE entry is valid but yet we want
		 * to make sure that any mappings using the old entry are
		 * invalidated in the TLB.
		 *
		 * The reason this works as expected is because we rendezvous
		 * "all" host cpus and force any vcpu context to exit as a
		 * side-effect.
		 */
		atomic_add_long(&pmap->pm_eptgen, 1);
		break;
	default:
		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
	}
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
	pt_entry_t PG_G;

	if (pmap_type_guest(pmap))
		return;

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

	PG_G = pmap_global_bit(pmap);

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping.
		 */
		pmap_invlpg(pmap, va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		invltlb_glob();
	}
}

/*
 * The amd64 pmap uses different approaches to TLB invalidation
 * depending on the kernel configuration, available hardware features,
 * and known hardware errata.  The kernel configuration option that
 * has the greatest operational impact on TLB invalidation is PTI,
 * which is enabled automatically on affected Intel CPUs.  The most
 * impactful hardware features are first PCID, and then INVPCID
 * instruction presence.  PCID usage is quite different for PTI
 * vs. non-PTI.
 *
 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate
 *   the Meltdown bug in some Intel CPUs.  Under PTI, each user address
 *   space is served by two page tables, user and kernel.  The user
 *   page table only maps user space and a kernel trampoline.  The
 *   kernel trampoline includes the entirety of the kernel text but
 *   only the kernel data that is needed to switch from user to kernel
 *   mode.  The kernel page table maps the user and kernel address
 *   spaces in their entirety.  It is identical to the per-process
 *   page table used in non-PTI mode.
 *
 *   User page tables are only used when the CPU is in user mode.
 *   Consequently, some TLB invalidations can be postponed until the
 *   switch from kernel to user mode.  In contrast, the user
 *   space part of the kernel page table is used for copyout(9), so
 *   TLB invalidations on this page table cannot be similarly postponed.
 *
 *   The existence of a user mode page table for the given pmap is
 *   indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in
 *   which case pm_ucr3 contains the %cr3 register value for the user
 *   mode page table's root.
 *
 * * The pm_active bitmask indicates which CPUs currently have the
 *   pmap active.  A CPU's bit is set on context switch to the pmap, and
 *   cleared on switching off this CPU.  For the kernel page table,
 *   the pm_active field is immutable and contains all CPUs.  The
 *   kernel page table is always logically active on every processor,
 *   but not necessarily in use by the hardware, e.g., in PTI mode.
 *
 *   When requesting invalidation of virtual addresses with
 *   pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to
 *   all CPUs recorded as active in pm_active.  Updates to and reads
 *   from pm_active are not synchronized, and so they may race with
 *   each other.  Shootdown handlers are prepared to handle the race.
 *
 * * PCID is an optional feature of the long mode x86 MMU where TLB
 *   entries are tagged with the 'Process ID' of the address space
 *   they belong to.  This feature provides a limited namespace for
 *   process identifiers, 12 bits, supporting 4095 simultaneous IDs
 *   total.
 *
 *   Allocation of a PCID to a pmap is done by an algorithm described
 *   in section 15.12, "Other TLB Consistency Algorithms", of
 *   Vahalia's book "Unix Internals".  A PCID cannot be allocated for
 *   the whole lifetime of a pmap in pmap_pinit() due to the limited
 *   namespace.  Instead, a per-CPU, per-pmap PCID is assigned when
 *   the CPU is about to start caching TLB entries from a pmap,
 *   i.e., on the context switch that activates the pmap on the CPU.
 *
 *   The PCID allocator maintains a per-CPU, per-pmap generation
 *   count, pm_gen, which is incremented each time a new PCID is
 *   allocated.  On TLB invalidation, the generation counters for the
 *   pmap are zeroed, which signals the context switch code that the
 *   previously allocated PCID is no longer valid.  Effectively,
 *   zeroing any of these counters triggers a TLB shootdown for the
 *   given CPU/address space, due to the allocation of a new PCID.
 *
 *   Zeroing can be performed remotely.  Consequently, if a pmap is
 *   inactive on a CPU, then a TLB shootdown for that pmap and CPU can
 *   be initiated by an ordinary memory access to reset the target
 *   CPU's generation count within the pmap.  The CPU initiating the
 *   TLB shootdown does not need to send an IPI to the target CPU.
 *
 * * PTI + PCID.  The available PCIDs are divided into two sets: PCIDs
 *   for complete (kernel) page tables, and PCIDs for user mode page
 *   tables.  A user PCID value is obtained from the kernel PCID value
 *   by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT).
 *
 *   User space page tables are activated on return to user mode, by
 *   loading pm_ucr3 into %cr3.  If the PCPU(ucr3_load_mask) requests
 *   clearing bit 63 of the loaded ucr3, this effectively causes
 *   complete invalidation of the user mode TLB entries for the
 *   current pmap.  In which case, local invalidations of individual
 *   pages in the user page table are skipped.
 *
 * * Local invalidation, all modes.  If the requested invalidation is
 *   for a specific address or the total invalidation of a currently
 *   active pmap, then the TLB is flushed using INVLPG for a kernel
 *   page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a
 *   user space page table(s).
 *
 *   If the INVPCID instruction is available, it is used to flush user
 *   entries from the kernel page table.
 *
 *   When PCID is enabled, the INVLPG instruction invalidates all TLB
 *   entries for the given page that either match the current PCID or
 *   are global.  Since TLB entries for the same page under different
 *   PCIDs are unaffected, kernel pages which reside in all address
 *   spaces could be problematic.  We avoid the problem by creating
 *   all kernel PTEs with the global flag (PG_G) set, when PTI is
 *   disabled.
 *
 * * mode: PTI disabled, PCID present.  The kernel reserves PCID 0 for its
 *   address space, all other 4095 PCIDs are used for user mode spaces
 *   as described above.  A context switch allocates a new PCID if
 *   the recorded PCID is zero or the recorded generation does not match
 *   the CPU's generation, effectively flushing the TLB for this address space.
 *   Total remote invalidation is performed by zeroing pm_gen for all CPUs.
 *	local user page: INVLPG
 *	local kernel page: INVLPG
 *	local user total: INVPCID(CTX)
 *	local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
 *	remote user page, inactive pmap: zero pm_gen
 *	remote user page, active pmap: zero pm_gen + IPI:INVLPG
 *	(Both actions are required to handle the aforementioned pm_active races.)
 *	remote kernel page: IPI:INVLPG
 *	remote user total, inactive pmap: zero pm_gen
 *	remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or
 *          reload %cr3)
 *	(See note above about pm_active races.)
 *	remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
 *
 * PTI enabled, PCID present.
 *	local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3)
 *          for upt
 *	local kernel page: INVLPG
 *	local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE
 *          on loading UCR3 into %cr3 for upt
 *	local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
 *	remote user page, inactive pmap: zero pm_gen
 *	remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt,
 *          INVPCID(ADDR) for upt)
 *	remote kernel page: IPI:INVLPG
 *	remote user total, inactive pmap: zero pm_gen
 *	remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt,
 *          clear PCID_SAVE on loading UCR3 into %cr3 for upt)
 *	remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
 *
 * No PCID.
 *	local user page: INVLPG
 *	local kernel page: INVLPG
 *	local user total: reload %cr3
 *	local kernel total: invltlb_glob()
 *	remote user page, inactive pmap: -
 *	remote user page, active pmap: IPI:INVLPG
 *	remote kernel page: IPI:INVLPG
 *	remote user total, inactive pmap: -
 *	remote user total, active pmap: IPI:(reload %cr3)
 *	remote kernel total: IPI:invltlb_glob()
 *  Since on return to user mode, the reload of %cr3 with ucr3 causes
 *  TLB invalidation, no specific action is required for user page table.
 *
 * EPT.  EPT pmaps do not map KVA, all mappings are userspace.
 * XXX TODO
 */

#ifdef SMP
/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
2980 */ 2981 static __inline void 2982 pmap_invalidate_ept(pmap_t pmap) 2983 { 2984 smr_seq_t goal; 2985 int ipinum; 2986 2987 sched_pin(); 2988 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 2989 ("pmap_invalidate_ept: absurd pm_active")); 2990 2991 /* 2992 * The TLB mappings associated with a vcpu context are not 2993 * flushed each time a different vcpu is chosen to execute. 2994 * 2995 * This is in contrast with a process's vtop mappings that 2996 * are flushed from the TLB on each context switch. 2997 * 2998 * Therefore we need to do more than just a TLB shootdown on 2999 * the active cpus in 'pmap->pm_active'. To do this we keep 3000 * track of the number of invalidations performed on this pmap. 3001 * 3002 * Each vcpu keeps a cache of this counter and compares it 3003 * just before a vmresume. If the counter is out-of-date an 3004 * invept will be done to flush stale mappings from the TLB. 3005 * 3006 * To ensure that all vCPU threads have observed the new counter 3007 * value before returning, we use SMR. Ordering is important here: 3008 * the VMM enters an SMR read section before loading the counter 3009 * and after updating the pm_active bit set. Thus, pm_active is 3010 * a superset of active readers, and any reader that has observed 3011 * the goal has observed the new counter value. 3012 */ 3013 atomic_add_long(&pmap->pm_eptgen, 1); 3014 3015 goal = smr_advance(pmap->pm_eptsmr); 3016 3017 /* 3018 * Force the vcpu to exit and trap back into the hypervisor. 3019 */ 3020 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3021 ipi_selected(pmap->pm_active, ipinum); 3022 sched_unpin(); 3023 3024 /* 3025 * Ensure that all active vCPUs will observe the new generation counter 3026 * value before executing any more guest instructions. 
3027 */ 3028 smr_wait(pmap->pm_eptsmr, goal); 3029 } 3030 3031 static inline void 3032 pmap_invalidate_preipi_pcid(pmap_t pmap) 3033 { 3034 u_int cpuid, i; 3035 3036 sched_pin(); 3037 3038 cpuid = PCPU_GET(cpuid); 3039 if (pmap != PCPU_GET(curpmap)) 3040 cpuid = 0xffffffff; /* An impossible value */ 3041 3042 CPU_FOREACH(i) { 3043 if (cpuid != i) 3044 pmap->pm_pcids[i].pm_gen = 0; 3045 } 3046 3047 /* 3048 * The fence is between stores to pm_gen and the read of the 3049 * pm_active mask. We need to ensure that it is impossible 3050 * for us to miss the bit update in pm_active and 3051 * simultaneously observe a non-zero pm_gen in 3052 * pmap_activate_sw(), otherwise TLB update is missed. 3053 * Without the fence, IA32 allows such an outcome. Note that 3054 * pm_active is updated by a locked operation, which provides 3055 * the reciprocal fence. 3056 */ 3057 atomic_thread_fence_seq_cst(); 3058 } 3059 3060 static void 3061 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3062 { 3063 sched_pin(); 3064 } 3065 3066 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3067 { 3068 return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid : 3069 pmap_invalidate_preipi_nopcid); 3070 } 3071 3072 static inline void 3073 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3074 const bool invpcid_works1) 3075 { 3076 struct invpcid_descr d; 3077 uint64_t kcr3, ucr3; 3078 uint32_t pcid; 3079 u_int cpuid; 3080 3081 /* 3082 * Because pm_pcid is recalculated on a context switch, we 3083 * must ensure there is no preemption, not just pinning. 3084 * Otherwise, we might use a stale value below. 3085 */ 3086 CRITICAL_ASSERT(curthread); 3087 3088 /* 3089 * No need to do anything with user page tables invalidation 3090 * if there is no user page table, or invalidation is deferred 3091 * until the return to userspace. ucr3_load_mask is stable 3092 * because we have preemption disabled. 
	 */
	if (pmap->pm_ucr3 == PMAP_NO_CR3 ||
	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
		return;

	cpuid = PCPU_GET(cpuid);

	pcid = pmap->pm_pcids[cpuid].pm_pcid;
	if (invpcid_works1) {
		d.pcid = pcid | PMAP_PCID_USER_PT;
		d.pad = 0;
		d.addr = va;
		invpcid(&d, INVPCID_ADDR);
	} else {
		/* No INVPCID: briefly switch to ucr3 to execute INVLPG there. */
		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
		pmap_pti_pcid_invlpg(ucr3, kcr3, va);
	}
}

static void
pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
{
	pmap_invalidate_page_pcid_cb(pmap, va, true);
}

static void
pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
{
	pmap_invalidate_page_pcid_cb(pmap, va, false);
}

/* Without PCID there is no separate user page table to invalidate. */
static void
pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
{
}

/* Boot-time selection of the user-PT invalidation callback. */
DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t))
{
	if (pmap_pcid_enabled)
		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
		    pmap_invalidate_page_pcid_noinvpcid_cb);
	return (pmap_invalidate_page_nopcid_cb);
}

/*
 * Per-CPU portion of a page invalidation, run on each CPU targeted by
 * the smp_masked_invlpg() shootdown.
 */
static void
pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
    vm_offset_t addr2 __unused)
{
	if (pmap == kernel_pmap) {
		pmap_invlpg(kernel_pmap, va);
	} else if (pmap == PCPU_GET(curpmap)) {
		invlpg(va);
		pmap_invalidate_page_cb(pmap, va);
	}
}

/*
 * Invalidate the translation for a single page in the given pmap on
 * all CPUs where it may be cached.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));

	pmap_invalidate_preipi(pmap);
	smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb);
}

/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)

/*
 * Invalidate the user page table (PTI) translations for [sva, eva) on
 * the current CPU; the range-equivalent of
 * pmap_invalidate_page_pcid_cb() above.
 */
static void
pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    const bool invpcid_works1)
{
	struct invpcid_descr d;
	uint64_t kcr3, ucr3;
	uint32_t pcid;
	u_int cpuid;

	/* See pmap_invalidate_page_pcid_cb() for why pinning is not enough. */
	CRITICAL_ASSERT(curthread);

	if (pmap != PCPU_GET(curpmap) ||
	    pmap->pm_ucr3 == PMAP_NO_CR3 ||
	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
		return;

	cpuid = PCPU_GET(cpuid);

	pcid = pmap->pm_pcids[cpuid].pm_pcid;
	if (invpcid_works1) {
		d.pcid = pcid | PMAP_PCID_USER_PT;
		d.pad = 0;
		for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
			invpcid(&d, INVPCID_ADDR);
	} else {
		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
		pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
	}
}

static void
pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
    vm_offset_t eva)
{
	pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
}

static void
pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
    vm_offset_t eva)
{
	pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
}

/* Without PCID there is no separate user page table to invalidate. */
static void
pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
    vm_offset_t eva __unused)
{
}

/* Boot-time selection of the user-PT range-invalidation callback. */
DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
    vm_offset_t))
{
	if (pmap_pcid_enabled)
		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
		    pmap_invalidate_range_pcid_noinvpcid_cb);
	return (pmap_invalidate_range_nopcid_cb);
}

/*
 * Per-CPU portion of a range invalidation, run on each CPU targeted
 * by the smp_masked_invlpg_range() shootdown.
 */
static void
pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap) {
		if (PCPU_GET(pcid_invlpg_workaround)) {
			/* CPU erratum: INVLPG unreliable; flush everything. */
			struct invpcid_descr d = { 0 };

			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		}
	} else if (pmap == PCPU_GET(curpmap)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		pmap_invalidate_range_cb(pmap, sva, eva);
	}
}

/*
 * Invalidate the translations for the range [sva, eva) in the given
 * pmap on all CPUs where they may be cached.  Ranges large enough to
 * exceed typical TLB capacity are converted to a full flush.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
		pmap_invalidate_all(pmap);
		return;
	}

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));

	pmap_invalidate_preipi(pmap);
	smp_masked_invlpg_range(sva, eva, pmap,
	    pmap_invalidate_range_curcpu_cb);
}

/*
 * Flush the entire TLB footprint of the given pmap on the current CPU
 * when PCID is enabled.  invpcid_works1 is specialized away by the
 * wrappers below.
 */
static inline void
pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
{
	struct invpcid_descr d;
	uint64_t kcr3;
	uint32_t pcid;
	u_int cpuid;

	if (pmap == kernel_pmap) {
		if (invpcid_works1) {
			bzero(&d, sizeof(d));
			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			invltlb_glob();
		}
	} else if (pmap == PCPU_GET(curpmap)) {
		CRITICAL_ASSERT(curthread);
		cpuid = PCPU_GET(cpuid);

		pcid = pmap->pm_pcids[cpuid].pm_pcid;
		if (invpcid_works1) {
			d.pcid = pcid;
			d.pad = 0;
			d.addr = 0;
			invpcid(&d, INVPCID_CTX);
		} else {
			/* Reloading %cr3 without CR3_PCID_SAVE flushes the PCID. */
			kcr3 = pmap->pm_cr3 | pcid;
			load_cr3(kcr3);
		}
		/* Defer user page table flush to the return to usermode. */
		if (pmap->pm_ucr3 != PMAP_NO_CR3)
			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
	}
}

static void
pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
{
	pmap_invalidate_all_pcid_cb(pmap, true);
}

static void
pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
{
	pmap_invalidate_all_pcid_cb(pmap, false);
}

/* Full-flush callback when PCID is not used. */
static void
pmap_invalidate_all_nopcid_cb(pmap_t pmap)
{
	if (pmap == kernel_pmap)
		invltlb_glob();
	else if (pmap == PCPU_GET(curpmap))
		invltlb();
}

/* Boot-time selection of the full-flush callback. */
DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t))
{
	if (pmap_pcid_enabled)
		return (invpcid_works ?
	    pmap_invalidate_all_pcid_invpcid_cb :
	    pmap_invalidate_all_pcid_noinvpcid_cb);
	return (pmap_invalidate_all_nopcid_cb);
}

/* Per-CPU portion of a full-TLB-flush shootdown. */
static void
pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
    vm_offset_t addr2 __unused)
{
	pmap_invalidate_all_cb(pmap);
}

/*
 * Flush the entire TLB footprint of the given pmap on all CPUs where
 * it may be cached.
 */
void
pmap_invalidate_all(pmap_t pmap)
{
	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));

	pmap_invalidate_preipi(pmap);
	smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb);
}

static void
pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
    vm_offset_t addr2 __unused)
{
	wbinvd();
}

/* Write back and invalidate the data caches on every CPU. */
void
pmap_invalidate_cache(void)
{
	sched_pin();
	smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
}

/* Arguments for the pmap_update_pde() rendezvous, below. */
struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	pmap_t pmap;
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

/* Rendezvous action: exactly one CPU stores the new PDE value. */
static void
pmap_update_pde_action(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}

/* Rendezvous teardown: every targeted CPU invalidates its own TLB. */
static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap || pmap_type_guest(pmap))
		active = all_cpus;
	else {
		active = pmap->pm_active;
	}
	if (CPU_OVERLAP(&active, &other_cpus)) {
		/*
		 * Other CPUs may have this pmap active: the store and the
		 * invalidations must happen atomically with respect to all
		 * of them, hence the rendezvous.
		 */
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pmap = pmap;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendezvous_barrier, pmap_update_pde_action,
		    pmap_update_pde_teardown, &act);
	} else {
		/* Only this CPU can observe the mapping; no rendezvous. */
		pmap_update_pde_store(pmap, pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(pmap, va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, invalidation functions.
3441 */ 3442 void 3443 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3444 { 3445 struct invpcid_descr d; 3446 uint64_t kcr3, ucr3; 3447 uint32_t pcid; 3448 3449 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3450 pmap->pm_eptgen++; 3451 return; 3452 } 3453 KASSERT(pmap->pm_type == PT_X86, 3454 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3455 3456 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3457 invlpg(va); 3458 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3459 pmap->pm_ucr3 != PMAP_NO_CR3) { 3460 critical_enter(); 3461 pcid = pmap->pm_pcids[0].pm_pcid; 3462 if (invpcid_works) { 3463 d.pcid = pcid | PMAP_PCID_USER_PT; 3464 d.pad = 0; 3465 d.addr = va; 3466 invpcid(&d, INVPCID_ADDR); 3467 } else { 3468 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3469 ucr3 = pmap->pm_ucr3 | pcid | 3470 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3471 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3472 } 3473 critical_exit(); 3474 } 3475 } else if (pmap_pcid_enabled) 3476 pmap->pm_pcids[0].pm_gen = 0; 3477 } 3478 3479 void 3480 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3481 { 3482 struct invpcid_descr d; 3483 vm_offset_t addr; 3484 uint64_t kcr3, ucr3; 3485 3486 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3487 pmap->pm_eptgen++; 3488 return; 3489 } 3490 KASSERT(pmap->pm_type == PT_X86, 3491 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3492 3493 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3494 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3495 invlpg(addr); 3496 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3497 pmap->pm_ucr3 != PMAP_NO_CR3) { 3498 critical_enter(); 3499 if (invpcid_works) { 3500 d.pcid = pmap->pm_pcids[0].pm_pcid | 3501 PMAP_PCID_USER_PT; 3502 d.pad = 0; 3503 d.addr = sva; 3504 for (; d.addr < eva; d.addr += PAGE_SIZE) 3505 invpcid(&d, INVPCID_ADDR); 3506 } else { 3507 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 
3508 pm_pcid | CR3_PCID_SAVE; 3509 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 3510 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3511 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3512 } 3513 critical_exit(); 3514 } 3515 } else if (pmap_pcid_enabled) { 3516 pmap->pm_pcids[0].pm_gen = 0; 3517 } 3518 } 3519 3520 void 3521 pmap_invalidate_all(pmap_t pmap) 3522 { 3523 struct invpcid_descr d; 3524 uint64_t kcr3, ucr3; 3525 3526 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3527 pmap->pm_eptgen++; 3528 return; 3529 } 3530 KASSERT(pmap->pm_type == PT_X86, 3531 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3532 3533 if (pmap == kernel_pmap) { 3534 if (pmap_pcid_enabled && invpcid_works) { 3535 bzero(&d, sizeof(d)); 3536 invpcid(&d, INVPCID_CTXGLOB); 3537 } else { 3538 invltlb_glob(); 3539 } 3540 } else if (pmap == PCPU_GET(curpmap)) { 3541 if (pmap_pcid_enabled) { 3542 critical_enter(); 3543 if (invpcid_works) { 3544 d.pcid = pmap->pm_pcids[0].pm_pcid; 3545 d.pad = 0; 3546 d.addr = 0; 3547 invpcid(&d, INVPCID_CTX); 3548 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3549 d.pcid |= PMAP_PCID_USER_PT; 3550 invpcid(&d, INVPCID_CTX); 3551 } 3552 } else { 3553 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 3554 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3555 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 3556 0].pm_pcid | PMAP_PCID_USER_PT; 3557 pmap_pti_pcid_invalidate(ucr3, kcr3); 3558 } else 3559 load_cr3(kcr3); 3560 } 3561 critical_exit(); 3562 } else { 3563 invltlb(); 3564 } 3565 } else if (pmap_pcid_enabled) { 3566 pmap->pm_pcids[0].pm_gen = 0; 3567 } 3568 } 3569 3570 PMAP_INLINE void 3571 pmap_invalidate_cache(void) 3572 { 3573 3574 wbinvd(); 3575 } 3576 3577 static void 3578 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3579 { 3580 3581 pmap_update_pde_store(pmap, pde, newpde); 3582 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3583 pmap_update_pde_invalidate(pmap, va, newpde); 3584 else 3585 pmap->pm_pcids[0].pm_gen = 0; 3586 } 
#endif /* !SMP */

/*
 * Invalidate the TLB entries for a 2MB page mapping, taking promotion
 * history into account.
 */
static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
{

	/*
	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
	 * by a promotion that did not invalidate the 512 4KB page mappings
	 * that might exist in the TLB.  Consequently, at this point, the TLB
	 * may hold both 4KB and 2MB page mappings for the address range [va,
	 * va + NBPDR).  Therefore, the entire range must be invalidated here.
	 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
	 * 4KB page mappings for the address range [va, va + NBPDR), and so a
	 * single INVLPG suffices to invalidate the 2MB page mapping from the
	 * TLB.
	 */
	if ((pde & PG_PROMOTED) != 0)
		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
	else
		pmap_invalidate_page(pmap, va);
}

/*
 * Boot-time selection of the cache-range invalidation implementation:
 * a no-op on self-snooping CPUs, CLFLUSH-based where available, and a
 * full cache flush otherwise.
 */
DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
    (vm_offset_t sva, vm_offset_t eva))
{

	if ((cpu_feature & CPUID_SS) != 0)
		return (pmap_invalidate_cache_range_selfsnoop);
	if ((cpu_feature & CPUID_CLFSH) != 0)
		return (pmap_force_invalidate_cache_range);
	return (pmap_invalidate_cache_range_all);
}

#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

static void
pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
{

	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));
}

/* Self-snooping CPUs keep caches coherent; only the alignment check runs. */
static void
pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
{

	pmap_invalidate_cache_range_check_align(sva, eva);
}

/*
 * Flush the cache lines covering [sva, eva) with CLFLUSHOPT or
 * CLFLUSH, regardless of the CPU's self-snoop capability.
 */
void
pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);

	/*
	 * XXX: Some CPUs fault, hang, or trash the local APIC
	 * registers if we use CLFLUSH on the local APIC range.  The
	 * local APIC is always uncached, so we don't need to flush
	 * for that range anyway.
	 */
	if (pmap_kextract(sva) == lapic_paddr)
		return;

	if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
		/*
		 * Do per-cache line flush.  Use a locked
		 * instruction to insure that previous stores are
		 * included in the write-back.  The processor
		 * propagates flush to other processors in the cache
		 * coherence domain.
		 */
		atomic_thread_fence_seq_cst();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflushopt(sva);
		atomic_thread_fence_seq_cst();
	} else {
		/*
		 * Writes are ordered by CLFLUSH on Intel CPUs.
		 */
		if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
	}
}

/* Fallback when CLFLUSH is unavailable: flush the whole cache. */
static void
pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
{

	pmap_invalidate_cache_range_check_align(sva, eva);
	pmap_invalidate_cache();
}

/*
 * Remove the specified set of pages from the data and instruction caches.
 *
 * In contrast to pmap_invalidate_cache_range(), this function does not
 * rely on the CPU's self-snoop feature, because it is intended for use
 * when moving pages into a different cache domain.
 */
void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	vm_offset_t daddr, eva;
	int i;
	bool useclflushopt;

	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
	/*
	 * Above the threshold (or without CLFLUSH at all) a full cache
	 * flush is cheaper than per-line flushes.
	 */
	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
		pmap_invalidate_cache();
	else {
		if (useclflushopt)
			atomic_thread_fence_seq_cst();
		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
		/* Flush each page line-by-line through its DMAP alias. */
		for (i = 0; i < count; i++) {
			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
			eva = daddr + PAGE_SIZE;
			for (; daddr < eva; daddr += cpu_clflush_line_size) {
				if (useclflushopt)
					clflushopt(daddr);
				else
					clflush(daddr);
			}
		}
		if (useclflushopt)
			atomic_thread_fence_seq_cst();
		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
	}
}

/*
 * Write back (without necessarily invalidating) the cache lines
 * covering [sva, eva), preferring CLWB where available.
 */
void
pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	pmap_invalidate_cache_range_check_align(sva, eva);

	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
		pmap_force_invalidate_cache_range(sva, eva);
		return;
	}

	/* See comment in pmap_force_invalidate_cache_range(). */
	if (pmap_kextract(sva) == lapic_paddr)
		return;

	atomic_thread_fence_seq_cst();
	for (; sva < eva; sva += cpu_clflush_line_size)
		clwb(sva);
	atomic_thread_fence_seq_cst();
}

/*
 * Write back the cache lines covering the physical range [spa, epa).
 * Addresses below dmaplimit are flushed through the direct map;
 * anything above is temporarily mapped one page at a time.
 */
void
pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
{
	pt_entry_t *pte;
	vm_offset_t vaddr;
	int error __diagused;
	int pte_bits;

	KASSERT((spa & PAGE_MASK) == 0,
	    ("pmap_flush_cache_phys_range: spa not page-aligned"));
	KASSERT((epa & PAGE_MASK) == 0,
	    ("pmap_flush_cache_phys_range: epa not page-aligned"));

	if (spa < dmaplimit) {
		pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
		    dmaplimit, epa)));
		if (dmaplimit >= epa)
			return;
		spa = dmaplimit;
	}

	pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
	    X86_PG_V;
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
	    &vaddr);
	KASSERT(error == 0, ("vmem_alloc failed: %d", error));
	pte = vtopte(vaddr);
	for (; spa < epa; spa += PAGE_SIZE) {
		sched_pin();
		pte_store(pte, spa | pte_bits);
		pmap_invlpg(kernel_pmap, vaddr);
		/* XXXKIB atomic inside flush_cache_range are excessive */
		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
		sched_unpin();
	}
	vmem_free(kernel_arena, vaddr, PAGE_SIZE);
}

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_paddr_t pa;

	pa = 0;
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK(pmap);
	/* Walk down, stopping early at 1GB and 2MB superpage mappings. */
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		if ((*pdpe & PG_PS) != 0)
			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
		else {
			pde = pmap_pdpe_to_pde(pdpe, va);
			if ((*pde & PG_V) != 0) {
				if ((*pde & PG_PS) != 0) {
					pa = (*pde & PG_PS_FRAME) |
					    (va & PDRMASK);
				} else {
					pte = pmap_pde_to_pte(pde, va);
					pa = (*pte & PG_FRAME) |
					    (va & PAGE_MASK);
				}
			}
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pdp_entry_t pdpe, *pdpep;
	pd_entry_t pde, *pdep;
	pt_entry_t pte, PG_RW, PG_V;
	vm_page_t m;

	m = NULL;
	PG_RW = pmap_rw_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK(pmap);

	pdpep = pmap_pdpe(pmap, va);
	if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0)
		goto out;
	if ((pdpe & PG_PS) != 0) {
		/* 1GB mapping: check write permission and hold. */
		if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
			goto out;
		m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK));
		goto check_page;
	}

	pdep = pmap_pdpe_to_pde(pdpep, va);
	if (pdep == NULL || ((pde = *pdep) & PG_V) == 0)
		goto out;
	if ((pde & PG_PS) != 0) {
		/* 2MB mapping: check write permission and hold. */
		if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
			goto out;
		m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK));
		goto check_page;
	}

	pte = *pmap_pde_to_pte(pdep, va);
	if ((pte & PG_V) == 0 ||
	    ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0))
		goto out;
	m = PHYS_TO_VM_PAGE(pte & PG_FRAME);

check_page:
	if (m != NULL && !vm_page_wire_mapped(m))
		m = NULL;
out:
	PMAP_UNLOCK(pmap);
	return (m);
}

/*
 * Extract the physical address for the given kernel virtual address,
 * handling the direct map and the large map specially.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pde;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
		pa = pmap_large_map_kextract(va);
	} else {
		pde = *vtopde(va);
		if (pde & PG_PS) {
			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pde_to_pte(&pde, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M |
	    X86_PG_RW | X86_PG_V);
}

/* As pmap_kenter(), but with caller-specified cache mode. */
static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;
	int cache_bits;

	pte = vtopte(va);
	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
	pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M |
	    X86_PG_RW | X86_PG_V | cache_bits);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	/* amd64 has a full direct map, so no new mapping is needed. */
	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *endpte, oldpte, pa, *pte;
	vm_page_t m;
	int cache_bits;

	oldpte = 0;
	pte = vtopte(sva);
	endpte = pte + count;
	while (pte < endpte) {
		m = *ma++;
		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
		/* Skip the store when the PTE already maps this page. */
		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
			oldpte |= *pte;
			pte_store(pte, pa | pg_g | pg_nx | X86_PG_A |
			    X86_PG_M | X86_PG_RW | X86_PG_V);
		}
		pte++;
	}
	/* Only shoot down if some previously-valid mapping was replaced. */
	if (__predict_false((oldpte & X86_PG_V) != 0))
		pmap_invalidate_range(kernel_pmap, sva, sva + count *
		    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 *
 * If "promoted" is false, then the page table page "mpte" must be zero filled.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
	return (vm_radix_insert(&pmap->pm_root, mpte));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->ref_count;
	if (m->ref_count == 0) {
		_pmap_unwire_ptp(pmap, va, m, free);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Unmap a page table page whose reference count has dropped to zero,
 * recursively unholding the page table page that maps it, and queue it
 * for release once the TLB shootdown completes.
 */
static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	pml5_entry_t *pml5;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	vm_page_t pdpg, pdppg, pml4pg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * unmap the page table page; the pindex range identifies which
	 * level of the page table hierarchy this page belongs to.
	 */
	if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
		/* PML4 page */
		MPASS(pmap_is_la57(pmap));
		pml5 = pmap_pml5e(pmap, va);
		*pml5 = 0;
		/* With PTI, clear the entry in the user copy as well. */
		if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
			pml5 = pmap_pml5e_u(pmap, va);
			*pml5 = 0;
		}
	} else if (m->pindex >= NUPDE + NUPDPE) {
		/* PDP page */
		pml4 = pmap_pml4e(pmap, va);
		*pml4 = 0;
		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
		    va <= VM_MAXUSER_ADDRESS) {
			pml4 = pmap_pml4e_u(pmap, va);
			*pml4 = 0;
		}
	} else if (m->pindex >= NUPDE) {
		/* PD page */
		pdp = pmap_pdpe(pmap, va);
		*pdp = 0;
	} else {
		/* PTE page */
		pd = pmap_pde(pmap, va);
		*pd = 0;
	}
	if (m->pindex < NUPDE) {
		/* We just released a PT, unhold the matching PD */
		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
		pmap_unwire_ptp(pmap, va, pdpg, free);
	} else if (m->pindex < NUPDE + NUPDPE) {
		/* We just released a PD, unhold the matching PDP */
		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
		pmap_unwire_ptp(pmap, va, pdppg, free);
	} else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
		/* We just released a PDP, unhold the matching PML4 */
		pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
		pmap_unwire_ptp(pmap, va, pml4pg, free);
	}

	pmap_pt_page_count_adj(pmap, -1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the reference count.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	/* Kernel page table pages are never freed here. */
	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return (pmap_unwire_ptp(pmap, va, mpte, free));
}

/*
 * Release a page table page reference after a failed attempt to create a
 * mapping.
 */
static void
pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
{
	struct spglist free;

	SLIST_INIT(&free);
	if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
		/*
		 * Although "va" was never mapped, paging-structure caches
		 * could nonetheless have entries that refer to the freed
		 * page table pages.  Invalidate those entries.
4190 */ 4191 pmap_invalidate_page(pmap, va); 4192 vm_page_free_pages_toq(&free, true); 4193 } 4194 } 4195 4196 void 4197 pmap_pinit0(pmap_t pmap) 4198 { 4199 struct proc *p; 4200 struct thread *td; 4201 int i; 4202 4203 PMAP_LOCK_INIT(pmap); 4204 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4205 pmap->pm_pmltopu = NULL; 4206 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4207 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4208 pmap->pm_ucr3 = PMAP_NO_CR3; 4209 vm_radix_init(&pmap->pm_root); 4210 CPU_ZERO(&pmap->pm_active); 4211 TAILQ_INIT(&pmap->pm_pvchunk); 4212 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4213 pmap->pm_flags = pmap_flags; 4214 CPU_FOREACH(i) { 4215 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 4216 pmap->pm_pcids[i].pm_gen = 1; 4217 } 4218 pmap_activate_boot(pmap); 4219 td = curthread; 4220 if (pti) { 4221 p = td->td_proc; 4222 PROC_LOCK(p); 4223 p->p_md.md_flags |= P_MD_KPTI; 4224 PROC_UNLOCK(p); 4225 } 4226 pmap_thread_init_invl_gen(td); 4227 4228 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4229 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4230 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4231 UMA_ALIGN_PTR, 0); 4232 } 4233 } 4234 4235 void 4236 pmap_pinit_pml4(vm_page_t pml4pg) 4237 { 4238 pml4_entry_t *pm_pml4; 4239 int i; 4240 4241 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4242 4243 /* Wire in kernel global address entries. 
*/ 4244 for (i = 0; i < NKPML4E; i++) { 4245 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4246 X86_PG_V; 4247 } 4248 #ifdef KASAN 4249 for (i = 0; i < NKASANPML4E; i++) { 4250 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4251 X86_PG_V | pg_nx; 4252 } 4253 #endif 4254 #ifdef KMSAN 4255 for (i = 0; i < NKMSANSHADPML4E; i++) { 4256 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4257 X86_PG_RW | X86_PG_V | pg_nx; 4258 } 4259 for (i = 0; i < NKMSANORIGPML4E; i++) { 4260 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4261 X86_PG_RW | X86_PG_V | pg_nx; 4262 } 4263 #endif 4264 for (i = 0; i < ndmpdpphys; i++) { 4265 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4266 X86_PG_V; 4267 } 4268 4269 /* install self-referential address mapping entry(s) */ 4270 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4271 X86_PG_A | X86_PG_M; 4272 4273 /* install large map entries if configured */ 4274 for (i = 0; i < lm_ents; i++) 4275 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4276 } 4277 4278 void 4279 pmap_pinit_pml5(vm_page_t pml5pg) 4280 { 4281 pml5_entry_t *pm_pml5; 4282 4283 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4284 4285 /* 4286 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4287 * entering all existing kernel mappings into level 5 table. 4288 */ 4289 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4290 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4291 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4292 4293 /* 4294 * Install self-referential address mapping entry. 
4295 */ 4296 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4297 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4298 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4299 } 4300 4301 static void 4302 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4303 { 4304 pml4_entry_t *pm_pml4u; 4305 int i; 4306 4307 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4308 for (i = 0; i < NPML4EPG; i++) 4309 pm_pml4u[i] = pti_pml4[i]; 4310 } 4311 4312 static void 4313 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4314 { 4315 pml5_entry_t *pm_pml5u; 4316 4317 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4318 pagezero(pm_pml5u); 4319 4320 /* 4321 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4322 * table, entering all kernel mappings needed for usermode 4323 * into level 5 table. 4324 */ 4325 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4326 pmap_kextract((vm_offset_t)pti_pml4) | 4327 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4328 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4329 } 4330 4331 /* Allocate a page table page and do related bookkeeping */ 4332 static vm_page_t 4333 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4334 { 4335 vm_page_t m; 4336 4337 m = vm_page_alloc_noobj(flags); 4338 if (__predict_false(m == NULL)) 4339 return (NULL); 4340 m->pindex = pindex; 4341 pmap_pt_page_count_adj(pmap, 1); 4342 return (m); 4343 } 4344 4345 static void 4346 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4347 { 4348 /* 4349 * This function assumes the page will need to be unwired, 4350 * even though the counterpart allocation in pmap_alloc_pt_page() 4351 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4352 * of pmap_free_pt_page() require unwiring. The case in which 4353 * a PT page doesn't require unwiring because its ref_count has 4354 * naturally reached 0 is handled through _pmap_unwire_ptp(). 
	 */
	vm_page_unwire_noq(m);
	if (zerofilled)
		vm_page_free_zero(m);
	else
		vm_page_free(m);

	pmap_pt_page_count_adj(pmap, -1);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
	vm_page_t pmltop_pg, pmltop_pgu;
	vm_paddr_t pmltop_phys;
	int i;

	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	/*
	 * Allocate the page directory page.  Pass NULL instead of a
	 * pointer to the pmap here to avoid calling
	 * pmap_resident_count_adj() through pmap_pt_page_count_adj(),
	 * since that requires pmap lock.  Instead do the accounting
	 * manually.
	 *
	 * Note that final call to pmap_remove() optimization that
	 * checks for zero resident_count is basically disabled by
	 * accounting for top-level page.  But the optimization was
	 * not effective since we started using non-managed mapping of
	 * the shared page.
	 */
	pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO |
	    VM_ALLOC_WAITOK);
	pmap_pt_page_count_pinit(pmap, 1);

	pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
	pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);

	CPU_FOREACH(i) {
		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
		pmap->pm_pcids[i].pm_gen = 0;
	}
	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
	pmap->pm_ucr3 = PMAP_NO_CR3;
	pmap->pm_pmltopu = NULL;

	pmap->pm_type = pm_type;

	/*
	 * Do not install the host kernel mappings in the nested page
	 * tables.  These mappings are meaningless in the guest physical
	 * address space.
	 * Install minimal kernel mappings in PTI case.
	 */
	switch (pm_type) {
	case PT_X86:
		pmap->pm_cr3 = pmltop_phys;
		if (pmap_is_la57(pmap))
			pmap_pinit_pml5(pmltop_pg);
		else
			pmap_pinit_pml4(pmltop_pg);
		if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
			/*
			 * As with pmltop_pg, pass NULL instead of a
			 * pointer to the pmap to ensure that the PTI
			 * page is counted explicitly.
			 */
			pmltop_pgu = pmap_alloc_pt_page(NULL, 0,
			    VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
			pmap_pt_page_count_pinit(pmap, 1);
			pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
			    VM_PAGE_TO_PHYS(pmltop_pgu));
			if (pmap_is_la57(pmap))
				pmap_pinit_pml5_pti(pmltop_pgu);
			else
				pmap_pinit_pml4_pti(pmltop_pgu);
			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
		}
		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
			    pkru_free_range, pmap, M_NOWAIT);
		}
		break;
	case PT_EPT:
	case PT_RVI:
		pmap->pm_eptsmr = smr_create("pmap", 0, 0);
		break;
	}

	vm_radix_init(&pmap->pm_root);
	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	pmap->pm_flags = flags;
	pmap->pm_eptgen = 0;

	return (1);
}

int
pmap_pinit(pmap_t pmap)
{

	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
}

/*
 * If the page table page backing "*pte" no longer holds any references
 * (ref_count == 0), unmap it and release it, invalidating any
 * paging-structure cache entries that refer to it.
 */
static void
pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte)
{
	vm_page_t mpg;
	struct spglist free;

	mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
	if (mpg->ref_count != 0)
		return;
	SLIST_INIT(&free);
	_pmap_unwire_ptp(pmap, va, mpg, &free);
	pmap_invalidate_page(pmap, va);
	vm_page_free_pages_toq(&free, true);
}

/*
 * Return a pointer to the PML4 entry for "va".  In LA48 mode the entry
 * resides directly in the pmap's top-level page; in LA57 mode the
 * containing PML4 page is allocated on demand.  When the returned PML4
 * entry is not yet valid, the PML4 page's ref_count is adjusted so that
 * it reflects whether the caller will ("addref") or will not install a
 * reference-holding entry.
 */
static pml4_entry_t *
pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
    bool addref)
{
	vm_pindex_t pml5index;
	pml5_entry_t *pml5;
	pml4_entry_t *pml4;
	vm_page_t pml4pg;
	pt_entry_t PG_V;
	bool allocated;

	if (!pmap_is_la57(pmap))
		return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);

	PG_V = pmap_valid_bit(pmap);
	pml5index = pmap_pml5e_index(va);
	pml5 = &pmap->pm_pmltop[pml5index];
	if ((*pml5 & PG_V) == 0) {
		/* Allocate the PML4 page and install it in the PML5. */
		if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp,
		    va) == NULL)
			return (NULL);
		allocated = true;
	} else {
		allocated = false;
	}
	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME);
	pml4 = &pml4[pmap_pml4e_index(va)];
	if ((*pml4 & PG_V) == 0) {
		pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME);
		if (allocated && !addref)
			pml4pg->ref_count--;
		else if (!allocated && addref)
			pml4pg->ref_count++;
	}
	return (pml4);
}

/*
 * Return a pointer to the PDP entry for "va", allocating the containing
 * PDP page (and, transitively, the PML4 page) on demand.  The ref_count
 * of the PDP page is adjusted analogously to pmap_allocpte_getpml4().
 */
static pdp_entry_t *
pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
    bool addref)
{
	vm_page_t pdppg;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pt_entry_t PG_V;
	bool allocated;

	PG_V = pmap_valid_bit(pmap);

	pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
	if (pml4 == NULL)
		return (NULL);

	if ((*pml4 & PG_V) == 0) {
		/* Have to allocate a new pdp, recurse */
		if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp,
		    va) == NULL) {
			/* Undo the PML4 allocation if it is now unused. */
			if (pmap_is_la57(pmap))
				pmap_allocpte_free_unref(pmap, va,
				    pmap_pml5e(pmap, va));
			return (NULL);
		}
		allocated = true;
	} else {
		allocated = false;
	}
	pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
	pdp = &pdp[pmap_pdpe_index(va)];
	if ((*pdp & PG_V) == 0) {
		pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
		if (allocated && !addref)
			pdppg->ref_count--;
		else if (!allocated && addref)
			pdppg->ref_count++;
	}
	return (pdp);
}

/*
 * The ptepindexes, i.e. page indices, of the page table pages encountered
 * while translating virtual address va are defined as follows:
 * - for the page table page (last level),
 *      ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
 *   in other words, it is just the index of the PDE that maps the page
 *   table page.
 * - for the page directory page,
 *      ptepindex = NUPDE (number of userland PD entries) +
 *          (pmap_pde_index(va) >> NPDEPGSHIFT)
 *   i.e. index of PDPE is put after the last index of PDE,
 * - for the page directory pointer page,
 *      ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
 *          NPML4EPGSHIFT),
 *   i.e. index of pml4e is put after the last index of PDPE,
 * - for the PML4 page (if LA57 mode is enabled),
 *      ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
 *          (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT),
 *   i.e. index of pml5e is put after the last index of PML4E.
 *
 * Define an order on the paging entries, where all entries of the
 * same height are put together, then heights are put from deepest to
 * root.  Then ptepindex is the sequential number of the
 * corresponding paging entry in this order.
 *
 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of
 * LA57 paging structures even in LA48 paging mode.  Moreover, the
 * ptepindexes are calculated as if the paging structures were 5-level
 * regardless of the actual mode of operation.
 *
 * The root page at PML4/PML5 does not participate in this indexing scheme,
 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte().
 */
static vm_page_t
pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
    vm_offset_t va)
{
	vm_pindex_t pml5index, pml4index;
	pml5_entry_t *pml5, *pml5u;
	pml4_entry_t *pml4, *pml4u;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	vm_page_t m, pdpg;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	/*
	 * Allocate a page table page.
	 */
	m = pmap_alloc_pt_page(pmap, ptepindex,
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	if (m == NULL)
		return (NULL);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */
	if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
		/* The new page is a PML4 page; only valid in LA57 mode. */
		MPASS(pmap_is_la57(pmap));

		pml5index = pmap_pml5e_index(va);
		pml5 = &pmap->pm_pmltop[pml5index];
		KASSERT((*pml5 & PG_V) == 0,
		    ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
		*pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

		if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
			if (pmap->pm_ucr3 != PMAP_NO_CR3)
				*pml5 |= pg_nx;

			/* Mirror the entry into the PTI user page table. */
			pml5u = &pmap->pm_pmltopu[pml5index];
			*pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
			    PG_A | PG_M;
		}
	} else if (ptepindex >= NUPDE + NUPDPE) {
		pml4index = pmap_pml4e_index(va);
		/* Wire up a new PDPE page */
		pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
		if (pml4 == NULL) {
			pmap_free_pt_page(pmap, m, true);
			return (NULL);
		}
		KASSERT((*pml4 & PG_V) == 0,
		    ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
		    pml4index < NUPML4E) {
			/*
			 * PTI: Make all user-space mappings in the
			 * kernel-mode page table no-execute so that
			 * we detect any programming errors that leave
			 * the kernel-mode page table active on return
			 * to user space.
			 */
			if (pmap->pm_ucr3 != PMAP_NO_CR3)
				*pml4 |= pg_nx;

			pml4u = &pmap->pm_pmltopu[pml4index];
			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
			    PG_A | PG_M;
		}
	} else if (ptepindex >= NUPDE) {
		/* Wire up a new PDE page */
		pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
		if (pdp == NULL) {
			pmap_free_pt_page(pmap, m, true);
			return (NULL);
		}
		KASSERT((*pdp & PG_V) == 0,
		    ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	} else {
		/* Wire up a new PTE page */
		pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
		if (pdp == NULL) {
			pmap_free_pt_page(pmap, m, true);
			return (NULL);
		}
		if ((*pdp & PG_V) == 0) {
			/* Have to allocate a new pd, recurse */
			if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va),
			    lockp, va) == NULL) {
				pmap_allocpte_free_unref(pmap, va,
				    pmap_pml4e(pmap, va));
				pmap_free_pt_page(pmap, m, true);
				return (NULL);
			}
		} else {
			/* Add reference to the pd page */
			pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
			pdpg->ref_count++;
		}
		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);

		/* Now we know where the page directory page is */
		pd = &pd[pmap_pde_index(va)];
		KASSERT((*pd & PG_V) == 0,
		    ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	}

	return (m);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.  Sleep
 * occurs right before returning to the caller.  This way, we never
 * drop pmap lock to sleep while a page table page has ref_count == 0,
 * which prevents the page from being freed under us.
 */
static vm_page_t
pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
    vm_offset_t va)
{
	vm_page_t m;

	m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va);
	if (m == NULL && lockp != NULL) {
		RELEASE_PV_LIST_LOCK(lockp);
		PMAP_UNLOCK(pmap);
		PMAP_ASSERT_NOT_IN_DI();
		vm_wait(NULL);
		PMAP_LOCK(pmap);
	}
	return (m);
}

/*
 * Return a pointer to the PDE for "va", allocating the containing page
 * directory page for a user address if necessary.  On success, "*pdpgp"
 * is set to the PD page with a reference held for the caller, or to NULL
 * for a kernel address.  Panics if a kernel PD page is missing.
 */
static pd_entry_t *
pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
    struct rwlock **lockp)
{
	pdp_entry_t *pdpe, PG_V;
	pd_entry_t *pde;
	vm_page_t pdpg;
	vm_pindex_t pdpindex;

	PG_V = pmap_valid_bit(pmap);

retry:
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		pde = pmap_pdpe_to_pde(pdpe, va);
		if (va < VM_MAXUSER_ADDRESS) {
			/* Add a reference to the pd page. */
			pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
			pdpg->ref_count++;
		} else
			pdpg = NULL;
	} else if (va < VM_MAXUSER_ADDRESS) {
		/* Allocate a pd page. */
		pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
		pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va);
		if (pdpg == NULL) {
			/* With a lock pointer the allocation slept; retry. */
			if (lockp != NULL)
				goto retry;
			else
				return (NULL);
		}
		pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
		pde = &pde[pmap_pde_index(va)];
	} else
		panic("pmap_alloc_pde: missing page table page for va %#lx",
		    va);
	*pdpgp = pdpg;
	return (pde);
}

/*
 * Return the page table page mapping "va", allocating it and any missing
 * intermediate paging-structure pages if necessary, with its ref_count
 * incremented on behalf of the caller.  A 2MB page mapping covering "va"
 * is demoted first.
 */
static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pd, PG_V;
	vm_page_t m;

	PG_V = pmap_valid_bit(pmap);

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_pde_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	pd = pmap_pde(pmap, va);

	/*
	 * This supports switching from a 2MB page to a
	 * normal 4K page.
	 */
	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
			/*
			 * Invalidation of the 2MB page mapping may have caused
			 * the deallocation of the underlying PD page.
			 */
			pd = NULL;
		}
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (pd != NULL && (*pd & PG_V) != 0) {
		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
		m->ref_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
		if (m == NULL && lockp != NULL)
			goto retry;
	}
	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;
	int i;

	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap %p has reserved page table page(s)",
	    pmap));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("releasing active pmap %p", pmap));

	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));

	/* Clear the shared kernel entries before freeing the top-level page. */
	if (pmap_is_la57(pmap)) {
		pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
		pmap->pm_pmltop[PML5PML5I] = 0;
	} else {
		for (i = 0; i < NKPML4E; i++)	/* KVA */
			pmap->pm_pmltop[KPML4BASE + i] = 0;
#ifdef KASAN
		for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */
			pmap->pm_pmltop[KASANPML4I + i] = 0;
#endif
#ifdef KMSAN
		for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */
			pmap->pm_pmltop[KMSANSHADPML4I + i] = 0;
		for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */
			pmap->pm_pmltop[KMSANORIGPML4I + i] = 0;
#endif
		for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
			pmap->pm_pmltop[DMPML4I + i] = 0;
		pmap->pm_pmltop[PML4PML4I] = 0;	/* Recursive Mapping */
		for (i = 0; i < lm_ents; i++)	/* Large Map */
			pmap->pm_pmltop[LMSPML4I + i] = 0;
	}

	pmap_free_pt_page(NULL, m, true);
	pmap_pt_page_count_pinit(pmap, -1);

	if (pmap->pm_pmltopu != NULL) {
		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
		    pm_pmltopu));
		pmap_free_pt_page(NULL, m, false);
		pmap_pt_page_count_pinit(pmap, -1);
	}
	if (pmap->pm_type == PT_X86 &&
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
		rangeset_fini(&pmap->pm_pkru);

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap %p resident count %ld != 0",
	    pmap, pmap->pm_stats.resident_count));
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

#ifdef KMSAN
/*
 * Map dummy, zero-filled backing for the KMSAN shadow of a KVA range:
 * every PDE in the range points at a single dummy page table whose
 * entries all reference the same dummy data page.
 */
static void
pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_paddr_t dummypa, dummypd, dummypt;
	int i, npde, npdpg;

	npdpg = howmany(size, NBPDP);
	npde = size / NBPDR;

	dummypa = vm_phys_early_alloc(-1, PAGE_SIZE);
	pagezero((void *)PHYS_TO_DMAP(dummypa));

	dummypt = vm_phys_early_alloc(-1, PAGE_SIZE);
	pagezero((void *)PHYS_TO_DMAP(dummypt));
	dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg);
	for (i = 0; i < npdpg; i++)
		pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i)));

	/* Every PTE points at the single dummy data page. */
	pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt);
	for (i = 0; i < NPTEPG; i++)
		pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW |
		    X86_PG_A | X86_PG_M | pg_nx);

	/* Every PDE points at the single dummy page table. */
	pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd);
	for (i = 0; i < npde; i++)
		pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx);

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(pdppa);
	for (i = 0; i < npdpg; i++)
		pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V |
		    X86_PG_RW | pg_nx);
}

static void
pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end)
{
	vm_size_t size;

	KASSERT(start % NBPDP == 0, ("unaligned page array start address"));

	/*
	 * The end of the page array's KVA region is 2MB aligned, see
	 * kmem_init().
	 */
	size = round_2mpage(end) - start;
	pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size);
	pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size);
}
#endif

/*
 * Allocate physical memory for the vm_page array and map it into KVA,
 * attempting to back the vm_pages with domain-local memory.
 */
void
pmap_page_array_startup(long pages)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde, newpdir;
	vm_offset_t va, start, end;
	vm_paddr_t pa;
	long pfn;
	int domain, i;

	vm_page_array_size = pages;

	start = VM_MIN_KERNEL_ADDRESS;
	end = start + pages * sizeof(struct vm_page);
	for (va = start; va < end; va += NBPDR) {
		pfn = first_page + (va - start) / sizeof(struct vm_page);
		domain = vm_phys_domain(ptoa(pfn));
		pdpe = pmap_pdpe(kernel_pmap, va);
		if ((*pdpe & X86_PG_V) == 0) {
			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
			dump_add_page(pa);
			pagezero((void *)PHYS_TO_DMAP(pa));
			*pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW |
			    X86_PG_A | X86_PG_M);
		}
		pde = pmap_pdpe_to_pde(pdpe, va);
		if ((*pde & X86_PG_V) != 0)
			panic("Unexpected pde");
		pa = vm_phys_early_alloc(domain, NBPDR);
		for (i = 0; i < NPDEPG; i++)
			dump_add_page(pa + i * PAGE_SIZE);
		/* Map each chunk with a 2MB (PG_PS), global, no-exec page. */
		newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A |
		    X86_PG_M | PG_PS | pg_g | pg_nx);
		pde_store(pde, newpdir);
	}
	vm_page_array = (vm_page_t)start;

#ifdef KMSAN
	pmap_kmsan_page_array_startup(start, end);
#endif
}

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *pde, newpdir;
	pdp_entry_t *pdpe;
	vm_offset_t end;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	/*
	 * The kernel map covers two distinct regions of KVA: that used
	 * for dynamic kernel memory allocations, and the uppermost 2GB
	 * of the virtual address space.  The latter is used to map the
	 * kernel and loadable kernel modules.  This scheme enables the
	 * use of a special code generation model for kernel code which
	 * takes advantage of compact addressing modes in machine code.
	 *
	 * Both regions grow upwards; to avoid wasting memory, the gap
	 * in between is unmapped.  If "addr" is above "KERNBASE", the
	 * kernel's region is grown, otherwise the kmem region is grown.
	 *
	 * The correctness of this action is based on the following
	 * argument: vm_map_insert() allocates contiguous ranges of the
	 * kernel virtual address space.  It calls this function if a range
	 * ends after "kernel_vm_end".  If the kernel is mapped between
	 * "kernel_vm_end" and "addr", then the range cannot begin at
	 * "kernel_vm_end".  In fact, its beginning address cannot be less
	 * than the kernel.  Thus, there is no immediate need to allocate
	 * any new kernel page table pages between "kernel_vm_end" and
	 * "KERNBASE".
	 */
	if (KERNBASE < addr) {
		end = KERNBASE + nkpt * NBPDR;
		if (end == 0)
			return;
	} else {
		end = kernel_vm_end;
	}

	addr = roundup2(addr, NBPDR);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	if (addr <= end) {
		/*
		 * The grown region is already mapped, so there is
		 * nothing to do.
		 */
		return;
	}

	kasan_shadow_map(end, addr - end);
	kmsan_shadow_map(end, addr - end);
	while (end < addr) {
		pdpe = pmap_pdpe(kernel_pmap, end);
		if ((*pdpe & X86_PG_V) == 0) {
			/* First install a page directory page, then retry. */
			nkpg = pmap_alloc_pt_page(kernel_pmap,
			    pmap_pdpe_pindex(end), VM_ALLOC_WIRED |
			    VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			paddr = VM_PAGE_TO_PHYS(nkpg);
			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
			    X86_PG_A | X86_PG_M);
			continue; /* try again */
		}
		pde = pmap_pdpe_to_pde(pdpe, end);
		if ((*pde & X86_PG_V) != 0) {
			end = (end + NBPDR) & ~PDRMASK;
			if (end - 1 >= vm_map_max(kernel_map)) {
				end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end),
		    VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		paddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
		pde_store(pde, newpdir);

		end = (end + NBPDR) & ~PDRMASK;
		if (end - 1 >= vm_map_max(kernel_map)) {
			end = vm_map_max(kernel_map);
			break;
		}
	}

	/* Record the new high-water mark for whichever region grew. */
	if (end <= KERNBASE)
		kernel_vm_end = end;
	else
		nkpt = howmany(end - KERNBASE, NBPDR);
}

/***************************************************
 * page management routines.
 ***************************************************/

static const uint64_t pc_freemask[_NPCM] = {
	[0 ...
_NPCM - 2] = PC_FREEN, 5121 [_NPCM - 1] = PC_FREEL 5122 }; 5123 5124 #ifdef PV_STATS 5125 5126 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5127 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5128 &pc_chunk_count, "Current number of pv entry cnunks"); 5129 5130 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5131 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5132 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5133 5134 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5135 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5136 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5137 5138 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5139 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5140 &pc_chunk_tryfail, 5141 "Number of failed attempts to get a pv entry chunk page"); 5142 5143 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5144 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5145 &pv_entry_frees, "Total number of pv entries freed"); 5146 5147 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5148 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5149 &pv_entry_allocs, "Total number of pv entries allocated"); 5150 5151 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5152 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5153 &pv_entry_count, "Current number of pv entries"); 5154 5155 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5156 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5157 &pv_entry_spare, "Current number of spare pv entries"); 5158 #endif 5159 5160 static void 5161 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5162 { 5163 5164 if (pmap == NULL) 5165 return; 5166 pmap_invalidate_all(pmap); 5167 if (pmap != locked_pmap) 5168 PMAP_UNLOCK(pmap); 5169 if (start_di) 5170 pmap_delayed_invl_finish(); 5171 } 5172 5173 /* 5174 * We are in 
a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
{
	struct pv_chunks_list *pvc;
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed;
	bool start_di, restart;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
	pmap = NULL;
	m_pc = NULL;
	PG_G = PG_A = PG_M = PG_RW = 0;
	SLIST_INIT(&free);
	/*
	 * The markers delimit this call's scan of the LRU list; chunks with
	 * pc_pmap == NULL below are recognized as markers and skipped.
	 */
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	/*
	 * A delayed invalidation block should already be active if
	 * pmap_advise() or pmap_remove() called this function by way
	 * of pmap_demote_pde_locked().
	 */
	start_di = pmap_not_in_di();

	pvc = &pv_chunks[domain];
	mtx_lock(&pvc->pvc_lock);
	pvc->active_reclaims++;
	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pvc->pvc_lock);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pc_chunks_mutex is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			restart = false;
			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
			    start_di);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				if (start_di)
					pmap_delayed_invl_start();
				mtx_lock(&pvc->pvc_lock);
				restart = true;
			} else if (pmap != locked_pmap) {
				if (PMAP_TRYLOCK(pmap)) {
					if (start_di)
						pmap_delayed_invl_start();
					mtx_lock(&pvc->pvc_lock);
					restart = true;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pvc->pvc_lock);
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					goto next_chunk;
				}
			} else if (start_di)
				pmap_delayed_invl_start();
			PG_G = pmap_global_bit(pmap);
			PG_A = pmap_accessed_bit(pmap);
			PG_M = pmap_modified_bit(pmap);
			PG_RW = pmap_rw_bit(pmap);
			if (restart)
				continue;
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = bsfq(inuse);
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va);
				if ((*pde & PG_PS) != 0)
					continue;
				pte = pmap_pde_to_pte(pde, va);
				if ((*pte & PG_W) != 0)
					continue;
				tpte = pte_load_clear(pte);
				if ((tpte & PG_G) != 0)
					pmap_invalidate_page(pmap, va);
				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
					vm_page_dirty(m);
				if ((tpte & PG_A) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pmap_delayed_invl_page(m);
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, *pde, &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pvc->pvc_lock);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_adj(pmap, -freed);
		PV_STAT(counter_u64_add(pv_entry_frees, freed));
		PV_STAT(counter_u64_add(pv_entry_spare, freed));
		PV_STAT(counter_u64_add(pv_entry_count, -freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc_is_free(pc)) {
			PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
			PV_STAT(counter_u64_add(pc_chunk_count, -1));
			PV_STAT(counter_u64_add(pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pvc->pvc_lock);
			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
			break;
		}
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pvc->pvc_lock);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;
next_chunk:
		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
		if (pvc->active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) {
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
			}
		}
	}
	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
	pvc->active_reclaims--;
	mtx_unlock(&pvc->pvc_lock);
	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}

/*
 * Reclaim a PV chunk, trying the current CPU's NUMA domain first and then
 * the remaining domains round-robin.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	vm_page_t m;
	int i, domain;

	domain = PCPU_GET(domain);
	for (i = 0; i < vm_ndomains; i++) {
		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
		if (m != NULL)
			break;
		domain = (domain + 1) % vm_ndomains;
	}

	return (m);
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(counter_u64_add(pv_entry_frees, 1));
	PV_STAT(counter_u64_add(pv_entry_spare, 1));
	PV_STAT(counter_u64_add(pv_entry_count, -1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	/* Set the entry's bit in the chunk's free bitmap. */
	pc->pc_map[field] |= 1ul << bit;
	if (!pc_is_free(pc)) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			/*
			 * Move the chunk to the head so that get_pv_entry(),
			 * which scans from the head, finds the free entry.
			 */
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

/*
 * Release a fully-free PV chunk's page.  The caller has already removed the
 * chunk from its per-domain pc_lru list.
 */
static void
free_pv_chunk_dequeued(struct pv_chunk *pc)
{
	vm_page_t m;

	PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
	PV_STAT(counter_u64_add(pc_chunk_count, -1));
	PV_STAT(counter_u64_add(pc_chunk_frees, 1));
	counter_u64_add(pv_page_count, -1);
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	struct pv_chunks_list *pvc;

	pvc = &pv_chunks[pc_to_domain(pc)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	free_pv_chunk_dequeued(pc);
}

static void
free_pv_chunk_batch(struct pv_chunklist *batch)
{
	struct pv_chunks_list *pvc;
	struct pv_chunk *pc, *npc;
	int i;

	/* First pass: dequeue every chunk from its per-domain LRU list. */
	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&batch[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_FOREACH(pc, &batch[i], pc_list) {
			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
		}
		mtx_unlock(&pvc->pvc_lock);
	}

	/* Second pass: free the chunk pages without holding the LRU locks. */
	for (i = 0; i < vm_ndomains; i++) {
		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
			free_pv_chunk_dequeued(pc);
		}
	}
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(counter_u64_add(pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		/* Find the first chunk bitmap word with a free entry. */
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = bsfq(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
			    pc->pc_map[2] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(counter_u64_add(pv_entry_count, 1));
			PV_STAT(counter_u64_add(pv_entry_spare, -1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(counter_u64_add(pc_chunk_tryfail, 1));
			return (NULL);
		}
		/*
		 * A NULL return from reclaim_pv_chunk() means PV entries
		 * were reclaimed from this pmap itself; retry the fast path.
		 */
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	} else
		counter_u64_add(pv_page_count, 1);
	PV_STAT(counter_u64_add(pc_chunk_count, 1));
	PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREEN & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREEN;
	pc->pc_map[2] = PC_FREEL;
	pvc = &pv_chunks[vm_page_domain(m)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	/* Return the entry corresponding to the preallocated bit 0. */
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(counter_u64_add(pv_entry_count, 1));
	PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Returns the number of one bits within the given PV chunk map.
 *
 * The erratas for Intel processors state that "POPCNT Instruction May
 * Take Longer to Execute Than Expected".  It is believed that the
 * issue is the spurious dependency on the destination register.
 * Provide a hint to the register rename logic that the destination
 * value is overwritten, by clearing it, as suggested in the
 * optimization manual.  It should be cheap for unaffected processors
 * as well.
 *
 * Reference numbers for erratas are
 * 4th Gen Core: HSD146
 * 5th Gen Core: BDM85
 * 6th Gen Core: SKL029
 */
static int
popcnt_pc_map_pq(uint64_t *map)
{
	u_long result, tmp;

	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
	    : "=&r" (result), "=&r" (tmp)
	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
	return (result);
}

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	struct pch new_tail[PMAP_MEMDOM];
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free, i;
	bool reclaimed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&new_tail[i]);
retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
#ifndef __POPCNT__
		/* Fall back to bit_count() when the CPU lacks POPCNT. */
		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
			bit_count((bitstr_t *)pc->pc_map, 0,
			    sizeof(pc->pc_map) * NBBY, &free);
		else
#endif
			free = popcnt_pc_map_pq(pc->pc_map);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		} else
			counter_u64_add(pv_page_count, 1);
		PV_STAT(counter_u64_add(pc_chunk_count, 1));
		PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREEN;
		pc->pc_map[1] = PC_FREEN;
		pc->pc_map[2] = PC_FREEL;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
		PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV));

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	/* Publish the new chunks on their per-domain LRU lists. */
	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&new_tail[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
		mtx_unlock(&pvc->pvc_lock);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.
This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void
pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining NPTEPG - 1 pv entries. */
	PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1));
	va_last = va + NBPDR - PAGE_SIZE;
	for (;;) {
		/*
		 * Consume spare entries from the head chunk; the caller is
		 * expected to have reserved them (see the KASSERT below and
		 * the reserve_pv_entries() call in pmap_demote_pde_locked()).
		 */
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = bsfq(pc->pc_map[field]);
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_pde: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	/* A fully-consumed chunk goes to the tail, like in get_pv_entry(). */
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1));
	PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1)));
}

#if VM_NRESERVLEVEL > 0
/*
 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 * replace the many pv entries for the 4KB page mappings by a single pv entry
 * for the 2MB page mapping.
 */
static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the first page's pv entry for this mapping to the 2mpage's
	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
	 * a transfer avoids the possibility that get_pv_entry() calls
	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
	 * mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Create the PV entry for a 2MB page mapping.
Always returns true unless the 5839 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5840 * false if the PV entry cannot be allocated without resorting to reclamation. 5841 */ 5842 static bool 5843 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5844 struct rwlock **lockp) 5845 { 5846 struct md_page *pvh; 5847 pv_entry_t pv; 5848 vm_paddr_t pa; 5849 5850 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5851 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5852 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5853 NULL : lockp)) == NULL) 5854 return (false); 5855 pv->pv_va = va; 5856 pa = pde & PG_PS_FRAME; 5857 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5858 pvh = pa_to_pvh(pa); 5859 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5860 pvh->pv_gen++; 5861 return (true); 5862 } 5863 5864 /* 5865 * Fills a page table page with mappings to consecutive physical pages. 5866 */ 5867 static void 5868 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5869 { 5870 pt_entry_t *pte; 5871 5872 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5873 *pte = newpte; 5874 newpte += PAGE_SIZE; 5875 } 5876 } 5877 5878 /* 5879 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5880 * mapping is invalidated. 
 */
static boolean_t
pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	struct rwlock *lock;
	boolean_t rv;

	lock = NULL;
	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	return (rv);
}

/*
 * Under INVARIANTS, verify that every PTE written by the demotion maps the
 * physical page that "newpte" predicts for its slot.
 */
static void
pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
{
#ifdef INVARIANTS
#ifdef DIAGNOSTIC
	pt_entry_t *xpte, *ypte;

	for (xpte = firstpte; xpte < firstpte + NPTEPG;
	    xpte++, newpte += PAGE_SIZE) {
		if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
			printf("pmap_demote_pde: xpte %zd and newpte map "
			    "different pages: found %#lx, expected %#lx\n",
			    xpte - firstpte, *xpte, newpte);
			printf("page table dump\n");
			for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
				printf("%zd %#lx\n", ypte - firstpte, *ypte);
			panic("firstpte");
		}
	}
#else
	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
	    ("pmap_demote_pde: firstpte and newpte map different physical"
	    " addresses"));
#endif
#endif
}

/*
 * Demotion failed: remove the 2MB mapping entirely and invalidate it.
 */
static void
pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t oldpde, struct rwlock **lockp)
{
	struct spglist free;
	vm_offset_t sva;

	SLIST_INIT(&free);
	sva = trunc_2mpage(va);
	pmap_remove_pde(pmap, pde, sva, &free, lockp);
	/* Global mappings were already invalidated by pmap_remove_pde(). */
	if ((oldpde & pmap_global_bit(pmap)) == 0)
		pmap_invalidate_pde_page(pmap, sva, oldpde);
	vm_page_free_pages_toq(&free, true);
	CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
	    va, pmap);
}

static boolean_t
pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp)
{
	pd_entry_t newpde, oldpde;
	pt_entry_t *firstpte, newpte;
	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
	vm_paddr_t mptepa;
	vm_page_t mpte;
	int PG_PTE_CACHE;
	bool in_kernel;

	PG_A = pmap_accessed_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
	PG_PKU_MASK = pmap_pku_mask_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	in_kernel = va >= VM_MAXUSER_ADDRESS;
	oldpde = *pde;
	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));

	/*
	 * Invalidate the 2MB page mapping and return "failure" if the
	 * mapping was never accessed.
	 */
	if ((oldpde & PG_A) == 0) {
		KASSERT((oldpde & PG_W) == 0,
		    ("pmap_demote_pde: a wired mapping is missing PG_A"));
		pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
		return (FALSE);
	}

	mpte = pmap_remove_pt_page(pmap, va);
	if (mpte == NULL) {
		KASSERT((oldpde & PG_W) == 0,
		    ("pmap_demote_pde: page table page for a wired mapping"
		    " is missing"));

		/*
		 * If the page table page is missing and the mapping
		 * is for a kernel address, the mapping must belong to
		 * the direct map.  Page table pages are preallocated
		 * for every other part of the kernel address space,
		 * so the direct map region is the only part of the
		 * kernel address space that must be handled here.
		 */
		KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
		    va < DMAP_MAX_ADDRESS),
		    ("pmap_demote_pde: No saved mpte for va %#lx", va));

		/*
		 * If the 2MB page mapping belongs to the direct map
		 * region of the kernel's address space, then the page
		 * allocation request specifies the highest possible
		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
		 * priority is normal.
		 */
		mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
		    (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED);

		/*
		 * If the allocation of the new page table page fails,
		 * invalidate the 2MB page mapping and return "failure".
		 */
		if (mpte == NULL) {
			pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
			return (FALSE);
		}

		if (!in_kernel)
			mpte->ref_count = NPTEPG;
	}
	mptepa = VM_PAGE_TO_PHYS(mpte);
	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pde: oldpde is missing PG_M"));
	/* The PTEs inherit the old PDE's attributes, minus the PS bit. */
	newpte = oldpde & ~PG_PS;
	newpte = pmap_swap_pat(pmap, newpte);

	/*
	 * If the page table page is not leftover from an earlier promotion,
	 * initialize it.
	 */
	if (vm_page_none_valid(mpte))
		pmap_fill_ptp(firstpte, newpte);

	pmap_demote_pde_check(firstpte, newpte);

	/*
	 * If the mapping has changed attributes, update the page table
	 * entries.
	 */
	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
		pmap_fill_ptp(firstpte, newpte);

	/*
	 * The spare PV entries must be reserved prior to demoting the
	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
	 * of the PDE and the PV lists will be inconsistent, which can result
	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
	 * PV entry for the 2MB page mapping that is being demoted.
	 */
	if ((oldpde & PG_MANAGED) != 0)
		reserve_pv_entries(pmap, NPTEPG - 1, lockp);

	/*
	 * Demote the mapping.  This pmap is locked.  The old PDE has
	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
	 * set.  Thus, there is no danger of a race with another
	 * processor changing the setting of PG_A and/or PG_M between
	 * the read above and the store below.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else
		pde_store(pde, newpde);

	/*
	 * Invalidate a stale recursive mapping of the page table page.
	 */
	if (in_kernel)
		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));

	/*
	 * Demote the PV entry.
	 */
	if ((oldpde & PG_MANAGED) != 0)
		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);

	counter_u64_add(pmap_pde_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
	    va, pmap);
	return (TRUE);
}

/*
 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 */
static void
pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	vm_paddr_t mptepa;
	vm_page_t mpte;

	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mpte = pmap_remove_pt_page(pmap, va);
	if (mpte == NULL)
		panic("pmap_remove_kernel_pde: Missing pt page.");

	mptepa = VM_PAGE_TO_PHYS(mpte);
	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;

	/*
	 * If this page table page was unmapped by a promotion, then it
	 * contains valid mappings.  Zero it to invalidate those mappings.
	 */
	if (vm_page_any_valid(mpte))
		pagezero((void *)PHYS_TO_DMAP(mptepa));

	/*
	 * Demote the mapping.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, newpde);
	else
		pde_store(pde, newpde);

	/*
	 * Invalidate a stale recursive mapping of the page table page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
}

/*
 * pmap_remove_pde: do the things to unmap a superpage in a process
 */
static int
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pd_entry_t oldpde;
	vm_offset_t eva, va;
	vm_page_t m, mpte;
	pt_entry_t PG_G, PG_A, PG_M, PG_RW;

	PG_G = pmap_global_bit(pmap);
	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_remove_pde: sva is not 2mpage aligned"));
	oldpde = pte_load_clear(pdq);
	if (oldpde & PG_W)
		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
	if ((oldpde & PG_G) != 0)
		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
	pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
	if (oldpde & PG_MANAGED) {
		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
		pmap_pvh_free(pvh, pmap, sva);
		/* Propagate dirty/referenced state to each constituent page. */
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++) {
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
			if (oldpde & PG_A)
				vm_page_aflag_set(m, PGA_REFERENCED);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
			pmap_delayed_invl_page(m);
		}
	}
	if (pmap == kernel_pmap) {
		pmap_remove_kernel_pde(pmap, pdq, sva);
	} else {
		mpte = pmap_remove_pt_page(pmap, sva);
		if (mpte != NULL) {
			KASSERT(vm_page_all_valid(mpte),
			    ("pmap_remove_pde: pte page not promoted"));
			pmap_pt_page_count_adj(pmap, -1);
			KASSERT(mpte->ref_count == NPTEPG,
			    ("pmap_remove_pde: pte page ref count error"));
			mpte->ref_count = 0;
			pmap_add_delayed_free_list(mpte, free, FALSE);
		}
	}
	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
	vm_page_t m;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	pmap_resident_count_adj(pmap, -1);
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if (oldpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		pmap_pvh_free(&m->md, pmap, va);
		if (TAILQ_EMPTY(&m->md.pv_list) &&
		    (m->flags & PG_FICTITIOUS) == 0) {
			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
			if (TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
		pmap_delayed_invl_page(m);
	}
	return (pmap_unuse_pt(pmap, va, ptepde, free));
}

/*
 * Remove a single page from a process address space
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free)
{
	struct rwlock *lock;
	pt_entry_t *pte, PG_V;

	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((*pde & PG_V) == 0)
		return;
	pte = pmap_pde_to_pte(pde, va);
	if ((*pte & PG_V) == 0)
		return;
	lock = NULL;
	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	/* Single-page removal performs its own TLB invalidation. */
	pmap_invalidate_page(pmap, va);
}

/*
 * Removes the specified range of addresses from the page table page.
 *
 * Returns true if any non-global mapping was removed, in which case the
 * caller is responsible for a full TLB invalidation.  Global mappings
 * are invalidated eagerly here, in batched ranges, because a global
 * flush does not remove them from the TLB.
 */
static bool
pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
{
	pt_entry_t PG_G, *pte;
	vm_offset_t va;
	bool anyvalid;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PG_G = pmap_global_bit(pmap);
	anyvalid = false;
	/* "va" tracks the start of a pending run of global mappings. */
	va = eva;
	for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
	    sva += PAGE_SIZE) {
		if (*pte == 0) {
			if (va != eva) {
				pmap_invalidate_range(pmap, va, sva);
				va = eva;
			}
			continue;
		}
		if ((*pte & PG_G) == 0)
			anyvalid = true;
		else if (va == eva)
			va = sva;
		if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
			/* The page table page was freed; stop the scan. */
			sva += PAGE_SIZE;
			break;
		}
	}
	if (va != eva)
		pmap_invalidate_range(pmap, va, sva);
	return (anyvalid);
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rwlock *lock;
	vm_page_t mt;
	vm_offset_t va_next;
	pml5_entry_t *pml5e;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t ptpaddr, *pde;
	pt_entry_t PG_G, PG_V;
	struct spglist free;
	int anyvalid;

	PG_G = pmap_global_bit(pmap);
	PG_V = pmap_valid_bit(pmap);

	/*
	 * If there are no resident pages besides the top level page
	 * table page(s), there is nothing to do.  Kernel pmap always
	 * accounts whole preloaded area as resident, which makes its
	 * resident count > 2.
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ?
	    1 : 0))
		return;

	anyvalid = 0;
	SLIST_INIT(&free);

	pmap_delayed_invl_start();
	PMAP_LOCK(pmap);
	pmap_pkru_on_remove(pmap, sva, eva);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if (sva + PAGE_SIZE == eva) {
		pde = pmap_pde(pmap, sva);
		if (pde && (*pde & PG_PS) == 0) {
			pmap_remove_page(pmap, sva, pde, &free);
			goto out;
		}
	}

	lock = NULL;
	for (; sva < eva; sva = va_next) {
		if (pmap->pm_stats.resident_count == 0)
			break;

		if (pmap_is_la57(pmap)) {
			pml5e = pmap_pml5e(pmap, sva);
			if ((*pml5e & PG_V) == 0) {
				va_next = (sva + NBPML5) & ~PML5MASK;
				if (va_next < sva)
					va_next = eva;
				continue;
			}
			pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
		} else {
			pml4e = pmap_pml4e(pmap, sva);
		}
		if ((*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + NBPDP) & ~PDPMASK;
		if (va_next < sva)
			va_next = eva;
		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0)
			continue;
		if ((*pdpe & PG_PS) != 0) {
			/* Non-transparent 1GB mappings are removed whole. */
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G mapping "
			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
			    *pdpe, sva, eva, va_next));
			MPASS(pmap != kernel_pmap); /* XXXKIB */
			MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
			anyvalid = 1;
			*pdpe = 0;
			pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE);
			mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
			pmap_unwire_ptp(pmap, sva, mt, &free);
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		ptpaddr = *pde;

		/*
		 * Weed out invalid mappings.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we removing the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == va_next && eva >= va_next) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_remove_pde().
				 */
				if ((ptpaddr & PG_G) == 0)
					anyvalid = 1;
				pmap_remove_pde(pmap, pde, sva, &free, &lock);
				continue;
			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
			    &lock)) {
				/* The large page mapping was destroyed. */
				continue;
			} else
				ptpaddr = *pde;
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
			anyvalid = 1;
	}
	if (lock != NULL)
		rw_wunlock(lock);
out:
	if (anyvalid)
		pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
	pmap_delayed_invl_finish();
	vm_page_free_pages_toq(&free, true);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
	pd_entry_t *pde;
	vm_offset_t va;
	struct spglist free;
	int pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
	rw_wlock(lock);
retry:
	/* First, demote any 2MB mappings so only 4KB mappings remain. */
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock cannot be acquired while the PV list
			 * lock is held without risking deadlock, so drop the
			 * latter and use the PV list generation count to
			 * detect concurrent modification.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
		PMAP_UNLOCK(pmap);
	}
	/* Now destroy every remaining 4KB mapping of the page. */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_A = pmap_accessed_bit(pmap);
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pmap_resident_count_adj(pmap, -1);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		tpte = pte_load_clear(pte);
		if (tpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (tpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
		pmap_invalidate_page(pmap, pv->pv_va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(lock);
	pmap_delayed_invl_wait(m);
	vm_page_free_pages_toq(&free, true);
}

/*
 * pmap_protect_pde: do the things to protect a 2mpage in a process
 *
 * Returns TRUE if the PDE was changed and a deferred TLB invalidation
 * is required; global mappings are invalidated here and do not cause a
 * TRUE return.
 */
static boolean_t
pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
{
	pd_entry_t newpde, oldpde;
	vm_page_t m, mt;
	boolean_t anychanged;
	pt_entry_t PG_G, PG_M, PG_RW;

	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_protect_pde: sva is not 2mpage aligned"));
	anychanged = FALSE;
retry:
	oldpde = newpde = *pde;
	if ((prot & VM_PROT_WRITE) == 0) {
		/*
		 * Before revoking write access, transfer the superpage's
		 * dirty state to each constituent 4KB page.
		 */
		if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
		    (PG_MANAGED | PG_M | PG_RW)) {
			m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
				vm_page_dirty(mt);
		}
		newpde &= ~(PG_RW | PG_M);
	}
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
	if (newpde != oldpde) {
		/*
		 * As an optimization to future operations on this PDE, clear
		 * PG_PROMOTED.  The impending invalidation will remove any
		 * lingering 4KB page mappings from the TLB.
		 */
		if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
			goto retry;
		if ((oldpde & PG_G) != 0)
			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
		else
			anychanged = TRUE;
	}
	return (anychanged);
}

/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_page_t m;
	vm_offset_t va_next;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t ptpaddr, *pde;
	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
	pt_entry_t obits, pbits;
	boolean_t anychanged;

	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
	if (prot == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	/* Nothing to do if neither write nor execute is being revoked. */
	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
		return;

	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	anychanged = FALSE;

	/*
	 * Although this function delays and batches the invalidation
	 * of stale TLB entries, it does not need to call
	 * pmap_delayed_invl_start() and
	 * pmap_delayed_invl_finish(), because it does not
	 * ordinarily destroy mappings.  Stale TLB entries from
	 * protection-only changes need only be invalidated before the
	 * pmap lock is released, because protection-only changes do
	 * not destroy PV entries.  Even operations that iterate over
	 * a physical page's PV list of mappings, like
	 * pmap_remove_write(), acquire the pmap lock for each
	 * mapping.  Consequently, for protection-only changes, the
	 * pmap lock suffices to synchronize both page table and TLB
	 * updates.
	 *
	 * This function only destroys a mapping if pmap_demote_pde()
	 * fails.  In that case, stale TLB entries are immediately
	 * invalidated.
	 */

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		pml4e = pmap_pml4e(pmap, sva);
		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + NBPDP) & ~PDPMASK;
		if (va_next < sva)
			va_next = eva;
		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0)
			continue;
		if ((*pdpe & PG_PS) != 0) {
			/* Non-transparent 1GB mappings are updated whole. */
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G mapping "
			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
			    *pdpe, sva, eva, va_next));
retry_pdpe:
			obits = pbits = *pdpe;
			MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
			MPASS(pmap != kernel_pmap); /* XXXKIB */
			if ((prot & VM_PROT_WRITE) == 0)
				pbits &= ~(PG_RW | PG_M);
			if ((prot & VM_PROT_EXECUTE) == 0)
				pbits |= pg_nx;

			if (pbits != obits) {
				if (!atomic_cmpset_long(pdpe, obits, pbits))
					/* PG_PS cannot be cleared under us, */
					goto retry_pdpe;
				anychanged = TRUE;
			}
			continue;
		}

		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		ptpaddr = *pde;

		/*
		 * Weed out invalid mappings.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			/*
			 * Are we protecting the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == va_next && eva >= va_next) {
				/*
				 * The TLB entry for a PG_G mapping is
				 * invalidated by pmap_protect_pde().
				 */
				if (pmap_protect_pde(pmap, pde, sva, prot))
					anychanged = TRUE;
				continue;
			} else if (!pmap_demote_pde(pmap, pde, sva)) {
				/*
				 * The large page mapping was destroyed.
				 */
				continue;
			}
		}

		if (va_next > eva)
			va_next = eva;

		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
retry:
			obits = pbits = *pte;
			if ((pbits & PG_V) == 0)
				continue;

			if ((prot & VM_PROT_WRITE) == 0) {
				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
				    (PG_MANAGED | PG_M | PG_RW)) {
					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
					vm_page_dirty(m);
				}
				pbits &= ~(PG_RW | PG_M);
			}
			if ((prot & VM_PROT_EXECUTE) == 0)
				pbits |= pg_nx;

			if (pbits != obits) {
				if (!atomic_cmpset_long(pte, obits, pbits))
					goto retry;
				/*
				 * Global mappings are not removed by a
				 * later pmap_invalidate_all(), so they
				 * must be invalidated here.
				 */
				if (obits & PG_G)
					pmap_invalidate_page(pmap, sva);
				else
					anychanged = TRUE;
			}
		}
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
}

#if VM_NRESERVLEVEL > 0
/*
 * Returns true if the given EPT page directory entry permits execution.
 * Always false for non-EPT (ordinary x86) page tables.
 */
static bool
pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
{

	if (pmap->pm_type != PT_EPT)
		return (false);
	return ((pde & EPT_PG_EXECUTE) != 0);
}

/*
 * Tries to promote the 512, contiguous 4KB page mappings that are within a
 * single page table page (PTP) to a single 2MB page mapping.  For promotion
 * to occur, two conditions must be met: (1) the 4KB page mappings must map
 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 * identical characteristics.
 */
static void
pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte,
    struct rwlock **lockp)
{
	pd_entry_t newpde;
	pt_entry_t *firstpte, oldpte, pa, *pte;
	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
	int PG_PTE_CACHE;

	PG_A = pmap_accessed_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
	 * ineligible for promotion due to hardware errata, invalid, or does
	 * not map the first 4KB physical page within a 2MB page.
	 */
	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	newpde = *firstpte;
	if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde)))
		return;
	if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) {
		counter_u64_add(pmap_pde_p_failures, 1);
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return;
	}

	/*
	 * Both here and in the below "for" loop, to allow for repromotion
	 * after MADV_FREE, conditionally write protect a clean PTE before
	 * possibly aborting the promotion due to other PTE attributes.  Why?
	 * Suppose that MADV_FREE is applied to a part of a superpage, the
	 * address range [S, E).  pmap_advise() will demote the superpage
	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
	 * clear PG_M and PG_A in the PTEs for the rest of [S, E).  Later,
	 * imagine that the memory in [S, E) is recycled, but the last 4KB
	 * page in [S, E) is not the last to be rewritten, or simply accessed.
	 * In other words, there is still a 4KB page in [S, E), call it P,
	 * that is writeable but PG_M and PG_A are clear in P's PTE.  Unless
	 * we write protect P before aborting the promotion, if and when P is
	 * finally rewritten, there won't be a page fault to trigger
	 * repromotion.
	 */
setpde:
	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
		/*
		 * When PG_M is already clear, PG_RW can be cleared without
		 * a TLB invalidation.
		 */
		if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW))
			goto setpde;
		newpde &= ~PG_RW;
	}
	if ((newpde & PG_A) == 0) {
		counter_u64_add(pmap_pde_p_failures, 1);
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return;
	}

	/*
	 * Examine each of the other PTEs in the specified PTP.  Abort if this
	 * PTE maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first PTE.
	 */
	pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE;
	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
		oldpte = *pte;
		if ((oldpte & (PG_FRAME | PG_V)) != pa) {
			counter_u64_add(pmap_pde_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return;
		}
setpte:
		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
			/*
			 * When PG_M is already clear, PG_RW can be cleared
			 * without a TLB invalidation.
			 */
			if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW))
				goto setpte;
			oldpte &= ~PG_RW;
			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
			    (va & ~PDRMASK), pmap);
		}
		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
			counter_u64_add(pmap_pde_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the PDE
	 * mapping the superpage is demoted by pmap_demote_pde() or
	 * destroyed by pmap_remove_pde().
	 */
	if (mpte == NULL)
		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_pde: page table page is out of range"));
	KASSERT(mpte->pindex == pmap_pde_pindex(va),
	    ("pmap_promote_pde: page table page's pindex is wrong "
	    "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
	    mpte, mpte->pindex, va, pmap_pde_pindex(va)));
	if (pmap_insert_pt_page(pmap, mpte, true)) {
		counter_u64_add(pmap_pde_p_failures, 1);
		CTR2(KTR_PMAP,
		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
		    pmap);
		return;
	}

	/*
	 * Promote the pv entries.
	 */
	if ((newpde & PG_MANAGED) != 0)
		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);

	/*
	 * Propagate the PAT index to its proper position.
	 */
	newpde = pmap_swap_pat(pmap, newpde);

	/*
	 * Map the superpage.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
	else
		pde_store(pde, PG_PROMOTED | PG_PS | newpde);

	counter_u64_add(pmap_pde_promotions, 1);
	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
	    " in pmap %p", va, pmap);
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 * Enter a non-transparent superpage mapping (1GB for psind == 2, 2MB for
 * psind == 1) at "va", allocating intermediate page table pages as needed.
 * Returns KERN_SUCCESS, KERN_PROTECTION_FAILURE (PKRU mismatch across the
 * range), or KERN_RESOURCE_SHORTAGE (allocation failed with
 * PMAP_ENTER_NOSLEEP); otherwise waits for memory and retries.
 */
static int
pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
    int psind)
{
	vm_page_t mp;
	pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0,
	    ("psind %d unexpected", psind));
	KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
	    ("unaligned phys address %#lx newpte %#lx psind %d",
	    newpte & PG_FRAME, newpte, psind));
	KASSERT((va & (pagesizes[psind] - 1)) == 0,
	    ("unaligned va %#lx psind %d", va, psind));
	KASSERT(va < VM_MAXUSER_ADDRESS,
	    ("kernel mode non-transparent superpage")); /* XXXKIB */
	KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
	    ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */

	PG_V = pmap_valid_bit(pmap);

restart:
	if (!pmap_pkru_same(pmap, va, va + pagesizes[psind]))
		return (KERN_PROTECTION_FAILURE);
	pten = newpte;
	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
		pten |= pmap_pkru_get(pmap, va);

	if (psind == 2) {	/* 1G */
		pml4e = pmap_pml4e(pmap, va);
		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
			mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va),
			    NULL, va);
			if (mp == NULL)
				goto allocf;
			pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
			pdpe = &pdpe[pmap_pdpe_index(va)];
			origpte = *pdpe;
			MPASS(origpte == 0);
		} else {
			pdpe = pmap_pml4e_to_pdpe(pml4e, va);
			KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va));
			origpte = *pdpe;
			if ((origpte & PG_V) == 0) {
				/* Entering a new mapping under an existing
				 * PML4 page: take a reference on it. */
				mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
				mp->ref_count++;
			}
		}
		*pdpe = pten;
	} else /* (psind == 1) */ {	/* 2M */
		pde = pmap_pde(pmap, va);
		if (pde == NULL) {
			mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va),
			    NULL, va);
			if (mp == NULL)
				goto allocf;
			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
			pde = &pde[pmap_pde_index(va)];
			origpte = *pde;
			MPASS(origpte == 0);
		} else {
			origpte = *pde;
			if ((origpte & PG_V) == 0) {
				pdpe = pmap_pdpe(pmap, va);
				MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
				mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
				mp->ref_count++;
			}
		}
		*pde = pten;
	}
	KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
	    (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)),
	    ("va %#lx changing %s phys page origpte %#lx pten %#lx",
	    va, psind == 2 ? "1G" : "2M", origpte, pten));
	if ((pten & PG_W) != 0 && (origpte & PG_W) == 0)
		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
	else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0)
		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
	if ((origpte & PG_V) == 0)
		pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE);

	return (KERN_SUCCESS);

allocf:
	if ((flags & PMAP_ENTER_NOSLEEP) != 0)
		return (KERN_RESOURCE_SHORTAGE);
	/* Drop the pmap lock, wait for free pages, and start over. */
	PMAP_UNLOCK(pmap);
	vm_wait(NULL);
	PMAP_LOCK(pmap);
	goto restart;
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 *
 *	When destroying both a page table and PV entry, this function
 *	performs the TLB invalidation before releasing the PV list
 *	lock, so we do not need pmap_delayed_invl_page() calls here.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind)
{
	struct rwlock *lock;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
	pt_entry_t newpte, origpte;
	pv_entry_t pv;
	vm_paddr_t opa, pa;
	vm_page_t mpte, om;
	int rv;
	boolean_t nosleep;

	PG_A = pmap_accessed_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	va = trunc_page(va);
	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
	    va));
	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
	    ("pmap_enter: managed mapping within the clean submap"));
	if ((m->oflags & VPO_UNMANAGED) == 0)
		VM_PAGE_OBJECT_BUSY_ASSERT(m);
	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
	    ("pmap_enter: flags %u has reserved bits set", flags));
	/* Construct the new PTE from the page's attributes and "prot". */
	pa = VM_PAGE_TO_PHYS(m);
	newpte = (pt_entry_t)(pa | PG_A | PG_V);
	if ((flags & VM_PROT_WRITE) != 0)
		newpte |= PG_M;
	if ((prot & VM_PROT_WRITE) != 0)
		newpte |= PG_RW;
	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpte |= pg_nx;
	if ((flags & PMAP_ENTER_WIRED) != 0)
		newpte |= PG_W;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= PG_G;
	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);

	/*
	 * Set modified bit gratuitously for writeable mappings if
	 * the page is unmanaged. We do not want to take a fault
	 * to do the dirty bit accounting for these mappings.
	 */
	if ((m->oflags & VPO_UNMANAGED) != 0) {
		if ((newpte & PG_RW) != 0)
			newpte |= PG_M;
	} else
		newpte |= PG_MANAGED;

	lock = NULL;
	PMAP_LOCK(pmap);
	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
		    ("managed largepage va %#lx flags %#x", va, flags));
		rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
		    psind);
		goto out;
	}
	if (psind == 1) {
		/* Assert the required virtual and physical alignment. */
		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
		goto out;
	}
	mpte = NULL;

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
retry:
	pde = pmap_pde(pmap, va);
	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
		pte = pmap_pde_to_pte(pde, va);
		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
			mpte->ref_count++;
		}
	} else if (va < VM_MAXUSER_ADDRESS) {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
		mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va),
		    nosleep ? NULL : &lock, va);
		if (mpte == NULL && nosleep) {
			rv = KERN_RESOURCE_SHORTAGE;
			goto out;
		}
		goto retry;
	} else
		panic("pmap_enter: invalid page directory va=%#lx", va);

	origpte = *pte;
	pv = NULL;
	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
		newpte |= pmap_pkru_get(pmap, va);

	/*
	 * Is the specified virtual address already mapped?
	 */
	if ((origpte & PG_V) != 0) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
			pmap->pm_stats.wired_count++;
		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
			pmap->pm_stats.wired_count--;

		/*
		 * Remove the extra PT page reference.
		 */
		if (mpte != NULL) {
			mpte->ref_count--;
			KASSERT(mpte->ref_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			    " va: 0x%lx", va));
		}

		/*
		 * Has the physical page changed?
		 */
		opa = origpte & PG_FRAME;
		if (opa == pa) {
			/*
			 * No, might be a protection or wiring change.
			 */
			if ((origpte & PG_MANAGED) != 0 &&
			    (newpte & PG_RW) != 0)
				vm_page_aflag_set(m, PGA_WRITEABLE);
			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
				goto unchanged;
			goto validate;
		}

		/*
		 * The physical page has changed.  Temporarily invalidate
		 * the mapping.  This ensures that all threads sharing the
		 * pmap keep a consistent view of the mapping, which is
		 * necessary for the correct handling of COW faults.  It
		 * also permits reuse of the old mapping's PV entry,
		 * avoiding an allocation.
		 *
		 * For consistency, handle unmanaged mappings the same way.
		 */
		origpte = pte_load_clear(pte);
		KASSERT((origpte & PG_FRAME) == opa,
		    ("pmap_enter: unexpected pa update for %#lx", va));
		if ((origpte & PG_MANAGED) != 0) {
			om = PHYS_TO_VM_PAGE(opa);

			/*
			 * The pmap lock is sufficient to synchronize with
			 * concurrent calls to pmap_page_test_mappings() and
			 * pmap_ts_referenced().
			 */
			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(om);
			if ((origpte & PG_A) != 0) {
				pmap_invalidate_page(pmap, va);
				vm_page_aflag_set(om, PGA_REFERENCED);
			}
			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
			pv = pmap_pvh_remove(&om->md, pmap, va);
			KASSERT(pv != NULL,
			    ("pmap_enter: no PV entry for %#lx", va));
			if ((newpte & PG_MANAGED) == 0)
				free_pv_entry(pmap, pv);
			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list) &&
			    ((om->flags & PG_FICTITIOUS) != 0 ||
			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
		} else {
			/*
			 * Since this mapping is unmanaged, assume that PG_A
			 * is set.
			 */
			pmap_invalidate_page(pmap, va);
		}
		origpte = 0;
	} else {
		/*
		 * Increment the counters.
		 */
		if ((newpte & PG_W) != 0)
			pmap->pm_stats.wired_count++;
		pmap_resident_count_adj(pmap, 1);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((newpte & PG_MANAGED) != 0) {
		if (pv == NULL) {
			pv = get_pv_entry(pmap, &lock);
			pv->pv_va = va;
		}
		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		if ((newpte & PG_RW) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}

	/*
	 * Update the PTE.
	 */
	if ((origpte & PG_V) != 0) {
validate:
		origpte = pte_load_store(pte, newpte);
		KASSERT((origpte & PG_FRAME) == pa,
		    ("pmap_enter: unexpected pa update for %#lx", va));
		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
		    (PG_M | PG_RW)) {
			if ((origpte & PG_MANAGED) != 0)
				vm_page_dirty(m);

			/*
			 * Although the PTE may still have PG_RW set, TLB
			 * invalidation may nonetheless be required because
			 * the PTE no longer has PG_M set.
			 */
		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
			/*
			 * This PTE change does not require TLB invalidation.
			 */
			goto unchanged;
		}
		if ((origpte & PG_A) != 0)
			pmap_invalidate_page(pmap, va);
	} else
		pte_store(pte, newpte);

unchanged:

#if VM_NRESERVLEVEL > 0
	/*
	 * If both the page table page and the reservation are fully
	 * populated, then attempt promotion.
	 */
	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
	    pmap_ps_enabled(pmap) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0)
		pmap_promote_pde(pmap, pde, va, mpte, &lock);
#endif

	rv = KERN_SUCCESS;
out:
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
 * value.  See pmap_enter_pde() for the possible error values when "no sleep",
 * "no replace", and "no reclaim" are specified.
 */
static int
pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    struct rwlock **lockp)
{
	pd_entry_t newpde;
	pt_entry_t PG_V;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PG_V = pmap_valid_bit(pmap);
	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
	    PG_PS | PG_V;
	if ((m->oflags & VPO_UNMANAGED) == 0)
		newpde |= PG_MANAGED;
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpde |= pg_nx;
	if (va < VM_MAXUSER_ADDRESS)
		newpde |= PG_U;
	/*
	 * This is a best-effort, non-blocking attempt: never sleep for a
	 * page table page, never replace an existing mapping, and never
	 * reclaim a PV entry.  The caller copes with any failure.
	 */
	return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
}

/*
 * Returns true if every page table entry in the specified page table page is
 * zero.
 */
static bool
pmap_every_pte_zero(vm_paddr_t pa)
{
	pt_entry_t *pt_end, *pte;

	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
	/* Scan the PT page through the direct map. */
	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
	for (pt_end = pte + NPTEPG; pte < pt_end; pte++) {
		if (*pte != 0)
			return (false);
	}
	return (true);
}

/*
 * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE,
 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise.  Returns
 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB
 * page mapping already exists within the 2MB virtual address range starting
 * at the specified virtual address or (2) the requested 2MB page mapping is
 * not supported due to hardware errata.  Returns KERN_NO_SPACE if
 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at
 * the specified virtual address.  Returns KERN_PROTECTION_FAILURE if the PKRU
 * settings are not the same across the 2MB virtual address range starting at
 * the specified virtual address.  Returns KERN_RESOURCE_SHORTAGE if either
 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation
 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation
 * failed.
 *
 * The parameter "m" is only used when creating a managed, writeable mapping.
 */
static int
pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
    vm_page_t m, struct rwlock **lockp)
{
	struct spglist free;
	pd_entry_t oldpde, *pde;
	pt_entry_t PG_G, PG_RW, PG_V;
	vm_page_t mt, pdpg;

	KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0,
	    ("pmap_enter_pde: cannot create wired user mapping"));
	PG_G = pmap_global_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
	    ("pmap_enter_pde: newpde is missing PG_M"));
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/* Refuse executable 2M pages when hardware errata forbid them. */
	if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
	    newpde))) {
		CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
		    " in pmap %p", va, pmap);
		return (KERN_FAILURE);
	}
	if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags &
	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (KERN_RESOURCE_SHORTAGE);
	}

	/*
	 * If pkru is not same for the whole pde range, return failure
	 * and let vm_fault() cope.  Check after pde allocation, since
	 * it could sleep.
	 */
	if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
		pmap_abort_ptp(pmap, va, pdpg);
		return (KERN_PROTECTION_FAILURE);
	}
	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
		newpde &= ~X86_PG_PKU_MASK;
		newpde |= pmap_pkru_get(pmap, va);
	}

	/*
	 * If there are existing mappings, either abort or remove them.
	 */
	oldpde = *pde;
	if ((oldpde & PG_V) != 0) {
		KASSERT(pdpg == NULL || pdpg->ref_count > 1,
		    ("pmap_enter_pde: pdpg's reference count is too low"));
		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
			if ((oldpde & PG_PS) != 0) {
				/* Drop the extra reference from above. */
				if (pdpg != NULL)
					pdpg->ref_count--;
				CTR2(KTR_PMAP,
				    "pmap_enter_pde: no space for va %#lx"
				    " in pmap %p", va, pmap);
				return (KERN_NO_SPACE);
			} else if (va < VM_MAXUSER_ADDRESS ||
			    !pmap_every_pte_zero(oldpde & PG_FRAME)) {
				if (pdpg != NULL)
					pdpg->ref_count--;
				CTR2(KTR_PMAP,
				    "pmap_enter_pde: failure for va %#lx"
				    " in pmap %p", va, pmap);
				return (KERN_FAILURE);
			}
		}
		/* Break the existing mapping(s). */
		SLIST_INIT(&free);
		if ((oldpde & PG_PS) != 0) {
			/*
			 * The reference to the PD page that was acquired by
			 * pmap_alloc_pde() ensures that it won't be freed.
			 * However, if the PDE resulted from a promotion, then
			 * a reserved PT page could be freed.
			 */
			(void)pmap_remove_pde(pmap, pde, va, &free, lockp);
			if ((oldpde & PG_G) == 0)
				pmap_invalidate_pde_page(pmap, va, oldpde);
		} else {
			pmap_delayed_invl_start();
			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
			    lockp))
				pmap_invalidate_all(pmap);
			pmap_delayed_invl_finish();
		}
		if (va < VM_MAXUSER_ADDRESS) {
			vm_page_free_pages_toq(&free, true);
			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
			    pde));
		} else {
			KASSERT(SLIST_EMPTY(&free),
			    ("pmap_enter_pde: freed kernel page table page"));

			/*
			 * Both pmap_remove_pde() and pmap_remove_ptes() will
			 * leave the kernel page table page zero filled.
			 */
			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
			if (pmap_insert_pt_page(pmap, mt, false))
				panic("pmap_enter_pde: trie insert failed");
		}
	}

	if ((newpde & PG_MANAGED) != 0) {
		/*
		 * Abort this mapping if its PV entry could not be created.
		 */
		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
			if (pdpg != NULL)
				pmap_abort_ptp(pmap, va, pdpg);
			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
		if ((newpde & PG_RW) != 0) {
			/* Mark all 512 constituent 4KB pages writeable. */
			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
				vm_page_aflag_set(mt, PGA_WRITEABLE);
		}
	}

	/*
	 * Increment counters.
	 */
	if ((newpde & PG_W) != 0)
		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
	pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);

	/*
	 * Map the superpage.  (This is not a promoted mapping; there will not
	 * be any lingering 4KB page mappings in the TLB.)
	 */
	pde_store(pde, newpde);

	counter_u64_add(pmap_pde_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p",
	    va, pmap);
	return (KERN_SUCCESS);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	struct rwlock *lock;
	vm_offset_t va;
	vm_page_t m, mpte;
	vm_pindex_t diff, psize;
	int rv;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	psize = atop(end - start);
	mpte = NULL;
	m = m_start;
	lock = NULL;
	PMAP_LOCK(pmap);
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		va = start + ptoa(diff);
		/*
		 * Use a 2MB mapping when the address is 2MB-aligned, the
		 * whole superpage fits below "end", and the page run is
		 * fully populated (psind == 1).  Otherwise, fall back to a
		 * 4KB mapping.  KERN_NO_SPACE means an existing 2MB mapping
		 * is already present, so the range can simply be skipped.
		 */
		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
		    m->psind == 1 && pmap_ps_enabled(pmap) &&
		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
		    KERN_SUCCESS || rv == KERN_NO_SPACE))
			m = &m[NBPDR / PAGE_SIZE - 1];
		else
			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
			    mpte, &lock);
		m = TAILQ_NEXT(m, listq);
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * this code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
 */

void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	struct rwlock *lock;

	lock = NULL;
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Internal, locked form of pmap_enter_quick().  Returns the page table page
 * backing "va" (so a caller mapping consecutive pages can reuse it), or NULL
 * if the mapping was not created.  Never sleeps and never replaces an
 * existing mapping.
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
	pt_entry_t newpte, *pte, PG_V;

	KASSERT(!VA_IS_CLEANMAP(va) ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		pdp_entry_t *pdpe;
		pd_entry_t *pde;
		vm_pindex_t ptepindex;

		/*
		 * Calculate pagetable page index
		 */
		ptepindex = pmap_pde_pindex(va);
		if (mpte && (mpte->pindex == ptepindex)) {
			/* Reuse the caller-supplied page table page. */
			mpte->ref_count++;
		} else {
			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.  Otherwise, we
			 * attempt to allocate a page table page, passing NULL
			 * instead of the PV list lock pointer because we don't
			 * intend to sleep.  If this attempt fails, we don't
			 * retry.  Instead, we give up.
			 */
			pdpe = pmap_pdpe(pmap, va);
			if (pdpe != NULL && (*pdpe & PG_V) != 0) {
				/* A 1GB mapping is already in the way. */
				if ((*pdpe & PG_PS) != 0)
					return (NULL);
				pde = pmap_pdpe_to_pde(pdpe, va);
				if ((*pde & PG_V) != 0) {
					/* Likewise for a 2MB mapping. */
					if ((*pde & PG_PS) != 0)
						return (NULL);
					mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
					mpte->ref_count++;
				} else {
					mpte = pmap_allocpte_alloc(pmap,
					    ptepindex, NULL, va);
					if (mpte == NULL)
						return (NULL);
				}
			} else {
				mpte = pmap_allocpte_alloc(pmap, ptepindex,
				    NULL, va);
				if (mpte == NULL)
					return (NULL);
			}
		}
		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
		pte = &pte[pmap_pte_index(va)];
	} else {
		/* Kernel addresses use the recursive mapping directly. */
		mpte = NULL;
		pte = vtopte(va);
	}
	if (*pte) {
		/* A mapping already exists; undo the reference taken above. */
		if (mpte != NULL)
			mpte->ref_count--;
		return (NULL);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
		if (mpte != NULL)
			pmap_abort_ptp(pmap, va, mpte);
		return (NULL);
	}

	/*
	 * Increment counters
	 */
	pmap_resident_count_adj(pmap, 1);

	newpte = VM_PAGE_TO_PHYS(m) | PG_V |
	    pmap_cache_bits(pmap, m->md.pat_mode, 0);
	if ((m->oflags & VPO_UNMANAGED) == 0)
		newpte |= PG_MANAGED;
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpte |= pg_nx;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U | pmap_pkru_get(pmap, va);
	pte_store(pte, newpte);
	return (mpte);
}

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
	pmap_kenter(va, pa);
	/* Flush any stale TLB entry for the reused crashdump slot. */
	pmap_invlpg(kernel_pmap, va);
	return ((void *)crashdumpmap);
}

/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	pd_entry_t *pde;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
	vm_paddr_t pa, ptepa;
	vm_page_t p, pdpg;
	int pat_mode;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
	/* Only 2MB-aligned, 2MB-multiple regions are eligible. */
	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
		if (!pmap_ps_enabled(pmap))
			return;
		if (!vm_object_populate(object, pindex, pindex + atop(size)))
			return;
		p = vm_page_lookup(object, pindex);
		KASSERT(vm_page_all_valid(p),
		    ("pmap_object_init_pt: invalid page %p", p));
		pat_mode = p->md.pat_mode;

		/*
		 * Abort the mapping if the first page is not physically
		 * aligned to a 2MB page boundary.
		 */
		ptepa = VM_PAGE_TO_PHYS(p);
		if (ptepa & (NBPDR - 1))
			return;

		/*
		 * Skip the first page.  Abort the mapping if the rest of
		 * the pages are not physically contiguous or have differing
		 * memory attributes.
		 */
		p = TAILQ_NEXT(p, listq);
		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
		    pa += PAGE_SIZE) {
			KASSERT(vm_page_all_valid(p),
			    ("pmap_object_init_pt: invalid page %p", p));
			if (pa != VM_PAGE_TO_PHYS(p) ||
			    pat_mode != p->md.pat_mode)
				return;
			p = TAILQ_NEXT(p, listq);
		}

		/*
		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
		 * will not affect the termination of this loop.
		 */
		PMAP_LOCK(pmap);
		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
		    pa < ptepa + size; pa += NBPDR) {
			pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL);
			if (pde == NULL) {
				/*
				 * The creation of mappings below is only an
				 * optimization.  If a page directory page
				 * cannot be allocated without blocking,
				 * continue on to the next mapping rather than
				 * blocking.
				 */
				addr += NBPDR;
				continue;
			}
			if ((*pde & PG_V) == 0) {
				pde_store(pde, pa | PG_PS | PG_M | PG_A |
				    PG_U | PG_RW | PG_V);
				pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
				counter_u64_add(pmap_pde_mappings, 1);
			} else {
				/* Continue on if the PDE is already valid. */
				pdpg->ref_count--;
				KASSERT(pdpg->ref_count > 0,
				    ("pmap_object_init_pt: missing reference "
				    "to page directory page, va: 0x%lx", addr));
			}
			addr += NBPDR;
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 * Clear the wired attribute from the mappings for the specified range of
 * addresses in the given pmap.  Every valid mapping within that range
 * must have the wired attribute set.  In contrast, invalid mappings
 * cannot have the wired attribute set, so they are ignored.
 *
 * The wired attribute of the page table entry is not a hardware
 * feature, so there is no need to invalidate any TLB entries.
 * Since pmap_demote_pde() for the wired entry must never fail,
 * pmap_delayed_invl_start()/finish() calls around the
 * function are not needed.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V, PG_G __diagused;

	PG_V = pmap_valid_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		pml4e = pmap_pml4e(pmap, sva);
		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
			/* Skip the whole 512GB slot; guard against wrap. */
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + NBPDP) & ~PDPMASK;
		if (va_next < sva)
			va_next = eva;
		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0)
			continue;
		if ((*pdpe & PG_PS) != 0) {
			/* Non-transparent 1GB page: unwire it whole. */
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G mapping "
			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
			    *pdpe, sva, eva, va_next));
			MPASS(pmap != kernel_pmap); /* XXXKIB */
			MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
			atomic_clear_long(pdpe, PG_W);
			pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
			continue;
		}

		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;
		pde = pmap_pdpe_to_pde(pdpe, sva);
		if ((*pde & PG_V) == 0)
			continue;
		if ((*pde & PG_PS) != 0) {
			if ((*pde & PG_W) == 0)
				panic("pmap_unwire: pde %#jx is missing PG_W",
				    (uintmax_t)*pde);

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + NBPDR == va_next && eva >= va_next) {
				atomic_clear_long(pde, PG_W);
				pmap->pm_stats.wired_count -= NBPDR /
				    PAGE_SIZE;
				continue;
			} else if (!pmap_demote_pde(pmap, pde, sva))
				panic("pmap_unwire: demotion failed");
		}
		if (va_next > eva)
			va_next = eva;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & PG_V) == 0)
				continue;
			if ((*pte & PG_W) == 0)
				panic("pmap_unwire: pte %#jx is missing PG_W",
				    (uintmax_t)*pte);

			/*
			 * PG_W must be cleared atomically.  Although the pmap
			 * lock synchronizes access to PG_W, another processor
			 * could be setting PG_M and/or PG_A concurrently.
			 */
			atomic_clear_long(pte, PG_W);
			pmap->pm_stats.wired_count--;
		}
	}
	PMAP_UNLOCK(pmap);
}

/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct rwlock *lock;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde, srcptepaddr;
	pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
	vm_offset_t addr, end_addr, va_next;
	vm_page_t dst_pdpg, dstmpte, srcmpte;

	/* Only same-address copies (i.e., fork()) are supported. */
	if (dst_addr != src_addr)
		return;

	if (dst_pmap->pm_type != src_pmap->pm_type)
		return;

	/*
	 * EPT page table entries that require emulation of A/D bits are
	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ).  Although
	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
	 * (aka EPT_PG_EXECUTE) could still be set.  Since some EPT
	 * implementations flag an EPT misconfiguration for exec-only
	 * mappings we skip this function entirely for emulated pmaps.
	 */
	if (pmap_emulate_ad_bits(dst_pmap))
		return;

	end_addr = src_addr + len;
	lock = NULL;
	/* Acquire both pmap locks in a consistent (address) order. */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}

	PG_A = pmap_accessed_bit(dst_pmap);
	PG_M = pmap_modified_bit(dst_pmap);
	PG_V = pmap_valid_bit(dst_pmap);

	for (addr = src_addr; addr < end_addr; addr = va_next) {
		KASSERT(addr < UPT_MIN_ADDRESS,
		    ("pmap_copy: invalid to pmap_copy page tables"));

		pml4e = pmap_pml4e(src_pmap, addr);
		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
			va_next = (addr + NBPML4) & ~PML4MASK;
			if (va_next < addr)
				va_next = end_addr;
			continue;
		}

		va_next = (addr + NBPDP) & ~PDPMASK;
		if (va_next < addr)
			va_next = end_addr;
		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
		if ((*pdpe & PG_V) == 0)
			continue;
		if ((*pdpe & PG_PS) != 0) {
			/* Share a non-transparent 1GB mapping wholesale. */
			KASSERT(va_next <= end_addr,
			    ("partial update of non-transparent 1G mapping "
			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
			    *pdpe, addr, end_addr, va_next));
			MPASS((addr & PDPMASK) == 0);
			MPASS((*pdpe & PG_MANAGED) == 0);
			srcptepaddr = *pdpe;
			pdpe = pmap_pdpe(dst_pmap, addr);
			if (pdpe == NULL) {
				if (pmap_allocpte_alloc(dst_pmap,
				    pmap_pml4e_pindex(addr), NULL, addr) ==
				    NULL)
					break;
				pdpe = pmap_pdpe(dst_pmap, addr);
			} else {
				pml4e = pmap_pml4e(dst_pmap, addr);
				dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
				dst_pdpg->ref_count++;
			}
			KASSERT(*pdpe == 0,
			    ("1G mapping present in dst pmap "
			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
			    *pdpe, addr, end_addr, va_next));
			/* The copy is never wired. */
			*pdpe = srcptepaddr & ~PG_W;
			pmap_resident_count_adj(dst_pmap, NBPDP /
			    PAGE_SIZE);
			continue;
		}

		va_next = (addr + NBPDR) & ~PDRMASK;
		if (va_next < addr)
			va_next = end_addr;

		pde = pmap_pdpe_to_pde(pdpe, addr);
		srcptepaddr = *pde;
		if (srcptepaddr == 0)
			continue;

		if (srcptepaddr & PG_PS) {
			/*
			 * We can only virtual copy whole superpages.
			 */
			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
				continue;
			pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL);
			if (pde == NULL)
				break;
			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
			    PMAP_ENTER_NORECLAIM, &lock))) {
				/*
				 * We leave the dirty bit unchanged because
				 * managed read/write superpage mappings are
				 * required to be dirty.  However, managed
				 * superpage mappings are not required to
				 * have their accessed bit set, so we clear
				 * it because we don't know if this mapping
				 * will be used.
				 */
				srcptepaddr &= ~PG_W;
				if ((srcptepaddr & PG_MANAGED) != 0)
					srcptepaddr &= ~PG_A;
				*pde = srcptepaddr;
				pmap_resident_count_adj(dst_pmap, NBPDR /
				    PAGE_SIZE);
				counter_u64_add(pmap_pde_mappings, 1);
			} else
				pmap_abort_ptp(dst_pmap, addr, dst_pdpg);
			continue;
		}

		srcptepaddr &= PG_FRAME;
		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
		KASSERT(srcmpte->ref_count > 0,
		    ("pmap_copy: source page table page is unused"));

		if (va_next > end_addr)
			va_next = end_addr;

		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
		src_pte = &src_pte[pmap_pte_index(addr)];
		dstmpte = NULL;
		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
			ptetemp = *src_pte;

			/*
			 * We only virtual copy managed pages.
			 */
			if ((ptetemp & PG_MANAGED) == 0)
				continue;

			if (dstmpte != NULL) {
				KASSERT(dstmpte->pindex ==
				    pmap_pde_pindex(addr),
				    ("dstmpte pindex/addr mismatch"));
				dstmpte->ref_count++;
			} else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
			    NULL)) == NULL)
				goto out;
			dst_pte = (pt_entry_t *)
			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
			dst_pte = &dst_pte[pmap_pte_index(addr)];
			if (*dst_pte == 0 &&
			    pmap_try_insert_pv_entry(dst_pmap, addr,
			    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
				/*
				 * Clear the wired, modified, and accessed
				 * (referenced) bits during the copy.
				 */
				*dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
				pmap_resident_count_adj(dst_pmap, 1);
			} else {
				pmap_abort_ptp(dst_pmap, addr, dstmpte);
				goto out;
			}
			/* Have we copied all of the valid mappings? */
			if (dstmpte->ref_count >= srcmpte->ref_count)
				break;
		}
	}
out:
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}

/*
 * Copy the PKRU protection ranges from the source pmap to the destination
 * pmap, retrying on transient memory shortage.  Returns 0 when PKRU is not
 * in use (unsupported CPU, non-PT_X86 pmaps, or mismatched pmap types);
 * otherwise returns the result of pmap_pkru_copy().
 */
int
pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
{
	int error;

	if (dst_pmap->pm_type != src_pmap->pm_type ||
	    dst_pmap->pm_type != PT_X86 ||
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
		return (0);
	for (;;) {
		/* Acquire both pmap locks in a consistent (address) order. */
		if (dst_pmap < src_pmap) {
			PMAP_LOCK(dst_pmap);
			PMAP_LOCK(src_pmap);
		} else {
			PMAP_LOCK(src_pmap);
			PMAP_LOCK(dst_pmap);
		}
		error = pmap_pkru_copy(dst_pmap, src_pmap);
		/* Clean up partial copy on failure due to no memory. */
		if (error == ENOMEM)
			pmap_pkru_deassign_all(dst_pmap);
		PMAP_UNLOCK(src_pmap);
		PMAP_UNLOCK(dst_pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

/*
 * Zero the specified hardware page.
8109 */ 8110 void 8111 pmap_zero_page(vm_page_t m) 8112 { 8113 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8114 8115 pagezero((void *)va); 8116 } 8117 8118 /* 8119 * Zero an area within a single hardware page. off and size must not 8120 * cover an area beyond a single hardware page. 8121 */ 8122 void 8123 pmap_zero_page_area(vm_page_t m, int off, int size) 8124 { 8125 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8126 8127 if (off == 0 && size == PAGE_SIZE) 8128 pagezero((void *)va); 8129 else 8130 bzero((char *)va + off, size); 8131 } 8132 8133 /* 8134 * Copy 1 specified hardware page to another. 8135 */ 8136 void 8137 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8138 { 8139 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8140 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8141 8142 pagecopy((void *)src, (void *)dst); 8143 } 8144 8145 int unmapped_buf_allowed = 1; 8146 8147 void 8148 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8149 vm_offset_t b_offset, int xfersize) 8150 { 8151 void *a_cp, *b_cp; 8152 vm_page_t pages[2]; 8153 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8154 int cnt; 8155 boolean_t mapped; 8156 8157 while (xfersize > 0) { 8158 a_pg_offset = a_offset & PAGE_MASK; 8159 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8160 b_pg_offset = b_offset & PAGE_MASK; 8161 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8162 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8163 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8164 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 8165 a_cp = (char *)vaddr[0] + a_pg_offset; 8166 b_cp = (char *)vaddr[1] + b_pg_offset; 8167 bcopy(a_cp, b_cp, cnt); 8168 if (__predict_false(mapped)) 8169 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 8170 a_offset += cnt; 8171 b_offset += cnt; 8172 xfersize -= cnt; 8173 } 8174 } 8175 8176 /* 8177 * Returns true if the pmap's pv is one of the first 8178 * 16 pvs linked to from this page. 
This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv;
	int loops = 0;
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = FALSE;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	/* First, search the page's own 4KB PV list. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = TRUE;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	/*
	 * If not found yet and the budget of 16 entries is not exhausted,
	 * also search the PV list of the containing 2MB page.
	 */
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = TRUE;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_runlock(lock);
	return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	struct rwlock *lock;
	struct md_page *pvh;
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;
	int count, md_gen, pvh_gen;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (0);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	count = 0;
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock cannot be acquired while the PV list
			 * lock is held without risking deadlock.  Drop the
			 * PV list lock, take both in order, and use the
			 * generation count to detect any concurrent change
			 * to the PV list; if it changed, start over.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va);
		if ((*pte & PG_W) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		/* Also count wired 2MB mappings containing this page. */
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pde(pmap, pv->pv_va);
			if ((*pte & PG_W) != 0)
				count++;
			PMAP_UNLOCK(pmap);
		}
	}
	rw_runlock(lock);
	return (count);
}

/*
 * Returns TRUE if the given page is mapped individually or as part of
 * a 2mpage.  Otherwise, returns FALSE.
8290 */ 8291 boolean_t 8292 pmap_page_is_mapped(vm_page_t m) 8293 { 8294 struct rwlock *lock; 8295 boolean_t rv; 8296 8297 if ((m->oflags & VPO_UNMANAGED) != 0) 8298 return (FALSE); 8299 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8300 rw_rlock(lock); 8301 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8302 ((m->flags & PG_FICTITIOUS) == 0 && 8303 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8304 rw_runlock(lock); 8305 return (rv); 8306 } 8307 8308 /* 8309 * Destroy all managed, non-wired mappings in the given user-space 8310 * pmap. This pmap cannot be active on any processor besides the 8311 * caller. 8312 * 8313 * This function cannot be applied to the kernel pmap. Moreover, it 8314 * is not intended for general use. It is only to be used during 8315 * process termination. Consequently, it can be implemented in ways 8316 * that make it faster than pmap_remove(). First, it can more quickly 8317 * destroy mappings by iterating over the pmap's collection of PV 8318 * entries, rather than searching the page table. Second, it doesn't 8319 * have to test and clear the page table entries atomically, because 8320 * no processor is currently accessing the user address space. In 8321 * particular, a page table entry's dirty bit won't change state once 8322 * this function starts. 8323 * 8324 * Although this function destroys all of the pmap's managed, 8325 * non-wired mappings, it can delay and batch the invalidation of TLB 8326 * entries without calling pmap_delayed_invl_start() and 8327 * pmap_delayed_invl_finish(). Because the pmap is not active on 8328 * any other processor, none of these TLB entries will ever be used 8329 * before their eventual invalidation. Consequently, there is no need 8330 * for either pmap_remove_all() or pmap_remove_write() to wait for 8331 * that eventual TLB invalidation. 
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t ptepde;
	pt_entry_t *pte, tpte;
	pt_entry_t PG_M, PG_RW, PG_V;
	struct spglist free;
	struct pv_chunklist free_chunks[PMAP_MEMDOM];
	vm_page_t m, mpte, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, i, idx;
#ifdef PV_STATS
	int freed;
#endif
	boolean_t superpage;
	vm_paddr_t pa;

	/*
	 * Assert that the given pmap is only active on the current
	 * CPU.  Unfortunately, we cannot block another CPU from
	 * activating the pmap while this function is executing.
	 */
	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
#ifdef INVARIANTS
	{
		cpuset_t other_cpus;

		other_cpus = all_cpus;
		critical_enter();
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active);
		critical_exit();
		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
	}
#endif

	lock = NULL;
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&free_chunks[i]);
	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	/*
	 * Iterate over the pmap's pv chunks rather than walking the page
	 * tables; this visits exactly the managed mappings.
	 */
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
#ifdef PV_STATS
		freed = 0;
#endif
		for (field = 0; field < _NPCM; field++) {
			/* Bits clear in pc_map denote allocated pv entries. */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = bsfq(inuse);
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pte = pmap_pdpe(pmap, pv->pv_va);
				ptepde = *pte;
				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
				tpte = *pte;
				if ((tpte & (PG_PS | PG_V)) == PG_V) {
					superpage = FALSE;
					ptepde = tpte;
					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
					    PG_FRAME);
					pte = &pte[pmap_pte_index(pv->pv_va)];
					tpte = *pte;
				} else {
					/*
					 * Keep track whether 'tpte' is a
					 * superpage explicitly instead of
					 * relying on PG_PS being set.
					 *
					 * This is because PG_PS is numerically
					 * identical to PG_PTE_PAT and thus a
					 * regular page could be mistaken for
					 * a superpage.
					 */
					superpage = TRUE;
				}

				if ((tpte & PG_V) == 0) {
					panic("bad pte va %lx pte %lx",
					    pv->pv_va, tpte);
				}

/*
 * We cannot remove wired pages from a process' mapping at this time
 */
				if (tpte & PG_W) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pte_clear(pte);

				if (superpage)
					pa = tpte & PG_PS_FRAME;
				else
					pa = tpte & PG_FRAME;

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 */
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
					if (superpage) {
						/* A dirty 2MB mapping dirties all 512 constituent pages. */
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
					} else
						vm_page_dirty(m);
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				if (superpage) {
					pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					pvh->pv_gen++;
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/* Reclaim the page table page that the 2MB mapping replaced. */
					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
					if (mpte != NULL) {
						KASSERT(vm_page_all_valid(mpte),
						    ("pmap_remove_pages: pte page not promoted"));
						pmap_pt_page_count_adj(pmap, -1);
						KASSERT(mpte->ref_count == NPTEPG,
						    ("pmap_remove_pages: pte page reference count error"));
						mpte->ref_count = 0;
						pmap_add_delayed_free_list(mpte, &free, FALSE);
					}
				} else {
					pmap_resident_count_adj(pmap, -1);
					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
					m->md.pv_gen++;
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m, PGA_WRITEABLE);
					}
				}
				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
#ifdef PV_STATS
				freed++;
#endif
			}
		}
		PV_STAT(counter_u64_add(pv_entry_frees, freed));
		PV_STAT(counter_u64_add(pv_entry_spare, freed));
		PV_STAT(counter_u64_add(pv_entry_count, -freed));
		if (allfree) {
			/* Every pv entry in this chunk was freed; batch-free the chunk. */
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	/* One batched invalidation covers every mapping destroyed above. */
	pmap_invalidate_all(pmap);
	pmap_pkru_deassign_all(pmap);
	free_pv_chunk_batch((struct pv_chunklist *)&free_chunks);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}

/*
 * Test whether every mapping of the given page has the requested
 * accessed and/or modified bits set.  Returns TRUE on the first
 * mapping found with all requested bits set.
 */
static boolean_t
pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
	struct rwlock *lock;
	pv_entry_t pv;
	struct md_page *pvh;
	pt_entry_t *pte, mask;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
	pmap_t pmap;
	int md_gen, pvh_gen;
	boolean_t rv;

	rv = FALSE;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	/* Check the 4KB mappings first. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock to take the pmap lock in the
			 * correct order, then revalidate via the generation
			 * count.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va);
		/* The per-pmap PG_* bit layout differs (e.g. EPT). */
		mask = 0;
		if (modified) {
			PG_M = pmap_modified_bit(pmap);
			PG_RW = pmap_rw_bit(pmap);
			mask |= PG_RW | PG_M;
		}
		if (accessed) {
			PG_A = pmap_accessed_bit(pmap);
			PG_V = pmap_valid_bit(pmap);
			mask |= PG_V | PG_A;
		}
		rv = (*pte & mask) == mask;
		PMAP_UNLOCK(pmap);
		if (rv)
			goto out;
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		/* Then check any 2MB mappings containing the page. */
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pde(pmap, pv->pv_va);
			mask = 0;
			if (modified) {
				PG_M = pmap_modified_bit(pmap);
				PG_RW = pmap_rw_bit(pmap);
				mask |= PG_RW | PG_M;
			}
			if (accessed) {
				PG_A = pmap_accessed_bit(pmap);
				PG_V = pmap_valid_bit(pmap);
				mask |= PG_V | PG_A;
			}
			rv = (*pte & mask) == mask;
			PMAP_UNLOCK(pmap);
			if (rv)
				goto out;
		}
	}
out:
	rw_runlock(lock);
	return (rv);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_modified: page %p is not managed", m));

	/*
	 * If the page is not busied then this check is racy.
	 */
	if (!pmap_page_is_write_mapped(m))
		return (FALSE);
	return (pmap_page_test_mappings(m, FALSE, TRUE));
}

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	boolean_t rv;

	PG_V = pmap_valid_bit(pmap);

	/*
	 * Return TRUE if and only if the PTE for the specified virtual
	 * address is allocated but invalid.
	 */
	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
		pte = pmap_pde_to_pte(pde, addr);
		rv = (*pte & PG_V) == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	return (pmap_page_test_mappings(m, TRUE, FALSE));
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte, PG_M, PG_RW;
	vm_offset_t va;
	int pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	vm_page_assert_busied(m);
	if (!pmap_page_is_write_mapped(m))
		return;

	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
	rw_wlock(lock);
retry:
	/*
	 * First demote any writeable 2MB mapping that covers the page, so
	 * that only 4KB mappings remain on the page's own pv list below.
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock to take the pmap lock in the
			 * correct order; revalidate via the generation count.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_RW = pmap_rw_bit(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/* Now write-protect every remaining 4KB mapping of the page. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0,
		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
		    m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		oldpte = *pte;
		if (oldpte & PG_RW) {
			/*
			 * Atomically clear PG_RW and PG_M together so that a
			 * concurrent hardware dirty-bit update is not lost.
			 */
			while (!atomic_fcmpset_long(pte, &oldpte, oldpte &
			    ~(PG_RW | PG_M)))
				cpu_spinwait();
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	pmap_delayed_invl_wait(m);
}

/*
 * Return TRUE if it is safe to clear the accessed bit in the given PTE
 * without producing an EPT misconfiguration on pmaps that emulate the
 * accessed/dirty bits.
 */
static __inline boolean_t
safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
{

	if (!pmap_emulate_ad_bits(pmap))
		return (TRUE);

	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));

	/*
	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
	 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
	 * if the EPT_PG_WRITE bit is set.
	 */
	if ((pte & EPT_PG_WRITE) != 0)
		return (FALSE);

	/*
	 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
	 */
	if ((pte & EPT_PG_EXECUTE) == 0 ||
	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
		return (TRUE);
	else
		return (FALSE);
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	A DI block is not needed within this function, because
 *	invalidations are performed before the PV list lock is
 *	released.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte, PG_A, PG_M, PG_RW;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;
	boolean_t demoted;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pa = VM_PAGE_TO_PHYS(m);
	lock = PHYS_TO_PV_LIST_LOCK(pa);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
	rw_wlock(lock);
retry:
	not_cleared = 0;
	/* First pass: 2MB mappings that contain the page. */
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock to take the pmap lock in the
			 * correct order; revalidate via the generation count.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_A = pmap_accessed_bit(pmap);
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, pv->pv_va);
		oldpde = *pde;
		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			/*
			 * Although "oldpde" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((oldpde & PG_A) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB
			 * pages, it should not be cleared every time it is
			 * tested.  Apply a simple "hash" function on the
			 * physical page number, the virtual superpage number,
			 * and the pmap address to select one 4KB page out of
			 * the 512 on which testing the reference bit will
			 * result in clearing that reference bit.  This
			 * function is designed to avoid the selection of the
			 * same 4KB page for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
			    (oldpde & PG_W) == 0) {
				if (safe_to_clear_referenced(pmap, oldpde)) {
					atomic_clear_long(pde, PG_A);
					pmap_invalidate_page(pmap, pv->pv_va);
					demoted = FALSE;
				} else if (pmap_demote_pde_locked(pmap, pde,
				    pv->pv_va, &lock)) {
					/*
					 * Remove the mapping to a single page
					 * so that a subsequent access may
					 * repromote.  Since the underlying
					 * page table page is fully populated,
					 * this removal never frees a page
					 * table page.
					 */
					demoted = TRUE;
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pte = pmap_pde_to_pte(pde, va);
					pmap_remove_pte(pmap, pte, va, *pde,
					    NULL, &lock);
					pmap_invalidate_page(pmap, va);
				} else
					demoted = TRUE;

				if (demoted) {
					/*
					 * The superpage mapping was removed
					 * entirely and therefore 'pv' is no
					 * longer valid.
					 */
					if (pvf == pv)
						pvf = NULL;
					pv = NULL;
				}
				cleared++;
				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
				    ("inconsistent pv lock %p %p for page %p",
				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	/* Second pass: the page's own 4KB mappings. */
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		PG_A = pmap_accessed_bit(pmap);
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0,
		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
		    m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if ((*pte & PG_A) != 0) {
			if (safe_to_clear_referenced(pmap, *pte)) {
				atomic_clear_long(pte, PG_A);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
			} else if ((*pte & PG_W) == 0) {
				/*
				 * Wired pages cannot be paged out so
				 * doing accessed bit emulation for
				 * them is wasted effort. We do the
				 * hard work for unwired pages only.
				 */
				pmap_remove_pte(pmap, pte, pv->pv_va,
				    *pde, &free, &lock);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
				if (pvf == pv)
					pvf = NULL;
				pv = NULL;
				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
				    ("inconsistent pv lock %p %p for page %p",
				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
	vm_offset_t va, va_next;
	vm_page_t m;
	bool anychanged;

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	/*
	 * A/D bit emulation requires an alternate code path when clearing
	 * the modified and accessed bits below.  Since this function is
	 * advisory in nature we skip it entirely for pmaps that require
	 * A/D bit emulation.
	 */
	if (pmap_emulate_ad_bits(pmap))
		return;

	PG_A = pmap_accessed_bit(pmap);
	PG_G = pmap_global_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	anychanged = false;
	pmap_delayed_invl_start();
	PMAP_LOCK(pmap);
	/* Walk the range one 2MB page-directory entry at a time. */
	for (; sva < eva; sva = va_next) {
		pml4e = pmap_pml4e(pmap, sva);
		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + NBPDP) & ~PDPMASK;
		if (va_next < sva)
			va_next = eva;
		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0)
			continue;
		if ((*pdpe & PG_PS) != 0)
			continue;

		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;
		pde = pmap_pdpe_to_pde(pdpe, sva);
		oldpde = *pde;
		if ((oldpde & PG_V) == 0)
			continue;
		else if ((oldpde & PG_PS) != 0) {
			if ((oldpde & PG_MANAGED) == 0)
				continue;
			lock = NULL;
			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The large page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldpde & PG_W) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				pte = pmap_pde_to_pte(pde, va);
				KASSERT((*pte & PG_V) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_pte(pmap, pte, va, *pde, NULL,
				    &lock);
				anychanged = true;
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		if (va_next > eva)
			va_next = eva;
		/*
		 * "va" tracks the start of the pending invalidation range;
		 * va == va_next means no invalidation is pending.
		 */
		va = va_next;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
				goto maybe_invlrng;
			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
					vm_page_dirty(m);
				}
				atomic_clear_long(pte, PG_M | PG_A);
			} else if ((*pte & PG_A) != 0)
				atomic_clear_long(pte, PG_A);
			else
				goto maybe_invlrng;

			if ((*pte & PG_G) != 0) {
				/* Global mappings need per-page invalidation. */
				if (va == va_next)
					va = sva;
			} else
				anychanged = true;
			continue;
maybe_invlrng:
			if (va != va_next) {
				pmap_invalidate_range(pmap, va, sva);
				va = va_next;
			}
		}
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	if (anychanged)
		pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
	pmap_delayed_invl_finish();
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte, PG_M, PG_RW;
	struct rwlock *lock;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	/* First handle writeable 2MB mappings by demoting them. */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock to take the pmap lock in the
			 * correct order; revalidate via the generation count.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		/* If oldpde has PG_RW set, then it also has PG_M set. */
		if ((oldpde & PG_RW) != 0 &&
		    pmap_demote_pde_locked(pmap, pde, va, &lock) &&
		    (oldpde & PG_W) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 */
			va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
			pte = pmap_pde_to_pte(pde, va);
			atomic_clear_long(pte, PG_M | PG_RW);
			vm_page_dirty(m);
			pmap_invalidate_page(pmap, va);
		}
		PMAP_UNLOCK(pmap);
	}
	/* Then clear PG_M in each writeable 4KB mapping of the page. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		PG_M = pmap_modified_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			atomic_clear_long(pte, PG_M);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
}

/*
 * Miscellaneous support routines follow
 */

/* Adjust the properties for a leaf page table entry. */
static __inline void
pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
{
	u_long opte, npte;

	/*
	 * Lock-free read-modify-write: retry the fcmpset until the PTE's
	 * "mask" bits are replaced by "bits" without disturbing the rest.
	 */
	opte = *(u_long *)pte;
	do {
		npte = opte & ~mask;
		npte |= bits;
	} while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte,
	    npte));
}

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
static void *
pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	vm_size_t tmpsize;
	int i;

	offset = pa & PAGE_MASK;
	size = round_page(offset + size);
	pa = trunc_page(pa);

	if (!pmap_initialized) {
		/*
		 * Before the pmap is initialized, carve KVA from
		 * virtual_avail and record the mapping in the preinit table
		 * so it can be found or released later.
		 */
		va = 0;
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (ppim->va == 0) {
				ppim->pa = pa;
				ppim->sz = size;
				ppim->mode = mode;
				ppim->va = virtual_avail;
				virtual_avail += size;
				va = ppim->va;
				break;
			}
		}
		if (va == 0)
			panic("%s: too many preinit mappings", __func__);
	} else {
		/*
		 * If we have a preinit mapping, re-use it.
		 */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (ppim->pa == pa && ppim->sz == size &&
			    (ppim->mode == mode ||
			    (flags & MAPDEV_SETATTR) == 0))
				return ((void *)(ppim->va + offset));
		}
		/*
		 * If the specified range of physical addresses fits within
		 * the direct map window, use the direct map.
		 */
		if (pa < dmaplimit && pa + size <= dmaplimit) {
			va = PHYS_TO_DMAP(pa);
			if ((flags & MAPDEV_SETATTR) != 0) {
				PMAP_LOCK(kernel_pmap);
				i = pmap_change_props_locked(va, size,
				    PROT_NONE, mode, flags);
				PMAP_UNLOCK(kernel_pmap);
			} else
				i = 0;
			/* Fall back to a fresh KVA mapping on failure. */
			if (!i)
				return ((void *)(va + offset));
		}
		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);
	}
	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
	if ((flags & MAPDEV_FLUSHCACHE) != 0)
		pmap_invalidate_cache_range(va, va + tmpsize);
	return ((void *)(va + offset));
}

/* Map device memory with the given memory attribute mode. */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{

	return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE |
	    MAPDEV_SETATTR));
}

/* Map device memory uncacheable. */
void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

/* Map PCIe configuration space; attributes are set but not cache-flushed. */
void *
pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
	    MAPDEV_SETATTR));
}

/* Map BIOS memory write-back, preserving any existing attribute. */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
	    MAPDEV_FLUSHCACHE));
}

/* Undo a pmap_mapdev*()/pmap_mapbios() mapping. */
void
pmap_unmapdev(void *p, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset, va;
	int i;

	va = (vm_offset_t)p;

	/* If we gave a direct map region in pmap_mapdev, do nothing */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;
	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	va = trunc_page(va);
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va && ppim->sz == size) {
			/* Preinit mappings are kept once pmap is up. */
			if (pmap_initialized)
				return;
			ppim->pa = 0;
			ppim->va = 0;
			ppim->sz = 0;
			ppim->mode = 0;
			if (va + size == virtual_avail)
				virtual_avail = va;
			return;
		}
	}
	if (pmap_initialized) {
		pmap_qremove(va, atop(size));
		kva_free(va, size);
	}
}

/*
 * Tries to demote a 1GB page mapping.
 */
static boolean_t
pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
{
	pdp_entry_t newpdpe, oldpdpe;
	pd_entry_t *firstpde, newpde, *pde;
	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
	vm_paddr_t pdpgpa;
	vm_page_t pdpg;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpdpe = *pdpe;
	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
	pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
	    VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT);
	if (pdpg == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
	newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
	KASSERT((oldpdpe & PG_A) != 0,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
	newpde = oldpdpe;

	/*
	 * Initialize the page directory page.
	 */
	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
		*pde = newpde;
		newpde += NBPDR;
	}

	/*
	 * Demote the mapping.
	 */
	*pdpe = newpdpe;

	/*
	 * Invalidate a stale recursive mapping of the page directory page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));

	counter_u64_add(pmap_pdpe_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pat_mode = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
	    m->md.pat_mode))
		panic("memory attribute change on the direct map failed");
}

/*
 * As above, but without flushing caches (flags do not include
 * MAPDEV_FLUSHCACHE when changing the direct-map properties).
 */
void
pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
{
	int error;

	m->md.pat_mode = ma;

	if ((m->flags & PG_FICTITIOUS) != 0)
		return;
	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
	    PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0);
	PMAP_UNLOCK(kernel_pmap);
	if (error != 0)
		panic("memory attribute change on the direct map failed");
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within either the direct map or the kernel map.  If
 * the virtual address range is contained within the kernel map, then the
 * memory type for each of the corresponding ranges of the direct map is also
 * changed.  (The corresponding ranges of the direct map are those ranges that
 * map the same physical pages as the specified virtual address range.)
These 9507 * changes to the direct map are necessary because Intel describes the 9508 * behavior of their processors as "undefined" if two or more mappings to the 9509 * same physical page have different memory types. 9510 * 9511 * Returns zero if the change completed successfully, and either EINVAL or 9512 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9513 * of the virtual address range was not mapped, and ENOMEM is returned if 9514 * there was insufficient memory available to complete the change. In the 9515 * latter case, the memory type may have been changed on some part of the 9516 * virtual address range or the direct map. 9517 */ 9518 int 9519 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9520 { 9521 int error; 9522 9523 PMAP_LOCK(kernel_pmap); 9524 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9525 MAPDEV_FLUSHCACHE); 9526 PMAP_UNLOCK(kernel_pmap); 9527 return (error); 9528 } 9529 9530 /* 9531 * Changes the specified virtual address range's protections to those 9532 * specified by "prot". Like pmap_change_attr(), protections for aliases 9533 * in the direct map are updated as well. Protections on aliasing mappings may 9534 * be a subset of the requested protections; for example, mappings in the direct 9535 * map are never executable. 9536 */ 9537 int 9538 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9539 { 9540 int error; 9541 9542 /* Only supported within the kernel map. 
*/ 9543 if (va < VM_MIN_KERNEL_ADDRESS) 9544 return (EINVAL); 9545 9546 PMAP_LOCK(kernel_pmap); 9547 error = pmap_change_props_locked(va, size, prot, -1, 9548 MAPDEV_ASSERTVALID); 9549 PMAP_UNLOCK(kernel_pmap); 9550 return (error); 9551 } 9552 9553 static int 9554 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9555 int mode, int flags) 9556 { 9557 vm_offset_t base, offset, tmpva; 9558 vm_paddr_t pa_start, pa_end, pa_end1; 9559 pdp_entry_t *pdpe; 9560 pd_entry_t *pde, pde_bits, pde_mask; 9561 pt_entry_t *pte, pte_bits, pte_mask; 9562 int error; 9563 bool changed; 9564 9565 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9566 base = trunc_page(va); 9567 offset = va & PAGE_MASK; 9568 size = round_page(offset + size); 9569 9570 /* 9571 * Only supported on kernel virtual addresses, including the direct 9572 * map but excluding the recursive map. 9573 */ 9574 if (base < DMAP_MIN_ADDRESS) 9575 return (EINVAL); 9576 9577 /* 9578 * Construct our flag sets and masks. "bits" is the subset of 9579 * "mask" that will be set in each modified PTE. 9580 * 9581 * Mappings in the direct map are never allowed to be executable. 9582 */ 9583 pde_bits = pte_bits = 0; 9584 pde_mask = pte_mask = 0; 9585 if (mode != -1) { 9586 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9587 pde_mask |= X86_PG_PDE_CACHE; 9588 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9589 pte_mask |= X86_PG_PTE_CACHE; 9590 } 9591 if (prot != VM_PROT_NONE) { 9592 if ((prot & VM_PROT_WRITE) != 0) { 9593 pde_bits |= X86_PG_RW; 9594 pte_bits |= X86_PG_RW; 9595 } 9596 if ((prot & VM_PROT_EXECUTE) == 0 || 9597 va < VM_MIN_KERNEL_ADDRESS) { 9598 pde_bits |= pg_nx; 9599 pte_bits |= pg_nx; 9600 } 9601 pde_mask |= X86_PG_RW | pg_nx; 9602 pte_mask |= X86_PG_RW | pg_nx; 9603 } 9604 9605 /* 9606 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9607 * into 4KB pages if required. 
9608 */ 9609 for (tmpva = base; tmpva < base + size; ) { 9610 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9611 if (pdpe == NULL || *pdpe == 0) { 9612 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9613 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9614 return (EINVAL); 9615 } 9616 if (*pdpe & PG_PS) { 9617 /* 9618 * If the current 1GB page already has the required 9619 * properties, then we need not demote this page. Just 9620 * increment tmpva to the next 1GB page frame. 9621 */ 9622 if ((*pdpe & pde_mask) == pde_bits) { 9623 tmpva = trunc_1gpage(tmpva) + NBPDP; 9624 continue; 9625 } 9626 9627 /* 9628 * If the current offset aligns with a 1GB page frame 9629 * and there is at least 1GB left within the range, then 9630 * we need not break down this page into 2MB pages. 9631 */ 9632 if ((tmpva & PDPMASK) == 0 && 9633 tmpva + PDPMASK < base + size) { 9634 tmpva += NBPDP; 9635 continue; 9636 } 9637 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9638 return (ENOMEM); 9639 } 9640 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9641 if (*pde == 0) { 9642 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9643 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9644 return (EINVAL); 9645 } 9646 if (*pde & PG_PS) { 9647 /* 9648 * If the current 2MB page already has the required 9649 * properties, then we need not demote this page. Just 9650 * increment tmpva to the next 2MB page frame. 9651 */ 9652 if ((*pde & pde_mask) == pde_bits) { 9653 tmpva = trunc_2mpage(tmpva) + NBPDR; 9654 continue; 9655 } 9656 9657 /* 9658 * If the current offset aligns with a 2MB page frame 9659 * and there is at least 2MB left within the range, then 9660 * we need not break down this page into 4KB pages. 
9661 */ 9662 if ((tmpva & PDRMASK) == 0 && 9663 tmpva + PDRMASK < base + size) { 9664 tmpva += NBPDR; 9665 continue; 9666 } 9667 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9668 return (ENOMEM); 9669 } 9670 pte = pmap_pde_to_pte(pde, tmpva); 9671 if (*pte == 0) { 9672 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9673 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9674 return (EINVAL); 9675 } 9676 tmpva += PAGE_SIZE; 9677 } 9678 error = 0; 9679 9680 /* 9681 * Ok, all the pages exist, so run through them updating their 9682 * properties if required. 9683 */ 9684 changed = false; 9685 pa_start = pa_end = 0; 9686 for (tmpva = base; tmpva < base + size; ) { 9687 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9688 if (*pdpe & PG_PS) { 9689 if ((*pdpe & pde_mask) != pde_bits) { 9690 pmap_pte_props(pdpe, pde_bits, pde_mask); 9691 changed = true; 9692 } 9693 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9694 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9695 if (pa_start == pa_end) { 9696 /* Start physical address run. */ 9697 pa_start = *pdpe & PG_PS_FRAME; 9698 pa_end = pa_start + NBPDP; 9699 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9700 pa_end += NBPDP; 9701 else { 9702 /* Run ended, update direct map. */ 9703 error = pmap_change_props_locked( 9704 PHYS_TO_DMAP(pa_start), 9705 pa_end - pa_start, prot, mode, 9706 flags); 9707 if (error != 0) 9708 break; 9709 /* Start physical address run. */ 9710 pa_start = *pdpe & PG_PS_FRAME; 9711 pa_end = pa_start + NBPDP; 9712 } 9713 } 9714 tmpva = trunc_1gpage(tmpva) + NBPDP; 9715 continue; 9716 } 9717 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9718 if (*pde & PG_PS) { 9719 if ((*pde & pde_mask) != pde_bits) { 9720 pmap_pte_props(pde, pde_bits, pde_mask); 9721 changed = true; 9722 } 9723 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9724 (*pde & PG_PS_FRAME) < dmaplimit) { 9725 if (pa_start == pa_end) { 9726 /* Start physical address run. 
*/ 9727 pa_start = *pde & PG_PS_FRAME; 9728 pa_end = pa_start + NBPDR; 9729 } else if (pa_end == (*pde & PG_PS_FRAME)) 9730 pa_end += NBPDR; 9731 else { 9732 /* Run ended, update direct map. */ 9733 error = pmap_change_props_locked( 9734 PHYS_TO_DMAP(pa_start), 9735 pa_end - pa_start, prot, mode, 9736 flags); 9737 if (error != 0) 9738 break; 9739 /* Start physical address run. */ 9740 pa_start = *pde & PG_PS_FRAME; 9741 pa_end = pa_start + NBPDR; 9742 } 9743 } 9744 tmpva = trunc_2mpage(tmpva) + NBPDR; 9745 } else { 9746 pte = pmap_pde_to_pte(pde, tmpva); 9747 if ((*pte & pte_mask) != pte_bits) { 9748 pmap_pte_props(pte, pte_bits, pte_mask); 9749 changed = true; 9750 } 9751 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9752 (*pte & PG_FRAME) < dmaplimit) { 9753 if (pa_start == pa_end) { 9754 /* Start physical address run. */ 9755 pa_start = *pte & PG_FRAME; 9756 pa_end = pa_start + PAGE_SIZE; 9757 } else if (pa_end == (*pte & PG_FRAME)) 9758 pa_end += PAGE_SIZE; 9759 else { 9760 /* Run ended, update direct map. */ 9761 error = pmap_change_props_locked( 9762 PHYS_TO_DMAP(pa_start), 9763 pa_end - pa_start, prot, mode, 9764 flags); 9765 if (error != 0) 9766 break; 9767 /* Start physical address run. */ 9768 pa_start = *pte & PG_FRAME; 9769 pa_end = pa_start + PAGE_SIZE; 9770 } 9771 } 9772 tmpva += PAGE_SIZE; 9773 } 9774 } 9775 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9776 pa_end1 = MIN(pa_end, dmaplimit); 9777 if (pa_start != pa_end1) 9778 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9779 pa_end1 - pa_start, prot, mode, flags); 9780 } 9781 9782 /* 9783 * Flush CPU caches if required to make sure any data isn't cached that 9784 * shouldn't be, etc. 
9785 */ 9786 if (changed) { 9787 pmap_invalidate_range(kernel_pmap, base, tmpva); 9788 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9789 pmap_invalidate_cache_range(base, tmpva); 9790 } 9791 return (error); 9792 } 9793 9794 /* 9795 * Demotes any mapping within the direct map region that covers more than the 9796 * specified range of physical addresses. This range's size must be a power 9797 * of two and its starting address must be a multiple of its size. Since the 9798 * demotion does not change any attributes of the mapping, a TLB invalidation 9799 * is not mandatory. The caller may, however, request a TLB invalidation. 9800 */ 9801 void 9802 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 9803 { 9804 pdp_entry_t *pdpe; 9805 pd_entry_t *pde; 9806 vm_offset_t va; 9807 boolean_t changed; 9808 9809 if (len == 0) 9810 return; 9811 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9812 KASSERT((base & (len - 1)) == 0, 9813 ("pmap_demote_DMAP: base is not a multiple of len")); 9814 if (len < NBPDP && base < dmaplimit) { 9815 va = PHYS_TO_DMAP(base); 9816 changed = FALSE; 9817 PMAP_LOCK(kernel_pmap); 9818 pdpe = pmap_pdpe(kernel_pmap, va); 9819 if ((*pdpe & X86_PG_V) == 0) 9820 panic("pmap_demote_DMAP: invalid PDPE"); 9821 if ((*pdpe & PG_PS) != 0) { 9822 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9823 panic("pmap_demote_DMAP: PDPE failed"); 9824 changed = TRUE; 9825 } 9826 if (len < NBPDR) { 9827 pde = pmap_pdpe_to_pde(pdpe, va); 9828 if ((*pde & X86_PG_V) == 0) 9829 panic("pmap_demote_DMAP: invalid PDE"); 9830 if ((*pde & PG_PS) != 0) { 9831 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9832 panic("pmap_demote_DMAP: PDE failed"); 9833 changed = TRUE; 9834 } 9835 } 9836 if (changed && invalidate) 9837 pmap_invalidate_page(kernel_pmap, va); 9838 PMAP_UNLOCK(kernel_pmap); 9839 } 9840 } 9841 9842 /* 9843 * Perform the pmap work for mincore(2). 
 * If the page is not both referenced and
 * modified by this pmap, returns its physical address so that the caller can
 * find other mappings.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pdep;
	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
	vm_paddr_t pa;
	int val;

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	PMAP_LOCK(pmap);
	pte = 0;
	pa = 0;
	val = 0;
	pdpe = pmap_pdpe(pmap, addr);
	if (pdpe == NULL)
		goto out;
	if ((*pdpe & PG_V) != 0) {
		if ((*pdpe & PG_PS) != 0) {
			/* 1GB page mapping. */
			pte = *pdpe;
			pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) &
			    PG_FRAME;
			val = MINCORE_PSIND(2);
		} else {
			pdep = pmap_pde(pmap, addr);
			if (pdep != NULL && (*pdep & PG_V) != 0) {
				if ((*pdep & PG_PS) != 0) {
					/* 2MB page mapping. */
					pte = *pdep;
					/* Compute the physical address of the 4KB page. */
					pa = ((pte & PG_PS_FRAME) | (addr &
					    PDRMASK)) & PG_FRAME;
					val = MINCORE_PSIND(1);
				} else {
					pte = *pmap_pde_to_pte(pdep, addr);
					pa = pte & PG_FRAME;
					val = 0;
				}
			}
		}
	}
	if ((pte & PG_V) != 0) {
		val |= MINCORE_INCORE;
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((pte & PG_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	/*
	 * Report the physical address only when the mapping is managed
	 * and not already known to be both referenced and modified here.
	 */
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
		*pap = pa;
	}
out:
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Ensure that the pmap has a valid PCID on the current CPU.  Returns
 * CR3_PCID_SAVE when the pmap's existing PCID and generation are still
 * current (cached TLB entries may be reused), and 0 when a fresh PCID
 * was assigned.  When the PCID space is exhausted, the per-CPU
 * generation counter is bumped, implicitly invalidating all previously
 * handed-out PCIDs on this CPU.
 */
static uint64_t
pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
{
	uint32_t gen, new_gen, pcid_next;

	CRITICAL_ASSERT(curthread);
	gen = PCPU_GET(pcid_gen);
	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
		return (pti ? 0 : CR3_PCID_SAVE);
	if (pmap->pm_pcids[cpuid].pm_gen == gen)
		return (CR3_PCID_SAVE);
	pcid_next = PCPU_GET(pcid_next);
	KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
	    (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
	    ("cpu %d pcid_next %#x", cpuid, pcid_next));
	if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
	    (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
		new_gen = gen + 1;
		/* Generation 0 is reserved as "always stale". */
		if (new_gen == 0)
			new_gen = 1;
		PCPU_SET(pcid_gen, new_gen);
		pcid_next = PMAP_PCID_KERN + 1;
	} else {
		new_gen = gen;
	}
	pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
	pmap->pm_pcids[cpuid].pm_gen = new_gen;
	PCPU_SET(pcid_next, pcid_next + 1);
	return (0);
}

/*
 * pmap_pcid_alloc() plus sanity assertions on the resulting PCID.
 */
static uint64_t
pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
{
	uint64_t cached;

	cached = pmap_pcid_alloc(pmap, cpuid);
	KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
	    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
	    pmap->pm_pcids[cpuid].pm_pcid));
	KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
	    pmap == kernel_pmap,
	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
	    pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
	return (cached);
}

/*
 * Under PTI, point the TSS %rsp0 at the trampoline stack when the pmap
 * has a user %cr3, and at the thread's kernel stack otherwise.
 */
static void
pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
{

	PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ?
	    PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
}

/*
 * Activation path for PCID + PTI configurations.
 */
static void
pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
{
	pmap_t old_pmap;
	uint64_t cached, cr3, kcr3, ucr3;

	KASSERT((read_rflags() & PSL_I) == 0,
	    ("PCID needs interrupts disabled in pmap_activate_sw()"));

	/* See the comment in pmap_invalidate_page_pcid(). */
	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) {
		PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
		old_pmap = PCPU_GET(curpmap);
		MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3);
		old_pmap->pm_pcids[cpuid].pm_gen = 0;
	}

	cached = pmap_pcid_alloc_checked(pmap, cpuid);
	cr3 = rcr3();
	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
	PCPU_SET(curpmap, pmap);
	kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
	    PMAP_PCID_USER_PT;

	/* A new PCID means no valid TLB entries under the user %cr3. */
	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3)
		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);

	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
	if (cached)
		counter_u64_add(pcid_save_cnt, 1);

	pmap_activate_sw_pti_post(td, pmap);
}

/*
 * Activation path for PCID without PTI.
 */
static void
pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
    u_int cpuid)
{
	uint64_t cached, cr3;

	KASSERT((read_rflags() & PSL_I) == 0,
	    ("PCID needs interrupts disabled in pmap_activate_sw()"));

	cached = pmap_pcid_alloc_checked(pmap, cpuid);
	cr3 = rcr3();
	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
		    cached);
	PCPU_SET(curpmap, pmap);
	if (cached)
		counter_u64_add(pcid_save_cnt, 1);
}

/*
 * Activation path without PCID and without PTI: a plain %cr3 reload.
 */
static void
pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
    u_int cpuid __unused)
{

	load_cr3(pmap->pm_cr3);
	PCPU_SET(curpmap, pmap);
}

/*
 * Activation path for PTI without PCID.
 */
static void
pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
    u_int cpuid __unused)
{

	pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
	PCPU_SET(kcr3, pmap->pm_cr3);
	PCPU_SET(ucr3, pmap->pm_ucr3);
	pmap_activate_sw_pti_post(td, pmap);
}

/*
 * Resolve pmap_activate_sw_mode once at boot to the variant matching
 * the PCID/PTI configuration.
 */
DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
    u_int))
{

	if (pmap_pcid_enabled && pti)
		return (pmap_activate_sw_pcid_pti);
	else if (pmap_pcid_enabled && !pti)
		return (pmap_activate_sw_pcid_nopti);
	else if (!pmap_pcid_enabled && pti)
		return (pmap_activate_sw_nopcid_pti);
	else /* if (!pmap_pcid_enabled && !pti) */
		return (pmap_activate_sw_nopcid_nopti);
}

/*
 * Switch this CPU to the thread's pmap: update pm_active sets, load
 * %cr3, and set curpmap.  Caller must prevent preemption (see
 * pmap_activate()).
 */
void
pmap_activate_sw(struct thread *td)
{
	pmap_t oldpmap, pmap;
	u_int cpuid;

	oldpmap = PCPU_GET(curpmap);
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (oldpmap == pmap) {
		if (cpu_vendor_id != CPU_VENDOR_INTEL)
			mfence();
		return;
	}
	cpuid = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_SET(cpuid, &pmap->pm_active);
#endif
	pmap_activate_sw_mode(td, pmap, cpuid);
#ifdef SMP
	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
#else
	CPU_CLR(cpuid, &oldpmap->pm_active);
#endif
}

void
pmap_activate(struct thread *td)
{
	/*
	 * invltlb_{invpcid,}_pcid_handler() is used to handle an
	 * invalidate_all IPI, which checks for curpmap ==
	 * smp_tlb_pmap.  The below sequence of operations has a
	 * window where %CR3 is loaded with the new pmap's PML4
	 * address, but the curpmap value has not yet been updated.
	 * This causes the invltlb IPI handler, which is called
	 * between the updates, to execute as a NOP, which leaves
	 * stale TLB entries.
	 *
	 * Note that the most common use of pmap_activate_sw(), from
	 * a context switch, is immune to this race, because
	 * interrupts are disabled (while the thread lock is owned),
	 * so the IPI is delayed until after curpmap is updated.  Protect
	 * other callers in a similar way, by disabling interrupts
	 * around the %cr3 register reload and curpmap assignment.
	 */
	spinlock_enter();
	pmap_activate_sw(td);
	spinlock_exit();
}

/*
 * Activate the given (non-kernel) pmap on the booting CPU without a
 * pmap_activate_sw() style handoff.
 */
void
pmap_activate_boot(pmap_t pmap)
{
	uint64_t kcr3;
	u_int cpuid;

	/*
	 * kernel_pmap must be never deactivated, and we ensure that
	 * by never activating it at all.
	 */
	MPASS(pmap != kernel_pmap);

	cpuid = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_SET(cpuid, &pmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);
	if (pti) {
		kcr3 = pmap->pm_cr3;
		if (pmap_pcid_enabled)
			kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
	} else {
		kcr3 = PMAP_NO_CR3;
	}
	PCPU_SET(kcr3, kcr3);
	PCPU_SET(ucr3, PMAP_NO_CR3);
}

/* No-op on amd64. */
void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}

#ifdef INVARIANTS
/* Statistics for A/D bit emulation, exported for debugging. */
static unsigned long num_dirty_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
    &num_dirty_emulations, 0, NULL);

static unsigned long num_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
    &num_accessed_emulations, 0, NULL);

static unsigned long num_superpage_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
    &num_superpage_accessed_emulations, 0, NULL);

static unsigned long ad_emulation_superpage_promotions;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
    &ad_emulation_superpage_promotions, 0, NULL);
#endif /* INVARIANTS */

/*
 * Emulate an accessed or dirty bit update for a pmap that does
 * software emulation of A/D bits (see pmap_emulate_ad_bits()).
 * Returns 0 when the bit was set, -1 when the fault cannot be handled
 * here (not an A/D-emulating pmap, unmapped va, or a write to a
 * read-only mapping).
 */
int
pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
{
	int rv;
	struct rwlock *lock;
#if VM_NRESERVLEVEL > 0
	vm_page_t m, mpte;
#endif
	pd_entry_t *pde;
	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;

	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));

	if (!pmap_emulate_ad_bits(pmap))
		return (-1);

	PG_A = pmap_accessed_bit(pmap);
	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);

	rv = -1;
	lock = NULL;
	PMAP_LOCK(pmap);

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		goto done;

	if ((*pde & PG_PS) != 0) {
		/* Only accessed-bit emulation is done for superpages. */
		if (ftype == VM_PROT_READ) {
#ifdef INVARIANTS
			atomic_add_long(&num_superpage_accessed_emulations, 1);
#endif
			*pde |= PG_A;
			rv = 0;
		}
		goto done;
	}

	pte = pmap_pde_to_pte(pde, va);
	if ((*pte & PG_V) == 0)
		goto done;

	if (ftype == VM_PROT_WRITE) {
		if ((*pte & PG_RW) == 0)
			goto done;
		/*
		 * Set the modified and accessed bits simultaneously.
		 *
		 * Intel EPT PTEs that do software emulation of A/D bits map
		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
		 * An EPT misconfiguration is triggered if the PTE is writable
		 * but not readable (WR=10).  This is avoided by setting PG_A
		 * and PG_M simultaneously.
		 */
		*pte |= PG_M | PG_A;
	} else {
		*pte |= PG_A;
	}

#if VM_NRESERVLEVEL > 0
	/* try to promote the mapping */
	if (va < VM_MAXUSER_ADDRESS)
		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	else
		mpte = NULL;

	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);

	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
	    pmap_ps_enabled(pmap) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0) {
		pmap_promote_pde(pmap, pde, va, mpte, &lock);
#ifdef INVARIANTS
		atomic_add_long(&ad_emulation_superpage_promotions, 1);
#endif
	}
#endif

#ifdef INVARIANTS
	if (ftype == VM_PROT_WRITE)
		atomic_add_long(&num_dirty_emulations, 1);
	else
		atomic_add_long(&num_accessed_emulations, 1);
#endif
	rv = 0;		/* success */
done:
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Copy the page-table entries translating "va" into ptr[], stopping at
 * the first invalid or leaf (PG_PS) entry.  *num is set to the number
 * of entries copied.
 */
void
pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
{
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	int idx;

	idx = 0;
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK(pmap);

	pml4 = pmap_pml4e(pmap, va);
	if (pml4 == NULL)
		goto done;
	ptr[idx++] = *pml4;
	if ((*pml4 & PG_V) == 0)
		goto done;

	pdp = pmap_pml4e_to_pdpe(pml4, va);
	ptr[idx++] = *pdp;
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
		goto done;

	pde = pmap_pdpe_to_pde(pdp, va);
	ptr[idx++] = *pde;
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
		goto done;

	pte = pmap_pde_to_pte(pde, va);
	ptr[idx++] = *pte;

done:
	PMAP_UNLOCK(pmap);
	*num = idx;
}

/**
 * Get the kernel virtual address of a set of physical pages.  If there are
 * physical addresses not covered by the DMAP perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page        The pages the caller wishes to obtain the virtual
 *                    address on the kernel memory map.
 * \param vaddr       On return contains the kernel virtual memory address
 *                    of the pages passed in the page parameter.
 * \param count       Number of pages passed in.
 * \param can_fault   TRUE if the thread using the mapped pages can take
 *                    page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient when
 *          finished or FALSE otherwise.
 *
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	pt_entry_t *pte;
	int cache_bits, error __unused, i;

	/*
	 * Allocate any KVA space that we need, this is done in a separate
	 * loop to prevent calling vmem_alloc while pinned.
	 */
	needs_mapping = FALSE;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(paddr >= dmaplimit)) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = TRUE;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (FALSE);

	/*
	 * NB:  The sequence of updating a page table followed by accesses
	 * to the corresponding pages used in the !DMAP case is subject to
	 * the situation described in the "AMD64 Architecture Programmer's
	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
	 * Coherency Considerations".  Therefore, issuing the INVLPG right
	 * after modifying the PTE bits is crucial.
	 */
	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= dmaplimit) {
			if (can_fault) {
				/*
				 * Slow path, since we can get page faults
				 * while mappings are active don't pin the
				 * thread to the CPU and instead add a global
				 * mapping visible to all CPUs.
				 */
				pmap_qenter(vaddr[i], &page[i], 1);
			} else {
				pte = vtopte(vaddr[i]);
				cache_bits = pmap_cache_bits(kernel_pmap,
				    page[i]->md.pat_mode, 0);
				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
				    cache_bits);
				pmap_invlpg(kernel_pmap, vaddr[i]);
			}
		}
	}

	return (needs_mapping);
}

/*
 * Tear down any transient mappings created by pmap_map_io_transient()
 * and release the KVA that backed them.
 */
void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= dmaplimit) {
			if (can_fault)
				pmap_qremove(vaddr[i], 1);
			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
		}
	}
}

/*
 * Map a single page for quick, local access.  Pages covered by the
 * DMAP are returned directly; otherwise the shared qframe slot is used
 * under the qframe spin mutex until pmap_quick_remove_page().
 */
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
	vm_paddr_t paddr;

	paddr = VM_PAGE_TO_PHYS(m);
	if (paddr < dmaplimit)
		return (PHYS_TO_DMAP(paddr));
	mtx_lock_spin(&qframe_mtx);
	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));

	/*
	 * Since qframe is exclusively mapped by us, and we do not set
	 * PG_G, we can use INVLPG here.
	 */
	invlpg(qframe);

	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
	return (qframe);
}

/*
 * Release a mapping obtained from pmap_quick_enter_page().  DMAP
 * addresses (addr != qframe) need no teardown.
 */
void
pmap_quick_remove_page(vm_offset_t addr)
{

	if (addr != qframe)
		return;
	pte_store(vtopte(qframe), 0);
	mtx_unlock_spin(&qframe_mtx);
}

/*
 * Pdp pages from the large map are managed differently from either
 * kernel or user page table pages.  They are permanently allocated at
 * initialization time, and their reference count is permanently set to
 * zero.  The pml4 entries pointing to those pages are copied into
 * each allocated pmap.
 *
 * In contrast, pd and pt pages are managed like user page table
 * pages.  They are dynamically allocated, and their reference count
 * represents the number of valid entries within the page.
 */
static vm_page_t
pmap_large_map_getptp_unlocked(void)
{
	return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO));
}

/*
 * Allocate a page-table page for the large map, dropping the kernel
 * pmap lock to wait for memory if necessary.  May return NULL; callers
 * retry (and must revalidate any state examined before the lock was
 * dropped).
 */
static vm_page_t
pmap_large_map_getptp(void)
{
	vm_page_t m;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	m = pmap_large_map_getptp_unlocked();
	if (m == NULL) {
		PMAP_UNLOCK(kernel_pmap);
		vm_wait(NULL);
		PMAP_LOCK(kernel_pmap);
		/* Callers retry. */
	}
	return (m);
}

/*
 * Return the PDP entry for "va" within the large map.  The PML4
 * entries for the large map are set up at initialization and must be
 * valid, which the KASSERTs verify.
 */
static pdp_entry_t *
pmap_large_map_pdpe(vm_offset_t va)
{
	vm_pindex_t pml4_idx;
	vm_paddr_t mphys;

	pml4_idx = pmap_pml4e_index(va);
	KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
	    "%#jx lm_ents %d",
	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
	KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
	    "LMSPML4I %#jx lm_ents %d",
	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
	mphys = kernel_pml4[pml4_idx] & PG_FRAME;
	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}

/*
 * Return the PDE for "va" within the large map, allocating the page
 * directory page if it does not yet exist.
 */
static pd_entry_t *
pmap_large_map_pde(vm_offset_t va)
{
	pdp_entry_t *pdpe;
	vm_page_t m;
	vm_paddr_t mphys;

retry:
	pdpe = pmap_large_map_pdpe(va);
	if (*pdpe == 0) {
		m = pmap_large_map_getptp();
		if (m == NULL)
			goto retry;
		mphys = VM_PAGE_TO_PHYS(m);
		*pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
	} else {
		MPASS((*pdpe & X86_PG_PS) == 0);
		mphys = *pdpe & PG_FRAME;
	}
	return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
}

/*
 * Return the PTE for "va" within the large map, allocating the page
 * table page (and bumping the pd page's reference count) if needed.
 */
static pt_entry_t *
pmap_large_map_pte(vm_offset_t va)
{
	pd_entry_t *pde;
	vm_page_t m;
	vm_paddr_t mphys;

retry:
	pde = pmap_large_map_pde(va);
	if (*pde == 0) {
		m = pmap_large_map_getptp();
		if (m == NULL)
			goto retry;
		mphys = VM_PAGE_TO_PHYS(m);
		*pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
		PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++;
	} else {
		MPASS((*pde & X86_PG_PS) == 0);
		mphys = *pde & PG_FRAME;
	}
	return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
}
/*
 * Translate a large map virtual address to its physical address by
 * walking the page tables, handling 1GB, 2MB, and 4KB leaf mappings.
 */
static vm_paddr_t
pmap_large_map_kextract(vm_offset_t va)
{
	pdp_entry_t *pdpe, pdp;
	pd_entry_t *pde, pd;
	pt_entry_t *pte, pt;

	KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va),
	    ("not largemap range %#lx", (u_long)va));
	pdpe = pmap_large_map_pdpe(va);
	pdp = *pdpe;
	KASSERT((pdp & X86_PG_V) != 0,
	    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
	    (u_long)pdpe, pdp));
	if ((pdp & X86_PG_PS) != 0) {
		/* 1GB page; only valid when the CPU supports them. */
		KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
		    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
		    (u_long)pdpe, pdp));
		return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
	}
	pde = pmap_pdpe_to_pde(pdpe, va);
	pd = *pde;
	KASSERT((pd & X86_PG_V) != 0,
	    ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
	if ((pd & X86_PG_PS) != 0)
		return ((pd & PG_PS_FRAME) | (va & PDRMASK));
	pte = pmap_pde_to_pte(pde, va);
	pt = *pte;
	KASSERT((pt & X86_PG_V) != 0,
	    ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
	return ((pt & PG_FRAME) | (va & PAGE_MASK));
}

/*
 * Allocate KVA from the large map arena with the requested alignment
 * and phase.
 */
static int
pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
    vmem_addr_t *vmem_res)
{

	/*
	 * Large mappings are all but static.  Consequently, there
	 * is no point in waiting for an earlier allocation to be
	 * freed.
	 */
	return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
}

/*
 * Create a kernel mapping for the physical range [spa, spa + len) with
 * memory attribute 'mattr', returning the mapped address in *addr.
 * The direct map is used when the range fits under dmaplimit;
 * otherwise KVA is carved out of the large map arena, preferring 1GB
 * and then 2MB superpage alignment.  Returns 0 or an errno value.
 */
int
pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
    vm_memattr_t mattr)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_offset_t va, inc;
	vmem_addr_t vmem_res;
	vm_paddr_t pa;
	int error;

	/* Reject empty ranges and physical address overflow. */
	if (len == 0 || spa + len < spa)
		return (EINVAL);

	/* See if DMAP can serve. */
	if (spa + len <= dmaplimit) {
		va = PHYS_TO_DMAP(spa);
		*addr = (void *)va;
		return (pmap_change_attr(va, len, mattr));
	}

	/*
	 * No, allocate KVA.  Fit the address with best possible
	 * alignment for superpages.  Fall back to worse align if
	 * failed.
	 */
	error = ENOMEM;
	if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
	    NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
		error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
		    &vmem_res);
	if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
	    NBPDR) + NBPDR)
		error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
		    &vmem_res);
	if (error != 0)
		error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
	if (error != 0)
		return (error);

	/*
	 * Fill pagetable.  PG_M is not pre-set, we scan modified bits
	 * in the pagetable to minimize flushing.  No need to
	 * invalidate TLB, since we only update invalid entries.
	 */
	PMAP_LOCK(kernel_pmap);
	for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
	    len -= inc) {
		if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
		    (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
			/* 1GB superpage mapping. */
			pdpe = pmap_large_map_pdpe(va);
			MPASS(*pdpe == 0);
			*pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
			    X86_PG_V | X86_PG_A | pg_nx |
			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
			inc = NBPDP;
		} else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
		    (va & PDRMASK) == 0) {
			/* 2MB superpage; count it against the PD page. */
			pde = pmap_large_map_pde(va);
			MPASS(*pde == 0);
			*pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
			    X86_PG_V | X86_PG_A | pg_nx |
			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
			    ref_count++;
			inc = NBPDR;
		} else {
			/* 4KB page; count it against the PT page. */
			pte = pmap_large_map_pte(va);
			MPASS(*pte == 0);
			*pte = pa | pg_g | X86_PG_RW | X86_PG_V |
			    X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
			    mattr, FALSE);
			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
			    ref_count++;
			inc = PAGE_SIZE;
		}
	}
	PMAP_UNLOCK(kernel_pmap);
	MPASS(len == 0);

	*addr = (void *)vmem_res;
	return (0);
}

/*
 * Tear down a mapping previously created by pmap_large_map().  DMAP
 * addresses are ignored (nothing to undo there).  Page table pages
 * whose ref_count drops to zero are unlinked from the paging structure
 * and freed after the TLB invalidation.
 */
void
pmap_large_unmap(void *svaa, vm_size_t len)
{
	vm_offset_t sva, va;
	vm_size_t inc;
	pdp_entry_t *pdpe, pdp;
	pd_entry_t *pde, pd;
	pt_entry_t *pte;
	vm_page_t m;
	struct spglist spgf;

	sva = (vm_offset_t)svaa;
	if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
	    sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
		return;

	SLIST_INIT(&spgf);
	KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) &&
	    PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1),
	    ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
	PMAP_LOCK(kernel_pmap);
	for (va = sva; va < sva + len; va += inc) {
		pdpe = pmap_large_map_pdpe(va);
		pdp = *pdpe;
		KASSERT((pdp & X86_PG_V) != 0,
		    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
		    (u_long)pdpe, pdp));
		if ((pdp & X86_PG_PS) != 0) {
			/* 1GB mapping: must be aligned and fully covered. */
			KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
			    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
			    (u_long)pdpe, pdp));
			KASSERT((va & PDPMASK) == 0,
			    ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
			    (u_long)pdpe, pdp));
			KASSERT(va + NBPDP <= sva + len,
			    ("unmap covers partial 1GB page, sva %#lx va %#lx "
			    "pdpe %#lx pdp %#lx len %#lx", sva, va,
			    (u_long)pdpe, pdp, len));
			*pdpe = 0;
			inc = NBPDP;
			continue;
		}
		pde = pmap_pdpe_to_pde(pdpe, va);
		pd = *pde;
		KASSERT((pd & X86_PG_V) != 0,
		    ("invalid pd va %#lx pde %#lx pd %#lx", va,
		    (u_long)pde, pd));
		if ((pd & X86_PG_PS) != 0) {
			/* 2MB mapping: must be aligned and fully covered. */
			KASSERT((va & PDRMASK) == 0,
			    ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
			    (u_long)pde, pd));
			KASSERT(va + NBPDR <= sva + len,
			    ("unmap covers partial 2MB page, sva %#lx va %#lx "
			    "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
			    pd, len));
			pde_store(pde, 0);
			inc = NBPDR;
			/* Drop the PD page reference taken at map time. */
			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
			m->ref_count--;
			if (m->ref_count == 0) {
				*pdpe = 0;
				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
			}
			continue;
		}
		pte = pmap_pde_to_pte(pde, va);
		KASSERT((*pte & X86_PG_V) != 0,
		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
		    (u_long)pte, *pte));
		pte_clear(pte);
		inc = PAGE_SIZE;
		/* Unwire the PT page, then possibly the PD page above it. */
		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
		m->ref_count--;
		if (m->ref_count == 0) {
			*pde = 0;
			SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
			m->ref_count--;
			if (m->ref_count == 0) {
				*pdpe = 0;
				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
			}
		}
	}
	pmap_invalidate_range(kernel_pmap, sva, sva + len);
	PMAP_UNLOCK(kernel_pmap);
	/* Free collected page table pages only after the TLB shootdown. */
	vm_page_free_pages_toq(&spgf, false);
	vmem_free(large_vmem, sva, len);
}

/* Fence implementations selected by the pmap_large_map_wb_fence ifunc. */
static void
pmap_large_map_wb_fence_mfence(void)
{

	mfence();
}

static void
pmap_large_map_wb_fence_atomic(void)
{

	atomic_thread_fence_seq_cst();
}

static void
pmap_large_map_wb_fence_nop(void)
{
}

/*
 * Select the fence used around write-backs, based on CPU vendor and
 * which cache flush instructions are available.
 */
DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
{

	if (cpu_vendor_id != CPU_VENDOR_INTEL)
		return (pmap_large_map_wb_fence_mfence);
	else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
	    CPUID_STDEXT_CLFLUSHOPT)) == 0)
		return (pmap_large_map_wb_fence_atomic);
	else
		/* clflush is strongly enough ordered */
		return (pmap_large_map_wb_fence_nop);
}

/* Flush-range implementations, one per available flush instruction. */
static void
pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
{

	for (; len > 0; len -= cpu_clflush_line_size,
	    va += cpu_clflush_line_size)
		clwb(va);
}

static void
pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
{

	for (; len > 0; len -= cpu_clflush_line_size,
	    va += cpu_clflush_line_size)
		clflushopt(va);
}

static void
pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
{

	for (; len > 0; len -= cpu_clflush_line_size,
	    va += cpu_clflush_line_size)
		clflush(va);
}

static void
pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
{
}

/*
 * Select the strongest available cache-line write-back primitive:
 * CLWB > CLFLUSHOPT > CLFLUSH > nothing.
 */
DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t))
{

	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
		return (pmap_large_map_flush_range_clwb);
	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
		return (pmap_large_map_flush_range_clflushopt);
	else if ((cpu_feature & CPUID_CLFSH) != 0)
		return (pmap_large_map_flush_range_clflush);
	else
		return (pmap_large_map_flush_range_nop);
}

/*
 * Write back the cache for [sva, eva) in the large map, flushing only
 * leaf mappings whose PG_M bit indicates modified data.  PG_AVAIL1 in
 * the PTE is used as a flush-in-progress marker so that concurrent
 * callers of this function coordinate with each other.
 */
static void
pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
{
	volatile u_long *pe;
	u_long p;
	vm_offset_t va;
	vm_size_t inc;
	bool seen_other;

	for (va = sva; va < eva; va += inc) {
		/* Find the leaf entry mapping va and its page size. */
		inc = 0;
		if ((amd_feature & AMDID_PAGE1GB) != 0) {
			pe = (volatile u_long *)pmap_large_map_pdpe(va);
			p = *pe;
			if ((p & X86_PG_PS) != 0)
				inc = NBPDP;
		}
		if (inc == 0) {
			pe = (volatile u_long *)pmap_large_map_pde(va);
			p = *pe;
			if ((p & X86_PG_PS) != 0)
				inc = NBPDR;
		}
		if (inc == 0) {
			pe = (volatile u_long *)pmap_large_map_pte(va);
			p = *pe;
			inc = PAGE_SIZE;
		}
		seen_other = false;
		for (;;) {
			if ((p & X86_PG_AVAIL1) != 0) {
				/*
				 * Spin-wait for the end of a parallel
				 * write-back.
				 */
				cpu_spinwait();
				p = *pe;

				/*
				 * If we saw other write-back
				 * occuring, we cannot rely on PG_M to
				 * indicate state of the cache.  The
				 * PG_M bit is cleared before the
				 * flush to avoid ignoring new writes,
				 * and writes which are relevant for
				 * us might happen after.
				 */
				seen_other = true;
				continue;
			}

			if ((p & X86_PG_M) != 0 || seen_other) {
				if (!atomic_fcmpset_long(pe, &p,
				    (p & ~X86_PG_M) | X86_PG_AVAIL1))
					/*
					 * If we saw PG_M without
					 * PG_AVAIL1, and then on the
					 * next attempt we do not
					 * observe either PG_M or
					 * PG_AVAIL1, the other
					 * write-back started after us
					 * and finished before us.  We
					 * can rely on it doing our
					 * work.
					 */
					continue;
				pmap_large_map_flush_range(va, inc);
				atomic_clear_long(pe, X86_PG_AVAIL1);
			}
			break;
		}
		maybe_yield();
	}
}

/*
 * Write-back cache lines for the given address range.
 *
 * Must be called only on the range or sub-range returned from
 * pmap_large_map().  Must not be called on the coalesced ranges.
 *
 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
 * instructions support.
 */
void
pmap_large_map_wb(void *svap, vm_size_t len)
{
	vm_offset_t eva, sva;

	sva = (vm_offset_t)svap;
	eva = sva + len;
	pmap_large_map_wb_fence();
	if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
		pmap_large_map_flush_range(sva, len);
	} else {
		KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
		    eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
		pmap_large_map_wb_large(sva, eva);
	}
	pmap_large_map_wb_fence();
}

/*
 * Allocate a wired, zeroed page from pti_obj for the PTI page tables.
 * The object lock must be held.
 */
static vm_page_t
pmap_pti_alloc_page(void)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
	m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	return (m);
}

/*
 * Release one wiring of a PTI page table page; free it when the last
 * wiring goes away.  Returns true if the page was freed.
 */
static bool
pmap_pti_free_page(vm_page_t m)
{
	if (!vm_page_unwire_noq(m))
		return (false);
	vm_page_xbusy_claim(m);
	vm_page_free_zero(m);
	return (true);
}

/*
 * Build the initial PTI page table: allocate the PTI PML4 page,
 * pre-wire PDP pages for the kernel address range, and enter the
 * mappings (pcpu areas, IDT, per-CPU IST stacks, kernel text) that
 * must remain visible while PTI is active.
 */
static void
pmap_pti_init(void)
{
	vm_page_t pml4_pg;
	pdp_entry_t *pdpe;
	vm_offset_t va;
	int i;

	if (!pti)
		return;
	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
	VM_OBJECT_WLOCK(pti_obj);
	pml4_pg = pmap_pti_alloc_page();
	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
	/* The 'va > NBPML4' term guards against wrap-around at the top. */
	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
		pdpe = pmap_pti_pdpe(va);
		pmap_pti_wire_pte(pdpe);
	}
	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
	    sizeof(struct gate_descriptor) * NIDT, false);
	CPU_FOREACH(i) {
		/* Doublefault stack IST 1 */
		va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
		pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
		/* NMI stack IST 2 */
		va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
		pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false);
		/* MC# stack IST 3 */
		va = __pcpu[i].pc_common_tss.tss_ist3 +
		    sizeof(struct nmi_pcpu);
		pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false);
		/* DB# stack IST 4 */
		va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
		pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
	}
	pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
	    true);
	pti_finalized = true;
	VM_OBJECT_WUNLOCK(pti_obj);
}

/*
 * Late boot pmap initialization: mark the kernel pmap active on all
 * CPUs and set up PTI.
 */
static void
pmap_cpu_init(void *arg __unused)
{
	CPU_COPY(&all_cpus, &kernel_pmap->pm_active);
	pmap_pti_init();
}
SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL);

/*
 * Return the PTI PDP entry for 'va', allocating the PDP page if the
 * PML4 entry is empty.  Allocation is forbidden once pti_finalized is
 * set.  The *pml4e re-check after allocation handles a concurrent
 * populater having won the race.
 */
static pdp_entry_t *
pmap_pti_pdpe(vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	vm_page_t m;
	vm_pindex_t pml4_idx;
	vm_paddr_t mphys;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);

	pml4_idx = pmap_pml4e_index(va);
	pml4e = &pti_pml4[pml4_idx];
	m = NULL;
	if (*pml4e == 0) {
		if (pti_finalized)
			panic("pml4 alloc after finalization\n");
		m = pmap_pti_alloc_page();
		if (*pml4e != 0) {
			/* Lost the race; discard our page. */
			pmap_pti_free_page(m);
			mphys = *pml4e & ~PAGE_MASK;
		} else {
			mphys = VM_PAGE_TO_PHYS(m);
			*pml4e = mphys | X86_PG_RW | X86_PG_V;
		}
	} else {
		mphys = *pml4e & ~PAGE_MASK;
	}
	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
	return (pdpe);
}

/* Add a wiring to the PTI page table page containing 'pte'. */
static void
pmap_pti_wire_pte(void *pte)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
	m->ref_count++;
}

/* Drop a wiring of the PTI page directory page containing 'pde'. */
static void
pmap_pti_unwire_pde(void *pde, bool only_ref)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
	MPASS(only_ref || m->ref_count > 1);
	pmap_pti_free_page(m);
}

/*
 * Drop a wiring of the PTI page table page containing 'pte'; if the
 * page is freed, clear the PD entry referencing it and unwire the PD
 * page as well.
 */
static void
pmap_pti_unwire_pte(void *pte, vm_offset_t va)
{
	vm_page_t m;
	pd_entry_t *pde;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
	if (pmap_pti_free_page(m)) {
		pde = pmap_pti_pde(va);
		MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
		*pde = 0;
		pmap_pti_unwire_pde(pde, false);
	}
}

/*
 * Return the PTI PD entry for 'va', allocating the page directory page
 * if the PDP entry is empty (with the same lost-race handling as
 * pmap_pti_pdpe()).
 */
static pd_entry_t *
pmap_pti_pde(vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	vm_page_t m;
	vm_pindex_t pd_idx;
	vm_paddr_t mphys;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);

	pdpe = pmap_pti_pdpe(va);
	if (*pdpe == 0) {
		m = pmap_pti_alloc_page();
		if (*pdpe != 0) {
			pmap_pti_free_page(m);
			MPASS((*pdpe & X86_PG_PS) == 0);
			mphys = *pdpe & ~PAGE_MASK;
		} else {
			mphys = VM_PAGE_TO_PHYS(m);
			*pdpe = mphys | X86_PG_RW | X86_PG_V;
		}
	} else {
		MPASS((*pdpe & X86_PG_PS) == 0);
		mphys = *pdpe & ~PAGE_MASK;
	}

	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
	pd_idx = pmap_pde_index(va);
	pde += pd_idx;
	return (pde);
}

/*
 * Return the PTI PT entry for 'va', allocating the page table page if
 * needed.  When 'unwire_pde' is non-NULL, the PD page is wired here
 * and *unwire_pde tells the caller whether that extra wiring must be
 * dropped (it is kept when a new PT page was installed).
 */
static pt_entry_t *
pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_page_t m;
	vm_paddr_t mphys;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);

	pde = pmap_pti_pde(va);
	if (unwire_pde != NULL) {
		*unwire_pde = true;
		pmap_pti_wire_pte(pde);
	}
	if (*pde == 0) {
		m = pmap_pti_alloc_page();
		if (*pde != 0) {
			pmap_pti_free_page(m);
			MPASS((*pde & X86_PG_PS) == 0);
			mphys = *pde & ~(PAGE_MASK | pg_nx);
		} else {
			mphys = VM_PAGE_TO_PHYS(m);
			*pde = mphys | X86_PG_RW | X86_PG_V;
			if (unwire_pde != NULL)
				*unwire_pde = false;
		}
	} else {
		MPASS((*pde & X86_PG_PS) == 0);
		mphys = *pde & ~(PAGE_MASK | pg_nx);
	}

	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
	pte += pmap_pte_index(va);

	return (pte);
}

/*
 * Enter identity copies of the kernel mappings [sva, eva) into the
 * PTI page table, executable iff 'exec'.  After finalization, only
 * re-entry of identical PTEs is permitted.
 */
static void
pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
{
	vm_paddr_t pa;
	pd_entry_t *pde;
	pt_entry_t *pte, ptev;
	bool unwire_pde;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);

	sva = trunc_page(sva);
	MPASS(sva > VM_MAXUSER_ADDRESS);
	eva = round_page(eva);
	MPASS(sva < eva);
	for (; sva < eva; sva += PAGE_SIZE) {
		pte = pmap_pti_pte(sva, &unwire_pde);
		pa = pmap_kextract(sva);
		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
		    VM_MEMATTR_DEFAULT, FALSE);
		if (*pte == 0) {
			pte_store(pte, ptev);
			pmap_pti_wire_pte(pte);
		} else {
			KASSERT(!pti_finalized,
			    ("pti overlap after fin %#lx %#lx %#lx",
			    sva, *pte, ptev));
			KASSERT(*pte == ptev,
			    ("pti non-identical pte after fin %#lx %#lx %#lx",
			    sva, *pte, ptev));
		}
		if (unwire_pde) {
			pde = pmap_pti_pde(sva);
			pmap_pti_unwire_pde(pde, true);
		}
	}
}

/* Locked wrapper around pmap_pti_add_kva_locked(); no-op without PTI. */
void
pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
{

	if (!pti)
		return;
	VM_OBJECT_WLOCK(pti_obj);
	pmap_pti_add_kva_locked(sva, eva, exec);
	VM_OBJECT_WUNLOCK(pti_obj);
}

/*
 * Remove the PTI mappings for [sva, eva) and invalidate the TLB for
 * the range.  No-op without PTI.
 */
void
pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
{
	pt_entry_t *pte;
	vm_offset_t va;

	if (!pti)
		return;
	sva = rounddown2(sva, PAGE_SIZE);
	MPASS(sva > VM_MAXUSER_ADDRESS);
	eva = roundup2(eva, PAGE_SIZE);
	MPASS(sva < eva);
	VM_OBJECT_WLOCK(pti_obj);
	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = pmap_pti_pte(va, NULL);
		KASSERT((*pte & X86_PG_V) != 0,
		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
		    (u_long)pte, *pte));
		pte_clear(pte);
		pmap_pti_unwire_pte(pte, va);
	}
	pmap_invalidate_range(kernel_pmap, sva, eva);
	VM_OBJECT_WUNLOCK(pti_obj);
}

/* Rangeset callback: duplicate a PKRU range node. */
static void *
pkru_dup_range(void *ctx __unused, void *data)
{
	struct pmap_pkru_range *node, *new_node;

	new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
	if (new_node == NULL)
		return (NULL);
	node = data;
	memcpy(new_node, node, sizeof(*node));
	return (new_node);
}

/* Rangeset callback: free a PKRU range node. */
static void
pkru_free_range(void *ctx __unused, void *node)
{

	uma_zfree(pmap_pkru_ranges_zone, node);
}

/*
 * Record a PKRU protection-key assignment for [sva, eva).  With
 * AMD64_PKRU_EXCL, fail with EBUSY if the range overlaps an existing
 * assignment.  Returns 0 or an errno value.
 */
static int
pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
    int flags)
{
	struct pmap_pkru_range *ppr;
	int error;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
	if ((flags & AMD64_PKRU_EXCL) != 0 &&
	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
		return (EBUSY);
	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
	if (ppr == NULL)
		return (ENOMEM);
	ppr->pkru_keyidx = keyidx;
	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
	if (error != 0)
		uma_zfree(pmap_pkru_ranges_zone, ppr);
	return (error);
}

/* Remove any PKRU assignment recorded for [sva, eva). */
static int
pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
}

/* Remove all PKRU assignments from the pmap, if PKU is in use. */
static void
pmap_pkru_deassign_all(pmap_t pmap)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type == PT_X86 &&
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
		rangeset_remove_all(&pmap->pm_pkru);
}

/*
 * Check that [sva, eva) carries one uniform PKRU assignment (or
 * uniformly none).  Trivially true when PKU does not apply.
 */
static bool
pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pkru_range *ppr, *prev_ppr;
	vm_offset_t va;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type != PT_X86 ||
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
	    sva >= VM_MAXUSER_ADDRESS)
		return (true);
	MPASS(eva <= VM_MAXUSER_ADDRESS);
	for (va = sva; va < eva; prev_ppr = ppr) {
		ppr = rangeset_lookup(&pmap->pm_pkru, va);
		if (va == sva)
			prev_ppr = ppr;
		else if ((ppr == NULL) ^ (prev_ppr == NULL))
			return (false);
		if (ppr == NULL) {
			va += PAGE_SIZE;
			continue;
		}
		if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
			return (false);
		/* Skip directly to the end of this recorded range. */
		va = ppr->pkru_rs_el.re_end;
	}
	return (true);
}

/*
 * Return the PTE protection-key bits for 'va', or 0 when no key is
 * assigned or PKU does not apply.
 */
static pt_entry_t
pmap_pkru_get(pmap_t pmap, vm_offset_t va)
{
	struct pmap_pkru_range *ppr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type != PT_X86 ||
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
	    va >= VM_MAXUSER_ADDRESS)
		return (0);
	ppr = rangeset_lookup(&pmap->pm_pkru, va);
	if (ppr != NULL)
		return (X86_PG_PKU(ppr->pkru_keyidx));
	return (0);
}

/* Predicate: remove only non-persistent PKRU ranges. */
static bool
pred_pkru_on_remove(void *ctx __unused, void *r)
{
	struct pmap_pkru_range *ppr;

	ppr = r;
	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
}

/*
 * Drop non-persistent PKRU assignments for [sva, eva) when mappings
 * are removed.
 */
static void
pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type == PT_X86 &&
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
		    pred_pkru_on_remove);
	}
}

/* Copy all PKRU assignments from src_pmap to dst_pmap (fork path). */
static int
pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
{

	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
	MPASS(dst_pmap->pm_type == PT_X86);
	MPASS(src_pmap->pm_type == PT_X86);
	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
		return (0);
	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
}

/*
 * Rewrite the protection-key bits of every valid PTE in [sva, eva) to
 * 'keyidx'.  2MB superpages are updated in place when fully covered,
 * otherwise demoted first.  Invalidates the range if anything changed.
 */
static void
pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    u_int keyidx)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t newpde, ptpaddr, *pde;
	pt_entry_t newpte, *ptep, pte;
	vm_offset_t va, va_next;
	bool changed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);

	for (changed = false, va = sva; va < eva; va = va_next) {
		pml4e = pmap_pml4e(pmap, va);
		if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) {
			va_next = (va + NBPML4) & ~PML4MASK;
			if (va_next < va)
				va_next = eva;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
		if ((*pdpe & X86_PG_V) == 0) {
			va_next = (va + NBPDP) & ~PDPMASK;
			if (va_next < va)
				va_next = eva;
			continue;
		}

		va_next = (va + NBPDR) & ~PDRMASK;
		if (va_next < va)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, va);
		ptpaddr = *pde;
		if (ptpaddr == 0)
			continue;

		MPASS((ptpaddr & X86_PG_V) != 0);
		if ((ptpaddr & PG_PS) != 0) {
			if (va + NBPDR == va_next && eva >= va_next) {
				/* Fully covered superpage: update it whole. */
				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
				    X86_PG_PKU(keyidx);
				if (newpde != ptpaddr) {
					*pde = newpde;
					changed = true;
				}
				continue;
			} else if (!pmap_demote_pde(pmap, pde, va)) {
				continue;
			}
		}

		if (va_next > eva)
			va_next = eva;

		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
		    ptep++, va += PAGE_SIZE) {
			pte = *ptep;
			if ((pte & X86_PG_V) == 0)
				continue;
			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
			if (newpte != pte) {
				*ptep = newpte;
				changed = true;
			}
		}
	}
	if (changed)
		pmap_invalidate_range(pmap, sva, eva);
}

/*
 * Validate user-supplied arguments to pmap_pkru_set()/clear().
 * Returns 0, EINVAL, EFAULT, or ENOTSUP.
 */
static int
pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    u_int keyidx, int flags)
{

	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
		return (EINVAL);
	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
		return (EFAULT);
	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
		return (ENOTSUP);
	return (0);
}

/*
 * Assign protection key 'keyidx' to the user range [sva, eva),
 * retrying after vm_wait() on transient allocation failure.
 */
int
pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
    int flags)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, keyidx);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

/*
 * Clear any protection key assignment from the user range [sva, eva),
 * resetting the PTEs to key 0.
 */
int
pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_deassign(pmap, sva, eva);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, 0);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

#if defined(KASAN) || defined(KMSAN)

/*
 * Reserve enough memory to:
 * 1) allocate PDP pages for the shadow map(s),
 * 2) shadow one page of memory, so one PD page, one PT page, and one shadow
 * page per shadow map.
 */
#ifdef KASAN
#define SAN_EARLY_PAGES (NKASANPML4E + 3)
#else
#define SAN_EARLY_PAGES (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * 3)
#endif

/*
 * Hand out one 4KB physical page from the static bootstrap pool for
 * early shadow-map page tables.  Panics when the pool is exhausted.
 */
static uint64_t __nosanitizeaddress __nosanitizememory
pmap_san_enter_early_alloc_4k(uint64_t pabase)
{
	static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
	static size_t offset = 0;
	uint64_t pa;

	if (offset == sizeof(data)) {
		panic("%s: ran out of memory for the bootstrap shadow map",
		    __func__);
	}

	/* Convert the linked (virtual) address of 'data' to physical. */
	pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
	offset += PAGE_SIZE;
	return (pa);
}

/*
 * Map a shadow page, before the kernel has bootstrapped its page tables.  This
 * is currently only used to shadow the temporary boot stack set up by locore.
 */
static void __nosanitizeaddress __nosanitizememory
pmap_san_enter_early(vm_offset_t va)
{
	static bool first = true;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	uint64_t cr3, pa, base;
	int i;

	base = amd64_loadaddr();
	cr3 = rcr3();

	if (first) {
		/*
		 * If this the first call, we need to allocate new PML4Es for
		 * the bootstrap shadow map(s).  We don't know how the PML4 page
		 * was initialized by the boot loader, so we can't simply test
		 * whether the shadow map's PML4Es are zero.
		 */
		first = false;
#ifdef KASAN
		for (i = 0; i < NKASANPML4E; i++) {
			pa = pmap_san_enter_early_alloc_4k(base);

			pml4e = (pml4_entry_t *)cr3 +
			    pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
		}
#else
		for (i = 0; i < NKMSANORIGPML4E; i++) {
			pa = pmap_san_enter_early_alloc_4k(base);

			pml4e = (pml4_entry_t *)cr3 +
			    pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
			    i * NBPML4);
			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
		}
		for (i = 0; i < NKMSANSHADPML4E; i++) {
			pa = pmap_san_enter_early_alloc_4k(base);

			pml4e = (pml4_entry_t *)cr3 +
			    pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
			    i * NBPML4);
			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
		}
#endif
	}
	/* Walk down from CR3, populating each missing level for 'va'. */
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
	if (*pdpe == 0) {
		pa = pmap_san_enter_early_alloc_4k(base);
		*pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
	}
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
	if (*pde == 0) {
		pa = pmap_san_enter_early_alloc_4k(base);
		*pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
	}
	pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
	if (*pte != 0)
		panic("%s: PTE for %#lx is already initialized", __func__, va);
	pa = pmap_san_enter_early_alloc_4k(base);
	*pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
}

/*
 * Allocate a wired, zeroed 4KB page for the shadow map; cannot fail
 * (panics on shortage).
 */
static vm_page_t
pmap_san_enter_alloc_4k(void)
{
	vm_page_t m;

	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		panic("%s: no memory to grow shadow map", __func__);
	return (m);
}

/*
 * Try to allocate a contiguous, 2MB-aligned run of pages for a shadow
 * superpage; may return NULL.
 */
static vm_page_t
pmap_san_enter_alloc_2m(void)
{
	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
	    NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Grow a shadow map by at least one 4KB page at the specified address.  Use 2MB
 * pages when possible.
 */
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_page_t m;

	if (kernphys == 0) {
		/*
		 * We're creating a temporary shadow map for the boot stack.
		 */
		pmap_san_enter_early(va);
		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	pdpe = pmap_pdpe(kernel_pmap, va);
	if ((*pdpe & X86_PG_V) == 0) {
		m = pmap_san_enter_alloc_4k();
		*pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
		    X86_PG_V | pg_nx);
	}
	pde = pmap_pdpe_to_pde(pdpe, va);
	if ((*pde & X86_PG_V) == 0) {
		/* Prefer a 2MB shadow superpage, fall back to a PT page. */
		m = pmap_san_enter_alloc_2m();
		if (m != NULL) {
			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
			    X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
		} else {
			m = pmap_san_enter_alloc_4k();
			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
			    X86_PG_V | pg_nx);
		}
	}
	if ((*pde & X86_PG_PS) != 0)
		return;
	pte = pmap_pde_to_pte(pde, va);
	if ((*pte & X86_PG_V) != 0)
		return;
	m = pmap_san_enter_alloc_4k();
	*pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
	    X86_PG_M | X86_PG_A | pg_nx);
}
#endif

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;	/* start of the current run */
	pt_entry_t attrs;	/* attribute bits shared by the run */
	int ptes;		/* 4KB mappings seen in the run */
	int pdes;		/* 2MB mappings seen in the run */
	int pdpes;		/* 1GB mappings seen in the run */
};

/*
 * Emit one line describing the accumulated range [range->sva, eva):
 * permissions, cache mode (decoded via the PAT index), and leaf
 * mapping counts.  Afterwards the range is reset to a sentinel
 * address so the next check starts a new run.
 */
static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int i, pat_idx;

	if (eva <= range->sva)
		return;

	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		if (pat_index[i] == pat_idx)
			break;

	switch (i) {
	case PAT_WRITE_BACK:
		mode = "WB";
		break;
	case PAT_WRITE_THROUGH:
		mode = "WT";
		break;
	case PAT_UNCACHEABLE:
		mode = "UC";
		break;
	case PAT_UNCACHED:
		mode = "U-";
		break;
	case PAT_WRITE_PROTECTED:
		mode = "WP";
		break;
	case PAT_WRITE_COMBINING:
		mode = "WC";
		break;
	default:
		printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
		    __func__, pat_idx, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
	    mode, range->pdpes, range->pdes, range->ptes);

	/* Reset to sentinel value. */
	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1);
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.  This is not quite as simple as a direct
 * flag comparison since some PAT modes have multiple representations.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{
	pt_entry_t diff, mask;

	/* Compare only permission, global, and cache-control bits. */
	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
	diff = (range->attrs ^ attrs) & mask;
	if (diff == 0)
		return (true);
	/*
	 * If only the PAT bit differs, the entries still match when both
	 * attribute sets select the same PAT index, i.e. the same cache mode
	 * encoded two different ways.
	 */
	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
	    pmap_pat_index(kernel_pmap, attrs, true))
		return (true);
	return (false);
}

/*
 * Begin tracking a new range starting at va with the given attributes.
 */
static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
    pt_entry_t pte)
{
	pt_entry_t attrs;

	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);

	/*
	 * NX accumulates down the hierarchy (OR), while RW and U must be set
	 * at every level to be effective (AND).
	 */
	attrs |= pdpe & pg_nx;
	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
	if ((pdpe & PG_PS) != 0) {
		/* 1GB leaf: cache-control bits come from the PDPE. */
		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pde != 0) {
		attrs |= pde & pg_nx;
		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
	}
	if ((pde & PG_PS) != 0) {
		/* 2MB leaf: cache-control bits come from the PDE. */
		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit.
		 */
		if ((attrs & X86_PG_PTE_PAT) != 0)
			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
	}

	/* Attribute mismatch or a gap: flush the current run, start anew. */
	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

/*
 * Sysctl handler for vm.pmap.kernel_maps: walk all four levels of the kernel
 * page tables and print each maximal run of contiguous mapping attributes.
 */
static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t pml4e;
	pdp_entry_t *pdp, pdpe;
	pd_entry_t *pd, pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
	    NPDEPG - 1, NPTEPG - 1);

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Outside of the large map, kernel page table pages are never
	 * freed, so at worst we will observe inconsistencies in the output.
	 * Within the large map, ensure that PDP and PD page addresses are
	 * valid before descending.
	 */
	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
		/* Print a section header when entering a well-known region. */
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
#ifdef KASAN
		case KASANPML4I:
			sbuf_printf(sb, "\nKASAN shadow map:\n");
			break;
#endif
#ifdef KMSAN
		case KMSANSHADPML4I:
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
			break;
		case KMSANORIGPML4I:
			sbuf_printf(sb, "\nKMSAN origin map:\n");
			break;
#endif
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

		/* Convert to canonical form. */
		if (sva == 1ul << 47)
			sva |= -1ul << 48;

restart:
		pml4e = kernel_pml4[i];
		if ((pml4e & X86_PG_V) == 0) {
			sva = rounddown2(sva, NBPML4);
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPML4;
			continue;
		}
		pa = pml4e & PG_FRAME;
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
			pdpe = pdp[j];
			if ((pdpe & X86_PG_V) == 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_dump(sb, &range, sva);
				sva += NBPDP;
				continue;
			}
			pa = pdpe & PG_FRAME;
			if ((pdpe & PG_PS) != 0) {
				/* 1GB leaf mapping. */
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
				    0, 0);
				range.pdpes++;
				sva += NBPDP;
				continue;
			}
			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
			    vm_phys_paddr_to_vm_page(pa) == NULL) {
				/*
				 * Page table pages for the large map may be
				 * freed.  Validate the next-level address
				 * before descending.
				 */
				goto restart;
			}
			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
				pde = pd[k];
				if ((pde & X86_PG_V) == 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_dump(sb, &range, sva);
					sva += NBPDR;
					continue;
				}
				pa = pde & PG_FRAME;
				if ((pde & PG_PS) != 0) {
					/* 2MB leaf mapping. */
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, 0);
					range.pdes++;
					sva += NBPDR;
					continue;
				}
				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
				    vm_phys_paddr_to_vm_page(pa) == NULL) {
					/*
					 * Page table pages for the large map
					 * may be freed.  Validate the
					 * next-level address before descending.
					 */
					goto restart;
				}
				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
				    sva += PAGE_SIZE) {
					pte = pt[l];
					if ((pte & X86_PG_V) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, pte);
					range.ptes++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");

#ifdef DDB
/*
 * DDB "show pte": print each level of the page-table walk for a virtual
 * address in the current (or debugged) process's pmap.
 */
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	pmap_t pmap;
	pml5_entry_t *pml5;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_offset_t va;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	/* Prefer the pmap of the thread being debugged, if any. */
	if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = PCPU_GET(curpmap);

	PG_V =
pmap_valid_bit(pmap);
	db_printf("VA 0x%016lx", va);

	/* With LA57 enabled there is an extra PML5 level to traverse. */
	if (pmap_is_la57(pmap)) {
		pml5 = pmap_pml5e(pmap, va);
		db_printf(" pml5e 0x%016lx", *pml5);
		if ((*pml5 & PG_V) == 0) {
			db_printf("\n");
			return;
		}
		pml4 = pmap_pml5e_to_pml4e(pml5, va);
	} else {
		pml4 = pmap_pml4e(pmap, va);
	}
	db_printf(" pml4e 0x%016lx", *pml4);
	if ((*pml4 & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	pdp = pmap_pml4e_to_pdpe(pml4, va);
	db_printf(" pdpe 0x%016lx", *pdp);
	/* Stop at an invalid entry or a 1GB superpage leaf. */
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pde = pmap_pdpe_to_pde(pdp, va);
	db_printf(" pde 0x%016lx", *pde);
	/* Stop at an invalid entry or a 2MB superpage leaf. */
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_pde_to_pte(pde, va);
	db_printf(" pte 0x%016lx\n", *pte);
}

/*
 * DDB "show phys2dmap": translate a physical address to its direct-map
 * virtual address.
 */
DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
{
	vm_paddr_t a;

	if (have_addr) {
		a = (vm_paddr_t)addr;
		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
	} else {
		db_printf("show phys2dmap addr\n");
	}
}

/* Print one page-table page: its level, slot index, and vm_page state. */
static void
ptpages_show_page(int level, int idx, vm_page_t pg)
{
	db_printf("l %d i %d pg %p phys %#lx ref %x\n",
	    level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
}

/* Report a valid entry whose frame has no backing vm_page. */
static void
ptpages_show_complain(int level, int idx, uint64_t pte)
{
	db_printf("l %d i %d pte %#lx\n", level, idx, pte);
}

/*
 * Recursively print the page-table pages reachable from one PML4 page.
 * num_entries limits the scan (e.g. to the user-space portion).
 */
static void
ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
{
	vm_page_t pg3, pg2, pg1;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	int i4, i3, i2;

	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
	for (i4 = 0; i4 < num_entries; i4++) {
		if ((pml4[i4] & PG_V) == 0)
			continue;
		pg3 = PHYS_TO_VM_PAGE(pml4[i4] &
PG_FRAME); 12077 if (pg3 == NULL) { 12078 ptpages_show_complain(3, i4, pml4[i4]); 12079 continue; 12080 } 12081 ptpages_show_page(3, i4, pg3); 12082 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); 12083 for (i3 = 0; i3 < NPDPEPG; i3++) { 12084 if ((pdp[i3] & PG_V) == 0) 12085 continue; 12086 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); 12087 if (pg3 == NULL) { 12088 ptpages_show_complain(2, i3, pdp[i3]); 12089 continue; 12090 } 12091 ptpages_show_page(2, i3, pg2); 12092 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); 12093 for (i2 = 0; i2 < NPDEPG; i2++) { 12094 if ((pd[i2] & PG_V) == 0) 12095 continue; 12096 pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); 12097 if (pg1 == NULL) { 12098 ptpages_show_complain(1, i2, pd[i2]); 12099 continue; 12100 } 12101 ptpages_show_page(1, i2, pg1); 12102 } 12103 } 12104 } 12105 } 12106 12107 DB_SHOW_COMMAND(ptpages, pmap_ptpages) 12108 { 12109 pmap_t pmap; 12110 vm_page_t pg; 12111 pml5_entry_t *pml5; 12112 uint64_t PG_V; 12113 int i5; 12114 12115 if (have_addr) 12116 pmap = (pmap_t)addr; 12117 else 12118 pmap = PCPU_GET(curpmap); 12119 12120 PG_V = pmap_valid_bit(pmap); 12121 12122 if (pmap_is_la57(pmap)) { 12123 pml5 = pmap->pm_pmltop; 12124 for (i5 = 0; i5 < NUPML5E; i5++) { 12125 if ((pml5[i5] & PG_V) == 0) 12126 continue; 12127 pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); 12128 if (pg == NULL) { 12129 ptpages_show_complain(4, i5, pml5[i5]); 12130 continue; 12131 } 12132 ptpages_show_page(4, i5, pg); 12133 ptpages_show_pml4(pg, NPML4EPG, PG_V); 12134 } 12135 } else { 12136 ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( 12137 (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); 12138 } 12139 } 12140 #endif 12141