/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * Copyright (c) 2014-2020 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or protection-reduction
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and as to when physical maps must be made correct.
 */

#include "opt_ddb.h"
#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/asan.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/smr.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/asan.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <x86/ifunc.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/msan.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/sysarch.h>
#include <machine/tss.h>

#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static pt_entry_t pg_g;

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = pg_g;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}
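
/*
 * As with pmap_valid_bit() above, the accessed and modified bits are
 * pmap-type dependent.  Note that when A/D bit emulation is enabled
 * for an EPT pmap, the EPT read and write permission bits stand in
 * for the accessed and modified bits, respectively.
 */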
static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_pku_mask_bit(pmap_t pmap)
{

	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
}

static __inline boolean_t
safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
{

	if (!pmap_emulate_ad_bits(pmap))
		return (TRUE);

	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));

	/*
	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration,
	 * so we don't let the referenced (aka EPT_PG_READ) bit be cleared
	 * if the EPT_PG_WRITE bit is set.
	 */
	if ((pte & EPT_PG_WRITE) != 0)
		return (FALSE);

	/*
	 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY flag is
	 * set.
	 */
	if ((pte & EPT_PG_EXECUTE) == 0 ||
	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
		return (TRUE);
	else
		return (FALSE);
}

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#undef pa_index
#ifdef NUMA
#define	pa_index(pa)	({					\
	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
	    ("address %lx beyond the last segment", (pa)));	\
	(pa) >> PDRSHIFT;					\
})
#define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct rwlock *_lock;					\
	if (__predict_false((pa) > pmap_last_pa))		\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(pa_to_pmdp(pa)->pv_lock);		\
	_lock;							\
})
#else
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#endif

/*
 * The CHANGE_* and RELEASE_* macros below implement hand-over-hand
 * switching of PV list locks for code that iterates over mappings of
 * more than one physical page.
 */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/*
 * Statically allocate kernel pmap memory.  However, memory for
 * pm_pcids is obtained after the dynamic allocator is operational.
 * Initialize it with a non-canonical pointer to catch early accesses
 * regardless of the active mapping.
 */
struct pmap kernel_pmap_store = {
	.pm_pcidp = (void *)0xdeadbeefdeadbeef,
};

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

static int __read_frequently pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

int __read_frequently la57 = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &la57, 0,
    "5-level paging for host is enabled");

static bool
pmap_is_la57(pmap_t pmap)
{
	if (pmap->pm_type == PT_X86)
		return (la57);
	return (false);		/* XXXKIB handle EPT */
}

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
u_int64_t		KPML5phys;	/* phys addr of kernel level 5,
					   if supported */

#ifdef KASAN
static uint64_t		KASANPDPphys;
#endif
#ifdef KMSAN
static uint64_t		KMSANSHADPDPphys;
static uint64_t		KMSANORIGPDPphys;

/*
 * To support systems with large amounts of memory, it is necessary to extend
 * the maximum size of the direct map.  This could eat into the space reserved
 * for the shadow map.
 */
_Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
#endif

static pml4_entry_t	*kernel_pml4;
static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
vm_paddr_t		KERNend;	/* and the end */

/*
 * Support for pmap_mapdev() calls made before the pmap module is
 * initialized (e.g., for the early console).
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
 */
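
/*
 * pv_chunks are allocated from the direct map, so the NUMA domain of
 * the physical memory backing a chunk is recoverable from its address.
 */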
#ifdef NUMA
static __inline int
pc_to_domain(struct pv_chunk *pc)
{

	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
}
#else
static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
{

	return (0);
}
#endif

struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

#ifdef NUMA
struct pmap_large_md_page {
	struct rwlock   pv_lock;
	struct md_page  pv_page;
	u_long pv_invl_gen;
};
__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;
__read_mostly vm_paddr_t pmap_last_pa;
#else
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;
#endif

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

static vmem_t *large_vmem;
static u_int lm_ents;
#define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= LARGEMAP_MIN_ADDRESS && \
	(va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)

int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");
int pmap_pcid_invlpg_workaround = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_invlpg_workaround, 0,
    "Enable small core PCID/INVLPG workaround");
int pmap_pcid_invlpg_workaround_uena = 1;

int __read_frequently pti = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pti, 0,
    "Page Table Isolation enabled");
static vm_object_t pti_obj;
static pml4_entry_t *pti_pml4;
static vm_pindex_t pti_pg_idx;
static bool pti_finalized;

struct pmap_pkru_range {
	struct rs_el	pkru_rs_el;
	u_int		pkru_keyidx;
	int		pkru_flags;
};

static uma_zone_t pmap_pkru_ranges_zone;
static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *pkru_dup_range(void *ctx, void *data);
static void pkru_free_range(void *ctx, void *node);
static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void pmap_pkru_deassign_all(pmap_t pmap);

static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD,
    &pcid_save_cnt, "Count of saved TLB context on switch");

static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
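
/*
 * Delayed invalidation (DI) comes in two flavors, selected at boot
 * via ifuncs: a turnstile-based implementation (the "_l" suffixed
 * functions), used when cmpxchg16b is unavailable or the lockless
 * code is disabled by a tunable, and a lockless implementation (the
 * "_u" suffix) that tracks DI blocks on a linked list updated with
 * cmpxchg16b.
 */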
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
};
static struct pmap_invl_gen pmap_invl_gen_head = {
	.gen = 1,
	.next = NULL,
};
static u_long pmap_invl_gen = 1;
static int pmap_invl_waiters;
static struct callout pmap_invl_callout;
static bool pmap_invl_callout_inited;

#define	PMAP_ASSERT_NOT_IN_DI() \
    KASSERT(pmap_not_in_di(), ("DI already started"))

static bool
pmap_di_locked(void)
{
	int tun;

	if ((cpu_feature2 & CPUID2_CX16) == 0)
		return (true);
	tun = 0;
	TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
	return (tun != 0);
}

static int
sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
{
	int locked;

	locked = pmap_di_locked();
	return (sysctl_handle_int(oidp, &locked, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
    CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
    "Locked delayed invalidation");

static bool pmap_not_in_di_l(void);
static bool pmap_not_in_di_u(void);
DEFINE_IFUNC(, bool, pmap_not_in_di, (void))
{

	return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
}

static bool
pmap_not_in_di_l(void)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &curthread->td_md.md_invl_gen;
	return (invl_gen->gen == 0);
}

static void
pmap_thread_init_invl_gen_l(struct thread *td)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &td->td_md.md_invl_gen;
	invl_gen->gen = 0;
}

static void
pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
{
	struct turnstile *ts;

	ts = turnstile_trywait(&invl_gen_ts);
	if (*m_gen > atomic_load_long(invl_gen))
		turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
	else
		turnstile_cancel(ts);
}

static void
pmap_delayed_invl_finish_unblock(u_long new_gen)
{
	struct turnstile *ts;

	turnstile_chain_lock(&invl_gen_ts);
	ts = turnstile_lookup(&invl_gen_ts);
	if (new_gen != 0)
		pmap_invl_gen = new_gen;
	if (ts != NULL) {
		turnstile_broadcast(ts, TS_SHARED_QUEUE);
		turnstile_unpend(ts);
	}
	turnstile_chain_unlock(&invl_gen_ts);
}

/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 */
static void
pmap_delayed_invl_start_l(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finish_l(void)
{
	struct pmap_invl_gen *invl_gen, *next;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_start"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL)
		pmap_delayed_invl_finish_unblock(invl_gen->gen);
	else
		next->gen = invl_gen->gen;
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

static bool
pmap_not_in_di_u(void)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &curthread->td_md.md_invl_gen;
	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
}

static void
pmap_thread_init_invl_gen_u(struct thread *td)
{
	struct pmap_invl_gen *invl_gen;

	invl_gen = &td->td_md.md_invl_gen;
	invl_gen->gen = 0;
	invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
}

static bool
pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
{
	uint64_t new_high, new_low, old_high, old_low;
	char res;

	old_low = new_low = 0;
	old_high = new_high = (uintptr_t)0;

	__asm volatile("lock;cmpxchg16b\t%1"
	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
	    : "b"(new_low), "c" (new_high)
	    : "memory", "cc");
	if (res == 0) {
		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
			return (false);
		out->gen = old_low;
		out->next = (void *)old_high;
	} else {
		out->gen = new_low;
		out->next = (void *)new_high;
	}
	return (true);
}

static bool
pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
    struct pmap_invl_gen *new_val)
{
	uint64_t new_high, new_low, old_high, old_low;
	char res;

	new_low = new_val->gen;
	new_high = (uintptr_t)new_val->next;
	old_low = old_val->gen;
	old_high = (uintptr_t)old_val->next;

	__asm volatile("lock;cmpxchg16b\t%1"
	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
	    : "b"(new_low), "c" (new_high)
	    : "memory", "cc");
	return (res);
}

static COUNTER_U64_DEFINE_EARLY(pv_page_count);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD,
    &pv_page_count, "Current number of allocated pv pages");

static COUNTER_U64_DEFINE_EARLY(user_pt_page_count);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD,
    &user_pt_page_count,
    "Current number of allocated page table pages for userspace");

static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD,
    &kernel_pt_page_count,
    "Current number of allocated page table pages for the kernel");

#ifdef PV_STATS

static COUNTER_U64_DEFINE_EARLY(invl_start_restart);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart,
    CTLFLAG_RD, &invl_start_restart,
    "Number of delayed TLB invalidation request restarts");

static COUNTER_U64_DEFINE_EARLY(invl_finish_restart);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
    &invl_finish_restart,
    "Number of delayed TLB invalidation completion restarts");

static int invl_max_qlen;
SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
    &invl_max_qlen, 0,
    "Maximum delayed TLB invalidation request queue length");
#endif

#define di_delay	locks_delay

static void
pmap_delayed_invl_start_u(void)
{
	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
	struct thread *td;
	struct lock_delay_arg lda;
	uintptr_t prevl;
	u_char pri;
#ifdef PV_STATS
	int i, ii;
#endif

	td = curthread;
	invl_gen = &td->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	lock_delay_arg_init(&lda, &di_delay);
	invl_gen->saved_pri = 0;
	pri = td->td_base_pri;
	if (pri > PVM) {
		thread_lock(td);
		pri = td->td_base_pri;
		if (pri > PVM) {
			invl_gen->saved_pri = pri;
			sched_prio(td, PVM);
		}
		thread_unlock(td);
	}
again:
	PV_STAT(i = 0);
	for (p = &pmap_invl_gen_head;; p = prev.next) {
		PV_STAT(i++);
		prevl = (uintptr_t)atomic_load_ptr(&p->next);
		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
			PV_STAT(counter_u64_add(invl_start_restart, 1));
			lock_delay(&lda);
			goto again;
		}
		if (prevl == 0)
			break;
		prev.next = (void *)prevl;
	}
#ifdef PV_STATS
	if ((ii = invl_max_qlen) < i)
		atomic_cmpset_int(&invl_max_qlen, ii, i);
#endif

	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
		PV_STAT(counter_u64_add(invl_start_restart, 1));
		lock_delay(&lda);
		goto again;
	}

	new_prev.gen = prev.gen;
	new_prev.next = invl_gen;
	invl_gen->gen = prev.gen + 1;

	/* Formal fence between store to invl->gen and updating *p. */
	atomic_thread_fence_rel();

	/*
	 * After inserting an invl_gen element with the invalid bit set,
	 * this thread blocks any other thread trying to enter the
	 * delayed invalidation block.  Do not allow ourselves to be
	 * preempted off the CPU, because that would cause starvation
	 * for other threads.
	 */
	critical_enter();

	/*
	 * ABA for *p is not possible there, since p->gen can only
	 * increase.  So if the *p thread finished its di, then
	 * started a new one and got inserted into the list at the
	 * same place, its gen will appear greater than the previously
	 * read gen.
	 */
	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
		critical_exit();
		PV_STAT(counter_u64_add(invl_start_restart, 1));
		lock_delay(&lda);
		goto again;
	}

	/*
	 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
	 * invl_gen->next, allowing other threads to iterate past us.
	 * pmap_di_store_invl() provides a fence between the generation
	 * write and the update of next.
	 */
	invl_gen->next = NULL;
	critical_exit();
}
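
/*
 * Attempt to unlink the current thread's DI generation element from
 * the tracker list by propagating its generation number to the
 * predecessor element "p".  Returns false, forcing the caller to
 * rescan the list, if the predecessor was concurrently modified and
 * no longer points at us.
 */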
static bool
pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
    struct pmap_invl_gen *p)
{
	struct pmap_invl_gen prev, new_prev;
	u_long mygen;

	/*
	 * Load invl_gen->gen after setting invl_gen->next
	 * PMAP_INVL_GEN_NEXT_INVALID.  This prevents larger
	 * generations from propagating to our invl_gen->gen.  The lock
	 * prefix in atomic_set_ptr() works as a seq_cst fence.
	 */
	mygen = atomic_load_long(&invl_gen->gen);

	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
		return (false);

	KASSERT(prev.gen < mygen,
	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
	new_prev.gen = mygen;
	new_prev.next = (void *)((uintptr_t)invl_gen->next &
	    ~PMAP_INVL_GEN_NEXT_INVALID);

	/* Formal fence between load of prev and storing update to it. */
	atomic_thread_fence_rel();

	return (pmap_di_store_invl(p, &prev, &new_prev));
}

static void
pmap_delayed_invl_finish_u(void)
{
	struct pmap_invl_gen *invl_gen, *p;
	struct thread *td;
	struct lock_delay_arg lda;
	uintptr_t prevl;

	td = curthread;
	invl_gen = &td->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
	    ("missed invl_start: INVALID"));
	lock_delay_arg_init(&lda, &di_delay);

again:
	for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
		prevl = (uintptr_t)atomic_load_ptr(&p->next);
		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
			PV_STAT(counter_u64_add(invl_finish_restart, 1));
			lock_delay(&lda);
			goto again;
		}
		if ((void *)prevl == invl_gen)
			break;
	}

	/*
	 * It is legitimate to not find ourselves on the list if a
	 * thread before us finished its DI and started it again.
	 */
	if (__predict_false(p == NULL)) {
		PV_STAT(counter_u64_add(invl_finish_restart, 1));
		lock_delay(&lda);
		goto again;
	}

	critical_enter();
	atomic_set_ptr((uintptr_t *)&invl_gen->next,
	    PMAP_INVL_GEN_NEXT_INVALID);
	if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
		atomic_clear_ptr((uintptr_t *)&invl_gen->next,
		    PMAP_INVL_GEN_NEXT_INVALID);
		critical_exit();
		PV_STAT(counter_u64_add(invl_finish_restart, 1));
		lock_delay(&lda);
		goto again;
	}
	critical_exit();
	if (atomic_load_int(&pmap_invl_waiters) > 0)
		pmap_delayed_invl_finish_unblock(0);
	if (invl_gen->saved_pri != 0) {
		thread_lock(td);
		sched_prio(td, invl_gen->saved_pri);
		thread_unlock(td);
	}
}

#ifdef DDB
DB_SHOW_COMMAND(di_queue, pmap_di_queue)
{
	struct pmap_invl_gen *p, *pn;
	struct thread *td;
	uintptr_t nextl;
	bool first;

	for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
	    first = false) {
		nextl = (uintptr_t)atomic_load_ptr(&p->next);
		pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
		td = first ? NULL : __containerof(p, struct thread,
		    td_md.md_invl_gen);
		db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
		    (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
		    td != NULL ? td->td_tid : -1);
	}
}
#endif

#ifdef PV_STATS
static COUNTER_U64_DEFINE_EARLY(invl_wait);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait,
    CTLFLAG_RD, &invl_wait,
    "Number of times DI invalidation blocked pmap_remove_all/write");

static COUNTER_U64_DEFINE_EARLY(invl_wait_slow);
SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD,
    &invl_wait_slow, "Number of slow invalidation waits for lockless DI");

#endif

#ifdef NUMA
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{
	vm_paddr_t pa;
	u_long *gen;

	pa = VM_PAGE_TO_PHYS(m);
	if (__predict_false((pa) > pmap_last_pa))
		gen = &pv_dummy_large.pv_invl_gen;
	else
		gen = &(pa_to_pmdp(pa)->pv_invl_gen);

	return (gen);
}
#else
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}
#endif

static void
pmap_delayed_invl_callout_func(void *arg __unused)
{

	if (atomic_load_int(&pmap_invl_waiters) == 0)
		return;
	pmap_delayed_invl_finish_unblock(0);
}

static void
pmap_delayed_invl_callout_init(void *arg __unused)
{

	if (pmap_di_locked())
		return;
	callout_init(&pmap_invl_callout, 1);
	pmap_invl_callout_inited = true;
}
SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
    pmap_delayed_invl_callout_init, NULL);

/*
 * Ensure that all currently executing DI blocks that need to flush
 * the TLB for the given page m have actually done so by the time this
 * function returns.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait_l(vm_page_t m)
{
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;
#endif

	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			counter_u64_add(invl_wait, 1);
			accounted = true;
		}
#endif
		pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
	}
}
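
/*
 * Lockless version of pmap_delayed_invl_wait().  Spin-waits at first,
 * then blocks on the turnstile, arming a one-tick callout to recover
 * from a wakeup lost in the race with pmap_delayed_invl_finish_u().
 */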
static void
pmap_delayed_invl_wait_u(vm_page_t m)
{
	u_long *m_gen;
	struct lock_delay_arg lda;
	bool fast;

	fast = true;
	m_gen = pmap_delayed_invl_genp(m);
	lock_delay_arg_init(&lda, &di_delay);
	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
		if (fast || !pmap_invl_callout_inited) {
			PV_STAT(counter_u64_add(invl_wait, 1));
			lock_delay(&lda);
			fast = false;
		} else {
			/*
			 * The page's invalidation generation number
			 * is still below the current thread's number.
			 * Prepare to block so that we do not waste
			 * CPU cycles or worse, suffer livelock.
			 *
			 * Since it is impossible to block without
			 * racing with pmap_delayed_invl_finish_u(),
			 * prepare for the race by incrementing
			 * pmap_invl_waiters and arming a 1-tick
			 * callout which will unblock us if we lose
			 * the race.
			 */
			atomic_add_int(&pmap_invl_waiters, 1);

			/*
			 * Re-check the current thread's invalidation
			 * generation after incrementing
			 * pmap_invl_waiters, so that there is no race
			 * with pmap_delayed_invl_finish_u() setting
			 * the page generation and checking
			 * pmap_invl_waiters.  The only race allowed
			 * is for a missed unblock, which is handled
			 * by the callout.
			 */
			if (*m_gen >
			    atomic_load_long(&pmap_invl_gen_head.gen)) {
				callout_reset(&pmap_invl_callout, 1,
				    pmap_delayed_invl_callout_func, NULL);
				PV_STAT(counter_u64_add(invl_wait_slow, 1));
				pmap_delayed_invl_wait_block(m_gen,
				    &pmap_invl_gen_head.gen);
			}
			atomic_add_int(&pmap_invl_waiters, -1);
		}
	}
}

DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *))
{

	return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
	    pmap_thread_init_invl_gen_u);
}

DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void))
{

	return (pmap_di_locked() ? pmap_delayed_invl_start_l :
	    pmap_delayed_invl_start_u);
}

DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void))
{

	return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
	    pmap_delayed_invl_finish_u);
}

DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t))
{

	return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
	    pmap_delayed_invl_wait_u);
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * the current thread calls pmap_delayed_invl_finish().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	if (*m_gen < gen)
		*m_gen = gen;
}

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

/*
 * Internal flags for pmap_mapdev_internal() and
 * pmap_change_props_locked().
 */
#define	MAPDEV_FLUSHCACHE	0x00000001	/* Flush cache after mapping. */
#define	MAPDEV_SETATTR		0x00000002	/* Modify existing attrs. */
#define	MAPDEV_ASSERTVALID	0x00000004	/* Assert mapping validity. */

TAILQ_HEAD(pv_chunklist, pv_chunk);

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
		    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void	pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, int flags);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool allpte_PG_A_set);
static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
    vm_offset_t eva);
static void pmap_invalidate_cache_range_all(vm_offset_t sva,
    vm_offset_t eva);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
    pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_large_map_getptp_unlocked(void);
static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
#if VM_NRESERVLEVEL > 0
static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    vm_page_t mpte, struct rwlock **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
    bool exec);
static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pd_entry_t *pde, struct spglist *free, struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
    struct rwlock **lockp);
static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex,
    struct rwlock **lockp, vm_offset_t va);
static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex,
    struct rwlock **lockp, vm_offset_t va);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
    struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int);
static void pmap_free_pt_page(pmap_t, vm_page_t, bool);

/********************/
/* Inline functions */
/********************/

/*
 * Return non-clipped indexes for the given VA; these are the page
 * table page indexes at the corresponding paging level.
 */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}

static __inline vm_pindex_t
pmap_pdpe_pindex(vm_offset_t va)
{
	return (NUPDE + (va >> PDPSHIFT));
}

static __inline vm_pindex_t
pmap_pml4e_pindex(vm_offset_t va)
{
	return (NUPDE + NUPDPE + (va >> PML4SHIFT));
}

static __inline vm_pindex_t
pmap_pml5e_pindex(vm_offset_t va)
{
	return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
}

static __inline pml4_entry_t *
pmap_pml5e(pmap_t pmap, vm_offset_t va)
{

	MPASS(pmap_is_la57(pmap));
	return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
}

static __inline pml4_entry_t *
pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
{

	MPASS(pmap_is_la57(pmap));
	return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
}

static __inline pml4_entry_t *
pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	/* XXX MPASS(pmap_is_la57(pmap); */
	pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
	return (&pml4e[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	pml5_entry_t *pml5e;
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	if (pmap_is_la57(pmap)) {
		pml5e = pmap_pml5e(pmap, va);
		PG_V = pmap_valid_bit(pmap);
		if ((*pml5e & PG_V) == 0)
			return (NULL);
		pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
	} else {
		pml4e = pmap->pm_pmltop;
	}
	return (&pml4e[pmap_pml4e_index(va)]);
}
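
/*
 * Like pmap_pml4e(), but returns the slot in the userspace copy of
 * the top-level page table page (pm_pmltopu), which is used when PTI
 * is enabled.
 */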
static __inline pml4_entry_t *
pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
{
	MPASS(!pmap_is_la57(pmap));
	return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if (pml4e == NULL || (*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	KASSERT((*pdpe & PG_PS) == 0,
	    ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	KASSERT((*pdpe & PG_PS) == 0,
	    ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	KASSERT((*pde & PG_PS) == 0,
	    ("%s: pde %#lx is a leaf", __func__, *pde));
	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline void
pmap_resident_count_adj(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count + count >= 0,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_pt_page_count_pinit(pmap_t pmap, int count)
{
	KASSERT(pmap->pm_stats.resident_count + count >= 0,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_pt_page_count_adj(pmap_t pmap, int count)
{
	if (pmap == kernel_pmap)
		counter_u64_add(kernel_pt_page_count, count);
	else {
		if (pmap != NULL)
			pmap_resident_count_adj(pmap, count);
		counter_u64_add(user_pt_page_count, count);
	}
}
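
/*
 * The kernel page table pages are mapped recursively through the
 * PML4PML4I slot (see create_pagetables()), which makes the PTE or
 * PDE for any kernel virtual address visible at a fixed location
 * inside the PTmap or PDmap window.  vtopte() and vtopde() compute
 * that location by shifting the VA and masking with vtoptem/vtopdem.
 */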
pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
    NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3;
vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap;

PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem)));
}

pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
    NPML4EPGSHIFT)) - 1) << 3;
vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap;

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return ((pd_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem)));
}

static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)

static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 *
	 * Secondly, device memory mapped as part of setting up the low-
	 * level console(s) is taken from KVA, starting at virtual_avail.
	 * This is because cninit() is called after pmap_bootstrap() but
	 * before vm_init() and pmap_init().  20MB for a frame buffer is
	 * not uncommon.
	 */
	pt_pages += 32;		/* 64MB additional slop. */
#endif
	nkpt = pt_pages;
}

/*
 * Returns the proper write/execute permission for a physical page that is
 * part of the initial boot allocations.
 *
 * If the page has kernel text, it is marked as read-only.  If the page has
 * kernel read-only data, it is marked as read-only/not-executable.  If the
 * page has only read-write data, it is marked as read-write/not-executable.
 * If the page is below/above the kernel range, it is marked as read-write.
 *
 * This function operates on 2M pages, since we map the kernel space that
 * way.
 */
static inline pt_entry_t
bootaddr_rwx(vm_paddr_t pa)
{
	/*
	 * The kernel is loaded at a 2MB-aligned address, and memory below that
	 * need not be executable.  The .bss section is padded to a 2MB
	 * boundary, so memory following the kernel need not be executable
	 * either.  Preloaded kernel modules have their mapping permissions
	 * fixed up by the linker.
	 */
	if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
	    pa >= trunc_2mpage(kernphys + _end - KERNSTART))
		return (X86_PG_RW | pg_nx);

	/*
	 * The linker should ensure that the read-only and read-write
	 * portions don't share the same 2M page, so this shouldn't
	 * impact read-only data.  However, in any case, any page with
	 * read-write data needs to be read-write.
	 */
	if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
		return (X86_PG_RW | pg_nx);

	/*
	 * Mark any 2M page containing kernel text as read-only.  Mark
	 * other pages with read-only data as read-only and not executable.
	 * (It is likely a small portion of the read-only data section will
	 * be marked as read-only, but executable.  This should be acceptable
	 * since the read-only protection will keep the data from changing.)
	 * Note that fixups to the .text section will still work until we
	 * set CR0.WP.
	 */
	if (pa < round_2mpage(kernphys + etext - KERNSTART))
		return (0);
	return (pg_nx);
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;
	uint64_t DMPDkernphys;
	vm_paddr_t pax;
#ifdef KASAN
	pt_entry_t *pt_p;
	uint64_t KASANPDphys, KASANPTphys, KASANphys;
	vm_offset_t kasankernbase;
	int kasankpdpi, kasankpdi, nkasanpte;
#endif
	int i, j, ndm1g, nkpdpe, nkdmpde;

	TSENTER();
	/* Allocate page table pages for the direct map */
	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0) {
		/*
		 * Calculate the number of 1G pages that will fully fit in
		 * Maxmem.
		 */
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;

		/*
		 * Allocate 2M pages for the kernel.  These will be used in
		 * place of the one or more 1G pages from ndm1g that map
		 * kernel memory into DMAP.
		 */
		nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
		    kernphys - rounddown2(kernphys, NBPDP), NBPDP);
		DMPDkernphys = allocpages(firstaddr, nkdmpde);
	}
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages. */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
#ifdef KASAN
	KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
	KASANPDphys = allocpages(firstaddr, 1);
#endif
#ifdef KMSAN
	/*
	 * The KMSAN shadow maps are initially left unpopulated, since there is
	 * no need to shadow memory above KERNBASE.
	 */
	KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E);
	KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E);
#endif

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

#ifdef KASAN
	nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE);
	KASANPTphys = allocpages(firstaddr, nkasanpte);
	KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG);
#endif

	/*
	 * Connect the zero-filled PT pages to their PD entries.  This
	 * implicitly maps the PT pages at their correct locations within
	 * the PTmap.
	 */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/*
	 * Map from start of the kernel in physical memory (staging
	 * area) to the end of loader preallocated memory using 2MB
	 * pages.  This replaces some of the PD entries created above.
	 * For compatibility, identity map 2M at the start.
	 */
	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
	    X86_PG_RW | pg_nx;
	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
		    X86_PG_A | bootaddr_rwx(pax);
	}

	/*
	 * Because we map the physical blocks in 2M pages, adjust firstaddr
	 * to record the physical blocks we've actually mapped into kernel
	 * virtual address space.
	 */
	if (*firstaddr < round_2mpage(KERNend))
		*firstaddr = round_2mpage(KERNend);

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

#ifdef KASAN
	kasankernbase = kasan_md_addr_to_shad(KERNBASE);
	kasankpdpi = pmap_pdpe_index(kasankernbase);
	kasankpdi = pmap_pde_index(kasankernbase);

	pdp_p = (pdp_entry_t *)KASANPDPphys;
	pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx);

	pd_p = (pd_entry_t *)KASANPDphys;
	for (i = 0; i < nkasanpte; i++)
		pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW |
		    X86_PG_V | pg_nx;

	pt_p = (pt_entry_t *)KASANPTphys;
	for (i = 0; i < nkasanpte * NPTEPG; i++)
		pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    X86_PG_M | X86_PG_A | pg_nx;
#endif

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
		    X86_PG_M | X86_PG_A | pg_nx;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
*/ 1847 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1848 X86_PG_M | X86_PG_A | pg_nx; 1849 } 1850 for (j = 0; i < ndmpdp; i++, j++) { 1851 pdp_p[i] = DMPDphys + ptoa(j); 1852 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1853 } 1854 1855 /* 1856 * Instead of using a 1G page for the memory containing the kernel, 1857 * use 2M pages with read-only and no-execute permissions. (If using 1G 1858 * pages, this will partially overwrite the PDPEs above.) 1859 */ 1860 if (ndm1g > 0) { 1861 pd_p = (pd_entry_t *)DMPDkernphys; 1862 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1863 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1864 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1865 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1866 } 1867 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1868 for (i = 0; i < nkdmpde; i++) { 1869 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1870 X86_PG_RW | X86_PG_V | pg_nx; 1871 } 1872 } 1873 1874 /* And recursively map PML4 to itself in order to get PTmap */ 1875 p4_p = (pml4_entry_t *)KPML4phys; 1876 p4_p[PML4PML4I] = KPML4phys; 1877 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1878 1879 #ifdef KASAN 1880 /* Connect the KASAN shadow map slots up to the PML4. */ 1881 for (i = 0; i < NKASANPML4E; i++) { 1882 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1883 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1884 } 1885 #endif 1886 1887 #ifdef KMSAN 1888 /* Connect the KMSAN shadow map slots up to the PML4. */ 1889 for (i = 0; i < NKMSANSHADPML4E; i++) { 1890 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1891 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1892 } 1893 1894 /* Connect the KMSAN origin map slots up to the PML4. */ 1895 for (i = 0; i < NKMSANORIGPML4E; i++) { 1896 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1897 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1898 } 1899 #endif 1900 1901 /* Connect the Direct Map slots up to the PML4. */ 1902 for (i = 0; i < ndmpdpphys; i++) { 1903 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1904 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1905 } 1906 1907 /* Connect the KVA slots up to the PML4 */ 1908 for (i = 0; i < NKPML4E; i++) { 1909 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1910 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1911 } 1912 1913 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1914 TSEXIT(); 1915 } 1916 1917 /* 1918 * Bootstrap the system enough to run with virtual memory. 1919 * 1920 * On amd64 this is called after mapping has already been enabled 1921 * and just syncs the pmap module with what has already been done. 1922 * [We can't call it easily with mapping off since the kernel is not 1923 * mapped with PA == VA, hence we would have to relocate every address 1924 * from the linked base (virtual) address "KERNBASE" to the actual 1925 * (physical) address starting relative to 0] 1926 */ 1927 void 1928 pmap_bootstrap(vm_paddr_t *firstaddr) 1929 { 1930 vm_offset_t va; 1931 pt_entry_t *pte, *pcpu_pte; 1932 struct region_descriptor r_gdt; 1933 uint64_t cr4, pcpu0_phys; 1934 u_long res; 1935 int i; 1936 1937 TSENTER(); 1938 KERNend = *firstaddr; 1939 res = atop(KERNend - (vm_paddr_t)kernphys); 1940 1941 if (!pti) 1942 pg_g = X86_PG_G; 1943 1944 /* 1945 * Create an initial set of page tables to run the kernel in. 
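 *
 * (Note: create_pagetables() draws its page table pages from
 * *firstaddr via allocpages() and advances it, so on return
 * *firstaddr is at least round_2mpage(KERNend).)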
1946 */ 1947 create_pagetables(firstaddr); 1948 1949 pcpu0_phys = allocpages(firstaddr, 1); 1950 1951 /* 1952 * Add a physical memory segment (vm_phys_seg) corresponding to the 1953 * preallocated kernel page table pages so that vm_page structures 1954 * representing these pages will be created. The vm_page structures 1955 * are required for promotion of the corresponding kernel virtual 1956 * addresses to superpage mappings. 1957 */ 1958 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1959 1960 /* 1961 * Account for the virtual addresses mapped by create_pagetables(). 1962 */ 1963 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1964 (vm_paddr_t)kernphys); 1965 virtual_end = VM_MAX_KERNEL_ADDRESS; 1966 1967 /* 1968 * Enable PG_G global pages, then switch to the kernel page 1969 * table from the bootstrap page table. After the switch, it 1970 * is possible to enable SMEP and SMAP since PG_U bits are 1971 * correct now. 1972 */ 1973 cr4 = rcr4(); 1974 cr4 |= CR4_PGE; 1975 load_cr4(cr4); 1976 load_cr3(KPML4phys); 1977 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1978 cr4 |= CR4_SMEP; 1979 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1980 cr4 |= CR4_SMAP; 1981 load_cr4(cr4); 1982 1983 /* 1984 * Initialize the kernel pmap (which is statically allocated). 1985 * Count bootstrap data as being resident in case any of this data is 1986 * later unmapped (using pmap_remove()) and freed. 1987 */ 1988 PMAP_LOCK_INIT(kernel_pmap); 1989 kernel_pmap->pm_pmltop = kernel_pml4; 1990 kernel_pmap->pm_cr3 = KPML4phys; 1991 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1992 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1993 kernel_pmap->pm_stats.resident_count = res; 1994 vm_radix_init(&kernel_pmap->pm_root); 1995 kernel_pmap->pm_flags = pmap_flags; 1996 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 1997 rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range, 1998 pkru_free_range, kernel_pmap, M_NOWAIT); 1999 } 2000 2001 /* 2002 * The kernel pmap is always active on all CPUs. Once CPUs are 2003 * enumerated, the mask will be set equal to all_cpus. 2004 */ 2005 CPU_FILL(&kernel_pmap->pm_active); 2006 2007 /* 2008 * Initialize the TLB invalidations generation number lock. 2009 */ 2010 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 2011 2012 /* 2013 * Reserve some special page table entries/VA space for temporary 2014 * mapping of pages. 2015 */ 2016 #define SYSMAP(c, p, v, n) \ 2017 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 2018 2019 va = virtual_avail; 2020 pte = vtopte(va); 2021 2022 /* 2023 * Crashdump maps. The first page is reused as CMAP1 for the 2024 * memory test. 2025 */ 2026 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 2027 CADDR1 = crashdumpmap; 2028 2029 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 2030 virtual_avail = va; 2031 2032 /* 2033 * Map the BSP PCPU now, the rest of the PCPUs are mapped by 2034 * amd64_mp_alloc_pcpu()/start_all_aps() when we know the 2035 * number of CPUs and NUMA affinity. 2036 */ 2037 pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx | 2038 X86_PG_M | X86_PG_A; 2039 for (i = 1; i < MAXCPU; i++) 2040 pcpu_pte[i] = 0; 2041 2042 /* 2043 * Re-initialize PCPU area for BSP after switching. 2044 * Make hardware use gdt and common_tss from the new PCPU. 
2045 */ 2046 STAILQ_INIT(&cpuhead); 2047 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2048 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2049 amd64_bsp_pcpu_init1(&__pcpu[0]); 2050 amd64_bsp_ist_init(&__pcpu[0]); 2051 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2052 IOPERM_BITMAP_SIZE; 2053 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2054 sizeof(struct user_segment_descriptor)); 2055 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2056 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2057 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2058 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2059 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2060 lgdt(&r_gdt); 2061 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2062 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2063 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2064 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2065 2066 /* 2067 * Initialize the PAT MSR. 2068 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2069 * side-effect, invalidates stale PG_G TLB entries that might 2070 * have been created in our pre-boot environment. 2071 */ 2072 pmap_init_pat(); 2073 2074 /* Initialize TLB Context Id. */ 2075 if (pmap_pcid_enabled) { 2076 kernel_pmap->pm_pcidp = (void *)(uintptr_t) 2077 offsetof(struct pcpu, pc_kpmap_store); 2078 2079 PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN); 2080 PCPU_SET(kpmap_store.pm_gen, 1); 2081 2082 /* 2083 * PMAP_PCID_KERN + 1 is used for initialization of 2084 * proc0 pmap. The pmap' pcid state might be used by 2085 * EFIRT entry before first context switch, so it 2086 * needs to be valid. 2087 */ 2088 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2089 PCPU_SET(pcid_gen, 1); 2090 2091 /* 2092 * pcpu area for APs is zeroed during AP startup. 2093 * pc_pcid_next and pc_pcid_gen are initialized by AP 2094 * during pcpu setup. 2095 */ 2096 load_cr4(rcr4() | CR4_PCIDE); 2097 } 2098 TSEXIT(); 2099 } 2100 2101 /* 2102 * Setup the PAT MSR. 2103 */ 2104 void 2105 pmap_init_pat(void) 2106 { 2107 uint64_t pat_msr; 2108 u_long cr0, cr4; 2109 int i; 2110 2111 /* Bail if this CPU doesn't implement PAT. */ 2112 if ((cpu_feature & CPUID_PAT) == 0) 2113 panic("no PAT??"); 2114 2115 /* Set default PAT index table. */ 2116 for (i = 0; i < PAT_INDEX_SIZE; i++) 2117 pat_index[i] = -1; 2118 pat_index[PAT_WRITE_BACK] = 0; 2119 pat_index[PAT_WRITE_THROUGH] = 1; 2120 pat_index[PAT_UNCACHEABLE] = 3; 2121 pat_index[PAT_WRITE_COMBINING] = 6; 2122 pat_index[PAT_WRITE_PROTECTED] = 5; 2123 pat_index[PAT_UNCACHED] = 2; 2124 2125 /* 2126 * Initialize default PAT entries. 2127 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2128 * Program 5 and 6 as WP and WC. 2129 * 2130 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2131 * mapping for a 2M page uses a PAT value with the bit 3 set due 2132 * to its overload with PG_PS. 2133 */ 2134 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2135 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2136 PAT_VALUE(2, PAT_UNCACHED) | 2137 PAT_VALUE(3, PAT_UNCACHEABLE) | 2138 PAT_VALUE(4, PAT_WRITE_BACK) | 2139 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2140 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2141 PAT_VALUE(7, PAT_UNCACHEABLE); 2142 2143 /* Disable PGE. */ 2144 cr4 = rcr4(); 2145 load_cr4(cr4 & ~CR4_PGE); 2146 2147 /* Disable caches (CD = 1, NW = 0). */ 2148 cr0 = rcr0(); 2149 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2150 2151 /* Flushes caches and TLBs. */ 2152 wbinvd(); 2153 invltlb(); 2154 2155 /* Update PAT and index table. 
*/ 2156 wrmsr(MSR_PAT, pat_msr); 2157 2158 /* Flush caches and TLBs again. */ 2159 wbinvd(); 2160 invltlb(); 2161 2162 /* Restore caches and PGE. */ 2163 load_cr0(cr0); 2164 load_cr4(cr4); 2165 } 2166 2167 vm_page_t 2168 pmap_page_alloc_below_4g(bool zeroed) 2169 { 2170 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2171 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2172 } 2173 2174 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2175 la57_trampoline_gdt[], la57_trampoline_end[]; 2176 2177 static void 2178 pmap_bootstrap_la57(void *arg __unused) 2179 { 2180 char *v_code; 2181 pml5_entry_t *v_pml5; 2182 pml4_entry_t *v_pml4; 2183 pdp_entry_t *v_pdp; 2184 pd_entry_t *v_pd; 2185 pt_entry_t *v_pt; 2186 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2187 void (*la57_tramp)(uint64_t pml5); 2188 struct region_descriptor r_gdt; 2189 2190 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2191 return; 2192 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2193 if (!la57) 2194 return; 2195 2196 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2197 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2198 2199 m_code = pmap_page_alloc_below_4g(true); 2200 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2201 m_pml5 = pmap_page_alloc_below_4g(true); 2202 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2203 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2204 m_pml4 = pmap_page_alloc_below_4g(true); 2205 v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2206 m_pdp = pmap_page_alloc_below_4g(true); 2207 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2208 m_pd = pmap_page_alloc_below_4g(true); 2209 v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2210 m_pt = pmap_page_alloc_below_4g(true); 2211 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2212 2213 /* 2214 * Map m_code 1:1; it appears below 4G in KVA because its physical 2215 * address is below 4G. Since kernel KVA is in the upper half, 2216 * the pml4e should be zero and free for temporary use. 2217 */ 2218 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2219 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2220 X86_PG_M; 2221 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2222 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2223 X86_PG_M; 2224 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2225 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2226 X86_PG_M; 2227 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2228 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2229 X86_PG_M; 2230 2231 /* 2232 * Add a pml5 entry at the top of KVA pointing to the existing pml4 2233 * table, bringing all existing kernel mappings into the level 5 table. 2234 */ 2235 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2236 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; 2237 2238 /* 2239 * Add a pml5 entry for the 1:1 trampoline mapping after LA57 is turned on. 2240 */ 2241 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2242 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2243 X86_PG_M; 2244 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2245 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2246 X86_PG_M; 2247 2248 /* 2249 * Copy and call the 48->57 trampoline, hope we return there, alive.
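 *
 * (A sketch of the mechanism, offered as a reading aid rather than
 * a specification: the trampoline has to execute from the 1:1
 * mapped page built above, since toggling CR4.LA57 changes the
 * paging mode and %rip must stay valid across the switch of page
 * table roots.)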
2250 */ 2251 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2252 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2253 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2254 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2255 invlpg((vm_offset_t)la57_tramp); 2256 la57_tramp(KPML5phys); 2257 2258 /* 2259 * gdt was necessary reset, switch back to our gdt. 2260 */ 2261 lgdt(&r_gdt); 2262 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2263 load_ds(_udatasel); 2264 load_es(_udatasel); 2265 load_fs(_ufssel); 2266 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2267 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2268 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2269 2270 /* 2271 * Now unmap the trampoline, and free the pages. 2272 * Clear pml5 entry used for 1:1 trampoline mapping. 2273 */ 2274 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2275 invlpg((vm_offset_t)v_code); 2276 vm_page_free(m_code); 2277 vm_page_free(m_pdp); 2278 vm_page_free(m_pd); 2279 vm_page_free(m_pt); 2280 2281 /* 2282 * Recursively map PML5 to itself in order to get PTmap and 2283 * PDmap. 2284 */ 2285 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2286 2287 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 2288 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2289 PTmap = (vm_offset_t)P5Tmap; 2290 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 2291 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2292 PDmap = (vm_offset_t)P5Dmap; 2293 2294 kernel_pmap->pm_cr3 = KPML5phys; 2295 kernel_pmap->pm_pmltop = v_pml5; 2296 pmap_pt_page_count_adj(kernel_pmap, 1); 2297 } 2298 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2299 2300 /* 2301 * Initialize a vm_page's machine-dependent fields. 2302 */ 2303 void 2304 pmap_page_init(vm_page_t m) 2305 { 2306 2307 TAILQ_INIT(&m->md.pv_list); 2308 m->md.pat_mode = PAT_WRITE_BACK; 2309 } 2310 2311 static int pmap_allow_2m_x_ept; 2312 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2313 &pmap_allow_2m_x_ept, 0, 2314 "Allow executable superpage mappings in EPT"); 2315 2316 void 2317 pmap_allow_2m_x_ept_recalculate(void) 2318 { 2319 /* 2320 * SKL002, SKL012S. Since the EPT format is only used by 2321 * Intel CPUs, the vendor check is merely a formality. 
2322 */ 2323 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2324 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2325 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2326 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2327 CPUID_TO_MODEL(cpu_id) == 0x27 || 2328 CPUID_TO_MODEL(cpu_id) == 0x35 || 2329 CPUID_TO_MODEL(cpu_id) == 0x36 || 2330 CPUID_TO_MODEL(cpu_id) == 0x37 || 2331 CPUID_TO_MODEL(cpu_id) == 0x86 || 2332 CPUID_TO_MODEL(cpu_id) == 0x1c || 2333 CPUID_TO_MODEL(cpu_id) == 0x4a || 2334 CPUID_TO_MODEL(cpu_id) == 0x4c || 2335 CPUID_TO_MODEL(cpu_id) == 0x4d || 2336 CPUID_TO_MODEL(cpu_id) == 0x5a || 2337 CPUID_TO_MODEL(cpu_id) == 0x5c || 2338 CPUID_TO_MODEL(cpu_id) == 0x5d || 2339 CPUID_TO_MODEL(cpu_id) == 0x5f || 2340 CPUID_TO_MODEL(cpu_id) == 0x6e || 2341 CPUID_TO_MODEL(cpu_id) == 0x7a || 2342 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2343 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2344 pmap_allow_2m_x_ept = 1; 2345 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2346 } 2347 2348 static bool 2349 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2350 { 2351 2352 return (pmap->pm_type != PT_EPT || !executable || 2353 !pmap_allow_2m_x_ept); 2354 } 2355 2356 #ifdef NUMA 2357 static void 2358 pmap_init_pv_table(void) 2359 { 2360 struct pmap_large_md_page *pvd; 2361 vm_size_t s; 2362 long start, end, highest, pv_npg; 2363 int domain, i, j, pages; 2364 2365 /* 2366 * For correctness we depend on the size being evenly divisible into a 2367 * page. As a tradeoff between performance and total memory use, the 2368 * entry is 64 bytes (aka one cacheline) in size. Not being smaller 2369 * avoids false-sharing, but not being 128 bytes potentially allows for 2370 * avoidable traffic due to adjacent cacheline prefetcher. 2371 * 2372 * Assert the size so that accidental changes fail to compile. 2373 */ 2374 CTASSERT((sizeof(*pvd) == 64)); 2375 2376 /* 2377 * Calculate the size of the array. 2378 */ 2379 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2380 pv_npg = howmany(pmap_last_pa, NBPDR); 2381 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2382 s = round_page(s); 2383 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2384 if (pv_table == NULL) 2385 panic("%s: kva_alloc failed\n", __func__); 2386 2387 /* 2388 * Iterate physical segments to allocate space for respective pages. 
2389 */ 2390 highest = -1; 2391 s = 0; 2392 for (i = 0; i < vm_phys_nsegs; i++) { 2393 end = vm_phys_segs[i].end / NBPDR; 2394 domain = vm_phys_segs[i].domain; 2395 2396 if (highest >= end) 2397 continue; 2398 2399 start = highest + 1; 2400 pvd = &pv_table[start]; 2401 2402 pages = end - start + 1; 2403 s = round_page(pages * sizeof(*pvd)); 2404 highest = start + (s / sizeof(*pvd)) - 1; 2405 2406 for (j = 0; j < s; j += PAGE_SIZE) { 2407 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2408 if (m == NULL) 2409 panic("failed to allocate PV table page"); 2410 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2411 } 2412 2413 for (j = 0; j < s / sizeof(*pvd); j++) { 2414 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2415 TAILQ_INIT(&pvd->pv_page.pv_list); 2416 pvd->pv_page.pv_gen = 0; 2417 pvd->pv_page.pat_mode = 0; 2418 pvd->pv_invl_gen = 0; 2419 pvd++; 2420 } 2421 } 2422 pvd = &pv_dummy_large; 2423 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2424 TAILQ_INIT(&pvd->pv_page.pv_list); 2425 pvd->pv_page.pv_gen = 0; 2426 pvd->pv_page.pat_mode = 0; 2427 pvd->pv_invl_gen = 0; 2428 } 2429 #else 2430 static void 2431 pmap_init_pv_table(void) 2432 { 2433 vm_size_t s; 2434 long i, pv_npg; 2435 2436 /* 2437 * Initialize the pool of pv list locks. 2438 */ 2439 for (i = 0; i < NPV_LIST_LOCKS; i++) 2440 rw_init(&pv_list_locks[i], "pmap pv list"); 2441 2442 /* 2443 * Calculate the size of the pv head table for superpages. 2444 */ 2445 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2446 2447 /* 2448 * Allocate memory for the pv head table for superpages. 2449 */ 2450 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2451 s = round_page(s); 2452 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 2453 for (i = 0; i < pv_npg; i++) 2454 TAILQ_INIT(&pv_table[i].pv_list); 2455 TAILQ_INIT(&pv_dummy.pv_list); 2456 } 2457 #endif 2458 2459 /* 2460 * Initialize the pmap module. 2461 * Called by vm_init, to initialize any structures that the pmap 2462 * system needs to map virtual memory. 2463 */ 2464 void 2465 pmap_init(void) 2466 { 2467 struct pmap_preinit_mapping *ppim; 2468 vm_page_t m, mpte; 2469 int error, i, ret, skz63; 2470 2471 /* L1TF, reserve page @0 unconditionally */ 2472 vm_page_blacklist_add(0, bootverbose); 2473 2474 /* Detect bare-metal Skylake Server and Skylake-X. */ 2475 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2476 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2477 /* 2478 * Skylake-X errata SKZ63. Processor May Hang When 2479 * Executing Code In an HLE Transaction Region between 2480 * 40000000H and 403FFFFFH. 2481 * 2482 * Mark the pages in the range as preallocated. It 2483 * seems to be impossible to distinguish between 2484 * Skylake Server and Skylake X. 2485 */ 2486 skz63 = 1; 2487 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2488 if (skz63 != 0) { 2489 if (bootverbose) 2490 printf("SKZ63: skipping 4M RAM starting " 2491 "at physical 1G\n"); 2492 for (i = 0; i < atop(0x400000); i++) { 2493 ret = vm_page_blacklist_add(0x40000000 + 2494 ptoa(i), FALSE); 2495 if (!ret && bootverbose) 2496 printf("page at %#lx already used\n", 2497 0x40000000 + ptoa(i)); 2498 } 2499 } 2500 } 2501 2502 /* IFU */ 2503 pmap_allow_2m_x_ept_recalculate(); 2504 2505 /* 2506 * Initialize the vm page array entries for the kernel pmap's 2507 * page table pages. 
2508 */ 2509 PMAP_LOCK(kernel_pmap); 2510 for (i = 0; i < nkpt; i++) { 2511 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2512 KASSERT(mpte >= vm_page_array && 2513 mpte < &vm_page_array[vm_page_array_size], 2514 ("pmap_init: page table page is out of range")); 2515 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2516 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2517 mpte->ref_count = 1; 2518 2519 /* 2520 * Collect the page table pages that were replaced by a 2MB 2521 * page in create_pagetables(). They are zero filled. 2522 */ 2523 if ((i == 0 || 2524 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2525 pmap_insert_pt_page(kernel_pmap, mpte, false, false)) 2526 panic("pmap_init: pmap_insert_pt_page failed"); 2527 } 2528 PMAP_UNLOCK(kernel_pmap); 2529 vm_wire_add(nkpt); 2530 2531 /* 2532 * If the kernel is running on a virtual machine, then it must assume 2533 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2534 * be prepared for the hypervisor changing the vendor and family that 2535 * are reported by CPUID. Consequently, the workaround for AMD Family 2536 * 10h Erratum 383 is enabled if the processor's feature set does not 2537 * include at least one feature that is only supported by older Intel 2538 * or newer AMD processors. 2539 */ 2540 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2541 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2542 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2543 AMDID2_FMA4)) == 0) 2544 workaround_erratum383 = 1; 2545 2546 /* 2547 * Are large page mappings enabled? 2548 */ 2549 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2550 if (pg_ps_enabled) { 2551 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2552 ("pmap_init: can't assign to pagesizes[1]")); 2553 pagesizes[1] = NBPDR; 2554 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2555 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2556 ("pmap_init: can't assign to pagesizes[2]")); 2557 pagesizes[2] = NBPDP; 2558 } 2559 } 2560 2561 /* 2562 * Initialize pv chunk lists. 
2563 */ 2564 for (i = 0; i < PMAP_MEMDOM; i++) { 2565 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2566 TAILQ_INIT(&pv_chunks[i].pvc_list); 2567 } 2568 pmap_init_pv_table(); 2569 2570 pmap_initialized = 1; 2571 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2572 ppim = pmap_preinit_mapping + i; 2573 if (ppim->va == 0) 2574 continue; 2575 /* Make the direct map consistent */ 2576 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2577 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2578 ppim->sz, ppim->mode); 2579 } 2580 if (!bootverbose) 2581 continue; 2582 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2583 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2584 } 2585 2586 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2587 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2588 (vmem_addr_t *)&qframe); 2589 if (error != 0) 2590 panic("qframe allocation failed"); 2591 2592 lm_ents = 8; 2593 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2594 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2595 lm_ents = LMEPML4I - LMSPML4I + 1; 2596 #ifdef KMSAN 2597 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2598 printf( 2599 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2600 lm_ents, KMSANORIGPML4I - LMSPML4I); 2601 lm_ents = KMSANORIGPML4I - LMSPML4I; 2602 } 2603 #endif 2604 if (bootverbose) 2605 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2606 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2607 if (lm_ents != 0) { 2608 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2609 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2610 if (large_vmem == NULL) { 2611 printf("pmap: cannot create large map\n"); 2612 lm_ents = 0; 2613 } 2614 for (i = 0; i < lm_ents; i++) { 2615 m = pmap_large_map_getptp_unlocked(); 2616 /* XXXKIB la57 */ 2617 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2618 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2619 VM_PAGE_TO_PHYS(m); 2620 } 2621 } 2622 } 2623 2624 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2625 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2626 "Maximum number of PML4 entries for use by large map (tunable). " 2627 "Each entry corresponds to 512GB of address space."); 2628 2629 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2630 "2MB page mapping counters"); 2631 2632 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2633 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2634 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2635 2636 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2637 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2638 &pmap_pde_mappings, "2MB page mappings"); 2639 2640 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2641 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2642 &pmap_pde_p_failures, "2MB page promotion failures"); 2643 2644 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2645 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2646 &pmap_pde_promotions, "2MB page promotions"); 2647 2648 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2649 "1GB page mapping counters"); 2650 2651 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2652 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2653 &pmap_pdpe_demotions, "1GB page demotions"); 2654 2655 /*************************************************** 2656 * Low level helper routines..... 
2657 ***************************************************/ 2658 2659 static pt_entry_t 2660 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2661 { 2662 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2663 2664 switch (pmap->pm_type) { 2665 case PT_X86: 2666 case PT_RVI: 2667 /* Verify that both PAT bits are not set at the same time */ 2668 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2669 ("Invalid PAT bits in entry %#lx", entry)); 2670 2671 /* Swap the PAT bits if one of them is set */ 2672 if ((entry & x86_pat_bits) != 0) 2673 entry ^= x86_pat_bits; 2674 break; 2675 case PT_EPT: 2676 /* 2677 * Nothing to do - the memory attributes are represented 2678 * the same way for regular pages and superpages. 2679 */ 2680 break; 2681 default: 2682 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2683 } 2684 2685 return (entry); 2686 } 2687 2688 boolean_t 2689 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2690 { 2691 2692 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2693 pat_index[(int)mode] >= 0); 2694 } 2695 2696 /* 2697 * Determine the appropriate bits to set in a PTE or PDE for a specified 2698 * caching mode. 2699 */ 2700 int 2701 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 2702 { 2703 int cache_bits, pat_flag, pat_idx; 2704 2705 if (!pmap_is_valid_memattr(pmap, mode)) 2706 panic("Unknown caching mode %d\n", mode); 2707 2708 switch (pmap->pm_type) { 2709 case PT_X86: 2710 case PT_RVI: 2711 /* The PAT bit is different for PTE's and PDE's. */ 2712 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2713 2714 /* Map the caching mode to a PAT index. */ 2715 pat_idx = pat_index[mode]; 2716 2717 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2718 cache_bits = 0; 2719 if (pat_idx & 0x4) 2720 cache_bits |= pat_flag; 2721 if (pat_idx & 0x2) 2722 cache_bits |= PG_NC_PCD; 2723 if (pat_idx & 0x1) 2724 cache_bits |= PG_NC_PWT; 2725 break; 2726 2727 case PT_EPT: 2728 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2729 break; 2730 2731 default: 2732 panic("unsupported pmap type %d", pmap->pm_type); 2733 } 2734 2735 return (cache_bits); 2736 } 2737 2738 static int 2739 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 2740 { 2741 int mask; 2742 2743 switch (pmap->pm_type) { 2744 case PT_X86: 2745 case PT_RVI: 2746 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2747 break; 2748 case PT_EPT: 2749 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2750 break; 2751 default: 2752 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2753 } 2754 2755 return (mask); 2756 } 2757 2758 static int 2759 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2760 { 2761 int pat_flag, pat_idx; 2762 2763 pat_idx = 0; 2764 switch (pmap->pm_type) { 2765 case PT_X86: 2766 case PT_RVI: 2767 /* The PAT bit is different for PTE's and PDE's. */ 2768 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2769 2770 if ((pte & pat_flag) != 0) 2771 pat_idx |= 0x4; 2772 if ((pte & PG_NC_PCD) != 0) 2773 pat_idx |= 0x2; 2774 if ((pte & PG_NC_PWT) != 0) 2775 pat_idx |= 0x1; 2776 break; 2777 case PT_EPT: 2778 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2779 panic("EPT PTE %#lx has no PAT memory type", pte); 2780 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2781 break; 2782 } 2783 2784 /* See pmap_init_pat(). 
*/ 2785 if (pat_idx == 4) 2786 pat_idx = 0; 2787 if (pat_idx == 7) 2788 pat_idx = 3; 2789 2790 return (pat_idx); 2791 } 2792 2793 bool 2794 pmap_ps_enabled(pmap_t pmap) 2795 { 2796 2797 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2798 } 2799 2800 static void 2801 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2802 { 2803 2804 switch (pmap->pm_type) { 2805 case PT_X86: 2806 break; 2807 case PT_RVI: 2808 case PT_EPT: 2809 /* 2810 * XXX 2811 * This is a little bogus since the generation number is 2812 * supposed to be bumped up when a region of the address 2813 * space is invalidated in the page tables. 2814 * 2815 * In this case the old PDE entry is valid but yet we want 2816 * to make sure that any mappings using the old entry are 2817 * invalidated in the TLB. 2818 * 2819 * The reason this works as expected is because we rendezvous 2820 * "all" host cpus and force any vcpu context to exit as a 2821 * side-effect. 2822 */ 2823 atomic_add_long(&pmap->pm_eptgen, 1); 2824 break; 2825 default: 2826 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2827 } 2828 pde_store(pde, newpde); 2829 } 2830 2831 /* 2832 * After changing the page size for the specified virtual address in the page 2833 * table, flush the corresponding entries from the processor's TLB. Only the 2834 * calling processor's TLB is affected. 2835 * 2836 * The calling thread must be pinned to a processor. 2837 */ 2838 static void 2839 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2840 { 2841 pt_entry_t PG_G; 2842 2843 if (pmap_type_guest(pmap)) 2844 return; 2845 2846 KASSERT(pmap->pm_type == PT_X86, 2847 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2848 2849 PG_G = pmap_global_bit(pmap); 2850 2851 if ((newpde & PG_PS) == 0) 2852 /* Demotion: flush a specific 2MB page mapping. */ 2853 pmap_invlpg(pmap, va); 2854 else if ((newpde & PG_G) == 0) 2855 /* 2856 * Promotion: flush every 4KB page mapping from the TLB 2857 * because there are too many to flush individually. 2858 */ 2859 invltlb(); 2860 else { 2861 /* 2862 * Promotion: flush every 4KB page mapping from the TLB, 2863 * including any global (PG_G) mappings. 2864 */ 2865 invltlb_glob(); 2866 } 2867 } 2868 2869 /* 2870 * The amd64 pmap uses different approaches to TLB invalidation 2871 * depending on the kernel configuration, available hardware features, 2872 * and known hardware errata. The kernel configuration option that 2873 * has the greatest operational impact on TLB invalidation is PTI, 2874 * which is enabled automatically on affected Intel CPUs. The most 2875 * impactful hardware features are first PCID, and then INVPCID 2876 * instruction presence. PCID usage is quite different for PTI 2877 * vs. non-PTI. 2878 * 2879 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2880 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2881 * space is served by two page tables, user and kernel. The user 2882 * page table only maps user space and a kernel trampoline. The 2883 * kernel trampoline includes the entirety of the kernel text but 2884 * only the kernel data that is needed to switch from user to kernel 2885 * mode. The kernel page table maps the user and kernel address 2886 * spaces in their entirety. It is identical to the per-process 2887 * page table used in non-PTI mode. 2888 * 2889 * User page tables are only used when the CPU is in user mode. 
2890 * Consequently, some TLB invalidations can be postponed until the 2891 * switch from kernel to user mode. In contrast, the user 2892 * space part of the kernel page table is used for copyout(9), so 2893 * TLB invalidations on this page table cannot be similarly postponed. 2894 * 2895 * The existence of a user mode page table for the given pmap is 2896 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2897 * which case pm_ucr3 contains the %cr3 register value for the user 2898 * mode page table's root. 2899 * 2900 * * The pm_active bitmask indicates which CPUs currently have the 2901 * pmap active. A CPU's bit is set on context switch to the pmap, and 2902 * cleared on switching off this CPU. For the kernel page table, 2903 * the pm_active field is immutable and contains all CPUs. The 2904 * kernel page table is always logically active on every processor, 2905 * but not necessarily in use by the hardware, e.g., in PTI mode. 2906 * 2907 * When requesting invalidation of virtual addresses with 2908 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2909 * all CPUs recorded as active in pm_active. Updates to and reads 2910 * from pm_active are not synchronized, and so they may race with 2911 * each other. Shootdown handlers are prepared to handle the race. 2912 * 2913 * * PCID is an optional feature of the long mode x86 MMU where TLB 2914 * entries are tagged with the 'Process ID' of the address space 2915 * they belong to. This feature provides a limited namespace for 2916 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2917 * total. 2918 * 2919 * Allocation of a PCID to a pmap is done by an algorithm described 2920 * in section 15.12, "Other TLB Consistency Algorithms", of 2921 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2922 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2923 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when 2924 * the CPU is about to start caching TLB entries from a pmap, 2925 * i.e., on the context switch that activates the pmap on the CPU. 2926 * 2927 * The PCID allocator maintains a per-CPU, per-pmap generation 2928 * count, pm_gen, which is incremented each time a new PCID is 2929 * allocated. On TLB invalidation, the generation counters for the 2930 * pmap are zeroed, which signals the context switch code that the 2931 * previously allocated PCID is no longer valid. Effectively, 2932 * zeroing any of these counters triggers a TLB shootdown for the 2933 * given CPU/address space, due to the allocation of a new PCID. 2934 * 2935 * Zeroing can be performed remotely. Consequently, if a pmap is 2936 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2937 * be initiated by an ordinary memory access to reset the target 2938 * CPU's generation count within the pmap. The CPU initiating the 2939 * TLB shootdown does not need to send an IPI to the target CPU. 2940 * 2941 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2942 * for complete (kernel) page tables, and PCIDs for user mode page 2943 * tables. A user PCID value is obtained from the kernel PCID value 2944 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2945 * 2946 * User space page tables are activated on return to user mode, by 2947 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2948 * clearing bit 63 of the loaded ucr3, this effectively causes 2949 * complete invalidation of the user mode TLB entries for the 2950 * current pmap. 
In that case, local invalidations of individual 2951 * pages in the user page table are skipped. 2952 2953 * * Local invalidation, all modes. If the requested invalidation is 2954 * for a specific address or the total invalidation of a currently 2955 * active pmap, then the TLB is flushed using INVLPG for a kernel 2956 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for 2957 * user space page tables. 2958 2959 * If the INVPCID instruction is available, it is used to flush user 2960 * entries from the kernel page table. 2961 2962 * When PCID is enabled, the INVLPG instruction invalidates all TLB 2963 * entries for the given page that either match the current PCID or 2964 * are global. Since TLB entries for the same page under different 2965 * PCIDs are unaffected, kernel pages which reside in all address 2966 * spaces could be problematic. We avoid the problem by creating 2967 * all kernel PTEs with the global flag (PG_G) set, when PTI is 2968 * disabled. 2969 2970 * * PTI disabled, PCID present. The kernel reserves PCID 0 for its 2971 * address space, all other 4095 PCIDs are used for user mode spaces 2972 * as described above. A context switch allocates a new PCID if 2973 * the recorded PCID is zero or the recorded generation does not match 2974 * the CPU's generation, effectively flushing the TLB for this address space. 2975 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2976 * local user page: INVLPG 2977 * local kernel page: INVLPG 2978 * local user total: INVPCID(CTX) 2979 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2980 * remote user page, inactive pmap: zero pm_gen 2981 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2982 * (Both actions are required to handle the aforementioned pm_active races.) 2983 * remote kernel page: IPI:INVLPG 2984 * remote user total, inactive pmap: zero pm_gen 2985 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2986 * reload %cr3) 2987 * (See note above about pm_active races.) 2988 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2989 2990 * PTI enabled, PCID present. 2991 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2992 * for upt 2993 * local kernel page: INVLPG 2994 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 2995 * on loading UCR3 into %cr3 for upt 2996 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2997 * remote user page, inactive pmap: zero pm_gen 2998 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 2999 * INVPCID(ADDR) for upt) 3000 * remote kernel page: IPI:INVLPG 3001 * remote user total, inactive pmap: zero pm_gen 3002 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 3003 * clear PCID_SAVE on loading UCR3 into %cr3 for upt) 3004 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 3005 3006 * No PCID. 3007 * local user page: INVLPG 3008 * local kernel page: INVLPG 3009 * local user total: reload %cr3 3010 * local kernel total: invltlb_glob() 3011 * remote user page, inactive pmap: - 3012 * remote user page, active pmap: IPI:INVLPG 3013 * remote kernel page: IPI:INVLPG 3014 * remote user total, inactive pmap: - 3015 * remote user total, active pmap: IPI:(reload %cr3) 3016 * remote kernel total: IPI:invltlb_glob() 3017 * Since on return to user mode, the reload of %cr3 with ucr3 causes 3018 * TLB invalidation, no specific action is required for the user page table. 3019 3020 * EPT.
EPT pmaps do not map KVA, all mappings are userspace. 3021 * XXX TODO 3022 */ 3023 3024 #ifdef SMP 3025 /* 3026 * Interrupt the cpus that are executing in the guest context. 3027 * This will force the vcpu to exit and the cached EPT mappings 3028 * will be invalidated by the host before the next vmresume. 3029 */ 3030 static __inline void 3031 pmap_invalidate_ept(pmap_t pmap) 3032 { 3033 smr_seq_t goal; 3034 int ipinum; 3035 3036 sched_pin(); 3037 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 3038 ("pmap_invalidate_ept: absurd pm_active")); 3039 3040 /* 3041 * The TLB mappings associated with a vcpu context are not 3042 * flushed each time a different vcpu is chosen to execute. 3043 * 3044 * This is in contrast with a process's vtop mappings that 3045 * are flushed from the TLB on each context switch. 3046 * 3047 * Therefore we need to do more than just a TLB shootdown on 3048 * the active cpus in 'pmap->pm_active'. To do this we keep 3049 * track of the number of invalidations performed on this pmap. 3050 * 3051 * Each vcpu keeps a cache of this counter and compares it 3052 * just before a vmresume. If the counter is out-of-date an 3053 * invept will be done to flush stale mappings from the TLB. 3054 * 3055 * To ensure that all vCPU threads have observed the new counter 3056 * value before returning, we use SMR. Ordering is important here: 3057 * the VMM enters an SMR read section before loading the counter 3058 * and after updating the pm_active bit set. Thus, pm_active is 3059 * a superset of active readers, and any reader that has observed 3060 * the goal has observed the new counter value. 3061 */ 3062 atomic_add_long(&pmap->pm_eptgen, 1); 3063 3064 goal = smr_advance(pmap->pm_eptsmr); 3065 3066 /* 3067 * Force the vcpu to exit and trap back into the hypervisor. 3068 */ 3069 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3070 ipi_selected(pmap->pm_active, ipinum); 3071 sched_unpin(); 3072 3073 /* 3074 * Ensure that all active vCPUs will observe the new generation counter 3075 * value before executing any more guest instructions. 3076 */ 3077 smr_wait(pmap->pm_eptsmr, goal); 3078 } 3079 3080 static inline void 3081 pmap_invalidate_preipi_pcid(pmap_t pmap) 3082 { 3083 struct pmap_pcid *pcidp; 3084 u_int cpuid, i; 3085 3086 sched_pin(); 3087 3088 cpuid = PCPU_GET(cpuid); 3089 if (pmap != PCPU_GET(curpmap)) 3090 cpuid = 0xffffffff; /* An impossible value */ 3091 3092 CPU_FOREACH(i) { 3093 if (cpuid != i) { 3094 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 3095 pcidp->pm_gen = 0; 3096 } 3097 } 3098 3099 /* 3100 * The fence is between stores to pm_gen and the read of the 3101 * pm_active mask. We need to ensure that it is impossible 3102 * for us to miss the bit update in pm_active and 3103 * simultaneously observe a non-zero pm_gen in 3104 * pmap_activate_sw(), otherwise TLB update is missed. 3105 * Without the fence, IA32 allows such an outcome. Note that 3106 * pm_active is updated by a locked operation, which provides 3107 * the reciprocal fence. 3108 */ 3109 atomic_thread_fence_seq_cst(); 3110 } 3111 3112 static void 3113 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3114 { 3115 sched_pin(); 3116 } 3117 3118 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3119 { 3120 return (pmap_pcid_enabled ? 
pmap_invalidate_preipi_pcid : 3121 pmap_invalidate_preipi_nopcid); 3122 } 3123 3124 static inline void 3125 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3126 const bool invpcid_works1) 3127 { 3128 struct invpcid_descr d; 3129 uint64_t kcr3, ucr3; 3130 uint32_t pcid; 3131 3132 /* 3133 * Because pm_pcid is recalculated on a context switch, we 3134 * must ensure there is no preemption, not just pinning. 3135 * Otherwise, we might use a stale value below. 3136 */ 3137 CRITICAL_ASSERT(curthread); 3138 3139 /* 3140 * No need to do anything with user page tables invalidation 3141 * if there is no user page table, or invalidation is deferred 3142 * until the return to userspace. ucr3_load_mask is stable 3143 * because we have preemption disabled. 3144 */ 3145 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3146 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3147 return; 3148 3149 pcid = pmap_get_pcid(pmap); 3150 if (invpcid_works1) { 3151 d.pcid = pcid | PMAP_PCID_USER_PT; 3152 d.pad = 0; 3153 d.addr = va; 3154 invpcid(&d, INVPCID_ADDR); 3155 } else { 3156 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3157 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3158 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3159 } 3160 } 3161 3162 static void 3163 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3164 { 3165 pmap_invalidate_page_pcid_cb(pmap, va, true); 3166 } 3167 3168 static void 3169 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3170 { 3171 pmap_invalidate_page_pcid_cb(pmap, va, false); 3172 } 3173 3174 static void 3175 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3176 { 3177 } 3178 3179 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3180 { 3181 if (pmap_pcid_enabled) 3182 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : 3183 pmap_invalidate_page_pcid_noinvpcid_cb); 3184 return (pmap_invalidate_page_nopcid_cb); 3185 } 3186 3187 static void 3188 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3189 vm_offset_t addr2 __unused) 3190 { 3191 if (pmap == kernel_pmap) { 3192 pmap_invlpg(kernel_pmap, va); 3193 } else if (pmap == PCPU_GET(curpmap)) { 3194 invlpg(va); 3195 pmap_invalidate_page_cb(pmap, va); 3196 } 3197 } 3198 3199 void 3200 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3201 { 3202 if (pmap_type_guest(pmap)) { 3203 pmap_invalidate_ept(pmap); 3204 return; 3205 } 3206 3207 KASSERT(pmap->pm_type == PT_X86, 3208 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3209 3210 pmap_invalidate_preipi(pmap); 3211 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb); 3212 } 3213 3214 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3215 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3216 3217 static void 3218 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3219 const bool invpcid_works1) 3220 { 3221 struct invpcid_descr d; 3222 uint64_t kcr3, ucr3; 3223 uint32_t pcid; 3224 3225 CRITICAL_ASSERT(curthread); 3226 3227 if (pmap != PCPU_GET(curpmap) || 3228 pmap->pm_ucr3 == PMAP_NO_CR3 || 3229 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3230 return; 3231 3232 pcid = pmap_get_pcid(pmap); 3233 if (invpcid_works1) { 3234 d.pcid = pcid | PMAP_PCID_USER_PT; 3235 d.pad = 0; 3236 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3237 invpcid(&d, INVPCID_ADDR); 3238 } else { 3239 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3240 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3241 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3242 } 3243 } 3244 3245 static void 3246 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3247 vm_offset_t eva) 3248 { 3249 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3250 } 3251 3252 static void 3253 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3254 vm_offset_t eva) 3255 { 3256 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3257 } 3258 3259 static void 3260 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3261 vm_offset_t eva __unused) 3262 { 3263 } 3264 3265 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3266 vm_offset_t)) 3267 { 3268 if (pmap_pcid_enabled) 3269 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : 3270 pmap_invalidate_range_pcid_noinvpcid_cb); 3271 return (pmap_invalidate_range_nopcid_cb); 3272 } 3273 3274 static void 3275 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3276 { 3277 vm_offset_t addr; 3278 3279 if (pmap == kernel_pmap) { 3280 if (PCPU_GET(pcid_invlpg_workaround)) { 3281 struct invpcid_descr d = { 0 }; 3282 3283 invpcid(&d, INVPCID_CTXGLOB); 3284 } else { 3285 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3286 invlpg(addr); 3287 } 3288 } else if (pmap == PCPU_GET(curpmap)) { 3289 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3290 invlpg(addr); 3291 pmap_invalidate_range_cb(pmap, sva, eva); 3292 } 3293 } 3294 3295 void 3296 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3297 { 3298 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3299 pmap_invalidate_all(pmap); 3300 return; 3301 } 3302 3303 if (pmap_type_guest(pmap)) { 3304 pmap_invalidate_ept(pmap); 3305 return; 3306 } 3307 3308 KASSERT(pmap->pm_type == PT_X86, 3309 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3310 3311 pmap_invalidate_preipi(pmap); 3312 smp_masked_invlpg_range(sva, eva, pmap, 3313 pmap_invalidate_range_curcpu_cb); 3314 } 3315 3316 static inline void 3317 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3318 { 3319 struct invpcid_descr d; 3320 uint64_t kcr3; 3321 uint32_t pcid; 3322 3323 if (pmap == kernel_pmap) { 3324 if (invpcid_works1) { 3325 bzero(&d, sizeof(d)); 3326 invpcid(&d, INVPCID_CTXGLOB); 3327 } else { 3328 invltlb_glob(); 3329 } 3330 } else if (pmap == PCPU_GET(curpmap)) { 3331 CRITICAL_ASSERT(curthread); 3332 3333 pcid = pmap_get_pcid(pmap); 3334 if (invpcid_works1) { 3335 d.pcid = pcid; 3336 d.pad = 0; 3337 d.addr = 0; 3338 invpcid(&d, INVPCID_CTX); 3339 } else { 3340 kcr3 = pmap->pm_cr3 | pcid; 3341 load_cr3(kcr3); 3342 } 3343 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3344 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3345 } 3346 } 3347 3348 static void 3349 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3350 { 3351 pmap_invalidate_all_pcid_cb(pmap, true); 3352 } 3353 3354 static void 3355 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3356 { 3357 pmap_invalidate_all_pcid_cb(pmap, false); 3358 } 3359 3360 static void 3361 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3362 { 3363 if (pmap == kernel_pmap) 3364 invltlb_glob(); 3365 else if (pmap == PCPU_GET(curpmap)) 3366 invltlb(); 3367 } 3368 3369 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3370 { 3371 if (pmap_pcid_enabled) 3372 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3373 pmap_invalidate_all_pcid_noinvpcid_cb); 3374 return (pmap_invalidate_all_nopcid_cb); 3375 } 3376 3377 static void 3378 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3379 vm_offset_t addr2 __unused) 3380 { 3381 pmap_invalidate_all_cb(pmap); 3382 } 3383 3384 void 3385 pmap_invalidate_all(pmap_t pmap) 3386 { 3387 if (pmap_type_guest(pmap)) { 3388 pmap_invalidate_ept(pmap); 3389 return; 3390 } 3391 3392 KASSERT(pmap->pm_type == PT_X86, 3393 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3394 3395 pmap_invalidate_preipi(pmap); 3396 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb); 3397 } 3398 3399 static void 3400 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3401 vm_offset_t addr2 __unused) 3402 { 3403 wbinvd(); 3404 } 3405 3406 void 3407 pmap_invalidate_cache(void) 3408 { 3409 sched_pin(); 3410 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3411 } 3412 3413 struct pde_action { 3414 cpuset_t invalidate; /* processors that invalidate their TLB */ 3415 pmap_t pmap; 3416 vm_offset_t va; 3417 pd_entry_t *pde; 3418 pd_entry_t newpde; 3419 u_int store; /* processor that updates the PDE */ 3420 }; 3421 3422 static void 3423 pmap_update_pde_action(void *arg) 3424 { 3425 struct pde_action *act = arg; 3426 3427 if (act->store == PCPU_GET(cpuid)) 3428 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3429 } 3430 3431 static void 3432 pmap_update_pde_teardown(void *arg) 3433 { 3434 struct pde_action *act = arg; 3435 3436 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3437 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3438 } 3439 3440 /* 3441 * Change the page size for the specified virtual address in a way that 3442 * prevents any possibility of the TLB ever having two entries that map the 3443 * same virtual address using different page sizes. This is the recommended 3444 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3445 * machine check exception for a TLB state that is improperly diagnosed as a 3446 * hardware error. 3447 */ 3448 static void 3449 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3450 { 3451 struct pde_action act; 3452 cpuset_t active, other_cpus; 3453 u_int cpuid; 3454 3455 sched_pin(); 3456 cpuid = PCPU_GET(cpuid); 3457 other_cpus = all_cpus; 3458 CPU_CLR(cpuid, &other_cpus); 3459 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3460 active = all_cpus; 3461 else { 3462 active = pmap->pm_active; 3463 } 3464 if (CPU_OVERLAP(&active, &other_cpus)) { 3465 act.store = cpuid; 3466 act.invalidate = active; 3467 act.va = va; 3468 act.pmap = pmap; 3469 act.pde = pde; 3470 act.newpde = newpde; 3471 CPU_SET(cpuid, &active); 3472 smp_rendezvous_cpus(active, 3473 smp_no_rendezvous_barrier, pmap_update_pde_action, 3474 pmap_update_pde_teardown, &act); 3475 } else { 3476 pmap_update_pde_store(pmap, pde, newpde); 3477 if (CPU_ISSET(cpuid, &active)) 3478 pmap_update_pde_invalidate(pmap, va, newpde); 3479 } 3480 sched_unpin(); 3481 } 3482 #else /* !SMP */ 3483 /* 3484 * Normal, non-SMP, invalidation functions. 
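 *
 * These mirror the SMP versions above minus the IPIs: only the
 * local TLB ever needs flushing, and for a pmap that is not the
 * current one it suffices to zero the per-CPU pm_gen generation
 * counter.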
3485 */ 3486 void 3487 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3488 { 3489 struct invpcid_descr d; 3490 struct pmap_pcid *pcidp; 3491 uint64_t kcr3, ucr3; 3492 uint32_t pcid; 3493 3494 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3495 pmap->pm_eptgen++; 3496 return; 3497 } 3498 KASSERT(pmap->pm_type == PT_X86, 3499 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3500 3501 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3502 invlpg(va); 3503 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3504 pmap->pm_ucr3 != PMAP_NO_CR3) { 3505 critical_enter(); 3506 pcid = pmap_get_pcid(pmap); 3507 if (invpcid_works) { 3508 d.pcid = pcid | PMAP_PCID_USER_PT; 3509 d.pad = 0; 3510 d.addr = va; 3511 invpcid(&d, INVPCID_ADDR); 3512 } else { 3513 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3514 ucr3 = pmap->pm_ucr3 | pcid | 3515 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3516 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3517 } 3518 critical_exit(); 3519 } 3520 } else if (pmap_pcid_enabled) { 3521 pcidp = zpcpu_get(pmap->pm_pcidp); 3522 pcidp->pm_gen = 0; 3523 } 3524 } 3525 3526 void 3527 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3528 { 3529 struct invpcid_descr d; 3530 struct pmap_pcid *pcidp; 3531 vm_offset_t addr; 3532 uint64_t kcr3, ucr3; 3533 uint32_t pcid; 3534 3535 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3536 pmap->pm_eptgen++; 3537 return; 3538 } 3539 KASSERT(pmap->pm_type == PT_X86, 3540 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3541 3542 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3543 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3544 invlpg(addr); 3545 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3546 pmap->pm_ucr3 != PMAP_NO_CR3) { 3547 critical_enter(); 3548 pcid = pmap_get_pcid(pmap); 3549 if (invpcid_works) { 3550 d.pcid = pcid | PMAP_PCID_USER_PT; 3551 d.pad = 0; 3552 d.addr = sva; 3553 for (; d.addr < eva; d.addr += PAGE_SIZE) 3554 invpcid(&d, INVPCID_ADDR); 3555 } else { 3556 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3557 ucr3 = pmap->pm_ucr3 | pcid | 3558 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3559 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3560 } 3561 critical_exit(); 3562 } 3563 } else if (pmap_pcid_enabled) { 3564 pcidp = zpcpu_get(pmap->pm_pcidp); 3565 pcidp->pm_gen = 0; 3566 } 3567 } 3568 3569 void 3570 pmap_invalidate_all(pmap_t pmap) 3571 { 3572 struct invpcid_descr d; 3573 struct pmap_pcid *pcidp; 3574 uint64_t kcr3, ucr3; 3575 uint32_t pcid; 3576 3577 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3578 pmap->pm_eptgen++; 3579 return; 3580 } 3581 KASSERT(pmap->pm_type == PT_X86, 3582 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3583 3584 if (pmap == kernel_pmap) { 3585 if (pmap_pcid_enabled && invpcid_works) { 3586 bzero(&d, sizeof(d)); 3587 invpcid(&d, INVPCID_CTXGLOB); 3588 } else { 3589 invltlb_glob(); 3590 } 3591 } else if (pmap == PCPU_GET(curpmap)) { 3592 if (pmap_pcid_enabled) { 3593 critical_enter(); 3594 pcid = pmap_get_pcid(pmap); 3595 if (invpcid_works) { 3596 d.pcid = pcid; 3597 d.pad = 0; 3598 d.addr = 0; 3599 invpcid(&d, INVPCID_CTX); 3600 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3601 d.pcid |= PMAP_PCID_USER_PT; 3602 invpcid(&d, INVPCID_CTX); 3603 } 3604 } else { 3605 kcr3 = pmap->pm_cr3 | pcid; 3606 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3607 ucr3 = pmap->pm_ucr3 | pcid | 3608 PMAP_PCID_USER_PT; 3609 pmap_pti_pcid_invalidate(ucr3, kcr3); 3610 } else 3611 load_cr3(kcr3); 3612 } 3613 critical_exit(); 3614 } else { 3615 invltlb(); 3616 
} 3617 } else if (pmap_pcid_enabled) { 3618 pcidp = zpcpu_get(pmap->pm_pcidp); 3619 pcidp->pm_gen = 0; 3620 } 3621 } 3622 3623 PMAP_INLINE void 3624 pmap_invalidate_cache(void) 3625 { 3626 3627 wbinvd(); 3628 } 3629 3630 static void 3631 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3632 { 3633 struct pmap_pcid *pcidp; 3634 3635 pmap_update_pde_store(pmap, pde, newpde); 3636 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3637 pmap_update_pde_invalidate(pmap, va, newpde); 3638 else { 3639 pcidp = zpcpu_get(pmap->pm_pcidp); 3640 pcidp->pm_gen = 0; 3641 } 3642 } 3643 #endif /* !SMP */ 3644 3645 static void 3646 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3647 { 3648 3649 /* 3650 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3651 * by a promotion that did not invalidate the 512 4KB page mappings 3652 * that might exist in the TLB. Consequently, at this point, the TLB 3653 * may hold both 4KB and 2MB page mappings for the address range [va, 3654 * va + NBPDR). Therefore, the entire range must be invalidated here. 3655 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3656 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3657 * single INVLPG suffices to invalidate the 2MB page mapping from the 3658 * TLB. 3659 */ 3660 if ((pde & PG_PROMOTED) != 0) 3661 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3662 else 3663 pmap_invalidate_page(pmap, va); 3664 } 3665 3666 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3667 (vm_offset_t sva, vm_offset_t eva)) 3668 { 3669 3670 if ((cpu_feature & CPUID_SS) != 0) 3671 return (pmap_invalidate_cache_range_selfsnoop); 3672 if ((cpu_feature & CPUID_CLFSH) != 0) 3673 return (pmap_force_invalidate_cache_range); 3674 return (pmap_invalidate_cache_range_all); 3675 } 3676 3677 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3678 3679 static void 3680 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3681 { 3682 3683 KASSERT((sva & PAGE_MASK) == 0, 3684 ("pmap_invalidate_cache_range: sva not page-aligned")); 3685 KASSERT((eva & PAGE_MASK) == 0, 3686 ("pmap_invalidate_cache_range: eva not page-aligned")); 3687 } 3688 3689 static void 3690 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3691 { 3692 3693 pmap_invalidate_cache_range_check_align(sva, eva); 3694 } 3695 3696 void 3697 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3698 { 3699 3700 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3701 3702 /* 3703 * XXX: Some CPUs fault, hang, or trash the local APIC 3704 * registers if we use CLFLUSH on the local APIC range. The 3705 * local APIC is always uncached, so we don't need to flush 3706 * for that range anyway. 3707 */ 3708 if (pmap_kextract(sva) == lapic_paddr) 3709 return; 3710 3711 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3712 /* 3713 * Flush each cache line. Use a locked 3714 * instruction to ensure that previous stores are 3715 * included in the write-back. The processor 3716 * propagates the flush to other processors in the cache 3717 * coherence domain. 3718 */ 3719 atomic_thread_fence_seq_cst(); 3720 for (; sva < eva; sva += cpu_clflush_line_size) 3721 clflushopt(sva); 3722 atomic_thread_fence_seq_cst(); 3723 } else { 3724 /* 3725 * Writes are ordered by CLFLUSH on Intel CPUs.
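 * On non-Intel CPUs, CLFLUSH is only weakly ordered with respect
 * to other stores, so the flush loop below is bracketed by MFENCE
 * on those processors.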
3726 */ 3727 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3728 mfence(); 3729 for (; sva < eva; sva += cpu_clflush_line_size) 3730 clflush(sva); 3731 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3732 mfence(); 3733 } 3734 } 3735 3736 static void 3737 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3738 { 3739 3740 pmap_invalidate_cache_range_check_align(sva, eva); 3741 pmap_invalidate_cache(); 3742 } 3743 3744 /* 3745 * Remove the specified set of pages from the data and instruction caches. 3746 * 3747 * In contrast to pmap_invalidate_cache_range(), this function does not 3748 * rely on the CPU's self-snoop feature, because it is intended for use 3749 * when moving pages into a different cache domain. 3750 */ 3751 void 3752 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3753 { 3754 vm_offset_t daddr, eva; 3755 int i; 3756 bool useclflushopt; 3757 3758 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3759 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3760 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3761 pmap_invalidate_cache(); 3762 else { 3763 if (useclflushopt) 3764 atomic_thread_fence_seq_cst(); 3765 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3766 mfence(); 3767 for (i = 0; i < count; i++) { 3768 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3769 eva = daddr + PAGE_SIZE; 3770 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3771 if (useclflushopt) 3772 clflushopt(daddr); 3773 else 3774 clflush(daddr); 3775 } 3776 } 3777 if (useclflushopt) 3778 atomic_thread_fence_seq_cst(); 3779 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3780 mfence(); 3781 } 3782 } 3783 3784 void 3785 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3786 { 3787 3788 pmap_invalidate_cache_range_check_align(sva, eva); 3789 3790 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3791 pmap_force_invalidate_cache_range(sva, eva); 3792 return; 3793 } 3794 3795 /* See comment in pmap_force_invalidate_cache_range(). */ 3796 if (pmap_kextract(sva) == lapic_paddr) 3797 return; 3798 3799 atomic_thread_fence_seq_cst(); 3800 for (; sva < eva; sva += cpu_clflush_line_size) 3801 clwb(sva); 3802 atomic_thread_fence_seq_cst(); 3803 } 3804 3805 void 3806 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3807 { 3808 pt_entry_t *pte; 3809 vm_offset_t vaddr; 3810 int error __diagused; 3811 int pte_bits; 3812 3813 KASSERT((spa & PAGE_MASK) == 0, 3814 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3815 KASSERT((epa & PAGE_MASK) == 0, 3816 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3817 3818 if (spa < dmaplimit) { 3819 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3820 dmaplimit, epa))); 3821 if (dmaplimit >= epa) 3822 return; 3823 spa = dmaplimit; 3824 } 3825 3826 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 3827 X86_PG_V; 3828 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3829 &vaddr); 3830 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3831 pte = vtopte(vaddr); 3832 for (; spa < epa; spa += PAGE_SIZE) { 3833 sched_pin(); 3834 pte_store(pte, spa | pte_bits); 3835 pmap_invlpg(kernel_pmap, vaddr); 3836 /* XXXKIB atomic inside flush_cache_range are excessive */ 3837 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3838 sched_unpin(); 3839 } 3840 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3841 } 3842 3843 /* 3844 * Routine: pmap_extract 3845 * Function: 3846 * Extract the physical page address associated 3847 * with the given map/virtual_address pair. 
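 *
 *	An illustrative sketch of a hypothetical caller; 0 is returned
 *	when no valid mapping exists, so a result of 0 must be treated
 *	as "not mapped" rather than as physical address zero:
 *
 *		pa = pmap_extract(kernel_pmap, va);
 *		if (pa == 0)
 *			... no valid mapping; handle the failure ...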
3848 */ 3849 vm_paddr_t 3850 pmap_extract(pmap_t pmap, vm_offset_t va) 3851 { 3852 pdp_entry_t *pdpe; 3853 pd_entry_t *pde; 3854 pt_entry_t *pte, PG_V; 3855 vm_paddr_t pa; 3856 3857 pa = 0; 3858 PG_V = pmap_valid_bit(pmap); 3859 PMAP_LOCK(pmap); 3860 pdpe = pmap_pdpe(pmap, va); 3861 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3862 if ((*pdpe & PG_PS) != 0) 3863 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3864 else { 3865 pde = pmap_pdpe_to_pde(pdpe, va); 3866 if ((*pde & PG_V) != 0) { 3867 if ((*pde & PG_PS) != 0) { 3868 pa = (*pde & PG_PS_FRAME) | 3869 (va & PDRMASK); 3870 } else { 3871 pte = pmap_pde_to_pte(pde, va); 3872 pa = (*pte & PG_FRAME) | 3873 (va & PAGE_MASK); 3874 } 3875 } 3876 } 3877 } 3878 PMAP_UNLOCK(pmap); 3879 return (pa); 3880 } 3881 3882 /* 3883 * Routine: pmap_extract_and_hold 3884 * Function: 3885 * Atomically extract and hold the physical page 3886 * with the given pmap and virtual address pair 3887 * if that mapping permits the given protection. 3888 */ 3889 vm_page_t 3890 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3891 { 3892 pdp_entry_t pdpe, *pdpep; 3893 pd_entry_t pde, *pdep; 3894 pt_entry_t pte, PG_RW, PG_V; 3895 vm_page_t m; 3896 3897 m = NULL; 3898 PG_RW = pmap_rw_bit(pmap); 3899 PG_V = pmap_valid_bit(pmap); 3900 PMAP_LOCK(pmap); 3901 3902 pdpep = pmap_pdpe(pmap, va); 3903 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3904 goto out; 3905 if ((pdpe & PG_PS) != 0) { 3906 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3907 goto out; 3908 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3909 goto check_page; 3910 } 3911 3912 pdep = pmap_pdpe_to_pde(pdpep, va); 3913 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3914 goto out; 3915 if ((pde & PG_PS) != 0) { 3916 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3917 goto out; 3918 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3919 goto check_page; 3920 } 3921 3922 pte = *pmap_pde_to_pte(pdep, va); 3923 if ((pte & PG_V) == 0 || 3924 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3925 goto out; 3926 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3927 3928 check_page: 3929 if (m != NULL && !vm_page_wire_mapped(m)) 3930 m = NULL; 3931 out: 3932 PMAP_UNLOCK(pmap); 3933 return (m); 3934 } 3935 3936 vm_paddr_t 3937 pmap_kextract(vm_offset_t va) 3938 { 3939 pd_entry_t pde; 3940 vm_paddr_t pa; 3941 3942 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3943 pa = DMAP_TO_PHYS(va); 3944 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3945 pa = pmap_large_map_kextract(va); 3946 } else { 3947 pde = *vtopde(va); 3948 if (pde & PG_PS) { 3949 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3950 } else { 3951 /* 3952 * Beware of a concurrent promotion that changes the 3953 * PDE at this point! For example, vtopte() must not 3954 * be used to access the PTE because it would use the 3955 * new PDE. It is, however, safe to use the old PDE 3956 * because the page table page is preserved by the 3957 * promotion. 3958 */ 3959 pa = *pmap_pde_to_pte(&pde, va); 3960 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3961 } 3962 } 3963 return (pa); 3964 } 3965 3966 /*************************************************** 3967 * Low level mapping routines..... 3968 ***************************************************/ 3969 3970 /* 3971 * Add a wired page to the kva. 3972 * Note: not SMP coherent. 
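 * "Not SMP coherent" means that no TLB invalidation is performed
 * here; a caller that overwrites an existing mapping must issue the
 * invalidation itself, e.g. pmap_invalidate_page(kernel_pmap, va)
 * for a single page, as pmap_qenter() below does for ranges.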
3973 */ 3974 PMAP_INLINE void 3975 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3976 { 3977 pt_entry_t *pte; 3978 3979 pte = vtopte(va); 3980 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3981 X86_PG_RW | X86_PG_V); 3982 } 3983 3984 static __inline void 3985 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3986 { 3987 pt_entry_t *pte; 3988 int cache_bits; 3989 3990 pte = vtopte(va); 3991 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 3992 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3993 X86_PG_RW | X86_PG_V | cache_bits); 3994 } 3995 3996 /* 3997 * Remove a page from the kernel pagetables. 3998 * Note: not SMP coherent. 3999 */ 4000 PMAP_INLINE void 4001 pmap_kremove(vm_offset_t va) 4002 { 4003 pt_entry_t *pte; 4004 4005 pte = vtopte(va); 4006 pte_clear(pte); 4007 } 4008 4009 /* 4010 * Used to map a range of physical addresses into kernel 4011 * virtual address space. 4012 * 4013 * The value passed in '*virt' is a suggested virtual address for 4014 * the mapping. Architectures which can support a direct-mapped 4015 * physical to virtual region can return the appropriate address 4016 * within that region, leaving '*virt' unchanged. Other 4017 * architectures should map the pages starting at '*virt' and 4018 * update '*virt' with the first usable address after the mapped 4019 * region. 4020 */ 4021 vm_offset_t 4022 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 4023 { 4024 return (PHYS_TO_DMAP(start)); 4025 } 4026 4027 /* 4028 * Add a list of wired pages to the kva. This 4029 * routine is only used for temporary 4030 * kernel mappings that do not need to have 4031 * page modification or references recorded. 4032 * Note that old mappings are simply written 4033 * over. The page *must* be wired. 4034 * Note: SMP coherent. Uses a ranged shootdown IPI. 4035 */ 4036 void 4037 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4038 { 4039 pt_entry_t *endpte, oldpte, pa, *pte; 4040 vm_page_t m; 4041 int cache_bits; 4042 4043 oldpte = 0; 4044 pte = vtopte(sva); 4045 endpte = pte + count; 4046 while (pte < endpte) { 4047 m = *ma++; 4048 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 4049 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 4050 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 4051 oldpte |= *pte; 4052 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | 4053 X86_PG_M | X86_PG_RW | X86_PG_V); 4054 } 4055 pte++; 4056 } 4057 if (__predict_false((oldpte & X86_PG_V) != 0)) 4058 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4059 PAGE_SIZE); 4060 } 4061 4062 /* 4063 * This routine tears out page mappings from the 4064 * kernel -- it is meant only for temporary mappings. 4065 * Note: SMP coherent. Uses a ranged shootdown IPI. 4066 */ 4067 void 4068 pmap_qremove(vm_offset_t sva, int count) 4069 { 4070 vm_offset_t va; 4071 4072 va = sva; 4073 while (count-- > 0) { 4074 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 4075 pmap_kremove(va); 4076 va += PAGE_SIZE; 4077 } 4078 pmap_invalidate_range(kernel_pmap, sva, va); 4079 } 4080 4081 /*************************************************** 4082 * Page table page management routines..... 4083 ***************************************************/ 4084 /* 4085 * Schedule the specified unused page table page to be freed. Specifically, 4086 * add the page to the specified list of pages that will be released to the 4087 * physical memory manager after the TLB has been updated.
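 *
 * An illustrative sketch of the usual calling pattern (compare
 * pmap_abort_ptp() below):
 *
 *	SLIST_INIT(&free);
 *	... unwire page table pages, queueing each onto "free" ...
 *	pmap_invalidate_page(pmap, va);	(or a wider invalidation)
 *	vm_page_free_pages_toq(&free, true);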
4088 */ 4089 static __inline void 4090 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4091 boolean_t set_PG_ZERO) 4092 { 4093 4094 if (set_PG_ZERO) 4095 m->flags |= PG_ZERO; 4096 else 4097 m->flags &= ~PG_ZERO; 4098 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4099 } 4100 4101 /* 4102 * Inserts the specified page table page into the specified pmap's collection 4103 * of idle page table pages. Each of a pmap's page table pages is responsible 4104 * for mapping a distinct range of virtual addresses. The pmap's collection is 4105 * ordered by this virtual address range. 4106 * 4107 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4108 * "mpte"'s valid field will be set to 0. 4109 * 4110 * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must 4111 * contain valid mappings with identical attributes except for PG_A; "mpte"'s 4112 * valid field will be set to 1. 4113 * 4114 * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain 4115 * valid mappings with identical attributes including PG_A; "mpte"'s valid 4116 * field will be set to VM_PAGE_BITS_ALL. 4117 */ 4118 static __inline int 4119 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4120 bool allpte_PG_A_set) 4121 { 4122 4123 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4124 KASSERT(promoted || !allpte_PG_A_set, 4125 ("a zero-filled PTP can't have PG_A set in every PTE")); 4126 mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 4127 return (vm_radix_insert(&pmap->pm_root, mpte)); 4128 } 4129 4130 /* 4131 * Removes the page table page mapping the specified virtual address from the 4132 * specified pmap's collection of idle page table pages, and returns it. 4133 * Returns NULL if there is no page table page corresponding to the 4134 * specified virtual address. 4135 */ 4136 static __inline vm_page_t 4137 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4138 { 4139 4140 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4141 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4142 } 4143 4144 /* 4145 * Decrements a page table page's reference count, which is used to record the 4146 * number of valid page table entries within the page. If the reference count 4147 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4148 * page table page was unmapped and FALSE otherwise.
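 * For example, a page table page in which all 512 PTEs are valid has
 * a ref_count of 512; only when the last valid PTE is destroyed does
 * the count reach zero and the page become unmapped and freed.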
4149 */ 4150 static inline boolean_t 4151 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4152 { 4153 4154 --m->ref_count; 4155 if (m->ref_count == 0) { 4156 _pmap_unwire_ptp(pmap, va, m, free); 4157 return (TRUE); 4158 } else 4159 return (FALSE); 4160 } 4161 4162 static void 4163 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4164 { 4165 pml5_entry_t *pml5; 4166 pml4_entry_t *pml4; 4167 pdp_entry_t *pdp; 4168 pd_entry_t *pd; 4169 vm_page_t pdpg, pdppg, pml4pg; 4170 4171 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4172 4173 /* 4174 * unmap the page table page 4175 */ 4176 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4177 /* PML4 page */ 4178 MPASS(pmap_is_la57(pmap)); 4179 pml5 = pmap_pml5e(pmap, va); 4180 *pml5 = 0; 4181 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4182 pml5 = pmap_pml5e_u(pmap, va); 4183 *pml5 = 0; 4184 } 4185 } else if (m->pindex >= NUPDE + NUPDPE) { 4186 /* PDP page */ 4187 pml4 = pmap_pml4e(pmap, va); 4188 *pml4 = 0; 4189 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4190 va <= VM_MAXUSER_ADDRESS) { 4191 pml4 = pmap_pml4e_u(pmap, va); 4192 *pml4 = 0; 4193 } 4194 } else if (m->pindex >= NUPDE) { 4195 /* PD page */ 4196 pdp = pmap_pdpe(pmap, va); 4197 *pdp = 0; 4198 } else { 4199 /* PTE page */ 4200 pd = pmap_pde(pmap, va); 4201 *pd = 0; 4202 } 4203 if (m->pindex < NUPDE) { 4204 /* We just released a PT, unhold the matching PD */ 4205 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4206 pmap_unwire_ptp(pmap, va, pdpg, free); 4207 } else if (m->pindex < NUPDE + NUPDPE) { 4208 /* We just released a PD, unhold the matching PDP */ 4209 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4210 pmap_unwire_ptp(pmap, va, pdppg, free); 4211 } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4212 /* We just released a PDP, unhold the matching PML4 */ 4213 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4214 pmap_unwire_ptp(pmap, va, pml4pg, free); 4215 } 4216 4217 pmap_pt_page_count_adj(pmap, -1); 4218 4219 /* 4220 * Put page on a list so that it is released after 4221 * *ALL* TLB shootdown is done 4222 */ 4223 pmap_add_delayed_free_list(m, free, TRUE); 4224 } 4225 4226 /* 4227 * After removing a page table entry, this routine is used to 4228 * conditionally free the page, and manage the reference count. 4229 */ 4230 static int 4231 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4232 struct spglist *free) 4233 { 4234 vm_page_t mpte; 4235 4236 if (va >= VM_MAXUSER_ADDRESS) 4237 return (0); 4238 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4239 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4240 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4241 } 4242 4243 /* 4244 * Release a page table page reference after a failed attempt to create a 4245 * mapping. 4246 */ 4247 static void 4248 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4249 { 4250 struct spglist free; 4251 4252 SLIST_INIT(&free); 4253 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4254 /* 4255 * Although "va" was never mapped, paging-structure caches 4256 * could nonetheless have entries that refer to the freed 4257 * page table pages. Invalidate those entries. 
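 * (On x86, INVLPG also invalidates paging-structure-cache entries,
 * which is why the single-page invalidation below suffices.)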
4258 */ 4259 pmap_invalidate_page(pmap, va); 4260 vm_page_free_pages_toq(&free, true); 4261 } 4262 } 4263 4264 static void 4265 pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen) 4266 { 4267 struct pmap_pcid *pcidp; 4268 int i; 4269 4270 CPU_FOREACH(i) { 4271 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 4272 pcidp->pm_pcid = pcid; 4273 pcidp->pm_gen = gen; 4274 } 4275 } 4276 4277 void 4278 pmap_pinit0(pmap_t pmap) 4279 { 4280 struct proc *p; 4281 struct thread *td; 4282 4283 PMAP_LOCK_INIT(pmap); 4284 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4285 pmap->pm_pmltopu = NULL; 4286 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4287 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4288 pmap->pm_ucr3 = PMAP_NO_CR3; 4289 vm_radix_init(&pmap->pm_root); 4290 CPU_ZERO(&pmap->pm_active); 4291 TAILQ_INIT(&pmap->pm_pvchunk); 4292 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4293 pmap->pm_flags = pmap_flags; 4294 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK); 4295 pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1); 4296 pmap_activate_boot(pmap); 4297 td = curthread; 4298 if (pti) { 4299 p = td->td_proc; 4300 PROC_LOCK(p); 4301 p->p_md.md_flags |= P_MD_KPTI; 4302 PROC_UNLOCK(p); 4303 } 4304 pmap_thread_init_invl_gen(td); 4305 4306 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4307 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4308 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4309 UMA_ALIGN_PTR, 0); 4310 } 4311 } 4312 4313 void 4314 pmap_pinit_pml4(vm_page_t pml4pg) 4315 { 4316 pml4_entry_t *pm_pml4; 4317 int i; 4318 4319 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4320 4321 /* Wire in kernel global address entries. */ 4322 for (i = 0; i < NKPML4E; i++) { 4323 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4324 X86_PG_V; 4325 } 4326 #ifdef KASAN 4327 for (i = 0; i < NKASANPML4E; i++) { 4328 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4329 X86_PG_V | pg_nx; 4330 } 4331 #endif 4332 #ifdef KMSAN 4333 for (i = 0; i < NKMSANSHADPML4E; i++) { 4334 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4335 X86_PG_RW | X86_PG_V | pg_nx; 4336 } 4337 for (i = 0; i < NKMSANORIGPML4E; i++) { 4338 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4339 X86_PG_RW | X86_PG_V | pg_nx; 4340 } 4341 #endif 4342 for (i = 0; i < ndmpdpphys; i++) { 4343 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4344 X86_PG_V; 4345 } 4346 4347 /* install self-referential address mapping entry(s) */ 4348 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4349 X86_PG_A | X86_PG_M; 4350 4351 /* install large map entries if configured */ 4352 for (i = 0; i < lm_ents; i++) 4353 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4354 } 4355 4356 void 4357 pmap_pinit_pml5(vm_page_t pml5pg) 4358 { 4359 pml5_entry_t *pm_pml5; 4360 4361 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4362 4363 /* 4364 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4365 * entering all existing kernel mappings into level 5 table. 4366 */ 4367 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4368 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4369 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4370 4371 /* 4372 * Install self-referential address mapping entry. 
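 * The self-referential entry makes the pmap's own paging structures
 * visible within a fixed region of KVA: a lookup that recurses
 * through this slot one or more times resolves to page-table pages
 * rather than data pages, which is how helpers in the vtopte()
 * style compute the addresses of individual entries.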
4373 */ 4374 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4375 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4376 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4377 } 4378 4379 static void 4380 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4381 { 4382 pml4_entry_t *pm_pml4u; 4383 int i; 4384 4385 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4386 for (i = 0; i < NPML4EPG; i++) 4387 pm_pml4u[i] = pti_pml4[i]; 4388 } 4389 4390 static void 4391 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4392 { 4393 pml5_entry_t *pm_pml5u; 4394 4395 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4396 pagezero(pm_pml5u); 4397 4398 /* 4399 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4400 * table, entering all kernel mappings needed for usermode 4401 * into level 5 table. 4402 */ 4403 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4404 pmap_kextract((vm_offset_t)pti_pml4) | 4405 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4406 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4407 } 4408 4409 /* Allocate a page table page and do related bookkeeping */ 4410 static vm_page_t 4411 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4412 { 4413 vm_page_t m; 4414 4415 m = vm_page_alloc_noobj(flags); 4416 if (__predict_false(m == NULL)) 4417 return (NULL); 4418 m->pindex = pindex; 4419 pmap_pt_page_count_adj(pmap, 1); 4420 return (m); 4421 } 4422 4423 static void 4424 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4425 { 4426 /* 4427 * This function assumes the page will need to be unwired, 4428 * even though the counterpart allocation in pmap_alloc_pt_page() 4429 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4430 * of pmap_free_pt_page() require unwiring. The case in which 4431 * a PT page doesn't require unwiring because its ref_count has 4432 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4433 */ 4434 vm_page_unwire_noq(m); 4435 if (zerofilled) 4436 vm_page_free_zero(m); 4437 else 4438 vm_page_free(m); 4439 4440 pmap_pt_page_count_adj(pmap, -1); 4441 } 4442 4443 _Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp"); 4444 4445 /* 4446 * Initialize a preallocated and zeroed pmap structure, 4447 * such as one in a vmspace structure. 4448 */ 4449 int 4450 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4451 { 4452 vm_page_t pmltop_pg, pmltop_pgu; 4453 vm_paddr_t pmltop_phys; 4454 4455 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4456 4457 /* 4458 * Allocate the page directory page. Pass NULL instead of a 4459 * pointer to the pmap here to avoid calling 4460 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4461 * since that requires the pmap lock. Instead do the accounting 4462 * manually. 4463 * 4464 * Note that the optimization in the final call to pmap_remove(), 4465 * which checks for a zero resident_count, is effectively disabled 4466 * by accounting for the top-level page. That optimization was 4467 * already ineffective once we started using a non-managed mapping 4468 * of the shared page.
4469 */ 4470 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4471 VM_ALLOC_WAITOK); 4472 pmap_pt_page_count_pinit(pmap, 1); 4473 4474 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4475 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4476 4477 if (pmap_pcid_enabled) { 4478 if (pmap->pm_pcidp == NULL) 4479 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, 4480 M_WAITOK); 4481 pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0); 4482 } 4483 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4484 pmap->pm_ucr3 = PMAP_NO_CR3; 4485 pmap->pm_pmltopu = NULL; 4486 4487 pmap->pm_type = pm_type; 4488 4489 /* 4490 * Do not install the host kernel mappings in the nested page 4491 * tables. These mappings are meaningless in the guest physical 4492 * address space. 4493 * Install minimal kernel mappings in the PTI case. 4494 */ 4495 switch (pm_type) { 4496 case PT_X86: 4497 pmap->pm_cr3 = pmltop_phys; 4498 if (pmap_is_la57(pmap)) 4499 pmap_pinit_pml5(pmltop_pg); 4500 else 4501 pmap_pinit_pml4(pmltop_pg); 4502 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4503 /* 4504 * As with pmltop_pg, pass NULL instead of a 4505 * pointer to the pmap to ensure that the PTI 4506 * page is counted explicitly. 4507 */ 4508 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4509 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4510 pmap_pt_page_count_pinit(pmap, 1); 4511 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4512 VM_PAGE_TO_PHYS(pmltop_pgu)); 4513 if (pmap_is_la57(pmap)) 4514 pmap_pinit_pml5_pti(pmltop_pgu); 4515 else 4516 pmap_pinit_pml4_pti(pmltop_pgu); 4517 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4518 } 4519 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4520 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4521 pkru_free_range, pmap, M_NOWAIT); 4522 } 4523 break; 4524 case PT_EPT: 4525 case PT_RVI: 4526 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4527 break; 4528 } 4529 4530 vm_radix_init(&pmap->pm_root); 4531 CPU_ZERO(&pmap->pm_active); 4532 TAILQ_INIT(&pmap->pm_pvchunk); 4533 pmap->pm_flags = flags; 4534 pmap->pm_eptgen = 0; 4535 4536 return (1); 4537 } 4538 4539 int 4540 pmap_pinit(pmap_t pmap) 4541 { 4542 4543 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4544 } 4545 4546 static void 4547 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4548 { 4549 vm_page_t mpg; 4550 struct spglist free; 4551 4552 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4553 if (mpg->ref_count != 0) 4554 return; 4555 SLIST_INIT(&free); 4556 _pmap_unwire_ptp(pmap, va, mpg, &free); 4557 pmap_invalidate_page(pmap, va); 4558 vm_page_free_pages_toq(&free, true); 4559 } 4560 4561 static pml4_entry_t * 4562 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4563 bool addref) 4564 { 4565 vm_pindex_t pml5index; 4566 pml5_entry_t *pml5; 4567 pml4_entry_t *pml4; 4568 vm_page_t pml4pg; 4569 pt_entry_t PG_V; 4570 bool allocated; 4571 4572 if (!pmap_is_la57(pmap)) 4573 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4574 4575 PG_V = pmap_valid_bit(pmap); 4576 pml5index = pmap_pml5e_index(va); 4577 pml5 = &pmap->pm_pmltop[pml5index]; 4578 if ((*pml5 & PG_V) == 0) { 4579 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4580 va) == NULL) 4581 return (NULL); 4582 allocated = true; 4583 } else { 4584 allocated = false; 4585 } 4586 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4587 pml4 = &pml4[pmap_pml4e_index(va)]; 4588 if ((*pml4 & PG_V) == 0) { 4589 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4590 if (allocated && !addref) 4591 pml4pg->ref_count--; 4592 else if
(!allocated && addref) 4593 pml4pg->ref_count++; 4594 } 4595 return (pml4); 4596 } 4597 4598 static pdp_entry_t * 4599 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4600 bool addref) 4601 { 4602 vm_page_t pdppg; 4603 pml4_entry_t *pml4; 4604 pdp_entry_t *pdp; 4605 pt_entry_t PG_V; 4606 bool allocated; 4607 4608 PG_V = pmap_valid_bit(pmap); 4609 4610 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); 4611 if (pml4 == NULL) 4612 return (NULL); 4613 4614 if ((*pml4 & PG_V) == 0) { 4615 /* Have to allocate a new pdp, recurse */ 4616 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, 4617 va) == NULL) { 4618 if (pmap_is_la57(pmap)) 4619 pmap_allocpte_free_unref(pmap, va, 4620 pmap_pml5e(pmap, va)); 4621 return (NULL); 4622 } 4623 allocated = true; 4624 } else { 4625 allocated = false; 4626 } 4627 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 4628 pdp = &pdp[pmap_pdpe_index(va)]; 4629 if ((*pdp & PG_V) == 0) { 4630 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 4631 if (allocated && !addref) 4632 pdppg->ref_count--; 4633 else if (!allocated && addref) 4634 pdppg->ref_count++; 4635 } 4636 return (pdp); 4637 } 4638 4639 /* 4640 * The ptepindexes, i.e. page indices, of the page table pages encountered 4641 * while translating virtual address va are defined as follows: 4642 * - for the page table page (last level), 4643 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, 4644 * in other words, it is just the index of the PDE that maps the page 4645 * table page. 4646 * - for the page directory page, 4647 * ptepindex = NUPDE (number of userland PD entries) + 4648 * (pmap_pde_pindex(va) >> NPDEPGSHIFT) 4649 * i.e. index of PDPE is put after the last index of PDE, 4650 * - for the page directory pointer page, 4651 * ptepindex = NUPDE + NUPDPE + (pmap_pde_pindex(va) >> (NPDEPGSHIFT + 4652 * NPML4EPGSHIFT)), 4653 * i.e. index of pml4e is put after the last index of PDPE, 4654 * - for the PML4 page (if LA57 mode is enabled), 4655 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_pindex(va) >> 4656 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)), 4657 * i.e. index of pml5e is put after the last index of PML4E. 4658 * 4659 * Define an order on the paging entries, where all entries of the 4660 * same height are put together, then heights are put from deepest to 4661 * root. Then ptepindex is the sequential number of the 4662 * corresponding paging entry in this order. 4663 * 4664 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of 4665 * LA57 paging structures even in LA48 paging mode. Moreover, the 4666 * ptepindexes are calculated as if the paging structures were 5-level 4667 * regardless of the actual mode of operation. 4668 * 4669 * The root page at PML4/PML5 does not participate in this indexing scheme, 4670 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). 4671 */ 4672 static vm_page_t 4673 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4674 vm_offset_t va) 4675 { 4676 vm_pindex_t pml5index, pml4index; 4677 pml5_entry_t *pml5, *pml5u; 4678 pml4_entry_t *pml4, *pml4u; 4679 pdp_entry_t *pdp; 4680 pd_entry_t *pd; 4681 vm_page_t m, pdpg; 4682 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4683 4684 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4685 4686 PG_A = pmap_accessed_bit(pmap); 4687 PG_M = pmap_modified_bit(pmap); 4688 PG_V = pmap_valid_bit(pmap); 4689 PG_RW = pmap_rw_bit(pmap); 4690 4691 /* 4692 * Allocate a page table page.
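 * As an illustrative instance of the indexing scheme described
 * above: for va = 0, the page table page has ptepindex 0, the page
 * directory page covering it has ptepindex NUPDE, the page
 * directory pointer page has ptepindex NUPDE + NUPDPE, and, in
 * LA57 mode, the PML4 page has ptepindex NUPDE + NUPDPE + NUPML4E.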
4693 */ 4694 m = pmap_alloc_pt_page(pmap, ptepindex, 4695 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4696 if (m == NULL) 4697 return (NULL); 4698 4699 /* 4700 * Map the pagetable page into the process address space, if 4701 * it isn't already there. 4702 */ 4703 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4704 MPASS(pmap_is_la57(pmap)); 4705 4706 pml5index = pmap_pml5e_index(va); 4707 pml5 = &pmap->pm_pmltop[pml5index]; 4708 KASSERT((*pml5 & PG_V) == 0, 4709 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4710 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4711 4712 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4713 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4714 *pml5 |= pg_nx; 4715 4716 pml5u = &pmap->pm_pmltopu[pml5index]; 4717 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4718 PG_A | PG_M; 4719 } 4720 } else if (ptepindex >= NUPDE + NUPDPE) { 4721 pml4index = pmap_pml4e_index(va); 4722 /* Wire up a new PDPE page */ 4723 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4724 if (pml4 == NULL) { 4725 pmap_free_pt_page(pmap, m, true); 4726 return (NULL); 4727 } 4728 KASSERT((*pml4 & PG_V) == 0, 4729 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4730 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4731 4732 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4733 pml4index < NUPML4E) { 4734 /* 4735 * PTI: Make all user-space mappings in the 4736 * kernel-mode page table no-execute so that 4737 * we detect any programming errors that leave 4738 * the kernel-mode page table active on return 4739 * to user space. 4740 */ 4741 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4742 *pml4 |= pg_nx; 4743 4744 pml4u = &pmap->pm_pmltopu[pml4index]; 4745 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4746 PG_A | PG_M; 4747 } 4748 } else if (ptepindex >= NUPDE) { 4749 /* Wire up a new PDE page */ 4750 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4751 if (pdp == NULL) { 4752 pmap_free_pt_page(pmap, m, true); 4753 return (NULL); 4754 } 4755 KASSERT((*pdp & PG_V) == 0, 4756 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4757 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4758 } else { 4759 /* Wire up a new PTE page */ 4760 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4761 if (pdp == NULL) { 4762 pmap_free_pt_page(pmap, m, true); 4763 return (NULL); 4764 } 4765 if ((*pdp & PG_V) == 0) { 4766 /* Have to allocate a new pd, recurse */ 4767 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4768 lockp, va) == NULL) { 4769 pmap_allocpte_free_unref(pmap, va, 4770 pmap_pml4e(pmap, va)); 4771 pmap_free_pt_page(pmap, m, true); 4772 return (NULL); 4773 } 4774 } else { 4775 /* Add reference to the pd page */ 4776 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4777 pdpg->ref_count++; 4778 } 4779 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4780 4781 /* Now we know where the page directory page is */ 4782 pd = &pd[pmap_pde_index(va)]; 4783 KASSERT((*pd & PG_V) == 0, 4784 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4785 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4786 } 4787 4788 return (m); 4789 } 4790 4791 /* 4792 * This routine is called if the desired page table page does not exist. 4793 * 4794 * If page table page allocation fails, this routine may sleep before 4795 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4796 * occurs right before returning to the caller. This way, we never 4797 * drop pmap lock to sleep while a page table page has ref_count == 0, 4798 * which prevents the page from being freed under us. 
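 *
 * Because the pmap lock is dropped across the sleep, a caller that
 * passes a lock pointer must restart its lookup when NULL is
 * returned, as the retry loops in pmap_alloc_pde() and
 * pmap_allocpte() below do.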
4799 */ 4800 static vm_page_t 4801 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4802 vm_offset_t va) 4803 { 4804 vm_page_t m; 4805 4806 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4807 if (m == NULL && lockp != NULL) { 4808 RELEASE_PV_LIST_LOCK(lockp); 4809 PMAP_UNLOCK(pmap); 4810 PMAP_ASSERT_NOT_IN_DI(); 4811 vm_wait(NULL); 4812 PMAP_LOCK(pmap); 4813 } 4814 return (m); 4815 } 4816 4817 static pd_entry_t * 4818 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4819 struct rwlock **lockp) 4820 { 4821 pdp_entry_t *pdpe, PG_V; 4822 pd_entry_t *pde; 4823 vm_page_t pdpg; 4824 vm_pindex_t pdpindex; 4825 4826 PG_V = pmap_valid_bit(pmap); 4827 4828 retry: 4829 pdpe = pmap_pdpe(pmap, va); 4830 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4831 pde = pmap_pdpe_to_pde(pdpe, va); 4832 if (va < VM_MAXUSER_ADDRESS) { 4833 /* Add a reference to the pd page. */ 4834 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4835 pdpg->ref_count++; 4836 } else 4837 pdpg = NULL; 4838 } else if (va < VM_MAXUSER_ADDRESS) { 4839 /* Allocate a pd page. */ 4840 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4841 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4842 if (pdpg == NULL) { 4843 if (lockp != NULL) 4844 goto retry; 4845 else 4846 return (NULL); 4847 } 4848 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4849 pde = &pde[pmap_pde_index(va)]; 4850 } else 4851 panic("pmap_alloc_pde: missing page table page for va %#lx", 4852 va); 4853 *pdpgp = pdpg; 4854 return (pde); 4855 } 4856 4857 static vm_page_t 4858 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4859 { 4860 vm_pindex_t ptepindex; 4861 pd_entry_t *pd, PG_V; 4862 vm_page_t m; 4863 4864 PG_V = pmap_valid_bit(pmap); 4865 4866 /* 4867 * Calculate pagetable page index 4868 */ 4869 ptepindex = pmap_pde_pindex(va); 4870 retry: 4871 /* 4872 * Get the page directory entry 4873 */ 4874 pd = pmap_pde(pmap, va); 4875 4876 /* 4877 * This supports switching from a 2MB page to a 4878 * normal 4K page. 4879 */ 4880 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4881 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4882 /* 4883 * Invalidation of the 2MB page mapping may have caused 4884 * the deallocation of the underlying PD page. 4885 */ 4886 pd = NULL; 4887 } 4888 } 4889 4890 /* 4891 * If the page table page is mapped, we just increment the 4892 * hold count, and activate it. 4893 */ 4894 if (pd != NULL && (*pd & PG_V) != 0) { 4895 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4896 m->ref_count++; 4897 } else { 4898 /* 4899 * Here if the pte page isn't mapped, or if it has been 4900 * deallocated. 4901 */ 4902 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4903 if (m == NULL && lockp != NULL) 4904 goto retry; 4905 } 4906 return (m); 4907 } 4908 4909 /*************************************************** 4910 * Pmap allocation/deallocation routines. 4911 ***************************************************/ 4912 4913 /* 4914 * Release any resources held by the given physical map. 4915 * Called when a pmap initialized by pmap_pinit is being released. 4916 * Should only be called if the map contains no valid mappings. 
4917 */ 4918 void 4919 pmap_release(pmap_t pmap) 4920 { 4921 vm_page_t m; 4922 int i; 4923 4924 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4925 ("pmap_release: pmap %p has reserved page table page(s)", 4926 pmap)); 4927 KASSERT(CPU_EMPTY(&pmap->pm_active), 4928 ("releasing active pmap %p", pmap)); 4929 4930 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4931 4932 if (pmap_is_la57(pmap)) { 4933 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4934 pmap->pm_pmltop[PML5PML5I] = 0; 4935 } else { 4936 for (i = 0; i < NKPML4E; i++) /* KVA */ 4937 pmap->pm_pmltop[KPML4BASE + i] = 0; 4938 #ifdef KASAN 4939 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4940 pmap->pm_pmltop[KASANPML4I + i] = 0; 4941 #endif 4942 #ifdef KMSAN 4943 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4944 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4945 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN origin map */ 4946 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4947 #endif 4948 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4949 pmap->pm_pmltop[DMPML4I + i] = 0; 4950 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4951 for (i = 0; i < lm_ents; i++) /* Large Map */ 4952 pmap->pm_pmltop[LMSPML4I + i] = 0; 4953 } 4954 4955 pmap_free_pt_page(NULL, m, true); 4956 pmap_pt_page_count_pinit(pmap, -1); 4957 4958 if (pmap->pm_pmltopu != NULL) { 4959 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4960 pm_pmltopu)); 4961 pmap_free_pt_page(NULL, m, false); 4962 pmap_pt_page_count_pinit(pmap, -1); 4963 } 4964 if (pmap->pm_type == PT_X86 && 4965 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4966 rangeset_fini(&pmap->pm_pkru); 4967 4968 KASSERT(pmap->pm_stats.resident_count == 0, 4969 ("pmap_release: pmap %p resident count %ld != 0", 4970 pmap, pmap->pm_stats.resident_count)); 4971 } 4972 4973 static int 4974 kvm_size(SYSCTL_HANDLER_ARGS) 4975 { 4976 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4977 4978 return (sysctl_handle_long(oidp, &ksize, 0, req)); 4979 } 4980 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4981 0, 0, kvm_size, "LU", 4982 "Size of KVM"); 4983 4984 static int 4985 kvm_free(SYSCTL_HANDLER_ARGS) 4986 { 4987 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 4988 4989 return (sysctl_handle_long(oidp, &kfree, 0, req)); 4990 } 4991 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4992 0, 0, kvm_free, "LU", 4993 "Amount of KVM free"); 4994 4995 #ifdef KMSAN 4996 static void 4997 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 4998 { 4999 pdp_entry_t *pdpe; 5000 pd_entry_t *pde; 5001 pt_entry_t *pte; 5002 vm_paddr_t dummypa, dummypd, dummypt; 5003 int i, npde, npdpg; 5004 5005 npdpg = howmany(size, NBPDP); 5006 npde = size / NBPDR; 5007 5008 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 5009 pagezero((void *)PHYS_TO_DMAP(dummypa)); 5010 5011 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 5012 pagezero((void *)PHYS_TO_DMAP(dummypt)); 5013 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 5014 for (i = 0; i < npdpg; i++) 5015 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 5016 5017 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 5018 for (i = 0; i < NPTEPG; i++) 5019 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 5020 X86_PG_A | X86_PG_M | pg_nx); 5021 5022 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 5023 for (i = 0; i < npde; i++) 5024 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 5025 5026 pdpe = (pdp_entry_t
*)PHYS_TO_DMAP(pdppa); 5027 for (i = 0; i < npdpg; i++) 5028 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 5029 X86_PG_RW | pg_nx); 5030 } 5031 5032 static void 5033 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 5034 { 5035 vm_size_t size; 5036 5037 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 5038 5039 /* 5040 * The end of the page array's KVA region is 2MB aligned, see 5041 * kmem_init(). 5042 */ 5043 size = round_2mpage(end) - start; 5044 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 5045 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 5046 } 5047 #endif 5048 5049 /* 5050 * Allocate physical memory for the vm_page array and map it into KVA, 5051 * attempting to back the vm_pages with domain-local memory. 5052 */ 5053 void 5054 pmap_page_array_startup(long pages) 5055 { 5056 pdp_entry_t *pdpe; 5057 pd_entry_t *pde, newpdir; 5058 vm_offset_t va, start, end; 5059 vm_paddr_t pa; 5060 long pfn; 5061 int domain, i; 5062 5063 vm_page_array_size = pages; 5064 5065 start = VM_MIN_KERNEL_ADDRESS; 5066 end = start + pages * sizeof(struct vm_page); 5067 for (va = start; va < end; va += NBPDR) { 5068 pfn = first_page + (va - start) / sizeof(struct vm_page); 5069 domain = vm_phys_domain(ptoa(pfn)); 5070 pdpe = pmap_pdpe(kernel_pmap, va); 5071 if ((*pdpe & X86_PG_V) == 0) { 5072 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 5073 dump_add_page(pa); 5074 pagezero((void *)PHYS_TO_DMAP(pa)); 5075 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 5076 X86_PG_A | X86_PG_M); 5077 } 5078 pde = pmap_pdpe_to_pde(pdpe, va); 5079 if ((*pde & X86_PG_V) != 0) 5080 panic("Unexpected pde"); 5081 pa = vm_phys_early_alloc(domain, NBPDR); 5082 for (i = 0; i < NPDEPG; i++) 5083 dump_add_page(pa + i * PAGE_SIZE); 5084 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 5085 X86_PG_M | PG_PS | pg_g | pg_nx); 5086 pde_store(pde, newpdir); 5087 } 5088 vm_page_array = (vm_page_t)start; 5089 5090 #ifdef KMSAN 5091 pmap_kmsan_page_array_startup(start, end); 5092 #endif 5093 } 5094 5095 /* 5096 * grow the number of kernel page table entries, if needed 5097 */ 5098 void 5099 pmap_growkernel(vm_offset_t addr) 5100 { 5101 vm_paddr_t paddr; 5102 vm_page_t nkpg; 5103 pd_entry_t *pde, newpdir; 5104 pdp_entry_t *pdpe; 5105 vm_offset_t end; 5106 5107 TSENTER(); 5108 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5109 5110 /* 5111 * The kernel map covers two distinct regions of KVA: that used 5112 * for dynamic kernel memory allocations, and the uppermost 2GB 5113 * of the virtual address space. The latter is used to map the 5114 * kernel and loadable kernel modules. This scheme enables the 5115 * use of a special code generation model for kernel code which 5116 * takes advantage of compact addressing modes in machine code. 5117 * 5118 * Both regions grow upwards; to avoid wasting memory, the gap 5119 * in between is unmapped. If "addr" is above "KERNBASE", the 5120 * kernel's region is grown, otherwise the kmem region is grown. 5121 * 5122 * The correctness of this action is based on the following 5123 * argument: vm_map_insert() allocates contiguous ranges of the 5124 * kernel virtual address space. It calls this function if a range 5125 * ends after "kernel_vm_end". If the kernel is mapped between 5126 * "kernel_vm_end" and "addr", then the range cannot begin at 5127 * "kernel_vm_end". In fact, its beginning address cannot be less 5128 * than the start of the kernel's region.
Thus, there is no immediate need to allocate 5129 * any new kernel page table pages between "kernel_vm_end" and 5130 * "KERNBASE". 5131 */ 5132 if (KERNBASE < addr) { 5133 end = KERNBASE + nkpt * NBPDR; 5134 if (end == 0) { 5135 TSEXIT(); 5136 return; 5137 } 5138 } else { 5139 end = kernel_vm_end; 5140 } 5141 5142 addr = roundup2(addr, NBPDR); 5143 if (addr - 1 >= vm_map_max(kernel_map)) 5144 addr = vm_map_max(kernel_map); 5145 if (addr <= end) { 5146 /* 5147 * The grown region is already mapped, so there is 5148 * nothing to do. 5149 */ 5150 TSEXIT(); 5151 return; 5152 } 5153 5154 kasan_shadow_map(end, addr - end); 5155 kmsan_shadow_map(end, addr - end); 5156 while (end < addr) { 5157 pdpe = pmap_pdpe(kernel_pmap, end); 5158 if ((*pdpe & X86_PG_V) == 0) { 5159 nkpg = pmap_alloc_pt_page(kernel_pmap, 5160 pmap_pdpe_pindex(end), VM_ALLOC_WIRED | 5161 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5162 if (nkpg == NULL) 5163 panic("pmap_growkernel: no memory to grow kernel"); 5164 paddr = VM_PAGE_TO_PHYS(nkpg); 5165 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5166 X86_PG_A | X86_PG_M); 5167 continue; /* try again */ 5168 } 5169 pde = pmap_pdpe_to_pde(pdpe, end); 5170 if ((*pde & X86_PG_V) != 0) { 5171 end = (end + NBPDR) & ~PDRMASK; 5172 if (end - 1 >= vm_map_max(kernel_map)) { 5173 end = vm_map_max(kernel_map); 5174 break; 5175 } 5176 continue; 5177 } 5178 5179 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), 5180 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5181 if (nkpg == NULL) 5182 panic("pmap_growkernel: no memory to grow kernel"); 5183 paddr = VM_PAGE_TO_PHYS(nkpg); 5184 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5185 pde_store(pde, newpdir); 5186 5187 end = (end + NBPDR) & ~PDRMASK; 5188 if (end - 1 >= vm_map_max(kernel_map)) { 5189 end = vm_map_max(kernel_map); 5190 break; 5191 } 5192 } 5193 5194 if (end <= KERNBASE) 5195 kernel_vm_end = end; 5196 else 5197 nkpt = howmany(end - KERNBASE, NBPDR); 5198 TSEXIT(); 5199 } 5200 5201 /*************************************************** 5202 * page management routines. 5203 ***************************************************/ 5204 5205 static const uint64_t pc_freemask[_NPCM] = { 5206 [0 ... 
_NPCM - 2] = PC_FREEN, 5207 [_NPCM - 1] = PC_FREEL 5208 }; 5209 5210 #ifdef PV_STATS 5211 5212 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5213 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5214 &pc_chunk_count, "Current number of pv entry chunks"); 5215 5216 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5217 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5218 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5219 5220 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5221 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5222 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5223 5224 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5225 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5226 &pc_chunk_tryfail, 5227 "Number of failed attempts to get a pv entry chunk page"); 5228 5229 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5230 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5231 &pv_entry_frees, "Total number of pv entries freed"); 5232 5233 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5234 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5235 &pv_entry_allocs, "Total number of pv entries allocated"); 5236 5237 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5238 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5239 &pv_entry_count, "Current number of pv entries"); 5240 5241 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5242 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5243 &pv_entry_spare, "Current number of spare pv entries"); 5244 #endif 5245 5246 static void 5247 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5248 { 5249 5250 if (pmap == NULL) 5251 return; 5252 pmap_invalidate_all(pmap); 5253 if (pmap != locked_pmap) 5254 PMAP_UNLOCK(pmap); 5255 if (start_di) 5256 pmap_delayed_invl_finish(); 5257 } 5258 5259 /* 5260 * We are in a serious low memory condition. Resort to 5261 * drastic measures to free some pages so we can allocate 5262 * another pv entry chunk. 5263 * 5264 * Returns NULL if PV entries were reclaimed from the specified pmap. 5265 * 5266 * We do not, however, unmap 2mpages because subsequent accesses will 5267 * allocate per-page pv entries until repromotion occurs, thereby 5268 * exacerbating the shortage of free pv entries.
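 *
 * Each pv_chunk occupies exactly one page, so reclaiming every
 * entry in some chunk frees that page, which is then returned to
 * the caller for reuse.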
5269 */ 5270 static vm_page_t 5271 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 5272 { 5273 struct pv_chunks_list *pvc; 5274 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 5275 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 5276 struct md_page *pvh; 5277 pd_entry_t *pde; 5278 pmap_t next_pmap, pmap; 5279 pt_entry_t *pte, tpte; 5280 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 5281 pv_entry_t pv; 5282 vm_offset_t va; 5283 vm_page_t m, m_pc; 5284 struct spglist free; 5285 uint64_t inuse; 5286 int bit, field, freed; 5287 bool start_di, restart; 5288 5289 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 5290 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 5291 pmap = NULL; 5292 m_pc = NULL; 5293 PG_G = PG_A = PG_M = PG_RW = 0; 5294 SLIST_INIT(&free); 5295 bzero(&pc_marker_b, sizeof(pc_marker_b)); 5296 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 5297 pc_marker = (struct pv_chunk *)&pc_marker_b; 5298 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 5299 5300 /* 5301 * A delayed invalidation block should already be active if 5302 * pmap_advise() or pmap_remove() called this function by way 5303 * of pmap_demote_pde_locked(). 5304 */ 5305 start_di = pmap_not_in_di(); 5306 5307 pvc = &pv_chunks[domain]; 5308 mtx_lock(&pvc->pvc_lock); 5309 pvc->active_reclaims++; 5310 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 5311 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 5312 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 5313 SLIST_EMPTY(&free)) { 5314 next_pmap = pc->pc_pmap; 5315 if (next_pmap == NULL) { 5316 /* 5317 * The next chunk is a marker. However, it is 5318 * not our marker, so active_reclaims must be 5319 * > 1. Consequently, the next_chunk code 5320 * will not rotate the pv_chunks list. 5321 */ 5322 goto next_chunk; 5323 } 5324 mtx_unlock(&pvc->pvc_lock); 5325 5326 /* 5327 * A pv_chunk can only be removed from the pc_lru list 5328 * when both the pv chunk list lock (pvc->pvc_lock) is 5329 * owned and the corresponding pmap is locked. 5330 */ 5331 if (pmap != next_pmap) { 5332 restart = false; 5333 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 5334 start_di); 5335 pmap = next_pmap; 5336 /* Avoid deadlock and lock recursion. */ 5337 if (pmap > locked_pmap) { 5338 RELEASE_PV_LIST_LOCK(lockp); 5339 PMAP_LOCK(pmap); 5340 if (start_di) 5341 pmap_delayed_invl_start(); 5342 mtx_lock(&pvc->pvc_lock); 5343 restart = true; 5344 } else if (pmap != locked_pmap) { 5345 if (PMAP_TRYLOCK(pmap)) { 5346 if (start_di) 5347 pmap_delayed_invl_start(); 5348 mtx_lock(&pvc->pvc_lock); 5349 restart = true; 5350 } else { 5351 pmap = NULL; /* pmap is not locked */ 5352 mtx_lock(&pvc->pvc_lock); 5353 pc = TAILQ_NEXT(pc_marker, pc_lru); 5354 if (pc == NULL || 5355 pc->pc_pmap != next_pmap) 5356 continue; 5357 goto next_chunk; 5358 } 5359 } else if (start_di) 5360 pmap_delayed_invl_start(); 5361 PG_G = pmap_global_bit(pmap); 5362 PG_A = pmap_accessed_bit(pmap); 5363 PG_M = pmap_modified_bit(pmap); 5364 PG_RW = pmap_rw_bit(pmap); 5365 if (restart) 5366 continue; 5367 } 5368 5369 /* 5370 * Destroy every non-wired, 4 KB page mapping in the chunk.
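 * The "inuse" word below inverts pc_map, so a set bit marks an
 * allocated pv entry in the current field; bsfq() then extracts
 * each such entry in turn.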
5371 */ 5372 freed = 0; 5373 for (field = 0; field < _NPCM; field++) { 5374 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5375 inuse != 0; inuse &= ~(1UL << bit)) { 5376 bit = bsfq(inuse); 5377 pv = &pc->pc_pventry[field * 64 + bit]; 5378 va = pv->pv_va; 5379 pde = pmap_pde(pmap, va); 5380 if ((*pde & PG_PS) != 0) 5381 continue; 5382 pte = pmap_pde_to_pte(pde, va); 5383 if ((*pte & PG_W) != 0) 5384 continue; 5385 tpte = pte_load_clear(pte); 5386 if ((tpte & PG_G) != 0) 5387 pmap_invalidate_page(pmap, va); 5388 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5389 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5390 vm_page_dirty(m); 5391 if ((tpte & PG_A) != 0) 5392 vm_page_aflag_set(m, PGA_REFERENCED); 5393 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5394 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5395 m->md.pv_gen++; 5396 if (TAILQ_EMPTY(&m->md.pv_list) && 5397 (m->flags & PG_FICTITIOUS) == 0) { 5398 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5399 if (TAILQ_EMPTY(&pvh->pv_list)) { 5400 vm_page_aflag_clear(m, 5401 PGA_WRITEABLE); 5402 } 5403 } 5404 pmap_delayed_invl_page(m); 5405 pc->pc_map[field] |= 1UL << bit; 5406 pmap_unuse_pt(pmap, va, *pde, &free); 5407 freed++; 5408 } 5409 } 5410 if (freed == 0) { 5411 mtx_lock(&pvc->pvc_lock); 5412 goto next_chunk; 5413 } 5414 /* Every freed mapping is for a 4 KB page. */ 5415 pmap_resident_count_adj(pmap, -freed); 5416 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5417 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5418 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5419 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5420 if (pc_is_free(pc)) { 5421 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5422 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5423 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5424 /* Entire chunk is free; return it. */ 5425 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5426 dump_drop_page(m_pc->phys_addr); 5427 mtx_lock(&pvc->pvc_lock); 5428 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5429 break; 5430 } 5431 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5432 mtx_lock(&pvc->pvc_lock); 5433 /* One freed pv entry in locked_pmap is sufficient. */ 5434 if (pmap == locked_pmap) 5435 break; 5436 next_chunk: 5437 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5438 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5439 if (pvc->active_reclaims == 1 && pmap != NULL) { 5440 /* 5441 * Rotate the pv chunks list so that we do not 5442 * scan the same pv chunks that could not be 5443 * freed (because they contained a wired 5444 * and/or superpage mapping) on every 5445 * invocation of reclaim_pv_chunk(). 5446 */ 5447 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5448 MPASS(pc->pc_pmap != NULL); 5449 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5450 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5451 } 5452 } 5453 } 5454 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5455 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5456 pvc->active_reclaims--; 5457 mtx_unlock(&pvc->pvc_lock); 5458 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5459 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5460 m_pc = SLIST_FIRST(&free); 5461 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5462 /* Recycle a freed page table page. 
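 * It was already removed from "free" above, so the call to
 * vm_page_free_pages_toq() below will not free it; its ref_count
 * is reinitialized so that it can back a new pv chunk.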
*/ 5463 m_pc->ref_count = 1; 5464 } 5465 vm_page_free_pages_toq(&free, true); 5466 return (m_pc); 5467 } 5468 5469 static vm_page_t 5470 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5471 { 5472 vm_page_t m; 5473 int i, domain; 5474 5475 domain = PCPU_GET(domain); 5476 for (i = 0; i < vm_ndomains; i++) { 5477 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5478 if (m != NULL) 5479 break; 5480 domain = (domain + 1) % vm_ndomains; 5481 } 5482 5483 return (m); 5484 } 5485 5486 /* 5487 * free the pv_entry back to the free list 5488 */ 5489 static void 5490 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5491 { 5492 struct pv_chunk *pc; 5493 int idx, field, bit; 5494 5495 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5496 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5497 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5498 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5499 pc = pv_to_chunk(pv); 5500 idx = pv - &pc->pc_pventry[0]; 5501 field = idx / 64; 5502 bit = idx % 64; 5503 pc->pc_map[field] |= 1ul << bit; 5504 if (!pc_is_free(pc)) { 5505 /* 98% of the time, pc is already at the head of the list. */ 5506 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5507 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5508 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5509 } 5510 return; 5511 } 5512 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5513 free_pv_chunk(pc); 5514 } 5515 5516 static void 5517 free_pv_chunk_dequeued(struct pv_chunk *pc) 5518 { 5519 vm_page_t m; 5520 5521 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5522 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5523 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5524 counter_u64_add(pv_page_count, -1); 5525 /* entire chunk is free, return it */ 5526 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5527 dump_drop_page(m->phys_addr); 5528 vm_page_unwire_noq(m); 5529 vm_page_free(m); 5530 } 5531 5532 static void 5533 free_pv_chunk(struct pv_chunk *pc) 5534 { 5535 struct pv_chunks_list *pvc; 5536 5537 pvc = &pv_chunks[pc_to_domain(pc)]; 5538 mtx_lock(&pvc->pvc_lock); 5539 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5540 mtx_unlock(&pvc->pvc_lock); 5541 free_pv_chunk_dequeued(pc); 5542 } 5543 5544 static void 5545 free_pv_chunk_batch(struct pv_chunklist *batch) 5546 { 5547 struct pv_chunks_list *pvc; 5548 struct pv_chunk *pc, *npc; 5549 int i; 5550 5551 for (i = 0; i < vm_ndomains; i++) { 5552 if (TAILQ_EMPTY(&batch[i])) 5553 continue; 5554 pvc = &pv_chunks[i]; 5555 mtx_lock(&pvc->pvc_lock); 5556 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5557 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5558 } 5559 mtx_unlock(&pvc->pvc_lock); 5560 } 5561 5562 for (i = 0; i < vm_ndomains; i++) { 5563 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5564 free_pv_chunk_dequeued(pc); 5565 } 5566 } 5567 } 5568 5569 /* 5570 * Returns a new PV entry, allocating a new PV chunk from the system when 5571 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5572 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5573 * returned. 5574 * 5575 * The given PV list lock may be released. 
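 *
 * Editor's aside: free_pv_entry() above recovers the owning chunk from
 * a pv_entry pointer with nothing but address arithmetic, because each
 * pv_chunk occupies exactly one page-aligned page (cf. pv_to_chunk()).
 * A stand-alone sketch with a hypothetical, smaller layout; 4 KB pages
 * are assumed and the "sk_" names are invented:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stddef.h>
#include <stdint.h>

#define	SK_PAGE_MASK	0xfffUL		/* 4 KB pages assumed */

struct sk_entry { uintptr_t payload; };
struct sk_chunk {			/* assumed page-sized, page-aligned */
	void		*owner;
	uint64_t	map[3];
	struct sk_entry	entries[168];
};

static struct sk_chunk *
sk_entry_to_chunk(struct sk_entry *e, int *field, int *bit)
{
	struct sk_chunk *c;
	ptrdiff_t idx;

	c = (struct sk_chunk *)((uintptr_t)e & ~SK_PAGE_MASK);
	idx = e - &c->entries[0];
	*field = (int)(idx / 64);	/* which bitmap word */
	*bit = (int)(idx % 64);		/* which bit within it */
	return (c);
}
#endif
/*
 * End of aside; get_pv_entry() follows.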
5576 */ 5577 static pv_entry_t 5578 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5579 { 5580 struct pv_chunks_list *pvc; 5581 int bit, field; 5582 pv_entry_t pv; 5583 struct pv_chunk *pc; 5584 vm_page_t m; 5585 5586 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5587 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5588 retry: 5589 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5590 if (pc != NULL) { 5591 for (field = 0; field < _NPCM; field++) { 5592 if (pc->pc_map[field]) { 5593 bit = bsfq(pc->pc_map[field]); 5594 break; 5595 } 5596 } 5597 if (field < _NPCM) { 5598 pv = &pc->pc_pventry[field * 64 + bit]; 5599 pc->pc_map[field] &= ~(1ul << bit); 5600 /* If this was the last item, move it to tail */ 5601 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 5602 pc->pc_map[2] == 0) { 5603 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5604 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5605 pc_list); 5606 } 5607 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5608 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5609 return (pv); 5610 } 5611 } 5612 /* No free items; allocate another chunk. */ 5613 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5614 if (m == NULL) { 5615 if (lockp == NULL) { 5616 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5617 return (NULL); 5618 } 5619 m = reclaim_pv_chunk(pmap, lockp); 5620 if (m == NULL) 5621 goto retry; 5622 } else 5623 counter_u64_add(pv_page_count, 1); 5624 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5625 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5626 dump_add_page(m->phys_addr); 5627 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5628 pc->pc_pmap = pmap; 5629 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 5630 pc->pc_map[1] = PC_FREEN; 5631 pc->pc_map[2] = PC_FREEL; 5632 pvc = &pv_chunks[vm_page_domain(m)]; 5633 mtx_lock(&pvc->pvc_lock); 5634 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5635 mtx_unlock(&pvc->pvc_lock); 5636 pv = &pc->pc_pventry[0]; 5637 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5638 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5639 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5640 return (pv); 5641 } 5642 5643 /* 5644 * Returns the number of one bits within the given PV chunk map. 5645 * 5646 * The errata for Intel processors state that "POPCNT Instruction May 5647 * Take Longer to Execute Than Expected". It is believed that the 5648 * issue is the spurious dependency on the destination register. 5649 * Provide a hint to the register rename logic that the destination 5650 * value is overwritten, by clearing it, as suggested in the 5651 * optimization manual. It should be cheap for unaffected processors 5652 * as well. 5653 * 5654 * Reference numbers for the errata are 5655 * 4th Gen Core: HSD146 5656 * 5th Gen Core: BDM85 5657 * 6th Gen Core: SKL029 5658 */ 5659 static int 5660 popcnt_pc_map_pq(uint64_t *map) 5661 { 5662 u_long result, tmp; 5663 5664 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5665 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5666 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5667 : "=&r" (result), "=&r" (tmp) 5668 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5669 return (result); 5670 } 5671 5672 /* 5673 * Ensure that the number of spare PV entries in the specified pmap meets or 5674 * exceeds the given count, "needed". 5675 * 5676 * The given PV list lock may be released.
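 *
 * Editor's aside: reserve_pv_entries() below counts the free bits with
 * popcnt_pc_map_pq() when the CPU has POPCNT and with bit_count()
 * otherwise.  The shape of that runtime selection, as a stand-alone
 * sketch; the "sk_" names are hypothetical, and __builtin_popcountll()
 * plus a Kernighan loop stand in for the kernel primitives:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdint.h>

static int sk_have_popcnt;		/* imagine: set from CPUID at boot */

static int
sk_count_free(const uint64_t map[3])
{
	uint64_t v;
	int i, n;

	if (sk_have_popcnt)
		return (__builtin_popcountll(map[0]) +
		    __builtin_popcountll(map[1]) +
		    __builtin_popcountll(map[2]));
	for (n = 0, i = 0; i < 3; i++)
		for (v = map[i]; v != 0; v &= v - 1)
			n++;		/* v &= v - 1 clears lowest set bit */
	return (n);
}
#endif
/*
 * End of aside.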
5677 */ 5678 static void 5679 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5680 { 5681 struct pv_chunks_list *pvc; 5682 struct pch new_tail[PMAP_MEMDOM]; 5683 struct pv_chunk *pc; 5684 vm_page_t m; 5685 int avail, free, i; 5686 bool reclaimed; 5687 5688 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5689 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5690 5691 /* 5692 * Newly allocated PV chunks must be stored in a private list until 5693 * the required number of PV chunks have been allocated. Otherwise, 5694 * reclaim_pv_chunk() could recycle one of these chunks. In 5695 * contrast, these chunks must be added to the pmap upon allocation. 5696 */ 5697 for (i = 0; i < PMAP_MEMDOM; i++) 5698 TAILQ_INIT(&new_tail[i]); 5699 retry: 5700 avail = 0; 5701 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5702 #ifndef __POPCNT__ 5703 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5704 bit_count((bitstr_t *)pc->pc_map, 0, 5705 sizeof(pc->pc_map) * NBBY, &free); 5706 else 5707 #endif 5708 free = popcnt_pc_map_pq(pc->pc_map); 5709 if (free == 0) 5710 break; 5711 avail += free; 5712 if (avail >= needed) 5713 break; 5714 } 5715 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5716 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5717 if (m == NULL) { 5718 m = reclaim_pv_chunk(pmap, lockp); 5719 if (m == NULL) 5720 goto retry; 5721 reclaimed = true; 5722 } else 5723 counter_u64_add(pv_page_count, 1); 5724 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5725 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5726 dump_add_page(m->phys_addr); 5727 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5728 pc->pc_pmap = pmap; 5729 pc->pc_map[0] = PC_FREEN; 5730 pc->pc_map[1] = PC_FREEN; 5731 pc->pc_map[2] = PC_FREEL; 5732 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5733 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5734 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5735 5736 /* 5737 * The reclaim might have freed a chunk from the current pmap. 5738 * If that chunk contained available entries, we need to 5739 * re-count the number of available entries. 5740 */ 5741 if (reclaimed) 5742 goto retry; 5743 } 5744 for (i = 0; i < vm_ndomains; i++) { 5745 if (TAILQ_EMPTY(&new_tail[i])) 5746 continue; 5747 pvc = &pv_chunks[i]; 5748 mtx_lock(&pvc->pvc_lock); 5749 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5750 mtx_unlock(&pvc->pvc_lock); 5751 } 5752 } 5753 5754 /* 5755 * First find and then remove the pv entry for the specified pmap and virtual 5756 * address from the specified pv list. Returns the pv entry if found and NULL 5757 * otherwise. This operation can be performed on pv lists for either 4KB or 5758 * 2MB page mappings. 5759 */ 5760 static __inline pv_entry_t 5761 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5762 { 5763 pv_entry_t pv; 5764 5765 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5766 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5767 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5768 pvh->pv_gen++; 5769 break; 5770 } 5771 } 5772 return (pv); 5773 } 5774 5775 /* 5776 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5777 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5778 * entries for each of the 4KB page mappings. 
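 *
 * Editor's aside: note how reserve_pv_entries() above queues freshly
 * allocated chunks on the private new_tail[] lists and publishes each
 * domain's batch with one TAILQ_CONCAT() under pvc_lock, so a
 * concurrent reclaim never sees a half-built chunk.  A stand-alone
 * sketch of that publish step; a pthread mutex stands in for the mtx
 * and the "sk_" names are hypothetical:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <pthread.h>
#include <sys/queue.h>

struct sk_node { TAILQ_ENTRY(sk_node) link; };
TAILQ_HEAD(sk_nodeq, sk_node);

static struct sk_nodeq sk_global_q = TAILQ_HEAD_INITIALIZER(sk_global_q);
static pthread_mutex_t sk_global_lock = PTHREAD_MUTEX_INITIALIZER;

static void
sk_publish(struct sk_nodeq *priv)
{
	pthread_mutex_lock(&sk_global_lock);
	TAILQ_CONCAT(&sk_global_q, priv, link);	/* O(1); empties "priv" */
	pthread_mutex_unlock(&sk_global_lock);
}
#endif
/*
 * End of aside; the demotion helper follows.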
5779 */ 5780 static void 5781 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5782 struct rwlock **lockp) 5783 { 5784 struct md_page *pvh; 5785 struct pv_chunk *pc; 5786 pv_entry_t pv; 5787 vm_offset_t va_last; 5788 vm_page_t m; 5789 int bit, field; 5790 5791 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5792 KASSERT((pa & PDRMASK) == 0, 5793 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5794 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5795 5796 /* 5797 * Transfer the 2mpage's pv entry for this mapping to the first 5798 * page's pv list. Once this transfer begins, the pv list lock 5799 * must not be released until the last pv entry is reinstantiated. 5800 */ 5801 pvh = pa_to_pvh(pa); 5802 va = trunc_2mpage(va); 5803 pv = pmap_pvh_remove(pvh, pmap, va); 5804 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5805 m = PHYS_TO_VM_PAGE(pa); 5806 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5807 m->md.pv_gen++; 5808 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5809 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5810 va_last = va + NBPDR - PAGE_SIZE; 5811 for (;;) { 5812 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5813 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 5814 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 5815 for (field = 0; field < _NPCM; field++) { 5816 while (pc->pc_map[field]) { 5817 bit = bsfq(pc->pc_map[field]); 5818 pc->pc_map[field] &= ~(1ul << bit); 5819 pv = &pc->pc_pventry[field * 64 + bit]; 5820 va += PAGE_SIZE; 5821 pv->pv_va = va; 5822 m++; 5823 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5824 ("pmap_pv_demote_pde: page %p is not managed", m)); 5825 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5826 m->md.pv_gen++; 5827 if (va == va_last) 5828 goto out; 5829 } 5830 } 5831 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5832 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5833 } 5834 out: 5835 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 5836 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5837 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5838 } 5839 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5840 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5841 } 5842 5843 #if VM_NRESERVLEVEL > 0 5844 /* 5845 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5846 * replace the many pv entries for the 4KB page mappings by a single pv entry 5847 * for the 2MB page mapping. 5848 */ 5849 static void 5850 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5851 struct rwlock **lockp) 5852 { 5853 struct md_page *pvh; 5854 pv_entry_t pv; 5855 vm_offset_t va_last; 5856 vm_page_t m; 5857 5858 KASSERT((pa & PDRMASK) == 0, 5859 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5860 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5861 5862 /* 5863 * Transfer the first page's pv entry for this mapping to the 2mpage's 5864 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5865 * a transfer avoids the possibility that get_pv_entry() calls 5866 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5867 * mappings that is being promoted. 5868 */ 5869 m = PHYS_TO_VM_PAGE(pa); 5870 va = trunc_2mpage(va); 5871 pv = pmap_pvh_remove(&m->md, pmap, va); 5872 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5873 pvh = pa_to_pvh(pa); 5874 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5875 pvh->pv_gen++; 5876 /* Free the remaining NPTEPG - 1 pv entries. 
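 *
 * Editor's aside: the do/while below steps the virtual address and the
 * vm_page pointer in lockstep; that works because the vm_page
 * structures for physically contiguous pages are themselves contiguous
 * in vm_page_array.  The address arithmetic alone, as a stand-alone
 * sketch assuming 4 KB pages and 2 MB superpages ("sk_" names
 * hypothetical):
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdint.h>

#define	SK_PAGE_SIZE	4096UL
#define	SK_NBPDR	(2UL * 1024 * 1024)	/* bytes under one PDE */

static void
sk_walk_tail(uintptr_t va, void (*visit)(uintptr_t))
{
	uintptr_t va_last;

	va &= ~(SK_NBPDR - 1);			/* trunc_2mpage() */
	va_last = va + SK_NBPDR - SK_PAGE_SIZE;	/* last 4 KB page */
	do {
		va += SK_PAGE_SIZE;		/* skip the first page */
		visit(va);
	} while (va < va_last);
}
#endif
/*
 * End of aside.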
*/ 5877 va_last = va + NBPDR - PAGE_SIZE; 5878 do { 5879 m++; 5880 va += PAGE_SIZE; 5881 pmap_pvh_free(&m->md, pmap, va); 5882 } while (va < va_last); 5883 } 5884 #endif /* VM_NRESERVLEVEL > 0 */ 5885 5886 /* 5887 * First find and then destroy the pv entry for the specified pmap and virtual 5888 * address. This operation can be performed on pv lists for either 4KB or 2MB 5889 * page mappings. 5890 */ 5891 static void 5892 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5893 { 5894 pv_entry_t pv; 5895 5896 pv = pmap_pvh_remove(pvh, pmap, va); 5897 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5898 free_pv_entry(pmap, pv); 5899 } 5900 5901 /* 5902 * Conditionally create the PV entry for a 4KB page mapping if the required 5903 * memory can be allocated without resorting to reclamation. 5904 */ 5905 static boolean_t 5906 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5907 struct rwlock **lockp) 5908 { 5909 pv_entry_t pv; 5910 5911 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5912 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5913 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5914 pv->pv_va = va; 5915 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5916 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5917 m->md.pv_gen++; 5918 return (TRUE); 5919 } else 5920 return (FALSE); 5921 } 5922 5923 /* 5924 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5925 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5926 * false if the PV entry cannot be allocated without resorting to reclamation. 5927 */ 5928 static bool 5929 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5930 struct rwlock **lockp) 5931 { 5932 struct md_page *pvh; 5933 pv_entry_t pv; 5934 vm_paddr_t pa; 5935 5936 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5937 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5938 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5939 NULL : lockp)) == NULL) 5940 return (false); 5941 pv->pv_va = va; 5942 pa = pde & PG_PS_FRAME; 5943 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5944 pvh = pa_to_pvh(pa); 5945 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5946 pvh->pv_gen++; 5947 return (true); 5948 } 5949 5950 /* 5951 * Fills a page table page with mappings to consecutive physical pages. 5952 */ 5953 static void 5954 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5955 { 5956 pt_entry_t *pte; 5957 5958 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5959 *pte = newpte; 5960 newpte += PAGE_SIZE; 5961 } 5962 } 5963 5964 /* 5965 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5966 * mapping is invalidated. 
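 *
 * Editor's aside: pmap_fill_ptp() above leans on a small invariant:
 * a PTE is "frame | flags" with the flag bits below bit 12, so adding
 * PAGE_SIZE to the whole 64-bit entry advances only the frame and (for
 * any physical address that can actually exist) never perturbs the
 * flags.  A stand-alone sketch; the "SK_" names are hypothetical and
 * the PG_PS bit value is x86's, shown for flavor:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdint.h>

#define	SK_NPTEPG	512
#define	SK_PAGE_SIZE	4096UL
#define	SK_PG_PS	0x080UL

static void
sk_fill_ptp(uint64_t *firstpte, uint64_t oldpde)
{
	uint64_t newpte, *pte;

	newpte = oldpde & ~SK_PG_PS;	/* same frame + flags, 4 KB size */
	for (pte = firstpte; pte < firstpte + SK_NPTEPG; pte++) {
		*pte = newpte;
		newpte += SK_PAGE_SIZE;	/* next consecutive frame */
	}
}
#endif
/*
 * End of aside.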
5967 */ 5968 static boolean_t 5969 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5970 { 5971 struct rwlock *lock; 5972 boolean_t rv; 5973 5974 lock = NULL; 5975 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5976 if (lock != NULL) 5977 rw_wunlock(lock); 5978 return (rv); 5979 } 5980 5981 static void 5982 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 5983 { 5984 #ifdef INVARIANTS 5985 #ifdef DIAGNOSTIC 5986 pt_entry_t *xpte, *ypte; 5987 5988 for (xpte = firstpte; xpte < firstpte + NPTEPG; 5989 xpte++, newpte += PAGE_SIZE) { 5990 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 5991 printf("pmap_demote_pde: xpte %zd and newpte map " 5992 "different pages: found %#lx, expected %#lx\n", 5993 xpte - firstpte, *xpte, newpte); 5994 printf("page table dump\n"); 5995 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 5996 printf("%zd %#lx\n", ypte - firstpte, *ypte); 5997 panic("firstpte"); 5998 } 5999 } 6000 #else 6001 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 6002 ("pmap_demote_pde: firstpte and newpte map different physical" 6003 " addresses")); 6004 #endif 6005 #endif 6006 } 6007 6008 static void 6009 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6010 pd_entry_t oldpde, struct rwlock **lockp) 6011 { 6012 struct spglist free; 6013 vm_offset_t sva; 6014 6015 SLIST_INIT(&free); 6016 sva = trunc_2mpage(va); 6017 pmap_remove_pde(pmap, pde, sva, &free, lockp); 6018 if ((oldpde & pmap_global_bit(pmap)) == 0) 6019 pmap_invalidate_pde_page(pmap, sva, oldpde); 6020 vm_page_free_pages_toq(&free, true); 6021 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 6022 va, pmap); 6023 } 6024 6025 static boolean_t 6026 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 6027 struct rwlock **lockp) 6028 { 6029 pd_entry_t newpde, oldpde; 6030 pt_entry_t *firstpte, newpte; 6031 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6032 vm_paddr_t mptepa; 6033 vm_page_t mpte; 6034 int PG_PTE_CACHE; 6035 bool in_kernel; 6036 6037 PG_A = pmap_accessed_bit(pmap); 6038 PG_G = pmap_global_bit(pmap); 6039 PG_M = pmap_modified_bit(pmap); 6040 PG_RW = pmap_rw_bit(pmap); 6041 PG_V = pmap_valid_bit(pmap); 6042 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6043 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6044 6045 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6046 in_kernel = va >= VM_MAXUSER_ADDRESS; 6047 oldpde = *pde; 6048 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 6049 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 6050 6051 /* 6052 * Invalidate the 2MB page mapping and return "failure" if the 6053 * mapping was never accessed. 6054 */ 6055 if ((oldpde & PG_A) == 0) { 6056 KASSERT((oldpde & PG_W) == 0, 6057 ("pmap_demote_pde: a wired mapping is missing PG_A")); 6058 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6059 return (FALSE); 6060 } 6061 6062 mpte = pmap_remove_pt_page(pmap, va); 6063 if (mpte == NULL) { 6064 KASSERT((oldpde & PG_W) == 0, 6065 ("pmap_demote_pde: page table page for a wired mapping" 6066 " is missing")); 6067 6068 /* 6069 * If the page table page is missing and the mapping 6070 * is for a kernel address, the mapping must belong to 6071 * the direct map. Page table pages are preallocated 6072 * for every other part of the kernel address space, 6073 * so the direct map region is the only part of the 6074 * kernel address space that must be handled here. 
6075 */ 6076 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 6077 va < DMAP_MAX_ADDRESS), 6078 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 6079 6080 /* 6081 * If the 2MB page mapping belongs to the direct map 6082 * region of the kernel's address space, then the page 6083 * allocation request specifies the highest possible 6084 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6085 * priority is normal. 6086 */ 6087 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 6088 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 6089 6090 /* 6091 * If the allocation of the new page table page fails, 6092 * invalidate the 2MB page mapping and return "failure". 6093 */ 6094 if (mpte == NULL) { 6095 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6096 return (FALSE); 6097 } 6098 6099 if (!in_kernel) 6100 mpte->ref_count = NPTEPG; 6101 } 6102 mptepa = VM_PAGE_TO_PHYS(mpte); 6103 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 6104 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 6105 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 6106 ("pmap_demote_pde: oldpde is missing PG_M")); 6107 newpte = oldpde & ~PG_PS; 6108 newpte = pmap_swap_pat(pmap, newpte); 6109 6110 /* 6111 * If the PTP is not leftover from an earlier promotion or it does not 6112 * have PG_A set in every PTE, then fill it. The new PTEs will all 6113 * have PG_A set. 6114 */ 6115 if (!vm_page_all_valid(mpte)) 6116 pmap_fill_ptp(firstpte, newpte); 6117 6118 pmap_demote_pde_check(firstpte, newpte); 6119 6120 /* 6121 * If the mapping has changed attributes, update the PTEs. 6122 */ 6123 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6124 pmap_fill_ptp(firstpte, newpte); 6125 6126 /* 6127 * The spare PV entries must be reserved prior to demoting the 6128 * mapping, that is, prior to changing the PDE. Otherwise, the state 6129 * of the PDE and the PV lists will be inconsistent, which can result 6130 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6131 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6132 * PV entry for the 2MB page mapping that is being demoted. 6133 */ 6134 if ((oldpde & PG_MANAGED) != 0) 6135 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6136 6137 /* 6138 * Demote the mapping. This pmap is locked. The old PDE has 6139 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6140 * set. Thus, there is no danger of a race with another 6141 * processor changing the setting of PG_A and/or PG_M between 6142 * the read above and the store below. 6143 */ 6144 if (workaround_erratum383) 6145 pmap_update_pde(pmap, va, pde, newpde); 6146 else 6147 pde_store(pde, newpde); 6148 6149 /* 6150 * Invalidate a stale recursive mapping of the page table page. 6151 */ 6152 if (in_kernel) 6153 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6154 6155 /* 6156 * Demote the PV entry. 6157 */ 6158 if ((oldpde & PG_MANAGED) != 0) 6159 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6160 6161 counter_u64_add(pmap_pde_demotions, 1); 6162 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6163 va, pmap); 6164 return (TRUE); 6165 } 6166 6167 /* 6168 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
6169 */ 6170 static void 6171 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6172 { 6173 pd_entry_t newpde; 6174 vm_paddr_t mptepa; 6175 vm_page_t mpte; 6176 6177 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6178 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6179 mpte = pmap_remove_pt_page(pmap, va); 6180 if (mpte == NULL) 6181 panic("pmap_remove_kernel_pde: Missing pt page."); 6182 6183 mptepa = VM_PAGE_TO_PHYS(mpte); 6184 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6185 6186 /* 6187 * If this page table page was unmapped by a promotion, then it 6188 * contains valid mappings. Zero it to invalidate those mappings. 6189 */ 6190 if (vm_page_any_valid(mpte)) 6191 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6192 6193 /* 6194 * Demote the mapping. 6195 */ 6196 if (workaround_erratum383) 6197 pmap_update_pde(pmap, va, pde, newpde); 6198 else 6199 pde_store(pde, newpde); 6200 6201 /* 6202 * Invalidate a stale recursive mapping of the page table page. 6203 */ 6204 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6205 } 6206 6207 /* 6208 * pmap_remove_pde: do the things to unmap a superpage in a process 6209 */ 6210 static int 6211 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6212 struct spglist *free, struct rwlock **lockp) 6213 { 6214 struct md_page *pvh; 6215 pd_entry_t oldpde; 6216 vm_offset_t eva, va; 6217 vm_page_t m, mpte; 6218 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6219 6220 PG_G = pmap_global_bit(pmap); 6221 PG_A = pmap_accessed_bit(pmap); 6222 PG_M = pmap_modified_bit(pmap); 6223 PG_RW = pmap_rw_bit(pmap); 6224 6225 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6226 KASSERT((sva & PDRMASK) == 0, 6227 ("pmap_remove_pde: sva is not 2mpage aligned")); 6228 oldpde = pte_load_clear(pdq); 6229 if (oldpde & PG_W) 6230 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6231 if ((oldpde & PG_G) != 0) 6232 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6233 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6234 if (oldpde & PG_MANAGED) { 6235 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6236 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6237 pmap_pvh_free(pvh, pmap, sva); 6238 eva = sva + NBPDR; 6239 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6240 va < eva; va += PAGE_SIZE, m++) { 6241 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6242 vm_page_dirty(m); 6243 if (oldpde & PG_A) 6244 vm_page_aflag_set(m, PGA_REFERENCED); 6245 if (TAILQ_EMPTY(&m->md.pv_list) && 6246 TAILQ_EMPTY(&pvh->pv_list)) 6247 vm_page_aflag_clear(m, PGA_WRITEABLE); 6248 pmap_delayed_invl_page(m); 6249 } 6250 } 6251 if (pmap == kernel_pmap) { 6252 pmap_remove_kernel_pde(pmap, pdq, sva); 6253 } else { 6254 mpte = pmap_remove_pt_page(pmap, sva); 6255 if (mpte != NULL) { 6256 KASSERT(vm_page_any_valid(mpte), 6257 ("pmap_remove_pde: pte page not promoted")); 6258 pmap_pt_page_count_adj(pmap, -1); 6259 KASSERT(mpte->ref_count == NPTEPG, 6260 ("pmap_remove_pde: pte page ref count error")); 6261 mpte->ref_count = 0; 6262 pmap_add_delayed_free_list(mpte, free, FALSE); 6263 } 6264 } 6265 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6266 } 6267 6268 /* 6269 * pmap_remove_pte: do the things to unmap a page in a process 6270 */ 6271 static int 6272 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6273 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6274 { 6275 struct md_page *pvh; 6276 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6277 vm_page_t m; 6278 6279 PG_A = pmap_accessed_bit(pmap); 6280 PG_M = 
pmap_modified_bit(pmap); 6281 PG_RW = pmap_rw_bit(pmap); 6282 6283 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6284 oldpte = pte_load_clear(ptq); 6285 if (oldpte & PG_W) 6286 pmap->pm_stats.wired_count -= 1; 6287 pmap_resident_count_adj(pmap, -1); 6288 if (oldpte & PG_MANAGED) { 6289 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6290 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6291 vm_page_dirty(m); 6292 if (oldpte & PG_A) 6293 vm_page_aflag_set(m, PGA_REFERENCED); 6294 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6295 pmap_pvh_free(&m->md, pmap, va); 6296 if (TAILQ_EMPTY(&m->md.pv_list) && 6297 (m->flags & PG_FICTITIOUS) == 0) { 6298 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6299 if (TAILQ_EMPTY(&pvh->pv_list)) 6300 vm_page_aflag_clear(m, PGA_WRITEABLE); 6301 } 6302 pmap_delayed_invl_page(m); 6303 } 6304 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6305 } 6306 6307 /* 6308 * Remove a single page from a process address space 6309 */ 6310 static void 6311 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6312 struct spglist *free) 6313 { 6314 struct rwlock *lock; 6315 pt_entry_t *pte, PG_V; 6316 6317 PG_V = pmap_valid_bit(pmap); 6318 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6319 if ((*pde & PG_V) == 0) 6320 return; 6321 pte = pmap_pde_to_pte(pde, va); 6322 if ((*pte & PG_V) == 0) 6323 return; 6324 lock = NULL; 6325 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6326 if (lock != NULL) 6327 rw_wunlock(lock); 6328 pmap_invalidate_page(pmap, va); 6329 } 6330 6331 /* 6332 * Removes the specified range of addresses from the page table page. 6333 */ 6334 static bool 6335 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6336 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6337 { 6338 pt_entry_t PG_G, *pte; 6339 vm_offset_t va; 6340 bool anyvalid; 6341 6342 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6343 PG_G = pmap_global_bit(pmap); 6344 anyvalid = false; 6345 va = eva; 6346 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6347 sva += PAGE_SIZE) { 6348 if (*pte == 0) { 6349 if (va != eva) { 6350 pmap_invalidate_range(pmap, va, sva); 6351 va = eva; 6352 } 6353 continue; 6354 } 6355 if ((*pte & PG_G) == 0) 6356 anyvalid = true; 6357 else if (va == eva) 6358 va = sva; 6359 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6360 sva += PAGE_SIZE; 6361 break; 6362 } 6363 } 6364 if (va != eva) 6365 pmap_invalidate_range(pmap, va, sva); 6366 return (anyvalid); 6367 } 6368 6369 static void 6370 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 6371 { 6372 struct rwlock *lock; 6373 vm_page_t mt; 6374 vm_offset_t va_next; 6375 pml5_entry_t *pml5e; 6376 pml4_entry_t *pml4e; 6377 pdp_entry_t *pdpe; 6378 pd_entry_t ptpaddr, *pde; 6379 pt_entry_t PG_G, PG_V; 6380 struct spglist free; 6381 int anyvalid; 6382 6383 PG_G = pmap_global_bit(pmap); 6384 PG_V = pmap_valid_bit(pmap); 6385 6386 /* 6387 * If there are no resident pages besides the top level page 6388 * table page(s), there is nothing to do. Kernel pmap always 6389 * accounts whole preloaded area as resident, which makes its 6390 * resident count > 2. 6391 * Perform an unsynchronized read. This is, however, safe. 6392 */ 6393 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 6394 1 : 0)) 6395 return; 6396 6397 anyvalid = 0; 6398 SLIST_INIT(&free); 6399 6400 pmap_delayed_invl_start(); 6401 PMAP_LOCK(pmap); 6402 if (map_delete) 6403 pmap_pkru_on_remove(pmap, sva, eva); 6404 6405 /* 6406 * special handling of removing one page. 
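 *
 * Editor's aside: the directory walk below repeatedly computes the
 * next entry boundary with "(sva + NBxxx) & ~xxxMASK", clamping to eva
 * when the addition wraps past the top of the address space.  A
 * stand-alone sketch of that step; the "sk_" name is hypothetical and
 * entry_bytes must be a power of two:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdint.h>

static uintptr_t
sk_next_boundary(uintptr_t sva, uintptr_t eva, uintptr_t entry_bytes)
{
	uintptr_t va_next;

	va_next = (sva + entry_bytes) & ~(entry_bytes - 1);
	if (va_next < sva)		/* wrapped around the top */
		va_next = eva;
	return (va_next);
}
#endif
/*
 * End of aside.  Removing one page is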
a very 6407 * common operation and easy to short circuit some 6408 * code. 6409 */ 6410 if (sva + PAGE_SIZE == eva) { 6411 pde = pmap_pde(pmap, sva); 6412 if (pde && (*pde & PG_PS) == 0) { 6413 pmap_remove_page(pmap, sva, pde, &free); 6414 goto out; 6415 } 6416 } 6417 6418 lock = NULL; 6419 for (; sva < eva; sva = va_next) { 6420 if (pmap->pm_stats.resident_count == 0) 6421 break; 6422 6423 if (pmap_is_la57(pmap)) { 6424 pml5e = pmap_pml5e(pmap, sva); 6425 if ((*pml5e & PG_V) == 0) { 6426 va_next = (sva + NBPML5) & ~PML5MASK; 6427 if (va_next < sva) 6428 va_next = eva; 6429 continue; 6430 } 6431 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6432 } else { 6433 pml4e = pmap_pml4e(pmap, sva); 6434 } 6435 if ((*pml4e & PG_V) == 0) { 6436 va_next = (sva + NBPML4) & ~PML4MASK; 6437 if (va_next < sva) 6438 va_next = eva; 6439 continue; 6440 } 6441 6442 va_next = (sva + NBPDP) & ~PDPMASK; 6443 if (va_next < sva) 6444 va_next = eva; 6445 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6446 if ((*pdpe & PG_V) == 0) 6447 continue; 6448 if ((*pdpe & PG_PS) != 0) { 6449 KASSERT(va_next <= eva, 6450 ("partial update of non-transparent 1G mapping " 6451 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6452 *pdpe, sva, eva, va_next)); 6453 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6454 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6455 anyvalid = 1; 6456 *pdpe = 0; 6457 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6458 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6459 pmap_unwire_ptp(pmap, sva, mt, &free); 6460 continue; 6461 } 6462 6463 /* 6464 * Calculate index for next page table. 6465 */ 6466 va_next = (sva + NBPDR) & ~PDRMASK; 6467 if (va_next < sva) 6468 va_next = eva; 6469 6470 pde = pmap_pdpe_to_pde(pdpe, sva); 6471 ptpaddr = *pde; 6472 6473 /* 6474 * Weed out invalid mappings. 6475 */ 6476 if (ptpaddr == 0) 6477 continue; 6478 6479 /* 6480 * Check for large page. 6481 */ 6482 if ((ptpaddr & PG_PS) != 0) { 6483 /* 6484 * Are we removing the entire large page? If not, 6485 * demote the mapping and fall through. 6486 */ 6487 if (sva + NBPDR == va_next && eva >= va_next) { 6488 /* 6489 * The TLB entry for a PG_G mapping is 6490 * invalidated by pmap_remove_pde(). 6491 */ 6492 if ((ptpaddr & PG_G) == 0) 6493 anyvalid = 1; 6494 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6495 continue; 6496 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6497 &lock)) { 6498 /* The large page mapping was destroyed. */ 6499 continue; 6500 } else 6501 ptpaddr = *pde; 6502 } 6503 6504 /* 6505 * Limit our scan to either the end of the va represented 6506 * by the current page table page, or to the end of the 6507 * range being removed. 6508 */ 6509 if (va_next > eva) 6510 va_next = eva; 6511 6512 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6513 anyvalid = 1; 6514 } 6515 if (lock != NULL) 6516 rw_wunlock(lock); 6517 out: 6518 if (anyvalid) 6519 pmap_invalidate_all(pmap); 6520 PMAP_UNLOCK(pmap); 6521 pmap_delayed_invl_finish(); 6522 vm_page_free_pages_toq(&free, true); 6523 } 6524 6525 /* 6526 * Remove the given range of addresses from the specified map. 6527 * 6528 * It is assumed that the start and end are properly 6529 * rounded to the page size. 6530 */ 6531 void 6532 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6533 { 6534 pmap_remove1(pmap, sva, eva, false); 6535 } 6536 6537 /* 6538 * Remove the given range of addresses as part of a logical unmap 6539 * operation. 
This has the effect of calling pmap_remove(), but 6540 * also clears any metadata that should persist for the lifetime 6541 * of a logical mapping. 6542 */ 6543 void 6544 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6545 { 6546 pmap_remove1(pmap, sva, eva, true); 6547 } 6548 6549 /* 6550 * Routine: pmap_remove_all 6551 * Function: 6552 * Removes this physical page from 6553 * all physical maps in which it resides. 6554 * Reflects back modify bits to the pager. 6555 * 6556 * Notes: 6557 * Original versions of this routine were very 6558 * inefficient because they iteratively called 6559 * pmap_remove (slow...) 6560 */ 6561 6562 void 6563 pmap_remove_all(vm_page_t m) 6564 { 6565 struct md_page *pvh; 6566 pv_entry_t pv; 6567 pmap_t pmap; 6568 struct rwlock *lock; 6569 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6570 pd_entry_t *pde; 6571 vm_offset_t va; 6572 struct spglist free; 6573 int pvh_gen, md_gen; 6574 6575 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6576 ("pmap_remove_all: page %p is not managed", m)); 6577 SLIST_INIT(&free); 6578 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6579 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6580 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6581 rw_wlock(lock); 6582 retry: 6583 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6584 pmap = PV_PMAP(pv); 6585 if (!PMAP_TRYLOCK(pmap)) { 6586 pvh_gen = pvh->pv_gen; 6587 rw_wunlock(lock); 6588 PMAP_LOCK(pmap); 6589 rw_wlock(lock); 6590 if (pvh_gen != pvh->pv_gen) { 6591 PMAP_UNLOCK(pmap); 6592 goto retry; 6593 } 6594 } 6595 va = pv->pv_va; 6596 pde = pmap_pde(pmap, va); 6597 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6598 PMAP_UNLOCK(pmap); 6599 } 6600 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6601 pmap = PV_PMAP(pv); 6602 if (!PMAP_TRYLOCK(pmap)) { 6603 pvh_gen = pvh->pv_gen; 6604 md_gen = m->md.pv_gen; 6605 rw_wunlock(lock); 6606 PMAP_LOCK(pmap); 6607 rw_wlock(lock); 6608 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6609 PMAP_UNLOCK(pmap); 6610 goto retry; 6611 } 6612 } 6613 PG_A = pmap_accessed_bit(pmap); 6614 PG_M = pmap_modified_bit(pmap); 6615 PG_RW = pmap_rw_bit(pmap); 6616 pmap_resident_count_adj(pmap, -1); 6617 pde = pmap_pde(pmap, pv->pv_va); 6618 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6619 " a 2mpage in page %p's pv list", m)); 6620 pte = pmap_pde_to_pte(pde, pv->pv_va); 6621 tpte = pte_load_clear(pte); 6622 if (tpte & PG_W) 6623 pmap->pm_stats.wired_count--; 6624 if (tpte & PG_A) 6625 vm_page_aflag_set(m, PGA_REFERENCED); 6626 6627 /* 6628 * Update the vm_page_t clean and reference bits. 
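 *
 * Editor's aside: the lock-order dance above deserves spelling out.
 * The pv list lock is held, the pmap lock is wanted, and the safe
 * acquisition order is pmap-then-list; so on a failed trylock the code
 * records the list's generation count, drops the list lock, takes both
 * locks in order, and restarts if the list changed in the window.  A
 * stand-alone sketch with pthread mutexes ("sk_" names hypothetical):
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <pthread.h>

struct sk_pvlist {
	pthread_mutex_t	lock;
	unsigned	gen;	/* bumped on every list mutation */
};

static void
sk_lock_both(struct sk_pvlist *l, pthread_mutex_t *pmap_lock)
{
	unsigned gen;

retry:
	pthread_mutex_lock(&l->lock);
	if (pthread_mutex_trylock(pmap_lock) != 0) {
		gen = l->gen;
		pthread_mutex_unlock(&l->lock);
		pthread_mutex_lock(pmap_lock);	/* may sleep; safe order */
		pthread_mutex_lock(&l->lock);
		if (gen != l->gen) {		/* list changed meanwhile */
			pthread_mutex_unlock(pmap_lock);
			pthread_mutex_unlock(&l->lock);
			goto retry;
		}
	}
	/* Both locks held; the first list entry may now be used. */
}
#endif
/*
 * End of aside.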
6629 */ 6630 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6631 vm_page_dirty(m); 6632 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6633 pmap_invalidate_page(pmap, pv->pv_va); 6634 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6635 m->md.pv_gen++; 6636 free_pv_entry(pmap, pv); 6637 PMAP_UNLOCK(pmap); 6638 } 6639 vm_page_aflag_clear(m, PGA_WRITEABLE); 6640 rw_wunlock(lock); 6641 pmap_delayed_invl_wait(m); 6642 vm_page_free_pages_toq(&free, true); 6643 } 6644 6645 /* 6646 * pmap_protect_pde: do the things to protect a 2mpage in a process 6647 */ 6648 static boolean_t 6649 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6650 { 6651 pd_entry_t newpde, oldpde; 6652 vm_page_t m, mt; 6653 boolean_t anychanged; 6654 pt_entry_t PG_G, PG_M, PG_RW; 6655 6656 PG_G = pmap_global_bit(pmap); 6657 PG_M = pmap_modified_bit(pmap); 6658 PG_RW = pmap_rw_bit(pmap); 6659 6660 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6661 KASSERT((sva & PDRMASK) == 0, 6662 ("pmap_protect_pde: sva is not 2mpage aligned")); 6663 anychanged = FALSE; 6664 retry: 6665 oldpde = newpde = *pde; 6666 if ((prot & VM_PROT_WRITE) == 0) { 6667 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6668 (PG_MANAGED | PG_M | PG_RW)) { 6669 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6670 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6671 vm_page_dirty(mt); 6672 } 6673 newpde &= ~(PG_RW | PG_M); 6674 } 6675 if ((prot & VM_PROT_EXECUTE) == 0) 6676 newpde |= pg_nx; 6677 if (newpde != oldpde) { 6678 /* 6679 * As an optimization to future operations on this PDE, clear 6680 * PG_PROMOTED. The impending invalidation will remove any 6681 * lingering 4KB page mappings from the TLB. 6682 */ 6683 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6684 goto retry; 6685 if ((oldpde & PG_G) != 0) 6686 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6687 else 6688 anychanged = TRUE; 6689 } 6690 return (anychanged); 6691 } 6692 6693 /* 6694 * Set the physical protection on the 6695 * specified range of this map as requested. 6696 */ 6697 void 6698 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6699 { 6700 vm_page_t m; 6701 vm_offset_t va_next; 6702 pml4_entry_t *pml4e; 6703 pdp_entry_t *pdpe; 6704 pd_entry_t ptpaddr, *pde; 6705 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6706 pt_entry_t obits, pbits; 6707 boolean_t anychanged; 6708 6709 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6710 if (prot == VM_PROT_NONE) { 6711 pmap_remove(pmap, sva, eva); 6712 return; 6713 } 6714 6715 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6716 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6717 return; 6718 6719 PG_G = pmap_global_bit(pmap); 6720 PG_M = pmap_modified_bit(pmap); 6721 PG_V = pmap_valid_bit(pmap); 6722 PG_RW = pmap_rw_bit(pmap); 6723 anychanged = FALSE; 6724 6725 /* 6726 * Although this function delays and batches the invalidation 6727 * of stale TLB entries, it does not need to call 6728 * pmap_delayed_invl_start() and 6729 * pmap_delayed_invl_finish(), because it does not 6730 * ordinarily destroy mappings. Stale TLB entries from 6731 * protection-only changes need only be invalidated before the 6732 * pmap lock is released, because protection-only changes do 6733 * not destroy PV entries. Even operations that iterate over 6734 * a physical page's PV list of mappings, like 6735 * pmap_remove_write(), acquire the pmap lock for each 6736 * mapping. Consequently, for protection-only changes, the 6737 * pmap lock suffices to synchronize both page table and TLB 6738 * updates. 
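 *
 * Editor's aside: the retry loops in pmap_protect_pde() above and
 * pmap_protect() below follow a common lock-free idiom: snapshot the
 * entry, compute the new value, and redo the whole computation if the
 * compare-and-set loses to a concurrent update (for example the CPU
 * setting PG_A or PG_M in hardware).  A stand-alone C11 sketch; the
 * "sk_" name is hypothetical:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdatomic.h>
#include <stdint.h>

static void
sk_clear_bits(_Atomic uint64_t *pte, uint64_t bits)
{
	uint64_t obits, pbits;

	for (;;) {
		obits = pbits = atomic_load(pte);
		pbits &= ~bits;
		if (pbits == obits)
			break;			/* nothing to change */
		if (atomic_compare_exchange_strong(pte, &obits, pbits))
			break;			/* update published */
		/* Lost a race; loop and recompute from a fresh value. */
	}
}
#endif
/*
 * End of aside.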
6739 * 6740 * This function only destroys a mapping if pmap_demote_pde() 6741 * fails. In that case, stale TLB entries are immediately 6742 * invalidated. 6743 */ 6744 6745 PMAP_LOCK(pmap); 6746 for (; sva < eva; sva = va_next) { 6747 pml4e = pmap_pml4e(pmap, sva); 6748 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6749 va_next = (sva + NBPML4) & ~PML4MASK; 6750 if (va_next < sva) 6751 va_next = eva; 6752 continue; 6753 } 6754 6755 va_next = (sva + NBPDP) & ~PDPMASK; 6756 if (va_next < sva) 6757 va_next = eva; 6758 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6759 if ((*pdpe & PG_V) == 0) 6760 continue; 6761 if ((*pdpe & PG_PS) != 0) { 6762 KASSERT(va_next <= eva, 6763 ("partial update of non-transparent 1G mapping " 6764 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6765 *pdpe, sva, eva, va_next)); 6766 retry_pdpe: 6767 obits = pbits = *pdpe; 6768 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6769 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6770 if ((prot & VM_PROT_WRITE) == 0) 6771 pbits &= ~(PG_RW | PG_M); 6772 if ((prot & VM_PROT_EXECUTE) == 0) 6773 pbits |= pg_nx; 6774 6775 if (pbits != obits) { 6776 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6777 /* PG_PS cannot be cleared under us, */ 6778 goto retry_pdpe; 6779 anychanged = TRUE; 6780 } 6781 continue; 6782 } 6783 6784 va_next = (sva + NBPDR) & ~PDRMASK; 6785 if (va_next < sva) 6786 va_next = eva; 6787 6788 pde = pmap_pdpe_to_pde(pdpe, sva); 6789 ptpaddr = *pde; 6790 6791 /* 6792 * Weed out invalid mappings. 6793 */ 6794 if (ptpaddr == 0) 6795 continue; 6796 6797 /* 6798 * Check for large page. 6799 */ 6800 if ((ptpaddr & PG_PS) != 0) { 6801 /* 6802 * Are we protecting the entire large page? If not, 6803 * demote the mapping and fall through. 6804 */ 6805 if (sva + NBPDR == va_next && eva >= va_next) { 6806 /* 6807 * The TLB entry for a PG_G mapping is 6808 * invalidated by pmap_protect_pde(). 6809 */ 6810 if (pmap_protect_pde(pmap, pde, sva, prot)) 6811 anychanged = TRUE; 6812 continue; 6813 } else if (!pmap_demote_pde(pmap, pde, sva)) { 6814 /* 6815 * The large page mapping was destroyed. 6816 */ 6817 continue; 6818 } 6819 } 6820 6821 if (va_next > eva) 6822 va_next = eva; 6823 6824 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6825 sva += PAGE_SIZE) { 6826 retry: 6827 obits = pbits = *pte; 6828 if ((pbits & PG_V) == 0) 6829 continue; 6830 6831 if ((prot & VM_PROT_WRITE) == 0) { 6832 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6833 (PG_MANAGED | PG_M | PG_RW)) { 6834 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6835 vm_page_dirty(m); 6836 } 6837 pbits &= ~(PG_RW | PG_M); 6838 } 6839 if ((prot & VM_PROT_EXECUTE) == 0) 6840 pbits |= pg_nx; 6841 6842 if (pbits != obits) { 6843 if (!atomic_cmpset_long(pte, obits, pbits)) 6844 goto retry; 6845 if (obits & PG_G) 6846 pmap_invalidate_page(pmap, sva); 6847 else 6848 anychanged = TRUE; 6849 } 6850 } 6851 } 6852 if (anychanged) 6853 pmap_invalidate_all(pmap); 6854 PMAP_UNLOCK(pmap); 6855 } 6856 6857 static bool 6858 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6859 { 6860 6861 if (pmap->pm_type != PT_EPT) 6862 return (false); 6863 return ((pde & EPT_PG_EXECUTE) != 0); 6864 } 6865 6866 #if VM_NRESERVLEVEL > 0 6867 /* 6868 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6869 * single page table page (PTP) to a single 2MB page mapping. For promotion 6870 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6871 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6872 * identical characteristics. 
6873 */ 6874 static bool 6875 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, 6876 struct rwlock **lockp) 6877 { 6878 pd_entry_t newpde; 6879 pt_entry_t *firstpte, oldpte, pa, *pte; 6880 pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6881 int PG_PTE_CACHE; 6882 6883 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6884 if (!pmap_ps_enabled(pmap)) 6885 return (false); 6886 6887 PG_A = pmap_accessed_bit(pmap); 6888 PG_G = pmap_global_bit(pmap); 6889 PG_M = pmap_modified_bit(pmap); 6890 PG_V = pmap_valid_bit(pmap); 6891 PG_RW = pmap_rw_bit(pmap); 6892 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6893 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6894 6895 /* 6896 * Examine the first PTE in the specified PTP. Abort if this PTE is 6897 * ineligible for promotion due to hardware errata, invalid, or does 6898 * not map the first 4KB physical page within a 2MB page. 6899 */ 6900 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6901 newpde = *firstpte; 6902 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) 6903 return (false); 6904 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 6905 counter_u64_add(pmap_pde_p_failures, 1); 6906 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6907 " in pmap %p", va, pmap); 6908 return (false); 6909 } 6910 6911 /* 6912 * Both here and in the below "for" loop, to allow for repromotion 6913 * after MADV_FREE, conditionally write protect a clean PTE before 6914 * possibly aborting the promotion due to other PTE attributes. Why? 6915 * Suppose that MADV_FREE is applied to a part of a superpage, the 6916 * address range [S, E). pmap_advise() will demote the superpage 6917 * mapping, destroy the 4KB page mapping at the end of [S, E), and 6918 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 6919 * imagine that the memory in [S, E) is recycled, but the last 4KB 6920 * page in [S, E) is not the last to be rewritten, or simply accessed. 6921 * In other words, there is still a 4KB page in [S, E), call it P, 6922 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 6923 * we write protect P before aborting the promotion, if and when P is 6924 * finally rewritten, there won't be a page fault to trigger 6925 * repromotion. 6926 */ 6927 setpde: 6928 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6929 /* 6930 * When PG_M is already clear, PG_RW can be cleared without 6931 * a TLB invalidation. 6932 */ 6933 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6934 goto setpde; 6935 newpde &= ~PG_RW; 6936 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6937 " in pmap %p", va & ~PDRMASK, pmap); 6938 } 6939 6940 /* 6941 * Examine each of the other PTEs in the specified PTP. Abort if this 6942 * PTE maps an unexpected 4KB physical page or does not have identical 6943 * characteristics to the first PTE. 6944 */ 6945 allpte_PG_A = newpde & PG_A; 6946 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 6947 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6948 oldpte = *pte; 6949 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 6950 counter_u64_add(pmap_pde_p_failures, 1); 6951 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6952 " in pmap %p", va, pmap); 6953 return (false); 6954 } 6955 setpte: 6956 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6957 /* 6958 * When PG_M is already clear, PG_RW can be cleared 6959 * without a TLB invalidation. 
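 *
 * Editor's aside: stripped of the PG_A/PG_M subtleties handled above,
 * the promotability test is "512 PTEs, consecutive frames, identical
 * attributes", scanned from the last entry downward so the expected
 * physical address can simply count down.  A stand-alone sketch; the
 * "sk_" names are hypothetical and the frame mask follows x86's
 * 52-bit physical-address layout:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdbool.h>
#include <stdint.h>

#define	SK_NPTEPG	512
#define	SK_PAGE_SIZE	4096UL
#define	SK_FRAME	0x000ffffffffff000UL

static bool
sk_promotable(const uint64_t *firstpte)
{
	const uint64_t *pte;
	uint64_t pa;

	pa = (firstpte[0] & SK_FRAME) +
	    (SK_NPTEPG - 1) * SK_PAGE_SIZE;	/* expected last frame */
	for (pte = firstpte + SK_NPTEPG - 1; pte > firstpte; pte--) {
		if ((*pte & SK_FRAME) != pa ||
		    (*pte & ~SK_FRAME) != (firstpte[0] & ~SK_FRAME))
			return (false);		/* hole or mixed attrs */
		pa -= SK_PAGE_SIZE;
	}
	return (true);
}
#endif
/*
 * End of aside.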
6960 */ 6961 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6962 goto setpte; 6963 oldpte &= ~PG_RW; 6964 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6965 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6966 (va & ~PDRMASK), pmap); 6967 } 6968 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 6969 counter_u64_add(pmap_pde_p_failures, 1); 6970 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6971 " in pmap %p", va, pmap); 6972 return (false); 6973 } 6974 allpte_PG_A &= oldpte; 6975 pa -= PAGE_SIZE; 6976 } 6977 6978 /* 6979 * Unless all PTEs have PG_A set, clear it from the superpage mapping, 6980 * so that promotions triggered by speculative mappings, such as 6981 * pmap_enter_quick(), don't automatically mark the underlying pages 6982 * as referenced. 6983 */ 6984 newpde &= ~PG_A | allpte_PG_A; 6985 6986 /* 6987 * EPT PTEs with PG_M set and PG_A clear are not supported by early 6988 * MMUs supporting EPT. 6989 */ 6990 KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde), 6991 ("unsupported EPT PTE")); 6992 6993 /* 6994 * Save the PTP in its current state until the PDE mapping the 6995 * superpage is demoted by pmap_demote_pde() or destroyed by 6996 * pmap_remove_pde(). If PG_A is not set in every PTE, then request 6997 * that the PTP be refilled on demotion. 6998 */ 6999 if (mpte == NULL) 7000 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7001 KASSERT(mpte >= vm_page_array && 7002 mpte < &vm_page_array[vm_page_array_size], 7003 ("pmap_promote_pde: page table page is out of range")); 7004 KASSERT(mpte->pindex == pmap_pde_pindex(va), 7005 ("pmap_promote_pde: page table page's pindex is wrong " 7006 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 7007 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 7008 if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) { 7009 counter_u64_add(pmap_pde_p_failures, 1); 7010 CTR2(KTR_PMAP, 7011 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 7012 pmap); 7013 return (false); 7014 } 7015 7016 /* 7017 * Promote the pv entries. 7018 */ 7019 if ((newpde & PG_MANAGED) != 0) 7020 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 7021 7022 /* 7023 * Propagate the PAT index to its proper position. 7024 */ 7025 newpde = pmap_swap_pat(pmap, newpde); 7026 7027 /* 7028 * Map the superpage. 
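 *
 * Editor's aside: the earlier "newpde &= ~PG_A | allpte_PG_A" is a
 * branch-free conditional clear: allpte_PG_A is either 0 or PG_A, so
 * the right-hand side is either ~PG_A (strip the bit) or all ones
 * (keep everything).  Spelled out as a stand-alone sketch; the "sk_"
 * name is hypothetical:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdint.h>

/* "cond" must be either 0 or exactly "bit". */
static uint64_t
sk_keep_bit_if(uint64_t x, uint64_t bit, uint64_t cond)
{
	return (x & (~bit | cond));	/* clears "bit" only when cond == 0 */
}
#endif
/*
 * End of aside.  The single PDE store that maps the superpage: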
7029 */ 7030 if (workaround_erratum383) 7031 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 7032 else 7033 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 7034 7035 counter_u64_add(pmap_pde_promotions, 1); 7036 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 7037 " in pmap %p", va, pmap); 7038 return (true); 7039 } 7040 #endif /* VM_NRESERVLEVEL > 0 */ 7041 7042 static int 7043 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 7044 int psind) 7045 { 7046 vm_page_t mp; 7047 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 7048 7049 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7050 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 7051 ("psind %d unexpected", psind)); 7052 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 7053 ("unaligned phys address %#lx newpte %#lx psind %d", 7054 newpte & PG_FRAME, newpte, psind)); 7055 KASSERT((va & (pagesizes[psind] - 1)) == 0, 7056 ("unaligned va %#lx psind %d", va, psind)); 7057 KASSERT(va < VM_MAXUSER_ADDRESS, 7058 ("kernel mode non-transparent superpage")); /* XXXKIB */ 7059 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 7060 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 7061 7062 PG_V = pmap_valid_bit(pmap); 7063 7064 restart: 7065 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) 7066 return (KERN_PROTECTION_FAILURE); 7067 pten = newpte; 7068 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7069 pten |= pmap_pkru_get(pmap, va); 7070 7071 if (psind == 2) { /* 1G */ 7072 pml4e = pmap_pml4e(pmap, va); 7073 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7074 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 7075 NULL, va); 7076 if (mp == NULL) 7077 goto allocf; 7078 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7079 pdpe = &pdpe[pmap_pdpe_index(va)]; 7080 origpte = *pdpe; 7081 MPASS(origpte == 0); 7082 } else { 7083 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 7084 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 7085 origpte = *pdpe; 7086 if ((origpte & PG_V) == 0) { 7087 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7088 mp->ref_count++; 7089 } 7090 } 7091 *pdpe = pten; 7092 } else /* (psind == 1) */ { /* 2M */ 7093 pde = pmap_pde(pmap, va); 7094 if (pde == NULL) { 7095 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 7096 NULL, va); 7097 if (mp == NULL) 7098 goto allocf; 7099 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7100 pde = &pde[pmap_pde_index(va)]; 7101 origpte = *pde; 7102 MPASS(origpte == 0); 7103 } else { 7104 origpte = *pde; 7105 if ((origpte & PG_V) == 0) { 7106 pdpe = pmap_pdpe(pmap, va); 7107 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); 7108 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 7109 mp->ref_count++; 7110 } 7111 } 7112 *pde = pten; 7113 } 7114 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 7115 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 7116 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 7117 va, psind == 2 ? 
"1G" : "2M", origpte, pten)); 7118 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 7119 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 7120 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 7121 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 7122 if ((origpte & PG_V) == 0) 7123 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 7124 7125 return (KERN_SUCCESS); 7126 7127 allocf: 7128 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 7129 return (KERN_RESOURCE_SHORTAGE); 7130 PMAP_UNLOCK(pmap); 7131 vm_wait(NULL); 7132 PMAP_LOCK(pmap); 7133 goto restart; 7134 } 7135 7136 /* 7137 * Insert the given physical page (p) at 7138 * the specified virtual address (v) in the 7139 * target physical map with the protection requested. 7140 * 7141 * If specified, the page will be wired down, meaning 7142 * that the related pte can not be reclaimed. 7143 * 7144 * NB: This is the only routine which MAY NOT lazy-evaluate 7145 * or lose information. That is, this routine must actually 7146 * insert this page into the given map NOW. 7147 * 7148 * When destroying both a page table and PV entry, this function 7149 * performs the TLB invalidation before releasing the PV list 7150 * lock, so we do not need pmap_delayed_invl_page() calls here. 7151 */ 7152 int 7153 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7154 u_int flags, int8_t psind) 7155 { 7156 struct rwlock *lock; 7157 pd_entry_t *pde; 7158 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 7159 pt_entry_t newpte, origpte; 7160 pv_entry_t pv; 7161 vm_paddr_t opa, pa; 7162 vm_page_t mpte, om; 7163 int rv; 7164 boolean_t nosleep; 7165 7166 PG_A = pmap_accessed_bit(pmap); 7167 PG_G = pmap_global_bit(pmap); 7168 PG_M = pmap_modified_bit(pmap); 7169 PG_V = pmap_valid_bit(pmap); 7170 PG_RW = pmap_rw_bit(pmap); 7171 7172 va = trunc_page(va); 7173 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7174 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7175 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7176 va)); 7177 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7178 ("pmap_enter: managed mapping within the clean submap")); 7179 if ((m->oflags & VPO_UNMANAGED) == 0) 7180 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7181 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7182 ("pmap_enter: flags %u has reserved bits set", flags)); 7183 pa = VM_PAGE_TO_PHYS(m); 7184 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7185 if ((flags & VM_PROT_WRITE) != 0) 7186 newpte |= PG_M; 7187 if ((prot & VM_PROT_WRITE) != 0) 7188 newpte |= PG_RW; 7189 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7190 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7191 if ((prot & VM_PROT_EXECUTE) == 0) 7192 newpte |= pg_nx; 7193 if ((flags & PMAP_ENTER_WIRED) != 0) 7194 newpte |= PG_W; 7195 if (va < VM_MAXUSER_ADDRESS) 7196 newpte |= PG_U; 7197 if (pmap == kernel_pmap) 7198 newpte |= PG_G; 7199 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7200 7201 /* 7202 * Set modified bit gratuitously for writeable mappings if 7203 * the page is unmanaged. We do not want to take a fault 7204 * to do the dirty bit accounting for these mappings. 
7205 */ 7206 if ((m->oflags & VPO_UNMANAGED) != 0) { 7207 if ((newpte & PG_RW) != 0) 7208 newpte |= PG_M; 7209 } else 7210 newpte |= PG_MANAGED; 7211 7212 lock = NULL; 7213 PMAP_LOCK(pmap); 7214 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7215 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7216 ("managed largepage va %#lx flags %#x", va, flags)); 7217 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7218 psind); 7219 goto out; 7220 } 7221 if (psind == 1) { 7222 /* Assert the required virtual and physical alignment. */ 7223 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7224 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7225 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7226 goto out; 7227 } 7228 mpte = NULL; 7229 7230 /* 7231 * In the case that a page table page is not 7232 * resident, we are creating it here. 7233 */ 7234 retry: 7235 pde = pmap_pde(pmap, va); 7236 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7237 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7238 pte = pmap_pde_to_pte(pde, va); 7239 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7240 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7241 mpte->ref_count++; 7242 } 7243 } else if (va < VM_MAXUSER_ADDRESS) { 7244 /* 7245 * Here if the pte page isn't mapped, or if it has been 7246 * deallocated. 7247 */ 7248 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7249 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7250 nosleep ? NULL : &lock, va); 7251 if (mpte == NULL && nosleep) { 7252 rv = KERN_RESOURCE_SHORTAGE; 7253 goto out; 7254 } 7255 goto retry; 7256 } else 7257 panic("pmap_enter: invalid page directory va=%#lx", va); 7258 7259 origpte = *pte; 7260 pv = NULL; 7261 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7262 newpte |= pmap_pkru_get(pmap, va); 7263 7264 /* 7265 * Is the specified virtual address already mapped? 7266 */ 7267 if ((origpte & PG_V) != 0) { 7268 /* 7269 * Wiring change, just update stats. We don't worry about 7270 * wiring PT pages as they remain resident as long as there 7271 * are valid mappings in them. Hence, if a user page is wired, 7272 * the PT page will be also. 7273 */ 7274 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7275 pmap->pm_stats.wired_count++; 7276 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7277 pmap->pm_stats.wired_count--; 7278 7279 /* 7280 * Remove the extra PT page reference. 7281 */ 7282 if (mpte != NULL) { 7283 mpte->ref_count--; 7284 KASSERT(mpte->ref_count > 0, 7285 ("pmap_enter: missing reference to page table page," 7286 " va: 0x%lx", va)); 7287 } 7288 7289 /* 7290 * Has the physical page changed? 7291 */ 7292 opa = origpte & PG_FRAME; 7293 if (opa == pa) { 7294 /* 7295 * No, might be a protection or wiring change. 7296 */ 7297 if ((origpte & PG_MANAGED) != 0 && 7298 (newpte & PG_RW) != 0) 7299 vm_page_aflag_set(m, PGA_WRITEABLE); 7300 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7301 goto unchanged; 7302 goto validate; 7303 } 7304 7305 /* 7306 * The physical page has changed. Temporarily invalidate 7307 * the mapping. This ensures that all threads sharing the 7308 * pmap keep a consistent view of the mapping, which is 7309 * necessary for the correct handling of COW faults. It 7310 * also permits reuse of the old mapping's PV entry, 7311 * avoiding an allocation. 7312 * 7313 * For consistency, handle unmanaged mappings the same way. 
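 *
 * Editor's aside: pte_load_clear(), used just below, is an atomic
 * exchange: every CPU sees either the complete old mapping or no
 * mapping, never a blend of old and new, and the returned bits (PG_M,
 * PG_A) are then folded into the vm_page's state.  In C11 terms, with
 * a hypothetical "sk_" name:
 */
#if 0	/* illustrative sketch only; not part of pmap.c */
#include <stdatomic.h>
#include <stdint.h>

static uint64_t
sk_pte_load_clear(_Atomic uint64_t *pte)
{
	return (atomic_exchange(pte, 0));	/* break before make */
}
#endif
/*
 * End of aside.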
7314 */ 7315 origpte = pte_load_clear(pte); 7316 KASSERT((origpte & PG_FRAME) == opa, 7317 ("pmap_enter: unexpected pa update for %#lx", va)); 7318 if ((origpte & PG_MANAGED) != 0) { 7319 om = PHYS_TO_VM_PAGE(opa); 7320 7321 /* 7322 * The pmap lock is sufficient to synchronize with 7323 * concurrent calls to pmap_page_test_mappings() and 7324 * pmap_ts_referenced(). 7325 */ 7326 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7327 vm_page_dirty(om); 7328 if ((origpte & PG_A) != 0) { 7329 pmap_invalidate_page(pmap, va); 7330 vm_page_aflag_set(om, PGA_REFERENCED); 7331 } 7332 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7333 pv = pmap_pvh_remove(&om->md, pmap, va); 7334 KASSERT(pv != NULL, 7335 ("pmap_enter: no PV entry for %#lx", va)); 7336 if ((newpte & PG_MANAGED) == 0) 7337 free_pv_entry(pmap, pv); 7338 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7339 TAILQ_EMPTY(&om->md.pv_list) && 7340 ((om->flags & PG_FICTITIOUS) != 0 || 7341 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7342 vm_page_aflag_clear(om, PGA_WRITEABLE); 7343 } else { 7344 /* 7345 * Since this mapping is unmanaged, assume that PG_A 7346 * is set. 7347 */ 7348 pmap_invalidate_page(pmap, va); 7349 } 7350 origpte = 0; 7351 } else { 7352 /* 7353 * Increment the counters. 7354 */ 7355 if ((newpte & PG_W) != 0) 7356 pmap->pm_stats.wired_count++; 7357 pmap_resident_count_adj(pmap, 1); 7358 } 7359 7360 /* 7361 * Enter on the PV list if part of our managed memory. 7362 */ 7363 if ((newpte & PG_MANAGED) != 0) { 7364 if (pv == NULL) { 7365 pv = get_pv_entry(pmap, &lock); 7366 pv->pv_va = va; 7367 } 7368 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7369 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7370 m->md.pv_gen++; 7371 if ((newpte & PG_RW) != 0) 7372 vm_page_aflag_set(m, PGA_WRITEABLE); 7373 } 7374 7375 /* 7376 * Update the PTE. 7377 */ 7378 if ((origpte & PG_V) != 0) { 7379 validate: 7380 origpte = pte_load_store(pte, newpte); 7381 KASSERT((origpte & PG_FRAME) == pa, 7382 ("pmap_enter: unexpected pa update for %#lx", va)); 7383 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7384 (PG_M | PG_RW)) { 7385 if ((origpte & PG_MANAGED) != 0) 7386 vm_page_dirty(m); 7387 7388 /* 7389 * Although the PTE may still have PG_RW set, TLB 7390 * invalidation may nonetheless be required because 7391 * the PTE no longer has PG_M set. 7392 */ 7393 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7394 /* 7395 * This PTE change does not require TLB invalidation. 7396 */ 7397 goto unchanged; 7398 } 7399 if ((origpte & PG_A) != 0) 7400 pmap_invalidate_page(pmap, va); 7401 } else 7402 pte_store(pte, newpte); 7403 7404 unchanged: 7405 7406 #if VM_NRESERVLEVEL > 0 7407 /* 7408 * If both the page table page and the reservation are fully 7409 * populated, then attempt promotion. 7410 */ 7411 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7412 (m->flags & PG_FICTITIOUS) == 0 && 7413 vm_reserv_level_iffullpop(m) == 0) 7414 (void)pmap_promote_pde(pmap, pde, va, mpte, &lock); 7415 #endif 7416 7417 rv = KERN_SUCCESS; 7418 out: 7419 if (lock != NULL) 7420 rw_wunlock(lock); 7421 PMAP_UNLOCK(pmap); 7422 return (rv); 7423 } 7424 7425 /* 7426 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 7427 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 7428 * value. See pmap_enter_pde() for the possible error values when "no sleep", 7429 * "no replace", and "no reclaim" are specified. 
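 *
 * Illustrative caller sketch, modeled on pmap_enter_object()
 * below (local names are hypothetical):
 *
 *	rv = pmap_enter_2mpage(pmap, va, m, prot, &lock);
 *	if (rv == KERN_SUCCESS || rv == KERN_NO_SPACE)
 *		m = &m[NBPDR / PAGE_SIZE - 1];	(2MB range consumed)
 *	else
 *		(fall back to a 4KB mapping of "m")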
7430 */ 7431 static int 7432 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7433 struct rwlock **lockp) 7434 { 7435 pd_entry_t newpde; 7436 pt_entry_t PG_V; 7437 7438 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7439 PG_V = pmap_valid_bit(pmap); 7440 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 7441 PG_PS | PG_V; 7442 if ((m->oflags & VPO_UNMANAGED) == 0) 7443 newpde |= PG_MANAGED; 7444 if ((prot & VM_PROT_EXECUTE) == 0) 7445 newpde |= pg_nx; 7446 if (va < VM_MAXUSER_ADDRESS) 7447 newpde |= PG_U; 7448 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7449 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 7450 } 7451 7452 /* 7453 * Returns true if every page table entry in the specified page table page is 7454 * zero. 7455 */ 7456 static bool 7457 pmap_every_pte_zero(vm_paddr_t pa) 7458 { 7459 pt_entry_t *pt_end, *pte; 7460 7461 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7462 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7463 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7464 if (*pte != 0) 7465 return (false); 7466 } 7467 return (true); 7468 } 7469 7470 /* 7471 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7472 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 7473 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. Returns 7474 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB 7475 * page mapping already exists within the 2MB virtual address range starting 7476 * at the specified virtual address or (2) the requested 2MB page mapping is 7477 * not supported due to hardware errata. Returns KERN_NO_SPACE if 7478 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at 7479 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU 7480 * settings are not the same across the 2MB virtual address range starting at 7481 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either 7482 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation 7483 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation 7484 * failed. 7485 * 7486 * The parameter "m" is only used when creating a managed, writeable mapping. 7487 */ 7488 static int 7489 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7490 vm_page_t m, struct rwlock **lockp) 7491 { 7492 struct spglist free; 7493 pd_entry_t oldpde, *pde; 7494 pt_entry_t PG_G, PG_RW, PG_V; 7495 vm_page_t mt, pdpg; 7496 7497 KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, 7498 ("pmap_enter_pde: cannot create wired user mapping")); 7499 PG_G = pmap_global_bit(pmap); 7500 PG_RW = pmap_rw_bit(pmap); 7501 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7502 ("pmap_enter_pde: newpde is missing PG_M")); 7503 PG_V = pmap_valid_bit(pmap); 7504 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7505 7506 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7507 newpde))) { 7508 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7509 " in pmap %p", va, pmap); 7510 return (KERN_FAILURE); 7511 } 7512 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7513 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7514 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7515 " in pmap %p", va, pmap); 7516 return (KERN_RESOURCE_SHORTAGE); 7517 } 7518 7519 /* 7520 * If pkru is not same for the whole pde range, return failure 7521 * and let vm_fault() cope. 
Check after pde allocation, since 7522 * it could sleep. 7523 */ 7524 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 7525 pmap_abort_ptp(pmap, va, pdpg); 7526 return (KERN_PROTECTION_FAILURE); 7527 } 7528 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 7529 newpde &= ~X86_PG_PKU_MASK; 7530 newpde |= pmap_pkru_get(pmap, va); 7531 } 7532 7533 /* 7534 * If there are existing mappings, either abort or remove them. 7535 */ 7536 oldpde = *pde; 7537 if ((oldpde & PG_V) != 0) { 7538 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7539 ("pmap_enter_pde: pdpg's reference count is too low")); 7540 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 7541 if ((oldpde & PG_PS) != 0) { 7542 if (pdpg != NULL) 7543 pdpg->ref_count--; 7544 CTR2(KTR_PMAP, 7545 "pmap_enter_pde: no space for va %#lx" 7546 " in pmap %p", va, pmap); 7547 return (KERN_NO_SPACE); 7548 } else if (va < VM_MAXUSER_ADDRESS || 7549 !pmap_every_pte_zero(oldpde & PG_FRAME)) { 7550 if (pdpg != NULL) 7551 pdpg->ref_count--; 7552 CTR2(KTR_PMAP, 7553 "pmap_enter_pde: failure for va %#lx" 7554 " in pmap %p", va, pmap); 7555 return (KERN_FAILURE); 7556 } 7557 } 7558 /* Break the existing mapping(s). */ 7559 SLIST_INIT(&free); 7560 if ((oldpde & PG_PS) != 0) { 7561 /* 7562 * The reference to the PD page that was acquired by 7563 * pmap_alloc_pde() ensures that it won't be freed. 7564 * However, if the PDE resulted from a promotion, then 7565 * a reserved PT page could be freed. 7566 */ 7567 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7568 if ((oldpde & PG_G) == 0) 7569 pmap_invalidate_pde_page(pmap, va, oldpde); 7570 } else { 7571 pmap_delayed_invl_start(); 7572 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7573 lockp)) 7574 pmap_invalidate_all(pmap); 7575 pmap_delayed_invl_finish(); 7576 } 7577 if (va < VM_MAXUSER_ADDRESS) { 7578 vm_page_free_pages_toq(&free, true); 7579 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7580 pde)); 7581 } else { 7582 KASSERT(SLIST_EMPTY(&free), 7583 ("pmap_enter_pde: freed kernel page table page")); 7584 7585 /* 7586 * Both pmap_remove_pde() and pmap_remove_ptes() will 7587 * leave the kernel page table page zero filled. 7588 */ 7589 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7590 if (pmap_insert_pt_page(pmap, mt, false, false)) 7591 panic("pmap_enter_pde: trie insert failed"); 7592 } 7593 } 7594 7595 if ((newpde & PG_MANAGED) != 0) { 7596 /* 7597 * Abort this mapping if its PV entry could not be created. 7598 */ 7599 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7600 if (pdpg != NULL) 7601 pmap_abort_ptp(pmap, va, pdpg); 7602 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7603 " in pmap %p", va, pmap); 7604 return (KERN_RESOURCE_SHORTAGE); 7605 } 7606 if ((newpde & PG_RW) != 0) { 7607 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7608 vm_page_aflag_set(mt, PGA_WRITEABLE); 7609 } 7610 } 7611 7612 /* 7613 * Increment counters. 7614 */ 7615 if ((newpde & PG_W) != 0) 7616 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7617 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7618 7619 /* 7620 * Map the superpage. (This is not a promoted mapping; there will not 7621 * be any lingering 4KB page mappings in the TLB.) 7622 */ 7623 pde_store(pde, newpde); 7624 7625 counter_u64_add(pmap_pde_mappings, 1); 7626 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7627 va, pmap); 7628 return (KERN_SUCCESS); 7629 } 7630 7631 /* 7632 * Maps a sequence of resident pages belonging to the same object. 7633 * The sequence begins with the given page m_start. 
This page is 7634 * mapped at the given virtual address start. Each subsequent page is 7635 * mapped at a virtual address that is offset from start by the same 7636 * amount as the page is offset from m_start within the object. The 7637 * last page in the sequence is the page with the largest offset from 7638 * m_start that can be mapped at a virtual address less than the given 7639 * virtual address end. Not every virtual page between start and end 7640 * is mapped; only those for which a resident page exists with the 7641 * corresponding offset from m_start are mapped. 7642 */ 7643 void 7644 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7645 vm_page_t m_start, vm_prot_t prot) 7646 { 7647 struct rwlock *lock; 7648 vm_offset_t va; 7649 vm_page_t m, mpte; 7650 vm_pindex_t diff, psize; 7651 int rv; 7652 7653 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7654 7655 psize = atop(end - start); 7656 mpte = NULL; 7657 m = m_start; 7658 lock = NULL; 7659 PMAP_LOCK(pmap); 7660 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7661 va = start + ptoa(diff); 7662 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7663 m->psind == 1 && pmap_ps_enabled(pmap) && 7664 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 7665 KERN_SUCCESS || rv == KERN_NO_SPACE)) 7666 m = &m[NBPDR / PAGE_SIZE - 1]; 7667 else 7668 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7669 mpte, &lock); 7670 m = TAILQ_NEXT(m, listq); 7671 } 7672 if (lock != NULL) 7673 rw_wunlock(lock); 7674 PMAP_UNLOCK(pmap); 7675 } 7676 7677 /* 7678 * This code makes some *MAJOR* assumptions: 7679 * 1. The current pmap and the given pmap both exist. 7680 * 2. The mapping is not wired. 7681 * 3. Only read access is required. 7682 * 4. It never sleeps to allocate a page table page. 7683 * In exchange, it is *MUCH* faster than pmap_enter()... 7684 */ 7685 7686 void 7687 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7688 { 7689 struct rwlock *lock; 7690 7691 lock = NULL; 7692 PMAP_LOCK(pmap); 7693 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7694 if (lock != NULL) 7695 rw_wunlock(lock); 7696 PMAP_UNLOCK(pmap); 7697 } 7698 7699 static vm_page_t 7700 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7701 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7702 { 7703 pd_entry_t *pde; 7704 pt_entry_t newpte, *pte, PG_V; 7705 7706 KASSERT(!VA_IS_CLEANMAP(va) || 7707 (m->oflags & VPO_UNMANAGED) != 0, 7708 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7709 PG_V = pmap_valid_bit(pmap); 7710 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7711 pde = NULL; 7712 7713 /* 7714 * In the case that a page table page is not 7715 * resident, we are creating it here. 7716 */ 7717 if (va < VM_MAXUSER_ADDRESS) { 7718 pdp_entry_t *pdpe; 7719 vm_pindex_t ptepindex; 7720 7721 /* 7722 * Calculate the page table page index. 7723 */ 7724 ptepindex = pmap_pde_pindex(va); 7725 if (mpte && (mpte->pindex == ptepindex)) { 7726 mpte->ref_count++; 7727 } else { 7728 /* 7729 * If the page table page is mapped, we just increment 7730 * its reference count and activate it. Otherwise, we 7731 * attempt to allocate a page table page, passing NULL 7732 * instead of the PV list lock pointer because we don't 7733 * intend to sleep. If this attempt fails, we don't 7734 * retry. Instead, we give up.
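 *
 * (Aside, sketch only: the "mpte" argument is a hint carried
 * between consecutive calls by pmap_enter_object().  When
 * successive pages share a page table page, the hint enables
 * the mpte->ref_count++ shortcut above without another page
 * table walk.)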
7735 */ 7736 pdpe = pmap_pdpe(pmap, va); 7737 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 7738 if ((*pdpe & PG_PS) != 0) 7739 return (NULL); 7740 pde = pmap_pdpe_to_pde(pdpe, va); 7741 if ((*pde & PG_V) != 0) { 7742 if ((*pde & PG_PS) != 0) 7743 return (NULL); 7744 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7745 mpte->ref_count++; 7746 } else { 7747 mpte = pmap_allocpte_alloc(pmap, 7748 ptepindex, NULL, va); 7749 if (mpte == NULL) 7750 return (NULL); 7751 } 7752 } else { 7753 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7754 NULL, va); 7755 if (mpte == NULL) 7756 return (NULL); 7757 } 7758 } 7759 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7760 pte = &pte[pmap_pte_index(va)]; 7761 } else { 7762 mpte = NULL; 7763 pte = vtopte(va); 7764 } 7765 if (*pte) { 7766 if (mpte != NULL) 7767 mpte->ref_count--; 7768 return (NULL); 7769 } 7770 7771 /* 7772 * Enter on the PV list if part of our managed memory. 7773 */ 7774 if ((m->oflags & VPO_UNMANAGED) == 0 && 7775 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7776 if (mpte != NULL) 7777 pmap_abort_ptp(pmap, va, mpte); 7778 return (NULL); 7779 } 7780 7781 /* 7782 * Increment counters 7783 */ 7784 pmap_resident_count_adj(pmap, 1); 7785 7786 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7787 pmap_cache_bits(pmap, m->md.pat_mode, 0); 7788 if ((m->oflags & VPO_UNMANAGED) == 0) 7789 newpte |= PG_MANAGED; 7790 if ((prot & VM_PROT_EXECUTE) == 0) 7791 newpte |= pg_nx; 7792 if (va < VM_MAXUSER_ADDRESS) 7793 newpte |= PG_U | pmap_pkru_get(pmap, va); 7794 pte_store(pte, newpte); 7795 7796 #if VM_NRESERVLEVEL > 0 7797 /* 7798 * If both the PTP and the reservation are fully populated, then 7799 * attempt promotion. 7800 */ 7801 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7802 (m->flags & PG_FICTITIOUS) == 0 && 7803 vm_reserv_level_iffullpop(m) == 0) { 7804 if (pde == NULL) 7805 pde = pmap_pde(pmap, va); 7806 7807 /* 7808 * If promotion succeeds, then the next call to this function 7809 * should not be given the unmapped PTP as a hint. 7810 */ 7811 if (pmap_promote_pde(pmap, pde, va, mpte, lockp)) 7812 mpte = NULL; 7813 } 7814 #endif 7815 7816 return (mpte); 7817 } 7818 7819 /* 7820 * Make a temporary mapping for a physical address. This is only intended 7821 * to be used for panic dumps. 7822 */ 7823 void * 7824 pmap_kenter_temporary(vm_paddr_t pa, int i) 7825 { 7826 vm_offset_t va; 7827 7828 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7829 pmap_kenter(va, pa); 7830 pmap_invlpg(kernel_pmap, va); 7831 return ((void *)crashdumpmap); 7832 } 7833 7834 /* 7835 * This code maps large physical mmap regions into the 7836 * processor address space. Note that some shortcuts 7837 * are taken, but the code works. 
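 *
 * Hedged usage sketch (hypothetical arguments): premapping a
 * 2MB-aligned device object whose size is a multiple of NBPDR,
 *
 *	pmap_object_init_pt(map->pmap, addr, obj, pindex, size);
 *
 * If "addr" or "size" is not a multiple of NBPDR, or if 2MB
 * pages are disabled, the function quietly does nothing and the
 * pages are instead mapped on demand.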
7838 */ 7839 void 7840 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7841 vm_pindex_t pindex, vm_size_t size) 7842 { 7843 pd_entry_t *pde; 7844 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7845 vm_paddr_t pa, ptepa; 7846 vm_page_t p, pdpg; 7847 int pat_mode; 7848 7849 PG_A = pmap_accessed_bit(pmap); 7850 PG_M = pmap_modified_bit(pmap); 7851 PG_V = pmap_valid_bit(pmap); 7852 PG_RW = pmap_rw_bit(pmap); 7853 7854 VM_OBJECT_ASSERT_WLOCKED(object); 7855 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7856 ("pmap_object_init_pt: non-device object")); 7857 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7858 if (!pmap_ps_enabled(pmap)) 7859 return; 7860 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7861 return; 7862 p = vm_page_lookup(object, pindex); 7863 KASSERT(vm_page_all_valid(p), 7864 ("pmap_object_init_pt: invalid page %p", p)); 7865 pat_mode = p->md.pat_mode; 7866 7867 /* 7868 * Abort the mapping if the first page is not physically 7869 * aligned to a 2MB page boundary. 7870 */ 7871 ptepa = VM_PAGE_TO_PHYS(p); 7872 if (ptepa & (NBPDR - 1)) 7873 return; 7874 7875 /* 7876 * Skip the first page. Abort the mapping if the rest of 7877 * the pages are not physically contiguous or have differing 7878 * memory attributes. 7879 */ 7880 p = TAILQ_NEXT(p, listq); 7881 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7882 pa += PAGE_SIZE) { 7883 KASSERT(vm_page_all_valid(p), 7884 ("pmap_object_init_pt: invalid page %p", p)); 7885 if (pa != VM_PAGE_TO_PHYS(p) || 7886 pat_mode != p->md.pat_mode) 7887 return; 7888 p = TAILQ_NEXT(p, listq); 7889 } 7890 7891 /* 7892 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7893 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7894 * will not affect the termination of this loop. 7895 */ 7896 PMAP_LOCK(pmap); 7897 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 7898 pa < ptepa + size; pa += NBPDR) { 7899 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7900 if (pde == NULL) { 7901 /* 7902 * The creation of mappings below is only an 7903 * optimization. If a page directory page 7904 * cannot be allocated without blocking, 7905 * continue on to the next mapping rather than 7906 * blocking. 7907 */ 7908 addr += NBPDR; 7909 continue; 7910 } 7911 if ((*pde & PG_V) == 0) { 7912 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7913 PG_U | PG_RW | PG_V); 7914 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7915 counter_u64_add(pmap_pde_mappings, 1); 7916 } else { 7917 /* Continue on if the PDE is already valid. */ 7918 pdpg->ref_count--; 7919 KASSERT(pdpg->ref_count > 0, 7920 ("pmap_object_init_pt: missing reference " 7921 "to page directory page, va: 0x%lx", addr)); 7922 } 7923 addr += NBPDR; 7924 } 7925 PMAP_UNLOCK(pmap); 7926 } 7927 } 7928 7929 /* 7930 * Clear the wired attribute from the mappings for the specified range of 7931 * addresses in the given pmap. Every valid mapping within that range 7932 * must have the wired attribute set. In contrast, invalid mappings 7933 * cannot have the wired attribute set, so they are ignored. 7934 * 7935 * The wired attribute of the page table entry is not a hardware 7936 * feature, so there is no need to invalidate any TLB entries. 7937 * Since pmap_demote_pde() for the wired entry must never fail, 7938 * pmap_delayed_invl_start()/finish() calls around the 7939 * function are not needed. 
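 *
 * Illustrative call (sketch; compare vm_map_entry_unwire()): the
 * caller guarantees that every valid mapping in the range is
 * wired,
 *
 *	pmap_unwire(map->pmap, entry->start, entry->end);
 *
 * which is why a missing PG_W below is treated as a panic rather
 * than a recoverable error.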
7940 */ 7941 void 7942 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7943 { 7944 vm_offset_t va_next; 7945 pml4_entry_t *pml4e; 7946 pdp_entry_t *pdpe; 7947 pd_entry_t *pde; 7948 pt_entry_t *pte, PG_V, PG_G __diagused; 7949 7950 PG_V = pmap_valid_bit(pmap); 7951 PG_G = pmap_global_bit(pmap); 7952 PMAP_LOCK(pmap); 7953 for (; sva < eva; sva = va_next) { 7954 pml4e = pmap_pml4e(pmap, sva); 7955 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7956 va_next = (sva + NBPML4) & ~PML4MASK; 7957 if (va_next < sva) 7958 va_next = eva; 7959 continue; 7960 } 7961 7962 va_next = (sva + NBPDP) & ~PDPMASK; 7963 if (va_next < sva) 7964 va_next = eva; 7965 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7966 if ((*pdpe & PG_V) == 0) 7967 continue; 7968 if ((*pdpe & PG_PS) != 0) { 7969 KASSERT(va_next <= eva, 7970 ("partial update of non-transparent 1G mapping " 7971 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7972 *pdpe, sva, eva, va_next)); 7973 MPASS(pmap != kernel_pmap); /* XXXKIB */ 7974 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 7975 atomic_clear_long(pdpe, PG_W); 7976 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 7977 continue; 7978 } 7979 7980 va_next = (sva + NBPDR) & ~PDRMASK; 7981 if (va_next < sva) 7982 va_next = eva; 7983 pde = pmap_pdpe_to_pde(pdpe, sva); 7984 if ((*pde & PG_V) == 0) 7985 continue; 7986 if ((*pde & PG_PS) != 0) { 7987 if ((*pde & PG_W) == 0) 7988 panic("pmap_unwire: pde %#jx is missing PG_W", 7989 (uintmax_t)*pde); 7990 7991 /* 7992 * Are we unwiring the entire large page? If not, 7993 * demote the mapping and fall through. 7994 */ 7995 if (sva + NBPDR == va_next && eva >= va_next) { 7996 atomic_clear_long(pde, PG_W); 7997 pmap->pm_stats.wired_count -= NBPDR / 7998 PAGE_SIZE; 7999 continue; 8000 } else if (!pmap_demote_pde(pmap, pde, sva)) 8001 panic("pmap_unwire: demotion failed"); 8002 } 8003 if (va_next > eva) 8004 va_next = eva; 8005 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 8006 sva += PAGE_SIZE) { 8007 if ((*pte & PG_V) == 0) 8008 continue; 8009 if ((*pte & PG_W) == 0) 8010 panic("pmap_unwire: pte %#jx is missing PG_W", 8011 (uintmax_t)*pte); 8012 8013 /* 8014 * PG_W must be cleared atomically. Although the pmap 8015 * lock synchronizes access to PG_W, another processor 8016 * could be setting PG_M and/or PG_A concurrently. 8017 */ 8018 atomic_clear_long(pte, PG_W); 8019 pmap->pm_stats.wired_count--; 8020 } 8021 } 8022 PMAP_UNLOCK(pmap); 8023 } 8024 8025 /* 8026 * Copy the range specified by src_addr/len 8027 * from the source map to the range dst_addr/len 8028 * in the destination map. 8029 * 8030 * This routine is only advisory and need not do anything. 8031 */ 8032 void 8033 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 8034 vm_offset_t src_addr) 8035 { 8036 struct rwlock *lock; 8037 pml4_entry_t *pml4e; 8038 pdp_entry_t *pdpe; 8039 pd_entry_t *pde, srcptepaddr; 8040 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 8041 vm_offset_t addr, end_addr, va_next; 8042 vm_page_t dst_pdpg, dstmpte, srcmpte; 8043 8044 if (dst_addr != src_addr) 8045 return; 8046 8047 if (dst_pmap->pm_type != src_pmap->pm_type) 8048 return; 8049 8050 /* 8051 * EPT page table entries that require emulation of A/D bits are 8052 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 8053 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 8054 * (aka EPT_PG_EXECUTE) could still be set. 
Since some EPT 8055 * implementations flag an EPT misconfiguration for exec-only 8056 * mappings we skip this function entirely for emulated pmaps. 8057 */ 8058 if (pmap_emulate_ad_bits(dst_pmap)) 8059 return; 8060 8061 end_addr = src_addr + len; 8062 lock = NULL; 8063 if (dst_pmap < src_pmap) { 8064 PMAP_LOCK(dst_pmap); 8065 PMAP_LOCK(src_pmap); 8066 } else { 8067 PMAP_LOCK(src_pmap); 8068 PMAP_LOCK(dst_pmap); 8069 } 8070 8071 PG_A = pmap_accessed_bit(dst_pmap); 8072 PG_M = pmap_modified_bit(dst_pmap); 8073 PG_V = pmap_valid_bit(dst_pmap); 8074 8075 for (addr = src_addr; addr < end_addr; addr = va_next) { 8076 KASSERT(addr < UPT_MIN_ADDRESS, 8077 ("pmap_copy: invalid to pmap_copy page tables")); 8078 8079 pml4e = pmap_pml4e(src_pmap, addr); 8080 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 8081 va_next = (addr + NBPML4) & ~PML4MASK; 8082 if (va_next < addr) 8083 va_next = end_addr; 8084 continue; 8085 } 8086 8087 va_next = (addr + NBPDP) & ~PDPMASK; 8088 if (va_next < addr) 8089 va_next = end_addr; 8090 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 8091 if ((*pdpe & PG_V) == 0) 8092 continue; 8093 if ((*pdpe & PG_PS) != 0) { 8094 KASSERT(va_next <= end_addr, 8095 ("partial update of non-transparent 1G mapping " 8096 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8097 *pdpe, addr, end_addr, va_next)); 8098 MPASS((addr & PDPMASK) == 0); 8099 MPASS((*pdpe & PG_MANAGED) == 0); 8100 srcptepaddr = *pdpe; 8101 pdpe = pmap_pdpe(dst_pmap, addr); 8102 if (pdpe == NULL) { 8103 if (pmap_allocpte_alloc(dst_pmap, 8104 pmap_pml4e_pindex(addr), NULL, addr) == 8105 NULL) 8106 break; 8107 pdpe = pmap_pdpe(dst_pmap, addr); 8108 } else { 8109 pml4e = pmap_pml4e(dst_pmap, addr); 8110 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 8111 dst_pdpg->ref_count++; 8112 } 8113 KASSERT(*pdpe == 0, 8114 ("1G mapping present in dst pmap " 8115 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8116 *pdpe, addr, end_addr, va_next)); 8117 *pdpe = srcptepaddr & ~PG_W; 8118 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 8119 continue; 8120 } 8121 8122 va_next = (addr + NBPDR) & ~PDRMASK; 8123 if (va_next < addr) 8124 va_next = end_addr; 8125 8126 pde = pmap_pdpe_to_pde(pdpe, addr); 8127 srcptepaddr = *pde; 8128 if (srcptepaddr == 0) 8129 continue; 8130 8131 if (srcptepaddr & PG_PS) { 8132 /* 8133 * We can only virtual copy whole superpages. 8134 */ 8135 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 8136 continue; 8137 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 8138 if (pde == NULL) 8139 break; 8140 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 8141 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 8142 PMAP_ENTER_NORECLAIM, &lock))) { 8143 /* 8144 * We leave the dirty bit unchanged because 8145 * managed read/write superpage mappings are 8146 * required to be dirty. However, managed 8147 * superpage mappings are not required to 8148 * have their accessed bit set, so we clear 8149 * it because we don't know if this mapping 8150 * will be used. 
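 *
 * In sketch form (the code below performs this on srcptepaddr in
 * place; "src_pde" and "dst_pde" are illustrative names):
 *
 *	dst_pde = src_pde & ~PG_W;	(copies are never wired)
 *	if ((src_pde & PG_MANAGED) != 0)
 *		dst_pde &= ~PG_A;	(let hardware re-set it)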
8151 */ 8152 srcptepaddr &= ~PG_W; 8153 if ((srcptepaddr & PG_MANAGED) != 0) 8154 srcptepaddr &= ~PG_A; 8155 *pde = srcptepaddr; 8156 pmap_resident_count_adj(dst_pmap, NBPDR / 8157 PAGE_SIZE); 8158 counter_u64_add(pmap_pde_mappings, 1); 8159 } else 8160 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 8161 continue; 8162 } 8163 8164 srcptepaddr &= PG_FRAME; 8165 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 8166 KASSERT(srcmpte->ref_count > 0, 8167 ("pmap_copy: source page table page is unused")); 8168 8169 if (va_next > end_addr) 8170 va_next = end_addr; 8171 8172 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 8173 src_pte = &src_pte[pmap_pte_index(addr)]; 8174 dstmpte = NULL; 8175 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 8176 ptetemp = *src_pte; 8177 8178 /* 8179 * We only virtual copy managed pages. 8180 */ 8181 if ((ptetemp & PG_MANAGED) == 0) 8182 continue; 8183 8184 if (dstmpte != NULL) { 8185 KASSERT(dstmpte->pindex == 8186 pmap_pde_pindex(addr), 8187 ("dstmpte pindex/addr mismatch")); 8188 dstmpte->ref_count++; 8189 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 8190 NULL)) == NULL) 8191 goto out; 8192 dst_pte = (pt_entry_t *) 8193 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 8194 dst_pte = &dst_pte[pmap_pte_index(addr)]; 8195 if (*dst_pte == 0 && 8196 pmap_try_insert_pv_entry(dst_pmap, addr, 8197 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 8198 /* 8199 * Clear the wired, modified, and accessed 8200 * (referenced) bits during the copy. 8201 */ 8202 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 8203 pmap_resident_count_adj(dst_pmap, 1); 8204 } else { 8205 pmap_abort_ptp(dst_pmap, addr, dstmpte); 8206 goto out; 8207 } 8208 /* Have we copied all of the valid mappings? */ 8209 if (dstmpte->ref_count >= srcmpte->ref_count) 8210 break; 8211 } 8212 } 8213 out: 8214 if (lock != NULL) 8215 rw_wunlock(lock); 8216 PMAP_UNLOCK(src_pmap); 8217 PMAP_UNLOCK(dst_pmap); 8218 } 8219 8220 int 8221 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8222 { 8223 int error; 8224 8225 if (dst_pmap->pm_type != src_pmap->pm_type || 8226 dst_pmap->pm_type != PT_X86 || 8227 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8228 return (0); 8229 for (;;) { 8230 if (dst_pmap < src_pmap) { 8231 PMAP_LOCK(dst_pmap); 8232 PMAP_LOCK(src_pmap); 8233 } else { 8234 PMAP_LOCK(src_pmap); 8235 PMAP_LOCK(dst_pmap); 8236 } 8237 error = pmap_pkru_copy(dst_pmap, src_pmap); 8238 /* Clean up partial copy on failure due to no memory. */ 8239 if (error == ENOMEM) 8240 pmap_pkru_deassign_all(dst_pmap); 8241 PMAP_UNLOCK(src_pmap); 8242 PMAP_UNLOCK(dst_pmap); 8243 if (error != ENOMEM) 8244 break; 8245 vm_wait(NULL); 8246 } 8247 return (error); 8248 } 8249 8250 /* 8251 * Zero the specified hardware page. 8252 */ 8253 void 8254 pmap_zero_page(vm_page_t m) 8255 { 8256 vm_offset_t va; 8257 8258 #ifdef TSLOG_PAGEZERO 8259 TSENTER(); 8260 #endif 8261 va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8262 pagezero((void *)va); 8263 #ifdef TSLOG_PAGEZERO 8264 TSEXIT(); 8265 #endif 8266 } 8267 8268 /* 8269 * Zero an area within a single hardware page. off and size must not 8270 * cover an area beyond a single hardware page. 8271 */ 8272 void 8273 pmap_zero_page_area(vm_page_t m, int off, int size) 8274 { 8275 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8276 8277 if (off == 0 && size == PAGE_SIZE) 8278 pagezero((void *)va); 8279 else 8280 bzero((char *)va + off, size); 8281 } 8282 8283 /* 8284 * Copy 1 specified hardware page to another. 
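 *
 * Both pages are reached through the direct map, so no transient
 * kernel mappings are needed; the body below is effectively
 *
 *	pagecopy((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)),
 *	    (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)));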
8285 */ 8286 void 8287 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8288 { 8289 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8290 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8291 8292 pagecopy((void *)src, (void *)dst); 8293 } 8294 8295 int unmapped_buf_allowed = 1; 8296 8297 void 8298 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8299 vm_offset_t b_offset, int xfersize) 8300 { 8301 void *a_cp, *b_cp; 8302 vm_page_t pages[2]; 8303 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8304 int cnt; 8305 boolean_t mapped; 8306 8307 while (xfersize > 0) { 8308 a_pg_offset = a_offset & PAGE_MASK; 8309 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8310 b_pg_offset = b_offset & PAGE_MASK; 8311 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8312 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8313 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8314 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 8315 a_cp = (char *)vaddr[0] + a_pg_offset; 8316 b_cp = (char *)vaddr[1] + b_pg_offset; 8317 bcopy(a_cp, b_cp, cnt); 8318 if (__predict_false(mapped)) 8319 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 8320 a_offset += cnt; 8321 b_offset += cnt; 8322 xfersize -= cnt; 8323 } 8324 } 8325 8326 /* 8327 * Returns true if the pmap's pv is one of the first 8328 * 16 pvs linked to from this page. This count may 8329 * be changed upwards or downwards in the future; it 8330 * is only necessary that true be returned for a small 8331 * subset of pmaps for proper page aging. 8332 */ 8333 boolean_t 8334 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8335 { 8336 struct md_page *pvh; 8337 struct rwlock *lock; 8338 pv_entry_t pv; 8339 int loops = 0; 8340 boolean_t rv; 8341 8342 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8343 ("pmap_page_exists_quick: page %p is not managed", m)); 8344 rv = FALSE; 8345 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8346 rw_rlock(lock); 8347 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8348 if (PV_PMAP(pv) == pmap) { 8349 rv = TRUE; 8350 break; 8351 } 8352 loops++; 8353 if (loops >= 16) 8354 break; 8355 } 8356 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8357 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8358 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8359 if (PV_PMAP(pv) == pmap) { 8360 rv = TRUE; 8361 break; 8362 } 8363 loops++; 8364 if (loops >= 16) 8365 break; 8366 } 8367 } 8368 rw_runlock(lock); 8369 return (rv); 8370 } 8371 8372 /* 8373 * pmap_page_wired_mappings: 8374 * 8375 * Return the number of managed mappings to the given physical page 8376 * that are wired. 
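 *
 * The scan below uses the standard pv-list lock-retry idiom
 * (sketch):
 *
 *	if (!PMAP_TRYLOCK(pmap)) {
 *		md_gen = m->md.pv_gen;	(snapshot the generation)
 *		rw_runlock(lock);
 *		PMAP_LOCK(pmap);	(blocking acquire, safe order)
 *		rw_rlock(lock);
 *		if (md_gen != m->md.pv_gen) {
 *			PMAP_UNLOCK(pmap);
 *			goto restart;	(list changed; rescan)
 *		}
 *	}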
8377 */ 8378 int 8379 pmap_page_wired_mappings(vm_page_t m) 8380 { 8381 struct rwlock *lock; 8382 struct md_page *pvh; 8383 pmap_t pmap; 8384 pt_entry_t *pte; 8385 pv_entry_t pv; 8386 int count, md_gen, pvh_gen; 8387 8388 if ((m->oflags & VPO_UNMANAGED) != 0) 8389 return (0); 8390 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8391 rw_rlock(lock); 8392 restart: 8393 count = 0; 8394 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8395 pmap = PV_PMAP(pv); 8396 if (!PMAP_TRYLOCK(pmap)) { 8397 md_gen = m->md.pv_gen; 8398 rw_runlock(lock); 8399 PMAP_LOCK(pmap); 8400 rw_rlock(lock); 8401 if (md_gen != m->md.pv_gen) { 8402 PMAP_UNLOCK(pmap); 8403 goto restart; 8404 } 8405 } 8406 pte = pmap_pte(pmap, pv->pv_va); 8407 if ((*pte & PG_W) != 0) 8408 count++; 8409 PMAP_UNLOCK(pmap); 8410 } 8411 if ((m->flags & PG_FICTITIOUS) == 0) { 8412 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8413 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8414 pmap = PV_PMAP(pv); 8415 if (!PMAP_TRYLOCK(pmap)) { 8416 md_gen = m->md.pv_gen; 8417 pvh_gen = pvh->pv_gen; 8418 rw_runlock(lock); 8419 PMAP_LOCK(pmap); 8420 rw_rlock(lock); 8421 if (md_gen != m->md.pv_gen || 8422 pvh_gen != pvh->pv_gen) { 8423 PMAP_UNLOCK(pmap); 8424 goto restart; 8425 } 8426 } 8427 pte = pmap_pde(pmap, pv->pv_va); 8428 if ((*pte & PG_W) != 0) 8429 count++; 8430 PMAP_UNLOCK(pmap); 8431 } 8432 } 8433 rw_runlock(lock); 8434 return (count); 8435 } 8436 8437 /* 8438 * Returns TRUE if the given page is mapped individually or as part of 8439 * a 2mpage. Otherwise, returns FALSE. 8440 */ 8441 boolean_t 8442 pmap_page_is_mapped(vm_page_t m) 8443 { 8444 struct rwlock *lock; 8445 boolean_t rv; 8446 8447 if ((m->oflags & VPO_UNMANAGED) != 0) 8448 return (FALSE); 8449 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8450 rw_rlock(lock); 8451 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8452 ((m->flags & PG_FICTITIOUS) == 0 && 8453 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8454 rw_runlock(lock); 8455 return (rv); 8456 } 8457 8458 /* 8459 * Destroy all managed, non-wired mappings in the given user-space 8460 * pmap. This pmap cannot be active on any processor besides the 8461 * caller. 8462 * 8463 * This function cannot be applied to the kernel pmap. Moreover, it 8464 * is not intended for general use. It is only to be used during 8465 * process termination. Consequently, it can be implemented in ways 8466 * that make it faster than pmap_remove(). First, it can more quickly 8467 * destroy mappings by iterating over the pmap's collection of PV 8468 * entries, rather than searching the page table. Second, it doesn't 8469 * have to test and clear the page table entries atomically, because 8470 * no processor is currently accessing the user address space. In 8471 * particular, a page table entry's dirty bit won't change state once 8472 * this function starts. 8473 * 8474 * Although this function destroys all of the pmap's managed, 8475 * non-wired mappings, it can delay and batch the invalidation of TLB 8476 * entries without calling pmap_delayed_invl_start() and 8477 * pmap_delayed_invl_finish(). Because the pmap is not active on 8478 * any other processor, none of these TLB entries will ever be used 8479 * before their eventual invalidation. Consequently, there is no need 8480 * for either pmap_remove_all() or pmap_remove_write() to wait for 8481 * that eventual TLB invalidation. 
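 *
 * Illustrative call site (sketch; compare vmspace_dofree()): by
 * the time the final reference to a vmspace is dropped, no other
 * CPU can be using its pmap, so
 *
 *	pmap_remove_pages(vmspace_pmap(vm));
 *
 * may run under the relaxed rules described above.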
8482 */ 8483 void 8484 pmap_remove_pages(pmap_t pmap) 8485 { 8486 pd_entry_t ptepde; 8487 pt_entry_t *pte, tpte; 8488 pt_entry_t PG_M, PG_RW, PG_V; 8489 struct spglist free; 8490 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8491 vm_page_t m, mpte, mt; 8492 pv_entry_t pv; 8493 struct md_page *pvh; 8494 struct pv_chunk *pc, *npc; 8495 struct rwlock *lock; 8496 int64_t bit; 8497 uint64_t inuse, bitmask; 8498 int allfree, field, i, idx; 8499 #ifdef PV_STATS 8500 int freed; 8501 #endif 8502 boolean_t superpage; 8503 vm_paddr_t pa; 8504 8505 /* 8506 * Assert that the given pmap is only active on the current 8507 * CPU. Unfortunately, we cannot block another CPU from 8508 * activating the pmap while this function is executing. 8509 */ 8510 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8511 #ifdef INVARIANTS 8512 { 8513 cpuset_t other_cpus; 8514 8515 other_cpus = all_cpus; 8516 critical_enter(); 8517 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8518 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 8519 critical_exit(); 8520 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8521 } 8522 #endif 8523 8524 lock = NULL; 8525 PG_M = pmap_modified_bit(pmap); 8526 PG_V = pmap_valid_bit(pmap); 8527 PG_RW = pmap_rw_bit(pmap); 8528 8529 for (i = 0; i < PMAP_MEMDOM; i++) 8530 TAILQ_INIT(&free_chunks[i]); 8531 SLIST_INIT(&free); 8532 PMAP_LOCK(pmap); 8533 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8534 allfree = 1; 8535 #ifdef PV_STATS 8536 freed = 0; 8537 #endif 8538 for (field = 0; field < _NPCM; field++) { 8539 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8540 while (inuse != 0) { 8541 bit = bsfq(inuse); 8542 bitmask = 1UL << bit; 8543 idx = field * 64 + bit; 8544 pv = &pc->pc_pventry[idx]; 8545 inuse &= ~bitmask; 8546 8547 pte = pmap_pdpe(pmap, pv->pv_va); 8548 ptepde = *pte; 8549 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8550 tpte = *pte; 8551 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8552 superpage = FALSE; 8553 ptepde = tpte; 8554 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8555 PG_FRAME); 8556 pte = &pte[pmap_pte_index(pv->pv_va)]; 8557 tpte = *pte; 8558 } else { 8559 /* 8560 * Keep track whether 'tpte' is a 8561 * superpage explicitly instead of 8562 * relying on PG_PS being set. 8563 * 8564 * This is because PG_PS is numerically 8565 * identical to PG_PTE_PAT and thus a 8566 * regular page could be mistaken for 8567 * a superpage. 8568 */ 8569 superpage = TRUE; 8570 } 8571 8572 if ((tpte & PG_V) == 0) { 8573 panic("bad pte va %lx pte %lx", 8574 pv->pv_va, tpte); 8575 } 8576 8577 /* 8578 * We cannot remove wired pages from a process' mapping at this time 8579 */ 8580 if (tpte & PG_W) { 8581 allfree = 0; 8582 continue; 8583 } 8584 8585 /* Mark free */ 8586 pc->pc_map[field] |= bitmask; 8587 8588 /* 8589 * Because this pmap is not active on other 8590 * processors, the dirty bit cannot have 8591 * changed state since we last loaded pte. 8592 */ 8593 pte_clear(pte); 8594 8595 if (superpage) 8596 pa = tpte & PG_PS_FRAME; 8597 else 8598 pa = tpte & PG_FRAME; 8599 8600 m = PHYS_TO_VM_PAGE(pa); 8601 KASSERT(m->phys_addr == pa, 8602 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8603 m, (uintmax_t)m->phys_addr, 8604 (uintmax_t)tpte)); 8605 8606 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8607 m < &vm_page_array[vm_page_array_size], 8608 ("pmap_remove_pages: bad tpte %#jx", 8609 (uintmax_t)tpte)); 8610 8611 /* 8612 * Update the vm_page_t clean/reference bits. 
8613 */ 8614 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8615 if (superpage) { 8616 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8617 vm_page_dirty(mt); 8618 } else 8619 vm_page_dirty(m); 8620 } 8621 8622 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8623 8624 if (superpage) { 8625 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8626 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8627 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8628 pvh->pv_gen++; 8629 if (TAILQ_EMPTY(&pvh->pv_list)) { 8630 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8631 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8632 TAILQ_EMPTY(&mt->md.pv_list)) 8633 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8634 } 8635 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8636 if (mpte != NULL) { 8637 KASSERT(vm_page_any_valid(mpte), 8638 ("pmap_remove_pages: pte page not promoted")); 8639 pmap_pt_page_count_adj(pmap, -1); 8640 KASSERT(mpte->ref_count == NPTEPG, 8641 ("pmap_remove_pages: pte page reference count error")); 8642 mpte->ref_count = 0; 8643 pmap_add_delayed_free_list(mpte, &free, FALSE); 8644 } 8645 } else { 8646 pmap_resident_count_adj(pmap, -1); 8647 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8648 m->md.pv_gen++; 8649 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8650 TAILQ_EMPTY(&m->md.pv_list) && 8651 (m->flags & PG_FICTITIOUS) == 0) { 8652 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8653 if (TAILQ_EMPTY(&pvh->pv_list)) 8654 vm_page_aflag_clear(m, PGA_WRITEABLE); 8655 } 8656 } 8657 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8658 #ifdef PV_STATS 8659 freed++; 8660 #endif 8661 } 8662 } 8663 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8664 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8665 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8666 if (allfree) { 8667 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8668 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8669 } 8670 } 8671 if (lock != NULL) 8672 rw_wunlock(lock); 8673 pmap_invalidate_all(pmap); 8674 pmap_pkru_deassign_all(pmap); 8675 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8676 PMAP_UNLOCK(pmap); 8677 vm_page_free_pages_toq(&free, true); 8678 } 8679 8680 static boolean_t 8681 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 8682 { 8683 struct rwlock *lock; 8684 pv_entry_t pv; 8685 struct md_page *pvh; 8686 pt_entry_t *pte, mask; 8687 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8688 pmap_t pmap; 8689 int md_gen, pvh_gen; 8690 boolean_t rv; 8691 8692 rv = FALSE; 8693 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8694 rw_rlock(lock); 8695 restart: 8696 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8697 pmap = PV_PMAP(pv); 8698 if (!PMAP_TRYLOCK(pmap)) { 8699 md_gen = m->md.pv_gen; 8700 rw_runlock(lock); 8701 PMAP_LOCK(pmap); 8702 rw_rlock(lock); 8703 if (md_gen != m->md.pv_gen) { 8704 PMAP_UNLOCK(pmap); 8705 goto restart; 8706 } 8707 } 8708 pte = pmap_pte(pmap, pv->pv_va); 8709 mask = 0; 8710 if (modified) { 8711 PG_M = pmap_modified_bit(pmap); 8712 PG_RW = pmap_rw_bit(pmap); 8713 mask |= PG_RW | PG_M; 8714 } 8715 if (accessed) { 8716 PG_A = pmap_accessed_bit(pmap); 8717 PG_V = pmap_valid_bit(pmap); 8718 mask |= PG_V | PG_A; 8719 } 8720 rv = (*pte & mask) == mask; 8721 PMAP_UNLOCK(pmap); 8722 if (rv) 8723 goto out; 8724 } 8725 if ((m->flags & PG_FICTITIOUS) == 0) { 8726 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8727 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8728 pmap = PV_PMAP(pv); 8729 if (!PMAP_TRYLOCK(pmap)) { 8730 md_gen = m->md.pv_gen; 8731 pvh_gen = pvh->pv_gen; 8732 rw_runlock(lock); 8733 PMAP_LOCK(pmap); 8734 rw_rlock(lock); 8735 if 
(md_gen != m->md.pv_gen || 8736 pvh_gen != pvh->pv_gen) { 8737 PMAP_UNLOCK(pmap); 8738 goto restart; 8739 } 8740 } 8741 pte = pmap_pde(pmap, pv->pv_va); 8742 mask = 0; 8743 if (modified) { 8744 PG_M = pmap_modified_bit(pmap); 8745 PG_RW = pmap_rw_bit(pmap); 8746 mask |= PG_RW | PG_M; 8747 } 8748 if (accessed) { 8749 PG_A = pmap_accessed_bit(pmap); 8750 PG_V = pmap_valid_bit(pmap); 8751 mask |= PG_V | PG_A; 8752 } 8753 rv = (*pte & mask) == mask; 8754 PMAP_UNLOCK(pmap); 8755 if (rv) 8756 goto out; 8757 } 8758 } 8759 out: 8760 rw_runlock(lock); 8761 return (rv); 8762 } 8763 8764 /* 8765 * pmap_is_modified: 8766 * 8767 * Return whether or not the specified physical page was modified 8768 * in any physical maps. 8769 */ 8770 boolean_t 8771 pmap_is_modified(vm_page_t m) 8772 { 8773 8774 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8775 ("pmap_is_modified: page %p is not managed", m)); 8776 8777 /* 8778 * If the page is not busied then this check is racy. 8779 */ 8780 if (!pmap_page_is_write_mapped(m)) 8781 return (FALSE); 8782 return (pmap_page_test_mappings(m, FALSE, TRUE)); 8783 } 8784 8785 /* 8786 * pmap_is_prefaultable: 8787 * 8788 * Return whether or not the specified virtual address is eligible 8789 * for prefault. 8790 */ 8791 boolean_t 8792 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8793 { 8794 pd_entry_t *pde; 8795 pt_entry_t *pte, PG_V; 8796 boolean_t rv; 8797 8798 PG_V = pmap_valid_bit(pmap); 8799 8800 /* 8801 * Return TRUE if and only if the PTE for the specified virtual 8802 * address is allocated but invalid. 8803 */ 8804 rv = FALSE; 8805 PMAP_LOCK(pmap); 8806 pde = pmap_pde(pmap, addr); 8807 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8808 pte = pmap_pde_to_pte(pde, addr); 8809 rv = (*pte & PG_V) == 0; 8810 } 8811 PMAP_UNLOCK(pmap); 8812 return (rv); 8813 } 8814 8815 /* 8816 * pmap_is_referenced: 8817 * 8818 * Return whether or not the specified physical page was referenced 8819 * in any physical maps. 8820 */ 8821 boolean_t 8822 pmap_is_referenced(vm_page_t m) 8823 { 8824 8825 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8826 ("pmap_is_referenced: page %p is not managed", m)); 8827 return (pmap_page_test_mappings(m, TRUE, FALSE)); 8828 } 8829 8830 /* 8831 * Clear the write and modified bits in each of the given page's mappings. 8832 */ 8833 void 8834 pmap_remove_write(vm_page_t m) 8835 { 8836 struct md_page *pvh; 8837 pmap_t pmap; 8838 struct rwlock *lock; 8839 pv_entry_t next_pv, pv; 8840 pd_entry_t *pde; 8841 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8842 vm_offset_t va; 8843 int pvh_gen, md_gen; 8844 8845 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8846 ("pmap_remove_write: page %p is not managed", m)); 8847 8848 vm_page_assert_busied(m); 8849 if (!pmap_page_is_write_mapped(m)) 8850 return; 8851 8852 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8853 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8854 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8855 rw_wlock(lock); 8856 retry: 8857 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8858 pmap = PV_PMAP(pv); 8859 if (!PMAP_TRYLOCK(pmap)) { 8860 pvh_gen = pvh->pv_gen; 8861 rw_wunlock(lock); 8862 PMAP_LOCK(pmap); 8863 rw_wlock(lock); 8864 if (pvh_gen != pvh->pv_gen) { 8865 PMAP_UNLOCK(pmap); 8866 goto retry; 8867 } 8868 } 8869 PG_RW = pmap_rw_bit(pmap); 8870 va = pv->pv_va; 8871 pde = pmap_pde(pmap, va); 8872 if ((*pde & PG_RW) != 0) 8873 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8874 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8875 ("inconsistent pv lock %p %p for page %p", 8876 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8877 PMAP_UNLOCK(pmap); 8878 } 8879 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8880 pmap = PV_PMAP(pv); 8881 if (!PMAP_TRYLOCK(pmap)) { 8882 pvh_gen = pvh->pv_gen; 8883 md_gen = m->md.pv_gen; 8884 rw_wunlock(lock); 8885 PMAP_LOCK(pmap); 8886 rw_wlock(lock); 8887 if (pvh_gen != pvh->pv_gen || 8888 md_gen != m->md.pv_gen) { 8889 PMAP_UNLOCK(pmap); 8890 goto retry; 8891 } 8892 } 8893 PG_M = pmap_modified_bit(pmap); 8894 PG_RW = pmap_rw_bit(pmap); 8895 pde = pmap_pde(pmap, pv->pv_va); 8896 KASSERT((*pde & PG_PS) == 0, 8897 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8898 m)); 8899 pte = pmap_pde_to_pte(pde, pv->pv_va); 8900 oldpte = *pte; 8901 if (oldpte & PG_RW) { 8902 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8903 ~(PG_RW | PG_M))) 8904 cpu_spinwait(); 8905 if ((oldpte & PG_M) != 0) 8906 vm_page_dirty(m); 8907 pmap_invalidate_page(pmap, pv->pv_va); 8908 } 8909 PMAP_UNLOCK(pmap); 8910 } 8911 rw_wunlock(lock); 8912 vm_page_aflag_clear(m, PGA_WRITEABLE); 8913 pmap_delayed_invl_wait(m); 8914 } 8915 8916 /* 8917 * pmap_ts_referenced: 8918 * 8919 * Return a count of reference bits for a page, clearing those bits. 8920 * It is not necessary for every reference bit to be cleared, but it 8921 * is necessary that 0 only be returned when there are truly no 8922 * reference bits set. 8923 * 8924 * As an optimization, update the page's dirty field if a modified bit is 8925 * found while counting reference bits. This opportunistic update can be 8926 * performed at low cost and can eliminate the need for some future calls 8927 * to pmap_is_modified(). However, since this function stops after 8928 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8929 * dirty pages. Those dirty pages will only be detected by a future call 8930 * to pmap_is_modified(). 8931 * 8932 * A DI block is not needed within this function, because 8933 * invalidations are performed before the PV list lock is 8934 * released. 8935 */ 8936 int 8937 pmap_ts_referenced(vm_page_t m) 8938 { 8939 struct md_page *pvh; 8940 pv_entry_t pv, pvf; 8941 pmap_t pmap; 8942 struct rwlock *lock; 8943 pd_entry_t oldpde, *pde; 8944 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8945 vm_offset_t va; 8946 vm_paddr_t pa; 8947 int cleared, md_gen, not_cleared, pvh_gen; 8948 struct spglist free; 8949 boolean_t demoted; 8950 8951 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8952 ("pmap_ts_referenced: page %p is not managed", m)); 8953 SLIST_INIT(&free); 8954 cleared = 0; 8955 pa = VM_PAGE_TO_PHYS(m); 8956 lock = PHYS_TO_PV_LIST_LOCK(pa); 8957 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 8958 rw_wlock(lock); 8959 retry: 8960 not_cleared = 0; 8961 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 8962 goto small_mappings; 8963 pv = pvf; 8964 do { 8965 if (pvf == NULL) 8966 pvf = pv; 8967 pmap = PV_PMAP(pv); 8968 if (!PMAP_TRYLOCK(pmap)) { 8969 pvh_gen = pvh->pv_gen; 8970 rw_wunlock(lock); 8971 PMAP_LOCK(pmap); 8972 rw_wlock(lock); 8973 if (pvh_gen != pvh->pv_gen) { 8974 PMAP_UNLOCK(pmap); 8975 goto retry; 8976 } 8977 } 8978 PG_A = pmap_accessed_bit(pmap); 8979 PG_M = pmap_modified_bit(pmap); 8980 PG_RW = pmap_rw_bit(pmap); 8981 va = pv->pv_va; 8982 pde = pmap_pde(pmap, pv->pv_va); 8983 oldpde = *pde; 8984 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8985 /* 8986 * Although "oldpde" is mapping a 2MB page, because 8987 * this function is called at a 4KB page granularity, 8988 * we only update the 4KB page under test. 8989 */ 8990 vm_page_dirty(m); 8991 } 8992 if ((oldpde & PG_A) != 0) { 8993 /* 8994 * Since this reference bit is shared by 512 4KB 8995 * pages, it should not be cleared every time it is 8996 * tested. Apply a simple "hash" function on the 8997 * physical page number, the virtual superpage number, 8998 * and the pmap address to select one 4KB page out of 8999 * the 512 on which testing the reference bit will 9000 * result in clearing that reference bit. This 9001 * function is designed to avoid the selection of the 9002 * same 4KB page for every 2MB page mapping. 9003 * 9004 * On demotion, a mapping that hasn't been referenced 9005 * is simply destroyed. To avoid the possibility of a 9006 * subsequent page fault on a demoted wired mapping, 9007 * always leave its reference bit set. Moreover, 9008 * since the superpage is wired, the current state of 9009 * its reference bit won't affect page replacement. 9010 */ 9011 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 9012 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 9013 (oldpde & PG_W) == 0) { 9014 if (safe_to_clear_referenced(pmap, oldpde)) { 9015 atomic_clear_long(pde, PG_A); 9016 pmap_invalidate_page(pmap, pv->pv_va); 9017 demoted = FALSE; 9018 } else if (pmap_demote_pde_locked(pmap, pde, 9019 pv->pv_va, &lock)) { 9020 /* 9021 * Remove the mapping to a single page 9022 * so that a subsequent access may 9023 * repromote. Since the underlying 9024 * page table page is fully populated, 9025 * this removal never frees a page 9026 * table page. 9027 */ 9028 demoted = TRUE; 9029 va += VM_PAGE_TO_PHYS(m) - (oldpde & 9030 PG_PS_FRAME); 9031 pte = pmap_pde_to_pte(pde, va); 9032 pmap_remove_pte(pmap, pte, va, *pde, 9033 NULL, &lock); 9034 pmap_invalidate_page(pmap, va); 9035 } else 9036 demoted = TRUE; 9037 9038 if (demoted) { 9039 /* 9040 * The superpage mapping was removed 9041 * entirely and therefore 'pv' is no 9042 * longer valid. 9043 */ 9044 if (pvf == pv) 9045 pvf = NULL; 9046 pv = NULL; 9047 } 9048 cleared++; 9049 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9050 ("inconsistent pv lock %p %p for page %p", 9051 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9052 } else 9053 not_cleared++; 9054 } 9055 PMAP_UNLOCK(pmap); 9056 /* Rotate the PV list if it has more than one entry. 
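 * (Rotation makes successive calls start with a different
 * mapping, so reference bits are sampled fairly across the
 * pmaps sharing the page.)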
*/ 9057 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9058 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 9059 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 9060 pvh->pv_gen++; 9061 } 9062 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 9063 goto out; 9064 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 9065 small_mappings: 9066 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 9067 goto out; 9068 pv = pvf; 9069 do { 9070 if (pvf == NULL) 9071 pvf = pv; 9072 pmap = PV_PMAP(pv); 9073 if (!PMAP_TRYLOCK(pmap)) { 9074 pvh_gen = pvh->pv_gen; 9075 md_gen = m->md.pv_gen; 9076 rw_wunlock(lock); 9077 PMAP_LOCK(pmap); 9078 rw_wlock(lock); 9079 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9080 PMAP_UNLOCK(pmap); 9081 goto retry; 9082 } 9083 } 9084 PG_A = pmap_accessed_bit(pmap); 9085 PG_M = pmap_modified_bit(pmap); 9086 PG_RW = pmap_rw_bit(pmap); 9087 pde = pmap_pde(pmap, pv->pv_va); 9088 KASSERT((*pde & PG_PS) == 0, 9089 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 9090 m)); 9091 pte = pmap_pde_to_pte(pde, pv->pv_va); 9092 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9093 vm_page_dirty(m); 9094 if ((*pte & PG_A) != 0) { 9095 if (safe_to_clear_referenced(pmap, *pte)) { 9096 atomic_clear_long(pte, PG_A); 9097 pmap_invalidate_page(pmap, pv->pv_va); 9098 cleared++; 9099 } else if ((*pte & PG_W) == 0) { 9100 /* 9101 * Wired pages cannot be paged out so 9102 * doing accessed bit emulation for 9103 * them is wasted effort. We do the 9104 * hard work for unwired pages only. 9105 */ 9106 pmap_remove_pte(pmap, pte, pv->pv_va, 9107 *pde, &free, &lock); 9108 pmap_invalidate_page(pmap, pv->pv_va); 9109 cleared++; 9110 if (pvf == pv) 9111 pvf = NULL; 9112 pv = NULL; 9113 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9114 ("inconsistent pv lock %p %p for page %p", 9115 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9116 } else 9117 not_cleared++; 9118 } 9119 PMAP_UNLOCK(pmap); 9120 /* Rotate the PV list if it has more than one entry. */ 9121 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9122 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 9123 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 9124 m->md.pv_gen++; 9125 } 9126 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 9127 not_cleared < PMAP_TS_REFERENCED_MAX); 9128 out: 9129 rw_wunlock(lock); 9130 vm_page_free_pages_toq(&free, true); 9131 return (cleared + not_cleared); 9132 } 9133 9134 /* 9135 * Apply the given advice to the specified range of addresses within the 9136 * given pmap. Depending on the advice, clear the referenced and/or 9137 * modified flags in each mapping and set the mapped page's dirty field. 9138 */ 9139 void 9140 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 9141 { 9142 struct rwlock *lock; 9143 pml4_entry_t *pml4e; 9144 pdp_entry_t *pdpe; 9145 pd_entry_t oldpde, *pde; 9146 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 9147 vm_offset_t va, va_next; 9148 vm_page_t m; 9149 bool anychanged; 9150 9151 if (advice != MADV_DONTNEED && advice != MADV_FREE) 9152 return; 9153 9154 /* 9155 * A/D bit emulation requires an alternate code path when clearing 9156 * the modified and accessed bits below. Since this function is 9157 * advisory in nature we skip it entirely for pmaps that require 9158 * A/D bit emulation. 
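 *
 * Sketch of the per-PTE effect for the two advice values handled
 * below:
 *
 *	MADV_DONTNEED:	a dirty PTE first dirties the page via
 *			vm_page_dirty(m), then PG_M and PG_A are
 *			cleared
 *	MADV_FREE:	PG_M and PG_A are cleared outright; the
 *			dirty state is deliberately discarded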
9159 */ 9160 if (pmap_emulate_ad_bits(pmap)) 9161 return; 9162 9163 PG_A = pmap_accessed_bit(pmap); 9164 PG_G = pmap_global_bit(pmap); 9165 PG_M = pmap_modified_bit(pmap); 9166 PG_V = pmap_valid_bit(pmap); 9167 PG_RW = pmap_rw_bit(pmap); 9168 anychanged = false; 9169 pmap_delayed_invl_start(); 9170 PMAP_LOCK(pmap); 9171 for (; sva < eva; sva = va_next) { 9172 pml4e = pmap_pml4e(pmap, sva); 9173 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 9174 va_next = (sva + NBPML4) & ~PML4MASK; 9175 if (va_next < sva) 9176 va_next = eva; 9177 continue; 9178 } 9179 9180 va_next = (sva + NBPDP) & ~PDPMASK; 9181 if (va_next < sva) 9182 va_next = eva; 9183 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 9184 if ((*pdpe & PG_V) == 0) 9185 continue; 9186 if ((*pdpe & PG_PS) != 0) 9187 continue; 9188 9189 va_next = (sva + NBPDR) & ~PDRMASK; 9190 if (va_next < sva) 9191 va_next = eva; 9192 pde = pmap_pdpe_to_pde(pdpe, sva); 9193 oldpde = *pde; 9194 if ((oldpde & PG_V) == 0) 9195 continue; 9196 else if ((oldpde & PG_PS) != 0) { 9197 if ((oldpde & PG_MANAGED) == 0) 9198 continue; 9199 lock = NULL; 9200 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9201 if (lock != NULL) 9202 rw_wunlock(lock); 9203 9204 /* 9205 * The large page mapping was destroyed. 9206 */ 9207 continue; 9208 } 9209 9210 /* 9211 * Unless the page mappings are wired, remove the 9212 * mapping to a single page so that a subsequent 9213 * access may repromote. Choosing the last page 9214 * within the address range [sva, min(va_next, eva)) 9215 * generally results in more repromotions. Since the 9216 * underlying page table page is fully populated, this 9217 * removal never frees a page table page. 9218 */ 9219 if ((oldpde & PG_W) == 0) { 9220 va = eva; 9221 if (va > va_next) 9222 va = va_next; 9223 va -= PAGE_SIZE; 9224 KASSERT(va >= sva, 9225 ("pmap_advise: no address gap")); 9226 pte = pmap_pde_to_pte(pde, va); 9227 KASSERT((*pte & PG_V) != 0, 9228 ("pmap_advise: invalid PTE")); 9229 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9230 &lock); 9231 anychanged = true; 9232 } 9233 if (lock != NULL) 9234 rw_wunlock(lock); 9235 } 9236 if (va_next > eva) 9237 va_next = eva; 9238 va = va_next; 9239 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9240 sva += PAGE_SIZE) { 9241 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9242 goto maybe_invlrng; 9243 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9244 if (advice == MADV_DONTNEED) { 9245 /* 9246 * Future calls to pmap_is_modified() 9247 * can be avoided by making the page 9248 * dirty now. 9249 */ 9250 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9251 vm_page_dirty(m); 9252 } 9253 atomic_clear_long(pte, PG_M | PG_A); 9254 } else if ((*pte & PG_A) != 0) 9255 atomic_clear_long(pte, PG_A); 9256 else 9257 goto maybe_invlrng; 9258 9259 if ((*pte & PG_G) != 0) { 9260 if (va == va_next) 9261 va = sva; 9262 } else 9263 anychanged = true; 9264 continue; 9265 maybe_invlrng: 9266 if (va != va_next) { 9267 pmap_invalidate_range(pmap, va, sva); 9268 va = va_next; 9269 } 9270 } 9271 if (va != va_next) 9272 pmap_invalidate_range(pmap, va, sva); 9273 } 9274 if (anychanged) 9275 pmap_invalidate_all(pmap); 9276 PMAP_UNLOCK(pmap); 9277 pmap_delayed_invl_finish(); 9278 } 9279 9280 /* 9281 * Clear the modify bits on the specified physical page. 
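 *
 * A 2MB mapping cannot have PG_M cleared for a single constituent
 * 4KB page, so the first loop below demotes the mapping and then
 * write-protects only the page under test (sketch of that step):
 *
 *	va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
 *	pte = pmap_pde_to_pte(pde, va);
 *	atomic_clear_long(pte, PG_M | PG_RW);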
9282 */ 9283 void 9284 pmap_clear_modify(vm_page_t m) 9285 { 9286 struct md_page *pvh; 9287 pmap_t pmap; 9288 pv_entry_t next_pv, pv; 9289 pd_entry_t oldpde, *pde; 9290 pt_entry_t *pte, PG_M, PG_RW; 9291 struct rwlock *lock; 9292 vm_offset_t va; 9293 int md_gen, pvh_gen; 9294 9295 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9296 ("pmap_clear_modify: page %p is not managed", m)); 9297 vm_page_assert_busied(m); 9298 9299 if (!pmap_page_is_write_mapped(m)) 9300 return; 9301 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9302 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9303 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9304 rw_wlock(lock); 9305 restart: 9306 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9307 pmap = PV_PMAP(pv); 9308 if (!PMAP_TRYLOCK(pmap)) { 9309 pvh_gen = pvh->pv_gen; 9310 rw_wunlock(lock); 9311 PMAP_LOCK(pmap); 9312 rw_wlock(lock); 9313 if (pvh_gen != pvh->pv_gen) { 9314 PMAP_UNLOCK(pmap); 9315 goto restart; 9316 } 9317 } 9318 PG_M = pmap_modified_bit(pmap); 9319 PG_RW = pmap_rw_bit(pmap); 9320 va = pv->pv_va; 9321 pde = pmap_pde(pmap, va); 9322 oldpde = *pde; 9323 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9324 if ((oldpde & PG_RW) != 0 && 9325 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9326 (oldpde & PG_W) == 0) { 9327 /* 9328 * Write protect the mapping to a single page so that 9329 * a subsequent write access may repromote. 9330 */ 9331 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9332 pte = pmap_pde_to_pte(pde, va); 9333 atomic_clear_long(pte, PG_M | PG_RW); 9334 vm_page_dirty(m); 9335 pmap_invalidate_page(pmap, va); 9336 } 9337 PMAP_UNLOCK(pmap); 9338 } 9339 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9340 pmap = PV_PMAP(pv); 9341 if (!PMAP_TRYLOCK(pmap)) { 9342 md_gen = m->md.pv_gen; 9343 pvh_gen = pvh->pv_gen; 9344 rw_wunlock(lock); 9345 PMAP_LOCK(pmap); 9346 rw_wlock(lock); 9347 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9348 PMAP_UNLOCK(pmap); 9349 goto restart; 9350 } 9351 } 9352 PG_M = pmap_modified_bit(pmap); 9353 PG_RW = pmap_rw_bit(pmap); 9354 pde = pmap_pde(pmap, pv->pv_va); 9355 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9356 " a 2mpage in page %p's pv list", m)); 9357 pte = pmap_pde_to_pte(pde, pv->pv_va); 9358 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9359 atomic_clear_long(pte, PG_M); 9360 pmap_invalidate_page(pmap, pv->pv_va); 9361 } 9362 PMAP_UNLOCK(pmap); 9363 } 9364 rw_wunlock(lock); 9365 } 9366 9367 /* 9368 * Miscellaneous support routines follow 9369 */ 9370 9371 /* Adjust the properties for a leaf page table entry. */ 9372 static __inline void 9373 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9374 { 9375 u_long opte, npte; 9376 9377 opte = *(u_long *)pte; 9378 do { 9379 npte = opte & ~mask; 9380 npte |= bits; 9381 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9382 npte)); 9383 } 9384 9385 /* 9386 * Map a set of physical memory pages into the kernel virtual 9387 * address space. Return a pointer to where it is mapped. This 9388 * routine is intended to be used for mapping device memory, 9389 * NOT real memory. 
9390 */ 9391 static void * 9392 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9393 { 9394 struct pmap_preinit_mapping *ppim; 9395 vm_offset_t va, offset; 9396 vm_size_t tmpsize; 9397 int i; 9398 9399 offset = pa & PAGE_MASK; 9400 size = round_page(offset + size); 9401 pa = trunc_page(pa); 9402 9403 if (!pmap_initialized) { 9404 va = 0; 9405 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9406 ppim = pmap_preinit_mapping + i; 9407 if (ppim->va == 0) { 9408 ppim->pa = pa; 9409 ppim->sz = size; 9410 ppim->mode = mode; 9411 ppim->va = virtual_avail; 9412 virtual_avail += size; 9413 va = ppim->va; 9414 break; 9415 } 9416 } 9417 if (va == 0) 9418 panic("%s: too many preinit mappings", __func__); 9419 } else { 9420 /* 9421 * If we have a preinit mapping, re-use it. 9422 */ 9423 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9424 ppim = pmap_preinit_mapping + i; 9425 if (ppim->pa == pa && ppim->sz == size && 9426 (ppim->mode == mode || 9427 (flags & MAPDEV_SETATTR) == 0)) 9428 return ((void *)(ppim->va + offset)); 9429 } 9430 /* 9431 * If the specified range of physical addresses fits within 9432 * the direct map window, use the direct map. 9433 */ 9434 if (pa < dmaplimit && pa + size <= dmaplimit) { 9435 va = PHYS_TO_DMAP(pa); 9436 if ((flags & MAPDEV_SETATTR) != 0) { 9437 PMAP_LOCK(kernel_pmap); 9438 i = pmap_change_props_locked(va, size, 9439 PROT_NONE, mode, flags); 9440 PMAP_UNLOCK(kernel_pmap); 9441 } else 9442 i = 0; 9443 if (!i) 9444 return ((void *)(va + offset)); 9445 } 9446 va = kva_alloc(size); 9447 if (va == 0) 9448 panic("%s: Couldn't allocate KVA", __func__); 9449 } 9450 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9451 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9452 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9453 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9454 pmap_invalidate_cache_range(va, va + tmpsize); 9455 return ((void *)(va + offset)); 9456 } 9457 9458 void * 9459 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9460 { 9461 9462 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9463 MAPDEV_SETATTR)); 9464 } 9465 9466 void * 9467 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9468 { 9469 9470 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9471 } 9472 9473 void * 9474 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9475 { 9476 9477 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9478 MAPDEV_SETATTR)); 9479 } 9480 9481 void * 9482 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9483 { 9484 9485 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9486 MAPDEV_FLUSHCACHE)); 9487 } 9488 9489 void 9490 pmap_unmapdev(void *p, vm_size_t size) 9491 { 9492 struct pmap_preinit_mapping *ppim; 9493 vm_offset_t offset, va; 9494 int i; 9495 9496 va = (vm_offset_t)p; 9497 9498 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9499 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9500 return; 9501 offset = va & PAGE_MASK; 9502 size = round_page(offset + size); 9503 va = trunc_page(va); 9504 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9505 ppim = pmap_preinit_mapping + i; 9506 if (ppim->va == va && ppim->sz == size) { 9507 if (pmap_initialized) 9508 return; 9509 ppim->pa = 0; 9510 ppim->va = 0; 9511 ppim->sz = 0; 9512 ppim->mode = 0; 9513 if (va + size == virtual_avail) 9514 virtual_avail = va; 9515 return; 9516 } 9517 } 9518 if (pmap_initialized) { 9519 pmap_qremove(va, atop(size)); 9520 kva_free(va, size); 9521 } 9522 } 9523 9524 /* 9525 * Tries to demote a 1GB page 
mapping. 9526 */ 9527 static boolean_t 9528 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9529 { 9530 pdp_entry_t newpdpe, oldpdpe; 9531 pd_entry_t *firstpde, newpde, *pde; 9532 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9533 vm_paddr_t pdpgpa; 9534 vm_page_t pdpg; 9535 9536 PG_A = pmap_accessed_bit(pmap); 9537 PG_M = pmap_modified_bit(pmap); 9538 PG_V = pmap_valid_bit(pmap); 9539 PG_RW = pmap_rw_bit(pmap); 9540 9541 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9542 oldpdpe = *pdpe; 9543 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9544 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9545 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9546 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9547 if (pdpg == NULL) { 9548 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9549 " in pmap %p", va, pmap); 9550 return (FALSE); 9551 } 9552 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9553 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9554 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9555 KASSERT((oldpdpe & PG_A) != 0, 9556 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9557 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9558 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9559 newpde = oldpdpe; 9560 9561 /* 9562 * Initialize the page directory page. 9563 */ 9564 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9565 *pde = newpde; 9566 newpde += NBPDR; 9567 } 9568 9569 /* 9570 * Demote the mapping. 9571 */ 9572 *pdpe = newpdpe; 9573 9574 /* 9575 * Invalidate a stale recursive mapping of the page directory page. 9576 */ 9577 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9578 9579 counter_u64_add(pmap_pdpe_demotions, 1); 9580 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9581 " in pmap %p", va, pmap); 9582 return (TRUE); 9583 } 9584 9585 /* 9586 * Sets the memory attribute for the specified page. 9587 */ 9588 void 9589 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9590 { 9591 9592 m->md.pat_mode = ma; 9593 9594 /* 9595 * If "m" is a normal page, update its direct mapping. This update 9596 * can be relied upon to perform any cache operations that are 9597 * required for data coherence. 9598 */ 9599 if ((m->flags & PG_FICTITIOUS) == 0 && 9600 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9601 m->md.pat_mode)) 9602 panic("memory attribute change on the direct map failed"); 9603 } 9604 9605 void 9606 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9607 { 9608 int error; 9609 9610 m->md.pat_mode = ma; 9611 9612 if ((m->flags & PG_FICTITIOUS) != 0) 9613 return; 9614 PMAP_LOCK(kernel_pmap); 9615 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9616 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9617 PMAP_UNLOCK(kernel_pmap); 9618 if (error != 0) 9619 panic("memory attribute change on the direct map failed"); 9620 } 9621 9622 /* 9623 * Changes the specified virtual address range's memory type to that given by 9624 * the parameter "mode". The specified virtual address range must be 9625 * completely contained within either the direct map or the kernel map. If 9626 * the virtual address range is contained within the kernel map, then the 9627 * memory type for each of the corresponding ranges of the direct map is also 9628 * changed. (The corresponding ranges of the direct map are those ranges that 9629 * map the same physical pages as the specified virtual address range.) 
These 9630 * changes to the direct map are necessary because Intel describes the 9631 * behavior of their processors as "undefined" if two or more mappings to the 9632 * same physical page have different memory types. 9633 * 9634 * Returns zero if the change completed successfully, and either EINVAL or 9635 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9636 * of the virtual address range was not mapped, and ENOMEM is returned if 9637 * there was insufficient memory available to complete the change. In the 9638 * latter case, the memory type may have been changed on some part of the 9639 * virtual address range or the direct map. 9640 */ 9641 int 9642 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9643 { 9644 int error; 9645 9646 PMAP_LOCK(kernel_pmap); 9647 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9648 MAPDEV_FLUSHCACHE); 9649 PMAP_UNLOCK(kernel_pmap); 9650 return (error); 9651 } 9652 9653 /* 9654 * Changes the specified virtual address range's protections to those 9655 * specified by "prot". Like pmap_change_attr(), protections for aliases 9656 * in the direct map are updated as well. Protections on aliasing mappings may 9657 * be a subset of the requested protections; for example, mappings in the direct 9658 * map are never executable. 9659 */ 9660 int 9661 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9662 { 9663 int error; 9664 9665 /* Only supported within the kernel map. */ 9666 if (va < VM_MIN_KERNEL_ADDRESS) 9667 return (EINVAL); 9668 9669 PMAP_LOCK(kernel_pmap); 9670 error = pmap_change_props_locked(va, size, prot, -1, 9671 MAPDEV_ASSERTVALID); 9672 PMAP_UNLOCK(kernel_pmap); 9673 return (error); 9674 } 9675 9676 static int 9677 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9678 int mode, int flags) 9679 { 9680 vm_offset_t base, offset, tmpva; 9681 vm_paddr_t pa_start, pa_end, pa_end1; 9682 pdp_entry_t *pdpe; 9683 pd_entry_t *pde, pde_bits, pde_mask; 9684 pt_entry_t *pte, pte_bits, pte_mask; 9685 int error; 9686 bool changed; 9687 9688 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9689 base = trunc_page(va); 9690 offset = va & PAGE_MASK; 9691 size = round_page(offset + size); 9692 9693 /* 9694 * Only supported on kernel virtual addresses, including the direct 9695 * map but excluding the recursive map. 9696 */ 9697 if (base < DMAP_MIN_ADDRESS) 9698 return (EINVAL); 9699 9700 /* 9701 * Construct our flag sets and masks. "bits" is the subset of 9702 * "mask" that will be set in each modified PTE. 9703 * 9704 * Mappings in the direct map are never allowed to be executable. 9705 */ 9706 pde_bits = pte_bits = 0; 9707 pde_mask = pte_mask = 0; 9708 if (mode != -1) { 9709 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9710 pde_mask |= X86_PG_PDE_CACHE; 9711 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9712 pte_mask |= X86_PG_PTE_CACHE; 9713 } 9714 if (prot != VM_PROT_NONE) { 9715 if ((prot & VM_PROT_WRITE) != 0) { 9716 pde_bits |= X86_PG_RW; 9717 pte_bits |= X86_PG_RW; 9718 } 9719 if ((prot & VM_PROT_EXECUTE) == 0 || 9720 va < VM_MIN_KERNEL_ADDRESS) { 9721 pde_bits |= pg_nx; 9722 pte_bits |= pg_nx; 9723 } 9724 pde_mask |= X86_PG_RW | pg_nx; 9725 pte_mask |= X86_PG_RW | pg_nx; 9726 } 9727 9728 /* 9729 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9730 * into 4KB pages if required. 
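	 *
	 * (For example, a request that covers only a 4KB sub-range of an
	 * existing 2MB mapping forces a demotion first, so that only the
	 * requested range is modified; the demotions below can fail with
	 * ENOMEM.)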
9731 */ 9732 for (tmpva = base; tmpva < base + size; ) { 9733 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9734 if (pdpe == NULL || *pdpe == 0) { 9735 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9736 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9737 return (EINVAL); 9738 } 9739 if (*pdpe & PG_PS) { 9740 /* 9741 * If the current 1GB page already has the required 9742 * properties, then we need not demote this page. Just 9743 * increment tmpva to the next 1GB page frame. 9744 */ 9745 if ((*pdpe & pde_mask) == pde_bits) { 9746 tmpva = trunc_1gpage(tmpva) + NBPDP; 9747 continue; 9748 } 9749 9750 /* 9751 * If the current offset aligns with a 1GB page frame 9752 * and there is at least 1GB left within the range, then 9753 * we need not break down this page into 2MB pages. 9754 */ 9755 if ((tmpva & PDPMASK) == 0 && 9756 tmpva + PDPMASK < base + size) { 9757 tmpva += NBPDP; 9758 continue; 9759 } 9760 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9761 return (ENOMEM); 9762 } 9763 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9764 if (*pde == 0) { 9765 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9766 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9767 return (EINVAL); 9768 } 9769 if (*pde & PG_PS) { 9770 /* 9771 * If the current 2MB page already has the required 9772 * properties, then we need not demote this page. Just 9773 * increment tmpva to the next 2MB page frame. 9774 */ 9775 if ((*pde & pde_mask) == pde_bits) { 9776 tmpva = trunc_2mpage(tmpva) + NBPDR; 9777 continue; 9778 } 9779 9780 /* 9781 * If the current offset aligns with a 2MB page frame 9782 * and there is at least 2MB left within the range, then 9783 * we need not break down this page into 4KB pages. 9784 */ 9785 if ((tmpva & PDRMASK) == 0 && 9786 tmpva + PDRMASK < base + size) { 9787 tmpva += NBPDR; 9788 continue; 9789 } 9790 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9791 return (ENOMEM); 9792 } 9793 pte = pmap_pde_to_pte(pde, tmpva); 9794 if (*pte == 0) { 9795 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9796 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9797 return (EINVAL); 9798 } 9799 tmpva += PAGE_SIZE; 9800 } 9801 error = 0; 9802 9803 /* 9804 * Ok, all the pages exist, so run through them updating their 9805 * properties if required. 9806 */ 9807 changed = false; 9808 pa_start = pa_end = 0; 9809 for (tmpva = base; tmpva < base + size; ) { 9810 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9811 if (*pdpe & PG_PS) { 9812 if ((*pdpe & pde_mask) != pde_bits) { 9813 pmap_pte_props(pdpe, pde_bits, pde_mask); 9814 changed = true; 9815 } 9816 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9817 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9818 if (pa_start == pa_end) { 9819 /* Start physical address run. */ 9820 pa_start = *pdpe & PG_PS_FRAME; 9821 pa_end = pa_start + NBPDP; 9822 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9823 pa_end += NBPDP; 9824 else { 9825 /* Run ended, update direct map. */ 9826 error = pmap_change_props_locked( 9827 PHYS_TO_DMAP(pa_start), 9828 pa_end - pa_start, prot, mode, 9829 flags); 9830 if (error != 0) 9831 break; 9832 /* Start physical address run. */ 9833 pa_start = *pdpe & PG_PS_FRAME; 9834 pa_end = pa_start + NBPDP; 9835 } 9836 } 9837 tmpva = trunc_1gpage(tmpva) + NBPDP; 9838 continue; 9839 } 9840 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9841 if (*pde & PG_PS) { 9842 if ((*pde & pde_mask) != pde_bits) { 9843 pmap_pte_props(pde, pde_bits, pde_mask); 9844 changed = true; 9845 } 9846 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9847 (*pde & PG_PS_FRAME) < dmaplimit) { 9848 if (pa_start == pa_end) { 9849 /* Start physical address run. 
*/ 9850 pa_start = *pde & PG_PS_FRAME; 9851 pa_end = pa_start + NBPDR; 9852 } else if (pa_end == (*pde & PG_PS_FRAME)) 9853 pa_end += NBPDR; 9854 else { 9855 /* Run ended, update direct map. */ 9856 error = pmap_change_props_locked( 9857 PHYS_TO_DMAP(pa_start), 9858 pa_end - pa_start, prot, mode, 9859 flags); 9860 if (error != 0) 9861 break; 9862 /* Start physical address run. */ 9863 pa_start = *pde & PG_PS_FRAME; 9864 pa_end = pa_start + NBPDR; 9865 } 9866 } 9867 tmpva = trunc_2mpage(tmpva) + NBPDR; 9868 } else { 9869 pte = pmap_pde_to_pte(pde, tmpva); 9870 if ((*pte & pte_mask) != pte_bits) { 9871 pmap_pte_props(pte, pte_bits, pte_mask); 9872 changed = true; 9873 } 9874 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9875 (*pte & PG_FRAME) < dmaplimit) { 9876 if (pa_start == pa_end) { 9877 /* Start physical address run. */ 9878 pa_start = *pte & PG_FRAME; 9879 pa_end = pa_start + PAGE_SIZE; 9880 } else if (pa_end == (*pte & PG_FRAME)) 9881 pa_end += PAGE_SIZE; 9882 else { 9883 /* Run ended, update direct map. */ 9884 error = pmap_change_props_locked( 9885 PHYS_TO_DMAP(pa_start), 9886 pa_end - pa_start, prot, mode, 9887 flags); 9888 if (error != 0) 9889 break; 9890 /* Start physical address run. */ 9891 pa_start = *pte & PG_FRAME; 9892 pa_end = pa_start + PAGE_SIZE; 9893 } 9894 } 9895 tmpva += PAGE_SIZE; 9896 } 9897 } 9898 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9899 pa_end1 = MIN(pa_end, dmaplimit); 9900 if (pa_start != pa_end1) 9901 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9902 pa_end1 - pa_start, prot, mode, flags); 9903 } 9904 9905 /* 9906 * Flush CPU caches if required to make sure any data isn't cached that 9907 * shouldn't be, etc. 9908 */ 9909 if (changed) { 9910 pmap_invalidate_range(kernel_pmap, base, tmpva); 9911 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9912 pmap_invalidate_cache_range(base, tmpva); 9913 } 9914 return (error); 9915 } 9916 9917 /* 9918 * Demotes any mapping within the direct map region that covers more than the 9919 * specified range of physical addresses. This range's size must be a power 9920 * of two and its starting address must be a multiple of its size. Since the 9921 * demotion does not change any attributes of the mapping, a TLB invalidation 9922 * is not mandatory. The caller may, however, request a TLB invalidation. 
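 *
 * (A hypothetical caller with, say, a 16KB buffer at a 16KB-aligned
 * physical address can use this to pre-demote the covering 1GB and 2MB
 * direct map mappings, so that later attribute changes on the buffer
 * do not themselves need to demote, and thus cannot fail for want of
 * page table pages.)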
9923 */ 9924 void 9925 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 9926 { 9927 pdp_entry_t *pdpe; 9928 pd_entry_t *pde; 9929 vm_offset_t va; 9930 boolean_t changed; 9931 9932 if (len == 0) 9933 return; 9934 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9935 KASSERT((base & (len - 1)) == 0, 9936 ("pmap_demote_DMAP: base is not a multiple of len")); 9937 if (len < NBPDP && base < dmaplimit) { 9938 va = PHYS_TO_DMAP(base); 9939 changed = FALSE; 9940 PMAP_LOCK(kernel_pmap); 9941 pdpe = pmap_pdpe(kernel_pmap, va); 9942 if ((*pdpe & X86_PG_V) == 0) 9943 panic("pmap_demote_DMAP: invalid PDPE"); 9944 if ((*pdpe & PG_PS) != 0) { 9945 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9946 panic("pmap_demote_DMAP: PDPE failed"); 9947 changed = TRUE; 9948 } 9949 if (len < NBPDR) { 9950 pde = pmap_pdpe_to_pde(pdpe, va); 9951 if ((*pde & X86_PG_V) == 0) 9952 panic("pmap_demote_DMAP: invalid PDE"); 9953 if ((*pde & PG_PS) != 0) { 9954 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9955 panic("pmap_demote_DMAP: PDE failed"); 9956 changed = TRUE; 9957 } 9958 } 9959 if (changed && invalidate) 9960 pmap_invalidate_page(kernel_pmap, va); 9961 PMAP_UNLOCK(kernel_pmap); 9962 } 9963 } 9964 9965 /* 9966 * Perform the pmap work for mincore(2). If the page is not both referenced and 9967 * modified by this pmap, returns its physical address so that the caller can 9968 * find other mappings. 9969 */ 9970 int 9971 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9972 { 9973 pdp_entry_t *pdpe; 9974 pd_entry_t *pdep; 9975 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 9976 vm_paddr_t pa; 9977 int val; 9978 9979 PG_A = pmap_accessed_bit(pmap); 9980 PG_M = pmap_modified_bit(pmap); 9981 PG_V = pmap_valid_bit(pmap); 9982 PG_RW = pmap_rw_bit(pmap); 9983 9984 PMAP_LOCK(pmap); 9985 pte = 0; 9986 pa = 0; 9987 val = 0; 9988 pdpe = pmap_pdpe(pmap, addr); 9989 if (pdpe == NULL) 9990 goto out; 9991 if ((*pdpe & PG_V) != 0) { 9992 if ((*pdpe & PG_PS) != 0) { 9993 pte = *pdpe; 9994 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 9995 PG_FRAME; 9996 val = MINCORE_PSIND(2); 9997 } else { 9998 pdep = pmap_pde(pmap, addr); 9999 if (pdep != NULL && (*pdep & PG_V) != 0) { 10000 if ((*pdep & PG_PS) != 0) { 10001 pte = *pdep; 10002 /* Compute the physical address of the 4KB page. */ 10003 pa = ((pte & PG_PS_FRAME) | (addr & 10004 PDRMASK)) & PG_FRAME; 10005 val = MINCORE_PSIND(1); 10006 } else { 10007 pte = *pmap_pde_to_pte(pdep, addr); 10008 pa = pte & PG_FRAME; 10009 val = 0; 10010 } 10011 } 10012 } 10013 } 10014 if ((pte & PG_V) != 0) { 10015 val |= MINCORE_INCORE; 10016 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 10017 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 10018 if ((pte & PG_A) != 0) 10019 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 10020 } 10021 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 10022 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 10023 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 10024 *pap = pa; 10025 } 10026 out: 10027 PMAP_UNLOCK(pmap); 10028 return (val); 10029 } 10030 10031 static uint64_t 10032 pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp) 10033 { 10034 uint32_t gen, new_gen, pcid_next; 10035 10036 CRITICAL_ASSERT(curthread); 10037 gen = PCPU_GET(pcid_gen); 10038 if (pcidp->pm_pcid == PMAP_PCID_KERN) 10039 return (pti ? 
	    0 : CR3_PCID_SAVE);
	if (pcidp->pm_gen == gen)
		return (CR3_PCID_SAVE);
	pcid_next = PCPU_GET(pcid_next);
	KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
	    (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
	    ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next));
	if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
	    (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
		new_gen = gen + 1;
		if (new_gen == 0)
			new_gen = 1;
		PCPU_SET(pcid_gen, new_gen);
		pcid_next = PMAP_PCID_KERN + 1;
	} else {
		new_gen = gen;
	}
	pcidp->pm_pcid = pcid_next;
	pcidp->pm_gen = new_gen;
	PCPU_SET(pcid_next, pcid_next + 1);
	return (0);
}

static uint64_t
pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp)
{
	uint64_t cached;

	cached = pmap_pcid_alloc(pmap, pcidp);
	KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX,
	    ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid));
	KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap,
	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
	    pmap, PCPU_GET(cpuid), pcidp->pm_pcid));
	return (cached);
}

static void
pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
{

	PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ?
	    PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
}

static void
pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
{
	pmap_t old_pmap;
	struct pmap_pcid *pcidp, *old_pcidp;
	uint64_t cached, cr3, kcr3, ucr3;

	KASSERT((read_rflags() & PSL_I) == 0,
	    ("PCID needs interrupts disabled in pmap_activate_sw()"));

	/* See the comment in pmap_invalidate_page_pcid().
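	 * (Clearing old_pcidp->pm_gen below forces a fresh PCID
	 * allocation the next time that pmap is activated on this CPU,
	 * so its stale user page table TLB entries cannot be reused.)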
*/ 10095 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 10096 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 10097 old_pmap = PCPU_GET(curpmap); 10098 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 10099 old_pcidp = zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid); 10100 old_pcidp->pm_gen = 0; 10101 } 10102 10103 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10104 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10105 cr3 = rcr3(); 10106 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10107 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid); 10108 PCPU_SET(curpmap, pmap); 10109 kcr3 = pmap->pm_cr3 | pcidp->pm_pcid; 10110 ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT; 10111 10112 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 10113 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 10114 10115 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 10116 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 10117 if (cached) 10118 counter_u64_add(pcid_save_cnt, 1); 10119 10120 pmap_activate_sw_pti_post(td, pmap); 10121 } 10122 10123 static void 10124 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 10125 u_int cpuid) 10126 { 10127 struct pmap_pcid *pcidp; 10128 uint64_t cached, cr3; 10129 10130 KASSERT((read_rflags() & PSL_I) == 0, 10131 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10132 10133 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10134 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10135 cr3 = rcr3(); 10136 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10137 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached); 10138 PCPU_SET(curpmap, pmap); 10139 if (cached) 10140 counter_u64_add(pcid_save_cnt, 1); 10141 } 10142 10143 static void 10144 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 10145 u_int cpuid __unused) 10146 { 10147 10148 load_cr3(pmap->pm_cr3); 10149 PCPU_SET(curpmap, pmap); 10150 } 10151 10152 static void 10153 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 10154 u_int cpuid __unused) 10155 { 10156 10157 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 10158 PCPU_SET(kcr3, pmap->pm_cr3); 10159 PCPU_SET(ucr3, pmap->pm_ucr3); 10160 pmap_activate_sw_pti_post(td, pmap); 10161 } 10162 10163 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 10164 u_int)) 10165 { 10166 10167 if (pmap_pcid_enabled && pti) 10168 return (pmap_activate_sw_pcid_pti); 10169 else if (pmap_pcid_enabled && !pti) 10170 return (pmap_activate_sw_pcid_nopti); 10171 else if (!pmap_pcid_enabled && pti) 10172 return (pmap_activate_sw_nopcid_pti); 10173 else /* if (!pmap_pcid_enabled && !pti) */ 10174 return (pmap_activate_sw_nopcid_nopti); 10175 } 10176 10177 void 10178 pmap_activate_sw(struct thread *td) 10179 { 10180 pmap_t oldpmap, pmap; 10181 u_int cpuid; 10182 10183 oldpmap = PCPU_GET(curpmap); 10184 pmap = vmspace_pmap(td->td_proc->p_vmspace); 10185 if (oldpmap == pmap) { 10186 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10187 mfence(); 10188 return; 10189 } 10190 cpuid = PCPU_GET(cpuid); 10191 #ifdef SMP 10192 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10193 #else 10194 CPU_SET(cpuid, &pmap->pm_active); 10195 #endif 10196 pmap_activate_sw_mode(td, pmap, cpuid); 10197 #ifdef SMP 10198 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 10199 #else 10200 CPU_CLR(cpuid, &oldpmap->pm_active); 10201 #endif 10202 } 10203 10204 void 10205 pmap_activate(struct thread *td) 10206 { 10207 /* 10208 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10209 * invalidate_all IPI, which checks for curpmap == 10210 * smp_tlb_pmap. 
The below sequence of operations has a
	 * window where %CR3 is loaded with the new pmap's PML4
	 * address, but the curpmap value has not yet been updated.
	 * This causes the invltlb IPI handler, which is called
	 * between the updates, to execute as a NOP, which leaves
	 * stale TLB entries.
	 *
	 * Note that the most common use of pmap_activate_sw(), from
	 * a context switch, is immune to this race, because
	 * interrupts are disabled (while the thread lock is owned),
	 * so the IPI is delayed until after curpmap is updated.  Protect
	 * other callers in a similar way, by disabling interrupts
	 * around the %cr3 register reload and curpmap assignment.
	 */
	spinlock_enter();
	pmap_activate_sw(td);
	spinlock_exit();
}

void
pmap_activate_boot(pmap_t pmap)
{
	uint64_t kcr3;
	u_int cpuid;

	/*
	 * The kernel_pmap must never be deactivated, and we ensure
	 * that by never activating it at all.
	 */
	MPASS(pmap != kernel_pmap);

	cpuid = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_SET(cpuid, &pmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);
	if (pti) {
		kcr3 = pmap->pm_cr3;
		if (pmap_pcid_enabled)
			kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE;
	} else {
		kcr3 = PMAP_NO_CR3;
	}
	PCPU_SET(kcr3, kcr3);
	PCPU_SET(ucr3, PMAP_NO_CR3);
}

void
pmap_active_cpus(pmap_t pmap, cpuset_t *res)
{
	*res = pmap->pm_active;
}

void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
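 *
 * (A worked example with made-up numbers: a mapping that starts at
 * offset 0x201000 within its object has superpage_offset 0x1000.  If
 * *addr is 2MB-aligned, 2MB-aligned runs of the mapping never
 * correspond to 2MB-aligned runs of the object, and nothing can be
 * promoted; advancing *addr so that (*addr & PDRMASK) equals 0x1000
 * makes the interior of a large enough mapping promotable.)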
10273 */ 10274 void 10275 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10276 vm_offset_t *addr, vm_size_t size) 10277 { 10278 vm_offset_t superpage_offset; 10279 10280 if (size < NBPDR) 10281 return; 10282 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10283 offset += ptoa(object->pg_color); 10284 superpage_offset = offset & PDRMASK; 10285 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10286 (*addr & PDRMASK) == superpage_offset) 10287 return; 10288 if ((*addr & PDRMASK) < superpage_offset) 10289 *addr = (*addr & ~PDRMASK) + superpage_offset; 10290 else 10291 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10292 } 10293 10294 #ifdef INVARIANTS 10295 static unsigned long num_dirty_emulations; 10296 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10297 &num_dirty_emulations, 0, NULL); 10298 10299 static unsigned long num_accessed_emulations; 10300 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10301 &num_accessed_emulations, 0, NULL); 10302 10303 static unsigned long num_superpage_accessed_emulations; 10304 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10305 &num_superpage_accessed_emulations, 0, NULL); 10306 10307 static unsigned long ad_emulation_superpage_promotions; 10308 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10309 &ad_emulation_superpage_promotions, 0, NULL); 10310 #endif /* INVARIANTS */ 10311 10312 int 10313 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10314 { 10315 int rv; 10316 struct rwlock *lock; 10317 #if VM_NRESERVLEVEL > 0 10318 vm_page_t m, mpte; 10319 #endif 10320 pd_entry_t *pde; 10321 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10322 10323 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10324 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10325 10326 if (!pmap_emulate_ad_bits(pmap)) 10327 return (-1); 10328 10329 PG_A = pmap_accessed_bit(pmap); 10330 PG_M = pmap_modified_bit(pmap); 10331 PG_V = pmap_valid_bit(pmap); 10332 PG_RW = pmap_rw_bit(pmap); 10333 10334 rv = -1; 10335 lock = NULL; 10336 PMAP_LOCK(pmap); 10337 10338 pde = pmap_pde(pmap, va); 10339 if (pde == NULL || (*pde & PG_V) == 0) 10340 goto done; 10341 10342 if ((*pde & PG_PS) != 0) { 10343 if (ftype == VM_PROT_READ) { 10344 #ifdef INVARIANTS 10345 atomic_add_long(&num_superpage_accessed_emulations, 1); 10346 #endif 10347 *pde |= PG_A; 10348 rv = 0; 10349 } 10350 goto done; 10351 } 10352 10353 pte = pmap_pde_to_pte(pde, va); 10354 if ((*pte & PG_V) == 0) 10355 goto done; 10356 10357 if (ftype == VM_PROT_WRITE) { 10358 if ((*pte & PG_RW) == 0) 10359 goto done; 10360 /* 10361 * Set the modified and accessed bits simultaneously. 10362 * 10363 * Intel EPT PTEs that do software emulation of A/D bits map 10364 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10365 * An EPT misconfiguration is triggered if the PTE is writable 10366 * but not readable (WR=10). This is avoided by setting PG_A 10367 * and PG_M simultaneously. 
10368 */ 10369 *pte |= PG_M | PG_A; 10370 } else { 10371 *pte |= PG_A; 10372 } 10373 10374 #if VM_NRESERVLEVEL > 0 10375 /* try to promote the mapping */ 10376 if (va < VM_MAXUSER_ADDRESS) 10377 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10378 else 10379 mpte = NULL; 10380 10381 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10382 10383 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10384 (m->flags & PG_FICTITIOUS) == 0 && 10385 vm_reserv_level_iffullpop(m) == 0 && 10386 pmap_promote_pde(pmap, pde, va, mpte, &lock)) { 10387 #ifdef INVARIANTS 10388 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10389 #endif 10390 } 10391 #endif 10392 10393 #ifdef INVARIANTS 10394 if (ftype == VM_PROT_WRITE) 10395 atomic_add_long(&num_dirty_emulations, 1); 10396 else 10397 atomic_add_long(&num_accessed_emulations, 1); 10398 #endif 10399 rv = 0; /* success */ 10400 done: 10401 if (lock != NULL) 10402 rw_wunlock(lock); 10403 PMAP_UNLOCK(pmap); 10404 return (rv); 10405 } 10406 10407 void 10408 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10409 { 10410 pml4_entry_t *pml4; 10411 pdp_entry_t *pdp; 10412 pd_entry_t *pde; 10413 pt_entry_t *pte, PG_V; 10414 int idx; 10415 10416 idx = 0; 10417 PG_V = pmap_valid_bit(pmap); 10418 PMAP_LOCK(pmap); 10419 10420 pml4 = pmap_pml4e(pmap, va); 10421 if (pml4 == NULL) 10422 goto done; 10423 ptr[idx++] = *pml4; 10424 if ((*pml4 & PG_V) == 0) 10425 goto done; 10426 10427 pdp = pmap_pml4e_to_pdpe(pml4, va); 10428 ptr[idx++] = *pdp; 10429 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10430 goto done; 10431 10432 pde = pmap_pdpe_to_pde(pdp, va); 10433 ptr[idx++] = *pde; 10434 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10435 goto done; 10436 10437 pte = pmap_pde_to_pte(pde, va); 10438 ptr[idx++] = *pte; 10439 10440 done: 10441 PMAP_UNLOCK(pmap); 10442 *num = idx; 10443 } 10444 10445 /** 10446 * Get the kernel virtual address of a set of physical pages. If there are 10447 * physical addresses not covered by the DMAP perform a transient mapping 10448 * that will be removed when calling pmap_unmap_io_transient. 10449 * 10450 * \param page The pages the caller wishes to obtain the virtual 10451 * address on the kernel memory map. 10452 * \param vaddr On return contains the kernel virtual memory address 10453 * of the pages passed in the page parameter. 10454 * \param count Number of pages passed in. 10455 * \param can_fault true if the thread using the mapped pages can take 10456 * page faults, false otherwise. 10457 * 10458 * \returns true if the caller must call pmap_unmap_io_transient when 10459 * finished or false otherwise. 10460 * 10461 */ 10462 bool 10463 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10464 bool can_fault) 10465 { 10466 vm_paddr_t paddr; 10467 bool needs_mapping; 10468 pt_entry_t *pte; 10469 int cache_bits, error __unused, i; 10470 10471 /* 10472 * Allocate any KVA space that we need, this is done in a separate 10473 * loop to prevent calling vmem_alloc while pinned. 
10474 */ 10475 needs_mapping = false; 10476 for (i = 0; i < count; i++) { 10477 paddr = VM_PAGE_TO_PHYS(page[i]); 10478 if (__predict_false(paddr >= dmaplimit)) { 10479 error = vmem_alloc(kernel_arena, PAGE_SIZE, 10480 M_BESTFIT | M_WAITOK, &vaddr[i]); 10481 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 10482 needs_mapping = true; 10483 } else { 10484 vaddr[i] = PHYS_TO_DMAP(paddr); 10485 } 10486 } 10487 10488 /* Exit early if everything is covered by the DMAP */ 10489 if (!needs_mapping) 10490 return (false); 10491 10492 /* 10493 * NB: The sequence of updating a page table followed by accesses 10494 * to the corresponding pages used in the !DMAP case is subject to 10495 * the situation described in the "AMD64 Architecture Programmer's 10496 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 10497 * Coherency Considerations". Therefore, issuing the INVLPG right 10498 * after modifying the PTE bits is crucial. 10499 */ 10500 if (!can_fault) 10501 sched_pin(); 10502 for (i = 0; i < count; i++) { 10503 paddr = VM_PAGE_TO_PHYS(page[i]); 10504 if (paddr >= dmaplimit) { 10505 if (can_fault) { 10506 /* 10507 * Slow path, since we can get page faults 10508 * while mappings are active don't pin the 10509 * thread to the CPU and instead add a global 10510 * mapping visible to all CPUs. 10511 */ 10512 pmap_qenter(vaddr[i], &page[i], 1); 10513 } else { 10514 pte = vtopte(vaddr[i]); 10515 cache_bits = pmap_cache_bits(kernel_pmap, 10516 page[i]->md.pat_mode, false); 10517 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 10518 cache_bits); 10519 pmap_invlpg(kernel_pmap, vaddr[i]); 10520 } 10521 } 10522 } 10523 10524 return (needs_mapping); 10525 } 10526 10527 void 10528 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10529 bool can_fault) 10530 { 10531 vm_paddr_t paddr; 10532 int i; 10533 10534 if (!can_fault) 10535 sched_unpin(); 10536 for (i = 0; i < count; i++) { 10537 paddr = VM_PAGE_TO_PHYS(page[i]); 10538 if (paddr >= dmaplimit) { 10539 if (can_fault) 10540 pmap_qremove(vaddr[i], 1); 10541 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 10542 } 10543 } 10544 } 10545 10546 vm_offset_t 10547 pmap_quick_enter_page(vm_page_t m) 10548 { 10549 vm_paddr_t paddr; 10550 10551 paddr = VM_PAGE_TO_PHYS(m); 10552 if (paddr < dmaplimit) 10553 return (PHYS_TO_DMAP(paddr)); 10554 mtx_lock_spin(&qframe_mtx); 10555 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 10556 10557 /* 10558 * Since qframe is exclusively mapped by us, and we do not set 10559 * PG_G, we can use INVLPG here. 10560 */ 10561 invlpg(qframe); 10562 10563 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10564 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 10565 return (qframe); 10566 } 10567 10568 void 10569 pmap_quick_remove_page(vm_offset_t addr) 10570 { 10571 10572 if (addr != qframe) 10573 return; 10574 pte_store(vtopte(qframe), 0); 10575 mtx_unlock_spin(&qframe_mtx); 10576 } 10577 10578 /* 10579 * Pdp pages from the large map are managed differently from either 10580 * kernel or user page table pages. They are permanently allocated at 10581 * initialization time, and their reference count is permanently set to 10582 * zero. The pml4 entries pointing to those pages are copied into 10583 * each allocated pmap. 10584 * 10585 * In contrast, pd and pt pages are managed like user page table 10586 * pages. They are dynamically allocated, and their reference count 10587 * represents the number of valid entries within the page. 
10588 */ 10589 static vm_page_t 10590 pmap_large_map_getptp_unlocked(void) 10591 { 10592 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10593 } 10594 10595 static vm_page_t 10596 pmap_large_map_getptp(void) 10597 { 10598 vm_page_t m; 10599 10600 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10601 m = pmap_large_map_getptp_unlocked(); 10602 if (m == NULL) { 10603 PMAP_UNLOCK(kernel_pmap); 10604 vm_wait(NULL); 10605 PMAP_LOCK(kernel_pmap); 10606 /* Callers retry. */ 10607 } 10608 return (m); 10609 } 10610 10611 static pdp_entry_t * 10612 pmap_large_map_pdpe(vm_offset_t va) 10613 { 10614 vm_pindex_t pml4_idx; 10615 vm_paddr_t mphys; 10616 10617 pml4_idx = pmap_pml4e_index(va); 10618 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10619 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10620 "%#jx lm_ents %d", 10621 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10622 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10623 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10624 "LMSPML4I %#jx lm_ents %d", 10625 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10626 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10627 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10628 } 10629 10630 static pd_entry_t * 10631 pmap_large_map_pde(vm_offset_t va) 10632 { 10633 pdp_entry_t *pdpe; 10634 vm_page_t m; 10635 vm_paddr_t mphys; 10636 10637 retry: 10638 pdpe = pmap_large_map_pdpe(va); 10639 if (*pdpe == 0) { 10640 m = pmap_large_map_getptp(); 10641 if (m == NULL) 10642 goto retry; 10643 mphys = VM_PAGE_TO_PHYS(m); 10644 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10645 } else { 10646 MPASS((*pdpe & X86_PG_PS) == 0); 10647 mphys = *pdpe & PG_FRAME; 10648 } 10649 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10650 } 10651 10652 static pt_entry_t * 10653 pmap_large_map_pte(vm_offset_t va) 10654 { 10655 pd_entry_t *pde; 10656 vm_page_t m; 10657 vm_paddr_t mphys; 10658 10659 retry: 10660 pde = pmap_large_map_pde(va); 10661 if (*pde == 0) { 10662 m = pmap_large_map_getptp(); 10663 if (m == NULL) 10664 goto retry; 10665 mphys = VM_PAGE_TO_PHYS(m); 10666 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10667 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10668 } else { 10669 MPASS((*pde & X86_PG_PS) == 0); 10670 mphys = *pde & PG_FRAME; 10671 } 10672 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10673 } 10674 10675 static vm_paddr_t 10676 pmap_large_map_kextract(vm_offset_t va) 10677 { 10678 pdp_entry_t *pdpe, pdp; 10679 pd_entry_t *pde, pd; 10680 pt_entry_t *pte, pt; 10681 10682 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10683 ("not largemap range %#lx", (u_long)va)); 10684 pdpe = pmap_large_map_pdpe(va); 10685 pdp = *pdpe; 10686 KASSERT((pdp & X86_PG_V) != 0, 10687 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10688 (u_long)pdpe, pdp)); 10689 if ((pdp & X86_PG_PS) != 0) { 10690 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10691 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10692 (u_long)pdpe, pdp)); 10693 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10694 } 10695 pde = pmap_pdpe_to_pde(pdpe, va); 10696 pd = *pde; 10697 KASSERT((pd & X86_PG_V) != 0, 10698 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10699 if ((pd & X86_PG_PS) != 0) 10700 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10701 pte = pmap_pde_to_pte(pde, va); 10702 pt = *pte; 10703 KASSERT((pt & X86_PG_V) != 0, 10704 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 
	return ((pt & PG_FRAME) | (va & PAGE_MASK));
}

static int
pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
    vmem_addr_t *vmem_res)
{

	/*
	 * Large mappings are all but static.  Consequently, there
	 * is no point in waiting for an earlier allocation to be
	 * freed.
	 */
	return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
}

int
pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
    vm_memattr_t mattr)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_offset_t va, inc;
	vmem_addr_t vmem_res;
	vm_paddr_t pa;
	int error;

	if (len == 0 || spa + len < spa)
		return (EINVAL);

	/* See if DMAP can serve. */
	if (spa + len <= dmaplimit) {
		va = PHYS_TO_DMAP(spa);
		*addr = (void *)va;
		return (pmap_change_attr(va, len, mattr));
	}

	/*
	 * No, allocate KVA.  Fit the address with best possible
	 * alignment for superpages.  Fall back to worse align if
	 * failed.
	 */
	error = ENOMEM;
	if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
	    NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
		error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
		    &vmem_res);
	if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
	    NBPDR) + NBPDR)
		error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
		    &vmem_res);
	if (error != 0)
		error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
	if (error != 0)
		return (error);

	/*
	 * Fill pagetable.  PG_M is not pre-set, we scan modified bits
	 * in the pagetable to minimize flushing.  No need to
	 * invalidate TLB, since we only update invalid entries.
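	 *
	 * (Mapping sizes are chosen greedily below: 1GB pages when the
	 * CPU supports them and the physical and virtual addresses are
	 * suitably aligned with at least 1GB remaining, then 2MB, then
	 * 4KB.)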
10767 */ 10768 PMAP_LOCK(kernel_pmap); 10769 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10770 len -= inc) { 10771 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10772 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10773 pdpe = pmap_large_map_pdpe(va); 10774 MPASS(*pdpe == 0); 10775 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10776 X86_PG_V | X86_PG_A | pg_nx | 10777 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10778 inc = NBPDP; 10779 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10780 (va & PDRMASK) == 0) { 10781 pde = pmap_large_map_pde(va); 10782 MPASS(*pde == 0); 10783 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10784 X86_PG_V | X86_PG_A | pg_nx | 10785 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10786 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10787 ref_count++; 10788 inc = NBPDR; 10789 } else { 10790 pte = pmap_large_map_pte(va); 10791 MPASS(*pte == 0); 10792 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10793 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10794 mattr, FALSE); 10795 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10796 ref_count++; 10797 inc = PAGE_SIZE; 10798 } 10799 } 10800 PMAP_UNLOCK(kernel_pmap); 10801 MPASS(len == 0); 10802 10803 *addr = (void *)vmem_res; 10804 return (0); 10805 } 10806 10807 void 10808 pmap_large_unmap(void *svaa, vm_size_t len) 10809 { 10810 vm_offset_t sva, va; 10811 vm_size_t inc; 10812 pdp_entry_t *pdpe, pdp; 10813 pd_entry_t *pde, pd; 10814 pt_entry_t *pte; 10815 vm_page_t m; 10816 struct spglist spgf; 10817 10818 sva = (vm_offset_t)svaa; 10819 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10820 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10821 return; 10822 10823 SLIST_INIT(&spgf); 10824 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10825 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10826 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10827 PMAP_LOCK(kernel_pmap); 10828 for (va = sva; va < sva + len; va += inc) { 10829 pdpe = pmap_large_map_pdpe(va); 10830 pdp = *pdpe; 10831 KASSERT((pdp & X86_PG_V) != 0, 10832 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10833 (u_long)pdpe, pdp)); 10834 if ((pdp & X86_PG_PS) != 0) { 10835 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10836 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10837 (u_long)pdpe, pdp)); 10838 KASSERT((va & PDPMASK) == 0, 10839 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10840 (u_long)pdpe, pdp)); 10841 KASSERT(va + NBPDP <= sva + len, 10842 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10843 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10844 (u_long)pdpe, pdp, len)); 10845 *pdpe = 0; 10846 inc = NBPDP; 10847 continue; 10848 } 10849 pde = pmap_pdpe_to_pde(pdpe, va); 10850 pd = *pde; 10851 KASSERT((pd & X86_PG_V) != 0, 10852 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10853 (u_long)pde, pd)); 10854 if ((pd & X86_PG_PS) != 0) { 10855 KASSERT((va & PDRMASK) == 0, 10856 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10857 (u_long)pde, pd)); 10858 KASSERT(va + NBPDR <= sva + len, 10859 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10860 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10861 pd, len)); 10862 pde_store(pde, 0); 10863 inc = NBPDR; 10864 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10865 m->ref_count--; 10866 if (m->ref_count == 0) { 10867 *pdpe = 0; 10868 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10869 } 10870 continue; 10871 } 10872 pte = pmap_pde_to_pte(pde, va); 10873 KASSERT((*pte & X86_PG_V) != 0, 10874 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10875 (u_long)pte, *pte)); 
10876 pte_clear(pte); 10877 inc = PAGE_SIZE; 10878 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10879 m->ref_count--; 10880 if (m->ref_count == 0) { 10881 *pde = 0; 10882 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10883 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10884 m->ref_count--; 10885 if (m->ref_count == 0) { 10886 *pdpe = 0; 10887 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10888 } 10889 } 10890 } 10891 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10892 PMAP_UNLOCK(kernel_pmap); 10893 vm_page_free_pages_toq(&spgf, false); 10894 vmem_free(large_vmem, sva, len); 10895 } 10896 10897 static void 10898 pmap_large_map_wb_fence_mfence(void) 10899 { 10900 10901 mfence(); 10902 } 10903 10904 static void 10905 pmap_large_map_wb_fence_atomic(void) 10906 { 10907 10908 atomic_thread_fence_seq_cst(); 10909 } 10910 10911 static void 10912 pmap_large_map_wb_fence_nop(void) 10913 { 10914 } 10915 10916 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10917 { 10918 10919 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10920 return (pmap_large_map_wb_fence_mfence); 10921 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10922 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10923 return (pmap_large_map_wb_fence_atomic); 10924 else 10925 /* clflush is strongly enough ordered */ 10926 return (pmap_large_map_wb_fence_nop); 10927 } 10928 10929 static void 10930 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10931 { 10932 10933 for (; len > 0; len -= cpu_clflush_line_size, 10934 va += cpu_clflush_line_size) 10935 clwb(va); 10936 } 10937 10938 static void 10939 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10940 { 10941 10942 for (; len > 0; len -= cpu_clflush_line_size, 10943 va += cpu_clflush_line_size) 10944 clflushopt(va); 10945 } 10946 10947 static void 10948 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10949 { 10950 10951 for (; len > 0; len -= cpu_clflush_line_size, 10952 va += cpu_clflush_line_size) 10953 clflush(va); 10954 } 10955 10956 static void 10957 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 10958 { 10959 } 10960 10961 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 10962 { 10963 10964 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 10965 return (pmap_large_map_flush_range_clwb); 10966 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 10967 return (pmap_large_map_flush_range_clflushopt); 10968 else if ((cpu_feature & CPUID_CLFSH) != 0) 10969 return (pmap_large_map_flush_range_clflush); 10970 else 10971 return (pmap_large_map_flush_range_nop); 10972 } 10973 10974 static void 10975 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 10976 { 10977 volatile u_long *pe; 10978 u_long p; 10979 vm_offset_t va; 10980 vm_size_t inc; 10981 bool seen_other; 10982 10983 for (va = sva; va < eva; va += inc) { 10984 inc = 0; 10985 if ((amd_feature & AMDID_PAGE1GB) != 0) { 10986 pe = (volatile u_long *)pmap_large_map_pdpe(va); 10987 p = *pe; 10988 if ((p & X86_PG_PS) != 0) 10989 inc = NBPDP; 10990 } 10991 if (inc == 0) { 10992 pe = (volatile u_long *)pmap_large_map_pde(va); 10993 p = *pe; 10994 if ((p & X86_PG_PS) != 0) 10995 inc = NBPDR; 10996 } 10997 if (inc == 0) { 10998 pe = (volatile u_long *)pmap_large_map_pte(va); 10999 p = *pe; 11000 inc = PAGE_SIZE; 11001 } 11002 seen_other = false; 11003 for (;;) { 11004 if ((p & X86_PG_AVAIL1) != 0) { 11005 /* 11006 * Spin-wait for the end of a parallel 11007 * write-back. 
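				 * (X86_PG_AVAIL1 is set by a concurrent
				 * invocation of this function while it
				 * flushes the range, and cleared when its
				 * flush is done; see below.)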
				 */
				cpu_spinwait();
				p = *pe;

				/*
				 * If we saw other write-back
				 * occurring, we cannot rely on PG_M to
				 * indicate state of the cache.  The
				 * PG_M bit is cleared before the
				 * flush to avoid ignoring new writes,
				 * and writes which are relevant for
				 * us might happen after.
				 */
				seen_other = true;
				continue;
			}

			if ((p & X86_PG_M) != 0 || seen_other) {
				if (!atomic_fcmpset_long(pe, &p,
				    (p & ~X86_PG_M) | X86_PG_AVAIL1))
					/*
					 * If we saw PG_M without
					 * PG_AVAIL1, and then on the
					 * next attempt we do not
					 * observe either PG_M or
					 * PG_AVAIL1, the other
					 * write-back started after us
					 * and finished before us.  We
					 * can rely on it doing our
					 * work.
					 */
					continue;
				pmap_large_map_flush_range(va, inc);
				atomic_clear_long(pe, X86_PG_AVAIL1);
			}
			break;
		}
		maybe_yield();
	}
}

/*
 * Write-back cache lines for the given address range.
 *
 * Must be called only on the range or sub-range returned from
 * pmap_large_map().  Must not be called on the coalesced ranges.
 *
 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
 * instruction support.
 */
void
pmap_large_map_wb(void *svap, vm_size_t len)
{
	vm_offset_t eva, sva;

	sva = (vm_offset_t)svap;
	eva = sva + len;
	pmap_large_map_wb_fence();
	if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
		pmap_large_map_flush_range(sva, len);
	} else {
		KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
		    eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
		pmap_large_map_wb_large(sva, eva);
	}
	pmap_large_map_wb_fence();
}

static vm_page_t
pmap_pti_alloc_page(void)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
	m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	return (m);
}

static bool
pmap_pti_free_page(vm_page_t m)
{
	if (!vm_page_unwire_noq(m))
		return (false);
	vm_page_xbusy_claim(m);
	vm_page_free_zero(m);
	return (true);
}

static void
pmap_pti_init(void)
{
	vm_page_t pml4_pg;
	pdp_entry_t *pdpe;
	vm_offset_t va;
	int i;

	if (!pti)
		return;
	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
	VM_OBJECT_WLOCK(pti_obj);
	pml4_pg = pmap_pti_alloc_page();
	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
		pdpe = pmap_pti_pdpe(va);
		pmap_pti_wire_pte(pdpe);
	}
	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
	    sizeof(struct gate_descriptor) * NIDT, false);
	CPU_FOREACH(i) {
		/* Doublefault stack IST 1 */
		va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
		pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
		/* NMI stack IST 2 */
		va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
		pmap_pti_add_kva_locked(va - NMI_STACK_SIZE,
va, false); 11127 /* MC# stack IST 3 */ 11128 va = __pcpu[i].pc_common_tss.tss_ist3 + 11129 sizeof(struct nmi_pcpu); 11130 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 11131 /* DB# stack IST 4 */ 11132 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 11133 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 11134 } 11135 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 11136 true); 11137 pti_finalized = true; 11138 VM_OBJECT_WUNLOCK(pti_obj); 11139 } 11140 11141 static void 11142 pmap_cpu_init(void *arg __unused) 11143 { 11144 CPU_COPY(&all_cpus, &kernel_pmap->pm_active); 11145 pmap_pti_init(); 11146 } 11147 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL); 11148 11149 static pdp_entry_t * 11150 pmap_pti_pdpe(vm_offset_t va) 11151 { 11152 pml4_entry_t *pml4e; 11153 pdp_entry_t *pdpe; 11154 vm_page_t m; 11155 vm_pindex_t pml4_idx; 11156 vm_paddr_t mphys; 11157 11158 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11159 11160 pml4_idx = pmap_pml4e_index(va); 11161 pml4e = &pti_pml4[pml4_idx]; 11162 m = NULL; 11163 if (*pml4e == 0) { 11164 if (pti_finalized) 11165 panic("pml4 alloc after finalization\n"); 11166 m = pmap_pti_alloc_page(); 11167 if (*pml4e != 0) { 11168 pmap_pti_free_page(m); 11169 mphys = *pml4e & ~PAGE_MASK; 11170 } else { 11171 mphys = VM_PAGE_TO_PHYS(m); 11172 *pml4e = mphys | X86_PG_RW | X86_PG_V; 11173 } 11174 } else { 11175 mphys = *pml4e & ~PAGE_MASK; 11176 } 11177 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 11178 return (pdpe); 11179 } 11180 11181 static void 11182 pmap_pti_wire_pte(void *pte) 11183 { 11184 vm_page_t m; 11185 11186 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11187 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11188 m->ref_count++; 11189 } 11190 11191 static void 11192 pmap_pti_unwire_pde(void *pde, bool only_ref) 11193 { 11194 vm_page_t m; 11195 11196 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11197 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 11198 MPASS(only_ref || m->ref_count > 1); 11199 pmap_pti_free_page(m); 11200 } 11201 11202 static void 11203 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 11204 { 11205 vm_page_t m; 11206 pd_entry_t *pde; 11207 11208 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11209 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11210 if (pmap_pti_free_page(m)) { 11211 pde = pmap_pti_pde(va); 11212 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 11213 *pde = 0; 11214 pmap_pti_unwire_pde(pde, false); 11215 } 11216 } 11217 11218 static pd_entry_t * 11219 pmap_pti_pde(vm_offset_t va) 11220 { 11221 pdp_entry_t *pdpe; 11222 pd_entry_t *pde; 11223 vm_page_t m; 11224 vm_pindex_t pd_idx; 11225 vm_paddr_t mphys; 11226 11227 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11228 11229 pdpe = pmap_pti_pdpe(va); 11230 if (*pdpe == 0) { 11231 m = pmap_pti_alloc_page(); 11232 if (*pdpe != 0) { 11233 pmap_pti_free_page(m); 11234 MPASS((*pdpe & X86_PG_PS) == 0); 11235 mphys = *pdpe & ~PAGE_MASK; 11236 } else { 11237 mphys = VM_PAGE_TO_PHYS(m); 11238 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11239 } 11240 } else { 11241 MPASS((*pdpe & X86_PG_PS) == 0); 11242 mphys = *pdpe & ~PAGE_MASK; 11243 } 11244 11245 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11246 pd_idx = pmap_pde_index(va); 11247 pde += pd_idx; 11248 return (pde); 11249 } 11250 11251 static pt_entry_t * 11252 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11253 { 11254 pd_entry_t *pde; 11255 pt_entry_t *pte; 11256 vm_page_t m; 11257 vm_paddr_t mphys; 11258 11259 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11260 11261 pde = pmap_pti_pde(va); 
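	/*
	 * Optionally take an extra reference on the pd page; the caller
	 * drops it with pmap_pti_unwire_pde() once the new PTE has been
	 * wired (see pmap_pti_add_kva_locked()).
	 */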
11262 if (unwire_pde != NULL) { 11263 *unwire_pde = true; 11264 pmap_pti_wire_pte(pde); 11265 } 11266 if (*pde == 0) { 11267 m = pmap_pti_alloc_page(); 11268 if (*pde != 0) { 11269 pmap_pti_free_page(m); 11270 MPASS((*pde & X86_PG_PS) == 0); 11271 mphys = *pde & ~(PAGE_MASK | pg_nx); 11272 } else { 11273 mphys = VM_PAGE_TO_PHYS(m); 11274 *pde = mphys | X86_PG_RW | X86_PG_V; 11275 if (unwire_pde != NULL) 11276 *unwire_pde = false; 11277 } 11278 } else { 11279 MPASS((*pde & X86_PG_PS) == 0); 11280 mphys = *pde & ~(PAGE_MASK | pg_nx); 11281 } 11282 11283 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11284 pte += pmap_pte_index(va); 11285 11286 return (pte); 11287 } 11288 11289 static void 11290 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11291 { 11292 vm_paddr_t pa; 11293 pd_entry_t *pde; 11294 pt_entry_t *pte, ptev; 11295 bool unwire_pde; 11296 11297 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11298 11299 sva = trunc_page(sva); 11300 MPASS(sva > VM_MAXUSER_ADDRESS); 11301 eva = round_page(eva); 11302 MPASS(sva < eva); 11303 for (; sva < eva; sva += PAGE_SIZE) { 11304 pte = pmap_pti_pte(sva, &unwire_pde); 11305 pa = pmap_kextract(sva); 11306 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11307 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11308 VM_MEMATTR_DEFAULT, FALSE); 11309 if (*pte == 0) { 11310 pte_store(pte, ptev); 11311 pmap_pti_wire_pte(pte); 11312 } else { 11313 KASSERT(!pti_finalized, 11314 ("pti overlap after fin %#lx %#lx %#lx", 11315 sva, *pte, ptev)); 11316 KASSERT(*pte == ptev, 11317 ("pti non-identical pte after fin %#lx %#lx %#lx", 11318 sva, *pte, ptev)); 11319 } 11320 if (unwire_pde) { 11321 pde = pmap_pti_pde(sva); 11322 pmap_pti_unwire_pde(pde, true); 11323 } 11324 } 11325 } 11326 11327 void 11328 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11329 { 11330 11331 if (!pti) 11332 return; 11333 VM_OBJECT_WLOCK(pti_obj); 11334 pmap_pti_add_kva_locked(sva, eva, exec); 11335 VM_OBJECT_WUNLOCK(pti_obj); 11336 } 11337 11338 void 11339 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11340 { 11341 pt_entry_t *pte; 11342 vm_offset_t va; 11343 11344 if (!pti) 11345 return; 11346 sva = rounddown2(sva, PAGE_SIZE); 11347 MPASS(sva > VM_MAXUSER_ADDRESS); 11348 eva = roundup2(eva, PAGE_SIZE); 11349 MPASS(sva < eva); 11350 VM_OBJECT_WLOCK(pti_obj); 11351 for (va = sva; va < eva; va += PAGE_SIZE) { 11352 pte = pmap_pti_pte(va, NULL); 11353 KASSERT((*pte & X86_PG_V) != 0, 11354 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11355 (u_long)pte, *pte)); 11356 pte_clear(pte); 11357 pmap_pti_unwire_pte(pte, va); 11358 } 11359 pmap_invalidate_range(kernel_pmap, sva, eva); 11360 VM_OBJECT_WUNLOCK(pti_obj); 11361 } 11362 11363 static void * 11364 pkru_dup_range(void *ctx __unused, void *data) 11365 { 11366 struct pmap_pkru_range *node, *new_node; 11367 11368 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11369 if (new_node == NULL) 11370 return (NULL); 11371 node = data; 11372 memcpy(new_node, node, sizeof(*node)); 11373 return (new_node); 11374 } 11375 11376 static void 11377 pkru_free_range(void *ctx __unused, void *node) 11378 { 11379 11380 uma_zfree(pmap_pkru_ranges_zone, node); 11381 } 11382 11383 static int 11384 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11385 int flags) 11386 { 11387 struct pmap_pkru_range *ppr; 11388 int error; 11389 11390 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11391 MPASS(pmap->pm_type == PT_X86); 11392 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11393 if ((flags & 
AMD64_PKRU_EXCL) != 0 && 11394 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11395 return (EBUSY); 11396 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11397 if (ppr == NULL) 11398 return (ENOMEM); 11399 ppr->pkru_keyidx = keyidx; 11400 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11401 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11402 if (error != 0) 11403 uma_zfree(pmap_pkru_ranges_zone, ppr); 11404 return (error); 11405 } 11406 11407 static int 11408 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11409 { 11410 11411 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11412 MPASS(pmap->pm_type == PT_X86); 11413 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11414 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11415 } 11416 11417 static void 11418 pmap_pkru_deassign_all(pmap_t pmap) 11419 { 11420 11421 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11422 if (pmap->pm_type == PT_X86 && 11423 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11424 rangeset_remove_all(&pmap->pm_pkru); 11425 } 11426 11427 static bool 11428 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11429 { 11430 struct pmap_pkru_range *ppr, *prev_ppr; 11431 vm_offset_t va; 11432 11433 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11434 if (pmap->pm_type != PT_X86 || 11435 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11436 sva >= VM_MAXUSER_ADDRESS) 11437 return (true); 11438 MPASS(eva <= VM_MAXUSER_ADDRESS); 11439 for (va = sva; va < eva; prev_ppr = ppr) { 11440 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11441 if (va == sva) 11442 prev_ppr = ppr; 11443 else if ((ppr == NULL) ^ (prev_ppr == NULL)) 11444 return (false); 11445 if (ppr == NULL) { 11446 va += PAGE_SIZE; 11447 continue; 11448 } 11449 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 11450 return (false); 11451 va = ppr->pkru_rs_el.re_end; 11452 } 11453 return (true); 11454 } 11455 11456 static pt_entry_t 11457 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11458 { 11459 struct pmap_pkru_range *ppr; 11460 11461 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11462 if (pmap->pm_type != PT_X86 || 11463 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11464 va >= VM_MAXUSER_ADDRESS) 11465 return (0); 11466 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11467 if (ppr != NULL) 11468 return (X86_PG_PKU(ppr->pkru_keyidx)); 11469 return (0); 11470 } 11471 11472 static bool 11473 pred_pkru_on_remove(void *ctx __unused, void *r) 11474 { 11475 struct pmap_pkru_range *ppr; 11476 11477 ppr = r; 11478 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11479 } 11480 11481 static void 11482 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11483 { 11484 11485 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11486 if (pmap->pm_type == PT_X86 && 11487 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11488 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11489 pred_pkru_on_remove); 11490 } 11491 } 11492 11493 static int 11494 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11495 { 11496 11497 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11498 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11499 MPASS(dst_pmap->pm_type == PT_X86); 11500 MPASS(src_pmap->pm_type == PT_X86); 11501 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11502 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11503 return (0); 11504 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11505 } 11506 11507 static void 11508 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11509 u_int keyidx) 11510 { 11511 pml4_entry_t *pml4e; 11512 pdp_entry_t *pdpe; 11513 pd_entry_t newpde, ptpaddr, *pde; 
11514 pt_entry_t newpte, *ptep, pte; 11515 vm_offset_t va, va_next; 11516 bool changed; 11517 11518 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11519 MPASS(pmap->pm_type == PT_X86); 11520 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 11521 11522 for (changed = false, va = sva; va < eva; va = va_next) { 11523 pml4e = pmap_pml4e(pmap, va); 11524 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { 11525 va_next = (va + NBPML4) & ~PML4MASK; 11526 if (va_next < va) 11527 va_next = eva; 11528 continue; 11529 } 11530 11531 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 11532 if ((*pdpe & X86_PG_V) == 0) { 11533 va_next = (va + NBPDP) & ~PDPMASK; 11534 if (va_next < va) 11535 va_next = eva; 11536 continue; 11537 } 11538 11539 va_next = (va + NBPDR) & ~PDRMASK; 11540 if (va_next < va) 11541 va_next = eva; 11542 11543 pde = pmap_pdpe_to_pde(pdpe, va); 11544 ptpaddr = *pde; 11545 if (ptpaddr == 0) 11546 continue; 11547 11548 MPASS((ptpaddr & X86_PG_V) != 0); 11549 if ((ptpaddr & PG_PS) != 0) { 11550 if (va + NBPDR == va_next && eva >= va_next) { 11551 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 11552 X86_PG_PKU(keyidx); 11553 if (newpde != ptpaddr) { 11554 *pde = newpde; 11555 changed = true; 11556 } 11557 continue; 11558 } else if (!pmap_demote_pde(pmap, pde, va)) { 11559 continue; 11560 } 11561 } 11562 11563 if (va_next > eva) 11564 va_next = eva; 11565 11566 for (ptep = pmap_pde_to_pte(pde, va); va != va_next; 11567 ptep++, va += PAGE_SIZE) { 11568 pte = *ptep; 11569 if ((pte & X86_PG_V) == 0) 11570 continue; 11571 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); 11572 if (newpte != pte) { 11573 *ptep = newpte; 11574 changed = true; 11575 } 11576 } 11577 } 11578 if (changed) 11579 pmap_invalidate_range(pmap, sva, eva); 11580 } 11581 11582 static int 11583 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11584 u_int keyidx, int flags) 11585 { 11586 11587 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || 11588 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) 11589 return (EINVAL); 11590 if (eva <= sva || eva > VM_MAXUSER_ADDRESS) 11591 return (EFAULT); 11592 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 11593 return (ENOTSUP); 11594 return (0); 11595 } 11596 11597 int 11598 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11599 int flags) 11600 { 11601 int error; 11602 11603 sva = trunc_page(sva); 11604 eva = round_page(eva); 11605 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); 11606 if (error != 0) 11607 return (error); 11608 for (;;) { 11609 PMAP_LOCK(pmap); 11610 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); 11611 if (error == 0) 11612 pmap_pkru_update_range(pmap, sva, eva, keyidx); 11613 PMAP_UNLOCK(pmap); 11614 if (error != ENOMEM) 11615 break; 11616 vm_wait(NULL); 11617 } 11618 return (error); 11619 } 11620 11621 int 11622 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11623 { 11624 int error; 11625 11626 sva = trunc_page(sva); 11627 eva = round_page(eva); 11628 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); 11629 if (error != 0) 11630 return (error); 11631 for (;;) { 11632 PMAP_LOCK(pmap); 11633 error = pmap_pkru_deassign(pmap, sva, eva); 11634 if (error == 0) 11635 pmap_pkru_update_range(pmap, sva, eva, 0); 11636 PMAP_UNLOCK(pmap); 11637 if (error != ENOMEM) 11638 break; 11639 vm_wait(NULL); 11640 } 11641 return (error); 11642 } 11643 11644 #if defined(KASAN) || defined(KMSAN) 11645 11646 /* 11647 * Reserve enough memory to: 11648 * 1) allocate PDP pages for the shadow map(s), 11649 * 2) shadow the boot 
stack of KSTACK_PAGES pages, 11650 * so we need one PD page, one or two PT pages, and KSTACK_PAGES shadow pages 11651 * per shadow map. 11652 */ 11653 #ifdef KASAN 11654 #define SAN_EARLY_PAGES \ 11655 (NKASANPML4E + 1 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE)) 11656 #else 11657 #define SAN_EARLY_PAGES \ 11658 (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (1 + 2 + KSTACK_PAGES)) 11659 #endif 11660 11661 static uint64_t __nosanitizeaddress __nosanitizememory 11662 pmap_san_enter_early_alloc_4k(uint64_t pabase) 11663 { 11664 static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE); 11665 static size_t offset = 0; 11666 uint64_t pa; 11667 11668 if (offset == sizeof(data)) { 11669 panic("%s: ran out of memory for the bootstrap shadow map", 11670 __func__); 11671 } 11672 11673 pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART); 11674 offset += PAGE_SIZE; 11675 return (pa); 11676 } 11677 11678 /* 11679 * Map a shadow page, before the kernel has bootstrapped its page tables. This 11680 * is currently only used to shadow the temporary boot stack set up by locore. 11681 */ 11682 static void __nosanitizeaddress __nosanitizememory 11683 pmap_san_enter_early(vm_offset_t va) 11684 { 11685 static bool first = true; 11686 pml4_entry_t *pml4e; 11687 pdp_entry_t *pdpe; 11688 pd_entry_t *pde; 11689 pt_entry_t *pte; 11690 uint64_t cr3, pa, base; 11691 int i; 11692 11693 base = amd64_loadaddr(); 11694 cr3 = rcr3(); 11695 11696 if (first) { 11697 /* 11698 * If this is the first call, we need to allocate new PML4Es for 11699 * the bootstrap shadow map(s). We don't know how the PML4 page 11700 * was initialized by the boot loader, so we can't simply test 11701 * whether the shadow map's PML4Es are zero. 11702 */ 11703 first = false; 11704 #ifdef KASAN 11705 for (i = 0; i < NKASANPML4E; i++) { 11706 pa = pmap_san_enter_early_alloc_4k(base); 11707 11708 pml4e = (pml4_entry_t *)cr3 + 11709 pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4); 11710 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11711 } 11712 #else 11713 for (i = 0; i < NKMSANORIGPML4E; i++) { 11714 pa = pmap_san_enter_early_alloc_4k(base); 11715 11716 pml4e = (pml4_entry_t *)cr3 + 11717 pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS + 11718 i * NBPML4); 11719 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11720 } 11721 for (i = 0; i < NKMSANSHADPML4E; i++) { 11722 pa = pmap_san_enter_early_alloc_4k(base); 11723 11724 pml4e = (pml4_entry_t *)cr3 + 11725 pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS + 11726 i * NBPML4); 11727 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11728 } 11729 #endif 11730 } 11731 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va); 11732 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va); 11733 if (*pdpe == 0) { 11734 pa = pmap_san_enter_early_alloc_4k(base); 11735 *pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V); 11736 } 11737 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va); 11738 if (*pde == 0) { 11739 pa = pmap_san_enter_early_alloc_4k(base); 11740 *pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V); 11741 } 11742 pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va); 11743 if (*pte != 0) 11744 panic("%s: PTE for %#lx is already initialized", __func__, va); 11745 pa = pmap_san_enter_early_alloc_4k(base); 11746 *pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V); 11747 } 11748 11749 static vm_page_t 11750 pmap_san_enter_alloc_4k(void) 11751 { 11752 vm_page_t m; 11753 11754 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 11755 VM_ALLOC_ZERO); 11756 if (m
== NULL) 11757 panic("%s: no memory to grow shadow map", __func__); 11758 return (m); 11759 } 11760 11761 static vm_page_t 11762 pmap_san_enter_alloc_2m(void) 11763 { 11764 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 11765 NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT)); 11766 } 11767 11768 /* 11769 * Grow a shadow map by at least one 4KB page at the specified address. Use 2MB 11770 * pages when possible. 11771 */ 11772 void __nosanitizeaddress __nosanitizememory 11773 pmap_san_enter(vm_offset_t va) 11774 { 11775 pdp_entry_t *pdpe; 11776 pd_entry_t *pde; 11777 pt_entry_t *pte; 11778 vm_page_t m; 11779 11780 if (kernphys == 0) { 11781 /* 11782 * We're creating a temporary shadow map for the boot stack. 11783 */ 11784 pmap_san_enter_early(va); 11785 return; 11786 } 11787 11788 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 11789 11790 pdpe = pmap_pdpe(kernel_pmap, va); 11791 if ((*pdpe & X86_PG_V) == 0) { 11792 m = pmap_san_enter_alloc_4k(); 11793 *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11794 X86_PG_V | pg_nx); 11795 } 11796 pde = pmap_pdpe_to_pde(pdpe, va); 11797 if ((*pde & X86_PG_V) == 0) { 11798 m = pmap_san_enter_alloc_2m(); 11799 if (m != NULL) { 11800 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11801 X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); 11802 } else { 11803 m = pmap_san_enter_alloc_4k(); 11804 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11805 X86_PG_V | pg_nx); 11806 } 11807 } 11808 if ((*pde & X86_PG_PS) != 0) 11809 return; 11810 pte = pmap_pde_to_pte(pde, va); 11811 if ((*pte & X86_PG_V) != 0) 11812 return; 11813 m = pmap_san_enter_alloc_4k(); 11814 *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | 11815 X86_PG_M | X86_PG_A | pg_nx); 11816 } 11817 #endif 11818 11819 /* 11820 * Track a range of the kernel's virtual address space that is contiguous 11821 * in various mapping attributes. 11822 */ 11823 struct pmap_kernel_map_range { 11824 vm_offset_t sva; 11825 pt_entry_t attrs; 11826 int ptes; 11827 int pdes; 11828 int pdpes; 11829 }; 11830 11831 static void 11832 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 11833 vm_offset_t eva) 11834 { 11835 const char *mode; 11836 int i, pat_idx; 11837 11838 if (eva <= range->sva) 11839 return; 11840 11841 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); 11842 for (i = 0; i < PAT_INDEX_SIZE; i++) 11843 if (pat_index[i] == pat_idx) 11844 break; 11845 11846 switch (i) { 11847 case PAT_WRITE_BACK: 11848 mode = "WB"; 11849 break; 11850 case PAT_WRITE_THROUGH: 11851 mode = "WT"; 11852 break; 11853 case PAT_UNCACHEABLE: 11854 mode = "UC"; 11855 break; 11856 case PAT_UNCACHED: 11857 mode = "U-"; 11858 break; 11859 case PAT_WRITE_PROTECTED: 11860 mode = "WP"; 11861 break; 11862 case PAT_WRITE_COMBINING: 11863 mode = "WC"; 11864 break; 11865 default: 11866 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", 11867 __func__, pat_idx, range->sva, eva); 11868 mode = "??"; 11869 break; 11870 } 11871 11872 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 11873 range->sva, eva, 11874 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', 11875 (range->attrs & pg_nx) != 0 ? '-' : 'x', 11876 (range->attrs & X86_PG_U) != 0 ? 'u' : 's', 11877 (range->attrs & X86_PG_G) != 0 ? 'g' : '-', 11878 mode, range->pdpes, range->pdes, range->ptes); 11879 11880 /* Reset to sentinel value. */ 11881 range->sva = la57 ? 
KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11882 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11883 NPDEPG - 1, NPTEPG - 1); 11884 } 11885 11886 /* 11887 * Determine whether the attributes specified by a page table entry match those 11888 * being tracked by the current range. This is not quite as simple as a direct 11889 * flag comparison since some PAT modes have multiple representations. 11890 */ 11891 static bool 11892 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 11893 { 11894 pt_entry_t diff, mask; 11895 11896 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; 11897 diff = (range->attrs ^ attrs) & mask; 11898 if (diff == 0) 11899 return (true); 11900 if ((diff & ~X86_PG_PDE_PAT) == 0 && 11901 pmap_pat_index(kernel_pmap, range->attrs, true) == 11902 pmap_pat_index(kernel_pmap, attrs, true)) 11903 return (true); 11904 return (false); 11905 } 11906 11907 static void 11908 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 11909 pt_entry_t attrs) 11910 { 11911 11912 memset(range, 0, sizeof(*range)); 11913 range->sva = va; 11914 range->attrs = attrs; 11915 } 11916 11917 /* 11918 * Given a leaf PTE, derive the mapping's attributes. If they do not match 11919 * those of the current run, dump the address range and its attributes, and 11920 * begin a new run. 11921 */ 11922 static void 11923 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 11924 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, 11925 pt_entry_t pte) 11926 { 11927 pt_entry_t attrs; 11928 11929 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); 11930 11931 attrs |= pdpe & pg_nx; 11932 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); 11933 if ((pdpe & PG_PS) != 0) { 11934 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); 11935 } else if (pde != 0) { 11936 attrs |= pde & pg_nx; 11937 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); 11938 } 11939 if ((pde & PG_PS) != 0) { 11940 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); 11941 } else if (pte != 0) { 11942 attrs |= pte & pg_nx; 11943 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); 11944 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); 11945 11946 /* Canonicalize by always using the PDE PAT bit. */ 11947 if ((attrs & X86_PG_PTE_PAT) != 0) 11948 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; 11949 } 11950 11951 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 11952 sysctl_kmaps_dump(sb, range, va); 11953 sysctl_kmaps_reinit(range, va, attrs); 11954 } 11955 } 11956 11957 static int 11958 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 11959 { 11960 struct pmap_kernel_map_range range; 11961 struct sbuf sbuf, *sb; 11962 pml4_entry_t pml4e; 11963 pdp_entry_t *pdp, pdpe; 11964 pd_entry_t *pd, pde; 11965 pt_entry_t *pt, pte; 11966 vm_offset_t sva; 11967 vm_paddr_t pa; 11968 int error, i, j, k, l; 11969 11970 error = sysctl_wire_old_buffer(req, 0); 11971 if (error != 0) 11972 return (error); 11973 sb = &sbuf; 11974 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 11975 11976 /* Sentinel value. */ 11977 range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11978 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11979 NPDEPG - 1, NPTEPG - 1); 11980 11981 /* 11982 * Iterate over the kernel page tables without holding the kernel pmap 11983 * lock. Outside of the large map, kernel page table pages are never 11984 * freed, so at worst we will observe inconsistencies in the output. 
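 * A mapping that is created or destroyed concurrently may therefore be reported with stale attributes or omitted entirely; the handler trades exactness for running lockless.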
11985 * Within the large map, ensure that PDP and PD page addresses are 11986 * valid before descending. 11987 */ 11988 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { 11989 switch (i) { 11990 case PML4PML4I: 11991 sbuf_printf(sb, "\nRecursive map:\n"); 11992 break; 11993 case DMPML4I: 11994 sbuf_printf(sb, "\nDirect map:\n"); 11995 break; 11996 #ifdef KASAN 11997 case KASANPML4I: 11998 sbuf_printf(sb, "\nKASAN shadow map:\n"); 11999 break; 12000 #endif 12001 #ifdef KMSAN 12002 case KMSANSHADPML4I: 12003 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 12004 break; 12005 case KMSANORIGPML4I: 12006 sbuf_printf(sb, "\nKMSAN origin map:\n"); 12007 break; 12008 #endif 12009 case KPML4BASE: 12010 sbuf_printf(sb, "\nKernel map:\n"); 12011 break; 12012 case LMSPML4I: 12013 sbuf_printf(sb, "\nLarge map:\n"); 12014 break; 12015 } 12016 12017 /* Convert to canonical form. */ 12018 if (sva == 1ul << 47) 12019 sva |= -1ul << 48; 12020 12021 restart: 12022 pml4e = kernel_pml4[i]; 12023 if ((pml4e & X86_PG_V) == 0) { 12024 sva = rounddown2(sva, NBPML4); 12025 sysctl_kmaps_dump(sb, &range, sva); 12026 sva += NBPML4; 12027 continue; 12028 } 12029 pa = pml4e & PG_FRAME; 12030 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); 12031 12032 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { 12033 pdpe = pdp[j]; 12034 if ((pdpe & X86_PG_V) == 0) { 12035 sva = rounddown2(sva, NBPDP); 12036 sysctl_kmaps_dump(sb, &range, sva); 12037 sva += NBPDP; 12038 continue; 12039 } 12040 pa = pdpe & PG_FRAME; 12041 if ((pdpe & PG_PS) != 0) { 12042 sva = rounddown2(sva, NBPDP); 12043 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 12044 0, 0); 12045 range.pdpes++; 12046 sva += NBPDP; 12047 continue; 12048 } 12049 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12050 vm_phys_paddr_to_vm_page(pa) == NULL) { 12051 /* 12052 * Page table pages for the large map may be 12053 * freed. Validate the next-level address 12054 * before descending. 12055 */ 12056 goto restart; 12057 } 12058 pd = (pd_entry_t *)PHYS_TO_DMAP(pa); 12059 12060 for (k = pmap_pde_index(sva); k < NPDEPG; k++) { 12061 pde = pd[k]; 12062 if ((pde & X86_PG_V) == 0) { 12063 sva = rounddown2(sva, NBPDR); 12064 sysctl_kmaps_dump(sb, &range, sva); 12065 sva += NBPDR; 12066 continue; 12067 } 12068 pa = pde & PG_FRAME; 12069 if ((pde & PG_PS) != 0) { 12070 sva = rounddown2(sva, NBPDR); 12071 sysctl_kmaps_check(sb, &range, sva, 12072 pml4e, pdpe, pde, 0); 12073 range.pdes++; 12074 sva += NBPDR; 12075 continue; 12076 } 12077 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12078 vm_phys_paddr_to_vm_page(pa) == NULL) { 12079 /* 12080 * Page table pages for the large map 12081 * may be freed. Validate the 12082 * next-level address before descending. 
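 * Restarting resumes the walk at the PML4 level for the current sva; nothing is dumped twice because the inner loop indices are recomputed from sva.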
12083 */ 12084 goto restart; 12085 } 12086 pt = (pt_entry_t *)PHYS_TO_DMAP(pa); 12087 12088 for (l = pmap_pte_index(sva); l < NPTEPG; l++, 12089 sva += PAGE_SIZE) { 12090 pte = pt[l]; 12091 if ((pte & X86_PG_V) == 0) { 12092 sysctl_kmaps_dump(sb, &range, 12093 sva); 12094 continue; 12095 } 12096 sysctl_kmaps_check(sb, &range, sva, 12097 pml4e, pdpe, pde, pte); 12098 range.ptes++; 12099 } 12100 } 12101 } 12102 } 12103 12104 error = sbuf_finish(sb); 12105 sbuf_delete(sb); 12106 return (error); 12107 } 12108 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 12109 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 12110 NULL, 0, sysctl_kmaps, "A", 12111 "Dump kernel address layout"); 12112 12113 #ifdef DDB 12114 DB_SHOW_COMMAND(pte, pmap_print_pte) 12115 { 12116 pmap_t pmap; 12117 pml5_entry_t *pml5; 12118 pml4_entry_t *pml4; 12119 pdp_entry_t *pdp; 12120 pd_entry_t *pde; 12121 pt_entry_t *pte, PG_V; 12122 vm_offset_t va; 12123 12124 if (!have_addr) { 12125 db_printf("show pte addr\n"); 12126 return; 12127 } 12128 va = (vm_offset_t)addr; 12129 12130 if (kdb_thread != NULL) 12131 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 12132 else 12133 pmap = PCPU_GET(curpmap); 12134 12135 PG_V = pmap_valid_bit(pmap); 12136 db_printf("VA 0x%016lx", va); 12137 12138 if (pmap_is_la57(pmap)) { 12139 pml5 = pmap_pml5e(pmap, va); 12140 db_printf(" pml5e 0x%016lx", *pml5); 12141 if ((*pml5 & PG_V) == 0) { 12142 db_printf("\n"); 12143 return; 12144 } 12145 pml4 = pmap_pml5e_to_pml4e(pml5, va); 12146 } else { 12147 pml4 = pmap_pml4e(pmap, va); 12148 } 12149 db_printf(" pml4e 0x%016lx", *pml4); 12150 if ((*pml4 & PG_V) == 0) { 12151 db_printf("\n"); 12152 return; 12153 } 12154 pdp = pmap_pml4e_to_pdpe(pml4, va); 12155 db_printf(" pdpe 0x%016lx", *pdp); 12156 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 12157 db_printf("\n"); 12158 return; 12159 } 12160 pde = pmap_pdpe_to_pde(pdp, va); 12161 db_printf(" pde 0x%016lx", *pde); 12162 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 12163 db_printf("\n"); 12164 return; 12165 } 12166 pte = pmap_pde_to_pte(pde, va); 12167 db_printf(" pte 0x%016lx\n", *pte); 12168 } 12169 12170 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 12171 { 12172 vm_paddr_t a; 12173 12174 if (have_addr) { 12175 a = (vm_paddr_t)addr; 12176 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 12177 } else { 12178 db_printf("show phys2dmap addr\n"); 12179 } 12180 } 12181 12182 static void 12183 ptpages_show_page(int level, int idx, vm_page_t pg) 12184 { 12185 db_printf("l %d i %d pg %p phys %#lx ref %x\n", 12186 level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); 12187 } 12188 12189 static void 12190 ptpages_show_complain(int level, int idx, uint64_t pte) 12191 { 12192 db_printf("l %d i %d pte %#lx\n", level, idx, pte); 12193 } 12194 12195 static void 12196 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) 12197 { 12198 vm_page_t pg3, pg2, pg1; 12199 pml4_entry_t *pml4; 12200 pdp_entry_t *pdp; 12201 pd_entry_t *pd; 12202 int i4, i3, i2; 12203 12204 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); 12205 for (i4 = 0; i4 < num_entries; i4++) { 12206 if ((pml4[i4] & PG_V) == 0) 12207 continue; 12208 pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); 12209 if (pg3 == NULL) { 12210 ptpages_show_complain(3, i4, pml4[i4]); 12211 continue; 12212 } 12213 ptpages_show_page(3, i4, pg3); 12214 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); 12215 for (i3 = 0; i3 < NPDPEPG; i3++) { 12216 if ((pdp[i3] & PG_V) == 0) 12217 continue; 12218 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); 
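/* PHYS_TO_VM_PAGE() yields NULL when the entry references a physical address with no backing vm_page, e.g. unmanaged device memory; report such entries instead of dereferencing them. */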
12219 if (pg2 == NULL) { 12220 ptpages_show_complain(2, i3, pdp[i3]); 12221 continue; 12222 } 12223 ptpages_show_page(2, i3, pg2); 12224 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); 12225 for (i2 = 0; i2 < NPDEPG; i2++) { 12226 if ((pd[i2] & PG_V) == 0) 12227 continue; 12228 pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); 12229 if (pg1 == NULL) { 12230 ptpages_show_complain(1, i2, pd[i2]); 12231 continue; 12232 } 12233 ptpages_show_page(1, i2, pg1); 12234 } 12235 } 12236 } 12237 } 12238 12239 DB_SHOW_COMMAND(ptpages, pmap_ptpages) 12240 { 12241 pmap_t pmap; 12242 vm_page_t pg; 12243 pml5_entry_t *pml5; 12244 uint64_t PG_V; 12245 int i5; 12246 12247 if (have_addr) 12248 pmap = (pmap_t)addr; 12249 else 12250 pmap = PCPU_GET(curpmap); 12251 12252 PG_V = pmap_valid_bit(pmap); 12253 12254 if (pmap_is_la57(pmap)) { 12255 pml5 = pmap->pm_pmltop; 12256 for (i5 = 0; i5 < NUPML5E; i5++) { 12257 if ((pml5[i5] & PG_V) == 0) 12258 continue; 12259 pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); 12260 if (pg == NULL) { 12261 ptpages_show_complain(4, i5, pml5[i5]); 12262 continue; 12263 } 12264 ptpages_show_page(4, i5, pg); 12265 ptpages_show_pml4(pg, NPML4EPG, PG_V); 12266 } 12267 } else { 12268 ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( 12269 (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); 12270 } 12271 } 12272 #endif 12273
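/* Illustrative DDB session for the commands above; the addresses and printed values below are arbitrary examples, not real output: db> show pte 0xffffffff81234000 VA 0xffffffff81234000 pml4e 0x... pdpe 0x... pde 0x... pte 0x... db> show phys2dmap 0x1000 0x... db> show ptpages l 3 i 0 pg 0x... phys 0x... ref ... */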