1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * the Systems Programming Group of the University of Utah Computer 17 * Science Department and William Jolitz of UUNET Technologies Inc. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 3. All advertising materials mentioning features or use of this software 28 * must display the following acknowledgement: 29 * This product includes software developed by the University of 30 * California, Berkeley and its contributors. 31 * 4. Neither the name of the University nor the names of its contributors 32 * may be used to endorse or promote products derived from this software 33 * without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 45 * SUCH DAMAGE. 46 */ 47 /*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * Copyright (c) 2014-2020 The FreeBSD Foundation 50 * All rights reserved. 51 * 52 * This software was developed for the FreeBSD Project by Jake Burkholder, 53 * Safeport Network Services, and Network Associates Laboratories, the 54 * Security Research Division of Network Associates, Inc. under 55 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 56 * CHATS research program. 57 * 58 * Portions of this software were developed by 59 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from 60 * the FreeBSD Foundation. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 
70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84 #define AMD64_NPT_AWARE 85 86 #include <sys/cdefs.h> 87 /* 88 * Manages physical address maps. 89 * 90 * Since the information managed by this module is 91 * also stored by the logical address mapping module, 92 * this module may throw away valid virtual-to-physical 93 * mappings at almost any time. However, invalidations 94 * of virtual-to-physical mappings must be done as 95 * requested. 96 * 97 * In order to cope with hardware architectures which 98 * make virtual-to-physical map invalidates expensive, 99 * this module may delay invalidate or reduced protection 100 * operations until such time as they are actually 101 * necessary. This module is given full information as 102 * to which processors are currently using which maps, 103 * and to when physical maps must be made correct. 104 */ 105 106 #include "opt_ddb.h" 107 #include "opt_pmap.h" 108 #include "opt_vm.h" 109 110 #include <sys/param.h> 111 #include <sys/asan.h> 112 #include <sys/bitstring.h> 113 #include <sys/bus.h> 114 #include <sys/systm.h> 115 #include <sys/counter.h> 116 #include <sys/kernel.h> 117 #include <sys/ktr.h> 118 #include <sys/lock.h> 119 #include <sys/malloc.h> 120 #include <sys/mman.h> 121 #include <sys/msan.h> 122 #include <sys/mutex.h> 123 #include <sys/proc.h> 124 #include <sys/rangeset.h> 125 #include <sys/rwlock.h> 126 #include <sys/sbuf.h> 127 #include <sys/smr.h> 128 #include <sys/sx.h> 129 #include <sys/turnstile.h> 130 #include <sys/vmem.h> 131 #include <sys/vmmeter.h> 132 #include <sys/sched.h> 133 #include <sys/sysctl.h> 134 #include <sys/smp.h> 135 #ifdef DDB 136 #include <sys/kdb.h> 137 #include <ddb/ddb.h> 138 #endif 139 140 #include <vm/vm.h> 141 #include <vm/vm_param.h> 142 #include <vm/vm_kern.h> 143 #include <vm/vm_page.h> 144 #include <vm/vm_map.h> 145 #include <vm/vm_object.h> 146 #include <vm/vm_extern.h> 147 #include <vm/vm_pageout.h> 148 #include <vm/vm_pager.h> 149 #include <vm/vm_phys.h> 150 #include <vm/vm_radix.h> 151 #include <vm/vm_reserv.h> 152 #include <vm/vm_dumpset.h> 153 #include <vm/uma.h> 154 155 #include <machine/asan.h> 156 #include <machine/intr_machdep.h> 157 #include <x86/apicvar.h> 158 #include <x86/ifunc.h> 159 #include <machine/cpu.h> 160 #include <machine/cputypes.h> 161 #include <machine/md_var.h> 162 #include <machine/msan.h> 163 #include <machine/pcb.h> 164 #include <machine/specialreg.h> 165 #ifdef SMP 166 #include <machine/smp.h> 167 #endif 168 #include <machine/sysarch.h> 169 #include <machine/tss.h> 170 171 #ifdef NUMA 172 #define PMAP_MEMDOM MAXMEMDOM 173 #else 174 #define PMAP_MEMDOM 1 175 #endif 176 177 static __inline bool 178 pmap_type_guest(pmap_t pmap) 179 { 180 181 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 182 } 183 184 static __inline 
bool 185 pmap_emulate_ad_bits(pmap_t pmap) 186 { 187 188 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 189 } 190 191 static __inline pt_entry_t 192 pmap_valid_bit(pmap_t pmap) 193 { 194 pt_entry_t mask; 195 196 switch (pmap->pm_type) { 197 case PT_X86: 198 case PT_RVI: 199 mask = X86_PG_V; 200 break; 201 case PT_EPT: 202 if (pmap_emulate_ad_bits(pmap)) 203 mask = EPT_PG_EMUL_V; 204 else 205 mask = EPT_PG_READ; 206 break; 207 default: 208 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 209 } 210 211 return (mask); 212 } 213 214 static __inline pt_entry_t 215 pmap_rw_bit(pmap_t pmap) 216 { 217 pt_entry_t mask; 218 219 switch (pmap->pm_type) { 220 case PT_X86: 221 case PT_RVI: 222 mask = X86_PG_RW; 223 break; 224 case PT_EPT: 225 if (pmap_emulate_ad_bits(pmap)) 226 mask = EPT_PG_EMUL_RW; 227 else 228 mask = EPT_PG_WRITE; 229 break; 230 default: 231 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 232 } 233 234 return (mask); 235 } 236 237 static pt_entry_t pg_g; 238 239 static __inline pt_entry_t 240 pmap_global_bit(pmap_t pmap) 241 { 242 pt_entry_t mask; 243 244 switch (pmap->pm_type) { 245 case PT_X86: 246 mask = pg_g; 247 break; 248 case PT_RVI: 249 case PT_EPT: 250 mask = 0; 251 break; 252 default: 253 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 254 } 255 256 return (mask); 257 } 258 259 static __inline pt_entry_t 260 pmap_accessed_bit(pmap_t pmap) 261 { 262 pt_entry_t mask; 263 264 switch (pmap->pm_type) { 265 case PT_X86: 266 case PT_RVI: 267 mask = X86_PG_A; 268 break; 269 case PT_EPT: 270 if (pmap_emulate_ad_bits(pmap)) 271 mask = EPT_PG_READ; 272 else 273 mask = EPT_PG_A; 274 break; 275 default: 276 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 277 } 278 279 return (mask); 280 } 281 282 static __inline pt_entry_t 283 pmap_modified_bit(pmap_t pmap) 284 { 285 pt_entry_t mask; 286 287 switch (pmap->pm_type) { 288 case PT_X86: 289 case PT_RVI: 290 mask = X86_PG_M; 291 break; 292 case PT_EPT: 293 if (pmap_emulate_ad_bits(pmap)) 294 mask = EPT_PG_WRITE; 295 else 296 mask = EPT_PG_M; 297 break; 298 default: 299 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 300 } 301 302 return (mask); 303 } 304 305 static __inline pt_entry_t 306 pmap_pku_mask_bit(pmap_t pmap) 307 { 308 309 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); 310 } 311 312 static __inline bool 313 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 314 { 315 316 if (!pmap_emulate_ad_bits(pmap)) 317 return (true); 318 319 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 320 321 /* 322 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 323 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 324 * if the EPT_PG_WRITE bit is set. 325 */ 326 if ((pte & EPT_PG_WRITE) != 0) 327 return (false); 328 329 /* 330 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 
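 * Conversely, clearing EPT_PG_READ on a PTE that is not executable (and,
 * as checked above, not writable) leaves XWR = 000, i.e. a not-present
 * translation; that produces an EPT violation rather than a
 * misconfiguration and is therefore safe.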
331 */ 332 if ((pte & EPT_PG_EXECUTE) == 0 || 333 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 334 return (true); 335 else 336 return (false); 337 } 338 339 #ifdef PV_STATS 340 #define PV_STAT(x) do { x ; } while (0) 341 #else 342 #define PV_STAT(x) do { } while (0) 343 #endif 344 345 #undef pa_index 346 #ifdef NUMA 347 #define pa_index(pa) ({ \ 348 KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ 349 ("address %lx beyond the last segment", (pa))); \ 350 (pa) >> PDRSHIFT; \ 351 }) 352 #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) 353 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 354 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 355 struct rwlock *_lock; \ 356 if (__predict_false((pa) > pmap_last_pa)) \ 357 _lock = &pv_dummy_large.pv_lock; \ 358 else \ 359 _lock = &(pa_to_pmdp(pa)->pv_lock); \ 360 _lock; \ 361 }) 362 #else 363 #define pa_index(pa) ((pa) >> PDRSHIFT) 364 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 365 366 #define NPV_LIST_LOCKS MAXCPU 367 368 #define PHYS_TO_PV_LIST_LOCK(pa) \ 369 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 370 #endif 371 372 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 373 struct rwlock **_lockp = (lockp); \ 374 struct rwlock *_new_lock; \ 375 \ 376 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 377 if (_new_lock != *_lockp) { \ 378 if (*_lockp != NULL) \ 379 rw_wunlock(*_lockp); \ 380 *_lockp = _new_lock; \ 381 rw_wlock(*_lockp); \ 382 } \ 383 } while (0) 384 385 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 386 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 387 388 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 389 struct rwlock **_lockp = (lockp); \ 390 \ 391 if (*_lockp != NULL) { \ 392 rw_wunlock(*_lockp); \ 393 *_lockp = NULL; \ 394 } \ 395 } while (0) 396 397 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 398 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 399 400 /* 401 * Statically allocate kernel pmap memory. However, memory for 402 * pm_pcids is obtained after the dynamic allocator is operational. 403 * Initialize it with a non-canonical pointer to catch early accesses 404 * regardless of the active mapping. 
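 * Any load or store through 0xdeadbeefdeadbeef faults because the
 * address is non-canonical on amd64, independent of which page tables
 * are active.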
405 */ 406 struct pmap kernel_pmap_store = { 407 .pm_pcidp = (void *)0xdeadbeefdeadbeef, 408 }; 409 410 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 411 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 412 413 int nkpt; 414 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 415 "Number of kernel page table pages allocated on bootup"); 416 417 static int ndmpdp; 418 vm_paddr_t dmaplimit; 419 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 420 pt_entry_t pg_nx; 421 422 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 423 "VM/pmap parameters"); 424 425 static int __read_frequently pg_ps_enabled = 1; 426 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 427 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 428 429 int __read_frequently la57 = 0; 430 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 431 &la57, 0, 432 "5-level paging for host is enabled"); 433 434 static bool 435 pmap_is_la57(pmap_t pmap) 436 { 437 if (pmap->pm_type == PT_X86) 438 return (la57); 439 return (false); /* XXXKIB handle EPT */ 440 } 441 442 #define PAT_INDEX_SIZE 8 443 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 444 445 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 446 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 447 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 448 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 449 u_int64_t KPML5phys; /* phys addr of kernel level 5, 450 if supported */ 451 452 #ifdef KASAN 453 static uint64_t KASANPDPphys; 454 #endif 455 #ifdef KMSAN 456 static uint64_t KMSANSHADPDPphys; 457 static uint64_t KMSANORIGPDPphys; 458 459 /* 460 * To support systems with large amounts of memory, it is necessary to extend 461 * the maximum size of the direct map. This could eat into the space reserved 462 * for the shadow map. 463 */ 464 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); 465 #endif 466 467 static pml4_entry_t *kernel_pml4; 468 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 469 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 470 static int ndmpdpphys; /* number of DMPDPphys pages */ 471 472 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ 473 vm_paddr_t KERNend; /* and the end */ 474 475 /* 476 * pmap_mapdev support pre initialization (i.e. console) 477 */ 478 #define PMAP_PREINIT_MAPPING_COUNT 8 479 static struct pmap_preinit_mapping { 480 vm_paddr_t pa; 481 vm_offset_t va; 482 vm_size_t sz; 483 int mode; 484 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 485 static int pmap_initialized; 486 487 /* 488 * Data for the pv entry allocation mechanism. 489 * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
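 * The chunk lists below are kept per memory domain; pc_to_domain()
 * places a chunk on the list of the domain backing its physical page,
 * which collapses to a single list when NUMA is not configured.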
490 */ 491 #ifdef NUMA 492 static __inline int 493 pc_to_domain(struct pv_chunk *pc) 494 { 495 496 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 497 } 498 #else 499 static __inline int 500 pc_to_domain(struct pv_chunk *pc __unused) 501 { 502 503 return (0); 504 } 505 #endif 506 507 struct pv_chunks_list { 508 struct mtx pvc_lock; 509 TAILQ_HEAD(pch, pv_chunk) pvc_list; 510 int active_reclaims; 511 } __aligned(CACHE_LINE_SIZE); 512 513 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 514 515 #ifdef NUMA 516 struct pmap_large_md_page { 517 struct rwlock pv_lock; 518 struct md_page pv_page; 519 u_long pv_invl_gen; 520 }; 521 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 522 #define pv_dummy pv_dummy_large.pv_page 523 __read_mostly static struct pmap_large_md_page *pv_table; 524 __read_mostly vm_paddr_t pmap_last_pa; 525 #else 526 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 527 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 528 static struct md_page *pv_table; 529 static struct md_page pv_dummy; 530 #endif 531 532 /* 533 * All those kernel PT submaps that BSD is so fond of 534 */ 535 pt_entry_t *CMAP1 = NULL; 536 caddr_t CADDR1 = 0; 537 static vm_offset_t qframe = 0; 538 static struct mtx qframe_mtx; 539 540 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 541 542 static vmem_t *large_vmem; 543 static u_int lm_ents; 544 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 545 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 546 547 int pmap_pcid_enabled = 1; 548 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 549 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 550 int invpcid_works = 0; 551 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 552 "Is the invpcid instruction available ?"); 553 int pmap_pcid_invlpg_workaround = 0; 554 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, 555 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 556 &pmap_pcid_invlpg_workaround, 0, 557 "Enable small core PCID/INVLPG workaround"); 558 int pmap_pcid_invlpg_workaround_uena = 1; 559 560 int __read_frequently pti = 0; 561 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 562 &pti, 0, 563 "Page Table Isolation enabled"); 564 static vm_object_t pti_obj; 565 static pml4_entry_t *pti_pml4; 566 static vm_pindex_t pti_pg_idx; 567 static bool pti_finalized; 568 569 struct pmap_pkru_range { 570 struct rs_el pkru_rs_el; 571 u_int pkru_keyidx; 572 int pkru_flags; 573 }; 574 575 static uma_zone_t pmap_pkru_ranges_zone; 576 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 577 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 578 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 579 static void *pkru_dup_range(void *ctx, void *data); 580 static void pkru_free_range(void *ctx, void *node); 581 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 582 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 583 static void pmap_pkru_deassign_all(pmap_t pmap); 584 585 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); 586 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, 587 &pcid_save_cnt, "Count of saved TLB context on switch"); 588 589 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 590 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 591 static struct mtx invl_gen_mtx; 592 /* Fake lock object to satisfy 
turnstiles interface. */ 593 static struct lock_object invl_gen_ts = { 594 .lo_name = "invlts", 595 }; 596 static struct pmap_invl_gen pmap_invl_gen_head = { 597 .gen = 1, 598 .next = NULL, 599 }; 600 static u_long pmap_invl_gen = 1; 601 static int pmap_invl_waiters; 602 static struct callout pmap_invl_callout; 603 static bool pmap_invl_callout_inited; 604 605 #define PMAP_ASSERT_NOT_IN_DI() \ 606 KASSERT(pmap_not_in_di(), ("DI already started")) 607 608 static bool 609 pmap_di_locked(void) 610 { 611 int tun; 612 613 if ((cpu_feature2 & CPUID2_CX16) == 0) 614 return (true); 615 tun = 0; 616 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 617 return (tun != 0); 618 } 619 620 static int 621 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 622 { 623 int locked; 624 625 locked = pmap_di_locked(); 626 return (sysctl_handle_int(oidp, &locked, 0, req)); 627 } 628 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 629 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 630 "Locked delayed invalidation"); 631 632 static bool pmap_not_in_di_l(void); 633 static bool pmap_not_in_di_u(void); 634 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 635 { 636 637 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 638 } 639 640 static bool 641 pmap_not_in_di_l(void) 642 { 643 struct pmap_invl_gen *invl_gen; 644 645 invl_gen = &curthread->td_md.md_invl_gen; 646 return (invl_gen->gen == 0); 647 } 648 649 static void 650 pmap_thread_init_invl_gen_l(struct thread *td) 651 { 652 struct pmap_invl_gen *invl_gen; 653 654 invl_gen = &td->td_md.md_invl_gen; 655 invl_gen->gen = 0; 656 } 657 658 static void 659 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 660 { 661 struct turnstile *ts; 662 663 ts = turnstile_trywait(&invl_gen_ts); 664 if (*m_gen > atomic_load_long(invl_gen)) 665 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 666 else 667 turnstile_cancel(ts); 668 } 669 670 static void 671 pmap_delayed_invl_finish_unblock(u_long new_gen) 672 { 673 struct turnstile *ts; 674 675 turnstile_chain_lock(&invl_gen_ts); 676 ts = turnstile_lookup(&invl_gen_ts); 677 if (new_gen != 0) 678 pmap_invl_gen = new_gen; 679 if (ts != NULL) { 680 turnstile_broadcast(ts, TS_SHARED_QUEUE); 681 turnstile_unpend(ts); 682 } 683 turnstile_chain_unlock(&invl_gen_ts); 684 } 685 686 /* 687 * Start a new Delayed Invalidation (DI) block of code, executed by 688 * the current thread. Within a DI block, the current thread may 689 * destroy both the page table and PV list entries for a mapping and 690 * then release the corresponding PV list lock before ensuring that 691 * the mapping is flushed from the TLBs of any processors with the 692 * pmap active. 693 */ 694 static void 695 pmap_delayed_invl_start_l(void) 696 { 697 struct pmap_invl_gen *invl_gen; 698 u_long currgen; 699 700 invl_gen = &curthread->td_md.md_invl_gen; 701 PMAP_ASSERT_NOT_IN_DI(); 702 mtx_lock(&invl_gen_mtx); 703 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 704 currgen = pmap_invl_gen; 705 else 706 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 707 invl_gen->gen = currgen + 1; 708 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 709 mtx_unlock(&invl_gen_mtx); 710 } 711 712 /* 713 * Finish the DI block, previously started by the current thread. All 714 * required TLB flushes for the pages marked by 715 * pmap_delayed_invl_page() must be finished before this function is 716 * called. 
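 * Otherwise a thread blocked in pmap_delayed_invl_wait() could be
 * released while stale TLB entries for the page still exist.  A caller
 * typically brackets its work roughly as follows (sketch, not an exact
 * call sequence):
 *
 *	pmap_delayed_invl_start();
 *	... destroy PTEs and PV entries, drop the PV list lock ...
 *	pmap_invalidate_page(pmap, va);
 *	pmap_delayed_invl_finish();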
717 * 718 * This function works by bumping the global DI generation number to 719 * the generation number of the current thread's DI, unless there is a 720 * pending DI that started earlier. In the latter case, bumping the 721 * global DI generation number would incorrectly signal that the 722 * earlier DI had finished. Instead, this function bumps the earlier 723 * DI's generation number to match the generation number of the 724 * current thread's DI. 725 */ 726 static void 727 pmap_delayed_invl_finish_l(void) 728 { 729 struct pmap_invl_gen *invl_gen, *next; 730 731 invl_gen = &curthread->td_md.md_invl_gen; 732 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 733 mtx_lock(&invl_gen_mtx); 734 next = LIST_NEXT(invl_gen, link); 735 if (next == NULL) 736 pmap_delayed_invl_finish_unblock(invl_gen->gen); 737 else 738 next->gen = invl_gen->gen; 739 LIST_REMOVE(invl_gen, link); 740 mtx_unlock(&invl_gen_mtx); 741 invl_gen->gen = 0; 742 } 743 744 static bool 745 pmap_not_in_di_u(void) 746 { 747 struct pmap_invl_gen *invl_gen; 748 749 invl_gen = &curthread->td_md.md_invl_gen; 750 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 751 } 752 753 static void 754 pmap_thread_init_invl_gen_u(struct thread *td) 755 { 756 struct pmap_invl_gen *invl_gen; 757 758 invl_gen = &td->td_md.md_invl_gen; 759 invl_gen->gen = 0; 760 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 761 } 762 763 static bool 764 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 765 { 766 uint64_t new_high, new_low, old_high, old_low; 767 char res; 768 769 old_low = new_low = 0; 770 old_high = new_high = (uintptr_t)0; 771 772 __asm volatile("lock;cmpxchg16b\t%1" 773 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 774 : "b"(new_low), "c" (new_high) 775 : "memory", "cc"); 776 if (res == 0) { 777 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 778 return (false); 779 out->gen = old_low; 780 out->next = (void *)old_high; 781 } else { 782 out->gen = new_low; 783 out->next = (void *)new_high; 784 } 785 return (true); 786 } 787 788 static bool 789 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 790 struct pmap_invl_gen *new_val) 791 { 792 uint64_t new_high, new_low, old_high, old_low; 793 char res; 794 795 new_low = new_val->gen; 796 new_high = (uintptr_t)new_val->next; 797 old_low = old_val->gen; 798 old_high = (uintptr_t)old_val->next; 799 800 __asm volatile("lock;cmpxchg16b\t%1" 801 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 802 : "b"(new_low), "c" (new_high) 803 : "memory", "cc"); 804 return (res); 805 } 806 807 static COUNTER_U64_DEFINE_EARLY(pv_page_count); 808 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, 809 &pv_page_count, "Current number of allocated pv pages"); 810 811 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); 812 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, 813 &user_pt_page_count, 814 "Current number of allocated page table pages for userspace"); 815 816 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); 817 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, 818 &kernel_pt_page_count, 819 "Current number of allocated page table pages for the kernel"); 820 821 #ifdef PV_STATS 822 823 static COUNTER_U64_DEFINE_EARLY(invl_start_restart); 824 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, 825 CTLFLAG_RD, &invl_start_restart, 826 "Number of delayed TLB invalidation request restarts"); 827 828 static 
COUNTER_U64_DEFINE_EARLY(invl_finish_restart); 829 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 830 &invl_finish_restart, 831 "Number of delayed TLB invalidation completion restarts"); 832 833 static int invl_max_qlen; 834 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 835 &invl_max_qlen, 0, 836 "Maximum delayed TLB invalidation request queue length"); 837 #endif 838 839 #define di_delay locks_delay 840 841 static void 842 pmap_delayed_invl_start_u(void) 843 { 844 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 845 struct thread *td; 846 struct lock_delay_arg lda; 847 uintptr_t prevl; 848 u_char pri; 849 #ifdef PV_STATS 850 int i, ii; 851 #endif 852 853 td = curthread; 854 invl_gen = &td->td_md.md_invl_gen; 855 PMAP_ASSERT_NOT_IN_DI(); 856 lock_delay_arg_init(&lda, &di_delay); 857 invl_gen->saved_pri = 0; 858 pri = td->td_base_pri; 859 if (pri > PVM) { 860 thread_lock(td); 861 pri = td->td_base_pri; 862 if (pri > PVM) { 863 invl_gen->saved_pri = pri; 864 sched_prio(td, PVM); 865 } 866 thread_unlock(td); 867 } 868 again: 869 PV_STAT(i = 0); 870 for (p = &pmap_invl_gen_head;; p = prev.next) { 871 PV_STAT(i++); 872 prevl = (uintptr_t)atomic_load_ptr(&p->next); 873 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 874 PV_STAT(counter_u64_add(invl_start_restart, 1)); 875 lock_delay(&lda); 876 goto again; 877 } 878 if (prevl == 0) 879 break; 880 prev.next = (void *)prevl; 881 } 882 #ifdef PV_STATS 883 if ((ii = invl_max_qlen) < i) 884 atomic_cmpset_int(&invl_max_qlen, ii, i); 885 #endif 886 887 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 888 PV_STAT(counter_u64_add(invl_start_restart, 1)); 889 lock_delay(&lda); 890 goto again; 891 } 892 893 new_prev.gen = prev.gen; 894 new_prev.next = invl_gen; 895 invl_gen->gen = prev.gen + 1; 896 897 /* Formal fence between store to invl->gen and updating *p. */ 898 atomic_thread_fence_rel(); 899 900 /* 901 * After inserting an invl_gen element with invalid bit set, 902 * this thread blocks any other thread trying to enter the 903 * delayed invalidation block. Do not allow to remove us from 904 * the CPU, because it causes starvation for other threads. 905 */ 906 critical_enter(); 907 908 /* 909 * ABA for *p is not possible there, since p->gen can only 910 * increase. So if the *p thread finished its di, then 911 * started a new one and got inserted into the list at the 912 * same place, its gen will appear greater than the previously 913 * read gen. 914 */ 915 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 916 critical_exit(); 917 PV_STAT(counter_u64_add(invl_start_restart, 1)); 918 lock_delay(&lda); 919 goto again; 920 } 921 922 /* 923 * There we clear PMAP_INVL_GEN_NEXT_INVALID in 924 * invl_gen->next, allowing other threads to iterate past us. 925 * pmap_di_store_invl() provides fence between the generation 926 * write and the update of next. 927 */ 928 invl_gen->next = NULL; 929 critical_exit(); 930 } 931 932 static bool 933 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 934 struct pmap_invl_gen *p) 935 { 936 struct pmap_invl_gen prev, new_prev; 937 u_long mygen; 938 939 /* 940 * Load invl_gen->gen after setting invl_gen->next 941 * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger 942 * generations to propagate to our invl_gen->gen. Lock prefix 943 * in atomic_set_ptr() worked as seq_cst fence. 
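 * (The caller, pmap_delayed_invl_finish_u(), sets that bit in
 * invl_gen->next immediately before invoking this helper.)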
944 */ 945 mygen = atomic_load_long(&invl_gen->gen); 946 947 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 948 return (false); 949 950 KASSERT(prev.gen < mygen, 951 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 952 new_prev.gen = mygen; 953 new_prev.next = (void *)((uintptr_t)invl_gen->next & 954 ~PMAP_INVL_GEN_NEXT_INVALID); 955 956 /* Formal fence between load of prev and storing update to it. */ 957 atomic_thread_fence_rel(); 958 959 return (pmap_di_store_invl(p, &prev, &new_prev)); 960 } 961 962 static void 963 pmap_delayed_invl_finish_u(void) 964 { 965 struct pmap_invl_gen *invl_gen, *p; 966 struct thread *td; 967 struct lock_delay_arg lda; 968 uintptr_t prevl; 969 970 td = curthread; 971 invl_gen = &td->td_md.md_invl_gen; 972 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 973 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 974 ("missed invl_start: INVALID")); 975 lock_delay_arg_init(&lda, &di_delay); 976 977 again: 978 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 979 prevl = (uintptr_t)atomic_load_ptr(&p->next); 980 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 981 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 982 lock_delay(&lda); 983 goto again; 984 } 985 if ((void *)prevl == invl_gen) 986 break; 987 } 988 989 /* 990 * It is legitimate to not find ourself on the list if a 991 * thread before us finished its DI and started it again. 992 */ 993 if (__predict_false(p == NULL)) { 994 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 995 lock_delay(&lda); 996 goto again; 997 } 998 999 critical_enter(); 1000 atomic_set_ptr((uintptr_t *)&invl_gen->next, 1001 PMAP_INVL_GEN_NEXT_INVALID); 1002 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 1003 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 1004 PMAP_INVL_GEN_NEXT_INVALID); 1005 critical_exit(); 1006 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 1007 lock_delay(&lda); 1008 goto again; 1009 } 1010 critical_exit(); 1011 if (atomic_load_int(&pmap_invl_waiters) > 0) 1012 pmap_delayed_invl_finish_unblock(0); 1013 if (invl_gen->saved_pri != 0) { 1014 thread_lock(td); 1015 sched_prio(td, invl_gen->saved_pri); 1016 thread_unlock(td); 1017 } 1018 } 1019 1020 #ifdef DDB 1021 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 1022 { 1023 struct pmap_invl_gen *p, *pn; 1024 struct thread *td; 1025 uintptr_t nextl; 1026 bool first; 1027 1028 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 1029 first = false) { 1030 nextl = (uintptr_t)atomic_load_ptr(&p->next); 1031 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 1032 td = first ? NULL : __containerof(p, struct thread, 1033 td_md.md_invl_gen); 1034 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 1035 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 1036 td != NULL ? 
td->td_tid : -1); 1037 } 1038 } 1039 #endif 1040 1041 #ifdef PV_STATS 1042 static COUNTER_U64_DEFINE_EARLY(invl_wait); 1043 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, 1044 CTLFLAG_RD, &invl_wait, 1045 "Number of times DI invalidation blocked pmap_remove_all/write"); 1046 1047 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); 1048 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, 1049 &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); 1050 1051 #endif 1052 1053 #ifdef NUMA 1054 static u_long * 1055 pmap_delayed_invl_genp(vm_page_t m) 1056 { 1057 vm_paddr_t pa; 1058 u_long *gen; 1059 1060 pa = VM_PAGE_TO_PHYS(m); 1061 if (__predict_false((pa) > pmap_last_pa)) 1062 gen = &pv_dummy_large.pv_invl_gen; 1063 else 1064 gen = &(pa_to_pmdp(pa)->pv_invl_gen); 1065 1066 return (gen); 1067 } 1068 #else 1069 static u_long * 1070 pmap_delayed_invl_genp(vm_page_t m) 1071 { 1072 1073 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 1074 } 1075 #endif 1076 1077 static void 1078 pmap_delayed_invl_callout_func(void *arg __unused) 1079 { 1080 1081 if (atomic_load_int(&pmap_invl_waiters) == 0) 1082 return; 1083 pmap_delayed_invl_finish_unblock(0); 1084 } 1085 1086 static void 1087 pmap_delayed_invl_callout_init(void *arg __unused) 1088 { 1089 1090 if (pmap_di_locked()) 1091 return; 1092 callout_init(&pmap_invl_callout, 1); 1093 pmap_invl_callout_inited = true; 1094 } 1095 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 1096 pmap_delayed_invl_callout_init, NULL); 1097 1098 /* 1099 * Ensure that all currently executing DI blocks, that need to flush 1100 * TLB for the given page m, actually flushed the TLB at the time the 1101 * function returned. If the page m has an empty PV list and we call 1102 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 1103 * valid mapping for the page m in either its page table or TLB. 1104 * 1105 * This function works by blocking until the global DI generation 1106 * number catches up with the generation number associated with the 1107 * given page m and its PV list. Since this function's callers 1108 * typically own an object lock and sometimes own a page lock, it 1109 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 1110 * processor. 1111 */ 1112 static void 1113 pmap_delayed_invl_wait_l(vm_page_t m) 1114 { 1115 u_long *m_gen; 1116 #ifdef PV_STATS 1117 bool accounted = false; 1118 #endif 1119 1120 m_gen = pmap_delayed_invl_genp(m); 1121 while (*m_gen > pmap_invl_gen) { 1122 #ifdef PV_STATS 1123 if (!accounted) { 1124 counter_u64_add(invl_wait, 1); 1125 accounted = true; 1126 } 1127 #endif 1128 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 1129 } 1130 } 1131 1132 static void 1133 pmap_delayed_invl_wait_u(vm_page_t m) 1134 { 1135 u_long *m_gen; 1136 struct lock_delay_arg lda; 1137 bool fast; 1138 1139 fast = true; 1140 m_gen = pmap_delayed_invl_genp(m); 1141 lock_delay_arg_init(&lda, &di_delay); 1142 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 1143 if (fast || !pmap_invl_callout_inited) { 1144 PV_STAT(counter_u64_add(invl_wait, 1)); 1145 lock_delay(&lda); 1146 fast = false; 1147 } else { 1148 /* 1149 * The page's invalidation generation number 1150 * is still below the current thread's number. 1151 * Prepare to block so that we do not waste 1152 * CPU cycles or worse, suffer livelock. 
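 * Livelock is possible if this thread keeps spinning at a
 * priority that prevents the DI owner, whose completion we
 * are waiting for, from running.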
1153 * 1154 * Since it is impossible to block without 1155 * racing with pmap_delayed_invl_finish_u(), 1156 * prepare for the race by incrementing 1157 * pmap_invl_waiters and arming a 1-tick 1158 * callout which will unblock us if we lose 1159 * the race. 1160 */ 1161 atomic_add_int(&pmap_invl_waiters, 1); 1162 1163 /* 1164 * Re-check the current thread's invalidation 1165 * generation after incrementing 1166 * pmap_invl_waiters, so that there is no race 1167 * with pmap_delayed_invl_finish_u() setting 1168 * the page generation and checking 1169 * pmap_invl_waiters. The only race allowed 1170 * is for a missed unblock, which is handled 1171 * by the callout. 1172 */ 1173 if (*m_gen > 1174 atomic_load_long(&pmap_invl_gen_head.gen)) { 1175 callout_reset(&pmap_invl_callout, 1, 1176 pmap_delayed_invl_callout_func, NULL); 1177 PV_STAT(counter_u64_add(invl_wait_slow, 1)); 1178 pmap_delayed_invl_wait_block(m_gen, 1179 &pmap_invl_gen_head.gen); 1180 } 1181 atomic_add_int(&pmap_invl_waiters, -1); 1182 } 1183 } 1184 } 1185 1186 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1187 { 1188 1189 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1190 pmap_thread_init_invl_gen_u); 1191 } 1192 1193 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1194 { 1195 1196 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1197 pmap_delayed_invl_start_u); 1198 } 1199 1200 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1201 { 1202 1203 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1204 pmap_delayed_invl_finish_u); 1205 } 1206 1207 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1208 { 1209 1210 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1211 pmap_delayed_invl_wait_u); 1212 } 1213 1214 /* 1215 * Mark the page m's PV list as participating in the current thread's 1216 * DI block. Any threads concurrently using m's PV list to remove or 1217 * restrict all mappings to m will wait for the current thread's DI 1218 * block to complete before proceeding. 1219 * 1220 * The function works by setting the DI generation number for m's PV 1221 * list to at least the DI generation number of the current thread. 1222 * This forces a caller of pmap_delayed_invl_wait() to block until 1223 * current thread calls pmap_delayed_invl_finish(). 1224 */ 1225 static void 1226 pmap_delayed_invl_page(vm_page_t m) 1227 { 1228 u_long gen, *m_gen; 1229 1230 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1231 gen = curthread->td_md.md_invl_gen.gen; 1232 if (gen == 0) 1233 return; 1234 m_gen = pmap_delayed_invl_genp(m); 1235 if (*m_gen < gen) 1236 *m_gen = gen; 1237 } 1238 1239 /* 1240 * Crashdump maps. 1241 */ 1242 static caddr_t crashdumpmap; 1243 1244 /* 1245 * Internal flags for pmap_enter()'s helper functions. 1246 */ 1247 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1248 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1249 1250 /* 1251 * Internal flags for pmap_mapdev_internal() and 1252 * pmap_change_props_locked(). 1253 */ 1254 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ 1255 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ 1256 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. 
*/ 1257 1258 TAILQ_HEAD(pv_chunklist, pv_chunk); 1259 1260 static void free_pv_chunk(struct pv_chunk *pc); 1261 static void free_pv_chunk_batch(struct pv_chunklist *batch); 1262 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1263 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1264 static int popcnt_pc_map_pq(uint64_t *map); 1265 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1266 static void reserve_pv_entries(pmap_t pmap, int needed, 1267 struct rwlock **lockp); 1268 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1269 struct rwlock **lockp); 1270 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1271 u_int flags, struct rwlock **lockp); 1272 #if VM_NRESERVLEVEL > 0 1273 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1274 struct rwlock **lockp); 1275 #endif 1276 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1277 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1278 vm_offset_t va); 1279 1280 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 1281 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 1282 vm_prot_t prot, int mode, int flags); 1283 static bool pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1284 static bool pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1285 vm_offset_t va, struct rwlock **lockp); 1286 static bool pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1287 vm_offset_t va); 1288 static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1289 vm_prot_t prot, struct rwlock **lockp); 1290 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1291 u_int flags, vm_page_t m, struct rwlock **lockp); 1292 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1293 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1294 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1295 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1296 bool allpte_PG_A_set); 1297 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1298 vm_offset_t eva); 1299 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1300 vm_offset_t eva); 1301 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1302 pd_entry_t pde); 1303 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1304 static vm_page_t pmap_large_map_getptp_unlocked(void); 1305 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1306 #if VM_NRESERVLEVEL > 0 1307 static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1308 vm_page_t mpte, struct rwlock **lockp); 1309 #endif 1310 static bool pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1311 vm_prot_t prot); 1312 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); 1313 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1314 bool exec); 1315 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1316 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1317 static void pmap_pti_wire_pte(void *pte); 1318 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1319 struct spglist *free, struct rwlock **lockp); 1320 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1321 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1322 static vm_page_t pmap_remove_pt_page(pmap_t 
pmap, vm_offset_t va); 1323 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1324 struct spglist *free); 1325 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1326 pd_entry_t *pde, struct spglist *free, 1327 struct rwlock **lockp); 1328 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1329 vm_page_t m, struct rwlock **lockp); 1330 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1331 pd_entry_t newpde); 1332 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1333 1334 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 1335 struct rwlock **lockp); 1336 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, 1337 struct rwlock **lockp, vm_offset_t va); 1338 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, 1339 struct rwlock **lockp, vm_offset_t va); 1340 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1341 struct rwlock **lockp); 1342 1343 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1344 struct spglist *free); 1345 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1346 1347 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); 1348 static void pmap_free_pt_page(pmap_t, vm_page_t, bool); 1349 1350 /********************/ 1351 /* Inline functions */ 1352 /********************/ 1353 1354 /* 1355 * Return a non-clipped indexes for a given VA, which are page table 1356 * pages indexes at the corresponding level. 1357 */ 1358 static __inline vm_pindex_t 1359 pmap_pde_pindex(vm_offset_t va) 1360 { 1361 return (va >> PDRSHIFT); 1362 } 1363 1364 static __inline vm_pindex_t 1365 pmap_pdpe_pindex(vm_offset_t va) 1366 { 1367 return (NUPDE + (va >> PDPSHIFT)); 1368 } 1369 1370 static __inline vm_pindex_t 1371 pmap_pml4e_pindex(vm_offset_t va) 1372 { 1373 return (NUPDE + NUPDPE + (va >> PML4SHIFT)); 1374 } 1375 1376 static __inline vm_pindex_t 1377 pmap_pml5e_pindex(vm_offset_t va) 1378 { 1379 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); 1380 } 1381 1382 static __inline pml4_entry_t * 1383 pmap_pml5e(pmap_t pmap, vm_offset_t va) 1384 { 1385 1386 MPASS(pmap_is_la57(pmap)); 1387 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); 1388 } 1389 1390 static __inline pml4_entry_t * 1391 pmap_pml5e_u(pmap_t pmap, vm_offset_t va) 1392 { 1393 1394 MPASS(pmap_is_la57(pmap)); 1395 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); 1396 } 1397 1398 static __inline pml4_entry_t * 1399 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) 1400 { 1401 pml4_entry_t *pml4e; 1402 1403 /* XXX MPASS(pmap_is_la57(pmap); */ 1404 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1405 return (&pml4e[pmap_pml4e_index(va)]); 1406 } 1407 1408 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1409 static __inline pml4_entry_t * 1410 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1411 { 1412 pml5_entry_t *pml5e; 1413 pml4_entry_t *pml4e; 1414 pt_entry_t PG_V; 1415 1416 if (pmap_is_la57(pmap)) { 1417 pml5e = pmap_pml5e(pmap, va); 1418 PG_V = pmap_valid_bit(pmap); 1419 if ((*pml5e & PG_V) == 0) 1420 return (NULL); 1421 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1422 } else { 1423 pml4e = pmap->pm_pmltop; 1424 } 1425 return (&pml4e[pmap_pml4e_index(va)]); 1426 } 1427 1428 static __inline pml4_entry_t * 1429 pmap_pml4e_u(pmap_t pmap, vm_offset_t va) 1430 { 1431 MPASS(!pmap_is_la57(pmap)); 1432 return 
(&pmap->pm_pmltopu[pmap_pml4e_index(va)]); 1433 } 1434 1435 /* Return a pointer to the PDP slot that corresponds to a VA */ 1436 static __inline pdp_entry_t * 1437 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1438 { 1439 pdp_entry_t *pdpe; 1440 1441 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1442 return (&pdpe[pmap_pdpe_index(va)]); 1443 } 1444 1445 /* Return a pointer to the PDP slot that corresponds to a VA */ 1446 static __inline pdp_entry_t * 1447 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1448 { 1449 pml4_entry_t *pml4e; 1450 pt_entry_t PG_V; 1451 1452 PG_V = pmap_valid_bit(pmap); 1453 pml4e = pmap_pml4e(pmap, va); 1454 if (pml4e == NULL || (*pml4e & PG_V) == 0) 1455 return (NULL); 1456 return (pmap_pml4e_to_pdpe(pml4e, va)); 1457 } 1458 1459 /* Return a pointer to the PD slot that corresponds to a VA */ 1460 static __inline pd_entry_t * 1461 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1462 { 1463 pd_entry_t *pde; 1464 1465 KASSERT((*pdpe & PG_PS) == 0, 1466 ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); 1467 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1468 return (&pde[pmap_pde_index(va)]); 1469 } 1470 1471 /* Return a pointer to the PD slot that corresponds to a VA */ 1472 static __inline pd_entry_t * 1473 pmap_pde(pmap_t pmap, vm_offset_t va) 1474 { 1475 pdp_entry_t *pdpe; 1476 pt_entry_t PG_V; 1477 1478 PG_V = pmap_valid_bit(pmap); 1479 pdpe = pmap_pdpe(pmap, va); 1480 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1481 return (NULL); 1482 KASSERT((*pdpe & PG_PS) == 0, 1483 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); 1484 return (pmap_pdpe_to_pde(pdpe, va)); 1485 } 1486 1487 /* Return a pointer to the PT slot that corresponds to a VA */ 1488 static __inline pt_entry_t * 1489 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1490 { 1491 pt_entry_t *pte; 1492 1493 KASSERT((*pde & PG_PS) == 0, 1494 ("%s: pde %#lx is a leaf", __func__, *pde)); 1495 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1496 return (&pte[pmap_pte_index(va)]); 1497 } 1498 1499 /* Return a pointer to the PT slot that corresponds to a VA */ 1500 static __inline pt_entry_t * 1501 pmap_pte(pmap_t pmap, vm_offset_t va) 1502 { 1503 pd_entry_t *pde; 1504 pt_entry_t PG_V; 1505 1506 PG_V = pmap_valid_bit(pmap); 1507 pde = pmap_pde(pmap, va); 1508 if (pde == NULL || (*pde & PG_V) == 0) 1509 return (NULL); 1510 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1511 return ((pt_entry_t *)pde); 1512 return (pmap_pde_to_pte(pde, va)); 1513 } 1514 1515 static __inline void 1516 pmap_resident_count_adj(pmap_t pmap, int count) 1517 { 1518 1519 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1520 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1521 ("pmap %p resident count underflow %ld %d", pmap, 1522 pmap->pm_stats.resident_count, count)); 1523 pmap->pm_stats.resident_count += count; 1524 } 1525 1526 static __inline void 1527 pmap_pt_page_count_pinit(pmap_t pmap, int count) 1528 { 1529 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1530 ("pmap %p resident count underflow %ld %d", pmap, 1531 pmap->pm_stats.resident_count, count)); 1532 pmap->pm_stats.resident_count += count; 1533 } 1534 1535 static __inline void 1536 pmap_pt_page_count_adj(pmap_t pmap, int count) 1537 { 1538 if (pmap == kernel_pmap) 1539 counter_u64_add(kernel_pt_page_count, count); 1540 else { 1541 if (pmap != NULL) 1542 pmap_resident_count_adj(pmap, count); 1543 counter_u64_add(user_pt_page_count, count); 1544 } 1545 } 1546 1547 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 1548 
NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3; 1549 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap; 1550 1551 pt_entry_t * 1552 vtopte(vm_offset_t va) 1553 { 1554 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 1555 1556 return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem))); 1557 } 1558 1559 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1560 NPML4EPGSHIFT)) - 1) << 3; 1561 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap; 1562 1563 static __inline pd_entry_t * 1564 vtopde(vm_offset_t va) 1565 { 1566 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1567 1568 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem))); 1569 } 1570 1571 static u_int64_t 1572 allocpages(vm_paddr_t *firstaddr, int n) 1573 { 1574 u_int64_t ret; 1575 1576 ret = *firstaddr; 1577 bzero((void *)ret, n * PAGE_SIZE); 1578 *firstaddr += n * PAGE_SIZE; 1579 return (ret); 1580 } 1581 1582 CTASSERT(powerof2(NDMPML4E)); 1583 1584 /* number of kernel PDP slots */ 1585 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1586 1587 static void 1588 nkpt_init(vm_paddr_t addr) 1589 { 1590 int pt_pages; 1591 1592 #ifdef NKPT 1593 pt_pages = NKPT; 1594 #else 1595 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1596 pt_pages += NKPDPE(pt_pages); 1597 1598 /* 1599 * Add some slop beyond the bare minimum required for bootstrapping 1600 * the kernel. 1601 * 1602 * This is quite important when allocating KVA for kernel modules. 1603 * The modules are required to be linked in the negative 2GB of 1604 * the address space. If we run out of KVA in this region then 1605 * pmap_growkernel() will need to allocate page table pages to map 1606 * the entire 512GB of KVA space which is an unnecessary tax on 1607 * physical memory. 1608 * 1609 * Secondly, device memory mapped as part of setting up the low- 1610 * level console(s) is taken from KVA, starting at virtual_avail. 1611 * This is because cninit() is called after pmap_bootstrap() but 1612 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1613 * not uncommon. 1614 */ 1615 pt_pages += 32; /* 64MB additional slop. */ 1616 #endif 1617 nkpt = pt_pages; 1618 } 1619 1620 /* 1621 * Returns the proper write/execute permission for a physical page that is 1622 * part of the initial boot allocations. 1623 * 1624 * If the page has kernel text, it is marked as read-only. If the page has 1625 * kernel read-only data, it is marked as read-only/not-executable. If the 1626 * page has only read-write data, it is marked as read-write/not-executable. 1627 * If the page is below/above the kernel range, it is marked as read-write. 1628 * 1629 * This function operates on 2M pages, since we map the kernel space that 1630 * way. 1631 */ 1632 static inline pt_entry_t 1633 bootaddr_rwx(vm_paddr_t pa) 1634 { 1635 /* 1636 * The kernel is loaded at a 2MB-aligned address, and memory below that 1637 * need not be executable. The .bss section is padded to a 2MB 1638 * boundary, so memory following the kernel need not be executable 1639 * either. Preloaded kernel modules have their mapping permissions 1640 * fixed up by the linker. 1641 */ 1642 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1643 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1644 return (X86_PG_RW | pg_nx); 1645 1646 /* 1647 * The linker should ensure that the read-only and read-write 1648 * portions don't share the same 2M page, so this shouldn't 1649 * impact read-only data. 
However, in any case, any page with 1650 * read-write data needs to be read-write. 1651 */ 1652 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1653 return (X86_PG_RW | pg_nx); 1654 1655 /* 1656 * Mark any 2M page containing kernel text as read-only. Mark 1657 * other pages with read-only data as read-only and not executable. 1658 * (It is likely a small portion of the read-only data section will 1659 * be marked as read-only, but executable. This should be acceptable 1660 * since the read-only protection will keep the data from changing.) 1661 * Note that fixups to the .text section will still work until we 1662 * set CR0.WP. 1663 */ 1664 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1665 return (0); 1666 return (pg_nx); 1667 } 1668 1669 static void 1670 create_pagetables(vm_paddr_t *firstaddr) 1671 { 1672 pd_entry_t *pd_p; 1673 pdp_entry_t *pdp_p; 1674 pml4_entry_t *p4_p; 1675 uint64_t DMPDkernphys; 1676 vm_paddr_t pax; 1677 #ifdef KASAN 1678 pt_entry_t *pt_p; 1679 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1680 vm_offset_t kasankernbase; 1681 int kasankpdpi, kasankpdi, nkasanpte; 1682 #endif 1683 int i, j, ndm1g, nkpdpe, nkdmpde; 1684 1685 TSENTER(); 1686 /* Allocate page table pages for the direct map */ 1687 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1688 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1689 ndmpdp = 4; 1690 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1691 if (ndmpdpphys > NDMPML4E) { 1692 /* 1693 * Each NDMPML4E allows 512 GB, so limit to that, 1694 * and then readjust ndmpdp and ndmpdpphys. 1695 */ 1696 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1697 Maxmem = atop(NDMPML4E * NBPML4); 1698 ndmpdpphys = NDMPML4E; 1699 ndmpdp = NDMPML4E * NPDEPG; 1700 } 1701 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1702 ndm1g = 0; 1703 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1704 /* 1705 * Calculate the number of 1G pages that will fully fit in 1706 * Maxmem. 1707 */ 1708 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1709 1710 /* 1711 * Allocate 2M pages for the kernel. These will be used in 1712 * place of the one or more 1G pages from ndm1g that maps 1713 * kernel memory into DMAP. 1714 */ 1715 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1716 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1717 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1718 } 1719 if (ndm1g < ndmpdp) 1720 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1721 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1722 1723 /* Allocate pages. */ 1724 KPML4phys = allocpages(firstaddr, 1); 1725 KPDPphys = allocpages(firstaddr, NKPML4E); 1726 #ifdef KASAN 1727 KASANPDPphys = allocpages(firstaddr, NKASANPML4E); 1728 KASANPDphys = allocpages(firstaddr, 1); 1729 #endif 1730 #ifdef KMSAN 1731 /* 1732 * The KMSAN shadow maps are initially left unpopulated, since there is 1733 * no need to shadow memory above KERNBASE. 1734 */ 1735 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); 1736 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); 1737 #endif 1738 1739 /* 1740 * Allocate the initial number of kernel page table pages required to 1741 * bootstrap. We defer this until after all memory-size dependent 1742 * allocations are done (e.g. direct map), so that we don't have to 1743 * build in too much slop in our estimate. 1744 * 1745 * Note that when NKPML4E > 1, we have an empty page underneath 1746 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1747 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
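 * nkpt_init() below sizes this allocation from the physical memory
 * already consumed by the bootstrap allocations, plus slop for kernel
 * modules and early console mappings.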
1748 */ 1749 nkpt_init(*firstaddr); 1750 nkpdpe = NKPDPE(nkpt); 1751 1752 KPTphys = allocpages(firstaddr, nkpt); 1753 KPDphys = allocpages(firstaddr, nkpdpe); 1754 1755 #ifdef KASAN 1756 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); 1757 KASANPTphys = allocpages(firstaddr, nkasanpte); 1758 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); 1759 #endif 1760 1761 /* 1762 * Connect the zero-filled PT pages to their PD entries. This 1763 * implicitly maps the PT pages at their correct locations within 1764 * the PTmap. 1765 */ 1766 pd_p = (pd_entry_t *)KPDphys; 1767 for (i = 0; i < nkpt; i++) 1768 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1769 1770 /* 1771 * Map from start of the kernel in physical memory (staging 1772 * area) to the end of loader preallocated memory using 2MB 1773 * pages. This replaces some of the PD entries created above. 1774 * For compatibility, identity map 2M at the start. 1775 */ 1776 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | 1777 X86_PG_RW | pg_nx; 1778 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { 1779 /* Preset PG_M and PG_A because demotion expects it. */ 1780 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1781 X86_PG_A | bootaddr_rwx(pax); 1782 } 1783 1784 /* 1785 * Because we map the physical blocks in 2M pages, adjust firstaddr 1786 * to record the physical blocks we've actually mapped into kernel 1787 * virtual address space. 1788 */ 1789 if (*firstaddr < round_2mpage(KERNend)) 1790 *firstaddr = round_2mpage(KERNend); 1791 1792 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1793 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1794 for (i = 0; i < nkpdpe; i++) 1795 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1796 1797 #ifdef KASAN 1798 kasankernbase = kasan_md_addr_to_shad(KERNBASE); 1799 kasankpdpi = pmap_pdpe_index(kasankernbase); 1800 kasankpdi = pmap_pde_index(kasankernbase); 1801 1802 pdp_p = (pdp_entry_t *)KASANPDPphys; 1803 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); 1804 1805 pd_p = (pd_entry_t *)KASANPDphys; 1806 for (i = 0; i < nkasanpte; i++) 1807 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | 1808 X86_PG_V | pg_nx; 1809 1810 pt_p = (pt_entry_t *)KASANPTphys; 1811 for (i = 0; i < nkasanpte * NPTEPG; i++) 1812 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 1813 X86_PG_M | X86_PG_A | pg_nx; 1814 #endif 1815 1816 /* 1817 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1818 * the end of physical memory is not aligned to a 1GB page boundary, 1819 * then the residual physical memory is mapped with 2MB pages. Later, 1820 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1821 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1822 * that are partially used. 1823 */ 1824 pd_p = (pd_entry_t *)DMPDphys; 1825 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1826 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1827 /* Preset PG_M and PG_A because demotion expects it. */ 1828 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1829 X86_PG_M | X86_PG_A | pg_nx; 1830 } 1831 pdp_p = (pdp_entry_t *)DMPDPphys; 1832 for (i = 0; i < ndm1g; i++) { 1833 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1834 /* Preset PG_M and PG_A because demotion expects it. 
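 * (Here, demotion means pmap_demote_pdpe() splitting a 1GB direct-map
 * mapping into 2MB mappings.)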
*/ 1835 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1836 X86_PG_M | X86_PG_A | pg_nx; 1837 } 1838 for (j = 0; i < ndmpdp; i++, j++) { 1839 pdp_p[i] = DMPDphys + ptoa(j); 1840 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1841 } 1842 1843 /* 1844 * Instead of using a 1G page for the memory containing the kernel, 1845 * use 2M pages with read-only and no-execute permissions. (If using 1G 1846 * pages, this will partially overwrite the PDPEs above.) 1847 */ 1848 if (ndm1g > 0) { 1849 pd_p = (pd_entry_t *)DMPDkernphys; 1850 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1851 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1852 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1853 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1854 } 1855 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1856 for (i = 0; i < nkdmpde; i++) { 1857 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1858 X86_PG_RW | X86_PG_V | pg_nx; 1859 } 1860 } 1861 1862 /* And recursively map PML4 to itself in order to get PTmap */ 1863 p4_p = (pml4_entry_t *)KPML4phys; 1864 p4_p[PML4PML4I] = KPML4phys; 1865 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1866 1867 #ifdef KASAN 1868 /* Connect the KASAN shadow map slots up to the PML4. */ 1869 for (i = 0; i < NKASANPML4E; i++) { 1870 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1871 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1872 } 1873 #endif 1874 1875 #ifdef KMSAN 1876 /* Connect the KMSAN shadow map slots up to the PML4. */ 1877 for (i = 0; i < NKMSANSHADPML4E; i++) { 1878 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1879 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1880 } 1881 1882 /* Connect the KMSAN origin map slots up to the PML4. */ 1883 for (i = 0; i < NKMSANORIGPML4E; i++) { 1884 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1885 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1886 } 1887 #endif 1888 1889 /* Connect the Direct Map slots up to the PML4. */ 1890 for (i = 0; i < ndmpdpphys; i++) { 1891 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1892 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1893 } 1894 1895 /* Connect the KVA slots up to the PML4 */ 1896 for (i = 0; i < NKPML4E; i++) { 1897 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1898 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1899 } 1900 1901 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1902 TSEXIT(); 1903 } 1904 1905 /* 1906 * Bootstrap the system enough to run with virtual memory. 1907 * 1908 * On amd64 this is called after mapping has already been enabled 1909 * and just syncs the pmap module with what has already been done. 1910 * [We can't call it easily with mapping off since the kernel is not 1911 * mapped with PA == VA, hence we would have to relocate every address 1912 * from the linked base (virtual) address "KERNBASE" to the actual 1913 * (physical) address starting relative to 0] 1914 */ 1915 void 1916 pmap_bootstrap(vm_paddr_t *firstaddr) 1917 { 1918 vm_offset_t va; 1919 pt_entry_t *pte, *pcpu_pte; 1920 struct region_descriptor r_gdt; 1921 uint64_t cr4, pcpu0_phys; 1922 u_long res; 1923 int i; 1924 1925 TSENTER(); 1926 KERNend = *firstaddr; 1927 res = atop(KERNend - (vm_paddr_t)kernphys); 1928 1929 if (!pti) 1930 pg_g = X86_PG_G; 1931 1932 /* 1933 * Create an initial set of page tables to run the kernel in. 
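 * create_pagetables() builds the bootstrap KPML4/KPDP/KPD/KPT hierarchy
 * and the direct map described above, advancing *firstaddr past every
 * page it consumes.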
1934 */ 1935 create_pagetables(firstaddr); 1936 1937 pcpu0_phys = allocpages(firstaddr, 1); 1938 1939 /* 1940 * Add a physical memory segment (vm_phys_seg) corresponding to the 1941 * preallocated kernel page table pages so that vm_page structures 1942 * representing these pages will be created. The vm_page structures 1943 * are required for promotion of the corresponding kernel virtual 1944 * addresses to superpage mappings. 1945 */ 1946 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1947 1948 /* 1949 * Account for the virtual addresses mapped by create_pagetables(). 1950 */ 1951 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1952 (vm_paddr_t)kernphys); 1953 virtual_end = VM_MAX_KERNEL_ADDRESS; 1954 1955 /* 1956 * Enable PG_G global pages, then switch to the kernel page 1957 * table from the bootstrap page table. After the switch, it 1958 * is possible to enable SMEP and SMAP since PG_U bits are 1959 * correct now. 1960 */ 1961 cr4 = rcr4(); 1962 cr4 |= CR4_PGE; 1963 load_cr4(cr4); 1964 load_cr3(KPML4phys); 1965 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1966 cr4 |= CR4_SMEP; 1967 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1968 cr4 |= CR4_SMAP; 1969 load_cr4(cr4); 1970 1971 /* 1972 * Initialize the kernel pmap (which is statically allocated). 1973 * Count bootstrap data as being resident in case any of this data is 1974 * later unmapped (using pmap_remove()) and freed. 1975 */ 1976 PMAP_LOCK_INIT(kernel_pmap); 1977 kernel_pmap->pm_pmltop = kernel_pml4; 1978 kernel_pmap->pm_cr3 = KPML4phys; 1979 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1980 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1981 kernel_pmap->pm_stats.resident_count = res; 1982 vm_radix_init(&kernel_pmap->pm_root); 1983 kernel_pmap->pm_flags = pmap_flags; 1984 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 1985 rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range, 1986 pkru_free_range, kernel_pmap, M_NOWAIT); 1987 } 1988 1989 /* 1990 * The kernel pmap is always active on all CPUs. Once CPUs are 1991 * enumerated, the mask will be set equal to all_cpus. 1992 */ 1993 CPU_FILL(&kernel_pmap->pm_active); 1994 1995 /* 1996 * Initialize the TLB invalidations generation number lock. 1997 */ 1998 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1999 2000 /* 2001 * Reserve some special page table entries/VA space for temporary 2002 * mapping of pages. 2003 */ 2004 #define SYSMAP(c, p, v, n) \ 2005 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 2006 2007 va = virtual_avail; 2008 pte = vtopte(va); 2009 2010 /* 2011 * Crashdump maps. The first page is reused as CMAP1 for the 2012 * memory test. 2013 */ 2014 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 2015 CADDR1 = crashdumpmap; 2016 2017 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 2018 virtual_avail = va; 2019 2020 /* 2021 * Map the BSP PCPU now, the rest of the PCPUs are mapped by 2022 * amd64_mp_alloc_pcpu()/start_all_aps() when we know the 2023 * number of CPUs and NUMA affinity. 2024 */ 2025 pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx | 2026 X86_PG_M | X86_PG_A; 2027 for (i = 1; i < MAXCPU; i++) 2028 pcpu_pte[i] = 0; 2029 2030 /* 2031 * Re-initialize PCPU area for BSP after switching. 2032 * Make hardware use gdt and common_tss from the new PCPU. 
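 * The BSP was using temp_bsp_pcpu until now; its gdt, dynamic per-CPU
 * data, and ACPI id are carried over below.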
2033 */ 2034 STAILQ_INIT(&cpuhead); 2035 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2036 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2037 amd64_bsp_pcpu_init1(&__pcpu[0]); 2038 amd64_bsp_ist_init(&__pcpu[0]); 2039 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2040 IOPERM_BITMAP_SIZE; 2041 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2042 sizeof(struct user_segment_descriptor)); 2043 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2044 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2045 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2046 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2047 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2048 lgdt(&r_gdt); 2049 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2050 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2051 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2052 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2053 2054 /* 2055 * Initialize the PAT MSR. 2056 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2057 * side-effect, invalidates stale PG_G TLB entries that might 2058 * have been created in our pre-boot environment. 2059 */ 2060 pmap_init_pat(); 2061 2062 /* Initialize TLB Context Id. */ 2063 if (pmap_pcid_enabled) { 2064 kernel_pmap->pm_pcidp = (void *)(uintptr_t) 2065 offsetof(struct pcpu, pc_kpmap_store); 2066 2067 PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN); 2068 PCPU_SET(kpmap_store.pm_gen, 1); 2069 2070 /* 2071 * PMAP_PCID_KERN + 1 is used for initialization of 2072 * proc0 pmap. The pmap' pcid state might be used by 2073 * EFIRT entry before first context switch, so it 2074 * needs to be valid. 2075 */ 2076 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2077 PCPU_SET(pcid_gen, 1); 2078 2079 /* 2080 * pcpu area for APs is zeroed during AP startup. 2081 * pc_pcid_next and pc_pcid_gen are initialized by AP 2082 * during pcpu setup. 2083 */ 2084 load_cr4(rcr4() | CR4_PCIDE); 2085 } 2086 TSEXIT(); 2087 } 2088 2089 /* 2090 * Setup the PAT MSR. 2091 */ 2092 void 2093 pmap_init_pat(void) 2094 { 2095 uint64_t pat_msr; 2096 u_long cr0, cr4; 2097 int i; 2098 2099 /* Bail if this CPU doesn't implement PAT. */ 2100 if ((cpu_feature & CPUID_PAT) == 0) 2101 panic("no PAT??"); 2102 2103 /* Set default PAT index table. */ 2104 for (i = 0; i < PAT_INDEX_SIZE; i++) 2105 pat_index[i] = -1; 2106 pat_index[PAT_WRITE_BACK] = 0; 2107 pat_index[PAT_WRITE_THROUGH] = 1; 2108 pat_index[PAT_UNCACHEABLE] = 3; 2109 pat_index[PAT_WRITE_COMBINING] = 6; 2110 pat_index[PAT_WRITE_PROTECTED] = 5; 2111 pat_index[PAT_UNCACHED] = 2; 2112 2113 /* 2114 * Initialize default PAT entries. 2115 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2116 * Program 5 and 6 as WP and WC. 2117 * 2118 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2119 * mapping for a 2M page uses a PAT value with the bit 3 set due 2120 * to its overload with PG_PS. 2121 */ 2122 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2123 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2124 PAT_VALUE(2, PAT_UNCACHED) | 2125 PAT_VALUE(3, PAT_UNCACHEABLE) | 2126 PAT_VALUE(4, PAT_WRITE_BACK) | 2127 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2128 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2129 PAT_VALUE(7, PAT_UNCACHEABLE); 2130 2131 /* Disable PGE. */ 2132 cr4 = rcr4(); 2133 load_cr4(cr4 & ~CR4_PGE); 2134 2135 /* Disable caches (CD = 1, NW = 0). */ 2136 cr0 = rcr0(); 2137 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2138 2139 /* Flushes caches and TLBs. */ 2140 wbinvd(); 2141 invltlb(); 2142 2143 /* Update PAT and index table. 
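 * The MSR now maps index 0=WB, 1=WT, 2=UC-, 3=UC, 4=WB, 5=WP, 6=WC,
 * 7=UC, matching the pat_index[] table initialized above.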
*/ 2144 wrmsr(MSR_PAT, pat_msr); 2145 2146 /* Flush caches and TLBs again. */ 2147 wbinvd(); 2148 invltlb(); 2149 2150 /* Restore caches and PGE. */ 2151 load_cr0(cr0); 2152 load_cr4(cr4); 2153 } 2154 2155 vm_page_t 2156 pmap_page_alloc_below_4g(bool zeroed) 2157 { 2158 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2159 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2160 } 2161 2162 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2163 la57_trampoline_gdt[], la57_trampoline_end[]; 2164 2165 static void 2166 pmap_bootstrap_la57(void *arg __unused) 2167 { 2168 char *v_code; 2169 pml5_entry_t *v_pml5; 2170 pml4_entry_t *v_pml4; 2171 pdp_entry_t *v_pdp; 2172 pd_entry_t *v_pd; 2173 pt_entry_t *v_pt; 2174 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2175 void (*la57_tramp)(uint64_t pml5); 2176 struct region_descriptor r_gdt; 2177 2178 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2179 return; 2180 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2181 if (!la57) 2182 return; 2183 2184 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2185 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2186 2187 m_code = pmap_page_alloc_below_4g(true); 2188 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2189 m_pml5 = pmap_page_alloc_below_4g(true); 2190 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2191 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2192 m_pml4 = pmap_page_alloc_below_4g(true); 2193 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2194 m_pdp = pmap_page_alloc_below_4g(true); 2195 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2196 m_pd = pmap_page_alloc_below_4g(true); 2197 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2198 m_pt = pmap_page_alloc_below_4g(true); 2199 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2200 2201 /* 2202 * Map m_code 1:1, it appears below 4G in KVA due to physical 2203 * address being below 4G. Since kernel KVA is in upper half, 2204 * the pml4e should be zero and free for temporary use. 2205 */ 2206 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2207 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2208 X86_PG_M; 2209 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2210 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2211 X86_PG_M; 2212 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2213 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2214 X86_PG_M; 2215 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2216 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2217 X86_PG_M; 2218 2219 /* 2220 * Add pml5 entry at top of KVA pointing to existing pml4 table, 2221 * entering all existing kernel mappings into level 5 table. 2222 */ 2223 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2224 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; 2225 2226 /* 2227 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. 2228 */ 2229 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2230 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2231 X86_PG_M; 2232 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2233 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2234 X86_PG_M; 2235 2236 /* 2237 * Copy and call the 48->57 trampoline, hope we return there, alive. 
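 * (The trampoline runs from the 1:1 mapping created above because
 * switching to LA57 requires paging to be disabled briefly, during
 * which only physical addresses are usable.)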
*/ 2239 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2240 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2241 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2242 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2243 invlpg((vm_offset_t)la57_tramp); 2244 la57_tramp(KPML5phys); 2245 2246 /* 2247 * The gdt was necessarily reset; switch back to our gdt. 2248 */ 2249 lgdt(&r_gdt); 2250 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2251 load_ds(_udatasel); 2252 load_es(_udatasel); 2253 load_fs(_ufssel); 2254 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2255 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2256 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2257 2258 /* 2259 * Now unmap the trampoline, and free the pages. 2260 * Clear pml5 entry used for 1:1 trampoline mapping. 2261 */ 2262 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2263 invlpg((vm_offset_t)v_code); 2264 vm_page_free(m_code); 2265 vm_page_free(m_pdp); 2266 vm_page_free(m_pd); 2267 vm_page_free(m_pt); 2268 2269 /* 2270 * Recursively map PML5 to itself in order to get PTmap and 2271 * PDmap. 2272 */ 2273 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2274 2275 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 2276 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2277 PTmap = (vm_offset_t)P5Tmap; 2278 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 2279 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2280 PDmap = (vm_offset_t)P5Dmap; 2281 2282 kernel_pmap->pm_cr3 = KPML5phys; 2283 kernel_pmap->pm_pmltop = v_pml5; 2284 pmap_pt_page_count_adj(kernel_pmap, 1); 2285 } 2286 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2287 2288 /* 2289 * Initialize a vm_page's machine-dependent fields. 2290 */ 2291 void 2292 pmap_page_init(vm_page_t m) 2293 { 2294 2295 TAILQ_INIT(&m->md.pv_list); 2296 m->md.pat_mode = PAT_WRITE_BACK; 2297 } 2298 2299 static int pmap_allow_2m_x_ept; 2300 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2301 &pmap_allow_2m_x_ept, 0, 2302 "Allow executable superpage mappings in EPT"); 2303 2304 void 2305 pmap_allow_2m_x_ept_recalculate(void) 2306 { 2307 /* 2308 * SKL002, SKL012S. Since the EPT format is only used by 2309 * Intel CPUs, the vendor check is merely a formality.
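 * The erratum concerns a machine check that can be raised when the page
 * size of an executable EPT mapping changes; CPUs advertising
 * IF_PSCHANGE_MC_NO, and the Atom/Knights models listed below, are not
 * affected.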
2310 */ 2311 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2312 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2313 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2314 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2315 CPUID_TO_MODEL(cpu_id) == 0x27 || 2316 CPUID_TO_MODEL(cpu_id) == 0x35 || 2317 CPUID_TO_MODEL(cpu_id) == 0x36 || 2318 CPUID_TO_MODEL(cpu_id) == 0x37 || 2319 CPUID_TO_MODEL(cpu_id) == 0x86 || 2320 CPUID_TO_MODEL(cpu_id) == 0x1c || 2321 CPUID_TO_MODEL(cpu_id) == 0x4a || 2322 CPUID_TO_MODEL(cpu_id) == 0x4c || 2323 CPUID_TO_MODEL(cpu_id) == 0x4d || 2324 CPUID_TO_MODEL(cpu_id) == 0x5a || 2325 CPUID_TO_MODEL(cpu_id) == 0x5c || 2326 CPUID_TO_MODEL(cpu_id) == 0x5d || 2327 CPUID_TO_MODEL(cpu_id) == 0x5f || 2328 CPUID_TO_MODEL(cpu_id) == 0x6e || 2329 CPUID_TO_MODEL(cpu_id) == 0x7a || 2330 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2331 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2332 pmap_allow_2m_x_ept = 1; 2333 #ifndef BURN_BRIDGES 2334 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2335 #endif 2336 TUNABLE_INT_FETCH("vm.pmap.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2337 } 2338 2339 static bool 2340 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2341 { 2342 2343 return (pmap->pm_type != PT_EPT || !executable || 2344 !pmap_allow_2m_x_ept); 2345 } 2346 2347 #ifdef NUMA 2348 static void 2349 pmap_init_pv_table(void) 2350 { 2351 struct pmap_large_md_page *pvd; 2352 vm_size_t s; 2353 long start, end, highest, pv_npg; 2354 int domain, i, j, pages; 2355 2356 /* 2357 * For correctness we depend on the size being evenly divisible into a 2358 * page. As a tradeoff between performance and total memory use, the 2359 * entry is 64 bytes (aka one cacheline) in size. Not being smaller 2360 * avoids false-sharing, but not being 128 bytes potentially allows for 2361 * avoidable traffic due to adjacent cacheline prefetcher. 2362 * 2363 * Assert the size so that accidental changes fail to compile. 2364 */ 2365 CTASSERT((sizeof(*pvd) == 64)); 2366 2367 /* 2368 * Calculate the size of the array. 2369 */ 2370 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2371 pv_npg = howmany(pmap_last_pa, NBPDR); 2372 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2373 s = round_page(s); 2374 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2375 if (pv_table == NULL) 2376 panic("%s: kva_alloc failed\n", __func__); 2377 2378 /* 2379 * Iterate physical segments to allocate space for respective pages. 
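 * The pages backing each segment's slice of pv_table are allocated from
 * that segment's own domain, keeping PV list metadata local to the
 * memory it describes.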
2380 */ 2381 highest = -1; 2382 s = 0; 2383 for (i = 0; i < vm_phys_nsegs; i++) { 2384 end = vm_phys_segs[i].end / NBPDR; 2385 domain = vm_phys_segs[i].domain; 2386 2387 if (highest >= end) 2388 continue; 2389 2390 start = highest + 1; 2391 pvd = &pv_table[start]; 2392 2393 pages = end - start + 1; 2394 s = round_page(pages * sizeof(*pvd)); 2395 highest = start + (s / sizeof(*pvd)) - 1; 2396 2397 for (j = 0; j < s; j += PAGE_SIZE) { 2398 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2399 if (m == NULL) 2400 panic("failed to allocate PV table page"); 2401 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2402 } 2403 2404 for (j = 0; j < s / sizeof(*pvd); j++) { 2405 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2406 TAILQ_INIT(&pvd->pv_page.pv_list); 2407 pvd->pv_page.pv_gen = 0; 2408 pvd->pv_page.pat_mode = 0; 2409 pvd->pv_invl_gen = 0; 2410 pvd++; 2411 } 2412 } 2413 pvd = &pv_dummy_large; 2414 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2415 TAILQ_INIT(&pvd->pv_page.pv_list); 2416 pvd->pv_page.pv_gen = 0; 2417 pvd->pv_page.pat_mode = 0; 2418 pvd->pv_invl_gen = 0; 2419 } 2420 #else 2421 static void 2422 pmap_init_pv_table(void) 2423 { 2424 vm_size_t s; 2425 long i, pv_npg; 2426 2427 /* 2428 * Initialize the pool of pv list locks. 2429 */ 2430 for (i = 0; i < NPV_LIST_LOCKS; i++) 2431 rw_init(&pv_list_locks[i], "pmap pv list"); 2432 2433 /* 2434 * Calculate the size of the pv head table for superpages. 2435 */ 2436 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2437 2438 /* 2439 * Allocate memory for the pv head table for superpages. 2440 */ 2441 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2442 s = round_page(s); 2443 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 2444 for (i = 0; i < pv_npg; i++) 2445 TAILQ_INIT(&pv_table[i].pv_list); 2446 TAILQ_INIT(&pv_dummy.pv_list); 2447 } 2448 #endif 2449 2450 /* 2451 * Initialize the pmap module. 2452 * Called by vm_init, to initialize any structures that the pmap 2453 * system needs to map virtual memory. 2454 */ 2455 void 2456 pmap_init(void) 2457 { 2458 struct pmap_preinit_mapping *ppim; 2459 vm_page_t m, mpte; 2460 int error, i, ret, skz63; 2461 2462 /* L1TF, reserve page @0 unconditionally */ 2463 vm_page_blacklist_add(0, bootverbose); 2464 2465 /* Detect bare-metal Skylake Server and Skylake-X. */ 2466 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2467 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2468 /* 2469 * Skylake-X errata SKZ63. Processor May Hang When 2470 * Executing Code In an HLE Transaction Region between 2471 * 40000000H and 403FFFFFH. 2472 * 2473 * Mark the pages in the range as preallocated. It 2474 * seems to be impossible to distinguish between 2475 * Skylake Server and Skylake X. 2476 */ 2477 skz63 = 1; 2478 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2479 if (skz63 != 0) { 2480 if (bootverbose) 2481 printf("SKZ63: skipping 4M RAM starting " 2482 "at physical 1G\n"); 2483 for (i = 0; i < atop(0x400000); i++) { 2484 ret = vm_page_blacklist_add(0x40000000 + 2485 ptoa(i), false); 2486 if (!ret && bootverbose) 2487 printf("page at %#lx already used\n", 2488 0x40000000 + ptoa(i)); 2489 } 2490 } 2491 } 2492 2493 /* IFU */ 2494 pmap_allow_2m_x_ept_recalculate(); 2495 2496 /* 2497 * Initialize the vm page array entries for the kernel pmap's 2498 * page table pages. 
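 * The zero-filled PT pages that were superseded by 2MB mappings in
 * create_pagetables() are also inserted into the kernel pmap's radix
 * tree below, so that a later demotion of those mappings can reuse them.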
2499 */ 2500 PMAP_LOCK(kernel_pmap); 2501 for (i = 0; i < nkpt; i++) { 2502 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2503 KASSERT(mpte >= vm_page_array && 2504 mpte < &vm_page_array[vm_page_array_size], 2505 ("pmap_init: page table page is out of range")); 2506 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2507 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2508 mpte->ref_count = 1; 2509 2510 /* 2511 * Collect the page table pages that were replaced by a 2MB 2512 * page in create_pagetables(). They are zero filled. 2513 */ 2514 if ((i == 0 || 2515 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2516 pmap_insert_pt_page(kernel_pmap, mpte, false, false)) 2517 panic("pmap_init: pmap_insert_pt_page failed"); 2518 } 2519 PMAP_UNLOCK(kernel_pmap); 2520 vm_wire_add(nkpt); 2521 2522 /* 2523 * If the kernel is running on a virtual machine, then it must assume 2524 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2525 * be prepared for the hypervisor changing the vendor and family that 2526 * are reported by CPUID. Consequently, the workaround for AMD Family 2527 * 10h Erratum 383 is enabled if the processor's feature set does not 2528 * include at least one feature that is only supported by older Intel 2529 * or newer AMD processors. 2530 */ 2531 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2532 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2533 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2534 AMDID2_FMA4)) == 0) 2535 workaround_erratum383 = 1; 2536 2537 /* 2538 * Are large page mappings enabled? 2539 */ 2540 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2541 if (pg_ps_enabled) { 2542 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2543 ("pmap_init: can't assign to pagesizes[1]")); 2544 pagesizes[1] = NBPDR; 2545 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2546 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2547 ("pmap_init: can't assign to pagesizes[2]")); 2548 pagesizes[2] = NBPDP; 2549 } 2550 } 2551 2552 /* 2553 * Initialize pv chunk lists. 
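 * One chunk list and its lock are created per memory domain
 * (PMAP_MEMDOM).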
2554 */ 2555 for (i = 0; i < PMAP_MEMDOM; i++) { 2556 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2557 TAILQ_INIT(&pv_chunks[i].pvc_list); 2558 } 2559 pmap_init_pv_table(); 2560 2561 pmap_initialized = 1; 2562 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2563 ppim = pmap_preinit_mapping + i; 2564 if (ppim->va == 0) 2565 continue; 2566 /* Make the direct map consistent */ 2567 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2568 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2569 ppim->sz, ppim->mode); 2570 } 2571 if (!bootverbose) 2572 continue; 2573 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2574 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2575 } 2576 2577 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2578 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2579 (vmem_addr_t *)&qframe); 2580 if (error != 0) 2581 panic("qframe allocation failed"); 2582 2583 lm_ents = 8; 2584 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2585 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2586 lm_ents = LMEPML4I - LMSPML4I + 1; 2587 #ifdef KMSAN 2588 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2589 printf( 2590 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2591 lm_ents, KMSANORIGPML4I - LMSPML4I); 2592 lm_ents = KMSANORIGPML4I - LMSPML4I; 2593 } 2594 #endif 2595 if (bootverbose) 2596 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2597 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2598 if (lm_ents != 0) { 2599 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2600 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2601 if (large_vmem == NULL) { 2602 printf("pmap: cannot create large map\n"); 2603 lm_ents = 0; 2604 } 2605 for (i = 0; i < lm_ents; i++) { 2606 m = pmap_large_map_getptp_unlocked(); 2607 /* XXXKIB la57 */ 2608 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2609 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2610 VM_PAGE_TO_PHYS(m); 2611 } 2612 } 2613 } 2614 2615 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2616 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2617 "Maximum number of PML4 entries for use by large map (tunable). " 2618 "Each entry corresponds to 512GB of address space."); 2619 2620 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2621 "2MB page mapping counters"); 2622 2623 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2624 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2625 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2626 2627 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2628 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2629 &pmap_pde_mappings, "2MB page mappings"); 2630 2631 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2632 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2633 &pmap_pde_p_failures, "2MB page promotion failures"); 2634 2635 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2636 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2637 &pmap_pde_promotions, "2MB page promotions"); 2638 2639 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2640 "1GB page mapping counters"); 2641 2642 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2643 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2644 &pmap_pdpe_demotions, "1GB page demotions"); 2645 2646 /*************************************************** 2647 * Low level helper routines..... 
2648 ***************************************************/ 2649 2650 static pt_entry_t 2651 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2652 { 2653 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2654 2655 switch (pmap->pm_type) { 2656 case PT_X86: 2657 case PT_RVI: 2658 /* Verify that both PAT bits are not set at the same time */ 2659 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2660 ("Invalid PAT bits in entry %#lx", entry)); 2661 2662 /* Swap the PAT bits if one of them is set */ 2663 if ((entry & x86_pat_bits) != 0) 2664 entry ^= x86_pat_bits; 2665 break; 2666 case PT_EPT: 2667 /* 2668 * Nothing to do - the memory attributes are represented 2669 * the same way for regular pages and superpages. 2670 */ 2671 break; 2672 default: 2673 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2674 } 2675 2676 return (entry); 2677 } 2678 2679 bool 2680 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2681 { 2682 2683 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2684 pat_index[(int)mode] >= 0); 2685 } 2686 2687 /* 2688 * Determine the appropriate bits to set in a PTE or PDE for a specified 2689 * caching mode. 2690 */ 2691 int 2692 pmap_cache_bits(pmap_t pmap, int mode, bool is_pde) 2693 { 2694 int cache_bits, pat_flag, pat_idx; 2695 2696 if (!pmap_is_valid_memattr(pmap, mode)) 2697 panic("Unknown caching mode %d\n", mode); 2698 2699 switch (pmap->pm_type) { 2700 case PT_X86: 2701 case PT_RVI: 2702 /* The PAT bit is different for PTE's and PDE's. */ 2703 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2704 2705 /* Map the caching mode to a PAT index. */ 2706 pat_idx = pat_index[mode]; 2707 2708 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2709 cache_bits = 0; 2710 if (pat_idx & 0x4) 2711 cache_bits |= pat_flag; 2712 if (pat_idx & 0x2) 2713 cache_bits |= PG_NC_PCD; 2714 if (pat_idx & 0x1) 2715 cache_bits |= PG_NC_PWT; 2716 break; 2717 2718 case PT_EPT: 2719 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2720 break; 2721 2722 default: 2723 panic("unsupported pmap type %d", pmap->pm_type); 2724 } 2725 2726 return (cache_bits); 2727 } 2728 2729 static int 2730 pmap_cache_mask(pmap_t pmap, bool is_pde) 2731 { 2732 int mask; 2733 2734 switch (pmap->pm_type) { 2735 case PT_X86: 2736 case PT_RVI: 2737 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2738 break; 2739 case PT_EPT: 2740 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2741 break; 2742 default: 2743 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2744 } 2745 2746 return (mask); 2747 } 2748 2749 static int 2750 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2751 { 2752 int pat_flag, pat_idx; 2753 2754 pat_idx = 0; 2755 switch (pmap->pm_type) { 2756 case PT_X86: 2757 case PT_RVI: 2758 /* The PAT bit is different for PTE's and PDE's. */ 2759 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2760 2761 if ((pte & pat_flag) != 0) 2762 pat_idx |= 0x4; 2763 if ((pte & PG_NC_PCD) != 0) 2764 pat_idx |= 0x2; 2765 if ((pte & PG_NC_PWT) != 0) 2766 pat_idx |= 0x1; 2767 break; 2768 case PT_EPT: 2769 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2770 panic("EPT PTE %#lx has no PAT memory type", pte); 2771 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2772 break; 2773 } 2774 2775 /* See pmap_init_pat(). 
*/ 2776 if (pat_idx == 4) 2777 pat_idx = 0; 2778 if (pat_idx == 7) 2779 pat_idx = 3; 2780 2781 return (pat_idx); 2782 } 2783 2784 bool 2785 pmap_ps_enabled(pmap_t pmap) 2786 { 2787 2788 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2789 } 2790 2791 static void 2792 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2793 { 2794 2795 switch (pmap->pm_type) { 2796 case PT_X86: 2797 break; 2798 case PT_RVI: 2799 case PT_EPT: 2800 /* 2801 * XXX 2802 * This is a little bogus since the generation number is 2803 * supposed to be bumped up when a region of the address 2804 * space is invalidated in the page tables. 2805 * 2806 * In this case the old PDE entry is valid but yet we want 2807 * to make sure that any mappings using the old entry are 2808 * invalidated in the TLB. 2809 * 2810 * The reason this works as expected is because we rendezvous 2811 * "all" host cpus and force any vcpu context to exit as a 2812 * side-effect. 2813 */ 2814 atomic_add_long(&pmap->pm_eptgen, 1); 2815 break; 2816 default: 2817 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2818 } 2819 pde_store(pde, newpde); 2820 } 2821 2822 /* 2823 * After changing the page size for the specified virtual address in the page 2824 * table, flush the corresponding entries from the processor's TLB. Only the 2825 * calling processor's TLB is affected. 2826 * 2827 * The calling thread must be pinned to a processor. 2828 */ 2829 static void 2830 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2831 { 2832 pt_entry_t PG_G; 2833 2834 if (pmap_type_guest(pmap)) 2835 return; 2836 2837 KASSERT(pmap->pm_type == PT_X86, 2838 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2839 2840 PG_G = pmap_global_bit(pmap); 2841 2842 if ((newpde & PG_PS) == 0) 2843 /* Demotion: flush a specific 2MB page mapping. */ 2844 pmap_invlpg(pmap, va); 2845 else if ((newpde & PG_G) == 0) 2846 /* 2847 * Promotion: flush every 4KB page mapping from the TLB 2848 * because there are too many to flush individually. 2849 */ 2850 invltlb(); 2851 else { 2852 /* 2853 * Promotion: flush every 4KB page mapping from the TLB, 2854 * including any global (PG_G) mappings. 2855 */ 2856 invltlb_glob(); 2857 } 2858 } 2859 2860 /* 2861 * The amd64 pmap uses different approaches to TLB invalidation 2862 * depending on the kernel configuration, available hardware features, 2863 * and known hardware errata. The kernel configuration option that 2864 * has the greatest operational impact on TLB invalidation is PTI, 2865 * which is enabled automatically on affected Intel CPUs. The most 2866 * impactful hardware features are first PCID, and then INVPCID 2867 * instruction presence. PCID usage is quite different for PTI 2868 * vs. non-PTI. 2869 * 2870 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2871 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2872 * space is served by two page tables, user and kernel. The user 2873 * page table only maps user space and a kernel trampoline. The 2874 * kernel trampoline includes the entirety of the kernel text but 2875 * only the kernel data that is needed to switch from user to kernel 2876 * mode. The kernel page table maps the user and kernel address 2877 * spaces in their entirety. It is identical to the per-process 2878 * page table used in non-PTI mode. 2879 * 2880 * User page tables are only used when the CPU is in user mode. 
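 * (On each entry to the kernel the trampoline switches %cr3 from the
 * user to the kernel page table.)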
2881 * Consequently, some TLB invalidations can be postponed until the 2882 * switch from kernel to user mode. In contrast, the user 2883 * space part of the kernel page table is used for copyout(9), so 2884 * TLB invalidations on this page table cannot be similarly postponed. 2885 * 2886 * The existence of a user mode page table for the given pmap is 2887 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2888 * which case pm_ucr3 contains the %cr3 register value for the user 2889 * mode page table's root. 2890 * 2891 * * The pm_active bitmask indicates which CPUs currently have the 2892 * pmap active. A CPU's bit is set on context switch to the pmap, and 2893 * cleared on switching off this CPU. For the kernel page table, 2894 * the pm_active field is immutable and contains all CPUs. The 2895 * kernel page table is always logically active on every processor, 2896 * but not necessarily in use by the hardware, e.g., in PTI mode. 2897 * 2898 * When requesting invalidation of virtual addresses with 2899 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2900 * all CPUs recorded as active in pm_active. Updates to and reads 2901 * from pm_active are not synchronized, and so they may race with 2902 * each other. Shootdown handlers are prepared to handle the race. 2903 * 2904 * * PCID is an optional feature of the long mode x86 MMU where TLB 2905 * entries are tagged with the 'Process ID' of the address space 2906 * they belong to. This feature provides a limited namespace for 2907 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2908 * total. 2909 * 2910 * Allocation of a PCID to a pmap is done by an algorithm described 2911 * in section 15.12, "Other TLB Consistency Algorithms", of 2912 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2913 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2914 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when 2915 * the CPU is about to start caching TLB entries from a pmap, 2916 * i.e., on the context switch that activates the pmap on the CPU. 2917 * 2918 * The PCID allocator maintains a per-CPU, per-pmap generation 2919 * count, pm_gen, which is incremented each time a new PCID is 2920 * allocated. On TLB invalidation, the generation counters for the 2921 * pmap are zeroed, which signals the context switch code that the 2922 * previously allocated PCID is no longer valid. Effectively, 2923 * zeroing any of these counters triggers a TLB shootdown for the 2924 * given CPU/address space, due to the allocation of a new PCID. 2925 * 2926 * Zeroing can be performed remotely. Consequently, if a pmap is 2927 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2928 * be initiated by an ordinary memory access to reset the target 2929 * CPU's generation count within the pmap. The CPU initiating the 2930 * TLB shootdown does not need to send an IPI to the target CPU. 2931 * 2932 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2933 * for complete (kernel) page tables, and PCIDs for user mode page 2934 * tables. A user PCID value is obtained from the kernel PCID value 2935 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2936 * 2937 * User space page tables are activated on return to user mode, by 2938 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2939 * clearing bit 63 of the loaded ucr3, this effectively causes 2940 * complete invalidation of the user mode TLB entries for the 2941 * current pmap. 
In which case, local invalidations of individual 2942 * pages in the user page table are skipped. 2943 * 2944 * * Local invalidation, all modes. If the requested invalidation is 2945 * for a specific address or the total invalidation of a currently 2946 * active pmap, then the TLB is flushed using INVLPG for a kernel 2947 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a 2948 * user space page table(s). 2949 * 2950 * If the INVPCID instruction is available, it is used to flush user 2951 * entries from the kernel page table. 2952 * 2953 * When PCID is enabled, the INVLPG instruction invalidates all TLB 2954 * entries for the given page that either match the current PCID or 2955 * are global. Since TLB entries for the same page under different 2956 * PCIDs are unaffected, kernel pages which reside in all address 2957 * spaces could be problematic. We avoid the problem by creating 2958 * all kernel PTEs with the global flag (PG_G) set, when PTI is 2959 * disabled. 2960 * 2961 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its 2962 * address space, all other 4095 PCIDs are used for user mode spaces 2963 * as described above. A context switch allocates a new PCID if 2964 * the recorded PCID is zero or the recorded generation does not match 2965 * the CPU's generation, effectively flushing the TLB for this address space. 2966 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2967 * local user page: INVLPG 2968 * local kernel page: INVLPG 2969 * local user total: INVPCID(CTX) 2970 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2971 * remote user page, inactive pmap: zero pm_gen 2972 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2973 * (Both actions are required to handle the aforementioned pm_active races.) 2974 * remote kernel page: IPI:INVLPG 2975 * remote user total, inactive pmap: zero pm_gen 2976 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2977 * reload %cr3) 2978 * (See note above about pm_active races.) 2979 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2980 * 2981 * PTI enabled, PCID present. 2982 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2983 * for upt 2984 * local kernel page: INVLPG 2985 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 2986 * on loading UCR3 into %cr3 for upt 2987 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2988 * remote user page, inactive pmap: zero pm_gen 2989 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 2990 * INVPCID(ADDR) for upt) 2991 * remote kernel page: IPI:INVLPG 2992 * remote user total, inactive pmap: zero pm_gen 2993 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 2994 * clear PCID_SAVE on loading UCR3 into $cr3 for upt) 2995 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2996 * 2997 * No PCID. 2998 * local user page: INVLPG 2999 * local kernel page: INVLPG 3000 * local user total: reload %cr3 3001 * local kernel total: invltlb_glob() 3002 * remote user page, inactive pmap: - 3003 * remote user page, active pmap: IPI:INVLPG 3004 * remote kernel page: IPI:INVLPG 3005 * remote user total, inactive pmap: - 3006 * remote user total, active pmap: IPI:(reload %cr3) 3007 * remote kernel total: IPI:invltlb_glob() 3008 * Since on return to user mode, the reload of %cr3 with ucr3 causes 3009 * TLB invalidation, no specific action is required for user page table. 3010 * 3011 * EPT. 
EPT pmaps do not map KVA, all mappings are userspace. 3012 * XXX TODO 3013 */ 3014 3015 #ifdef SMP 3016 /* 3017 * Interrupt the cpus that are executing in the guest context. 3018 * This will force the vcpu to exit and the cached EPT mappings 3019 * will be invalidated by the host before the next vmresume. 3020 */ 3021 static __inline void 3022 pmap_invalidate_ept(pmap_t pmap) 3023 { 3024 smr_seq_t goal; 3025 int ipinum; 3026 3027 sched_pin(); 3028 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 3029 ("pmap_invalidate_ept: absurd pm_active")); 3030 3031 /* 3032 * The TLB mappings associated with a vcpu context are not 3033 * flushed each time a different vcpu is chosen to execute. 3034 * 3035 * This is in contrast with a process's vtop mappings that 3036 * are flushed from the TLB on each context switch. 3037 * 3038 * Therefore we need to do more than just a TLB shootdown on 3039 * the active cpus in 'pmap->pm_active'. To do this we keep 3040 * track of the number of invalidations performed on this pmap. 3041 * 3042 * Each vcpu keeps a cache of this counter and compares it 3043 * just before a vmresume. If the counter is out-of-date an 3044 * invept will be done to flush stale mappings from the TLB. 3045 * 3046 * To ensure that all vCPU threads have observed the new counter 3047 * value before returning, we use SMR. Ordering is important here: 3048 * the VMM enters an SMR read section before loading the counter 3049 * and after updating the pm_active bit set. Thus, pm_active is 3050 * a superset of active readers, and any reader that has observed 3051 * the goal has observed the new counter value. 3052 */ 3053 atomic_add_long(&pmap->pm_eptgen, 1); 3054 3055 goal = smr_advance(pmap->pm_eptsmr); 3056 3057 /* 3058 * Force the vcpu to exit and trap back into the hypervisor. 3059 */ 3060 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3061 ipi_selected(pmap->pm_active, ipinum); 3062 sched_unpin(); 3063 3064 /* 3065 * Ensure that all active vCPUs will observe the new generation counter 3066 * value before executing any more guest instructions. 3067 */ 3068 smr_wait(pmap->pm_eptsmr, goal); 3069 } 3070 3071 static inline void 3072 pmap_invalidate_preipi_pcid(pmap_t pmap) 3073 { 3074 struct pmap_pcid *pcidp; 3075 u_int cpuid, i; 3076 3077 sched_pin(); 3078 3079 cpuid = PCPU_GET(cpuid); 3080 if (pmap != PCPU_GET(curpmap)) 3081 cpuid = 0xffffffff; /* An impossible value */ 3082 3083 CPU_FOREACH(i) { 3084 if (cpuid != i) { 3085 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 3086 pcidp->pm_gen = 0; 3087 } 3088 } 3089 3090 /* 3091 * The fence is between stores to pm_gen and the read of the 3092 * pm_active mask. We need to ensure that it is impossible 3093 * for us to miss the bit update in pm_active and 3094 * simultaneously observe a non-zero pm_gen in 3095 * pmap_activate_sw(), otherwise TLB update is missed. 3096 * Without the fence, IA32 allows such an outcome. Note that 3097 * pm_active is updated by a locked operation, which provides 3098 * the reciprocal fence. 3099 */ 3100 atomic_thread_fence_seq_cst(); 3101 } 3102 3103 static void 3104 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3105 { 3106 sched_pin(); 3107 } 3108 3109 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3110 { 3111 return (pmap_pcid_enabled ? 
pmap_invalidate_preipi_pcid : 3112 pmap_invalidate_preipi_nopcid); 3113 } 3114 3115 static inline void 3116 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3117 const bool invpcid_works1) 3118 { 3119 struct invpcid_descr d; 3120 uint64_t kcr3, ucr3; 3121 uint32_t pcid; 3122 3123 /* 3124 * Because pm_pcid is recalculated on a context switch, we 3125 * must ensure there is no preemption, not just pinning. 3126 * Otherwise, we might use a stale value below. 3127 */ 3128 CRITICAL_ASSERT(curthread); 3129 3130 /* 3131 * No need to do anything with user page tables invalidation 3132 * if there is no user page table, or invalidation is deferred 3133 * until the return to userspace. ucr3_load_mask is stable 3134 * because we have preemption disabled. 3135 */ 3136 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3137 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3138 return; 3139 3140 pcid = pmap_get_pcid(pmap); 3141 if (invpcid_works1) { 3142 d.pcid = pcid | PMAP_PCID_USER_PT; 3143 d.pad = 0; 3144 d.addr = va; 3145 invpcid(&d, INVPCID_ADDR); 3146 } else { 3147 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3148 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3149 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3150 } 3151 } 3152 3153 static void 3154 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3155 { 3156 pmap_invalidate_page_pcid_cb(pmap, va, true); 3157 } 3158 3159 static void 3160 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3161 { 3162 pmap_invalidate_page_pcid_cb(pmap, va, false); 3163 } 3164 3165 static void 3166 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3167 { 3168 } 3169 3170 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3171 { 3172 if (pmap_pcid_enabled) 3173 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : 3174 pmap_invalidate_page_pcid_noinvpcid_cb); 3175 return (pmap_invalidate_page_nopcid_cb); 3176 } 3177 3178 static void 3179 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3180 vm_offset_t addr2 __unused) 3181 { 3182 if (pmap == kernel_pmap) { 3183 pmap_invlpg(kernel_pmap, va); 3184 } else if (pmap == PCPU_GET(curpmap)) { 3185 invlpg(va); 3186 pmap_invalidate_page_cb(pmap, va); 3187 } 3188 } 3189 3190 void 3191 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3192 { 3193 if (pmap_type_guest(pmap)) { 3194 pmap_invalidate_ept(pmap); 3195 return; 3196 } 3197 3198 KASSERT(pmap->pm_type == PT_X86, 3199 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3200 3201 pmap_invalidate_preipi(pmap); 3202 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb); 3203 } 3204 3205 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3206 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3207 3208 static void 3209 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3210 const bool invpcid_works1) 3211 { 3212 struct invpcid_descr d; 3213 uint64_t kcr3, ucr3; 3214 uint32_t pcid; 3215 3216 CRITICAL_ASSERT(curthread); 3217 3218 if (pmap != PCPU_GET(curpmap) || 3219 pmap->pm_ucr3 == PMAP_NO_CR3 || 3220 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3221 return; 3222 3223 pcid = pmap_get_pcid(pmap); 3224 if (invpcid_works1) { 3225 d.pcid = pcid | PMAP_PCID_USER_PT; 3226 d.pad = 0; 3227 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3228 invpcid(&d, INVPCID_ADDR); 3229 } else { 3230 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3231 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3232 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3233 } 3234 } 3235 3236 static void 3237 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3238 vm_offset_t eva) 3239 { 3240 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3241 } 3242 3243 static void 3244 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3245 vm_offset_t eva) 3246 { 3247 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3248 } 3249 3250 static void 3251 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3252 vm_offset_t eva __unused) 3253 { 3254 } 3255 3256 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3257 vm_offset_t)) 3258 { 3259 if (pmap_pcid_enabled) 3260 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : 3261 pmap_invalidate_range_pcid_noinvpcid_cb); 3262 return (pmap_invalidate_range_nopcid_cb); 3263 } 3264 3265 static void 3266 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3267 { 3268 vm_offset_t addr; 3269 3270 if (pmap == kernel_pmap) { 3271 if (PCPU_GET(pcid_invlpg_workaround)) { 3272 struct invpcid_descr d = { 0 }; 3273 3274 invpcid(&d, INVPCID_CTXGLOB); 3275 } else { 3276 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3277 invlpg(addr); 3278 } 3279 } else if (pmap == PCPU_GET(curpmap)) { 3280 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3281 invlpg(addr); 3282 pmap_invalidate_range_cb(pmap, sva, eva); 3283 } 3284 } 3285 3286 void 3287 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3288 { 3289 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3290 pmap_invalidate_all(pmap); 3291 return; 3292 } 3293 3294 if (pmap_type_guest(pmap)) { 3295 pmap_invalidate_ept(pmap); 3296 return; 3297 } 3298 3299 KASSERT(pmap->pm_type == PT_X86, 3300 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3301 3302 pmap_invalidate_preipi(pmap); 3303 smp_masked_invlpg_range(sva, eva, pmap, 3304 pmap_invalidate_range_curcpu_cb); 3305 } 3306 3307 static inline void 3308 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3309 { 3310 struct invpcid_descr d; 3311 uint64_t kcr3; 3312 uint32_t pcid; 3313 3314 if (pmap == kernel_pmap) { 3315 if (invpcid_works1) { 3316 bzero(&d, sizeof(d)); 3317 invpcid(&d, INVPCID_CTXGLOB); 3318 } else { 3319 invltlb_glob(); 3320 } 3321 } else if (pmap == PCPU_GET(curpmap)) { 3322 CRITICAL_ASSERT(curthread); 3323 3324 pcid = pmap_get_pcid(pmap); 3325 if (invpcid_works1) { 3326 d.pcid = pcid; 3327 d.pad = 0; 3328 d.addr = 0; 3329 invpcid(&d, INVPCID_CTX); 3330 } else { 3331 kcr3 = pmap->pm_cr3 | pcid; 3332 load_cr3(kcr3); 3333 } 3334 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3335 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3336 } 3337 } 3338 3339 static void 3340 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3341 { 3342 pmap_invalidate_all_pcid_cb(pmap, true); 3343 } 3344 3345 static void 3346 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3347 { 3348 pmap_invalidate_all_pcid_cb(pmap, false); 3349 } 3350 3351 static void 3352 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3353 { 3354 if (pmap == kernel_pmap) 3355 invltlb_glob(); 3356 else if (pmap == PCPU_GET(curpmap)) 3357 invltlb(); 3358 } 3359 3360 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3361 { 3362 if (pmap_pcid_enabled) 3363 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3364 pmap_invalidate_all_pcid_noinvpcid_cb); 3365 return (pmap_invalidate_all_nopcid_cb); 3366 } 3367 3368 static void 3369 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3370 vm_offset_t addr2 __unused) 3371 { 3372 pmap_invalidate_all_cb(pmap); 3373 } 3374 3375 void 3376 pmap_invalidate_all(pmap_t pmap) 3377 { 3378 if (pmap_type_guest(pmap)) { 3379 pmap_invalidate_ept(pmap); 3380 return; 3381 } 3382 3383 KASSERT(pmap->pm_type == PT_X86, 3384 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3385 3386 pmap_invalidate_preipi(pmap); 3387 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb); 3388 } 3389 3390 static void 3391 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3392 vm_offset_t addr2 __unused) 3393 { 3394 wbinvd(); 3395 } 3396 3397 void 3398 pmap_invalidate_cache(void) 3399 { 3400 sched_pin(); 3401 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3402 } 3403 3404 struct pde_action { 3405 cpuset_t invalidate; /* processors that invalidate their TLB */ 3406 pmap_t pmap; 3407 vm_offset_t va; 3408 pd_entry_t *pde; 3409 pd_entry_t newpde; 3410 u_int store; /* processor that updates the PDE */ 3411 }; 3412 3413 static void 3414 pmap_update_pde_action(void *arg) 3415 { 3416 struct pde_action *act = arg; 3417 3418 if (act->store == PCPU_GET(cpuid)) 3419 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3420 } 3421 3422 static void 3423 pmap_update_pde_teardown(void *arg) 3424 { 3425 struct pde_action *act = arg; 3426 3427 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3428 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3429 } 3430 3431 /* 3432 * Change the page size for the specified virtual address in a way that 3433 * prevents any possibility of the TLB ever having two entries that map the 3434 * same virtual address using different page sizes. This is the recommended 3435 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3436 * machine check exception for a TLB state that is improperly diagnosed as a 3437 * hardware error. 3438 */ 3439 static void 3440 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3441 { 3442 struct pde_action act; 3443 cpuset_t active, other_cpus; 3444 u_int cpuid; 3445 3446 sched_pin(); 3447 cpuid = PCPU_GET(cpuid); 3448 other_cpus = all_cpus; 3449 CPU_CLR(cpuid, &other_cpus); 3450 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3451 active = all_cpus; 3452 else { 3453 active = pmap->pm_active; 3454 } 3455 if (CPU_OVERLAP(&active, &other_cpus)) { 3456 act.store = cpuid; 3457 act.invalidate = active; 3458 act.va = va; 3459 act.pmap = pmap; 3460 act.pde = pde; 3461 act.newpde = newpde; 3462 CPU_SET(cpuid, &active); 3463 smp_rendezvous_cpus(active, 3464 smp_no_rendezvous_barrier, pmap_update_pde_action, 3465 pmap_update_pde_teardown, &act); 3466 } else { 3467 pmap_update_pde_store(pmap, pde, newpde); 3468 if (CPU_ISSET(cpuid, &active)) 3469 pmap_update_pde_invalidate(pmap, va, newpde); 3470 } 3471 sched_unpin(); 3472 } 3473 #else /* !SMP */ 3474 /* 3475 * Normal, non-SMP, invalidation functions. 
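 * With a single CPU there are no shootdown IPIs to send; only the local
 * TLB, and the per-pmap PCID generation when PCID is enabled, need to
 * be updated.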
3476 */ 3477 void 3478 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3479 { 3480 struct invpcid_descr d; 3481 struct pmap_pcid *pcidp; 3482 uint64_t kcr3, ucr3; 3483 uint32_t pcid; 3484 3485 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3486 pmap->pm_eptgen++; 3487 return; 3488 } 3489 KASSERT(pmap->pm_type == PT_X86, 3490 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3491 3492 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3493 invlpg(va); 3494 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3495 pmap->pm_ucr3 != PMAP_NO_CR3) { 3496 critical_enter(); 3497 pcid = pmap_get_pcid(pmap); 3498 if (invpcid_works) { 3499 d.pcid = pcid | PMAP_PCID_USER_PT; 3500 d.pad = 0; 3501 d.addr = va; 3502 invpcid(&d, INVPCID_ADDR); 3503 } else { 3504 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3505 ucr3 = pmap->pm_ucr3 | pcid | 3506 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3507 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3508 } 3509 critical_exit(); 3510 } 3511 } else if (pmap_pcid_enabled) { 3512 pcidp = zpcpu_get(pmap->pm_pcidp); 3513 pcidp->pm_gen = 0; 3514 } 3515 } 3516 3517 void 3518 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3519 { 3520 struct invpcid_descr d; 3521 struct pmap_pcid *pcidp; 3522 vm_offset_t addr; 3523 uint64_t kcr3, ucr3; 3524 uint32_t pcid; 3525 3526 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3527 pmap->pm_eptgen++; 3528 return; 3529 } 3530 KASSERT(pmap->pm_type == PT_X86, 3531 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3532 3533 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3534 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3535 invlpg(addr); 3536 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3537 pmap->pm_ucr3 != PMAP_NO_CR3) { 3538 critical_enter(); 3539 pcid = pmap_get_pcid(pmap); 3540 if (invpcid_works) { 3541 d.pcid = pcid | PMAP_PCID_USER_PT; 3542 d.pad = 0; 3543 d.addr = sva; 3544 for (; d.addr < eva; d.addr += PAGE_SIZE) 3545 invpcid(&d, INVPCID_ADDR); 3546 } else { 3547 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3548 ucr3 = pmap->pm_ucr3 | pcid | 3549 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3550 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3551 } 3552 critical_exit(); 3553 } 3554 } else if (pmap_pcid_enabled) { 3555 pcidp = zpcpu_get(pmap->pm_pcidp); 3556 pcidp->pm_gen = 0; 3557 } 3558 } 3559 3560 void 3561 pmap_invalidate_all(pmap_t pmap) 3562 { 3563 struct invpcid_descr d; 3564 struct pmap_pcid *pcidp; 3565 uint64_t kcr3, ucr3; 3566 uint32_t pcid; 3567 3568 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3569 pmap->pm_eptgen++; 3570 return; 3571 } 3572 KASSERT(pmap->pm_type == PT_X86, 3573 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3574 3575 if (pmap == kernel_pmap) { 3576 if (pmap_pcid_enabled && invpcid_works) { 3577 bzero(&d, sizeof(d)); 3578 invpcid(&d, INVPCID_CTXGLOB); 3579 } else { 3580 invltlb_glob(); 3581 } 3582 } else if (pmap == PCPU_GET(curpmap)) { 3583 if (pmap_pcid_enabled) { 3584 critical_enter(); 3585 pcid = pmap_get_pcid(pmap); 3586 if (invpcid_works) { 3587 d.pcid = pcid; 3588 d.pad = 0; 3589 d.addr = 0; 3590 invpcid(&d, INVPCID_CTX); 3591 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3592 d.pcid |= PMAP_PCID_USER_PT; 3593 invpcid(&d, INVPCID_CTX); 3594 } 3595 } else { 3596 kcr3 = pmap->pm_cr3 | pcid; 3597 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3598 ucr3 = pmap->pm_ucr3 | pcid | 3599 PMAP_PCID_USER_PT; 3600 pmap_pti_pcid_invalidate(ucr3, kcr3); 3601 } else 3602 load_cr3(kcr3); 3603 } 3604 critical_exit(); 3605 } else { 3606 invltlb(); 3607 
} 3608 } else if (pmap_pcid_enabled) { 3609 pcidp = zpcpu_get(pmap->pm_pcidp); 3610 pcidp->pm_gen = 0; 3611 } 3612 } 3613 3614 void 3615 pmap_invalidate_cache(void) 3616 { 3617 3618 wbinvd(); 3619 } 3620 3621 static void 3622 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3623 { 3624 struct pmap_pcid *pcidp; 3625 3626 pmap_update_pde_store(pmap, pde, newpde); 3627 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3628 pmap_update_pde_invalidate(pmap, va, newpde); 3629 else { 3630 pcidp = zpcpu_get(pmap->pm_pcidp); 3631 pcidp->pm_gen = 0; 3632 } 3633 } 3634 #endif /* !SMP */ 3635 3636 static void 3637 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3638 { 3639 3640 /* 3641 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3642 * by a promotion that did not invalidate the 512 4KB page mappings 3643 * that might exist in the TLB. Consequently, at this point, the TLB 3644 * may hold both 4KB and 2MB page mappings for the address range [va, 3645 * va + NBPDR). Therefore, the entire range must be invalidated here. 3646 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3647 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3648 * single INVLPG suffices to invalidate the 2MB page mapping from the 3649 * TLB. 3650 */ 3651 if ((pde & PG_PROMOTED) != 0) 3652 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3653 else 3654 pmap_invalidate_page(pmap, va); 3655 } 3656 3657 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3658 (vm_offset_t sva, vm_offset_t eva)) 3659 { 3660 3661 if ((cpu_feature & CPUID_SS) != 0) 3662 return (pmap_invalidate_cache_range_selfsnoop); 3663 if ((cpu_feature & CPUID_CLFSH) != 0) 3664 return (pmap_force_invalidate_cache_range); 3665 return (pmap_invalidate_cache_range_all); 3666 } 3667 3668 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3669 3670 static void 3671 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3672 { 3673 3674 KASSERT((sva & PAGE_MASK) == 0, 3675 ("pmap_invalidate_cache_range: sva not page-aligned")); 3676 KASSERT((eva & PAGE_MASK) == 0, 3677 ("pmap_invalidate_cache_range: eva not page-aligned")); 3678 } 3679 3680 static void 3681 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3682 { 3683 3684 pmap_invalidate_cache_range_check_align(sva, eva); 3685 } 3686 3687 void 3688 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3689 { 3690 3691 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3692 3693 /* 3694 * XXX: Some CPUs fault, hang, or trash the local APIC 3695 * registers if we use CLFLUSH on the local APIC range. The 3696 * local APIC is always uncached, so we don't need to flush 3697 * for that range anyway. 3698 */ 3699 if (pmap_kextract(sva) == lapic_paddr) 3700 return; 3701 3702 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3703 /* 3704 * Do per-cache line flush. Use a locked 3705 * instruction to insure that previous stores are 3706 * included in the write-back. The processor 3707 * propagates flush to other processors in the cache 3708 * coherence domain. 3709 */ 3710 atomic_thread_fence_seq_cst(); 3711 for (; sva < eva; sva += cpu_clflush_line_size) 3712 clflushopt(sva); 3713 atomic_thread_fence_seq_cst(); 3714 } else { 3715 /* 3716 * Writes are ordered by CLFLUSH on Intel CPUs. 
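		 * On other vendors CLFLUSH provides no such ordering, so the
		 * flush loop below is bracketed with MFENCE.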
3717 */ 3718 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3719 mfence(); 3720 for (; sva < eva; sva += cpu_clflush_line_size) 3721 clflush(sva); 3722 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3723 mfence(); 3724 } 3725 } 3726 3727 static void 3728 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3729 { 3730 3731 pmap_invalidate_cache_range_check_align(sva, eva); 3732 pmap_invalidate_cache(); 3733 } 3734 3735 /* 3736 * Remove the specified set of pages from the data and instruction caches. 3737 * 3738 * In contrast to pmap_invalidate_cache_range(), this function does not 3739 * rely on the CPU's self-snoop feature, because it is intended for use 3740 * when moving pages into a different cache domain. 3741 */ 3742 void 3743 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3744 { 3745 vm_offset_t daddr, eva; 3746 int i; 3747 bool useclflushopt; 3748 3749 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3750 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3751 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3752 pmap_invalidate_cache(); 3753 else { 3754 if (useclflushopt) 3755 atomic_thread_fence_seq_cst(); 3756 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3757 mfence(); 3758 for (i = 0; i < count; i++) { 3759 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3760 eva = daddr + PAGE_SIZE; 3761 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3762 if (useclflushopt) 3763 clflushopt(daddr); 3764 else 3765 clflush(daddr); 3766 } 3767 } 3768 if (useclflushopt) 3769 atomic_thread_fence_seq_cst(); 3770 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3771 mfence(); 3772 } 3773 } 3774 3775 void 3776 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3777 { 3778 3779 pmap_invalidate_cache_range_check_align(sva, eva); 3780 3781 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3782 pmap_force_invalidate_cache_range(sva, eva); 3783 return; 3784 } 3785 3786 /* See comment in pmap_force_invalidate_cache_range(). */ 3787 if (pmap_kextract(sva) == lapic_paddr) 3788 return; 3789 3790 atomic_thread_fence_seq_cst(); 3791 for (; sva < eva; sva += cpu_clflush_line_size) 3792 clwb(sva); 3793 atomic_thread_fence_seq_cst(); 3794 } 3795 3796 void 3797 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3798 { 3799 pt_entry_t *pte; 3800 vm_offset_t vaddr; 3801 int error __diagused; 3802 int pte_bits; 3803 3804 KASSERT((spa & PAGE_MASK) == 0, 3805 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3806 KASSERT((epa & PAGE_MASK) == 0, 3807 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3808 3809 if (spa < dmaplimit) { 3810 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3811 dmaplimit, epa))); 3812 if (dmaplimit >= epa) 3813 return; 3814 spa = dmaplimit; 3815 } 3816 3817 pte_bits = pmap_cache_bits(kernel_pmap, mattr, false) | X86_PG_RW | 3818 X86_PG_V; 3819 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3820 &vaddr); 3821 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3822 pte = vtopte(vaddr); 3823 for (; spa < epa; spa += PAGE_SIZE) { 3824 sched_pin(); 3825 pte_store(pte, spa | pte_bits); 3826 pmap_invlpg(kernel_pmap, vaddr); 3827 /* XXXKIB atomic inside flush_cache_range are excessive */ 3828 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3829 sched_unpin(); 3830 } 3831 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3832 } 3833 3834 /* 3835 * Routine: pmap_extract 3836 * Function: 3837 * Extract the physical page address associated 3838 * with the given map/virtual_address pair. 
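 *	The walk handles 1GB, 2MB, and 4KB mappings; 0 is returned when no
 *	valid mapping exists at "va".  Illustrative use (hypothetical
 *	caller, not part of this file):
 *
 *		pa = pmap_extract(kernel_pmap, va);
 *		if (pa == 0)
 *			... handle the missing mapping ...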
3839 */ 3840 vm_paddr_t 3841 pmap_extract(pmap_t pmap, vm_offset_t va) 3842 { 3843 pdp_entry_t *pdpe; 3844 pd_entry_t *pde; 3845 pt_entry_t *pte, PG_V; 3846 vm_paddr_t pa; 3847 3848 pa = 0; 3849 PG_V = pmap_valid_bit(pmap); 3850 PMAP_LOCK(pmap); 3851 pdpe = pmap_pdpe(pmap, va); 3852 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3853 if ((*pdpe & PG_PS) != 0) 3854 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3855 else { 3856 pde = pmap_pdpe_to_pde(pdpe, va); 3857 if ((*pde & PG_V) != 0) { 3858 if ((*pde & PG_PS) != 0) { 3859 pa = (*pde & PG_PS_FRAME) | 3860 (va & PDRMASK); 3861 } else { 3862 pte = pmap_pde_to_pte(pde, va); 3863 pa = (*pte & PG_FRAME) | 3864 (va & PAGE_MASK); 3865 } 3866 } 3867 } 3868 } 3869 PMAP_UNLOCK(pmap); 3870 return (pa); 3871 } 3872 3873 /* 3874 * Routine: pmap_extract_and_hold 3875 * Function: 3876 * Atomically extract and hold the physical page 3877 * with the given pmap and virtual address pair 3878 * if that mapping permits the given protection. 3879 */ 3880 vm_page_t 3881 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3882 { 3883 pdp_entry_t pdpe, *pdpep; 3884 pd_entry_t pde, *pdep; 3885 pt_entry_t pte, PG_RW, PG_V; 3886 vm_page_t m; 3887 3888 m = NULL; 3889 PG_RW = pmap_rw_bit(pmap); 3890 PG_V = pmap_valid_bit(pmap); 3891 PMAP_LOCK(pmap); 3892 3893 pdpep = pmap_pdpe(pmap, va); 3894 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3895 goto out; 3896 if ((pdpe & PG_PS) != 0) { 3897 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3898 goto out; 3899 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3900 goto check_page; 3901 } 3902 3903 pdep = pmap_pdpe_to_pde(pdpep, va); 3904 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3905 goto out; 3906 if ((pde & PG_PS) != 0) { 3907 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3908 goto out; 3909 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3910 goto check_page; 3911 } 3912 3913 pte = *pmap_pde_to_pte(pdep, va); 3914 if ((pte & PG_V) == 0 || 3915 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3916 goto out; 3917 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3918 3919 check_page: 3920 if (m != NULL && !vm_page_wire_mapped(m)) 3921 m = NULL; 3922 out: 3923 PMAP_UNLOCK(pmap); 3924 return (m); 3925 } 3926 3927 /* 3928 * Routine: pmap_kextract 3929 * Function: 3930 * Extract the physical page address associated with the given kernel 3931 * virtual address. 3932 */ 3933 vm_paddr_t 3934 pmap_kextract(vm_offset_t va) 3935 { 3936 pd_entry_t pde; 3937 vm_paddr_t pa; 3938 3939 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3940 pa = DMAP_TO_PHYS(va); 3941 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3942 pa = pmap_large_map_kextract(va); 3943 } else { 3944 pde = *vtopde(va); 3945 if (pde & PG_PS) { 3946 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3947 } else { 3948 /* 3949 * Beware of a concurrent promotion that changes the 3950 * PDE at this point! For example, vtopte() must not 3951 * be used to access the PTE because it would use the 3952 * new PDE. It is, however, safe to use the old PDE 3953 * because the page table page is preserved by the 3954 * promotion. 3955 */ 3956 pa = *pmap_pde_to_pte(&pde, va); 3957 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3958 } 3959 } 3960 return (pa); 3961 } 3962 3963 /*************************************************** 3964 * Low level mapping routines..... 3965 ***************************************************/ 3966 3967 /* 3968 * Add a wired page to the kva. 3969 * Note: not SMP coherent. 
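 * No TLB invalidation is performed here; if "va" may be cached in any
 * TLB, the caller must invalidate it explicitly, e.g. with
 * pmap_invalidate_page(kernel_pmap, va).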
3970 */ 3971 void 3972 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3973 { 3974 pt_entry_t *pte; 3975 3976 pte = vtopte(va); 3977 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3978 X86_PG_RW | X86_PG_V); 3979 } 3980 3981 static __inline void 3982 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3983 { 3984 pt_entry_t *pte; 3985 int cache_bits; 3986 3987 pte = vtopte(va); 3988 cache_bits = pmap_cache_bits(kernel_pmap, mode, false); 3989 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3990 X86_PG_RW | X86_PG_V | cache_bits); 3991 } 3992 3993 /* 3994 * Remove a page from the kernel pagetables. 3995 * Note: not SMP coherent. 3996 */ 3997 void 3998 pmap_kremove(vm_offset_t va) 3999 { 4000 pt_entry_t *pte; 4001 4002 pte = vtopte(va); 4003 pte_clear(pte); 4004 } 4005 4006 /* 4007 * Used to map a range of physical addresses into kernel 4008 * virtual address space. 4009 * 4010 * The value passed in '*virt' is a suggested virtual address for 4011 * the mapping. Architectures which can support a direct-mapped 4012 * physical to virtual region can return the appropriate address 4013 * within that region, leaving '*virt' unchanged. Other 4014 * architectures should map the pages starting at '*virt' and 4015 * update '*virt' with the first usable address after the mapped 4016 * region. 4017 */ 4018 vm_offset_t 4019 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 4020 { 4021 return PHYS_TO_DMAP(start); 4022 } 4023 4024 /* 4025 * Add a list of wired pages to the kva 4026 * this routine is only used for temporary 4027 * kernel mappings that do not need to have 4028 * page modification or references recorded. 4029 * Note that old mappings are simply written 4030 * over. The page *must* be wired. 4031 * Note: SMP coherent. Uses a ranged shootdown IPI. 4032 */ 4033 void 4034 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4035 { 4036 pt_entry_t *endpte, oldpte, pa, *pte; 4037 vm_page_t m; 4038 int cache_bits; 4039 4040 oldpte = 0; 4041 pte = vtopte(sva); 4042 endpte = pte + count; 4043 while (pte < endpte) { 4044 m = *ma++; 4045 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, false); 4046 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 4047 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 4048 oldpte |= *pte; 4049 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | 4050 X86_PG_M | X86_PG_RW | X86_PG_V); 4051 } 4052 pte++; 4053 } 4054 if (__predict_false((oldpte & X86_PG_V) != 0)) 4055 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4056 PAGE_SIZE); 4057 } 4058 4059 /* 4060 * This routine tears out page mappings from the 4061 * kernel -- it is meant only for temporary mappings. 4062 * Note: SMP coherent. Uses a ranged shootdown IPI. 4063 */ 4064 void 4065 pmap_qremove(vm_offset_t sva, int count) 4066 { 4067 vm_offset_t va; 4068 4069 va = sva; 4070 while (count-- > 0) { 4071 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 4072 pmap_kremove(va); 4073 va += PAGE_SIZE; 4074 } 4075 pmap_invalidate_range(kernel_pmap, sva, va); 4076 } 4077 4078 /*************************************************** 4079 * Page table page management routines..... 4080 ***************************************************/ 4081 /* 4082 * Schedule the specified unused page table page to be freed. Specifically, 4083 * add the page to the specified list of pages that will be released to the 4084 * physical memory manager after the TLB has been updated. 
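 * An illustrative caller pattern (cf. pmap_abort_ptp() below):
 *
 *	struct spglist free;
 *
 *	SLIST_INIT(&free);
 *	... unwire page table pages, collecting them on "free" ...
 *	pmap_invalidate_page(pmap, va);
 *	vm_page_free_pages_toq(&free, true);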
4085 */ 4086 static __inline void 4087 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 4088 { 4089 4090 if (set_PG_ZERO) 4091 m->flags |= PG_ZERO; 4092 else 4093 m->flags &= ~PG_ZERO; 4094 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4095 } 4096 4097 /* 4098 * Inserts the specified page table page into the specified pmap's collection 4099 * of idle page table pages. Each of a pmap's page table pages is responsible 4100 * for mapping a distinct range of virtual addresses. The pmap's collection is 4101 * ordered by this virtual address range. 4102 * 4103 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4104 * "mpte"'s valid field will be set to 0. 4105 * 4106 * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must 4107 * contain valid mappings with identical attributes except for PG_A; "mpte"'s 4108 * valid field will be set to 1. 4109 * 4110 * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain 4111 * valid mappings with identical attributes including PG_A; "mpte"'s valid 4112 * field will be set to VM_PAGE_BITS_ALL. 4113 */ 4114 static __inline int 4115 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4116 bool allpte_PG_A_set) 4117 { 4118 4119 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4120 KASSERT(promoted || !allpte_PG_A_set, 4121 ("a zero-filled PTP can't have PG_A set in every PTE")); 4122 mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 4123 return (vm_radix_insert(&pmap->pm_root, mpte)); 4124 } 4125 4126 /* 4127 * Removes the page table page mapping the specified virtual address from the 4128 * specified pmap's collection of idle page table pages, and returns it. 4129 * Otherwise, returns NULL if there is no page table page corresponding to the 4130 * specified virtual address. 4131 */ 4132 static __inline vm_page_t 4133 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4134 { 4135 4136 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4137 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4138 } 4139 4140 /* 4141 * Decrements a page table page's reference count, which is used to record the 4142 * number of valid page table entries within the page. If the reference count 4143 * drops to zero, then the page table page is unmapped. Returns true if the 4144 * page table page was unmapped and false otherwise. 
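 * The caller must hold the pmap lock and must invalidate the TLB and
 * paging-structure caches before freeing the pages collected in "free".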
4145 */ 4146 static inline bool 4147 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4148 { 4149 4150 --m->ref_count; 4151 if (m->ref_count == 0) { 4152 _pmap_unwire_ptp(pmap, va, m, free); 4153 return (true); 4154 } else 4155 return (false); 4156 } 4157 4158 static void 4159 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4160 { 4161 pml5_entry_t *pml5; 4162 pml4_entry_t *pml4; 4163 pdp_entry_t *pdp; 4164 pd_entry_t *pd; 4165 vm_page_t pdpg, pdppg, pml4pg; 4166 4167 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4168 4169 /* 4170 * unmap the page table page 4171 */ 4172 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4173 /* PML4 page */ 4174 MPASS(pmap_is_la57(pmap)); 4175 pml5 = pmap_pml5e(pmap, va); 4176 *pml5 = 0; 4177 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4178 pml5 = pmap_pml5e_u(pmap, va); 4179 *pml5 = 0; 4180 } 4181 } else if (m->pindex >= NUPDE + NUPDPE) { 4182 /* PDP page */ 4183 pml4 = pmap_pml4e(pmap, va); 4184 *pml4 = 0; 4185 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4186 va <= VM_MAXUSER_ADDRESS) { 4187 pml4 = pmap_pml4e_u(pmap, va); 4188 *pml4 = 0; 4189 } 4190 } else if (m->pindex >= NUPDE) { 4191 /* PD page */ 4192 pdp = pmap_pdpe(pmap, va); 4193 *pdp = 0; 4194 } else { 4195 /* PTE page */ 4196 pd = pmap_pde(pmap, va); 4197 *pd = 0; 4198 } 4199 if (m->pindex < NUPDE) { 4200 /* We just released a PT, unhold the matching PD */ 4201 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4202 pmap_unwire_ptp(pmap, va, pdpg, free); 4203 } else if (m->pindex < NUPDE + NUPDPE) { 4204 /* We just released a PD, unhold the matching PDP */ 4205 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4206 pmap_unwire_ptp(pmap, va, pdppg, free); 4207 } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4208 /* We just released a PDP, unhold the matching PML4 */ 4209 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4210 pmap_unwire_ptp(pmap, va, pml4pg, free); 4211 } 4212 4213 pmap_pt_page_count_adj(pmap, -1); 4214 4215 /* 4216 * Put page on a list so that it is released after 4217 * *ALL* TLB shootdown is done 4218 */ 4219 pmap_add_delayed_free_list(m, free, true); 4220 } 4221 4222 /* 4223 * After removing a page table entry, this routine is used to 4224 * conditionally free the page, and manage the reference count. 4225 */ 4226 static int 4227 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4228 struct spglist *free) 4229 { 4230 vm_page_t mpte; 4231 4232 if (va >= VM_MAXUSER_ADDRESS) 4233 return (0); 4234 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4235 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4236 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4237 } 4238 4239 /* 4240 * Release a page table page reference after a failed attempt to create a 4241 * mapping. 4242 */ 4243 static void 4244 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4245 { 4246 struct spglist free; 4247 4248 SLIST_INIT(&free); 4249 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4250 /* 4251 * Although "va" was never mapped, paging-structure caches 4252 * could nonetheless have entries that refer to the freed 4253 * page table pages. Invalidate those entries. 
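		 * On x86 a single-address invalidation also flushes the
		 * paging-structure caches for the affected PCID, so the
		 * call below suffices.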
4254 */ 4255 pmap_invalidate_page(pmap, va); 4256 vm_page_free_pages_toq(&free, true); 4257 } 4258 } 4259 4260 static void 4261 pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen) 4262 { 4263 struct pmap_pcid *pcidp; 4264 int i; 4265 4266 CPU_FOREACH(i) { 4267 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 4268 pcidp->pm_pcid = pcid; 4269 pcidp->pm_gen = gen; 4270 } 4271 } 4272 4273 void 4274 pmap_pinit0(pmap_t pmap) 4275 { 4276 struct proc *p; 4277 struct thread *td; 4278 4279 PMAP_LOCK_INIT(pmap); 4280 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4281 pmap->pm_pmltopu = NULL; 4282 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4283 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4284 pmap->pm_ucr3 = PMAP_NO_CR3; 4285 vm_radix_init(&pmap->pm_root); 4286 CPU_ZERO(&pmap->pm_active); 4287 TAILQ_INIT(&pmap->pm_pvchunk); 4288 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4289 pmap->pm_flags = pmap_flags; 4290 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK); 4291 pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1); 4292 pmap_activate_boot(pmap); 4293 td = curthread; 4294 if (pti) { 4295 p = td->td_proc; 4296 PROC_LOCK(p); 4297 p->p_md.md_flags |= P_MD_KPTI; 4298 PROC_UNLOCK(p); 4299 } 4300 pmap_thread_init_invl_gen(td); 4301 4302 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4303 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4304 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4305 UMA_ALIGN_PTR, 0); 4306 } 4307 } 4308 4309 void 4310 pmap_pinit_pml4(vm_page_t pml4pg) 4311 { 4312 pml4_entry_t *pm_pml4; 4313 int i; 4314 4315 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4316 4317 /* Wire in kernel global address entries. */ 4318 for (i = 0; i < NKPML4E; i++) { 4319 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4320 X86_PG_V; 4321 } 4322 #ifdef KASAN 4323 for (i = 0; i < NKASANPML4E; i++) { 4324 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4325 X86_PG_V | pg_nx; 4326 } 4327 #endif 4328 #ifdef KMSAN 4329 for (i = 0; i < NKMSANSHADPML4E; i++) { 4330 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4331 X86_PG_RW | X86_PG_V | pg_nx; 4332 } 4333 for (i = 0; i < NKMSANORIGPML4E; i++) { 4334 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4335 X86_PG_RW | X86_PG_V | pg_nx; 4336 } 4337 #endif 4338 for (i = 0; i < ndmpdpphys; i++) { 4339 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4340 X86_PG_V; 4341 } 4342 4343 /* install self-referential address mapping entry(s) */ 4344 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4345 X86_PG_A | X86_PG_M; 4346 4347 /* install large map entries if configured */ 4348 for (i = 0; i < lm_ents; i++) 4349 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4350 } 4351 4352 void 4353 pmap_pinit_pml5(vm_page_t pml5pg) 4354 { 4355 pml5_entry_t *pm_pml5; 4356 4357 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4358 4359 /* 4360 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4361 * entering all existing kernel mappings into level 5 table. 4362 */ 4363 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4364 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4365 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false); 4366 4367 /* 4368 * Install self-referential address mapping entry. 
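	 * As with the PML4PML4I slot in the 4-level case, this recursive
	 * entry lets the pmap address its own paging structures through a
	 * fixed virtual window.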
4369 */ 4370 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4371 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4372 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false); 4373 } 4374 4375 static void 4376 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4377 { 4378 pml4_entry_t *pm_pml4u; 4379 int i; 4380 4381 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4382 for (i = 0; i < NPML4EPG; i++) 4383 pm_pml4u[i] = pti_pml4[i]; 4384 } 4385 4386 static void 4387 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4388 { 4389 pml5_entry_t *pm_pml5u; 4390 4391 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4392 pagezero(pm_pml5u); 4393 4394 /* 4395 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4396 * table, entering all kernel mappings needed for usermode 4397 * into level 5 table. 4398 */ 4399 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4400 pmap_kextract((vm_offset_t)pti_pml4) | 4401 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4402 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false); 4403 } 4404 4405 /* Allocate a page table page and do related bookkeeping */ 4406 static vm_page_t 4407 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4408 { 4409 vm_page_t m; 4410 4411 m = vm_page_alloc_noobj(flags); 4412 if (__predict_false(m == NULL)) 4413 return (NULL); 4414 m->pindex = pindex; 4415 pmap_pt_page_count_adj(pmap, 1); 4416 return (m); 4417 } 4418 4419 static void 4420 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4421 { 4422 /* 4423 * This function assumes the page will need to be unwired, 4424 * even though the counterpart allocation in pmap_alloc_pt_page() 4425 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4426 * of pmap_free_pt_page() require unwiring. The case in which 4427 * a PT page doesn't require unwiring because its ref_count has 4428 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4429 */ 4430 vm_page_unwire_noq(m); 4431 if (zerofilled) 4432 vm_page_free_zero(m); 4433 else 4434 vm_page_free(m); 4435 4436 pmap_pt_page_count_adj(pmap, -1); 4437 } 4438 4439 _Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp"); 4440 4441 /* 4442 * Initialize a preallocated and zeroed pmap structure, 4443 * such as one in a vmspace structure. 4444 */ 4445 int 4446 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4447 { 4448 vm_page_t pmltop_pg, pmltop_pgu; 4449 vm_paddr_t pmltop_phys; 4450 4451 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4452 4453 /* 4454 * Allocate the page directory page. Pass NULL instead of a 4455 * pointer to the pmap here to avoid calling 4456 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4457 * since that requires pmap lock. Instead do the accounting 4458 * manually. 4459 * 4460 * Note that final call to pmap_remove() optimization that 4461 * checks for zero resident_count is basically disabled by 4462 * accounting for top-level page. But the optimization was 4463 * not effective since we started using non-managed mapping of 4464 * the shared page. 
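	 * (The manual accounting mentioned above is performed by the
	 * pmap_pt_page_count_pinit() calls below.)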
4465 */ 4466 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4467 VM_ALLOC_WAITOK); 4468 pmap_pt_page_count_pinit(pmap, 1); 4469 4470 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4471 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4472 4473 if (pmap_pcid_enabled) { 4474 if (pmap->pm_pcidp == NULL) 4475 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, 4476 M_WAITOK); 4477 pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0); 4478 } 4479 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4480 pmap->pm_ucr3 = PMAP_NO_CR3; 4481 pmap->pm_pmltopu = NULL; 4482 4483 pmap->pm_type = pm_type; 4484 4485 /* 4486 * Do not install the host kernel mappings in the nested page 4487 * tables. These mappings are meaningless in the guest physical 4488 * address space. 4489 * Install minimal kernel mappings in PTI case. 4490 */ 4491 switch (pm_type) { 4492 case PT_X86: 4493 pmap->pm_cr3 = pmltop_phys; 4494 if (pmap_is_la57(pmap)) 4495 pmap_pinit_pml5(pmltop_pg); 4496 else 4497 pmap_pinit_pml4(pmltop_pg); 4498 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4499 /* 4500 * As with pmltop_pg, pass NULL instead of a 4501 * pointer to the pmap to ensure that the PTI 4502 * page counted explicitly. 4503 */ 4504 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4505 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4506 pmap_pt_page_count_pinit(pmap, 1); 4507 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4508 VM_PAGE_TO_PHYS(pmltop_pgu)); 4509 if (pmap_is_la57(pmap)) 4510 pmap_pinit_pml5_pti(pmltop_pgu); 4511 else 4512 pmap_pinit_pml4_pti(pmltop_pgu); 4513 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4514 } 4515 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4516 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4517 pkru_free_range, pmap, M_NOWAIT); 4518 } 4519 break; 4520 case PT_EPT: 4521 case PT_RVI: 4522 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4523 break; 4524 } 4525 4526 vm_radix_init(&pmap->pm_root); 4527 CPU_ZERO(&pmap->pm_active); 4528 TAILQ_INIT(&pmap->pm_pvchunk); 4529 pmap->pm_flags = flags; 4530 pmap->pm_eptgen = 0; 4531 4532 return (1); 4533 } 4534 4535 int 4536 pmap_pinit(pmap_t pmap) 4537 { 4538 4539 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4540 } 4541 4542 static void 4543 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4544 { 4545 vm_page_t mpg; 4546 struct spglist free; 4547 4548 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4549 if (mpg->ref_count != 0) 4550 return; 4551 SLIST_INIT(&free); 4552 _pmap_unwire_ptp(pmap, va, mpg, &free); 4553 pmap_invalidate_page(pmap, va); 4554 vm_page_free_pages_toq(&free, true); 4555 } 4556 4557 static pml4_entry_t * 4558 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4559 bool addref) 4560 { 4561 vm_pindex_t pml5index; 4562 pml5_entry_t *pml5; 4563 pml4_entry_t *pml4; 4564 vm_page_t pml4pg; 4565 pt_entry_t PG_V; 4566 bool allocated; 4567 4568 if (!pmap_is_la57(pmap)) 4569 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4570 4571 PG_V = pmap_valid_bit(pmap); 4572 pml5index = pmap_pml5e_index(va); 4573 pml5 = &pmap->pm_pmltop[pml5index]; 4574 if ((*pml5 & PG_V) == 0) { 4575 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4576 va) == NULL) 4577 return (NULL); 4578 allocated = true; 4579 } else { 4580 allocated = false; 4581 } 4582 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4583 pml4 = &pml4[pmap_pml4e_index(va)]; 4584 if ((*pml4 & PG_V) == 0) { 4585 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4586 if (allocated && !addref) 4587 pml4pg->ref_count--; 4588 else if 
(!allocated && addref) 4589 pml4pg->ref_count++; 4590 } 4591 return (pml4); 4592 } 4593 4594 static pdp_entry_t * 4595 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4596 bool addref) 4597 { 4598 vm_page_t pdppg; 4599 pml4_entry_t *pml4; 4600 pdp_entry_t *pdp; 4601 pt_entry_t PG_V; 4602 bool allocated; 4603 4604 PG_V = pmap_valid_bit(pmap); 4605 4606 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); 4607 if (pml4 == NULL) 4608 return (NULL); 4609 4610 if ((*pml4 & PG_V) == 0) { 4611 /* Have to allocate a new pdp, recurse */ 4612 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, 4613 va) == NULL) { 4614 if (pmap_is_la57(pmap)) 4615 pmap_allocpte_free_unref(pmap, va, 4616 pmap_pml5e(pmap, va)); 4617 return (NULL); 4618 } 4619 allocated = true; 4620 } else { 4621 allocated = false; 4622 } 4623 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 4624 pdp = &pdp[pmap_pdpe_index(va)]; 4625 if ((*pdp & PG_V) == 0) { 4626 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 4627 if (allocated && !addref) 4628 pdppg->ref_count--; 4629 else if (!allocated && addref) 4630 pdppg->ref_count++; 4631 } 4632 return (pdp); 4633 } 4634 4635 /* 4636 * The ptepindexes, i.e. page indices, of the page table pages encountered 4637 * while translating virtual address va are defined as follows: 4638 * - for the page table page (last level), 4639 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, 4640 * in other words, it is just the index of the PDE that maps the page 4641 * table page. 4642 * - for the page directory page, 4643 * ptepindex = NUPDE (number of userland PD entries) + 4644 * (pmap_pde_index(va) >> NPDEPGSHIFT) 4645 * i.e. index of PDPE is put after the last index of PDE, 4646 * - for the page directory pointer page, 4647 * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + 4648 * NPML4EPGSHIFT), 4649 * i.e. index of pml4e is put after the last index of PDPE, 4650 * - for the PML4 page (if LA57 mode is enabled), 4651 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> 4652 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT), 4653 * i.e. index of pml5e is put after the last index of PML4E. 4654 * 4655 * Define an order on the paging entries, where all entries of the 4656 * same height are put together, then heights are put from deepest to 4657 * root. Then ptexpindex is the sequential number of the 4658 * corresponding paging entry in this order. 4659 * 4660 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of 4661 * LA57 paging structures even in LA48 paging mode. Moreover, the 4662 * ptepindexes are calculated as if the paging structures were 5-level 4663 * regardless of the actual mode of operation. 4664 * 4665 * The root page at PML4/PML5 does not participate in this indexing scheme, 4666 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). 4667 */ 4668 static vm_page_t 4669 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4670 vm_offset_t va) 4671 { 4672 vm_pindex_t pml5index, pml4index; 4673 pml5_entry_t *pml5, *pml5u; 4674 pml4_entry_t *pml4, *pml4u; 4675 pdp_entry_t *pdp; 4676 pd_entry_t *pd; 4677 vm_page_t m, pdpg; 4678 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4679 4680 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4681 4682 PG_A = pmap_accessed_bit(pmap); 4683 PG_M = pmap_modified_bit(pmap); 4684 PG_V = pmap_valid_bit(pmap); 4685 PG_RW = pmap_rw_bit(pmap); 4686 4687 /* 4688 * Allocate a page table page. 
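	 * The page is wired and zero-filled.  Per the indexing scheme
	 * described above, the ptepindex comparisons below determine
	 * whether this is a PT, PD, PDP, or (LA57 only) PML4 page.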
4689 */ 4690 m = pmap_alloc_pt_page(pmap, ptepindex, 4691 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4692 if (m == NULL) 4693 return (NULL); 4694 4695 /* 4696 * Map the pagetable page into the process address space, if 4697 * it isn't already there. 4698 */ 4699 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4700 MPASS(pmap_is_la57(pmap)); 4701 4702 pml5index = pmap_pml5e_index(va); 4703 pml5 = &pmap->pm_pmltop[pml5index]; 4704 KASSERT((*pml5 & PG_V) == 0, 4705 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4706 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4707 4708 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4709 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4710 *pml5 |= pg_nx; 4711 4712 pml5u = &pmap->pm_pmltopu[pml5index]; 4713 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4714 PG_A | PG_M; 4715 } 4716 } else if (ptepindex >= NUPDE + NUPDPE) { 4717 pml4index = pmap_pml4e_index(va); 4718 /* Wire up a new PDPE page */ 4719 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4720 if (pml4 == NULL) { 4721 pmap_free_pt_page(pmap, m, true); 4722 return (NULL); 4723 } 4724 KASSERT((*pml4 & PG_V) == 0, 4725 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4726 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4727 4728 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4729 pml4index < NUPML4E) { 4730 /* 4731 * PTI: Make all user-space mappings in the 4732 * kernel-mode page table no-execute so that 4733 * we detect any programming errors that leave 4734 * the kernel-mode page table active on return 4735 * to user space. 4736 */ 4737 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4738 *pml4 |= pg_nx; 4739 4740 pml4u = &pmap->pm_pmltopu[pml4index]; 4741 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4742 PG_A | PG_M; 4743 } 4744 } else if (ptepindex >= NUPDE) { 4745 /* Wire up a new PDE page */ 4746 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4747 if (pdp == NULL) { 4748 pmap_free_pt_page(pmap, m, true); 4749 return (NULL); 4750 } 4751 KASSERT((*pdp & PG_V) == 0, 4752 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4753 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4754 } else { 4755 /* Wire up a new PTE page */ 4756 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4757 if (pdp == NULL) { 4758 pmap_free_pt_page(pmap, m, true); 4759 return (NULL); 4760 } 4761 if ((*pdp & PG_V) == 0) { 4762 /* Have to allocate a new pd, recurse */ 4763 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4764 lockp, va) == NULL) { 4765 pmap_allocpte_free_unref(pmap, va, 4766 pmap_pml4e(pmap, va)); 4767 pmap_free_pt_page(pmap, m, true); 4768 return (NULL); 4769 } 4770 } else { 4771 /* Add reference to the pd page */ 4772 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4773 pdpg->ref_count++; 4774 } 4775 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4776 4777 /* Now we know where the page directory page is */ 4778 pd = &pd[pmap_pde_index(va)]; 4779 KASSERT((*pd & PG_V) == 0, 4780 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4781 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4782 } 4783 4784 return (m); 4785 } 4786 4787 /* 4788 * This routine is called if the desired page table page does not exist. 4789 * 4790 * If page table page allocation fails, this routine may sleep before 4791 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4792 * occurs right before returning to the caller. This way, we never 4793 * drop pmap lock to sleep while a page table page has ref_count == 0, 4794 * which prevents the page from being freed under us. 
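 * When a lock pointer was given and NULL is returned, the pmap lock has
 * been dropped and reacquired, so the caller must revalidate its lookup
 * and retry.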
4795 */ 4796 static vm_page_t 4797 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4798 vm_offset_t va) 4799 { 4800 vm_page_t m; 4801 4802 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4803 if (m == NULL && lockp != NULL) { 4804 RELEASE_PV_LIST_LOCK(lockp); 4805 PMAP_UNLOCK(pmap); 4806 PMAP_ASSERT_NOT_IN_DI(); 4807 vm_wait(NULL); 4808 PMAP_LOCK(pmap); 4809 } 4810 return (m); 4811 } 4812 4813 static pd_entry_t * 4814 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4815 struct rwlock **lockp) 4816 { 4817 pdp_entry_t *pdpe, PG_V; 4818 pd_entry_t *pde; 4819 vm_page_t pdpg; 4820 vm_pindex_t pdpindex; 4821 4822 PG_V = pmap_valid_bit(pmap); 4823 4824 retry: 4825 pdpe = pmap_pdpe(pmap, va); 4826 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4827 pde = pmap_pdpe_to_pde(pdpe, va); 4828 if (va < VM_MAXUSER_ADDRESS) { 4829 /* Add a reference to the pd page. */ 4830 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4831 pdpg->ref_count++; 4832 } else 4833 pdpg = NULL; 4834 } else if (va < VM_MAXUSER_ADDRESS) { 4835 /* Allocate a pd page. */ 4836 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4837 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4838 if (pdpg == NULL) { 4839 if (lockp != NULL) 4840 goto retry; 4841 else 4842 return (NULL); 4843 } 4844 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4845 pde = &pde[pmap_pde_index(va)]; 4846 } else 4847 panic("pmap_alloc_pde: missing page table page for va %#lx", 4848 va); 4849 *pdpgp = pdpg; 4850 return (pde); 4851 } 4852 4853 static vm_page_t 4854 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4855 { 4856 vm_pindex_t ptepindex; 4857 pd_entry_t *pd, PG_V; 4858 vm_page_t m; 4859 4860 PG_V = pmap_valid_bit(pmap); 4861 4862 /* 4863 * Calculate pagetable page index 4864 */ 4865 ptepindex = pmap_pde_pindex(va); 4866 retry: 4867 /* 4868 * Get the page directory entry 4869 */ 4870 pd = pmap_pde(pmap, va); 4871 4872 /* 4873 * This supports switching from a 2MB page to a 4874 * normal 4K page. 4875 */ 4876 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4877 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4878 /* 4879 * Invalidation of the 2MB page mapping may have caused 4880 * the deallocation of the underlying PD page. 4881 */ 4882 pd = NULL; 4883 } 4884 } 4885 4886 /* 4887 * If the page table page is mapped, we just increment the 4888 * hold count, and activate it. 4889 */ 4890 if (pd != NULL && (*pd & PG_V) != 0) { 4891 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4892 m->ref_count++; 4893 } else { 4894 /* 4895 * Here if the pte page isn't mapped, or if it has been 4896 * deallocated. 4897 */ 4898 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4899 if (m == NULL && lockp != NULL) 4900 goto retry; 4901 } 4902 return (m); 4903 } 4904 4905 /*************************************************** 4906 * Pmap allocation/deallocation routines. 4907 ***************************************************/ 4908 4909 /* 4910 * Release any resources held by the given physical map. 4911 * Called when a pmap initialized by pmap_pinit is being released. 4912 * Should only be called if the map contains no valid mappings. 
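 * Only the kernel entries installed by pmap_pinit_pml4()/pml5() are
 * cleared here; the top-level page(s) are then freed, after which the
 * pmap's resident count is asserted to be zero.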
4913 */ 4914 void 4915 pmap_release(pmap_t pmap) 4916 { 4917 vm_page_t m; 4918 int i; 4919 4920 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4921 ("pmap_release: pmap %p has reserved page table page(s)", 4922 pmap)); 4923 KASSERT(CPU_EMPTY(&pmap->pm_active), 4924 ("releasing active pmap %p", pmap)); 4925 4926 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4927 4928 if (pmap_is_la57(pmap)) { 4929 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4930 pmap->pm_pmltop[PML5PML5I] = 0; 4931 } else { 4932 for (i = 0; i < NKPML4E; i++) /* KVA */ 4933 pmap->pm_pmltop[KPML4BASE + i] = 0; 4934 #ifdef KASAN 4935 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4936 pmap->pm_pmltop[KASANPML4I + i] = 0; 4937 #endif 4938 #ifdef KMSAN 4939 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4940 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4941 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ 4942 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4943 #endif 4944 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4945 pmap->pm_pmltop[DMPML4I + i] = 0; 4946 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4947 for (i = 0; i < lm_ents; i++) /* Large Map */ 4948 pmap->pm_pmltop[LMSPML4I + i] = 0; 4949 } 4950 4951 pmap_free_pt_page(NULL, m, true); 4952 pmap_pt_page_count_pinit(pmap, -1); 4953 4954 if (pmap->pm_pmltopu != NULL) { 4955 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4956 pm_pmltopu)); 4957 pmap_free_pt_page(NULL, m, false); 4958 pmap_pt_page_count_pinit(pmap, -1); 4959 } 4960 if (pmap->pm_type == PT_X86 && 4961 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4962 rangeset_fini(&pmap->pm_pkru); 4963 4964 KASSERT(pmap->pm_stats.resident_count == 0, 4965 ("pmap_release: pmap %p resident count %ld != 0", 4966 pmap, pmap->pm_stats.resident_count)); 4967 } 4968 4969 static int 4970 kvm_size(SYSCTL_HANDLER_ARGS) 4971 { 4972 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4973 4974 return sysctl_handle_long(oidp, &ksize, 0, req); 4975 } 4976 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4977 0, 0, kvm_size, "LU", 4978 "Size of KVM"); 4979 4980 static int 4981 kvm_free(SYSCTL_HANDLER_ARGS) 4982 { 4983 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 4984 4985 return sysctl_handle_long(oidp, &kfree, 0, req); 4986 } 4987 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4988 0, 0, kvm_free, "LU", 4989 "Amount of KVM free"); 4990 4991 #ifdef KMSAN 4992 static void 4993 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 4994 { 4995 pdp_entry_t *pdpe; 4996 pd_entry_t *pde; 4997 pt_entry_t *pte; 4998 vm_paddr_t dummypa, dummypd, dummypt; 4999 int i, npde, npdpg; 5000 5001 npdpg = howmany(size, NBPDP); 5002 npde = size / NBPDR; 5003 5004 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 5005 pagezero((void *)PHYS_TO_DMAP(dummypa)); 5006 5007 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 5008 pagezero((void *)PHYS_TO_DMAP(dummypt)); 5009 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 5010 for (i = 0; i < npdpg; i++) 5011 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 5012 5013 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 5014 for (i = 0; i < NPTEPG; i++) 5015 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 5016 X86_PG_A | X86_PG_M | pg_nx); 5017 5018 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 5019 for (i = 0; i < npde; i++) 5020 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 5021 5022 pdpe = (pdp_entry_t 
*)PHYS_TO_DMAP(pdppa); 5023 for (i = 0; i < npdpg; i++) 5024 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 5025 X86_PG_RW | pg_nx); 5026 } 5027 5028 static void 5029 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 5030 { 5031 vm_size_t size; 5032 5033 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 5034 5035 /* 5036 * The end of the page array's KVA region is 2MB aligned, see 5037 * kmem_init(). 5038 */ 5039 size = round_2mpage(end) - start; 5040 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 5041 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 5042 } 5043 #endif 5044 5045 /* 5046 * Allocate physical memory for the vm_page array and map it into KVA, 5047 * attempting to back the vm_pages with domain-local memory. 5048 */ 5049 void 5050 pmap_page_array_startup(long pages) 5051 { 5052 pdp_entry_t *pdpe; 5053 pd_entry_t *pde, newpdir; 5054 vm_offset_t va, start, end; 5055 vm_paddr_t pa; 5056 long pfn; 5057 int domain, i; 5058 5059 vm_page_array_size = pages; 5060 5061 start = VM_MIN_KERNEL_ADDRESS; 5062 end = start + pages * sizeof(struct vm_page); 5063 for (va = start; va < end; va += NBPDR) { 5064 pfn = first_page + (va - start) / sizeof(struct vm_page); 5065 domain = vm_phys_domain(ptoa(pfn)); 5066 pdpe = pmap_pdpe(kernel_pmap, va); 5067 if ((*pdpe & X86_PG_V) == 0) { 5068 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 5069 dump_add_page(pa); 5070 pagezero((void *)PHYS_TO_DMAP(pa)); 5071 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 5072 X86_PG_A | X86_PG_M); 5073 } 5074 pde = pmap_pdpe_to_pde(pdpe, va); 5075 if ((*pde & X86_PG_V) != 0) 5076 panic("Unexpected pde"); 5077 pa = vm_phys_early_alloc(domain, NBPDR); 5078 for (i = 0; i < NPDEPG; i++) 5079 dump_add_page(pa + i * PAGE_SIZE); 5080 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 5081 X86_PG_M | PG_PS | pg_g | pg_nx); 5082 pde_store(pde, newpdir); 5083 } 5084 vm_page_array = (vm_page_t)start; 5085 5086 #ifdef KMSAN 5087 pmap_kmsan_page_array_startup(start, end); 5088 #endif 5089 } 5090 5091 /* 5092 * grow the number of kernel page table entries, if needed 5093 */ 5094 void 5095 pmap_growkernel(vm_offset_t addr) 5096 { 5097 vm_paddr_t paddr; 5098 vm_page_t nkpg; 5099 pd_entry_t *pde, newpdir; 5100 pdp_entry_t *pdpe; 5101 vm_offset_t end; 5102 5103 TSENTER(); 5104 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5105 5106 /* 5107 * The kernel map covers two distinct regions of KVA: that used 5108 * for dynamic kernel memory allocations, and the uppermost 2GB 5109 * of the virtual address space. The latter is used to map the 5110 * kernel and loadable kernel modules. This scheme enables the 5111 * use of a special code generation model for kernel code which 5112 * takes advantage of compact addressing modes in machine code. 5113 * 5114 * Both regions grow upwards; to avoid wasting memory, the gap 5115 * in between is unmapped. If "addr" is above "KERNBASE", the 5116 * kernel's region is grown, otherwise the kmem region is grown. 5117 * 5118 * The correctness of this action is based on the following 5119 * argument: vm_map_insert() allocates contiguous ranges of the 5120 * kernel virtual address space. It calls this function if a range 5121 * ends after "kernel_vm_end". If the kernel is mapped between 5122 * "kernel_vm_end" and "addr", then the range cannot begin at 5123 * "kernel_vm_end". In fact, its beginning address cannot be less 5124 * than the kernel. 
Thus, there is no immediate need to allocate 5125 * any new kernel page table pages between "kernel_vm_end" and 5126 * "KERNBASE". 5127 */ 5128 if (KERNBASE < addr) { 5129 end = KERNBASE + nkpt * NBPDR; 5130 if (end == 0) { 5131 TSEXIT(); 5132 return; 5133 } 5134 } else { 5135 end = kernel_vm_end; 5136 } 5137 5138 addr = roundup2(addr, NBPDR); 5139 if (addr - 1 >= vm_map_max(kernel_map)) 5140 addr = vm_map_max(kernel_map); 5141 if (addr <= end) { 5142 /* 5143 * The grown region is already mapped, so there is 5144 * nothing to do. 5145 */ 5146 TSEXIT(); 5147 return; 5148 } 5149 5150 kasan_shadow_map(end, addr - end); 5151 kmsan_shadow_map(end, addr - end); 5152 while (end < addr) { 5153 pdpe = pmap_pdpe(kernel_pmap, end); 5154 if ((*pdpe & X86_PG_V) == 0) { 5155 nkpg = pmap_alloc_pt_page(kernel_pmap, 5156 pmap_pdpe_pindex(end), VM_ALLOC_WIRED | 5157 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5158 if (nkpg == NULL) 5159 panic("pmap_growkernel: no memory to grow kernel"); 5160 paddr = VM_PAGE_TO_PHYS(nkpg); 5161 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5162 X86_PG_A | X86_PG_M); 5163 continue; /* try again */ 5164 } 5165 pde = pmap_pdpe_to_pde(pdpe, end); 5166 if ((*pde & X86_PG_V) != 0) { 5167 end = (end + NBPDR) & ~PDRMASK; 5168 if (end - 1 >= vm_map_max(kernel_map)) { 5169 end = vm_map_max(kernel_map); 5170 break; 5171 } 5172 continue; 5173 } 5174 5175 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), 5176 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5177 if (nkpg == NULL) 5178 panic("pmap_growkernel: no memory to grow kernel"); 5179 paddr = VM_PAGE_TO_PHYS(nkpg); 5180 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5181 pde_store(pde, newpdir); 5182 5183 end = (end + NBPDR) & ~PDRMASK; 5184 if (end - 1 >= vm_map_max(kernel_map)) { 5185 end = vm_map_max(kernel_map); 5186 break; 5187 } 5188 } 5189 5190 if (end <= KERNBASE) 5191 kernel_vm_end = end; 5192 else 5193 nkpt = howmany(end - KERNBASE, NBPDR); 5194 TSEXIT(); 5195 } 5196 5197 /*************************************************** 5198 * page management routines. 5199 ***************************************************/ 5200 5201 static const uint64_t pc_freemask[_NPCM] = { 5202 [0 ... 
_NPCM - 2] = PC_FREEN, 5203 [_NPCM - 1] = PC_FREEL 5204 }; 5205 5206 #ifdef PV_STATS 5207 5208 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5209 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5210 &pc_chunk_count, "Current number of pv entry cnunks"); 5211 5212 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5213 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5214 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5215 5216 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5217 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5218 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5219 5220 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5221 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5222 &pc_chunk_tryfail, 5223 "Number of failed attempts to get a pv entry chunk page"); 5224 5225 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5226 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5227 &pv_entry_frees, "Total number of pv entries freed"); 5228 5229 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5230 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5231 &pv_entry_allocs, "Total number of pv entries allocated"); 5232 5233 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5234 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5235 &pv_entry_count, "Current number of pv entries"); 5236 5237 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5238 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5239 &pv_entry_spare, "Current number of spare pv entries"); 5240 #endif 5241 5242 static void 5243 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5244 { 5245 5246 if (pmap == NULL) 5247 return; 5248 pmap_invalidate_all(pmap); 5249 if (pmap != locked_pmap) 5250 PMAP_UNLOCK(pmap); 5251 if (start_di) 5252 pmap_delayed_invl_finish(); 5253 } 5254 5255 /* 5256 * We are in a serious low memory condition. Resort to 5257 * drastic measures to free some pages so we can allocate 5258 * another pv entry chunk. 5259 * 5260 * Returns NULL if PV entries were reclaimed from the specified pmap. 5261 * 5262 * We do not, however, unmap 2mpages because subsequent accesses will 5263 * allocate per-page pv entries until repromotion occurs, thereby 5264 * exacerbating the shortage of free pv entries. 
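 * Wired 4 KB mappings are skipped as well, so a scanned chunk may yield
 * no free entries, in which case the scan simply moves on to the next
 * chunk.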
5265 */ 5266 static vm_page_t 5267 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 5268 { 5269 struct pv_chunks_list *pvc; 5270 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 5271 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 5272 struct md_page *pvh; 5273 pd_entry_t *pde; 5274 pmap_t next_pmap, pmap; 5275 pt_entry_t *pte, tpte; 5276 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 5277 pv_entry_t pv; 5278 vm_offset_t va; 5279 vm_page_t m, m_pc; 5280 struct spglist free; 5281 uint64_t inuse; 5282 int bit, field, freed; 5283 bool start_di, restart; 5284 5285 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 5286 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 5287 pmap = NULL; 5288 m_pc = NULL; 5289 PG_G = PG_A = PG_M = PG_RW = 0; 5290 SLIST_INIT(&free); 5291 bzero(&pc_marker_b, sizeof(pc_marker_b)); 5292 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 5293 pc_marker = (struct pv_chunk *)&pc_marker_b; 5294 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 5295 5296 /* 5297 * A delayed invalidation block should already be active if 5298 * pmap_advise() or pmap_remove() called this function by way 5299 * of pmap_demote_pde_locked(). 5300 */ 5301 start_di = pmap_not_in_di(); 5302 5303 pvc = &pv_chunks[domain]; 5304 mtx_lock(&pvc->pvc_lock); 5305 pvc->active_reclaims++; 5306 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 5307 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 5308 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 5309 SLIST_EMPTY(&free)) { 5310 next_pmap = pc->pc_pmap; 5311 if (next_pmap == NULL) { 5312 /* 5313 * The next chunk is a marker. However, it is 5314 * not our marker, so active_reclaims must be 5315 * > 1. Consequently, the next_chunk code 5316 * will not rotate the pv_chunks list. 5317 */ 5318 goto next_chunk; 5319 } 5320 mtx_unlock(&pvc->pvc_lock); 5321 5322 /* 5323 * A pv_chunk can only be removed from the pc_lru list 5324 * when both pc_chunks_mutex is owned and the 5325 * corresponding pmap is locked. 5326 */ 5327 if (pmap != next_pmap) { 5328 restart = false; 5329 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 5330 start_di); 5331 pmap = next_pmap; 5332 /* Avoid deadlock and lock recursion. */ 5333 if (pmap > locked_pmap) { 5334 RELEASE_PV_LIST_LOCK(lockp); 5335 PMAP_LOCK(pmap); 5336 if (start_di) 5337 pmap_delayed_invl_start(); 5338 mtx_lock(&pvc->pvc_lock); 5339 restart = true; 5340 } else if (pmap != locked_pmap) { 5341 if (PMAP_TRYLOCK(pmap)) { 5342 if (start_di) 5343 pmap_delayed_invl_start(); 5344 mtx_lock(&pvc->pvc_lock); 5345 restart = true; 5346 } else { 5347 pmap = NULL; /* pmap is not locked */ 5348 mtx_lock(&pvc->pvc_lock); 5349 pc = TAILQ_NEXT(pc_marker, pc_lru); 5350 if (pc == NULL || 5351 pc->pc_pmap != next_pmap) 5352 continue; 5353 goto next_chunk; 5354 } 5355 } else if (start_di) 5356 pmap_delayed_invl_start(); 5357 PG_G = pmap_global_bit(pmap); 5358 PG_A = pmap_accessed_bit(pmap); 5359 PG_M = pmap_modified_bit(pmap); 5360 PG_RW = pmap_rw_bit(pmap); 5361 if (restart) 5362 continue; 5363 } 5364 5365 /* 5366 * Destroy every non-wired, 4 KB page mapping in the chunk. 
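		 * Each destroyed mapping returns one bit to pc_map[]; a
		 * chunk whose entries all become free is unlinked and its
		 * page is returned to the caller.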
5367 */ 5368 freed = 0; 5369 for (field = 0; field < _NPCM; field++) { 5370 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5371 inuse != 0; inuse &= ~(1UL << bit)) { 5372 bit = bsfq(inuse); 5373 pv = &pc->pc_pventry[field * 64 + bit]; 5374 va = pv->pv_va; 5375 pde = pmap_pde(pmap, va); 5376 if ((*pde & PG_PS) != 0) 5377 continue; 5378 pte = pmap_pde_to_pte(pde, va); 5379 if ((*pte & PG_W) != 0) 5380 continue; 5381 tpte = pte_load_clear(pte); 5382 if ((tpte & PG_G) != 0) 5383 pmap_invalidate_page(pmap, va); 5384 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5385 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5386 vm_page_dirty(m); 5387 if ((tpte & PG_A) != 0) 5388 vm_page_aflag_set(m, PGA_REFERENCED); 5389 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5390 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5391 m->md.pv_gen++; 5392 if (TAILQ_EMPTY(&m->md.pv_list) && 5393 (m->flags & PG_FICTITIOUS) == 0) { 5394 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5395 if (TAILQ_EMPTY(&pvh->pv_list)) { 5396 vm_page_aflag_clear(m, 5397 PGA_WRITEABLE); 5398 } 5399 } 5400 pmap_delayed_invl_page(m); 5401 pc->pc_map[field] |= 1UL << bit; 5402 pmap_unuse_pt(pmap, va, *pde, &free); 5403 freed++; 5404 } 5405 } 5406 if (freed == 0) { 5407 mtx_lock(&pvc->pvc_lock); 5408 goto next_chunk; 5409 } 5410 /* Every freed mapping is for a 4 KB page. */ 5411 pmap_resident_count_adj(pmap, -freed); 5412 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5413 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5414 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5415 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5416 if (pc_is_free(pc)) { 5417 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5418 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5419 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5420 /* Entire chunk is free; return it. */ 5421 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5422 dump_drop_page(m_pc->phys_addr); 5423 mtx_lock(&pvc->pvc_lock); 5424 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5425 break; 5426 } 5427 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5428 mtx_lock(&pvc->pvc_lock); 5429 /* One freed pv entry in locked_pmap is sufficient. */ 5430 if (pmap == locked_pmap) 5431 break; 5432 next_chunk: 5433 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5434 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5435 if (pvc->active_reclaims == 1 && pmap != NULL) { 5436 /* 5437 * Rotate the pv chunks list so that we do not 5438 * scan the same pv chunks that could not be 5439 * freed (because they contained a wired 5440 * and/or superpage mapping) on every 5441 * invocation of reclaim_pv_chunk(). 5442 */ 5443 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5444 MPASS(pc->pc_pmap != NULL); 5445 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5446 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5447 } 5448 } 5449 } 5450 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5451 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5452 pvc->active_reclaims--; 5453 mtx_unlock(&pvc->pvc_lock); 5454 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5455 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5456 m_pc = SLIST_FIRST(&free); 5457 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5458 /* Recycle a freed page table page. 
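		 * Its reference count is reset below so that it can
		 * immediately back a new pv chunk.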
*/ 5459 m_pc->ref_count = 1; 5460 } 5461 vm_page_free_pages_toq(&free, true); 5462 return (m_pc); 5463 } 5464 5465 static vm_page_t 5466 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5467 { 5468 vm_page_t m; 5469 int i, domain; 5470 5471 domain = PCPU_GET(domain); 5472 for (i = 0; i < vm_ndomains; i++) { 5473 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5474 if (m != NULL) 5475 break; 5476 domain = (domain + 1) % vm_ndomains; 5477 } 5478 5479 return (m); 5480 } 5481 5482 /* 5483 * free the pv_entry back to the free list 5484 */ 5485 static void 5486 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5487 { 5488 struct pv_chunk *pc; 5489 int idx, field, bit; 5490 5491 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5492 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5493 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5494 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5495 pc = pv_to_chunk(pv); 5496 idx = pv - &pc->pc_pventry[0]; 5497 field = idx / 64; 5498 bit = idx % 64; 5499 pc->pc_map[field] |= 1ul << bit; 5500 if (!pc_is_free(pc)) { 5501 /* 98% of the time, pc is already at the head of the list. */ 5502 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5503 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5504 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5505 } 5506 return; 5507 } 5508 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5509 free_pv_chunk(pc); 5510 } 5511 5512 static void 5513 free_pv_chunk_dequeued(struct pv_chunk *pc) 5514 { 5515 vm_page_t m; 5516 5517 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5518 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5519 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5520 counter_u64_add(pv_page_count, -1); 5521 /* entire chunk is free, return it */ 5522 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5523 dump_drop_page(m->phys_addr); 5524 vm_page_unwire_noq(m); 5525 vm_page_free(m); 5526 } 5527 5528 static void 5529 free_pv_chunk(struct pv_chunk *pc) 5530 { 5531 struct pv_chunks_list *pvc; 5532 5533 pvc = &pv_chunks[pc_to_domain(pc)]; 5534 mtx_lock(&pvc->pvc_lock); 5535 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5536 mtx_unlock(&pvc->pvc_lock); 5537 free_pv_chunk_dequeued(pc); 5538 } 5539 5540 static void 5541 free_pv_chunk_batch(struct pv_chunklist *batch) 5542 { 5543 struct pv_chunks_list *pvc; 5544 struct pv_chunk *pc, *npc; 5545 int i; 5546 5547 for (i = 0; i < vm_ndomains; i++) { 5548 if (TAILQ_EMPTY(&batch[i])) 5549 continue; 5550 pvc = &pv_chunks[i]; 5551 mtx_lock(&pvc->pvc_lock); 5552 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5553 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5554 } 5555 mtx_unlock(&pvc->pvc_lock); 5556 } 5557 5558 for (i = 0; i < vm_ndomains; i++) { 5559 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5560 free_pv_chunk_dequeued(pc); 5561 } 5562 } 5563 } 5564 5565 /* 5566 * Returns a new PV entry, allocating a new PV chunk from the system when 5567 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5568 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5569 * returned. 5570 * 5571 * The given PV list lock may be released. 
5572 */ 5573 static pv_entry_t 5574 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5575 { 5576 struct pv_chunks_list *pvc; 5577 int bit, field; 5578 pv_entry_t pv; 5579 struct pv_chunk *pc; 5580 vm_page_t m; 5581 5582 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5583 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5584 retry: 5585 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5586 if (pc != NULL) { 5587 for (field = 0; field < _NPCM; field++) { 5588 if (pc->pc_map[field]) { 5589 bit = bsfq(pc->pc_map[field]); 5590 break; 5591 } 5592 } 5593 if (field < _NPCM) { 5594 pv = &pc->pc_pventry[field * 64 + bit]; 5595 pc->pc_map[field] &= ~(1ul << bit); 5596 /* If this was the last item, move it to tail */ 5597 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 5598 pc->pc_map[2] == 0) { 5599 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5600 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5601 pc_list); 5602 } 5603 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5604 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5605 return (pv); 5606 } 5607 } 5608 /* No free items, allocate another chunk */ 5609 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5610 if (m == NULL) { 5611 if (lockp == NULL) { 5612 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5613 return (NULL); 5614 } 5615 m = reclaim_pv_chunk(pmap, lockp); 5616 if (m == NULL) 5617 goto retry; 5618 } else 5619 counter_u64_add(pv_page_count, 1); 5620 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5621 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5622 dump_add_page(m->phys_addr); 5623 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5624 pc->pc_pmap = pmap; 5625 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 5626 pc->pc_map[1] = PC_FREEN; 5627 pc->pc_map[2] = PC_FREEL; 5628 pvc = &pv_chunks[vm_page_domain(m)]; 5629 mtx_lock(&pvc->pvc_lock); 5630 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5631 mtx_unlock(&pvc->pvc_lock); 5632 pv = &pc->pc_pventry[0]; 5633 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5634 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5635 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5636 return (pv); 5637 } 5638 5639 /* 5640 * Returns the number of one bits within the given PV chunk map. 5641 * 5642 * The erratas for Intel processors state that "POPCNT Instruction May 5643 * Take Longer to Execute Than Expected". It is believed that the 5644 * issue is the spurious dependency on the destination register. 5645 * Provide a hint to the register rename logic that the destination 5646 * value is overwritten, by clearing it, as suggested in the 5647 * optimization manual. It should be cheap for unaffected processors 5648 * as well. 5649 * 5650 * Reference numbers for erratas are 5651 * 4th Gen Core: HSD146 5652 * 5th Gen Core: BDM85 5653 * 6th Gen Core: SKL029 5654 */ 5655 static int 5656 popcnt_pc_map_pq(uint64_t *map) 5657 { 5658 u_long result, tmp; 5659 5660 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5661 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5662 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5663 : "=&r" (result), "=&r" (tmp) 5664 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5665 return (result); 5666 } 5667 5668 /* 5669 * Ensure that the number of spare PV entries in the specified pmap meets or 5670 * exceeds the given count, "needed". 5671 * 5672 * The given PV list lock may be released. 
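 *
 * This is typically called just before an operation that consumes a
 * burst of PV entries while the PV list lock is held, for example the
 * 2MB demotion path in pmap_demote_pde_locked():
 *
 *	reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 *	...
 *	pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);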
5673 */ 5674 static void 5675 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5676 { 5677 struct pv_chunks_list *pvc; 5678 struct pch new_tail[PMAP_MEMDOM]; 5679 struct pv_chunk *pc; 5680 vm_page_t m; 5681 int avail, free, i; 5682 bool reclaimed; 5683 5684 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5685 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5686 5687 /* 5688 * Newly allocated PV chunks must be stored in a private list until 5689 * the required number of PV chunks have been allocated. Otherwise, 5690 * reclaim_pv_chunk() could recycle one of these chunks. In 5691 * contrast, these chunks must be added to the pmap upon allocation. 5692 */ 5693 for (i = 0; i < PMAP_MEMDOM; i++) 5694 TAILQ_INIT(&new_tail[i]); 5695 retry: 5696 avail = 0; 5697 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5698 #ifndef __POPCNT__ 5699 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5700 bit_count((bitstr_t *)pc->pc_map, 0, 5701 sizeof(pc->pc_map) * NBBY, &free); 5702 else 5703 #endif 5704 free = popcnt_pc_map_pq(pc->pc_map); 5705 if (free == 0) 5706 break; 5707 avail += free; 5708 if (avail >= needed) 5709 break; 5710 } 5711 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5712 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5713 if (m == NULL) { 5714 m = reclaim_pv_chunk(pmap, lockp); 5715 if (m == NULL) 5716 goto retry; 5717 reclaimed = true; 5718 } else 5719 counter_u64_add(pv_page_count, 1); 5720 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5721 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5722 dump_add_page(m->phys_addr); 5723 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5724 pc->pc_pmap = pmap; 5725 pc->pc_map[0] = PC_FREEN; 5726 pc->pc_map[1] = PC_FREEN; 5727 pc->pc_map[2] = PC_FREEL; 5728 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5729 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5730 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5731 5732 /* 5733 * The reclaim might have freed a chunk from the current pmap. 5734 * If that chunk contained available entries, we need to 5735 * re-count the number of available entries. 5736 */ 5737 if (reclaimed) 5738 goto retry; 5739 } 5740 for (i = 0; i < vm_ndomains; i++) { 5741 if (TAILQ_EMPTY(&new_tail[i])) 5742 continue; 5743 pvc = &pv_chunks[i]; 5744 mtx_lock(&pvc->pvc_lock); 5745 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5746 mtx_unlock(&pvc->pvc_lock); 5747 } 5748 } 5749 5750 /* 5751 * First find and then remove the pv entry for the specified pmap and virtual 5752 * address from the specified pv list. Returns the pv entry if found and NULL 5753 * otherwise. This operation can be performed on pv lists for either 4KB or 5754 * 2MB page mappings. 5755 */ 5756 static __inline pv_entry_t 5757 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5758 { 5759 pv_entry_t pv; 5760 5761 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5762 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5763 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5764 pvh->pv_gen++; 5765 break; 5766 } 5767 } 5768 return (pv); 5769 } 5770 5771 /* 5772 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5773 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5774 * entries for each of the 4KB page mappings. 
5775 */ 5776 static void 5777 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5778 struct rwlock **lockp) 5779 { 5780 struct md_page *pvh; 5781 struct pv_chunk *pc; 5782 pv_entry_t pv; 5783 vm_offset_t va_last; 5784 vm_page_t m; 5785 int bit, field; 5786 5787 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5788 KASSERT((pa & PDRMASK) == 0, 5789 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5790 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5791 5792 /* 5793 * Transfer the 2mpage's pv entry for this mapping to the first 5794 * page's pv list. Once this transfer begins, the pv list lock 5795 * must not be released until the last pv entry is reinstantiated. 5796 */ 5797 pvh = pa_to_pvh(pa); 5798 va = trunc_2mpage(va); 5799 pv = pmap_pvh_remove(pvh, pmap, va); 5800 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5801 m = PHYS_TO_VM_PAGE(pa); 5802 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5803 m->md.pv_gen++; 5804 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5805 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5806 va_last = va + NBPDR - PAGE_SIZE; 5807 for (;;) { 5808 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5809 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 5810 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 5811 for (field = 0; field < _NPCM; field++) { 5812 while (pc->pc_map[field]) { 5813 bit = bsfq(pc->pc_map[field]); 5814 pc->pc_map[field] &= ~(1ul << bit); 5815 pv = &pc->pc_pventry[field * 64 + bit]; 5816 va += PAGE_SIZE; 5817 pv->pv_va = va; 5818 m++; 5819 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5820 ("pmap_pv_demote_pde: page %p is not managed", m)); 5821 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5822 m->md.pv_gen++; 5823 if (va == va_last) 5824 goto out; 5825 } 5826 } 5827 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5828 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5829 } 5830 out: 5831 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 5832 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5833 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5834 } 5835 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5836 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5837 } 5838 5839 #if VM_NRESERVLEVEL > 0 5840 /* 5841 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5842 * replace the many pv entries for the 4KB page mappings by a single pv entry 5843 * for the 2MB page mapping. 5844 */ 5845 static void 5846 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5847 struct rwlock **lockp) 5848 { 5849 struct md_page *pvh; 5850 pv_entry_t pv; 5851 vm_offset_t va_last; 5852 vm_page_t m; 5853 5854 KASSERT((pa & PDRMASK) == 0, 5855 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5856 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5857 5858 /* 5859 * Transfer the first page's pv entry for this mapping to the 2mpage's 5860 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5861 * a transfer avoids the possibility that get_pv_entry() calls 5862 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5863 * mappings that is being promoted. 5864 */ 5865 m = PHYS_TO_VM_PAGE(pa); 5866 va = trunc_2mpage(va); 5867 pv = pmap_pvh_remove(&m->md, pmap, va); 5868 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5869 pvh = pa_to_pvh(pa); 5870 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5871 pvh->pv_gen++; 5872 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 5873 va_last = va + NBPDR - PAGE_SIZE; 5874 do { 5875 m++; 5876 va += PAGE_SIZE; 5877 pmap_pvh_free(&m->md, pmap, va); 5878 } while (va < va_last); 5879 } 5880 #endif /* VM_NRESERVLEVEL > 0 */ 5881 5882 /* 5883 * First find and then destroy the pv entry for the specified pmap and virtual 5884 * address. This operation can be performed on pv lists for either 4KB or 2MB 5885 * page mappings. 5886 */ 5887 static void 5888 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5889 { 5890 pv_entry_t pv; 5891 5892 pv = pmap_pvh_remove(pvh, pmap, va); 5893 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5894 free_pv_entry(pmap, pv); 5895 } 5896 5897 /* 5898 * Conditionally create the PV entry for a 4KB page mapping if the required 5899 * memory can be allocated without resorting to reclamation. 5900 */ 5901 static bool 5902 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5903 struct rwlock **lockp) 5904 { 5905 pv_entry_t pv; 5906 5907 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5908 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5909 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5910 pv->pv_va = va; 5911 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5912 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5913 m->md.pv_gen++; 5914 return (true); 5915 } else 5916 return (false); 5917 } 5918 5919 /* 5920 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5921 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5922 * false if the PV entry cannot be allocated without resorting to reclamation. 5923 */ 5924 static bool 5925 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5926 struct rwlock **lockp) 5927 { 5928 struct md_page *pvh; 5929 pv_entry_t pv; 5930 vm_paddr_t pa; 5931 5932 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5933 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5934 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5935 NULL : lockp)) == NULL) 5936 return (false); 5937 pv->pv_va = va; 5938 pa = pde & PG_PS_FRAME; 5939 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5940 pvh = pa_to_pvh(pa); 5941 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5942 pvh->pv_gen++; 5943 return (true); 5944 } 5945 5946 /* 5947 * Fills a page table page with mappings to consecutive physical pages. 5948 */ 5949 static void 5950 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5951 { 5952 pt_entry_t *pte; 5953 5954 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5955 *pte = newpte; 5956 newpte += PAGE_SIZE; 5957 } 5958 } 5959 5960 /* 5961 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5962 * mapping is invalidated. 
5963 */ 5964 static bool 5965 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5966 { 5967 struct rwlock *lock; 5968 bool rv; 5969 5970 lock = NULL; 5971 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5972 if (lock != NULL) 5973 rw_wunlock(lock); 5974 return (rv); 5975 } 5976 5977 static void 5978 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 5979 { 5980 #ifdef INVARIANTS 5981 #ifdef DIAGNOSTIC 5982 pt_entry_t *xpte, *ypte; 5983 5984 for (xpte = firstpte; xpte < firstpte + NPTEPG; 5985 xpte++, newpte += PAGE_SIZE) { 5986 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 5987 printf("pmap_demote_pde: xpte %zd and newpte map " 5988 "different pages: found %#lx, expected %#lx\n", 5989 xpte - firstpte, *xpte, newpte); 5990 printf("page table dump\n"); 5991 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 5992 printf("%zd %#lx\n", ypte - firstpte, *ypte); 5993 panic("firstpte"); 5994 } 5995 } 5996 #else 5997 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 5998 ("pmap_demote_pde: firstpte and newpte map different physical" 5999 " addresses")); 6000 #endif 6001 #endif 6002 } 6003 6004 static void 6005 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6006 pd_entry_t oldpde, struct rwlock **lockp) 6007 { 6008 struct spglist free; 6009 vm_offset_t sva; 6010 6011 SLIST_INIT(&free); 6012 sva = trunc_2mpage(va); 6013 pmap_remove_pde(pmap, pde, sva, &free, lockp); 6014 if ((oldpde & pmap_global_bit(pmap)) == 0) 6015 pmap_invalidate_pde_page(pmap, sva, oldpde); 6016 vm_page_free_pages_toq(&free, true); 6017 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 6018 va, pmap); 6019 } 6020 6021 static bool 6022 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 6023 struct rwlock **lockp) 6024 { 6025 pd_entry_t newpde, oldpde; 6026 pt_entry_t *firstpte, newpte; 6027 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6028 vm_paddr_t mptepa; 6029 vm_page_t mpte; 6030 int PG_PTE_CACHE; 6031 bool in_kernel; 6032 6033 PG_A = pmap_accessed_bit(pmap); 6034 PG_G = pmap_global_bit(pmap); 6035 PG_M = pmap_modified_bit(pmap); 6036 PG_RW = pmap_rw_bit(pmap); 6037 PG_V = pmap_valid_bit(pmap); 6038 PG_PTE_CACHE = pmap_cache_mask(pmap, false); 6039 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6040 6041 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6042 in_kernel = va >= VM_MAXUSER_ADDRESS; 6043 oldpde = *pde; 6044 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 6045 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 6046 6047 /* 6048 * Invalidate the 2MB page mapping and return "failure" if the 6049 * mapping was never accessed. 6050 */ 6051 if ((oldpde & PG_A) == 0) { 6052 KASSERT((oldpde & PG_W) == 0, 6053 ("pmap_demote_pde: a wired mapping is missing PG_A")); 6054 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6055 return (false); 6056 } 6057 6058 mpte = pmap_remove_pt_page(pmap, va); 6059 if (mpte == NULL) { 6060 KASSERT((oldpde & PG_W) == 0, 6061 ("pmap_demote_pde: page table page for a wired mapping" 6062 " is missing")); 6063 6064 /* 6065 * If the page table page is missing and the mapping 6066 * is for a kernel address, the mapping must belong to 6067 * the direct map. Page table pages are preallocated 6068 * for every other part of the kernel address space, 6069 * so the direct map region is the only part of the 6070 * kernel address space that must be handled here. 
6071 */ 6072 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 6073 va < DMAP_MAX_ADDRESS), 6074 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 6075 6076 /* 6077 * If the 2MB page mapping belongs to the direct map 6078 * region of the kernel's address space, then the page 6079 * allocation request specifies the highest possible 6080 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6081 * priority is normal. 6082 */ 6083 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 6084 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 6085 6086 /* 6087 * If the allocation of the new page table page fails, 6088 * invalidate the 2MB page mapping and return "failure". 6089 */ 6090 if (mpte == NULL) { 6091 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6092 return (false); 6093 } 6094 6095 if (!in_kernel) 6096 mpte->ref_count = NPTEPG; 6097 } 6098 mptepa = VM_PAGE_TO_PHYS(mpte); 6099 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 6100 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 6101 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 6102 ("pmap_demote_pde: oldpde is missing PG_M")); 6103 newpte = oldpde & ~PG_PS; 6104 newpte = pmap_swap_pat(pmap, newpte); 6105 6106 /* 6107 * If the PTP is not leftover from an earlier promotion or it does not 6108 * have PG_A set in every PTE, then fill it. The new PTEs will all 6109 * have PG_A set. 6110 */ 6111 if (!vm_page_all_valid(mpte)) 6112 pmap_fill_ptp(firstpte, newpte); 6113 6114 pmap_demote_pde_check(firstpte, newpte); 6115 6116 /* 6117 * If the mapping has changed attributes, update the PTEs. 6118 */ 6119 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6120 pmap_fill_ptp(firstpte, newpte); 6121 6122 /* 6123 * The spare PV entries must be reserved prior to demoting the 6124 * mapping, that is, prior to changing the PDE. Otherwise, the state 6125 * of the PDE and the PV lists will be inconsistent, which can result 6126 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6127 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6128 * PV entry for the 2MB page mapping that is being demoted. 6129 */ 6130 if ((oldpde & PG_MANAGED) != 0) 6131 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6132 6133 /* 6134 * Demote the mapping. This pmap is locked. The old PDE has 6135 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6136 * set. Thus, there is no danger of a race with another 6137 * processor changing the setting of PG_A and/or PG_M between 6138 * the read above and the store below. 6139 */ 6140 if (workaround_erratum383) 6141 pmap_update_pde(pmap, va, pde, newpde); 6142 else 6143 pde_store(pde, newpde); 6144 6145 /* 6146 * Invalidate a stale recursive mapping of the page table page. 6147 */ 6148 if (in_kernel) 6149 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6150 6151 /* 6152 * Demote the PV entry. 6153 */ 6154 if ((oldpde & PG_MANAGED) != 0) 6155 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6156 6157 counter_u64_add(pmap_pde_demotions, 1); 6158 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6159 va, pmap); 6160 return (true); 6161 } 6162 6163 /* 6164 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
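 * Unlike the user case, the PDE is not simply cleared: the previously
 * saved page table page is looked up, zeroed if it still holds valid
 * mappings from an earlier promotion, and re-installed, so the kernel
 * address range keeps a valid, empty page table behind it.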
6165 */ 6166 static void 6167 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6168 { 6169 pd_entry_t newpde; 6170 vm_paddr_t mptepa; 6171 vm_page_t mpte; 6172 6173 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6174 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6175 mpte = pmap_remove_pt_page(pmap, va); 6176 if (mpte == NULL) 6177 panic("pmap_remove_kernel_pde: Missing pt page."); 6178 6179 mptepa = VM_PAGE_TO_PHYS(mpte); 6180 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6181 6182 /* 6183 * If this page table page was unmapped by a promotion, then it 6184 * contains valid mappings. Zero it to invalidate those mappings. 6185 */ 6186 if (vm_page_any_valid(mpte)) 6187 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6188 6189 /* 6190 * Demote the mapping. 6191 */ 6192 if (workaround_erratum383) 6193 pmap_update_pde(pmap, va, pde, newpde); 6194 else 6195 pde_store(pde, newpde); 6196 6197 /* 6198 * Invalidate a stale recursive mapping of the page table page. 6199 */ 6200 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6201 } 6202 6203 /* 6204 * pmap_remove_pde: do the things to unmap a superpage in a process 6205 */ 6206 static int 6207 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6208 struct spglist *free, struct rwlock **lockp) 6209 { 6210 struct md_page *pvh; 6211 pd_entry_t oldpde; 6212 vm_offset_t eva, va; 6213 vm_page_t m, mpte; 6214 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6215 6216 PG_G = pmap_global_bit(pmap); 6217 PG_A = pmap_accessed_bit(pmap); 6218 PG_M = pmap_modified_bit(pmap); 6219 PG_RW = pmap_rw_bit(pmap); 6220 6221 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6222 KASSERT((sva & PDRMASK) == 0, 6223 ("pmap_remove_pde: sva is not 2mpage aligned")); 6224 oldpde = pte_load_clear(pdq); 6225 if (oldpde & PG_W) 6226 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6227 if ((oldpde & PG_G) != 0) 6228 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6229 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6230 if (oldpde & PG_MANAGED) { 6231 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6232 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6233 pmap_pvh_free(pvh, pmap, sva); 6234 eva = sva + NBPDR; 6235 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6236 va < eva; va += PAGE_SIZE, m++) { 6237 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6238 vm_page_dirty(m); 6239 if (oldpde & PG_A) 6240 vm_page_aflag_set(m, PGA_REFERENCED); 6241 if (TAILQ_EMPTY(&m->md.pv_list) && 6242 TAILQ_EMPTY(&pvh->pv_list)) 6243 vm_page_aflag_clear(m, PGA_WRITEABLE); 6244 pmap_delayed_invl_page(m); 6245 } 6246 } 6247 if (pmap == kernel_pmap) { 6248 pmap_remove_kernel_pde(pmap, pdq, sva); 6249 } else { 6250 mpte = pmap_remove_pt_page(pmap, sva); 6251 if (mpte != NULL) { 6252 KASSERT(vm_page_any_valid(mpte), 6253 ("pmap_remove_pde: pte page not promoted")); 6254 pmap_pt_page_count_adj(pmap, -1); 6255 KASSERT(mpte->ref_count == NPTEPG, 6256 ("pmap_remove_pde: pte page ref count error")); 6257 mpte->ref_count = 0; 6258 pmap_add_delayed_free_list(mpte, free, false); 6259 } 6260 } 6261 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6262 } 6263 6264 /* 6265 * pmap_remove_pte: do the things to unmap a page in a process 6266 */ 6267 static int 6268 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6269 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6270 { 6271 struct md_page *pvh; 6272 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6273 vm_page_t m; 6274 6275 PG_A = pmap_accessed_bit(pmap); 6276 PG_M = 
pmap_modified_bit(pmap); 6277 PG_RW = pmap_rw_bit(pmap); 6278 6279 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6280 oldpte = pte_load_clear(ptq); 6281 if (oldpte & PG_W) 6282 pmap->pm_stats.wired_count -= 1; 6283 pmap_resident_count_adj(pmap, -1); 6284 if (oldpte & PG_MANAGED) { 6285 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6286 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6287 vm_page_dirty(m); 6288 if (oldpte & PG_A) 6289 vm_page_aflag_set(m, PGA_REFERENCED); 6290 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6291 pmap_pvh_free(&m->md, pmap, va); 6292 if (TAILQ_EMPTY(&m->md.pv_list) && 6293 (m->flags & PG_FICTITIOUS) == 0) { 6294 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6295 if (TAILQ_EMPTY(&pvh->pv_list)) 6296 vm_page_aflag_clear(m, PGA_WRITEABLE); 6297 } 6298 pmap_delayed_invl_page(m); 6299 } 6300 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6301 } 6302 6303 /* 6304 * Remove a single page from a process address space 6305 */ 6306 static void 6307 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6308 struct spglist *free) 6309 { 6310 struct rwlock *lock; 6311 pt_entry_t *pte, PG_V; 6312 6313 PG_V = pmap_valid_bit(pmap); 6314 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6315 if ((*pde & PG_V) == 0) 6316 return; 6317 pte = pmap_pde_to_pte(pde, va); 6318 if ((*pte & PG_V) == 0) 6319 return; 6320 lock = NULL; 6321 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6322 if (lock != NULL) 6323 rw_wunlock(lock); 6324 pmap_invalidate_page(pmap, va); 6325 } 6326 6327 /* 6328 * Removes the specified range of addresses from the page table page. 6329 */ 6330 static bool 6331 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6332 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6333 { 6334 pt_entry_t PG_G, *pte; 6335 vm_offset_t va; 6336 bool anyvalid; 6337 6338 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6339 PG_G = pmap_global_bit(pmap); 6340 anyvalid = false; 6341 va = eva; 6342 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6343 sva += PAGE_SIZE) { 6344 if (*pte == 0) { 6345 if (va != eva) { 6346 pmap_invalidate_range(pmap, va, sva); 6347 va = eva; 6348 } 6349 continue; 6350 } 6351 if ((*pte & PG_G) == 0) 6352 anyvalid = true; 6353 else if (va == eva) 6354 va = sva; 6355 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6356 sva += PAGE_SIZE; 6357 break; 6358 } 6359 } 6360 if (va != eva) 6361 pmap_invalidate_range(pmap, va, sva); 6362 return (anyvalid); 6363 } 6364 6365 static void 6366 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 6367 { 6368 struct rwlock *lock; 6369 vm_page_t mt; 6370 vm_offset_t va_next; 6371 pml5_entry_t *pml5e; 6372 pml4_entry_t *pml4e; 6373 pdp_entry_t *pdpe; 6374 pd_entry_t ptpaddr, *pde; 6375 pt_entry_t PG_G, PG_V; 6376 struct spglist free; 6377 int anyvalid; 6378 6379 PG_G = pmap_global_bit(pmap); 6380 PG_V = pmap_valid_bit(pmap); 6381 6382 /* 6383 * If there are no resident pages besides the top level page 6384 * table page(s), there is nothing to do. Kernel pmap always 6385 * accounts whole preloaded area as resident, which makes its 6386 * resident count > 2. 6387 * Perform an unsynchronized read. This is, however, safe. 6388 */ 6389 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 6390 1 : 0)) 6391 return; 6392 6393 anyvalid = 0; 6394 SLIST_INIT(&free); 6395 6396 pmap_delayed_invl_start(); 6397 PMAP_LOCK(pmap); 6398 if (map_delete) 6399 pmap_pkru_on_remove(pmap, sva, eva); 6400 6401 /* 6402 * special handling of removing one page. 
a very 6403 * common operation and easy to short circuit some 6404 * code. 6405 */ 6406 if (sva + PAGE_SIZE == eva) { 6407 pde = pmap_pde(pmap, sva); 6408 if (pde && (*pde & PG_PS) == 0) { 6409 pmap_remove_page(pmap, sva, pde, &free); 6410 goto out; 6411 } 6412 } 6413 6414 lock = NULL; 6415 for (; sva < eva; sva = va_next) { 6416 if (pmap->pm_stats.resident_count == 0) 6417 break; 6418 6419 if (pmap_is_la57(pmap)) { 6420 pml5e = pmap_pml5e(pmap, sva); 6421 if ((*pml5e & PG_V) == 0) { 6422 va_next = (sva + NBPML5) & ~PML5MASK; 6423 if (va_next < sva) 6424 va_next = eva; 6425 continue; 6426 } 6427 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6428 } else { 6429 pml4e = pmap_pml4e(pmap, sva); 6430 } 6431 if ((*pml4e & PG_V) == 0) { 6432 va_next = (sva + NBPML4) & ~PML4MASK; 6433 if (va_next < sva) 6434 va_next = eva; 6435 continue; 6436 } 6437 6438 va_next = (sva + NBPDP) & ~PDPMASK; 6439 if (va_next < sva) 6440 va_next = eva; 6441 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6442 if ((*pdpe & PG_V) == 0) 6443 continue; 6444 if ((*pdpe & PG_PS) != 0) { 6445 KASSERT(va_next <= eva, 6446 ("partial update of non-transparent 1G mapping " 6447 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6448 *pdpe, sva, eva, va_next)); 6449 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6450 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6451 anyvalid = 1; 6452 *pdpe = 0; 6453 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6454 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6455 pmap_unwire_ptp(pmap, sva, mt, &free); 6456 continue; 6457 } 6458 6459 /* 6460 * Calculate index for next page table. 6461 */ 6462 va_next = (sva + NBPDR) & ~PDRMASK; 6463 if (va_next < sva) 6464 va_next = eva; 6465 6466 pde = pmap_pdpe_to_pde(pdpe, sva); 6467 ptpaddr = *pde; 6468 6469 /* 6470 * Weed out invalid mappings. 6471 */ 6472 if (ptpaddr == 0) 6473 continue; 6474 6475 /* 6476 * Check for large page. 6477 */ 6478 if ((ptpaddr & PG_PS) != 0) { 6479 /* 6480 * Are we removing the entire large page? If not, 6481 * demote the mapping and fall through. 6482 */ 6483 if (sva + NBPDR == va_next && eva >= va_next) { 6484 /* 6485 * The TLB entry for a PG_G mapping is 6486 * invalidated by pmap_remove_pde(). 6487 */ 6488 if ((ptpaddr & PG_G) == 0) 6489 anyvalid = 1; 6490 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6491 continue; 6492 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6493 &lock)) { 6494 /* The large page mapping was destroyed. */ 6495 continue; 6496 } else 6497 ptpaddr = *pde; 6498 } 6499 6500 /* 6501 * Limit our scan to either the end of the va represented 6502 * by the current page table page, or to the end of the 6503 * range being removed. 6504 */ 6505 if (va_next > eva) 6506 va_next = eva; 6507 6508 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6509 anyvalid = 1; 6510 } 6511 if (lock != NULL) 6512 rw_wunlock(lock); 6513 out: 6514 if (anyvalid) 6515 pmap_invalidate_all(pmap); 6516 PMAP_UNLOCK(pmap); 6517 pmap_delayed_invl_finish(); 6518 vm_page_free_pages_toq(&free, true); 6519 } 6520 6521 /* 6522 * Remove the given range of addresses from the specified map. 6523 * 6524 * It is assumed that the start and end are properly 6525 * rounded to the page size. 6526 */ 6527 void 6528 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6529 { 6530 pmap_remove1(pmap, sva, eva, false); 6531 } 6532 6533 /* 6534 * Remove the given range of addresses as part of a logical unmap 6535 * operation. 
This has the effect of calling pmap_remove(), but 6536 * also clears any metadata that should persist for the lifetime 6537 * of a logical mapping. 6538 */ 6539 void 6540 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6541 { 6542 pmap_remove1(pmap, sva, eva, true); 6543 } 6544 6545 /* 6546 * Routine: pmap_remove_all 6547 * Function: 6548 * Removes this physical page from 6549 * all physical maps in which it resides. 6550 * Reflects back modify bits to the pager. 6551 * 6552 * Notes: 6553 * Original versions of this routine were very 6554 * inefficient because they iteratively called 6555 * pmap_remove (slow...) 6556 */ 6557 6558 void 6559 pmap_remove_all(vm_page_t m) 6560 { 6561 struct md_page *pvh; 6562 pv_entry_t pv; 6563 pmap_t pmap; 6564 struct rwlock *lock; 6565 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6566 pd_entry_t *pde; 6567 vm_offset_t va; 6568 struct spglist free; 6569 int pvh_gen, md_gen; 6570 6571 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6572 ("pmap_remove_all: page %p is not managed", m)); 6573 SLIST_INIT(&free); 6574 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6575 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6576 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6577 rw_wlock(lock); 6578 retry: 6579 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6580 pmap = PV_PMAP(pv); 6581 if (!PMAP_TRYLOCK(pmap)) { 6582 pvh_gen = pvh->pv_gen; 6583 rw_wunlock(lock); 6584 PMAP_LOCK(pmap); 6585 rw_wlock(lock); 6586 if (pvh_gen != pvh->pv_gen) { 6587 PMAP_UNLOCK(pmap); 6588 goto retry; 6589 } 6590 } 6591 va = pv->pv_va; 6592 pde = pmap_pde(pmap, va); 6593 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6594 PMAP_UNLOCK(pmap); 6595 } 6596 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6597 pmap = PV_PMAP(pv); 6598 if (!PMAP_TRYLOCK(pmap)) { 6599 pvh_gen = pvh->pv_gen; 6600 md_gen = m->md.pv_gen; 6601 rw_wunlock(lock); 6602 PMAP_LOCK(pmap); 6603 rw_wlock(lock); 6604 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6605 PMAP_UNLOCK(pmap); 6606 goto retry; 6607 } 6608 } 6609 PG_A = pmap_accessed_bit(pmap); 6610 PG_M = pmap_modified_bit(pmap); 6611 PG_RW = pmap_rw_bit(pmap); 6612 pmap_resident_count_adj(pmap, -1); 6613 pde = pmap_pde(pmap, pv->pv_va); 6614 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6615 " a 2mpage in page %p's pv list", m)); 6616 pte = pmap_pde_to_pte(pde, pv->pv_va); 6617 tpte = pte_load_clear(pte); 6618 if (tpte & PG_W) 6619 pmap->pm_stats.wired_count--; 6620 if (tpte & PG_A) 6621 vm_page_aflag_set(m, PGA_REFERENCED); 6622 6623 /* 6624 * Update the vm_page_t clean and reference bits. 
6625 */ 6626 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6627 vm_page_dirty(m); 6628 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6629 pmap_invalidate_page(pmap, pv->pv_va); 6630 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6631 m->md.pv_gen++; 6632 free_pv_entry(pmap, pv); 6633 PMAP_UNLOCK(pmap); 6634 } 6635 vm_page_aflag_clear(m, PGA_WRITEABLE); 6636 rw_wunlock(lock); 6637 pmap_delayed_invl_wait(m); 6638 vm_page_free_pages_toq(&free, true); 6639 } 6640 6641 /* 6642 * pmap_protect_pde: do the things to protect a 2mpage in a process 6643 */ 6644 static bool 6645 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6646 { 6647 pd_entry_t newpde, oldpde; 6648 vm_page_t m, mt; 6649 bool anychanged; 6650 pt_entry_t PG_G, PG_M, PG_RW; 6651 6652 PG_G = pmap_global_bit(pmap); 6653 PG_M = pmap_modified_bit(pmap); 6654 PG_RW = pmap_rw_bit(pmap); 6655 6656 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6657 KASSERT((sva & PDRMASK) == 0, 6658 ("pmap_protect_pde: sva is not 2mpage aligned")); 6659 anychanged = false; 6660 retry: 6661 oldpde = newpde = *pde; 6662 if ((prot & VM_PROT_WRITE) == 0) { 6663 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6664 (PG_MANAGED | PG_M | PG_RW)) { 6665 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6666 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6667 vm_page_dirty(mt); 6668 } 6669 newpde &= ~(PG_RW | PG_M); 6670 } 6671 if ((prot & VM_PROT_EXECUTE) == 0) 6672 newpde |= pg_nx; 6673 if (newpde != oldpde) { 6674 /* 6675 * As an optimization to future operations on this PDE, clear 6676 * PG_PROMOTED. The impending invalidation will remove any 6677 * lingering 4KB page mappings from the TLB. 6678 */ 6679 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6680 goto retry; 6681 if ((oldpde & PG_G) != 0) 6682 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6683 else 6684 anychanged = true; 6685 } 6686 return (anychanged); 6687 } 6688 6689 /* 6690 * Set the physical protection on the 6691 * specified range of this map as requested. 6692 */ 6693 void 6694 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6695 { 6696 vm_page_t m; 6697 vm_offset_t va_next; 6698 pml4_entry_t *pml4e; 6699 pdp_entry_t *pdpe; 6700 pd_entry_t ptpaddr, *pde; 6701 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6702 pt_entry_t obits, pbits; 6703 bool anychanged; 6704 6705 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6706 if (prot == VM_PROT_NONE) { 6707 pmap_remove(pmap, sva, eva); 6708 return; 6709 } 6710 6711 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6712 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6713 return; 6714 6715 PG_G = pmap_global_bit(pmap); 6716 PG_M = pmap_modified_bit(pmap); 6717 PG_V = pmap_valid_bit(pmap); 6718 PG_RW = pmap_rw_bit(pmap); 6719 anychanged = false; 6720 6721 /* 6722 * Although this function delays and batches the invalidation 6723 * of stale TLB entries, it does not need to call 6724 * pmap_delayed_invl_start() and 6725 * pmap_delayed_invl_finish(), because it does not 6726 * ordinarily destroy mappings. Stale TLB entries from 6727 * protection-only changes need only be invalidated before the 6728 * pmap lock is released, because protection-only changes do 6729 * not destroy PV entries. Even operations that iterate over 6730 * a physical page's PV list of mappings, like 6731 * pmap_remove_write(), acquire the pmap lock for each 6732 * mapping. Consequently, for protection-only changes, the 6733 * pmap lock suffices to synchronize both page table and TLB 6734 * updates. 
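 *
 * Individual entry updates below are done with a compare-and-set loop
 * so that a concurrent hardware update of PG_A or PG_M is never lost;
 * for instance, the 4KB case further down is roughly equivalent to:
 *
 *	do {
 *		obits = pbits = *pte;
 *		pbits &= ~(PG_RW | PG_M);
 *	} while (!atomic_cmpset_long(pte, obits, pbits));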
6735 * 6736 * This function only destroys a mapping if pmap_demote_pde() 6737 * fails. In that case, stale TLB entries are immediately 6738 * invalidated. 6739 */ 6740 6741 PMAP_LOCK(pmap); 6742 for (; sva < eva; sva = va_next) { 6743 pml4e = pmap_pml4e(pmap, sva); 6744 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6745 va_next = (sva + NBPML4) & ~PML4MASK; 6746 if (va_next < sva) 6747 va_next = eva; 6748 continue; 6749 } 6750 6751 va_next = (sva + NBPDP) & ~PDPMASK; 6752 if (va_next < sva) 6753 va_next = eva; 6754 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6755 if ((*pdpe & PG_V) == 0) 6756 continue; 6757 if ((*pdpe & PG_PS) != 0) { 6758 KASSERT(va_next <= eva, 6759 ("partial update of non-transparent 1G mapping " 6760 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6761 *pdpe, sva, eva, va_next)); 6762 retry_pdpe: 6763 obits = pbits = *pdpe; 6764 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6765 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6766 if ((prot & VM_PROT_WRITE) == 0) 6767 pbits &= ~(PG_RW | PG_M); 6768 if ((prot & VM_PROT_EXECUTE) == 0) 6769 pbits |= pg_nx; 6770 6771 if (pbits != obits) { 6772 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6773 /* PG_PS cannot be cleared under us, */ 6774 goto retry_pdpe; 6775 anychanged = true; 6776 } 6777 continue; 6778 } 6779 6780 va_next = (sva + NBPDR) & ~PDRMASK; 6781 if (va_next < sva) 6782 va_next = eva; 6783 6784 pde = pmap_pdpe_to_pde(pdpe, sva); 6785 ptpaddr = *pde; 6786 6787 /* 6788 * Weed out invalid mappings. 6789 */ 6790 if (ptpaddr == 0) 6791 continue; 6792 6793 /* 6794 * Check for large page. 6795 */ 6796 if ((ptpaddr & PG_PS) != 0) { 6797 /* 6798 * Are we protecting the entire large page? If not, 6799 * demote the mapping and fall through. 6800 */ 6801 if (sva + NBPDR == va_next && eva >= va_next) { 6802 /* 6803 * The TLB entry for a PG_G mapping is 6804 * invalidated by pmap_protect_pde(). 6805 */ 6806 if (pmap_protect_pde(pmap, pde, sva, prot)) 6807 anychanged = true; 6808 continue; 6809 } else if (!pmap_demote_pde(pmap, pde, sva)) { 6810 /* 6811 * The large page mapping was destroyed. 6812 */ 6813 continue; 6814 } 6815 } 6816 6817 if (va_next > eva) 6818 va_next = eva; 6819 6820 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6821 sva += PAGE_SIZE) { 6822 retry: 6823 obits = pbits = *pte; 6824 if ((pbits & PG_V) == 0) 6825 continue; 6826 6827 if ((prot & VM_PROT_WRITE) == 0) { 6828 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6829 (PG_MANAGED | PG_M | PG_RW)) { 6830 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6831 vm_page_dirty(m); 6832 } 6833 pbits &= ~(PG_RW | PG_M); 6834 } 6835 if ((prot & VM_PROT_EXECUTE) == 0) 6836 pbits |= pg_nx; 6837 6838 if (pbits != obits) { 6839 if (!atomic_cmpset_long(pte, obits, pbits)) 6840 goto retry; 6841 if (obits & PG_G) 6842 pmap_invalidate_page(pmap, sva); 6843 else 6844 anychanged = true; 6845 } 6846 } 6847 } 6848 if (anychanged) 6849 pmap_invalidate_all(pmap); 6850 PMAP_UNLOCK(pmap); 6851 } 6852 6853 static bool 6854 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6855 { 6856 6857 if (pmap->pm_type != PT_EPT) 6858 return (false); 6859 return ((pde & EPT_PG_EXECUTE) != 0); 6860 } 6861 6862 #if VM_NRESERVLEVEL > 0 6863 /* 6864 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6865 * single page table page (PTP) to a single 2MB page mapping. For promotion 6866 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6867 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6868 * identical characteristics. 
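 *
 * Here "aligned, contiguous" means that the first PTE maps a 2MB-
 * aligned physical address pa and the PTE for the i-th 4KB page maps
 * exactly pa + i * PAGE_SIZE, while "identical characteristics" means
 * that the attribute bits covered by PG_PTE_PROMOTE (protection,
 * cache mode, and so on) are the same in all 512 PTEs.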
6869 */ 6870 static bool 6871 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, 6872 struct rwlock **lockp) 6873 { 6874 pd_entry_t newpde; 6875 pt_entry_t *firstpte, oldpte, pa, *pte; 6876 pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6877 int PG_PTE_CACHE; 6878 6879 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6880 if (!pmap_ps_enabled(pmap)) 6881 return (false); 6882 6883 PG_A = pmap_accessed_bit(pmap); 6884 PG_G = pmap_global_bit(pmap); 6885 PG_M = pmap_modified_bit(pmap); 6886 PG_V = pmap_valid_bit(pmap); 6887 PG_RW = pmap_rw_bit(pmap); 6888 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6889 PG_PTE_CACHE = pmap_cache_mask(pmap, false); 6890 6891 /* 6892 * Examine the first PTE in the specified PTP. Abort if this PTE is 6893 * ineligible for promotion due to hardware errata, invalid, or does 6894 * not map the first 4KB physical page within a 2MB page. 6895 */ 6896 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6897 newpde = *firstpte; 6898 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) 6899 return (false); 6900 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 6901 counter_u64_add(pmap_pde_p_failures, 1); 6902 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6903 " in pmap %p", va, pmap); 6904 return (false); 6905 } 6906 6907 /* 6908 * Both here and in the below "for" loop, to allow for repromotion 6909 * after MADV_FREE, conditionally write protect a clean PTE before 6910 * possibly aborting the promotion due to other PTE attributes. Why? 6911 * Suppose that MADV_FREE is applied to a part of a superpage, the 6912 * address range [S, E). pmap_advise() will demote the superpage 6913 * mapping, destroy the 4KB page mapping at the end of [S, E), and 6914 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 6915 * imagine that the memory in [S, E) is recycled, but the last 4KB 6916 * page in [S, E) is not the last to be rewritten, or simply accessed. 6917 * In other words, there is still a 4KB page in [S, E), call it P, 6918 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 6919 * we write protect P before aborting the promotion, if and when P is 6920 * finally rewritten, there won't be a page fault to trigger 6921 * repromotion. 6922 */ 6923 setpde: 6924 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6925 /* 6926 * When PG_M is already clear, PG_RW can be cleared without 6927 * a TLB invalidation. 6928 */ 6929 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6930 goto setpde; 6931 newpde &= ~PG_RW; 6932 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6933 " in pmap %p", va & ~PDRMASK, pmap); 6934 } 6935 6936 /* 6937 * Examine each of the other PTEs in the specified PTP. Abort if this 6938 * PTE maps an unexpected 4KB physical page or does not have identical 6939 * characteristics to the first PTE. 6940 */ 6941 allpte_PG_A = newpde & PG_A; 6942 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 6943 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6944 oldpte = *pte; 6945 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 6946 counter_u64_add(pmap_pde_p_failures, 1); 6947 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6948 " in pmap %p", va, pmap); 6949 return (false); 6950 } 6951 setpte: 6952 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6953 /* 6954 * When PG_M is already clear, PG_RW can be cleared 6955 * without a TLB invalidation. 
6956 */ 6957 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6958 goto setpte; 6959 oldpte &= ~PG_RW; 6960 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6961 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6962 (va & ~PDRMASK), pmap); 6963 } 6964 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 6965 counter_u64_add(pmap_pde_p_failures, 1); 6966 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6967 " in pmap %p", va, pmap); 6968 return (false); 6969 } 6970 allpte_PG_A &= oldpte; 6971 pa -= PAGE_SIZE; 6972 } 6973 6974 /* 6975 * Unless all PTEs have PG_A set, clear it from the superpage mapping, 6976 * so that promotions triggered by speculative mappings, such as 6977 * pmap_enter_quick(), don't automatically mark the underlying pages 6978 * as referenced. 6979 */ 6980 newpde &= ~PG_A | allpte_PG_A; 6981 6982 /* 6983 * EPT PTEs with PG_M set and PG_A clear are not supported by early 6984 * MMUs supporting EPT. 6985 */ 6986 KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde), 6987 ("unsupported EPT PTE")); 6988 6989 /* 6990 * Save the PTP in its current state until the PDE mapping the 6991 * superpage is demoted by pmap_demote_pde() or destroyed by 6992 * pmap_remove_pde(). If PG_A is not set in every PTE, then request 6993 * that the PTP be refilled on demotion. 6994 */ 6995 if (mpte == NULL) 6996 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6997 KASSERT(mpte >= vm_page_array && 6998 mpte < &vm_page_array[vm_page_array_size], 6999 ("pmap_promote_pde: page table page is out of range")); 7000 KASSERT(mpte->pindex == pmap_pde_pindex(va), 7001 ("pmap_promote_pde: page table page's pindex is wrong " 7002 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 7003 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 7004 if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) { 7005 counter_u64_add(pmap_pde_p_failures, 1); 7006 CTR2(KTR_PMAP, 7007 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 7008 pmap); 7009 return (false); 7010 } 7011 7012 /* 7013 * Promote the pv entries. 7014 */ 7015 if ((newpde & PG_MANAGED) != 0) 7016 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 7017 7018 /* 7019 * Propagate the PAT index to its proper position. 7020 */ 7021 newpde = pmap_swap_pat(pmap, newpde); 7022 7023 /* 7024 * Map the superpage. 
7025 */ 7026 if (workaround_erratum383) 7027 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 7028 else 7029 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 7030 7031 counter_u64_add(pmap_pde_promotions, 1); 7032 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 7033 " in pmap %p", va, pmap); 7034 return (true); 7035 } 7036 #endif /* VM_NRESERVLEVEL > 0 */ 7037 7038 static int 7039 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 7040 int psind) 7041 { 7042 vm_page_t mp; 7043 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 7044 7045 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7046 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 7047 ("psind %d unexpected", psind)); 7048 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 7049 ("unaligned phys address %#lx newpte %#lx psind %d", 7050 newpte & PG_FRAME, newpte, psind)); 7051 KASSERT((va & (pagesizes[psind] - 1)) == 0, 7052 ("unaligned va %#lx psind %d", va, psind)); 7053 KASSERT(va < VM_MAXUSER_ADDRESS, 7054 ("kernel mode non-transparent superpage")); /* XXXKIB */ 7055 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 7056 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 7057 7058 PG_V = pmap_valid_bit(pmap); 7059 7060 restart: 7061 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) 7062 return (KERN_PROTECTION_FAILURE); 7063 pten = newpte; 7064 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7065 pten |= pmap_pkru_get(pmap, va); 7066 7067 if (psind == 2) { /* 1G */ 7068 pml4e = pmap_pml4e(pmap, va); 7069 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7070 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 7071 NULL, va); 7072 if (mp == NULL) 7073 goto allocf; 7074 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7075 pdpe = &pdpe[pmap_pdpe_index(va)]; 7076 origpte = *pdpe; 7077 MPASS(origpte == 0); 7078 } else { 7079 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 7080 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 7081 origpte = *pdpe; 7082 if ((origpte & PG_V) == 0) { 7083 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7084 mp->ref_count++; 7085 } 7086 } 7087 *pdpe = pten; 7088 } else /* (psind == 1) */ { /* 2M */ 7089 pde = pmap_pde(pmap, va); 7090 if (pde == NULL) { 7091 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 7092 NULL, va); 7093 if (mp == NULL) 7094 goto allocf; 7095 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7096 pde = &pde[pmap_pde_index(va)]; 7097 origpte = *pde; 7098 MPASS(origpte == 0); 7099 } else { 7100 origpte = *pde; 7101 if ((origpte & PG_V) == 0) { 7102 pdpe = pmap_pdpe(pmap, va); 7103 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); 7104 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 7105 mp->ref_count++; 7106 } 7107 } 7108 *pde = pten; 7109 } 7110 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 7111 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 7112 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 7113 va, psind == 2 ? 
"1G" : "2M", origpte, pten)); 7114 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 7115 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 7116 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 7117 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 7118 if ((origpte & PG_V) == 0) 7119 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 7120 7121 return (KERN_SUCCESS); 7122 7123 allocf: 7124 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 7125 return (KERN_RESOURCE_SHORTAGE); 7126 PMAP_UNLOCK(pmap); 7127 vm_wait(NULL); 7128 PMAP_LOCK(pmap); 7129 goto restart; 7130 } 7131 7132 /* 7133 * Insert the given physical page (p) at 7134 * the specified virtual address (v) in the 7135 * target physical map with the protection requested. 7136 * 7137 * If specified, the page will be wired down, meaning 7138 * that the related pte can not be reclaimed. 7139 * 7140 * NB: This is the only routine which MAY NOT lazy-evaluate 7141 * or lose information. That is, this routine must actually 7142 * insert this page into the given map NOW. 7143 * 7144 * When destroying both a page table and PV entry, this function 7145 * performs the TLB invalidation before releasing the PV list 7146 * lock, so we do not need pmap_delayed_invl_page() calls here. 7147 */ 7148 int 7149 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7150 u_int flags, int8_t psind) 7151 { 7152 struct rwlock *lock; 7153 pd_entry_t *pde; 7154 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 7155 pt_entry_t newpte, origpte; 7156 pv_entry_t pv; 7157 vm_paddr_t opa, pa; 7158 vm_page_t mpte, om; 7159 int rv; 7160 bool nosleep; 7161 7162 PG_A = pmap_accessed_bit(pmap); 7163 PG_G = pmap_global_bit(pmap); 7164 PG_M = pmap_modified_bit(pmap); 7165 PG_V = pmap_valid_bit(pmap); 7166 PG_RW = pmap_rw_bit(pmap); 7167 7168 va = trunc_page(va); 7169 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7170 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7171 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7172 va)); 7173 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7174 ("pmap_enter: managed mapping within the clean submap")); 7175 if ((m->oflags & VPO_UNMANAGED) == 0) 7176 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7177 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7178 ("pmap_enter: flags %u has reserved bits set", flags)); 7179 pa = VM_PAGE_TO_PHYS(m); 7180 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7181 if ((flags & VM_PROT_WRITE) != 0) 7182 newpte |= PG_M; 7183 if ((prot & VM_PROT_WRITE) != 0) 7184 newpte |= PG_RW; 7185 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7186 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7187 if ((prot & VM_PROT_EXECUTE) == 0) 7188 newpte |= pg_nx; 7189 if ((flags & PMAP_ENTER_WIRED) != 0) 7190 newpte |= PG_W; 7191 if (va < VM_MAXUSER_ADDRESS) 7192 newpte |= PG_U; 7193 if (pmap == kernel_pmap) 7194 newpte |= PG_G; 7195 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7196 7197 /* 7198 * Set modified bit gratuitously for writeable mappings if 7199 * the page is unmanaged. We do not want to take a fault 7200 * to do the dirty bit accounting for these mappings. 
7201 */ 7202 if ((m->oflags & VPO_UNMANAGED) != 0) { 7203 if ((newpte & PG_RW) != 0) 7204 newpte |= PG_M; 7205 } else 7206 newpte |= PG_MANAGED; 7207 7208 lock = NULL; 7209 PMAP_LOCK(pmap); 7210 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7211 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7212 ("managed largepage va %#lx flags %#x", va, flags)); 7213 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7214 psind); 7215 goto out; 7216 } 7217 if (psind == 1) { 7218 /* Assert the required virtual and physical alignment. */ 7219 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7220 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7221 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7222 goto out; 7223 } 7224 mpte = NULL; 7225 7226 /* 7227 * In the case that a page table page is not 7228 * resident, we are creating it here. 7229 */ 7230 retry: 7231 pde = pmap_pde(pmap, va); 7232 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7233 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7234 pte = pmap_pde_to_pte(pde, va); 7235 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7236 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7237 mpte->ref_count++; 7238 } 7239 } else if (va < VM_MAXUSER_ADDRESS) { 7240 /* 7241 * Here if the pte page isn't mapped, or if it has been 7242 * deallocated. 7243 */ 7244 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7245 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7246 nosleep ? NULL : &lock, va); 7247 if (mpte == NULL && nosleep) { 7248 rv = KERN_RESOURCE_SHORTAGE; 7249 goto out; 7250 } 7251 goto retry; 7252 } else 7253 panic("pmap_enter: invalid page directory va=%#lx", va); 7254 7255 origpte = *pte; 7256 pv = NULL; 7257 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7258 newpte |= pmap_pkru_get(pmap, va); 7259 7260 /* 7261 * Is the specified virtual address already mapped? 7262 */ 7263 if ((origpte & PG_V) != 0) { 7264 /* 7265 * Wiring change, just update stats. We don't worry about 7266 * wiring PT pages as they remain resident as long as there 7267 * are valid mappings in them. Hence, if a user page is wired, 7268 * the PT page will be also. 7269 */ 7270 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7271 pmap->pm_stats.wired_count++; 7272 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7273 pmap->pm_stats.wired_count--; 7274 7275 /* 7276 * Remove the extra PT page reference. 7277 */ 7278 if (mpte != NULL) { 7279 mpte->ref_count--; 7280 KASSERT(mpte->ref_count > 0, 7281 ("pmap_enter: missing reference to page table page," 7282 " va: 0x%lx", va)); 7283 } 7284 7285 /* 7286 * Has the physical page changed? 7287 */ 7288 opa = origpte & PG_FRAME; 7289 if (opa == pa) { 7290 /* 7291 * No, might be a protection or wiring change. 7292 */ 7293 if ((origpte & PG_MANAGED) != 0 && 7294 (newpte & PG_RW) != 0) 7295 vm_page_aflag_set(m, PGA_WRITEABLE); 7296 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7297 goto unchanged; 7298 goto validate; 7299 } 7300 7301 /* 7302 * The physical page has changed. Temporarily invalidate 7303 * the mapping. This ensures that all threads sharing the 7304 * pmap keep a consistent view of the mapping, which is 7305 * necessary for the correct handling of COW faults. It 7306 * also permits reuse of the old mapping's PV entry, 7307 * avoiding an allocation. 7308 * 7309 * For consistency, handle unmanaged mappings the same way. 
7310 */ 7311 origpte = pte_load_clear(pte); 7312 KASSERT((origpte & PG_FRAME) == opa, 7313 ("pmap_enter: unexpected pa update for %#lx", va)); 7314 if ((origpte & PG_MANAGED) != 0) { 7315 om = PHYS_TO_VM_PAGE(opa); 7316 7317 /* 7318 * The pmap lock is sufficient to synchronize with 7319 * concurrent calls to pmap_page_test_mappings() and 7320 * pmap_ts_referenced(). 7321 */ 7322 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7323 vm_page_dirty(om); 7324 if ((origpte & PG_A) != 0) { 7325 pmap_invalidate_page(pmap, va); 7326 vm_page_aflag_set(om, PGA_REFERENCED); 7327 } 7328 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7329 pv = pmap_pvh_remove(&om->md, pmap, va); 7330 KASSERT(pv != NULL, 7331 ("pmap_enter: no PV entry for %#lx", va)); 7332 if ((newpte & PG_MANAGED) == 0) 7333 free_pv_entry(pmap, pv); 7334 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7335 TAILQ_EMPTY(&om->md.pv_list) && 7336 ((om->flags & PG_FICTITIOUS) != 0 || 7337 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7338 vm_page_aflag_clear(om, PGA_WRITEABLE); 7339 } else { 7340 /* 7341 * Since this mapping is unmanaged, assume that PG_A 7342 * is set. 7343 */ 7344 pmap_invalidate_page(pmap, va); 7345 } 7346 origpte = 0; 7347 } else { 7348 /* 7349 * Increment the counters. 7350 */ 7351 if ((newpte & PG_W) != 0) 7352 pmap->pm_stats.wired_count++; 7353 pmap_resident_count_adj(pmap, 1); 7354 } 7355 7356 /* 7357 * Enter on the PV list if part of our managed memory. 7358 */ 7359 if ((newpte & PG_MANAGED) != 0) { 7360 if (pv == NULL) { 7361 pv = get_pv_entry(pmap, &lock); 7362 pv->pv_va = va; 7363 } 7364 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7365 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7366 m->md.pv_gen++; 7367 if ((newpte & PG_RW) != 0) 7368 vm_page_aflag_set(m, PGA_WRITEABLE); 7369 } 7370 7371 /* 7372 * Update the PTE. 7373 */ 7374 if ((origpte & PG_V) != 0) { 7375 validate: 7376 origpte = pte_load_store(pte, newpte); 7377 KASSERT((origpte & PG_FRAME) == pa, 7378 ("pmap_enter: unexpected pa update for %#lx", va)); 7379 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7380 (PG_M | PG_RW)) { 7381 if ((origpte & PG_MANAGED) != 0) 7382 vm_page_dirty(m); 7383 7384 /* 7385 * Although the PTE may still have PG_RW set, TLB 7386 * invalidation may nonetheless be required because 7387 * the PTE no longer has PG_M set. 7388 */ 7389 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7390 /* 7391 * This PTE change does not require TLB invalidation. 7392 */ 7393 goto unchanged; 7394 } 7395 if ((origpte & PG_A) != 0) 7396 pmap_invalidate_page(pmap, va); 7397 } else 7398 pte_store(pte, newpte); 7399 7400 unchanged: 7401 7402 #if VM_NRESERVLEVEL > 0 7403 /* 7404 * If both the page table page and the reservation are fully 7405 * populated, then attempt promotion. 7406 */ 7407 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7408 (m->flags & PG_FICTITIOUS) == 0 && 7409 vm_reserv_level_iffullpop(m) == 0) 7410 (void)pmap_promote_pde(pmap, pde, va, mpte, &lock); 7411 #endif 7412 7413 rv = KERN_SUCCESS; 7414 out: 7415 if (lock != NULL) 7416 rw_wunlock(lock); 7417 PMAP_UNLOCK(pmap); 7418 return (rv); 7419 } 7420 7421 /* 7422 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 7423 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 7424 * value. See pmap_enter_pde() for the possible error values when "no sleep", 7425 * "no replace", and "no reclaim" are specified. 
7426 */ 7427 static int 7428 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7429 struct rwlock **lockp) 7430 { 7431 pd_entry_t newpde; 7432 pt_entry_t PG_V; 7433 7434 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7435 PG_V = pmap_valid_bit(pmap); 7436 newpde = VM_PAGE_TO_PHYS(m) | 7437 pmap_cache_bits(pmap, m->md.pat_mode, true) | PG_PS | PG_V; 7438 if ((m->oflags & VPO_UNMANAGED) == 0) 7439 newpde |= PG_MANAGED; 7440 if ((prot & VM_PROT_EXECUTE) == 0) 7441 newpde |= pg_nx; 7442 if (va < VM_MAXUSER_ADDRESS) 7443 newpde |= PG_U; 7444 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7445 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 7446 } 7447 7448 /* 7449 * Returns true if every page table entry in the specified page table page is 7450 * zero. 7451 */ 7452 static bool 7453 pmap_every_pte_zero(vm_paddr_t pa) 7454 { 7455 pt_entry_t *pt_end, *pte; 7456 7457 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7458 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7459 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7460 if (*pte != 0) 7461 return (false); 7462 } 7463 return (true); 7464 } 7465 7466 /* 7467 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7468 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 7469 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. Returns 7470 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB 7471 * page mapping already exists within the 2MB virtual address range starting 7472 * at the specified virtual address or (2) the requested 2MB page mapping is 7473 * not supported due to hardware errata. Returns KERN_NO_SPACE if 7474 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at 7475 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU 7476 * settings are not the same across the 2MB virtual address range starting at 7477 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either 7478 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation 7479 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation 7480 * failed. 7481 * 7482 * The parameter "m" is only used when creating a managed, writeable mapping. 7483 */ 7484 static int 7485 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7486 vm_page_t m, struct rwlock **lockp) 7487 { 7488 struct spglist free; 7489 pd_entry_t oldpde, *pde; 7490 pt_entry_t PG_G, PG_RW, PG_V; 7491 vm_page_t mt, pdpg; 7492 vm_page_t uwptpg; 7493 7494 PG_G = pmap_global_bit(pmap); 7495 PG_RW = pmap_rw_bit(pmap); 7496 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7497 ("pmap_enter_pde: newpde is missing PG_M")); 7498 PG_V = pmap_valid_bit(pmap); 7499 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7500 7501 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7502 newpde))) { 7503 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7504 " in pmap %p", va, pmap); 7505 return (KERN_FAILURE); 7506 } 7507 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7508 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7509 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7510 " in pmap %p", va, pmap); 7511 return (KERN_RESOURCE_SHORTAGE); 7512 } 7513 7514 /* 7515 * If pkru is not same for the whole pde range, return failure 7516 * and let vm_fault() cope. Check after pde allocation, since 7517 * it could sleep. 
7518 */ 7519 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 7520 pmap_abort_ptp(pmap, va, pdpg); 7521 return (KERN_PROTECTION_FAILURE); 7522 } 7523 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 7524 newpde &= ~X86_PG_PKU_MASK; 7525 newpde |= pmap_pkru_get(pmap, va); 7526 } 7527 7528 /* 7529 * If there are existing mappings, either abort or remove them. 7530 */ 7531 oldpde = *pde; 7532 if ((oldpde & PG_V) != 0) { 7533 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7534 ("pmap_enter_pde: pdpg's reference count is too low")); 7535 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 7536 if ((oldpde & PG_PS) != 0) { 7537 if (pdpg != NULL) 7538 pdpg->ref_count--; 7539 CTR2(KTR_PMAP, 7540 "pmap_enter_pde: no space for va %#lx" 7541 " in pmap %p", va, pmap); 7542 return (KERN_NO_SPACE); 7543 } else if (va < VM_MAXUSER_ADDRESS || 7544 !pmap_every_pte_zero(oldpde & PG_FRAME)) { 7545 if (pdpg != NULL) 7546 pdpg->ref_count--; 7547 CTR2(KTR_PMAP, 7548 "pmap_enter_pde: failure for va %#lx" 7549 " in pmap %p", va, pmap); 7550 return (KERN_FAILURE); 7551 } 7552 } 7553 /* Break the existing mapping(s). */ 7554 SLIST_INIT(&free); 7555 if ((oldpde & PG_PS) != 0) { 7556 /* 7557 * The reference to the PD page that was acquired by 7558 * pmap_alloc_pde() ensures that it won't be freed. 7559 * However, if the PDE resulted from a promotion, then 7560 * a reserved PT page could be freed. 7561 */ 7562 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7563 if ((oldpde & PG_G) == 0) 7564 pmap_invalidate_pde_page(pmap, va, oldpde); 7565 } else { 7566 pmap_delayed_invl_start(); 7567 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7568 lockp)) 7569 pmap_invalidate_all(pmap); 7570 pmap_delayed_invl_finish(); 7571 } 7572 if (va < VM_MAXUSER_ADDRESS) { 7573 vm_page_free_pages_toq(&free, true); 7574 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7575 pde)); 7576 } else { 7577 KASSERT(SLIST_EMPTY(&free), 7578 ("pmap_enter_pde: freed kernel page table page")); 7579 7580 /* 7581 * Both pmap_remove_pde() and pmap_remove_ptes() will 7582 * leave the kernel page table page zero filled. 7583 */ 7584 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7585 if (pmap_insert_pt_page(pmap, mt, false, false)) 7586 panic("pmap_enter_pde: trie insert failed"); 7587 } 7588 } 7589 7590 /* 7591 * Allocate leaf ptpage for wired userspace pages. 7592 */ 7593 uwptpg = NULL; 7594 if ((newpde & PG_W) != 0 && pmap != kernel_pmap) { 7595 uwptpg = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 7596 VM_ALLOC_WIRED); 7597 if (uwptpg == NULL) 7598 return (KERN_RESOURCE_SHORTAGE); 7599 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 7600 pmap_free_pt_page(pmap, uwptpg, false); 7601 return (KERN_RESOURCE_SHORTAGE); 7602 } 7603 7604 uwptpg->ref_count = NPTEPG; 7605 } 7606 if ((newpde & PG_MANAGED) != 0) { 7607 /* 7608 * Abort this mapping if its PV entry could not be created. 7609 */ 7610 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7611 if (pdpg != NULL) 7612 pmap_abort_ptp(pmap, va, pdpg); 7613 if (uwptpg != NULL) { 7614 mt = pmap_remove_pt_page(pmap, va); 7615 KASSERT(mt == uwptpg, 7616 ("removed pt page %p, expected %p", mt, 7617 uwptpg)); 7618 uwptpg->ref_count = 1; 7619 pmap_free_pt_page(pmap, uwptpg, false); 7620 } 7621 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7622 " in pmap %p", va, pmap); 7623 return (KERN_RESOURCE_SHORTAGE); 7624 } 7625 if ((newpde & PG_RW) != 0) { 7626 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7627 vm_page_aflag_set(mt, PGA_WRITEABLE); 7628 } 7629 } 7630 7631 /* 7632 * Increment counters. 
7633 */ 7634 if ((newpde & PG_W) != 0) 7635 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7636 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7637 7638 /* 7639 * Map the superpage. (This is not a promoted mapping; there will not 7640 * be any lingering 4KB page mappings in the TLB.) 7641 */ 7642 pde_store(pde, newpde); 7643 7644 counter_u64_add(pmap_pde_mappings, 1); 7645 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7646 va, pmap); 7647 return (KERN_SUCCESS); 7648 } 7649 7650 /* 7651 * Maps a sequence of resident pages belonging to the same object. 7652 * The sequence begins with the given page m_start. This page is 7653 * mapped at the given virtual address start. Each subsequent page is 7654 * mapped at a virtual address that is offset from start by the same 7655 * amount as the page is offset from m_start within the object. The 7656 * last page in the sequence is the page with the largest offset from 7657 * m_start that can be mapped at a virtual address less than the given 7658 * virtual address end. Not every virtual page between start and end 7659 * is mapped; only those for which a resident page exists with the 7660 * corresponding offset from m_start are mapped. 7661 */ 7662 void 7663 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7664 vm_page_t m_start, vm_prot_t prot) 7665 { 7666 struct rwlock *lock; 7667 vm_offset_t va; 7668 vm_page_t m, mpte; 7669 vm_pindex_t diff, psize; 7670 int rv; 7671 7672 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7673 7674 psize = atop(end - start); 7675 mpte = NULL; 7676 m = m_start; 7677 lock = NULL; 7678 PMAP_LOCK(pmap); 7679 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7680 va = start + ptoa(diff); 7681 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7682 m->psind == 1 && pmap_ps_enabled(pmap) && 7683 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 7684 KERN_SUCCESS || rv == KERN_NO_SPACE)) 7685 m = &m[NBPDR / PAGE_SIZE - 1]; 7686 else 7687 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7688 mpte, &lock); 7689 m = TAILQ_NEXT(m, listq); 7690 } 7691 if (lock != NULL) 7692 rw_wunlock(lock); 7693 PMAP_UNLOCK(pmap); 7694 } 7695 7696 /* 7697 * this code makes some *MAJOR* assumptions: 7698 * 1. Current pmap & pmap exists. 7699 * 2. Not wired. 7700 * 3. Read access. 7701 * 4. No page table pages. 7702 * but is *MUCH* faster than pmap_enter... 7703 */ 7704 7705 void 7706 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7707 { 7708 struct rwlock *lock; 7709 7710 lock = NULL; 7711 PMAP_LOCK(pmap); 7712 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7713 if (lock != NULL) 7714 rw_wunlock(lock); 7715 PMAP_UNLOCK(pmap); 7716 } 7717 7718 static vm_page_t 7719 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7720 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7721 { 7722 pd_entry_t *pde; 7723 pt_entry_t newpte, *pte, PG_V; 7724 7725 KASSERT(!VA_IS_CLEANMAP(va) || 7726 (m->oflags & VPO_UNMANAGED) != 0, 7727 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7728 PG_V = pmap_valid_bit(pmap); 7729 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7730 pde = NULL; 7731 7732 /* 7733 * In the case that a page table page is not 7734 * resident, we are creating it here. 
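 * (Added note: this applies only to user addresses.  For kernel
 * addresses, the page table page is assumed to be resident already,
 * so vtopte() is used directly below instead.)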
7735 */ 7736 if (va < VM_MAXUSER_ADDRESS) { 7737 pdp_entry_t *pdpe; 7738 vm_pindex_t ptepindex; 7739 7740 /* 7741 * Calculate pagetable page index 7742 */ 7743 ptepindex = pmap_pde_pindex(va); 7744 if (mpte && (mpte->pindex == ptepindex)) { 7745 mpte->ref_count++; 7746 } else { 7747 /* 7748 * If the page table page is mapped, we just increment 7749 * the hold count, and activate it. Otherwise, we 7750 * attempt to allocate a page table page, passing NULL 7751 * instead of the PV list lock pointer because we don't 7752 * intend to sleep. If this attempt fails, we don't 7753 * retry. Instead, we give up. 7754 */ 7755 pdpe = pmap_pdpe(pmap, va); 7756 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 7757 if ((*pdpe & PG_PS) != 0) 7758 return (NULL); 7759 pde = pmap_pdpe_to_pde(pdpe, va); 7760 if ((*pde & PG_V) != 0) { 7761 if ((*pde & PG_PS) != 0) 7762 return (NULL); 7763 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7764 mpte->ref_count++; 7765 } else { 7766 mpte = pmap_allocpte_alloc(pmap, 7767 ptepindex, NULL, va); 7768 if (mpte == NULL) 7769 return (NULL); 7770 } 7771 } else { 7772 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7773 NULL, va); 7774 if (mpte == NULL) 7775 return (NULL); 7776 } 7777 } 7778 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7779 pte = &pte[pmap_pte_index(va)]; 7780 } else { 7781 mpte = NULL; 7782 pte = vtopte(va); 7783 } 7784 if (*pte) { 7785 if (mpte != NULL) 7786 mpte->ref_count--; 7787 return (NULL); 7788 } 7789 7790 /* 7791 * Enter on the PV list if part of our managed memory. 7792 */ 7793 if ((m->oflags & VPO_UNMANAGED) == 0 && 7794 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7795 if (mpte != NULL) 7796 pmap_abort_ptp(pmap, va, mpte); 7797 return (NULL); 7798 } 7799 7800 /* 7801 * Increment counters 7802 */ 7803 pmap_resident_count_adj(pmap, 1); 7804 7805 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7806 pmap_cache_bits(pmap, m->md.pat_mode, false); 7807 if ((m->oflags & VPO_UNMANAGED) == 0) 7808 newpte |= PG_MANAGED; 7809 if ((prot & VM_PROT_EXECUTE) == 0) 7810 newpte |= pg_nx; 7811 if (va < VM_MAXUSER_ADDRESS) 7812 newpte |= PG_U | pmap_pkru_get(pmap, va); 7813 pte_store(pte, newpte); 7814 7815 #if VM_NRESERVLEVEL > 0 7816 /* 7817 * If both the PTP and the reservation are fully populated, then 7818 * attempt promotion. 7819 */ 7820 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7821 (m->flags & PG_FICTITIOUS) == 0 && 7822 vm_reserv_level_iffullpop(m) == 0) { 7823 if (pde == NULL) 7824 pde = pmap_pde(pmap, va); 7825 7826 /* 7827 * If promotion succeeds, then the next call to this function 7828 * should not be given the unmapped PTP as a hint. 7829 */ 7830 if (pmap_promote_pde(pmap, pde, va, mpte, lockp)) 7831 mpte = NULL; 7832 } 7833 #endif 7834 7835 return (mpte); 7836 } 7837 7838 /* 7839 * Make a temporary mapping for a physical address. This is only intended 7840 * to be used for panic dumps. 7841 */ 7842 void * 7843 pmap_kenter_temporary(vm_paddr_t pa, int i) 7844 { 7845 vm_offset_t va; 7846 7847 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7848 pmap_kenter(va, pa); 7849 pmap_invlpg(kernel_pmap, va); 7850 return ((void *)crashdumpmap); 7851 } 7852 7853 /* 7854 * This code maps large physical mmap regions into the 7855 * processor address space. Note that some shortcuts 7856 * are taken, but the code works. 
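 * (Added note: only OBJT_DEVICE and OBJT_SG objects are expected here,
 * and mappings are created only when superpages are enabled and both
 * the address and the size are 2MB aligned; otherwise this function
 * quietly does nothing, as the checks below show.)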
7857 */ 7858 void 7859 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7860 vm_pindex_t pindex, vm_size_t size) 7861 { 7862 pd_entry_t *pde; 7863 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7864 vm_paddr_t pa, ptepa; 7865 vm_page_t p, pdpg; 7866 int pat_mode; 7867 7868 PG_A = pmap_accessed_bit(pmap); 7869 PG_M = pmap_modified_bit(pmap); 7870 PG_V = pmap_valid_bit(pmap); 7871 PG_RW = pmap_rw_bit(pmap); 7872 7873 VM_OBJECT_ASSERT_WLOCKED(object); 7874 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7875 ("pmap_object_init_pt: non-device object")); 7876 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7877 if (!pmap_ps_enabled(pmap)) 7878 return; 7879 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7880 return; 7881 p = vm_page_lookup(object, pindex); 7882 KASSERT(vm_page_all_valid(p), 7883 ("pmap_object_init_pt: invalid page %p", p)); 7884 pat_mode = p->md.pat_mode; 7885 7886 /* 7887 * Abort the mapping if the first page is not physically 7888 * aligned to a 2MB page boundary. 7889 */ 7890 ptepa = VM_PAGE_TO_PHYS(p); 7891 if (ptepa & (NBPDR - 1)) 7892 return; 7893 7894 /* 7895 * Skip the first page. Abort the mapping if the rest of 7896 * the pages are not physically contiguous or have differing 7897 * memory attributes. 7898 */ 7899 p = TAILQ_NEXT(p, listq); 7900 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7901 pa += PAGE_SIZE) { 7902 KASSERT(vm_page_all_valid(p), 7903 ("pmap_object_init_pt: invalid page %p", p)); 7904 if (pa != VM_PAGE_TO_PHYS(p) || 7905 pat_mode != p->md.pat_mode) 7906 return; 7907 p = TAILQ_NEXT(p, listq); 7908 } 7909 7910 /* 7911 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7912 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7913 * will not affect the termination of this loop. 7914 */ 7915 PMAP_LOCK(pmap); 7916 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, true); 7917 pa < ptepa + size; pa += NBPDR) { 7918 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7919 if (pde == NULL) { 7920 /* 7921 * The creation of mappings below is only an 7922 * optimization. If a page directory page 7923 * cannot be allocated without blocking, 7924 * continue on to the next mapping rather than 7925 * blocking. 7926 */ 7927 addr += NBPDR; 7928 continue; 7929 } 7930 if ((*pde & PG_V) == 0) { 7931 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7932 PG_U | PG_RW | PG_V); 7933 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7934 counter_u64_add(pmap_pde_mappings, 1); 7935 } else { 7936 /* Continue on if the PDE is already valid. */ 7937 pdpg->ref_count--; 7938 KASSERT(pdpg->ref_count > 0, 7939 ("pmap_object_init_pt: missing reference " 7940 "to page directory page, va: 0x%lx", addr)); 7941 } 7942 addr += NBPDR; 7943 } 7944 PMAP_UNLOCK(pmap); 7945 } 7946 } 7947 7948 /* 7949 * Clear the wired attribute from the mappings for the specified range of 7950 * addresses in the given pmap. Every valid mapping within that range 7951 * must have the wired attribute set. In contrast, invalid mappings 7952 * cannot have the wired attribute set, so they are ignored. 7953 * 7954 * The wired attribute of the page table entry is not a hardware 7955 * feature, so there is no need to invalidate any TLB entries. 7956 * Since pmap_demote_pde() for the wired entry must never fail, 7957 * pmap_delayed_invl_start()/finish() calls around the 7958 * function are not needed. 
7959 */ 7960 void 7961 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7962 { 7963 vm_offset_t va_next; 7964 pml4_entry_t *pml4e; 7965 pdp_entry_t *pdpe; 7966 pd_entry_t *pde; 7967 pt_entry_t *pte, PG_V, PG_G __diagused; 7968 7969 PG_V = pmap_valid_bit(pmap); 7970 PG_G = pmap_global_bit(pmap); 7971 PMAP_LOCK(pmap); 7972 for (; sva < eva; sva = va_next) { 7973 pml4e = pmap_pml4e(pmap, sva); 7974 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7975 va_next = (sva + NBPML4) & ~PML4MASK; 7976 if (va_next < sva) 7977 va_next = eva; 7978 continue; 7979 } 7980 7981 va_next = (sva + NBPDP) & ~PDPMASK; 7982 if (va_next < sva) 7983 va_next = eva; 7984 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7985 if ((*pdpe & PG_V) == 0) 7986 continue; 7987 if ((*pdpe & PG_PS) != 0) { 7988 KASSERT(va_next <= eva, 7989 ("partial update of non-transparent 1G mapping " 7990 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7991 *pdpe, sva, eva, va_next)); 7992 MPASS(pmap != kernel_pmap); /* XXXKIB */ 7993 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 7994 atomic_clear_long(pdpe, PG_W); 7995 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 7996 continue; 7997 } 7998 7999 va_next = (sva + NBPDR) & ~PDRMASK; 8000 if (va_next < sva) 8001 va_next = eva; 8002 pde = pmap_pdpe_to_pde(pdpe, sva); 8003 if ((*pde & PG_V) == 0) 8004 continue; 8005 if ((*pde & PG_PS) != 0) { 8006 if ((*pde & PG_W) == 0) 8007 panic("pmap_unwire: pde %#jx is missing PG_W", 8008 (uintmax_t)*pde); 8009 8010 /* 8011 * Are we unwiring the entire large page? If not, 8012 * demote the mapping and fall through. 8013 */ 8014 if (sva + NBPDR == va_next && eva >= va_next) { 8015 atomic_clear_long(pde, PG_W); 8016 pmap->pm_stats.wired_count -= NBPDR / 8017 PAGE_SIZE; 8018 continue; 8019 } else if (!pmap_demote_pde(pmap, pde, sva)) 8020 panic("pmap_unwire: demotion failed"); 8021 } 8022 if (va_next > eva) 8023 va_next = eva; 8024 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 8025 sva += PAGE_SIZE) { 8026 if ((*pte & PG_V) == 0) 8027 continue; 8028 if ((*pte & PG_W) == 0) 8029 panic("pmap_unwire: pte %#jx is missing PG_W", 8030 (uintmax_t)*pte); 8031 8032 /* 8033 * PG_W must be cleared atomically. Although the pmap 8034 * lock synchronizes access to PG_W, another processor 8035 * could be setting PG_M and/or PG_A concurrently. 8036 */ 8037 atomic_clear_long(pte, PG_W); 8038 pmap->pm_stats.wired_count--; 8039 } 8040 } 8041 PMAP_UNLOCK(pmap); 8042 } 8043 8044 /* 8045 * Copy the range specified by src_addr/len 8046 * from the source map to the range dst_addr/len 8047 * in the destination map. 8048 * 8049 * This routine is only advisory and need not do anything. 8050 */ 8051 void 8052 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 8053 vm_offset_t src_addr) 8054 { 8055 struct rwlock *lock; 8056 pml4_entry_t *pml4e; 8057 pdp_entry_t *pdpe; 8058 pd_entry_t *pde, srcptepaddr; 8059 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 8060 vm_offset_t addr, end_addr, va_next; 8061 vm_page_t dst_pdpg, dstmpte, srcmpte; 8062 8063 if (dst_addr != src_addr) 8064 return; 8065 8066 if (dst_pmap->pm_type != src_pmap->pm_type) 8067 return; 8068 8069 /* 8070 * EPT page table entries that require emulation of A/D bits are 8071 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 8072 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 8073 * (aka EPT_PG_EXECUTE) could still be set. 
Since some EPT 8074 * implementations flag an EPT misconfiguration for exec-only 8075 * mappings we skip this function entirely for emulated pmaps. 8076 */ 8077 if (pmap_emulate_ad_bits(dst_pmap)) 8078 return; 8079 8080 end_addr = src_addr + len; 8081 lock = NULL; 8082 if (dst_pmap < src_pmap) { 8083 PMAP_LOCK(dst_pmap); 8084 PMAP_LOCK(src_pmap); 8085 } else { 8086 PMAP_LOCK(src_pmap); 8087 PMAP_LOCK(dst_pmap); 8088 } 8089 8090 PG_A = pmap_accessed_bit(dst_pmap); 8091 PG_M = pmap_modified_bit(dst_pmap); 8092 PG_V = pmap_valid_bit(dst_pmap); 8093 8094 for (addr = src_addr; addr < end_addr; addr = va_next) { 8095 KASSERT(addr < UPT_MIN_ADDRESS, 8096 ("pmap_copy: invalid to pmap_copy page tables")); 8097 8098 pml4e = pmap_pml4e(src_pmap, addr); 8099 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 8100 va_next = (addr + NBPML4) & ~PML4MASK; 8101 if (va_next < addr) 8102 va_next = end_addr; 8103 continue; 8104 } 8105 8106 va_next = (addr + NBPDP) & ~PDPMASK; 8107 if (va_next < addr) 8108 va_next = end_addr; 8109 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 8110 if ((*pdpe & PG_V) == 0) 8111 continue; 8112 if ((*pdpe & PG_PS) != 0) { 8113 KASSERT(va_next <= end_addr, 8114 ("partial update of non-transparent 1G mapping " 8115 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8116 *pdpe, addr, end_addr, va_next)); 8117 MPASS((addr & PDPMASK) == 0); 8118 MPASS((*pdpe & PG_MANAGED) == 0); 8119 srcptepaddr = *pdpe; 8120 pdpe = pmap_pdpe(dst_pmap, addr); 8121 if (pdpe == NULL) { 8122 if (pmap_allocpte_alloc(dst_pmap, 8123 pmap_pml4e_pindex(addr), NULL, addr) == 8124 NULL) 8125 break; 8126 pdpe = pmap_pdpe(dst_pmap, addr); 8127 } else { 8128 pml4e = pmap_pml4e(dst_pmap, addr); 8129 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 8130 dst_pdpg->ref_count++; 8131 } 8132 KASSERT(*pdpe == 0, 8133 ("1G mapping present in dst pmap " 8134 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8135 *pdpe, addr, end_addr, va_next)); 8136 *pdpe = srcptepaddr & ~PG_W; 8137 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 8138 continue; 8139 } 8140 8141 va_next = (addr + NBPDR) & ~PDRMASK; 8142 if (va_next < addr) 8143 va_next = end_addr; 8144 8145 pde = pmap_pdpe_to_pde(pdpe, addr); 8146 srcptepaddr = *pde; 8147 if (srcptepaddr == 0) 8148 continue; 8149 8150 if (srcptepaddr & PG_PS) { 8151 /* 8152 * We can only virtual copy whole superpages. 8153 */ 8154 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 8155 continue; 8156 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 8157 if (pde == NULL) 8158 break; 8159 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 8160 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 8161 PMAP_ENTER_NORECLAIM, &lock))) { 8162 /* 8163 * We leave the dirty bit unchanged because 8164 * managed read/write superpage mappings are 8165 * required to be dirty. However, managed 8166 * superpage mappings are not required to 8167 * have their accessed bit set, so we clear 8168 * it because we don't know if this mapping 8169 * will be used. 
8170 */ 8171 srcptepaddr &= ~PG_W; 8172 if ((srcptepaddr & PG_MANAGED) != 0) 8173 srcptepaddr &= ~PG_A; 8174 *pde = srcptepaddr; 8175 pmap_resident_count_adj(dst_pmap, NBPDR / 8176 PAGE_SIZE); 8177 counter_u64_add(pmap_pde_mappings, 1); 8178 } else 8179 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 8180 continue; 8181 } 8182 8183 srcptepaddr &= PG_FRAME; 8184 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 8185 KASSERT(srcmpte->ref_count > 0, 8186 ("pmap_copy: source page table page is unused")); 8187 8188 if (va_next > end_addr) 8189 va_next = end_addr; 8190 8191 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 8192 src_pte = &src_pte[pmap_pte_index(addr)]; 8193 dstmpte = NULL; 8194 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 8195 ptetemp = *src_pte; 8196 8197 /* 8198 * We only virtual copy managed pages. 8199 */ 8200 if ((ptetemp & PG_MANAGED) == 0) 8201 continue; 8202 8203 if (dstmpte != NULL) { 8204 KASSERT(dstmpte->pindex == 8205 pmap_pde_pindex(addr), 8206 ("dstmpte pindex/addr mismatch")); 8207 dstmpte->ref_count++; 8208 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 8209 NULL)) == NULL) 8210 goto out; 8211 dst_pte = (pt_entry_t *) 8212 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 8213 dst_pte = &dst_pte[pmap_pte_index(addr)]; 8214 if (*dst_pte == 0 && 8215 pmap_try_insert_pv_entry(dst_pmap, addr, 8216 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 8217 /* 8218 * Clear the wired, modified, and accessed 8219 * (referenced) bits during the copy. 8220 */ 8221 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 8222 pmap_resident_count_adj(dst_pmap, 1); 8223 } else { 8224 pmap_abort_ptp(dst_pmap, addr, dstmpte); 8225 goto out; 8226 } 8227 /* Have we copied all of the valid mappings? */ 8228 if (dstmpte->ref_count >= srcmpte->ref_count) 8229 break; 8230 } 8231 } 8232 out: 8233 if (lock != NULL) 8234 rw_wunlock(lock); 8235 PMAP_UNLOCK(src_pmap); 8236 PMAP_UNLOCK(dst_pmap); 8237 } 8238 8239 int 8240 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8241 { 8242 int error; 8243 8244 if (dst_pmap->pm_type != src_pmap->pm_type || 8245 dst_pmap->pm_type != PT_X86 || 8246 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8247 return (0); 8248 for (;;) { 8249 if (dst_pmap < src_pmap) { 8250 PMAP_LOCK(dst_pmap); 8251 PMAP_LOCK(src_pmap); 8252 } else { 8253 PMAP_LOCK(src_pmap); 8254 PMAP_LOCK(dst_pmap); 8255 } 8256 error = pmap_pkru_copy(dst_pmap, src_pmap); 8257 /* Clean up partial copy on failure due to no memory. */ 8258 if (error == ENOMEM) 8259 pmap_pkru_deassign_all(dst_pmap); 8260 PMAP_UNLOCK(src_pmap); 8261 PMAP_UNLOCK(dst_pmap); 8262 if (error != ENOMEM) 8263 break; 8264 vm_wait(NULL); 8265 } 8266 return (error); 8267 } 8268 8269 /* 8270 * Zero the specified hardware page. 8271 */ 8272 void 8273 pmap_zero_page(vm_page_t m) 8274 { 8275 vm_offset_t va; 8276 8277 #ifdef TSLOG_PAGEZERO 8278 TSENTER(); 8279 #endif 8280 va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8281 pagezero((void *)va); 8282 #ifdef TSLOG_PAGEZERO 8283 TSEXIT(); 8284 #endif 8285 } 8286 8287 /* 8288 * Zero an area within a single hardware page. off and size must not 8289 * cover an area beyond a single hardware page. 8290 */ 8291 void 8292 pmap_zero_page_area(vm_page_t m, int off, int size) 8293 { 8294 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8295 8296 if (off == 0 && size == PAGE_SIZE) 8297 pagezero((void *)va); 8298 else 8299 bzero((char *)va + off, size); 8300 } 8301 8302 /* 8303 * Copy 1 specified hardware page to another. 
8304 */ 8305 void 8306 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8307 { 8308 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8309 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8310 8311 pagecopy((void *)src, (void *)dst); 8312 } 8313 8314 int unmapped_buf_allowed = 1; 8315 8316 void 8317 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8318 vm_offset_t b_offset, int xfersize) 8319 { 8320 void *a_cp, *b_cp; 8321 vm_page_t pages[2]; 8322 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8323 int cnt; 8324 bool mapped; 8325 8326 while (xfersize > 0) { 8327 a_pg_offset = a_offset & PAGE_MASK; 8328 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8329 b_pg_offset = b_offset & PAGE_MASK; 8330 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8331 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8332 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8333 mapped = pmap_map_io_transient(pages, vaddr, 2, false); 8334 a_cp = (char *)vaddr[0] + a_pg_offset; 8335 b_cp = (char *)vaddr[1] + b_pg_offset; 8336 bcopy(a_cp, b_cp, cnt); 8337 if (__predict_false(mapped)) 8338 pmap_unmap_io_transient(pages, vaddr, 2, false); 8339 a_offset += cnt; 8340 b_offset += cnt; 8341 xfersize -= cnt; 8342 } 8343 } 8344 8345 /* 8346 * Returns true if the pmap's pv is one of the first 8347 * 16 pvs linked to from this page. This count may 8348 * be changed upwards or downwards in the future; it 8349 * is only necessary that true be returned for a small 8350 * subset of pmaps for proper page aging. 8351 */ 8352 bool 8353 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8354 { 8355 struct md_page *pvh; 8356 struct rwlock *lock; 8357 pv_entry_t pv; 8358 int loops = 0; 8359 bool rv; 8360 8361 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8362 ("pmap_page_exists_quick: page %p is not managed", m)); 8363 rv = false; 8364 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8365 rw_rlock(lock); 8366 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8367 if (PV_PMAP(pv) == pmap) { 8368 rv = true; 8369 break; 8370 } 8371 loops++; 8372 if (loops >= 16) 8373 break; 8374 } 8375 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8376 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8377 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8378 if (PV_PMAP(pv) == pmap) { 8379 rv = true; 8380 break; 8381 } 8382 loops++; 8383 if (loops >= 16) 8384 break; 8385 } 8386 } 8387 rw_runlock(lock); 8388 return (rv); 8389 } 8390 8391 /* 8392 * pmap_page_wired_mappings: 8393 * 8394 * Return the number of managed mappings to the given physical page 8395 * that are wired. 
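 * (Added note: both 4KB and 2MB mappings are examined.  The page's own
 * pv list covers the 4KB mappings, while the pv list of the containing
 * 2MB page covers the superpage mappings.)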
8396 */ 8397 int 8398 pmap_page_wired_mappings(vm_page_t m) 8399 { 8400 struct rwlock *lock; 8401 struct md_page *pvh; 8402 pmap_t pmap; 8403 pt_entry_t *pte; 8404 pv_entry_t pv; 8405 int count, md_gen, pvh_gen; 8406 8407 if ((m->oflags & VPO_UNMANAGED) != 0) 8408 return (0); 8409 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8410 rw_rlock(lock); 8411 restart: 8412 count = 0; 8413 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8414 pmap = PV_PMAP(pv); 8415 if (!PMAP_TRYLOCK(pmap)) { 8416 md_gen = m->md.pv_gen; 8417 rw_runlock(lock); 8418 PMAP_LOCK(pmap); 8419 rw_rlock(lock); 8420 if (md_gen != m->md.pv_gen) { 8421 PMAP_UNLOCK(pmap); 8422 goto restart; 8423 } 8424 } 8425 pte = pmap_pte(pmap, pv->pv_va); 8426 if ((*pte & PG_W) != 0) 8427 count++; 8428 PMAP_UNLOCK(pmap); 8429 } 8430 if ((m->flags & PG_FICTITIOUS) == 0) { 8431 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8432 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8433 pmap = PV_PMAP(pv); 8434 if (!PMAP_TRYLOCK(pmap)) { 8435 md_gen = m->md.pv_gen; 8436 pvh_gen = pvh->pv_gen; 8437 rw_runlock(lock); 8438 PMAP_LOCK(pmap); 8439 rw_rlock(lock); 8440 if (md_gen != m->md.pv_gen || 8441 pvh_gen != pvh->pv_gen) { 8442 PMAP_UNLOCK(pmap); 8443 goto restart; 8444 } 8445 } 8446 pte = pmap_pde(pmap, pv->pv_va); 8447 if ((*pte & PG_W) != 0) 8448 count++; 8449 PMAP_UNLOCK(pmap); 8450 } 8451 } 8452 rw_runlock(lock); 8453 return (count); 8454 } 8455 8456 /* 8457 * Returns true if the given page is mapped individually or as part of 8458 * a 2mpage. Otherwise, returns false. 8459 */ 8460 bool 8461 pmap_page_is_mapped(vm_page_t m) 8462 { 8463 struct rwlock *lock; 8464 bool rv; 8465 8466 if ((m->oflags & VPO_UNMANAGED) != 0) 8467 return (false); 8468 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8469 rw_rlock(lock); 8470 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8471 ((m->flags & PG_FICTITIOUS) == 0 && 8472 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8473 rw_runlock(lock); 8474 return (rv); 8475 } 8476 8477 /* 8478 * Destroy all managed, non-wired mappings in the given user-space 8479 * pmap. This pmap cannot be active on any processor besides the 8480 * caller. 8481 * 8482 * This function cannot be applied to the kernel pmap. Moreover, it 8483 * is not intended for general use. It is only to be used during 8484 * process termination. Consequently, it can be implemented in ways 8485 * that make it faster than pmap_remove(). First, it can more quickly 8486 * destroy mappings by iterating over the pmap's collection of PV 8487 * entries, rather than searching the page table. Second, it doesn't 8488 * have to test and clear the page table entries atomically, because 8489 * no processor is currently accessing the user address space. In 8490 * particular, a page table entry's dirty bit won't change state once 8491 * this function starts. 8492 * 8493 * Although this function destroys all of the pmap's managed, 8494 * non-wired mappings, it can delay and batch the invalidation of TLB 8495 * entries without calling pmap_delayed_invl_start() and 8496 * pmap_delayed_invl_finish(). Because the pmap is not active on 8497 * any other processor, none of these TLB entries will ever be used 8498 * before their eventual invalidation. Consequently, there is no need 8499 * for either pmap_remove_all() or pmap_remove_write() to wait for 8500 * that eventual TLB invalidation. 
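 * (Added note: a single pmap_invalidate_all() call, made after all of
 * the PV chunks have been processed, performs that batched
 * invalidation.)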
8501 */ 8502 void 8503 pmap_remove_pages(pmap_t pmap) 8504 { 8505 pd_entry_t ptepde; 8506 pt_entry_t *pte, tpte; 8507 pt_entry_t PG_M, PG_RW, PG_V; 8508 struct spglist free; 8509 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8510 vm_page_t m, mpte, mt; 8511 pv_entry_t pv; 8512 struct md_page *pvh; 8513 struct pv_chunk *pc, *npc; 8514 struct rwlock *lock; 8515 int64_t bit; 8516 uint64_t inuse, bitmask; 8517 int allfree, field, i, idx; 8518 #ifdef PV_STATS 8519 int freed; 8520 #endif 8521 bool superpage; 8522 vm_paddr_t pa; 8523 8524 /* 8525 * Assert that the given pmap is only active on the current 8526 * CPU. Unfortunately, we cannot block another CPU from 8527 * activating the pmap while this function is executing. 8528 */ 8529 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8530 #ifdef INVARIANTS 8531 { 8532 cpuset_t other_cpus; 8533 8534 other_cpus = all_cpus; 8535 critical_enter(); 8536 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8537 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 8538 critical_exit(); 8539 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8540 } 8541 #endif 8542 8543 lock = NULL; 8544 PG_M = pmap_modified_bit(pmap); 8545 PG_V = pmap_valid_bit(pmap); 8546 PG_RW = pmap_rw_bit(pmap); 8547 8548 for (i = 0; i < PMAP_MEMDOM; i++) 8549 TAILQ_INIT(&free_chunks[i]); 8550 SLIST_INIT(&free); 8551 PMAP_LOCK(pmap); 8552 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8553 allfree = 1; 8554 #ifdef PV_STATS 8555 freed = 0; 8556 #endif 8557 for (field = 0; field < _NPCM; field++) { 8558 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8559 while (inuse != 0) { 8560 bit = bsfq(inuse); 8561 bitmask = 1UL << bit; 8562 idx = field * 64 + bit; 8563 pv = &pc->pc_pventry[idx]; 8564 inuse &= ~bitmask; 8565 8566 pte = pmap_pdpe(pmap, pv->pv_va); 8567 ptepde = *pte; 8568 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8569 tpte = *pte; 8570 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8571 superpage = false; 8572 ptepde = tpte; 8573 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8574 PG_FRAME); 8575 pte = &pte[pmap_pte_index(pv->pv_va)]; 8576 tpte = *pte; 8577 } else { 8578 /* 8579 * Keep track whether 'tpte' is a 8580 * superpage explicitly instead of 8581 * relying on PG_PS being set. 8582 * 8583 * This is because PG_PS is numerically 8584 * identical to PG_PTE_PAT and thus a 8585 * regular page could be mistaken for 8586 * a superpage. 8587 */ 8588 superpage = true; 8589 } 8590 8591 if ((tpte & PG_V) == 0) { 8592 panic("bad pte va %lx pte %lx", 8593 pv->pv_va, tpte); 8594 } 8595 8596 /* 8597 * We cannot remove wired pages from a process' mapping at this time 8598 */ 8599 if (tpte & PG_W) { 8600 allfree = 0; 8601 continue; 8602 } 8603 8604 /* Mark free */ 8605 pc->pc_map[field] |= bitmask; 8606 8607 /* 8608 * Because this pmap is not active on other 8609 * processors, the dirty bit cannot have 8610 * changed state since we last loaded pte. 8611 */ 8612 pte_clear(pte); 8613 8614 if (superpage) 8615 pa = tpte & PG_PS_FRAME; 8616 else 8617 pa = tpte & PG_FRAME; 8618 8619 m = PHYS_TO_VM_PAGE(pa); 8620 KASSERT(m->phys_addr == pa, 8621 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8622 m, (uintmax_t)m->phys_addr, 8623 (uintmax_t)tpte)); 8624 8625 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8626 m < &vm_page_array[vm_page_array_size], 8627 ("pmap_remove_pages: bad tpte %#jx", 8628 (uintmax_t)tpte)); 8629 8630 /* 8631 * Update the vm_page_t clean/reference bits. 
8632 */ 8633 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8634 if (superpage) { 8635 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8636 vm_page_dirty(mt); 8637 } else 8638 vm_page_dirty(m); 8639 } 8640 8641 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8642 8643 if (superpage) { 8644 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8645 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8646 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8647 pvh->pv_gen++; 8648 if (TAILQ_EMPTY(&pvh->pv_list)) { 8649 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8650 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8651 TAILQ_EMPTY(&mt->md.pv_list)) 8652 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8653 } 8654 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8655 if (mpte != NULL) { 8656 KASSERT(vm_page_any_valid(mpte), 8657 ("pmap_remove_pages: pte page not promoted")); 8658 pmap_pt_page_count_adj(pmap, -1); 8659 KASSERT(mpte->ref_count == NPTEPG, 8660 ("pmap_remove_pages: pte page reference count error")); 8661 mpte->ref_count = 0; 8662 pmap_add_delayed_free_list(mpte, &free, false); 8663 } 8664 } else { 8665 pmap_resident_count_adj(pmap, -1); 8666 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8667 m->md.pv_gen++; 8668 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8669 TAILQ_EMPTY(&m->md.pv_list) && 8670 (m->flags & PG_FICTITIOUS) == 0) { 8671 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8672 if (TAILQ_EMPTY(&pvh->pv_list)) 8673 vm_page_aflag_clear(m, PGA_WRITEABLE); 8674 } 8675 } 8676 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8677 #ifdef PV_STATS 8678 freed++; 8679 #endif 8680 } 8681 } 8682 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8683 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8684 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8685 if (allfree) { 8686 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8687 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8688 } 8689 } 8690 if (lock != NULL) 8691 rw_wunlock(lock); 8692 pmap_invalidate_all(pmap); 8693 pmap_pkru_deassign_all(pmap); 8694 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8695 PMAP_UNLOCK(pmap); 8696 vm_page_free_pages_toq(&free, true); 8697 } 8698 8699 static bool 8700 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 8701 { 8702 struct rwlock *lock; 8703 pv_entry_t pv; 8704 struct md_page *pvh; 8705 pt_entry_t *pte, mask; 8706 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8707 pmap_t pmap; 8708 int md_gen, pvh_gen; 8709 bool rv; 8710 8711 rv = false; 8712 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8713 rw_rlock(lock); 8714 restart: 8715 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8716 pmap = PV_PMAP(pv); 8717 if (!PMAP_TRYLOCK(pmap)) { 8718 md_gen = m->md.pv_gen; 8719 rw_runlock(lock); 8720 PMAP_LOCK(pmap); 8721 rw_rlock(lock); 8722 if (md_gen != m->md.pv_gen) { 8723 PMAP_UNLOCK(pmap); 8724 goto restart; 8725 } 8726 } 8727 pte = pmap_pte(pmap, pv->pv_va); 8728 mask = 0; 8729 if (modified) { 8730 PG_M = pmap_modified_bit(pmap); 8731 PG_RW = pmap_rw_bit(pmap); 8732 mask |= PG_RW | PG_M; 8733 } 8734 if (accessed) { 8735 PG_A = pmap_accessed_bit(pmap); 8736 PG_V = pmap_valid_bit(pmap); 8737 mask |= PG_V | PG_A; 8738 } 8739 rv = (*pte & mask) == mask; 8740 PMAP_UNLOCK(pmap); 8741 if (rv) 8742 goto out; 8743 } 8744 if ((m->flags & PG_FICTITIOUS) == 0) { 8745 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8746 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8747 pmap = PV_PMAP(pv); 8748 if (!PMAP_TRYLOCK(pmap)) { 8749 md_gen = m->md.pv_gen; 8750 pvh_gen = pvh->pv_gen; 8751 rw_runlock(lock); 8752 PMAP_LOCK(pmap); 8753 rw_rlock(lock); 8754 if (md_gen != m->md.pv_gen 
|| 8755 pvh_gen != pvh->pv_gen) { 8756 PMAP_UNLOCK(pmap); 8757 goto restart; 8758 } 8759 } 8760 pte = pmap_pde(pmap, pv->pv_va); 8761 mask = 0; 8762 if (modified) { 8763 PG_M = pmap_modified_bit(pmap); 8764 PG_RW = pmap_rw_bit(pmap); 8765 mask |= PG_RW | PG_M; 8766 } 8767 if (accessed) { 8768 PG_A = pmap_accessed_bit(pmap); 8769 PG_V = pmap_valid_bit(pmap); 8770 mask |= PG_V | PG_A; 8771 } 8772 rv = (*pte & mask) == mask; 8773 PMAP_UNLOCK(pmap); 8774 if (rv) 8775 goto out; 8776 } 8777 } 8778 out: 8779 rw_runlock(lock); 8780 return (rv); 8781 } 8782 8783 /* 8784 * pmap_is_modified: 8785 * 8786 * Return whether or not the specified physical page was modified 8787 * in any physical maps. 8788 */ 8789 bool 8790 pmap_is_modified(vm_page_t m) 8791 { 8792 8793 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8794 ("pmap_is_modified: page %p is not managed", m)); 8795 8796 /* 8797 * If the page is not busied then this check is racy. 8798 */ 8799 if (!pmap_page_is_write_mapped(m)) 8800 return (false); 8801 return (pmap_page_test_mappings(m, false, true)); 8802 } 8803 8804 /* 8805 * pmap_is_prefaultable: 8806 * 8807 * Return whether or not the specified virtual address is eligible 8808 * for prefault. 8809 */ 8810 bool 8811 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8812 { 8813 pd_entry_t *pde; 8814 pt_entry_t *pte, PG_V; 8815 bool rv; 8816 8817 PG_V = pmap_valid_bit(pmap); 8818 8819 /* 8820 * Return true if and only if the PTE for the specified virtual 8821 * address is allocated but invalid. 8822 */ 8823 rv = false; 8824 PMAP_LOCK(pmap); 8825 pde = pmap_pde(pmap, addr); 8826 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8827 pte = pmap_pde_to_pte(pde, addr); 8828 rv = (*pte & PG_V) == 0; 8829 } 8830 PMAP_UNLOCK(pmap); 8831 return (rv); 8832 } 8833 8834 /* 8835 * pmap_is_referenced: 8836 * 8837 * Return whether or not the specified physical page was referenced 8838 * in any physical maps. 8839 */ 8840 bool 8841 pmap_is_referenced(vm_page_t m) 8842 { 8843 8844 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8845 ("pmap_is_referenced: page %p is not managed", m)); 8846 return (pmap_page_test_mappings(m, true, false)); 8847 } 8848 8849 /* 8850 * Clear the write and modified bits in each of the given page's mappings. 8851 */ 8852 void 8853 pmap_remove_write(vm_page_t m) 8854 { 8855 struct md_page *pvh; 8856 pmap_t pmap; 8857 struct rwlock *lock; 8858 pv_entry_t next_pv, pv; 8859 pd_entry_t *pde; 8860 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8861 vm_offset_t va; 8862 int pvh_gen, md_gen; 8863 8864 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8865 ("pmap_remove_write: page %p is not managed", m)); 8866 8867 vm_page_assert_busied(m); 8868 if (!pmap_page_is_write_mapped(m)) 8869 return; 8870 8871 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8872 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8873 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8874 rw_wlock(lock); 8875 retry: 8876 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8877 pmap = PV_PMAP(pv); 8878 if (!PMAP_TRYLOCK(pmap)) { 8879 pvh_gen = pvh->pv_gen; 8880 rw_wunlock(lock); 8881 PMAP_LOCK(pmap); 8882 rw_wlock(lock); 8883 if (pvh_gen != pvh->pv_gen) { 8884 PMAP_UNLOCK(pmap); 8885 goto retry; 8886 } 8887 } 8888 PG_RW = pmap_rw_bit(pmap); 8889 va = pv->pv_va; 8890 pde = pmap_pde(pmap, va); 8891 if ((*pde & PG_RW) != 0) 8892 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8893 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8894 ("inconsistent pv lock %p %p for page %p", 8895 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8896 PMAP_UNLOCK(pmap); 8897 } 8898 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8899 pmap = PV_PMAP(pv); 8900 if (!PMAP_TRYLOCK(pmap)) { 8901 pvh_gen = pvh->pv_gen; 8902 md_gen = m->md.pv_gen; 8903 rw_wunlock(lock); 8904 PMAP_LOCK(pmap); 8905 rw_wlock(lock); 8906 if (pvh_gen != pvh->pv_gen || 8907 md_gen != m->md.pv_gen) { 8908 PMAP_UNLOCK(pmap); 8909 goto retry; 8910 } 8911 } 8912 PG_M = pmap_modified_bit(pmap); 8913 PG_RW = pmap_rw_bit(pmap); 8914 pde = pmap_pde(pmap, pv->pv_va); 8915 KASSERT((*pde & PG_PS) == 0, 8916 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8917 m)); 8918 pte = pmap_pde_to_pte(pde, pv->pv_va); 8919 oldpte = *pte; 8920 if (oldpte & PG_RW) { 8921 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8922 ~(PG_RW | PG_M))) 8923 cpu_spinwait(); 8924 if ((oldpte & PG_M) != 0) 8925 vm_page_dirty(m); 8926 pmap_invalidate_page(pmap, pv->pv_va); 8927 } 8928 PMAP_UNLOCK(pmap); 8929 } 8930 rw_wunlock(lock); 8931 vm_page_aflag_clear(m, PGA_WRITEABLE); 8932 pmap_delayed_invl_wait(m); 8933 } 8934 8935 /* 8936 * pmap_ts_referenced: 8937 * 8938 * Return a count of reference bits for a page, clearing those bits. 8939 * It is not necessary for every reference bit to be cleared, but it 8940 * is necessary that 0 only be returned when there are truly no 8941 * reference bits set. 8942 * 8943 * As an optimization, update the page's dirty field if a modified bit is 8944 * found while counting reference bits. This opportunistic update can be 8945 * performed at low cost and can eliminate the need for some future calls 8946 * to pmap_is_modified(). However, since this function stops after 8947 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8948 * dirty pages. Those dirty pages will only be detected by a future call 8949 * to pmap_is_modified(). 8950 * 8951 * A DI block is not needed within this function, because 8952 * invalidations are performed before the PV list lock is 8953 * released. 8954 */ 8955 int 8956 pmap_ts_referenced(vm_page_t m) 8957 { 8958 struct md_page *pvh; 8959 pv_entry_t pv, pvf; 8960 pmap_t pmap; 8961 struct rwlock *lock; 8962 pd_entry_t oldpde, *pde; 8963 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8964 vm_offset_t va; 8965 vm_paddr_t pa; 8966 int cleared, md_gen, not_cleared, pvh_gen; 8967 struct spglist free; 8968 bool demoted; 8969 8970 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8971 ("pmap_ts_referenced: page %p is not managed", m)); 8972 SLIST_INIT(&free); 8973 cleared = 0; 8974 pa = VM_PAGE_TO_PHYS(m); 8975 lock = PHYS_TO_PV_LIST_LOCK(pa); 8976 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 8977 rw_wlock(lock); 8978 retry: 8979 not_cleared = 0; 8980 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 8981 goto small_mappings; 8982 pv = pvf; 8983 do { 8984 if (pvf == NULL) 8985 pvf = pv; 8986 pmap = PV_PMAP(pv); 8987 if (!PMAP_TRYLOCK(pmap)) { 8988 pvh_gen = pvh->pv_gen; 8989 rw_wunlock(lock); 8990 PMAP_LOCK(pmap); 8991 rw_wlock(lock); 8992 if (pvh_gen != pvh->pv_gen) { 8993 PMAP_UNLOCK(pmap); 8994 goto retry; 8995 } 8996 } 8997 PG_A = pmap_accessed_bit(pmap); 8998 PG_M = pmap_modified_bit(pmap); 8999 PG_RW = pmap_rw_bit(pmap); 9000 va = pv->pv_va; 9001 pde = pmap_pde(pmap, pv->pv_va); 9002 oldpde = *pde; 9003 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9004 /* 9005 * Although "oldpde" is mapping a 2MB page, because 9006 * this function is called at a 4KB page granularity, 9007 * we only update the 4KB page under test. 9008 */ 9009 vm_page_dirty(m); 9010 } 9011 if ((oldpde & PG_A) != 0) { 9012 /* 9013 * Since this reference bit is shared by 512 4KB 9014 * pages, it should not be cleared every time it is 9015 * tested. Apply a simple "hash" function on the 9016 * physical page number, the virtual superpage number, 9017 * and the pmap address to select one 4KB page out of 9018 * the 512 on which testing the reference bit will 9019 * result in clearing that reference bit. This 9020 * function is designed to avoid the selection of the 9021 * same 4KB page for every 2MB page mapping. 9022 * 9023 * On demotion, a mapping that hasn't been referenced 9024 * is simply destroyed. To avoid the possibility of a 9025 * subsequent page fault on a demoted wired mapping, 9026 * always leave its reference bit set. Moreover, 9027 * since the superpage is wired, the current state of 9028 * its reference bit won't affect page replacement. 9029 */ 9030 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 9031 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 9032 (oldpde & PG_W) == 0) { 9033 if (safe_to_clear_referenced(pmap, oldpde)) { 9034 atomic_clear_long(pde, PG_A); 9035 pmap_invalidate_page(pmap, pv->pv_va); 9036 demoted = false; 9037 } else if (pmap_demote_pde_locked(pmap, pde, 9038 pv->pv_va, &lock)) { 9039 /* 9040 * Remove the mapping to a single page 9041 * so that a subsequent access may 9042 * repromote. Since the underlying 9043 * page table page is fully populated, 9044 * this removal never frees a page 9045 * table page. 9046 */ 9047 demoted = true; 9048 va += VM_PAGE_TO_PHYS(m) - (oldpde & 9049 PG_PS_FRAME); 9050 pte = pmap_pde_to_pte(pde, va); 9051 pmap_remove_pte(pmap, pte, va, *pde, 9052 NULL, &lock); 9053 pmap_invalidate_page(pmap, va); 9054 } else 9055 demoted = true; 9056 9057 if (demoted) { 9058 /* 9059 * The superpage mapping was removed 9060 * entirely and therefore 'pv' is no 9061 * longer valid. 9062 */ 9063 if (pvf == pv) 9064 pvf = NULL; 9065 pv = NULL; 9066 } 9067 cleared++; 9068 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9069 ("inconsistent pv lock %p %p for page %p", 9070 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9071 } else 9072 not_cleared++; 9073 } 9074 PMAP_UNLOCK(pmap); 9075 /* Rotate the PV list if it has more than one entry. 
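Rotating also ensures that a future call does not always start with the same mapping.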
*/ 9076 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9077 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 9078 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 9079 pvh->pv_gen++; 9080 } 9081 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 9082 goto out; 9083 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 9084 small_mappings: 9085 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 9086 goto out; 9087 pv = pvf; 9088 do { 9089 if (pvf == NULL) 9090 pvf = pv; 9091 pmap = PV_PMAP(pv); 9092 if (!PMAP_TRYLOCK(pmap)) { 9093 pvh_gen = pvh->pv_gen; 9094 md_gen = m->md.pv_gen; 9095 rw_wunlock(lock); 9096 PMAP_LOCK(pmap); 9097 rw_wlock(lock); 9098 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9099 PMAP_UNLOCK(pmap); 9100 goto retry; 9101 } 9102 } 9103 PG_A = pmap_accessed_bit(pmap); 9104 PG_M = pmap_modified_bit(pmap); 9105 PG_RW = pmap_rw_bit(pmap); 9106 pde = pmap_pde(pmap, pv->pv_va); 9107 KASSERT((*pde & PG_PS) == 0, 9108 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 9109 m)); 9110 pte = pmap_pde_to_pte(pde, pv->pv_va); 9111 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9112 vm_page_dirty(m); 9113 if ((*pte & PG_A) != 0) { 9114 if (safe_to_clear_referenced(pmap, *pte)) { 9115 atomic_clear_long(pte, PG_A); 9116 pmap_invalidate_page(pmap, pv->pv_va); 9117 cleared++; 9118 } else if ((*pte & PG_W) == 0) { 9119 /* 9120 * Wired pages cannot be paged out so 9121 * doing accessed bit emulation for 9122 * them is wasted effort. We do the 9123 * hard work for unwired pages only. 9124 */ 9125 pmap_remove_pte(pmap, pte, pv->pv_va, 9126 *pde, &free, &lock); 9127 pmap_invalidate_page(pmap, pv->pv_va); 9128 cleared++; 9129 if (pvf == pv) 9130 pvf = NULL; 9131 pv = NULL; 9132 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9133 ("inconsistent pv lock %p %p for page %p", 9134 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9135 } else 9136 not_cleared++; 9137 } 9138 PMAP_UNLOCK(pmap); 9139 /* Rotate the PV list if it has more than one entry. */ 9140 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9141 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 9142 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 9143 m->md.pv_gen++; 9144 } 9145 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 9146 not_cleared < PMAP_TS_REFERENCED_MAX); 9147 out: 9148 rw_wunlock(lock); 9149 vm_page_free_pages_toq(&free, true); 9150 return (cleared + not_cleared); 9151 } 9152 9153 /* 9154 * Apply the given advice to the specified range of addresses within the 9155 * given pmap. Depending on the advice, clear the referenced and/or 9156 * modified flags in each mapping and set the mapped page's dirty field. 9157 */ 9158 void 9159 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 9160 { 9161 struct rwlock *lock; 9162 pml4_entry_t *pml4e; 9163 pdp_entry_t *pdpe; 9164 pd_entry_t oldpde, *pde; 9165 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 9166 vm_offset_t va, va_next; 9167 vm_page_t m; 9168 bool anychanged; 9169 9170 if (advice != MADV_DONTNEED && advice != MADV_FREE) 9171 return; 9172 9173 /* 9174 * A/D bit emulation requires an alternate code path when clearing 9175 * the modified and accessed bits below. Since this function is 9176 * advisory in nature we skip it entirely for pmaps that require 9177 * A/D bit emulation. 
9178 */ 9179 if (pmap_emulate_ad_bits(pmap)) 9180 return; 9181 9182 PG_A = pmap_accessed_bit(pmap); 9183 PG_G = pmap_global_bit(pmap); 9184 PG_M = pmap_modified_bit(pmap); 9185 PG_V = pmap_valid_bit(pmap); 9186 PG_RW = pmap_rw_bit(pmap); 9187 anychanged = false; 9188 pmap_delayed_invl_start(); 9189 PMAP_LOCK(pmap); 9190 for (; sva < eva; sva = va_next) { 9191 pml4e = pmap_pml4e(pmap, sva); 9192 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 9193 va_next = (sva + NBPML4) & ~PML4MASK; 9194 if (va_next < sva) 9195 va_next = eva; 9196 continue; 9197 } 9198 9199 va_next = (sva + NBPDP) & ~PDPMASK; 9200 if (va_next < sva) 9201 va_next = eva; 9202 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 9203 if ((*pdpe & PG_V) == 0) 9204 continue; 9205 if ((*pdpe & PG_PS) != 0) 9206 continue; 9207 9208 va_next = (sva + NBPDR) & ~PDRMASK; 9209 if (va_next < sva) 9210 va_next = eva; 9211 pde = pmap_pdpe_to_pde(pdpe, sva); 9212 oldpde = *pde; 9213 if ((oldpde & PG_V) == 0) 9214 continue; 9215 else if ((oldpde & PG_PS) != 0) { 9216 if ((oldpde & PG_MANAGED) == 0) 9217 continue; 9218 lock = NULL; 9219 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9220 if (lock != NULL) 9221 rw_wunlock(lock); 9222 9223 /* 9224 * The large page mapping was destroyed. 9225 */ 9226 continue; 9227 } 9228 9229 /* 9230 * Unless the page mappings are wired, remove the 9231 * mapping to a single page so that a subsequent 9232 * access may repromote. Choosing the last page 9233 * within the address range [sva, min(va_next, eva)) 9234 * generally results in more repromotions. Since the 9235 * underlying page table page is fully populated, this 9236 * removal never frees a page table page. 9237 */ 9238 if ((oldpde & PG_W) == 0) { 9239 va = eva; 9240 if (va > va_next) 9241 va = va_next; 9242 va -= PAGE_SIZE; 9243 KASSERT(va >= sva, 9244 ("pmap_advise: no address gap")); 9245 pte = pmap_pde_to_pte(pde, va); 9246 KASSERT((*pte & PG_V) != 0, 9247 ("pmap_advise: invalid PTE")); 9248 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9249 &lock); 9250 anychanged = true; 9251 } 9252 if (lock != NULL) 9253 rw_wunlock(lock); 9254 } 9255 if (va_next > eva) 9256 va_next = eva; 9257 va = va_next; 9258 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9259 sva += PAGE_SIZE) { 9260 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9261 goto maybe_invlrng; 9262 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9263 if (advice == MADV_DONTNEED) { 9264 /* 9265 * Future calls to pmap_is_modified() 9266 * can be avoided by making the page 9267 * dirty now. 9268 */ 9269 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9270 vm_page_dirty(m); 9271 } 9272 atomic_clear_long(pte, PG_M | PG_A); 9273 } else if ((*pte & PG_A) != 0) 9274 atomic_clear_long(pte, PG_A); 9275 else 9276 goto maybe_invlrng; 9277 9278 if ((*pte & PG_G) != 0) { 9279 if (va == va_next) 9280 va = sva; 9281 } else 9282 anychanged = true; 9283 continue; 9284 maybe_invlrng: 9285 if (va != va_next) { 9286 pmap_invalidate_range(pmap, va, sva); 9287 va = va_next; 9288 } 9289 } 9290 if (va != va_next) 9291 pmap_invalidate_range(pmap, va, sva); 9292 } 9293 if (anychanged) 9294 pmap_invalidate_all(pmap); 9295 PMAP_UNLOCK(pmap); 9296 pmap_delayed_invl_finish(); 9297 } 9298 9299 /* 9300 * Clear the modify bits on the specified physical page. 
9301 */ 9302 void 9303 pmap_clear_modify(vm_page_t m) 9304 { 9305 struct md_page *pvh; 9306 pmap_t pmap; 9307 pv_entry_t next_pv, pv; 9308 pd_entry_t oldpde, *pde; 9309 pt_entry_t *pte, PG_M, PG_RW; 9310 struct rwlock *lock; 9311 vm_offset_t va; 9312 int md_gen, pvh_gen; 9313 9314 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9315 ("pmap_clear_modify: page %p is not managed", m)); 9316 vm_page_assert_busied(m); 9317 9318 if (!pmap_page_is_write_mapped(m)) 9319 return; 9320 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9321 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9322 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9323 rw_wlock(lock); 9324 restart: 9325 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9326 pmap = PV_PMAP(pv); 9327 if (!PMAP_TRYLOCK(pmap)) { 9328 pvh_gen = pvh->pv_gen; 9329 rw_wunlock(lock); 9330 PMAP_LOCK(pmap); 9331 rw_wlock(lock); 9332 if (pvh_gen != pvh->pv_gen) { 9333 PMAP_UNLOCK(pmap); 9334 goto restart; 9335 } 9336 } 9337 PG_M = pmap_modified_bit(pmap); 9338 PG_RW = pmap_rw_bit(pmap); 9339 va = pv->pv_va; 9340 pde = pmap_pde(pmap, va); 9341 oldpde = *pde; 9342 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9343 if ((oldpde & PG_RW) != 0 && 9344 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9345 (oldpde & PG_W) == 0) { 9346 /* 9347 * Write protect the mapping to a single page so that 9348 * a subsequent write access may repromote. 9349 */ 9350 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9351 pte = pmap_pde_to_pte(pde, va); 9352 atomic_clear_long(pte, PG_M | PG_RW); 9353 vm_page_dirty(m); 9354 pmap_invalidate_page(pmap, va); 9355 } 9356 PMAP_UNLOCK(pmap); 9357 } 9358 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9359 pmap = PV_PMAP(pv); 9360 if (!PMAP_TRYLOCK(pmap)) { 9361 md_gen = m->md.pv_gen; 9362 pvh_gen = pvh->pv_gen; 9363 rw_wunlock(lock); 9364 PMAP_LOCK(pmap); 9365 rw_wlock(lock); 9366 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9367 PMAP_UNLOCK(pmap); 9368 goto restart; 9369 } 9370 } 9371 PG_M = pmap_modified_bit(pmap); 9372 PG_RW = pmap_rw_bit(pmap); 9373 pde = pmap_pde(pmap, pv->pv_va); 9374 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9375 " a 2mpage in page %p's pv list", m)); 9376 pte = pmap_pde_to_pte(pde, pv->pv_va); 9377 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9378 atomic_clear_long(pte, PG_M); 9379 pmap_invalidate_page(pmap, pv->pv_va); 9380 } 9381 PMAP_UNLOCK(pmap); 9382 } 9383 rw_wunlock(lock); 9384 } 9385 9386 /* 9387 * Miscellaneous support routines follow 9388 */ 9389 9390 /* Adjust the properties for a leaf page table entry. */ 9391 static __inline void 9392 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9393 { 9394 u_long opte, npte; 9395 9396 opte = *(u_long *)pte; 9397 do { 9398 npte = opte & ~mask; 9399 npte |= bits; 9400 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9401 npte)); 9402 } 9403 9404 /* 9405 * Map a set of physical memory pages into the kernel virtual 9406 * address space. Return a pointer to where it is mapped. This 9407 * routine is intended to be used for mapping device memory, 9408 * NOT real memory. 
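 *
 * A minimal usage sketch of the public wrappers below (the register
 * window address and size here are hypothetical; pmap_mapdev() maps
 * uncacheable by default):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(0xfebc0000UL, PAGE_SIZE);
 *	...
 *	pmap_unmapdev(regs, PAGE_SIZE);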
9409 */ 9410 static void * 9411 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9412 { 9413 struct pmap_preinit_mapping *ppim; 9414 vm_offset_t va, offset; 9415 vm_size_t tmpsize; 9416 int i; 9417 9418 offset = pa & PAGE_MASK; 9419 size = round_page(offset + size); 9420 pa = trunc_page(pa); 9421 9422 if (!pmap_initialized) { 9423 va = 0; 9424 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9425 ppim = pmap_preinit_mapping + i; 9426 if (ppim->va == 0) { 9427 ppim->pa = pa; 9428 ppim->sz = size; 9429 ppim->mode = mode; 9430 ppim->va = virtual_avail; 9431 virtual_avail += size; 9432 va = ppim->va; 9433 break; 9434 } 9435 } 9436 if (va == 0) 9437 panic("%s: too many preinit mappings", __func__); 9438 } else { 9439 /* 9440 * If we have a preinit mapping, re-use it. 9441 */ 9442 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9443 ppim = pmap_preinit_mapping + i; 9444 if (ppim->pa == pa && ppim->sz == size && 9445 (ppim->mode == mode || 9446 (flags & MAPDEV_SETATTR) == 0)) 9447 return ((void *)(ppim->va + offset)); 9448 } 9449 /* 9450 * If the specified range of physical addresses fits within 9451 * the direct map window, use the direct map. 9452 */ 9453 if (pa < dmaplimit && pa + size <= dmaplimit) { 9454 va = PHYS_TO_DMAP(pa); 9455 if ((flags & MAPDEV_SETATTR) != 0) { 9456 PMAP_LOCK(kernel_pmap); 9457 i = pmap_change_props_locked(va, size, 9458 PROT_NONE, mode, flags); 9459 PMAP_UNLOCK(kernel_pmap); 9460 } else 9461 i = 0; 9462 if (!i) 9463 return ((void *)(va + offset)); 9464 } 9465 va = kva_alloc(size); 9466 if (va == 0) 9467 panic("%s: Couldn't allocate KVA", __func__); 9468 } 9469 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9470 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9471 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9472 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9473 pmap_invalidate_cache_range(va, va + tmpsize); 9474 return ((void *)(va + offset)); 9475 } 9476 9477 void * 9478 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9479 { 9480 9481 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9482 MAPDEV_SETATTR)); 9483 } 9484 9485 void * 9486 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9487 { 9488 9489 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9490 } 9491 9492 void * 9493 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9494 { 9495 9496 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9497 MAPDEV_SETATTR)); 9498 } 9499 9500 void * 9501 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9502 { 9503 9504 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9505 MAPDEV_FLUSHCACHE)); 9506 } 9507 9508 void 9509 pmap_unmapdev(void *p, vm_size_t size) 9510 { 9511 struct pmap_preinit_mapping *ppim; 9512 vm_offset_t offset, va; 9513 int i; 9514 9515 va = (vm_offset_t)p; 9516 9517 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9518 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9519 return; 9520 offset = va & PAGE_MASK; 9521 size = round_page(offset + size); 9522 va = trunc_page(va); 9523 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9524 ppim = pmap_preinit_mapping + i; 9525 if (ppim->va == va && ppim->sz == size) { 9526 if (pmap_initialized) 9527 return; 9528 ppim->pa = 0; 9529 ppim->va = 0; 9530 ppim->sz = 0; 9531 ppim->mode = 0; 9532 if (va + size == virtual_avail) 9533 virtual_avail = va; 9534 return; 9535 } 9536 } 9537 if (pmap_initialized) { 9538 pmap_qremove(va, atop(size)); 9539 kva_free(va, size); 9540 } 9541 } 9542 9543 /* 9544 * Tries to demote a 1GB page 
mapping. 9545 */ 9546 static bool 9547 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9548 { 9549 pdp_entry_t newpdpe, oldpdpe; 9550 pd_entry_t *firstpde, newpde, *pde; 9551 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9552 vm_paddr_t pdpgpa; 9553 vm_page_t pdpg; 9554 9555 PG_A = pmap_accessed_bit(pmap); 9556 PG_M = pmap_modified_bit(pmap); 9557 PG_V = pmap_valid_bit(pmap); 9558 PG_RW = pmap_rw_bit(pmap); 9559 9560 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9561 oldpdpe = *pdpe; 9562 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9563 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9564 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9565 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9566 if (pdpg == NULL) { 9567 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9568 " in pmap %p", va, pmap); 9569 return (false); 9570 } 9571 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9572 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9573 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9574 KASSERT((oldpdpe & PG_A) != 0, 9575 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9576 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9577 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9578 newpde = oldpdpe; 9579 9580 /* 9581 * Initialize the page directory page. 9582 */ 9583 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9584 *pde = newpde; 9585 newpde += NBPDR; 9586 } 9587 9588 /* 9589 * Demote the mapping. 9590 */ 9591 *pdpe = newpdpe; 9592 9593 /* 9594 * Invalidate a stale recursive mapping of the page directory page. 9595 */ 9596 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9597 9598 counter_u64_add(pmap_pdpe_demotions, 1); 9599 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9600 " in pmap %p", va, pmap); 9601 return (true); 9602 } 9603 9604 /* 9605 * Sets the memory attribute for the specified page. 9606 */ 9607 void 9608 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9609 { 9610 9611 m->md.pat_mode = ma; 9612 9613 /* 9614 * If "m" is a normal page, update its direct mapping. This update 9615 * can be relied upon to perform any cache operations that are 9616 * required for data coherence. 9617 */ 9618 if ((m->flags & PG_FICTITIOUS) == 0 && 9619 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9620 m->md.pat_mode)) 9621 panic("memory attribute change on the direct map failed"); 9622 } 9623 9624 void 9625 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9626 { 9627 int error; 9628 9629 m->md.pat_mode = ma; 9630 9631 if ((m->flags & PG_FICTITIOUS) != 0) 9632 return; 9633 PMAP_LOCK(kernel_pmap); 9634 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9635 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9636 PMAP_UNLOCK(kernel_pmap); 9637 if (error != 0) 9638 panic("memory attribute change on the direct map failed"); 9639 } 9640 9641 /* 9642 * Changes the specified virtual address range's memory type to that given by 9643 * the parameter "mode". The specified virtual address range must be 9644 * completely contained within either the direct map or the kernel map. If 9645 * the virtual address range is contained within the kernel map, then the 9646 * memory type for each of the corresponding ranges of the direct map is also 9647 * changed. (The corresponding ranges of the direct map are those ranges that 9648 * map the same physical pages as the specified virtual address range.) 
These 9649 * changes to the direct map are necessary because Intel describes the 9650 * behavior of their processors as "undefined" if two or more mappings to the 9651 * same physical page have different memory types. 9652 * 9653 * Returns zero if the change completed successfully, and either EINVAL or 9654 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9655 * of the virtual address range was not mapped, and ENOMEM is returned if 9656 * there was insufficient memory available to complete the change. In the 9657 * latter case, the memory type may have been changed on some part of the 9658 * virtual address range or the direct map. 9659 */ 9660 int 9661 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9662 { 9663 int error; 9664 9665 PMAP_LOCK(kernel_pmap); 9666 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9667 MAPDEV_FLUSHCACHE); 9668 PMAP_UNLOCK(kernel_pmap); 9669 return (error); 9670 } 9671 9672 /* 9673 * Changes the specified virtual address range's protections to those 9674 * specified by "prot". Like pmap_change_attr(), protections for aliases 9675 * in the direct map are updated as well. Protections on aliasing mappings may 9676 * be a subset of the requested protections; for example, mappings in the direct 9677 * map are never executable. 9678 */ 9679 int 9680 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9681 { 9682 int error; 9683 9684 /* Only supported within the kernel map. */ 9685 if (va < VM_MIN_KERNEL_ADDRESS) 9686 return (EINVAL); 9687 9688 PMAP_LOCK(kernel_pmap); 9689 error = pmap_change_props_locked(va, size, prot, -1, 9690 MAPDEV_ASSERTVALID); 9691 PMAP_UNLOCK(kernel_pmap); 9692 return (error); 9693 } 9694 9695 static int 9696 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9697 int mode, int flags) 9698 { 9699 vm_offset_t base, offset, tmpva; 9700 vm_paddr_t pa_start, pa_end, pa_end1; 9701 pdp_entry_t *pdpe; 9702 pd_entry_t *pde, pde_bits, pde_mask; 9703 pt_entry_t *pte, pte_bits, pte_mask; 9704 int error; 9705 bool changed; 9706 9707 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9708 base = trunc_page(va); 9709 offset = va & PAGE_MASK; 9710 size = round_page(offset + size); 9711 9712 /* 9713 * Only supported on kernel virtual addresses, including the direct 9714 * map but excluding the recursive map. 9715 */ 9716 if (base < DMAP_MIN_ADDRESS) 9717 return (EINVAL); 9718 9719 /* 9720 * Construct our flag sets and masks. "bits" is the subset of 9721 * "mask" that will be set in each modified PTE. 9722 * 9723 * Mappings in the direct map are never allowed to be executable. 9724 */ 9725 pde_bits = pte_bits = 0; 9726 pde_mask = pte_mask = 0; 9727 if (mode != -1) { 9728 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9729 pde_mask |= X86_PG_PDE_CACHE; 9730 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9731 pte_mask |= X86_PG_PTE_CACHE; 9732 } 9733 if (prot != VM_PROT_NONE) { 9734 if ((prot & VM_PROT_WRITE) != 0) { 9735 pde_bits |= X86_PG_RW; 9736 pte_bits |= X86_PG_RW; 9737 } 9738 if ((prot & VM_PROT_EXECUTE) == 0 || 9739 va < VM_MIN_KERNEL_ADDRESS) { 9740 pde_bits |= pg_nx; 9741 pte_bits |= pg_nx; 9742 } 9743 pde_mask |= X86_PG_RW | pg_nx; 9744 pte_mask |= X86_PG_RW | pg_nx; 9745 } 9746 9747 /* 9748 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9749 * into 4KB pages if required. 
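 * (For example, a request that changes the properties of a single 4KB
 * page lying inside an existing 2MB kernel mapping first demotes that
 * mapping via pmap_demote_pde().)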
9750 */ 9751 for (tmpva = base; tmpva < base + size; ) { 9752 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9753 if (pdpe == NULL || *pdpe == 0) { 9754 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9755 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9756 return (EINVAL); 9757 } 9758 if (*pdpe & PG_PS) { 9759 /* 9760 * If the current 1GB page already has the required 9761 * properties, then we need not demote this page. Just 9762 * increment tmpva to the next 1GB page frame. 9763 */ 9764 if ((*pdpe & pde_mask) == pde_bits) { 9765 tmpva = trunc_1gpage(tmpva) + NBPDP; 9766 continue; 9767 } 9768 9769 /* 9770 * If the current offset aligns with a 1GB page frame 9771 * and there is at least 1GB left within the range, then 9772 * we need not break down this page into 2MB pages. 9773 */ 9774 if ((tmpva & PDPMASK) == 0 && 9775 tmpva + PDPMASK < base + size) { 9776 tmpva += NBPDP; 9777 continue; 9778 } 9779 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9780 return (ENOMEM); 9781 } 9782 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9783 if (*pde == 0) { 9784 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9785 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9786 return (EINVAL); 9787 } 9788 if (*pde & PG_PS) { 9789 /* 9790 * If the current 2MB page already has the required 9791 * properties, then we need not demote this page. Just 9792 * increment tmpva to the next 2MB page frame. 9793 */ 9794 if ((*pde & pde_mask) == pde_bits) { 9795 tmpva = trunc_2mpage(tmpva) + NBPDR; 9796 continue; 9797 } 9798 9799 /* 9800 * If the current offset aligns with a 2MB page frame 9801 * and there is at least 2MB left within the range, then 9802 * we need not break down this page into 4KB pages. 9803 */ 9804 if ((tmpva & PDRMASK) == 0 && 9805 tmpva + PDRMASK < base + size) { 9806 tmpva += NBPDR; 9807 continue; 9808 } 9809 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9810 return (ENOMEM); 9811 } 9812 pte = pmap_pde_to_pte(pde, tmpva); 9813 if (*pte == 0) { 9814 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9815 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9816 return (EINVAL); 9817 } 9818 tmpva += PAGE_SIZE; 9819 } 9820 error = 0; 9821 9822 /* 9823 * Ok, all the pages exist, so run through them updating their 9824 * properties if required. 9825 */ 9826 changed = false; 9827 pa_start = pa_end = 0; 9828 for (tmpva = base; tmpva < base + size; ) { 9829 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9830 if (*pdpe & PG_PS) { 9831 if ((*pdpe & pde_mask) != pde_bits) { 9832 pmap_pte_props(pdpe, pde_bits, pde_mask); 9833 changed = true; 9834 } 9835 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9836 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9837 if (pa_start == pa_end) { 9838 /* Start physical address run. */ 9839 pa_start = *pdpe & PG_PS_FRAME; 9840 pa_end = pa_start + NBPDP; 9841 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9842 pa_end += NBPDP; 9843 else { 9844 /* Run ended, update direct map. */ 9845 error = pmap_change_props_locked( 9846 PHYS_TO_DMAP(pa_start), 9847 pa_end - pa_start, prot, mode, 9848 flags); 9849 if (error != 0) 9850 break; 9851 /* Start physical address run. */ 9852 pa_start = *pdpe & PG_PS_FRAME; 9853 pa_end = pa_start + NBPDP; 9854 } 9855 } 9856 tmpva = trunc_1gpage(tmpva) + NBPDP; 9857 continue; 9858 } 9859 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9860 if (*pde & PG_PS) { 9861 if ((*pde & pde_mask) != pde_bits) { 9862 pmap_pte_props(pde, pde_bits, pde_mask); 9863 changed = true; 9864 } 9865 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9866 (*pde & PG_PS_FRAME) < dmaplimit) { 9867 if (pa_start == pa_end) { 9868 /* Start physical address run. 
*/ 9869 pa_start = *pde & PG_PS_FRAME; 9870 pa_end = pa_start + NBPDR; 9871 } else if (pa_end == (*pde & PG_PS_FRAME)) 9872 pa_end += NBPDR; 9873 else { 9874 /* Run ended, update direct map. */ 9875 error = pmap_change_props_locked( 9876 PHYS_TO_DMAP(pa_start), 9877 pa_end - pa_start, prot, mode, 9878 flags); 9879 if (error != 0) 9880 break; 9881 /* Start physical address run. */ 9882 pa_start = *pde & PG_PS_FRAME; 9883 pa_end = pa_start + NBPDR; 9884 } 9885 } 9886 tmpva = trunc_2mpage(tmpva) + NBPDR; 9887 } else { 9888 pte = pmap_pde_to_pte(pde, tmpva); 9889 if ((*pte & pte_mask) != pte_bits) { 9890 pmap_pte_props(pte, pte_bits, pte_mask); 9891 changed = true; 9892 } 9893 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9894 (*pte & PG_FRAME) < dmaplimit) { 9895 if (pa_start == pa_end) { 9896 /* Start physical address run. */ 9897 pa_start = *pte & PG_FRAME; 9898 pa_end = pa_start + PAGE_SIZE; 9899 } else if (pa_end == (*pte & PG_FRAME)) 9900 pa_end += PAGE_SIZE; 9901 else { 9902 /* Run ended, update direct map. */ 9903 error = pmap_change_props_locked( 9904 PHYS_TO_DMAP(pa_start), 9905 pa_end - pa_start, prot, mode, 9906 flags); 9907 if (error != 0) 9908 break; 9909 /* Start physical address run. */ 9910 pa_start = *pte & PG_FRAME; 9911 pa_end = pa_start + PAGE_SIZE; 9912 } 9913 } 9914 tmpva += PAGE_SIZE; 9915 } 9916 } 9917 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9918 pa_end1 = MIN(pa_end, dmaplimit); 9919 if (pa_start != pa_end1) 9920 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9921 pa_end1 - pa_start, prot, mode, flags); 9922 } 9923 9924 /* 9925 * Flush CPU caches if required to make sure any data isn't cached that 9926 * shouldn't be, etc. 9927 */ 9928 if (changed) { 9929 pmap_invalidate_range(kernel_pmap, base, tmpva); 9930 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9931 pmap_invalidate_cache_range(base, tmpva); 9932 } 9933 return (error); 9934 } 9935 9936 /* 9937 * Demotes any mapping within the direct map region that covers more than the 9938 * specified range of physical addresses. This range's size must be a power 9939 * of two and its starting address must be a multiple of its size. Since the 9940 * demotion does not change any attributes of the mapping, a TLB invalidation 9941 * is not mandatory. The caller may, however, request a TLB invalidation. 
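 *
 * For example (a sketch; "pa" is a hypothetical 2MB-aligned physical
 * address below dmaplimit), the following breaks a covering 1GB direct
 * map mapping, if any, down to 2MB mappings and invalidates the TLB
 * entry if a demotion occurred:
 *
 *	pmap_demote_DMAP(pa, NBPDR, true);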
9942 */ 9943 void 9944 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate) 9945 { 9946 pdp_entry_t *pdpe; 9947 pd_entry_t *pde; 9948 vm_offset_t va; 9949 bool changed; 9950 9951 if (len == 0) 9952 return; 9953 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9954 KASSERT((base & (len - 1)) == 0, 9955 ("pmap_demote_DMAP: base is not a multiple of len")); 9956 if (len < NBPDP && base < dmaplimit) { 9957 va = PHYS_TO_DMAP(base); 9958 changed = false; 9959 PMAP_LOCK(kernel_pmap); 9960 pdpe = pmap_pdpe(kernel_pmap, va); 9961 if ((*pdpe & X86_PG_V) == 0) 9962 panic("pmap_demote_DMAP: invalid PDPE"); 9963 if ((*pdpe & PG_PS) != 0) { 9964 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9965 panic("pmap_demote_DMAP: PDPE failed"); 9966 changed = true; 9967 } 9968 if (len < NBPDR) { 9969 pde = pmap_pdpe_to_pde(pdpe, va); 9970 if ((*pde & X86_PG_V) == 0) 9971 panic("pmap_demote_DMAP: invalid PDE"); 9972 if ((*pde & PG_PS) != 0) { 9973 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9974 panic("pmap_demote_DMAP: PDE failed"); 9975 changed = true; 9976 } 9977 } 9978 if (changed && invalidate) 9979 pmap_invalidate_page(kernel_pmap, va); 9980 PMAP_UNLOCK(kernel_pmap); 9981 } 9982 } 9983 9984 /* 9985 * Perform the pmap work for mincore(2). If the page is not both referenced and 9986 * modified by this pmap, returns its physical address so that the caller can 9987 * find other mappings. 9988 */ 9989 int 9990 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9991 { 9992 pdp_entry_t *pdpe; 9993 pd_entry_t *pdep; 9994 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 9995 vm_paddr_t pa; 9996 int val; 9997 9998 PG_A = pmap_accessed_bit(pmap); 9999 PG_M = pmap_modified_bit(pmap); 10000 PG_V = pmap_valid_bit(pmap); 10001 PG_RW = pmap_rw_bit(pmap); 10002 10003 PMAP_LOCK(pmap); 10004 pte = 0; 10005 pa = 0; 10006 val = 0; 10007 pdpe = pmap_pdpe(pmap, addr); 10008 if (pdpe == NULL) 10009 goto out; 10010 if ((*pdpe & PG_V) != 0) { 10011 if ((*pdpe & PG_PS) != 0) { 10012 pte = *pdpe; 10013 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 10014 PG_FRAME; 10015 val = MINCORE_PSIND(2); 10016 } else { 10017 pdep = pmap_pde(pmap, addr); 10018 if (pdep != NULL && (*pdep & PG_V) != 0) { 10019 if ((*pdep & PG_PS) != 0) { 10020 pte = *pdep; 10021 /* Compute the physical address of the 4KB page. */ 10022 pa = ((pte & PG_PS_FRAME) | (addr & 10023 PDRMASK)) & PG_FRAME; 10024 val = MINCORE_PSIND(1); 10025 } else { 10026 pte = *pmap_pde_to_pte(pdep, addr); 10027 pa = pte & PG_FRAME; 10028 val = 0; 10029 } 10030 } 10031 } 10032 } 10033 if ((pte & PG_V) != 0) { 10034 val |= MINCORE_INCORE; 10035 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 10036 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 10037 if ((pte & PG_A) != 0) 10038 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 10039 } 10040 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 10041 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 10042 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 10043 *pap = pa; 10044 } 10045 out: 10046 PMAP_UNLOCK(pmap); 10047 return (val); 10048 } 10049 10050 static uint64_t 10051 pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp) 10052 { 10053 uint32_t gen, new_gen, pcid_next; 10054 10055 CRITICAL_ASSERT(curthread); 10056 gen = PCPU_GET(pcid_gen); 10057 if (pcidp->pm_pcid == PMAP_PCID_KERN) 10058 return (pti ? 
0 : CR3_PCID_SAVE); 10059 if (pcidp->pm_gen == gen) 10060 return (CR3_PCID_SAVE); 10061 pcid_next = PCPU_GET(pcid_next); 10062 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 10063 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 10064 ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next)); 10065 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 10066 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 10067 new_gen = gen + 1; 10068 if (new_gen == 0) 10069 new_gen = 1; 10070 PCPU_SET(pcid_gen, new_gen); 10071 pcid_next = PMAP_PCID_KERN + 1; 10072 } else { 10073 new_gen = gen; 10074 } 10075 pcidp->pm_pcid = pcid_next; 10076 pcidp->pm_gen = new_gen; 10077 PCPU_SET(pcid_next, pcid_next + 1); 10078 return (0); 10079 } 10080 10081 static uint64_t 10082 pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp) 10083 { 10084 uint64_t cached; 10085 10086 cached = pmap_pcid_alloc(pmap, pcidp); 10087 KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX, 10088 ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); 10089 KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, 10090 ("non-kernel pmap pmap %p cpu %d pcid %#x", 10091 pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); 10092 return (cached); 10093 } 10094 10095 static void 10096 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 10097 { 10098 10099 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 10100 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; 10101 } 10102 10103 static void 10104 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 10105 { 10106 pmap_t old_pmap; 10107 struct pmap_pcid *pcidp, *old_pcidp; 10108 uint64_t cached, cr3, kcr3, ucr3; 10109 10110 KASSERT((read_rflags() & PSL_I) == 0, 10111 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10112 10113 /* See the comment in pmap_invalidate_page_pcid(). 
*/ 10114 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 10115 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 10116 old_pmap = PCPU_GET(curpmap); 10117 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 10118 old_pcidp = zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid); 10119 old_pcidp->pm_gen = 0; 10120 } 10121 10122 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10123 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10124 cr3 = rcr3(); 10125 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10126 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid); 10127 PCPU_SET(curpmap, pmap); 10128 kcr3 = pmap->pm_cr3 | pcidp->pm_pcid; 10129 ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT; 10130 10131 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 10132 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 10133 10134 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 10135 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 10136 if (cached) 10137 counter_u64_add(pcid_save_cnt, 1); 10138 10139 pmap_activate_sw_pti_post(td, pmap); 10140 } 10141 10142 static void 10143 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 10144 u_int cpuid) 10145 { 10146 struct pmap_pcid *pcidp; 10147 uint64_t cached, cr3; 10148 10149 KASSERT((read_rflags() & PSL_I) == 0, 10150 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10151 10152 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10153 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10154 cr3 = rcr3(); 10155 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10156 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached); 10157 PCPU_SET(curpmap, pmap); 10158 if (cached) 10159 counter_u64_add(pcid_save_cnt, 1); 10160 } 10161 10162 static void 10163 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 10164 u_int cpuid __unused) 10165 { 10166 10167 load_cr3(pmap->pm_cr3); 10168 PCPU_SET(curpmap, pmap); 10169 } 10170 10171 static void 10172 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 10173 u_int cpuid __unused) 10174 { 10175 10176 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 10177 PCPU_SET(kcr3, pmap->pm_cr3); 10178 PCPU_SET(ucr3, pmap->pm_ucr3); 10179 pmap_activate_sw_pti_post(td, pmap); 10180 } 10181 10182 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 10183 u_int)) 10184 { 10185 10186 if (pmap_pcid_enabled && pti) 10187 return (pmap_activate_sw_pcid_pti); 10188 else if (pmap_pcid_enabled && !pti) 10189 return (pmap_activate_sw_pcid_nopti); 10190 else if (!pmap_pcid_enabled && pti) 10191 return (pmap_activate_sw_nopcid_pti); 10192 else /* if (!pmap_pcid_enabled && !pti) */ 10193 return (pmap_activate_sw_nopcid_nopti); 10194 } 10195 10196 void 10197 pmap_activate_sw(struct thread *td) 10198 { 10199 pmap_t oldpmap, pmap; 10200 u_int cpuid; 10201 10202 oldpmap = PCPU_GET(curpmap); 10203 pmap = vmspace_pmap(td->td_proc->p_vmspace); 10204 if (oldpmap == pmap) { 10205 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10206 mfence(); 10207 return; 10208 } 10209 cpuid = PCPU_GET(cpuid); 10210 #ifdef SMP 10211 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10212 #else 10213 CPU_SET(cpuid, &pmap->pm_active); 10214 #endif 10215 pmap_activate_sw_mode(td, pmap, cpuid); 10216 #ifdef SMP 10217 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 10218 #else 10219 CPU_CLR(cpuid, &oldpmap->pm_active); 10220 #endif 10221 } 10222 10223 void 10224 pmap_activate(struct thread *td) 10225 { 10226 /* 10227 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10228 * invalidate_all IPI, which checks for curpmap == 10229 * smp_tlb_pmap. 
The below sequence of operations has a 10230 * window where %CR3 is loaded with the new pmap's PML4 10231 * address, but the curpmap value has not yet been updated. 10232 * This causes the invltlb IPI handler, which is called 10233 * between the updates, to execute as a NOP, which leaves 10234 * stale TLB entries. 10235 * 10236 * Note that the most common use of pmap_activate_sw(), from 10237 * a context switch, is immune to this race, because 10238 * interrupts are disabled (while the thread lock is owned), 10239 * so the IPI is delayed until after curpmap is updated. Protect 10240 * other callers in a similar way, by disabling interrupts 10241 * around the %cr3 register reload and curpmap assignment. 10242 */ 10243 spinlock_enter(); 10244 pmap_activate_sw(td); 10245 spinlock_exit(); 10246 } 10247 10248 void 10249 pmap_activate_boot(pmap_t pmap) 10250 { 10251 uint64_t kcr3; 10252 u_int cpuid; 10253 10254 /* 10255 * kernel_pmap must never be deactivated, and we ensure that 10256 * by never activating it at all. 10257 */ 10258 MPASS(pmap != kernel_pmap); 10259 10260 cpuid = PCPU_GET(cpuid); 10261 #ifdef SMP 10262 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10263 #else 10264 CPU_SET(cpuid, &pmap->pm_active); 10265 #endif 10266 PCPU_SET(curpmap, pmap); 10267 if (pti) { 10268 kcr3 = pmap->pm_cr3; 10269 if (pmap_pcid_enabled) 10270 kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE; 10271 } else { 10272 kcr3 = PMAP_NO_CR3; 10273 } 10274 PCPU_SET(kcr3, kcr3); 10275 PCPU_SET(ucr3, PMAP_NO_CR3); 10276 } 10277 10278 void 10279 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 10280 { 10281 *res = pmap->pm_active; 10282 } 10283 10284 void 10285 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 10286 { 10287 } 10288 10289 /* 10290 * Increase the starting virtual address of the given mapping if a 10291 * different alignment might result in more superpage mappings.
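 *
 * For example, with 2MB superpages: if "offset" modulo 2MB is 0x1000,
 * the proposed "*addr" is 2MB-aligned, and the mapping is large enough
 * to still cover a full 2MB frame after the shift, then *addr is
 * advanced by 0x1000 so that virtual addresses and object pages share
 * the same offset within a 2MB frame, enabling later promotion.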
10292 */ 10293 void 10294 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10295 vm_offset_t *addr, vm_size_t size) 10296 { 10297 vm_offset_t superpage_offset; 10298 10299 if (size < NBPDR) 10300 return; 10301 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10302 offset += ptoa(object->pg_color); 10303 superpage_offset = offset & PDRMASK; 10304 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10305 (*addr & PDRMASK) == superpage_offset) 10306 return; 10307 if ((*addr & PDRMASK) < superpage_offset) 10308 *addr = (*addr & ~PDRMASK) + superpage_offset; 10309 else 10310 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10311 } 10312 10313 #ifdef INVARIANTS 10314 static unsigned long num_dirty_emulations; 10315 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10316 &num_dirty_emulations, 0, NULL); 10317 10318 static unsigned long num_accessed_emulations; 10319 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10320 &num_accessed_emulations, 0, NULL); 10321 10322 static unsigned long num_superpage_accessed_emulations; 10323 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10324 &num_superpage_accessed_emulations, 0, NULL); 10325 10326 static unsigned long ad_emulation_superpage_promotions; 10327 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10328 &ad_emulation_superpage_promotions, 0, NULL); 10329 #endif /* INVARIANTS */ 10330 10331 int 10332 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10333 { 10334 int rv; 10335 struct rwlock *lock; 10336 #if VM_NRESERVLEVEL > 0 10337 vm_page_t m, mpte; 10338 #endif 10339 pd_entry_t *pde; 10340 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10341 10342 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10343 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10344 10345 if (!pmap_emulate_ad_bits(pmap)) 10346 return (-1); 10347 10348 PG_A = pmap_accessed_bit(pmap); 10349 PG_M = pmap_modified_bit(pmap); 10350 PG_V = pmap_valid_bit(pmap); 10351 PG_RW = pmap_rw_bit(pmap); 10352 10353 rv = -1; 10354 lock = NULL; 10355 PMAP_LOCK(pmap); 10356 10357 pde = pmap_pde(pmap, va); 10358 if (pde == NULL || (*pde & PG_V) == 0) 10359 goto done; 10360 10361 if ((*pde & PG_PS) != 0) { 10362 if (ftype == VM_PROT_READ) { 10363 #ifdef INVARIANTS 10364 atomic_add_long(&num_superpage_accessed_emulations, 1); 10365 #endif 10366 *pde |= PG_A; 10367 rv = 0; 10368 } 10369 goto done; 10370 } 10371 10372 pte = pmap_pde_to_pte(pde, va); 10373 if ((*pte & PG_V) == 0) 10374 goto done; 10375 10376 if (ftype == VM_PROT_WRITE) { 10377 if ((*pte & PG_RW) == 0) 10378 goto done; 10379 /* 10380 * Set the modified and accessed bits simultaneously. 10381 * 10382 * Intel EPT PTEs that do software emulation of A/D bits map 10383 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10384 * An EPT misconfiguration is triggered if the PTE is writable 10385 * but not readable (WR=10). This is avoided by setting PG_A 10386 * and PG_M simultaneously. 
10387 */ 10388 *pte |= PG_M | PG_A; 10389 } else { 10390 *pte |= PG_A; 10391 } 10392 10393 #if VM_NRESERVLEVEL > 0 10394 /* try to promote the mapping */ 10395 if (va < VM_MAXUSER_ADDRESS) 10396 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10397 else 10398 mpte = NULL; 10399 10400 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10401 10402 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10403 (m->flags & PG_FICTITIOUS) == 0 && 10404 vm_reserv_level_iffullpop(m) == 0 && 10405 pmap_promote_pde(pmap, pde, va, mpte, &lock)) { 10406 #ifdef INVARIANTS 10407 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10408 #endif 10409 } 10410 #endif 10411 10412 #ifdef INVARIANTS 10413 if (ftype == VM_PROT_WRITE) 10414 atomic_add_long(&num_dirty_emulations, 1); 10415 else 10416 atomic_add_long(&num_accessed_emulations, 1); 10417 #endif 10418 rv = 0; /* success */ 10419 done: 10420 if (lock != NULL) 10421 rw_wunlock(lock); 10422 PMAP_UNLOCK(pmap); 10423 return (rv); 10424 } 10425 10426 void 10427 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10428 { 10429 pml4_entry_t *pml4; 10430 pdp_entry_t *pdp; 10431 pd_entry_t *pde; 10432 pt_entry_t *pte, PG_V; 10433 int idx; 10434 10435 idx = 0; 10436 PG_V = pmap_valid_bit(pmap); 10437 PMAP_LOCK(pmap); 10438 10439 pml4 = pmap_pml4e(pmap, va); 10440 if (pml4 == NULL) 10441 goto done; 10442 ptr[idx++] = *pml4; 10443 if ((*pml4 & PG_V) == 0) 10444 goto done; 10445 10446 pdp = pmap_pml4e_to_pdpe(pml4, va); 10447 ptr[idx++] = *pdp; 10448 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10449 goto done; 10450 10451 pde = pmap_pdpe_to_pde(pdp, va); 10452 ptr[idx++] = *pde; 10453 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10454 goto done; 10455 10456 pte = pmap_pde_to_pte(pde, va); 10457 ptr[idx++] = *pte; 10458 10459 done: 10460 PMAP_UNLOCK(pmap); 10461 *num = idx; 10462 } 10463 10464 /** 10465 * Get the kernel virtual address of a set of physical pages. If there are 10466 * physical addresses not covered by the DMAP perform a transient mapping 10467 * that will be removed when calling pmap_unmap_io_transient. 10468 * 10469 * \param page The pages the caller wishes to obtain the virtual 10470 * address on the kernel memory map. 10471 * \param vaddr On return contains the kernel virtual memory address 10472 * of the pages passed in the page parameter. 10473 * \param count Number of pages passed in. 10474 * \param can_fault true if the thread using the mapped pages can take 10475 * page faults, false otherwise. 10476 * 10477 * \returns true if the caller must call pmap_unmap_io_transient when 10478 * finished or false otherwise. 10479 * 10480 */ 10481 bool 10482 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10483 bool can_fault) 10484 { 10485 vm_paddr_t paddr; 10486 bool needs_mapping; 10487 pt_entry_t *pte; 10488 int cache_bits, error __unused, i; 10489 10490 /* 10491 * Allocate any KVA space that we need, this is done in a separate 10492 * loop to prevent calling vmem_alloc while pinned. 
10493 */ 10494 needs_mapping = false; 10495 for (i = 0; i < count; i++) { 10496 paddr = VM_PAGE_TO_PHYS(page[i]); 10497 if (__predict_false(paddr >= dmaplimit)) { 10498 error = vmem_alloc(kernel_arena, PAGE_SIZE, 10499 M_BESTFIT | M_WAITOK, &vaddr[i]); 10500 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 10501 needs_mapping = true; 10502 } else { 10503 vaddr[i] = PHYS_TO_DMAP(paddr); 10504 } 10505 } 10506 10507 /* Exit early if everything is covered by the DMAP */ 10508 if (!needs_mapping) 10509 return (false); 10510 10511 /* 10512 * NB: The sequence of updating a page table followed by accesses 10513 * to the corresponding pages used in the !DMAP case is subject to 10514 * the situation described in the "AMD64 Architecture Programmer's 10515 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 10516 * Coherency Considerations". Therefore, issuing the INVLPG right 10517 * after modifying the PTE bits is crucial. 10518 */ 10519 if (!can_fault) 10520 sched_pin(); 10521 for (i = 0; i < count; i++) { 10522 paddr = VM_PAGE_TO_PHYS(page[i]); 10523 if (paddr >= dmaplimit) { 10524 if (can_fault) { 10525 /* 10526 * Slow path, since we can get page faults 10527 * while mappings are active don't pin the 10528 * thread to the CPU and instead add a global 10529 * mapping visible to all CPUs. 10530 */ 10531 pmap_qenter(vaddr[i], &page[i], 1); 10532 } else { 10533 pte = vtopte(vaddr[i]); 10534 cache_bits = pmap_cache_bits(kernel_pmap, 10535 page[i]->md.pat_mode, false); 10536 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 10537 cache_bits); 10538 pmap_invlpg(kernel_pmap, vaddr[i]); 10539 } 10540 } 10541 } 10542 10543 return (needs_mapping); 10544 } 10545 10546 void 10547 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10548 bool can_fault) 10549 { 10550 vm_paddr_t paddr; 10551 int i; 10552 10553 if (!can_fault) 10554 sched_unpin(); 10555 for (i = 0; i < count; i++) { 10556 paddr = VM_PAGE_TO_PHYS(page[i]); 10557 if (paddr >= dmaplimit) { 10558 if (can_fault) 10559 pmap_qremove(vaddr[i], 1); 10560 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 10561 } 10562 } 10563 } 10564 10565 vm_offset_t 10566 pmap_quick_enter_page(vm_page_t m) 10567 { 10568 vm_paddr_t paddr; 10569 10570 paddr = VM_PAGE_TO_PHYS(m); 10571 if (paddr < dmaplimit) 10572 return (PHYS_TO_DMAP(paddr)); 10573 mtx_lock_spin(&qframe_mtx); 10574 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 10575 10576 /* 10577 * Since qframe is exclusively mapped by us, and we do not set 10578 * PG_G, we can use INVLPG here. 10579 */ 10580 invlpg(qframe); 10581 10582 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10583 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, false)); 10584 return (qframe); 10585 } 10586 10587 void 10588 pmap_quick_remove_page(vm_offset_t addr) 10589 { 10590 10591 if (addr != qframe) 10592 return; 10593 pte_store(vtopte(qframe), 0); 10594 mtx_unlock_spin(&qframe_mtx); 10595 } 10596 10597 /* 10598 * Pdp pages from the large map are managed differently from either 10599 * kernel or user page table pages. They are permanently allocated at 10600 * initialization time, and their reference count is permanently set to 10601 * zero. The pml4 entries pointing to those pages are copied into 10602 * each allocated pmap. 10603 * 10604 * In contrast, pd and pt pages are managed like user page table 10605 * pages. They are dynamically allocated, and their reference count 10606 * represents the number of valid entries within the page. 
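 *
 * The functions below implement the large map KVA region.  A usage
 * sketch (the physical address and length here are hypothetical, and
 * the explicit write-back matters mainly for persistent memory):
 *
 *	void *va;
 *
 *	if (pmap_large_map(0x180000000UL, 16 * NBPDR, &va,
 *	    VM_MEMATTR_WRITE_BACK) == 0) {
 *		...
 *		pmap_large_map_wb(va, 16 * NBPDR);
 *		pmap_large_unmap(va, 16 * NBPDR);
 *	}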
10607 */ 10608 static vm_page_t 10609 pmap_large_map_getptp_unlocked(void) 10610 { 10611 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10612 } 10613 10614 static vm_page_t 10615 pmap_large_map_getptp(void) 10616 { 10617 vm_page_t m; 10618 10619 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10620 m = pmap_large_map_getptp_unlocked(); 10621 if (m == NULL) { 10622 PMAP_UNLOCK(kernel_pmap); 10623 vm_wait(NULL); 10624 PMAP_LOCK(kernel_pmap); 10625 /* Callers retry. */ 10626 } 10627 return (m); 10628 } 10629 10630 static pdp_entry_t * 10631 pmap_large_map_pdpe(vm_offset_t va) 10632 { 10633 vm_pindex_t pml4_idx; 10634 vm_paddr_t mphys; 10635 10636 pml4_idx = pmap_pml4e_index(va); 10637 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10638 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10639 "%#jx lm_ents %d", 10640 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10641 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10642 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10643 "LMSPML4I %#jx lm_ents %d", 10644 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10645 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10646 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10647 } 10648 10649 static pd_entry_t * 10650 pmap_large_map_pde(vm_offset_t va) 10651 { 10652 pdp_entry_t *pdpe; 10653 vm_page_t m; 10654 vm_paddr_t mphys; 10655 10656 retry: 10657 pdpe = pmap_large_map_pdpe(va); 10658 if (*pdpe == 0) { 10659 m = pmap_large_map_getptp(); 10660 if (m == NULL) 10661 goto retry; 10662 mphys = VM_PAGE_TO_PHYS(m); 10663 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10664 } else { 10665 MPASS((*pdpe & X86_PG_PS) == 0); 10666 mphys = *pdpe & PG_FRAME; 10667 } 10668 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10669 } 10670 10671 static pt_entry_t * 10672 pmap_large_map_pte(vm_offset_t va) 10673 { 10674 pd_entry_t *pde; 10675 vm_page_t m; 10676 vm_paddr_t mphys; 10677 10678 retry: 10679 pde = pmap_large_map_pde(va); 10680 if (*pde == 0) { 10681 m = pmap_large_map_getptp(); 10682 if (m == NULL) 10683 goto retry; 10684 mphys = VM_PAGE_TO_PHYS(m); 10685 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10686 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10687 } else { 10688 MPASS((*pde & X86_PG_PS) == 0); 10689 mphys = *pde & PG_FRAME; 10690 } 10691 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10692 } 10693 10694 static vm_paddr_t 10695 pmap_large_map_kextract(vm_offset_t va) 10696 { 10697 pdp_entry_t *pdpe, pdp; 10698 pd_entry_t *pde, pd; 10699 pt_entry_t *pte, pt; 10700 10701 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10702 ("not largemap range %#lx", (u_long)va)); 10703 pdpe = pmap_large_map_pdpe(va); 10704 pdp = *pdpe; 10705 KASSERT((pdp & X86_PG_V) != 0, 10706 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10707 (u_long)pdpe, pdp)); 10708 if ((pdp & X86_PG_PS) != 0) { 10709 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10710 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10711 (u_long)pdpe, pdp)); 10712 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10713 } 10714 pde = pmap_pdpe_to_pde(pdpe, va); 10715 pd = *pde; 10716 KASSERT((pd & X86_PG_V) != 0, 10717 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10718 if ((pd & X86_PG_PS) != 0) 10719 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10720 pte = pmap_pde_to_pte(pde, va); 10721 pt = *pte; 10722 KASSERT((pt & X86_PG_V) != 0, 10723 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 
10724 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 10725 } 10726 10727 static int 10728 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 10729 vmem_addr_t *vmem_res) 10730 { 10731 10732 /* 10733 * Large mappings are all but static. Consequently, there 10734 * is no point in waiting for an earlier allocation to be 10735 * freed. 10736 */ 10737 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 10738 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 10739 } 10740 10741 int 10742 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 10743 vm_memattr_t mattr) 10744 { 10745 pdp_entry_t *pdpe; 10746 pd_entry_t *pde; 10747 pt_entry_t *pte; 10748 vm_offset_t va, inc; 10749 vmem_addr_t vmem_res; 10750 vm_paddr_t pa; 10751 int error; 10752 10753 if (len == 0 || spa + len < spa) 10754 return (EINVAL); 10755 10756 /* See if DMAP can serve. */ 10757 if (spa + len <= dmaplimit) { 10758 va = PHYS_TO_DMAP(spa); 10759 *addr = (void *)va; 10760 return (pmap_change_attr(va, len, mattr)); 10761 } 10762 10763 /* 10764 * No, allocate KVA. Fit the address with best possible 10765 * alignment for superpages. Fall back to worse align if 10766 * failed. 10767 */ 10768 error = ENOMEM; 10769 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 10770 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 10771 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 10772 &vmem_res); 10773 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 10774 NBPDR) + NBPDR) 10775 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 10776 &vmem_res); 10777 if (error != 0) 10778 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 10779 if (error != 0) 10780 return (error); 10781 10782 /* 10783 * Fill pagetable. PG_M is not pre-set, we scan modified bits 10784 * in the pagetable to minimize flushing. No need to 10785 * invalidate TLB, since we only update invalid entries. 
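 * (pmap_large_map_wb_large() later consults these hardware-set PG_M
 * bits so that only blocks that were actually written get their cache
 * lines flushed.)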
10786 */ 10787 PMAP_LOCK(kernel_pmap); 10788 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10789 len -= inc) { 10790 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10791 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10792 pdpe = pmap_large_map_pdpe(va); 10793 MPASS(*pdpe == 0); 10794 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10795 X86_PG_V | X86_PG_A | pg_nx | 10796 pmap_cache_bits(kernel_pmap, mattr, true); 10797 inc = NBPDP; 10798 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10799 (va & PDRMASK) == 0) { 10800 pde = pmap_large_map_pde(va); 10801 MPASS(*pde == 0); 10802 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10803 X86_PG_V | X86_PG_A | pg_nx | 10804 pmap_cache_bits(kernel_pmap, mattr, true); 10805 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10806 ref_count++; 10807 inc = NBPDR; 10808 } else { 10809 pte = pmap_large_map_pte(va); 10810 MPASS(*pte == 0); 10811 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10812 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10813 mattr, false); 10814 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10815 ref_count++; 10816 inc = PAGE_SIZE; 10817 } 10818 } 10819 PMAP_UNLOCK(kernel_pmap); 10820 MPASS(len == 0); 10821 10822 *addr = (void *)vmem_res; 10823 return (0); 10824 } 10825 10826 void 10827 pmap_large_unmap(void *svaa, vm_size_t len) 10828 { 10829 vm_offset_t sva, va; 10830 vm_size_t inc; 10831 pdp_entry_t *pdpe, pdp; 10832 pd_entry_t *pde, pd; 10833 pt_entry_t *pte; 10834 vm_page_t m; 10835 struct spglist spgf; 10836 10837 sva = (vm_offset_t)svaa; 10838 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10839 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10840 return; 10841 10842 SLIST_INIT(&spgf); 10843 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10844 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10845 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10846 PMAP_LOCK(kernel_pmap); 10847 for (va = sva; va < sva + len; va += inc) { 10848 pdpe = pmap_large_map_pdpe(va); 10849 pdp = *pdpe; 10850 KASSERT((pdp & X86_PG_V) != 0, 10851 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10852 (u_long)pdpe, pdp)); 10853 if ((pdp & X86_PG_PS) != 0) { 10854 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10855 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10856 (u_long)pdpe, pdp)); 10857 KASSERT((va & PDPMASK) == 0, 10858 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10859 (u_long)pdpe, pdp)); 10860 KASSERT(va + NBPDP <= sva + len, 10861 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10862 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10863 (u_long)pdpe, pdp, len)); 10864 *pdpe = 0; 10865 inc = NBPDP; 10866 continue; 10867 } 10868 pde = pmap_pdpe_to_pde(pdpe, va); 10869 pd = *pde; 10870 KASSERT((pd & X86_PG_V) != 0, 10871 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10872 (u_long)pde, pd)); 10873 if ((pd & X86_PG_PS) != 0) { 10874 KASSERT((va & PDRMASK) == 0, 10875 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10876 (u_long)pde, pd)); 10877 KASSERT(va + NBPDR <= sva + len, 10878 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10879 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10880 pd, len)); 10881 pde_store(pde, 0); 10882 inc = NBPDR; 10883 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10884 m->ref_count--; 10885 if (m->ref_count == 0) { 10886 *pdpe = 0; 10887 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10888 } 10889 continue; 10890 } 10891 pte = pmap_pde_to_pte(pde, va); 10892 KASSERT((*pte & X86_PG_V) != 0, 10893 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10894 (u_long)pte, *pte)); 
10895 pte_clear(pte); 10896 inc = PAGE_SIZE; 10897 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10898 m->ref_count--; 10899 if (m->ref_count == 0) { 10900 *pde = 0; 10901 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10902 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10903 m->ref_count--; 10904 if (m->ref_count == 0) { 10905 *pdpe = 0; 10906 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10907 } 10908 } 10909 } 10910 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10911 PMAP_UNLOCK(kernel_pmap); 10912 vm_page_free_pages_toq(&spgf, false); 10913 vmem_free(large_vmem, sva, len); 10914 } 10915 10916 static void 10917 pmap_large_map_wb_fence_mfence(void) 10918 { 10919 10920 mfence(); 10921 } 10922 10923 static void 10924 pmap_large_map_wb_fence_atomic(void) 10925 { 10926 10927 atomic_thread_fence_seq_cst(); 10928 } 10929 10930 static void 10931 pmap_large_map_wb_fence_nop(void) 10932 { 10933 } 10934 10935 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10936 { 10937 10938 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10939 return (pmap_large_map_wb_fence_mfence); 10940 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10941 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10942 return (pmap_large_map_wb_fence_atomic); 10943 else 10944 /* clflush is strongly enough ordered */ 10945 return (pmap_large_map_wb_fence_nop); 10946 } 10947 10948 static void 10949 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10950 { 10951 10952 for (; len > 0; len -= cpu_clflush_line_size, 10953 va += cpu_clflush_line_size) 10954 clwb(va); 10955 } 10956 10957 static void 10958 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10959 { 10960 10961 for (; len > 0; len -= cpu_clflush_line_size, 10962 va += cpu_clflush_line_size) 10963 clflushopt(va); 10964 } 10965 10966 static void 10967 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10968 { 10969 10970 for (; len > 0; len -= cpu_clflush_line_size, 10971 va += cpu_clflush_line_size) 10972 clflush(va); 10973 } 10974 10975 static void 10976 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 10977 { 10978 } 10979 10980 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 10981 { 10982 10983 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 10984 return (pmap_large_map_flush_range_clwb); 10985 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 10986 return (pmap_large_map_flush_range_clflushopt); 10987 else if ((cpu_feature & CPUID_CLFSH) != 0) 10988 return (pmap_large_map_flush_range_clflush); 10989 else 10990 return (pmap_large_map_flush_range_nop); 10991 } 10992 10993 static void 10994 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 10995 { 10996 volatile u_long *pe; 10997 u_long p; 10998 vm_offset_t va; 10999 vm_size_t inc; 11000 bool seen_other; 11001 11002 for (va = sva; va < eva; va += inc) { 11003 inc = 0; 11004 if ((amd_feature & AMDID_PAGE1GB) != 0) { 11005 pe = (volatile u_long *)pmap_large_map_pdpe(va); 11006 p = *pe; 11007 if ((p & X86_PG_PS) != 0) 11008 inc = NBPDP; 11009 } 11010 if (inc == 0) { 11011 pe = (volatile u_long *)pmap_large_map_pde(va); 11012 p = *pe; 11013 if ((p & X86_PG_PS) != 0) 11014 inc = NBPDR; 11015 } 11016 if (inc == 0) { 11017 pe = (volatile u_long *)pmap_large_map_pte(va); 11018 p = *pe; 11019 inc = PAGE_SIZE; 11020 } 11021 seen_other = false; 11022 for (;;) { 11023 if ((p & X86_PG_AVAIL1) != 0) { 11024 /* 11025 * Spin-wait for the end of a parallel 11026 * write-back. 
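 * PG_AVAIL1 is set, further down in this loop, by whichever
 * thread is flushing the block, and is cleared again when
 * that flush completes.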
11027 */ 11028 cpu_spinwait(); 11029 p = *pe; 11030 11031 /* 11032 * If we saw other write-back 11033 * occuring, we cannot rely on PG_M to 11034 * indicate state of the cache. The 11035 * PG_M bit is cleared before the 11036 * flush to avoid ignoring new writes, 11037 * and writes which are relevant for 11038 * us might happen after. 11039 */ 11040 seen_other = true; 11041 continue; 11042 } 11043 11044 if ((p & X86_PG_M) != 0 || seen_other) { 11045 if (!atomic_fcmpset_long(pe, &p, 11046 (p & ~X86_PG_M) | X86_PG_AVAIL1)) 11047 /* 11048 * If we saw PG_M without 11049 * PG_AVAIL1, and then on the 11050 * next attempt we do not 11051 * observe either PG_M or 11052 * PG_AVAIL1, the other 11053 * write-back started after us 11054 * and finished before us. We 11055 * can rely on it doing our 11056 * work. 11057 */ 11058 continue; 11059 pmap_large_map_flush_range(va, inc); 11060 atomic_clear_long(pe, X86_PG_AVAIL1); 11061 } 11062 break; 11063 } 11064 maybe_yield(); 11065 } 11066 } 11067 11068 /* 11069 * Write-back cache lines for the given address range. 11070 * 11071 * Must be called only on the range or sub-range returned from 11072 * pmap_large_map(). Must not be called on the coalesced ranges. 11073 * 11074 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH 11075 * instructions support. 11076 */ 11077 void 11078 pmap_large_map_wb(void *svap, vm_size_t len) 11079 { 11080 vm_offset_t eva, sva; 11081 11082 sva = (vm_offset_t)svap; 11083 eva = sva + len; 11084 pmap_large_map_wb_fence(); 11085 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { 11086 pmap_large_map_flush_range(sva, len); 11087 } else { 11088 KASSERT(sva >= LARGEMAP_MIN_ADDRESS && 11089 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, 11090 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); 11091 pmap_large_map_wb_large(sva, eva); 11092 } 11093 pmap_large_map_wb_fence(); 11094 } 11095 11096 static vm_page_t 11097 pmap_pti_alloc_page(void) 11098 { 11099 vm_page_t m; 11100 11101 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11102 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO); 11103 return (m); 11104 } 11105 11106 static bool 11107 pmap_pti_free_page(vm_page_t m) 11108 { 11109 if (!vm_page_unwire_noq(m)) 11110 return (false); 11111 vm_page_xbusy_claim(m); 11112 vm_page_free_zero(m); 11113 return (true); 11114 } 11115 11116 static void 11117 pmap_pti_init(void) 11118 { 11119 vm_page_t pml4_pg; 11120 pdp_entry_t *pdpe; 11121 vm_offset_t va; 11122 int i; 11123 11124 if (!pti) 11125 return; 11126 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 11127 VM_OBJECT_WLOCK(pti_obj); 11128 pml4_pg = pmap_pti_alloc_page(); 11129 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 11130 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 11131 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 11132 pdpe = pmap_pti_pdpe(va); 11133 pmap_pti_wire_pte(pdpe); 11134 } 11135 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 11136 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 11137 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 11138 sizeof(struct gate_descriptor) * NIDT, false); 11139 CPU_FOREACH(i) { 11140 /* Doublefault stack IST 1 */ 11141 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu); 11142 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false); 11143 /* NMI stack IST 2 */ 11144 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); 11145 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, 
va, false); 11146 /* MC# stack IST 3 */ 11147 va = __pcpu[i].pc_common_tss.tss_ist3 + 11148 sizeof(struct nmi_pcpu); 11149 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 11150 /* DB# stack IST 4 */ 11151 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 11152 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 11153 } 11154 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 11155 true); 11156 pti_finalized = true; 11157 VM_OBJECT_WUNLOCK(pti_obj); 11158 } 11159 11160 static void 11161 pmap_cpu_init(void *arg __unused) 11162 { 11163 CPU_COPY(&all_cpus, &kernel_pmap->pm_active); 11164 pmap_pti_init(); 11165 } 11166 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL); 11167 11168 static pdp_entry_t * 11169 pmap_pti_pdpe(vm_offset_t va) 11170 { 11171 pml4_entry_t *pml4e; 11172 pdp_entry_t *pdpe; 11173 vm_page_t m; 11174 vm_pindex_t pml4_idx; 11175 vm_paddr_t mphys; 11176 11177 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11178 11179 pml4_idx = pmap_pml4e_index(va); 11180 pml4e = &pti_pml4[pml4_idx]; 11181 m = NULL; 11182 if (*pml4e == 0) { 11183 if (pti_finalized) 11184 panic("pml4 alloc after finalization\n"); 11185 m = pmap_pti_alloc_page(); 11186 if (*pml4e != 0) { 11187 pmap_pti_free_page(m); 11188 mphys = *pml4e & ~PAGE_MASK; 11189 } else { 11190 mphys = VM_PAGE_TO_PHYS(m); 11191 *pml4e = mphys | X86_PG_RW | X86_PG_V; 11192 } 11193 } else { 11194 mphys = *pml4e & ~PAGE_MASK; 11195 } 11196 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 11197 return (pdpe); 11198 } 11199 11200 static void 11201 pmap_pti_wire_pte(void *pte) 11202 { 11203 vm_page_t m; 11204 11205 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11206 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11207 m->ref_count++; 11208 } 11209 11210 static void 11211 pmap_pti_unwire_pde(void *pde, bool only_ref) 11212 { 11213 vm_page_t m; 11214 11215 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11216 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 11217 MPASS(only_ref || m->ref_count > 1); 11218 pmap_pti_free_page(m); 11219 } 11220 11221 static void 11222 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 11223 { 11224 vm_page_t m; 11225 pd_entry_t *pde; 11226 11227 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11228 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11229 if (pmap_pti_free_page(m)) { 11230 pde = pmap_pti_pde(va); 11231 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 11232 *pde = 0; 11233 pmap_pti_unwire_pde(pde, false); 11234 } 11235 } 11236 11237 static pd_entry_t * 11238 pmap_pti_pde(vm_offset_t va) 11239 { 11240 pdp_entry_t *pdpe; 11241 pd_entry_t *pde; 11242 vm_page_t m; 11243 vm_pindex_t pd_idx; 11244 vm_paddr_t mphys; 11245 11246 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11247 11248 pdpe = pmap_pti_pdpe(va); 11249 if (*pdpe == 0) { 11250 m = pmap_pti_alloc_page(); 11251 if (*pdpe != 0) { 11252 pmap_pti_free_page(m); 11253 MPASS((*pdpe & X86_PG_PS) == 0); 11254 mphys = *pdpe & ~PAGE_MASK; 11255 } else { 11256 mphys = VM_PAGE_TO_PHYS(m); 11257 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11258 } 11259 } else { 11260 MPASS((*pdpe & X86_PG_PS) == 0); 11261 mphys = *pdpe & ~PAGE_MASK; 11262 } 11263 11264 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11265 pd_idx = pmap_pde_index(va); 11266 pde += pd_idx; 11267 return (pde); 11268 } 11269 11270 static pt_entry_t * 11271 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11272 { 11273 pd_entry_t *pde; 11274 pt_entry_t *pte; 11275 vm_page_t m; 11276 vm_paddr_t mphys; 11277 11278 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11279 11280 pde = pmap_pti_pde(va); 
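	/*
	 * Pre-wire the page directory page on the caller's behalf.  The
	 * caller drops this extra reference after installing its PTE,
	 * unless this call allocates a new page table page below, in
	 * which case the reference accounts for the new PDE.
	 */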
11281 if (unwire_pde != NULL) { 11282 *unwire_pde = true; 11283 pmap_pti_wire_pte(pde); 11284 } 11285 if (*pde == 0) { 11286 m = pmap_pti_alloc_page(); 11287 if (*pde != 0) { 11288 pmap_pti_free_page(m); 11289 MPASS((*pde & X86_PG_PS) == 0); 11290 mphys = *pde & ~(PAGE_MASK | pg_nx); 11291 } else { 11292 mphys = VM_PAGE_TO_PHYS(m); 11293 *pde = mphys | X86_PG_RW | X86_PG_V; 11294 if (unwire_pde != NULL) 11295 *unwire_pde = false; 11296 } 11297 } else { 11298 MPASS((*pde & X86_PG_PS) == 0); 11299 mphys = *pde & ~(PAGE_MASK | pg_nx); 11300 } 11301 11302 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11303 pte += pmap_pte_index(va); 11304 11305 return (pte); 11306 } 11307 11308 static void 11309 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11310 { 11311 vm_paddr_t pa; 11312 pd_entry_t *pde; 11313 pt_entry_t *pte, ptev; 11314 bool unwire_pde; 11315 11316 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11317 11318 sva = trunc_page(sva); 11319 MPASS(sva > VM_MAXUSER_ADDRESS); 11320 eva = round_page(eva); 11321 MPASS(sva < eva); 11322 for (; sva < eva; sva += PAGE_SIZE) { 11323 pte = pmap_pti_pte(sva, &unwire_pde); 11324 pa = pmap_kextract(sva); 11325 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11326 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11327 VM_MEMATTR_DEFAULT, false); 11328 if (*pte == 0) { 11329 pte_store(pte, ptev); 11330 pmap_pti_wire_pte(pte); 11331 } else { 11332 KASSERT(!pti_finalized, 11333 ("pti overlap after fin %#lx %#lx %#lx", 11334 sva, *pte, ptev)); 11335 KASSERT(*pte == ptev, 11336 ("pti non-identical pte after fin %#lx %#lx %#lx", 11337 sva, *pte, ptev)); 11338 } 11339 if (unwire_pde) { 11340 pde = pmap_pti_pde(sva); 11341 pmap_pti_unwire_pde(pde, true); 11342 } 11343 } 11344 } 11345 11346 void 11347 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11348 { 11349 11350 if (!pti) 11351 return; 11352 VM_OBJECT_WLOCK(pti_obj); 11353 pmap_pti_add_kva_locked(sva, eva, exec); 11354 VM_OBJECT_WUNLOCK(pti_obj); 11355 } 11356 11357 void 11358 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11359 { 11360 pt_entry_t *pte; 11361 vm_offset_t va; 11362 11363 if (!pti) 11364 return; 11365 sva = rounddown2(sva, PAGE_SIZE); 11366 MPASS(sva > VM_MAXUSER_ADDRESS); 11367 eva = roundup2(eva, PAGE_SIZE); 11368 MPASS(sva < eva); 11369 VM_OBJECT_WLOCK(pti_obj); 11370 for (va = sva; va < eva; va += PAGE_SIZE) { 11371 pte = pmap_pti_pte(va, NULL); 11372 KASSERT((*pte & X86_PG_V) != 0, 11373 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11374 (u_long)pte, *pte)); 11375 pte_clear(pte); 11376 pmap_pti_unwire_pte(pte, va); 11377 } 11378 pmap_invalidate_range(kernel_pmap, sva, eva); 11379 VM_OBJECT_WUNLOCK(pti_obj); 11380 } 11381 11382 static void * 11383 pkru_dup_range(void *ctx __unused, void *data) 11384 { 11385 struct pmap_pkru_range *node, *new_node; 11386 11387 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11388 if (new_node == NULL) 11389 return (NULL); 11390 node = data; 11391 memcpy(new_node, node, sizeof(*node)); 11392 return (new_node); 11393 } 11394 11395 static void 11396 pkru_free_range(void *ctx __unused, void *node) 11397 { 11398 11399 uma_zfree(pmap_pkru_ranges_zone, node); 11400 } 11401 11402 static int 11403 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11404 int flags) 11405 { 11406 struct pmap_pkru_range *ppr; 11407 int error; 11408 11409 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11410 MPASS(pmap->pm_type == PT_X86); 11411 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11412 if ((flags & 
AMD64_PKRU_EXCL) != 0 && 11413 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11414 return (EBUSY); 11415 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11416 if (ppr == NULL) 11417 return (ENOMEM); 11418 ppr->pkru_keyidx = keyidx; 11419 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11420 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11421 if (error != 0) 11422 uma_zfree(pmap_pkru_ranges_zone, ppr); 11423 return (error); 11424 } 11425 11426 static int 11427 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11428 { 11429 11430 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11431 MPASS(pmap->pm_type == PT_X86); 11432 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11433 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11434 } 11435 11436 static void 11437 pmap_pkru_deassign_all(pmap_t pmap) 11438 { 11439 11440 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11441 if (pmap->pm_type == PT_X86 && 11442 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11443 rangeset_remove_all(&pmap->pm_pkru); 11444 } 11445 11446 static bool 11447 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11448 { 11449 struct pmap_pkru_range *ppr, *prev_ppr; 11450 vm_offset_t va; 11451 11452 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11453 if (pmap->pm_type != PT_X86 || 11454 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11455 sva >= VM_MAXUSER_ADDRESS) 11456 return (true); 11457 MPASS(eva <= VM_MAXUSER_ADDRESS); 11458 for (va = sva; va < eva; prev_ppr = ppr) { 11459 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11460 if (va == sva) 11461 prev_ppr = ppr; 11462 else if ((ppr == NULL) ^ (prev_ppr == NULL)) 11463 return (false); 11464 if (ppr == NULL) { 11465 va += PAGE_SIZE; 11466 continue; 11467 } 11468 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 11469 return (false); 11470 va = ppr->pkru_rs_el.re_end; 11471 } 11472 return (true); 11473 } 11474 11475 static pt_entry_t 11476 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11477 { 11478 struct pmap_pkru_range *ppr; 11479 11480 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11481 if (pmap->pm_type != PT_X86 || 11482 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11483 va >= VM_MAXUSER_ADDRESS) 11484 return (0); 11485 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11486 if (ppr != NULL) 11487 return (X86_PG_PKU(ppr->pkru_keyidx)); 11488 return (0); 11489 } 11490 11491 static bool 11492 pred_pkru_on_remove(void *ctx __unused, void *r) 11493 { 11494 struct pmap_pkru_range *ppr; 11495 11496 ppr = r; 11497 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11498 } 11499 11500 static void 11501 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11502 { 11503 11504 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11505 if (pmap->pm_type == PT_X86 && 11506 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11507 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11508 pred_pkru_on_remove); 11509 } 11510 } 11511 11512 static int 11513 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11514 { 11515 11516 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11517 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11518 MPASS(dst_pmap->pm_type == PT_X86); 11519 MPASS(src_pmap->pm_type == PT_X86); 11520 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11521 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11522 return (0); 11523 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11524 } 11525 11526 static void 11527 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11528 u_int keyidx) 11529 { 11530 pml4_entry_t *pml4e; 11531 pdp_entry_t *pdpe; 11532 pd_entry_t newpde, ptpaddr, *pde; 
11533 pt_entry_t newpte, *ptep, pte; 11534 vm_offset_t va, va_next; 11535 bool changed; 11536 11537 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11538 MPASS(pmap->pm_type == PT_X86); 11539 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 11540 11541 for (changed = false, va = sva; va < eva; va = va_next) { 11542 pml4e = pmap_pml4e(pmap, va); 11543 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { 11544 va_next = (va + NBPML4) & ~PML4MASK; 11545 if (va_next < va) 11546 va_next = eva; 11547 continue; 11548 } 11549 11550 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 11551 if ((*pdpe & X86_PG_V) == 0) { 11552 va_next = (va + NBPDP) & ~PDPMASK; 11553 if (va_next < va) 11554 va_next = eva; 11555 continue; 11556 } 11557 11558 va_next = (va + NBPDR) & ~PDRMASK; 11559 if (va_next < va) 11560 va_next = eva; 11561 11562 pde = pmap_pdpe_to_pde(pdpe, va); 11563 ptpaddr = *pde; 11564 if (ptpaddr == 0) 11565 continue; 11566 11567 MPASS((ptpaddr & X86_PG_V) != 0); 11568 if ((ptpaddr & PG_PS) != 0) { 11569 if (va + NBPDR == va_next && eva >= va_next) { 11570 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 11571 X86_PG_PKU(keyidx); 11572 if (newpde != ptpaddr) { 11573 *pde = newpde; 11574 changed = true; 11575 } 11576 continue; 11577 } else if (!pmap_demote_pde(pmap, pde, va)) { 11578 continue; 11579 } 11580 } 11581 11582 if (va_next > eva) 11583 va_next = eva; 11584 11585 for (ptep = pmap_pde_to_pte(pde, va); va != va_next; 11586 ptep++, va += PAGE_SIZE) { 11587 pte = *ptep; 11588 if ((pte & X86_PG_V) == 0) 11589 continue; 11590 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); 11591 if (newpte != pte) { 11592 *ptep = newpte; 11593 changed = true; 11594 } 11595 } 11596 } 11597 if (changed) 11598 pmap_invalidate_range(pmap, sva, eva); 11599 } 11600 11601 static int 11602 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11603 u_int keyidx, int flags) 11604 { 11605 11606 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || 11607 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) 11608 return (EINVAL); 11609 if (eva <= sva || eva > VM_MAXUSER_ADDRESS) 11610 return (EFAULT); 11611 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 11612 return (ENOTSUP); 11613 return (0); 11614 } 11615 11616 int 11617 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11618 int flags) 11619 { 11620 int error; 11621 11622 sva = trunc_page(sva); 11623 eva = round_page(eva); 11624 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); 11625 if (error != 0) 11626 return (error); 11627 for (;;) { 11628 PMAP_LOCK(pmap); 11629 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); 11630 if (error == 0) 11631 pmap_pkru_update_range(pmap, sva, eva, keyidx); 11632 PMAP_UNLOCK(pmap); 11633 if (error != ENOMEM) 11634 break; 11635 vm_wait(NULL); 11636 } 11637 return (error); 11638 } 11639 11640 int 11641 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11642 { 11643 int error; 11644 11645 sva = trunc_page(sva); 11646 eva = round_page(eva); 11647 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); 11648 if (error != 0) 11649 return (error); 11650 for (;;) { 11651 PMAP_LOCK(pmap); 11652 error = pmap_pkru_deassign(pmap, sva, eva); 11653 if (error == 0) 11654 pmap_pkru_update_range(pmap, sva, eva, 0); 11655 PMAP_UNLOCK(pmap); 11656 if (error != ENOMEM) 11657 break; 11658 vm_wait(NULL); 11659 } 11660 return (error); 11661 } 11662 11663 #if defined(KASAN) || defined(KMSAN) 11664 11665 /* 11666 * Reserve enough memory to: 11667 * 1) allocate PDP pages for the shadow map(s), 11668 * 2) shadow the boot 
stack of KSTACK_PAGES pages, 11669 * so we need one PD page, one or two PT pages, and KSTACK_PAGES shadow pages 11670 * per shadow map. 11671 */ 11672 #ifdef KASAN 11673 #define SAN_EARLY_PAGES \ 11674 (NKASANPML4E + 1 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE)) 11675 #else 11676 #define SAN_EARLY_PAGES \ 11677 (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (1 + 2 + KSTACK_PAGES)) 11678 #endif 11679 11680 static uint64_t __nosanitizeaddress __nosanitizememory 11681 pmap_san_enter_early_alloc_4k(uint64_t pabase) 11682 { 11683 static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE); 11684 static size_t offset = 0; 11685 uint64_t pa; 11686 11687 if (offset == sizeof(data)) { 11688 panic("%s: ran out of memory for the bootstrap shadow map", 11689 __func__); 11690 } 11691 11692 pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART); 11693 offset += PAGE_SIZE; 11694 return (pa); 11695 } 11696 11697 /* 11698 * Map a shadow page, before the kernel has bootstrapped its page tables. This 11699 * is currently only used to shadow the temporary boot stack set up by locore. 11700 */ 11701 static void __nosanitizeaddress __nosanitizememory 11702 pmap_san_enter_early(vm_offset_t va) 11703 { 11704 static bool first = true; 11705 pml4_entry_t *pml4e; 11706 pdp_entry_t *pdpe; 11707 pd_entry_t *pde; 11708 pt_entry_t *pte; 11709 uint64_t cr3, pa, base; 11710 int i; 11711 11712 base = amd64_loadaddr(); 11713 cr3 = rcr3(); 11714 11715 if (first) { 11716 /* 11717 * If this is the first call, we need to allocate new PML4Es for 11718 * the bootstrap shadow map(s). We don't know how the PML4 page 11719 * was initialized by the boot loader, so we can't simply test 11720 * whether the shadow map's PML4Es are zero. 11721 */ 11722 first = false; 11723 #ifdef KASAN 11724 for (i = 0; i < NKASANPML4E; i++) { 11725 pa = pmap_san_enter_early_alloc_4k(base); 11726 11727 pml4e = (pml4_entry_t *)cr3 + 11728 pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4); 11729 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11730 } 11731 #else 11732 for (i = 0; i < NKMSANORIGPML4E; i++) { 11733 pa = pmap_san_enter_early_alloc_4k(base); 11734 11735 pml4e = (pml4_entry_t *)cr3 + 11736 pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS + 11737 i * NBPML4); 11738 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11739 } 11740 for (i = 0; i < NKMSANSHADPML4E; i++) { 11741 pa = pmap_san_enter_early_alloc_4k(base); 11742 11743 pml4e = (pml4_entry_t *)cr3 + 11744 pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS + 11745 i * NBPML4); 11746 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11747 } 11748 #endif 11749 } 11750 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va); 11751 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va); 11752 if (*pdpe == 0) { 11753 pa = pmap_san_enter_early_alloc_4k(base); 11754 *pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V); 11755 } 11756 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va); 11757 if (*pde == 0) { 11758 pa = pmap_san_enter_early_alloc_4k(base); 11759 *pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V); 11760 } 11761 pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va); 11762 if (*pte != 0) 11763 panic("%s: PTE for %#lx is already initialized", __func__, va); 11764 pa = pmap_san_enter_early_alloc_4k(base); 11765 *pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V); 11766 } 11767 11768 static vm_page_t 11769 pmap_san_enter_alloc_4k(void) 11770 { 11771 vm_page_t m; 11772 11773 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 11774 VM_ALLOC_ZERO); 11775 if (m
== NULL) 11776 panic("%s: no memory to grow shadow map", __func__); 11777 return (m); 11778 } 11779 11780 static vm_page_t 11781 pmap_san_enter_alloc_2m(void) 11782 { 11783 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 11784 NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT)); 11785 } 11786 11787 /* 11788 * Grow a shadow map by at least one 4KB page at the specified address. Use 2MB 11789 * pages when possible. 11790 */ 11791 void __nosanitizeaddress __nosanitizememory 11792 pmap_san_enter(vm_offset_t va) 11793 { 11794 pdp_entry_t *pdpe; 11795 pd_entry_t *pde; 11796 pt_entry_t *pte; 11797 vm_page_t m; 11798 11799 if (kernphys == 0) { 11800 /* 11801 * We're creating a temporary shadow map for the boot stack. 11802 */ 11803 pmap_san_enter_early(va); 11804 return; 11805 } 11806 11807 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 11808 11809 pdpe = pmap_pdpe(kernel_pmap, va); 11810 if ((*pdpe & X86_PG_V) == 0) { 11811 m = pmap_san_enter_alloc_4k(); 11812 *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11813 X86_PG_V | pg_nx); 11814 } 11815 pde = pmap_pdpe_to_pde(pdpe, va); 11816 if ((*pde & X86_PG_V) == 0) { 11817 m = pmap_san_enter_alloc_2m(); 11818 if (m != NULL) { 11819 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11820 X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); 11821 } else { 11822 m = pmap_san_enter_alloc_4k(); 11823 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11824 X86_PG_V | pg_nx); 11825 } 11826 } 11827 if ((*pde & X86_PG_PS) != 0) 11828 return; 11829 pte = pmap_pde_to_pte(pde, va); 11830 if ((*pte & X86_PG_V) != 0) 11831 return; 11832 m = pmap_san_enter_alloc_4k(); 11833 *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | 11834 X86_PG_M | X86_PG_A | pg_nx); 11835 } 11836 #endif 11837 11838 /* 11839 * Track a range of the kernel's virtual address space that is contiguous 11840 * in various mapping attributes. 11841 */ 11842 struct pmap_kernel_map_range { 11843 vm_offset_t sva; 11844 pt_entry_t attrs; 11845 int ptes; 11846 int pdes; 11847 int pdpes; 11848 }; 11849 11850 static void 11851 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 11852 vm_offset_t eva) 11853 { 11854 const char *mode; 11855 int i, pat_idx; 11856 11857 if (eva <= range->sva) 11858 return; 11859 11860 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); 11861 for (i = 0; i < PAT_INDEX_SIZE; i++) 11862 if (pat_index[i] == pat_idx) 11863 break; 11864 11865 switch (i) { 11866 case PAT_WRITE_BACK: 11867 mode = "WB"; 11868 break; 11869 case PAT_WRITE_THROUGH: 11870 mode = "WT"; 11871 break; 11872 case PAT_UNCACHEABLE: 11873 mode = "UC"; 11874 break; 11875 case PAT_UNCACHED: 11876 mode = "U-"; 11877 break; 11878 case PAT_WRITE_PROTECTED: 11879 mode = "WP"; 11880 break; 11881 case PAT_WRITE_COMBINING: 11882 mode = "WC"; 11883 break; 11884 default: 11885 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", 11886 __func__, pat_idx, range->sva, eva); 11887 mode = "??"; 11888 break; 11889 } 11890 11891 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 11892 range->sva, eva, 11893 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', 11894 (range->attrs & pg_nx) != 0 ? '-' : 'x', 11895 (range->attrs & X86_PG_U) != 0 ? 'u' : 's', 11896 (range->attrs & X86_PG_G) != 0 ? 'g' : '-', 11897 mode, range->pdpes, range->pdes, range->ptes); 11898 11899 /* Reset to sentinel value. */ 11900 range->sva = la57 ? 
KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11901 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11902 NPDEPG - 1, NPTEPG - 1); 11903 } 11904 11905 /* 11906 * Determine whether the attributes specified by a page table entry match those 11907 * being tracked by the current range. This is not quite as simple as a direct 11908 * flag comparison since some PAT modes have multiple representations. 11909 */ 11910 static bool 11911 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 11912 { 11913 pt_entry_t diff, mask; 11914 11915 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; 11916 diff = (range->attrs ^ attrs) & mask; 11917 if (diff == 0) 11918 return (true); 11919 if ((diff & ~X86_PG_PDE_PAT) == 0 && 11920 pmap_pat_index(kernel_pmap, range->attrs, true) == 11921 pmap_pat_index(kernel_pmap, attrs, true)) 11922 return (true); 11923 return (false); 11924 } 11925 11926 static void 11927 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 11928 pt_entry_t attrs) 11929 { 11930 11931 memset(range, 0, sizeof(*range)); 11932 range->sva = va; 11933 range->attrs = attrs; 11934 } 11935 11936 /* 11937 * Given a leaf PTE, derive the mapping's attributes. If they do not match 11938 * those of the current run, dump the address range and its attributes, and 11939 * begin a new run. 11940 */ 11941 static void 11942 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 11943 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, 11944 pt_entry_t pte) 11945 { 11946 pt_entry_t attrs; 11947 11948 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); 11949 11950 attrs |= pdpe & pg_nx; 11951 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); 11952 if ((pdpe & PG_PS) != 0) { 11953 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); 11954 } else if (pde != 0) { 11955 attrs |= pde & pg_nx; 11956 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); 11957 } 11958 if ((pde & PG_PS) != 0) { 11959 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); 11960 } else if (pte != 0) { 11961 attrs |= pte & pg_nx; 11962 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); 11963 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); 11964 11965 /* Canonicalize by always using the PDE PAT bit. */ 11966 if ((attrs & X86_PG_PTE_PAT) != 0) 11967 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; 11968 } 11969 11970 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 11971 sysctl_kmaps_dump(sb, range, va); 11972 sysctl_kmaps_reinit(range, va, attrs); 11973 } 11974 } 11975 11976 static int 11977 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 11978 { 11979 struct pmap_kernel_map_range range; 11980 struct sbuf sbuf, *sb; 11981 pml4_entry_t pml4e; 11982 pdp_entry_t *pdp, pdpe; 11983 pd_entry_t *pd, pde; 11984 pt_entry_t *pt, pte; 11985 vm_offset_t sva; 11986 vm_paddr_t pa; 11987 int error, i, j, k, l; 11988 11989 error = sysctl_wire_old_buffer(req, 0); 11990 if (error != 0) 11991 return (error); 11992 sb = &sbuf; 11993 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 11994 11995 /* Sentinel value. */ 11996 range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11997 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11998 NPDEPG - 1, NPTEPG - 1); 11999 12000 /* 12001 * Iterate over the kernel page tables without holding the kernel pmap 12002 * lock. Outside of the large map, kernel page table pages are never 12003 * freed, so at worst we will observe inconsistencies in the output. 
12004 * Within the large map, ensure that PDP and PD page addresses are 12005 * valid before descending. 12006 */ 12007 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { 12008 switch (i) { 12009 case PML4PML4I: 12010 sbuf_printf(sb, "\nRecursive map:\n"); 12011 break; 12012 case DMPML4I: 12013 sbuf_printf(sb, "\nDirect map:\n"); 12014 break; 12015 #ifdef KASAN 12016 case KASANPML4I: 12017 sbuf_printf(sb, "\nKASAN shadow map:\n"); 12018 break; 12019 #endif 12020 #ifdef KMSAN 12021 case KMSANSHADPML4I: 12022 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 12023 break; 12024 case KMSANORIGPML4I: 12025 sbuf_printf(sb, "\nKMSAN origin map:\n"); 12026 break; 12027 #endif 12028 case KPML4BASE: 12029 sbuf_printf(sb, "\nKernel map:\n"); 12030 break; 12031 case LMSPML4I: 12032 sbuf_printf(sb, "\nLarge map:\n"); 12033 break; 12034 } 12035 12036 /* Convert to canonical form. */ 12037 if (sva == 1ul << 47) 12038 sva |= -1ul << 48; 12039 12040 restart: 12041 pml4e = kernel_pml4[i]; 12042 if ((pml4e & X86_PG_V) == 0) { 12043 sva = rounddown2(sva, NBPML4); 12044 sysctl_kmaps_dump(sb, &range, sva); 12045 sva += NBPML4; 12046 continue; 12047 } 12048 pa = pml4e & PG_FRAME; 12049 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); 12050 12051 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { 12052 pdpe = pdp[j]; 12053 if ((pdpe & X86_PG_V) == 0) { 12054 sva = rounddown2(sva, NBPDP); 12055 sysctl_kmaps_dump(sb, &range, sva); 12056 sva += NBPDP; 12057 continue; 12058 } 12059 pa = pdpe & PG_FRAME; 12060 if ((pdpe & PG_PS) != 0) { 12061 sva = rounddown2(sva, NBPDP); 12062 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 12063 0, 0); 12064 range.pdpes++; 12065 sva += NBPDP; 12066 continue; 12067 } 12068 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12069 vm_phys_paddr_to_vm_page(pa) == NULL) { 12070 /* 12071 * Page table pages for the large map may be 12072 * freed. Validate the next-level address 12073 * before descending. 12074 */ 12075 goto restart; 12076 } 12077 pd = (pd_entry_t *)PHYS_TO_DMAP(pa); 12078 12079 for (k = pmap_pde_index(sva); k < NPDEPG; k++) { 12080 pde = pd[k]; 12081 if ((pde & X86_PG_V) == 0) { 12082 sva = rounddown2(sva, NBPDR); 12083 sysctl_kmaps_dump(sb, &range, sva); 12084 sva += NBPDR; 12085 continue; 12086 } 12087 pa = pde & PG_FRAME; 12088 if ((pde & PG_PS) != 0) { 12089 sva = rounddown2(sva, NBPDR); 12090 sysctl_kmaps_check(sb, &range, sva, 12091 pml4e, pdpe, pde, 0); 12092 range.pdes++; 12093 sva += NBPDR; 12094 continue; 12095 } 12096 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12097 vm_phys_paddr_to_vm_page(pa) == NULL) { 12098 /* 12099 * Page table pages for the large map 12100 * may be freed. Validate the 12101 * next-level address before descending. 
12102 */ 12103 goto restart; 12104 } 12105 pt = (pt_entry_t *)PHYS_TO_DMAP(pa); 12106 12107 for (l = pmap_pte_index(sva); l < NPTEPG; l++, 12108 sva += PAGE_SIZE) { 12109 pte = pt[l]; 12110 if ((pte & X86_PG_V) == 0) { 12111 sysctl_kmaps_dump(sb, &range, 12112 sva); 12113 continue; 12114 } 12115 sysctl_kmaps_check(sb, &range, sva, 12116 pml4e, pdpe, pde, pte); 12117 range.ptes++; 12118 } 12119 } 12120 } 12121 } 12122 12123 error = sbuf_finish(sb); 12124 sbuf_delete(sb); 12125 return (error); 12126 } 12127 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 12128 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 12129 NULL, 0, sysctl_kmaps, "A", 12130 "Dump kernel address layout"); 12131 12132 #ifdef DDB 12133 DB_SHOW_COMMAND(pte, pmap_print_pte) 12134 { 12135 pmap_t pmap; 12136 pml5_entry_t *pml5; 12137 pml4_entry_t *pml4; 12138 pdp_entry_t *pdp; 12139 pd_entry_t *pde; 12140 pt_entry_t *pte, PG_V; 12141 vm_offset_t va; 12142 12143 if (!have_addr) { 12144 db_printf("show pte addr\n"); 12145 return; 12146 } 12147 va = (vm_offset_t)addr; 12148 12149 if (kdb_thread != NULL) 12150 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 12151 else 12152 pmap = PCPU_GET(curpmap); 12153 12154 PG_V = pmap_valid_bit(pmap); 12155 db_printf("VA 0x%016lx", va); 12156 12157 if (pmap_is_la57(pmap)) { 12158 pml5 = pmap_pml5e(pmap, va); 12159 db_printf(" pml5e 0x%016lx", *pml5); 12160 if ((*pml5 & PG_V) == 0) { 12161 db_printf("\n"); 12162 return; 12163 } 12164 pml4 = pmap_pml5e_to_pml4e(pml5, va); 12165 } else { 12166 pml4 = pmap_pml4e(pmap, va); 12167 } 12168 db_printf(" pml4e 0x%016lx", *pml4); 12169 if ((*pml4 & PG_V) == 0) { 12170 db_printf("\n"); 12171 return; 12172 } 12173 pdp = pmap_pml4e_to_pdpe(pml4, va); 12174 db_printf(" pdpe 0x%016lx", *pdp); 12175 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 12176 db_printf("\n"); 12177 return; 12178 } 12179 pde = pmap_pdpe_to_pde(pdp, va); 12180 db_printf(" pde 0x%016lx", *pde); 12181 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 12182 db_printf("\n"); 12183 return; 12184 } 12185 pte = pmap_pde_to_pte(pde, va); 12186 db_printf(" pte 0x%016lx\n", *pte); 12187 } 12188 12189 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 12190 { 12191 vm_paddr_t a; 12192 12193 if (have_addr) { 12194 a = (vm_paddr_t)addr; 12195 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 12196 } else { 12197 db_printf("show phys2dmap addr\n"); 12198 } 12199 } 12200 12201 static void 12202 ptpages_show_page(int level, int idx, vm_page_t pg) 12203 { 12204 db_printf("l %d i %d pg %p phys %#lx ref %x\n", 12205 level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); 12206 } 12207 12208 static void 12209 ptpages_show_complain(int level, int idx, uint64_t pte) 12210 { 12211 db_printf("l %d i %d pte %#lx\n", level, idx, pte); 12212 } 12213 12214 static void 12215 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) 12216 { 12217 vm_page_t pg3, pg2, pg1; 12218 pml4_entry_t *pml4; 12219 pdp_entry_t *pdp; 12220 pd_entry_t *pd; 12221 int i4, i3, i2; 12222 12223 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); 12224 for (i4 = 0; i4 < num_entries; i4++) { 12225 if ((pml4[i4] & PG_V) == 0) 12226 continue; 12227 pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); 12228 if (pg3 == NULL) { 12229 ptpages_show_complain(3, i4, pml4[i4]); 12230 continue; 12231 } 12232 ptpages_show_page(3, i4, pg3); 12233 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); 12234 for (i3 = 0; i3 < NPDPEPG; i3++) { 12235 if ((pdp[i3] & PG_V) == 0) 12236 continue; 12237 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); 
12238 if (pg2 == NULL) { 12239 ptpages_show_complain(2, i3, pdp[i3]); 12240 continue; 12241 } 12242 ptpages_show_page(2, i3, pg2); 12243 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); 12244 for (i2 = 0; i2 < NPDEPG; i2++) { 12245 if ((pd[i2] & PG_V) == 0) 12246 continue; 12247 pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); 12248 if (pg1 == NULL) { 12249 ptpages_show_complain(1, i2, pd[i2]); 12250 continue; 12251 } 12252 ptpages_show_page(1, i2, pg1); 12253 } 12254 } 12255 } 12256 } 12257 12258 DB_SHOW_COMMAND(ptpages, pmap_ptpages) 12259 { 12260 pmap_t pmap; 12261 vm_page_t pg; 12262 pml5_entry_t *pml5; 12263 uint64_t PG_V; 12264 int i5; 12265 12266 if (have_addr) 12267 pmap = (pmap_t)addr; 12268 else 12269 pmap = PCPU_GET(curpmap); 12270 12271 PG_V = pmap_valid_bit(pmap); 12272 12273 if (pmap_is_la57(pmap)) { 12274 pml5 = pmap->pm_pmltop; 12275 for (i5 = 0; i5 < NUPML5E; i5++) { 12276 if ((pml5[i5] & PG_V) == 0) 12277 continue; 12278 pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); 12279 if (pg == NULL) { 12280 ptpages_show_complain(4, i5, pml5[i5]); 12281 continue; 12282 } 12283 ptpages_show_page(4, i5, pg); 12284 ptpages_show_pml4(pg, NPML4EPG, PG_V); 12285 } 12286 } else { 12287 ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( 12288 (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); 12289 } 12290 } 12291 #endif 12292
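/*
 * A minimal usage sketch for the large-map write-back KPI documented above,
 * assuming the pmap_large_map(), pmap_large_map_wb(), and pmap_large_unmap()
 * prototypes declared in <machine/pmap.h>.  The example_wb_phys_range() name
 * and its caller-supplied physical range are hypothetical; the point is that
 * pmap_large_map_wb() is applied only to the mapping (or a sub-range of it)
 * returned by pmap_large_map(), per its contract.
 */
#if 0
static int
example_wb_phys_range(vm_paddr_t spa, vm_size_t len)
{
	void *va;
	int error;

	/* Map the physical range; the pmap may use 2M/1G pages internally. */
	error = pmap_large_map(spa, len, &va, VM_MEMATTR_DEFAULT);
	if (error != 0)
		return (error);

	/* ... stores through "va" happen here ... */

	/* Write back the cache lines covering exactly this mapping. */
	pmap_large_map_wb(va, len);

	pmap_large_unmap(va, len);
	return (0);
}
#endif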