1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * the Systems Programming Group of the University of Utah Computer 17 * Science Department and William Jolitz of UUNET Technologies Inc. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 3. All advertising materials mentioning features or use of this software 28 * must display the following acknowledgement: 29 * This product includes software developed by the University of 30 * California, Berkeley and its contributors. 31 * 4. Neither the name of the University nor the names of its contributors 32 * may be used to endorse or promote products derived from this software 33 * without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 45 * SUCH DAMAGE. 46 */ 47 /*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * Copyright (c) 2014-2020 The FreeBSD Foundation 50 * All rights reserved. 51 * 52 * This software was developed for the FreeBSD Project by Jake Burkholder, 53 * Safeport Network Services, and Network Associates Laboratories, the 54 * Security Research Division of Network Associates, Inc. under 55 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 56 * CHATS research program. 57 * 58 * Portions of this software were developed by 59 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from 60 * the FreeBSD Foundation. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 
70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84 #define AMD64_NPT_AWARE 85 86 #include <sys/cdefs.h> 87 /* 88 * Manages physical address maps. 89 * 90 * Since the information managed by this module is 91 * also stored by the logical address mapping module, 92 * this module may throw away valid virtual-to-physical 93 * mappings at almost any time. However, invalidations 94 * of virtual-to-physical mappings must be done as 95 * requested. 96 * 97 * In order to cope with hardware architectures which 98 * make virtual-to-physical map invalidates expensive, 99 * this module may delay invalidate or reduced protection 100 * operations until such time as they are actually 101 * necessary. This module is given full information as 102 * to which processors are currently using which maps, 103 * and to when physical maps must be made correct. 104 */ 105 106 #include "opt_ddb.h" 107 #include "opt_pmap.h" 108 #include "opt_vm.h" 109 110 #include <sys/param.h> 111 #include <sys/asan.h> 112 #include <sys/bitstring.h> 113 #include <sys/bus.h> 114 #include <sys/systm.h> 115 #include <sys/counter.h> 116 #include <sys/kernel.h> 117 #include <sys/ktr.h> 118 #include <sys/lock.h> 119 #include <sys/malloc.h> 120 #include <sys/mman.h> 121 #include <sys/msan.h> 122 #include <sys/mutex.h> 123 #include <sys/proc.h> 124 #include <sys/rangeset.h> 125 #include <sys/rwlock.h> 126 #include <sys/sbuf.h> 127 #include <sys/smr.h> 128 #include <sys/sx.h> 129 #include <sys/turnstile.h> 130 #include <sys/vmem.h> 131 #include <sys/vmmeter.h> 132 #include <sys/sched.h> 133 #include <sys/sysctl.h> 134 #include <sys/smp.h> 135 #ifdef DDB 136 #include <sys/kdb.h> 137 #include <ddb/ddb.h> 138 #endif 139 140 #include <vm/vm.h> 141 #include <vm/vm_param.h> 142 #include <vm/vm_kern.h> 143 #include <vm/vm_page.h> 144 #include <vm/vm_map.h> 145 #include <vm/vm_object.h> 146 #include <vm/vm_extern.h> 147 #include <vm/vm_pageout.h> 148 #include <vm/vm_pager.h> 149 #include <vm/vm_phys.h> 150 #include <vm/vm_radix.h> 151 #include <vm/vm_reserv.h> 152 #include <vm/vm_dumpset.h> 153 #include <vm/uma.h> 154 155 #include <machine/asan.h> 156 #include <machine/intr_machdep.h> 157 #include <x86/apicvar.h> 158 #include <x86/ifunc.h> 159 #include <machine/cpu.h> 160 #include <machine/cputypes.h> 161 #include <machine/md_var.h> 162 #include <machine/msan.h> 163 #include <machine/pcb.h> 164 #include <machine/specialreg.h> 165 #ifdef SMP 166 #include <machine/smp.h> 167 #endif 168 #include <machine/sysarch.h> 169 #include <machine/tss.h> 170 171 #ifdef NUMA 172 #define PMAP_MEMDOM MAXMEMDOM 173 #else 174 #define PMAP_MEMDOM 1 175 #endif 176 177 static __inline boolean_t 178 pmap_type_guest(pmap_t pmap) 179 { 180 181 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 182 } 183 184 static 
__inline boolean_t 185 pmap_emulate_ad_bits(pmap_t pmap) 186 { 187 188 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 189 } 190 191 static __inline pt_entry_t 192 pmap_valid_bit(pmap_t pmap) 193 { 194 pt_entry_t mask; 195 196 switch (pmap->pm_type) { 197 case PT_X86: 198 case PT_RVI: 199 mask = X86_PG_V; 200 break; 201 case PT_EPT: 202 if (pmap_emulate_ad_bits(pmap)) 203 mask = EPT_PG_EMUL_V; 204 else 205 mask = EPT_PG_READ; 206 break; 207 default: 208 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 209 } 210 211 return (mask); 212 } 213 214 static __inline pt_entry_t 215 pmap_rw_bit(pmap_t pmap) 216 { 217 pt_entry_t mask; 218 219 switch (pmap->pm_type) { 220 case PT_X86: 221 case PT_RVI: 222 mask = X86_PG_RW; 223 break; 224 case PT_EPT: 225 if (pmap_emulate_ad_bits(pmap)) 226 mask = EPT_PG_EMUL_RW; 227 else 228 mask = EPT_PG_WRITE; 229 break; 230 default: 231 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 232 } 233 234 return (mask); 235 } 236 237 static pt_entry_t pg_g; 238 239 static __inline pt_entry_t 240 pmap_global_bit(pmap_t pmap) 241 { 242 pt_entry_t mask; 243 244 switch (pmap->pm_type) { 245 case PT_X86: 246 mask = pg_g; 247 break; 248 case PT_RVI: 249 case PT_EPT: 250 mask = 0; 251 break; 252 default: 253 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 254 } 255 256 return (mask); 257 } 258 259 static __inline pt_entry_t 260 pmap_accessed_bit(pmap_t pmap) 261 { 262 pt_entry_t mask; 263 264 switch (pmap->pm_type) { 265 case PT_X86: 266 case PT_RVI: 267 mask = X86_PG_A; 268 break; 269 case PT_EPT: 270 if (pmap_emulate_ad_bits(pmap)) 271 mask = EPT_PG_READ; 272 else 273 mask = EPT_PG_A; 274 break; 275 default: 276 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 277 } 278 279 return (mask); 280 } 281 282 static __inline pt_entry_t 283 pmap_modified_bit(pmap_t pmap) 284 { 285 pt_entry_t mask; 286 287 switch (pmap->pm_type) { 288 case PT_X86: 289 case PT_RVI: 290 mask = X86_PG_M; 291 break; 292 case PT_EPT: 293 if (pmap_emulate_ad_bits(pmap)) 294 mask = EPT_PG_WRITE; 295 else 296 mask = EPT_PG_M; 297 break; 298 default: 299 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 300 } 301 302 return (mask); 303 } 304 305 static __inline pt_entry_t 306 pmap_pku_mask_bit(pmap_t pmap) 307 { 308 309 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); 310 } 311 312 static __inline boolean_t 313 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 314 { 315 316 if (!pmap_emulate_ad_bits(pmap)) 317 return (TRUE); 318 319 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 320 321 /* 322 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 323 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 324 * if the EPT_PG_WRITE bit is set. 325 */ 326 if ((pte & EPT_PG_WRITE) != 0) 327 return (FALSE); 328 329 /* 330 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 
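* That is, the referenced bit is cleared only when the resulting XWR encoding would be 000 (no access at all) or, on pmaps that support exec-only mappings, 100.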
331 */ 332 if ((pte & EPT_PG_EXECUTE) == 0 || 333 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 334 return (TRUE); 335 else 336 return (FALSE); 337 } 338 339 #ifdef PV_STATS 340 #define PV_STAT(x) do { x ; } while (0) 341 #else 342 #define PV_STAT(x) do { } while (0) 343 #endif 344 345 #undef pa_index 346 #ifdef NUMA 347 #define pa_index(pa) ({ \ 348 KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ 349 ("address %lx beyond the last segment", (pa))); \ 350 (pa) >> PDRSHIFT; \ 351 }) 352 #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) 353 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 354 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 355 struct rwlock *_lock; \ 356 if (__predict_false((pa) > pmap_last_pa)) \ 357 _lock = &pv_dummy_large.pv_lock; \ 358 else \ 359 _lock = &(pa_to_pmdp(pa)->pv_lock); \ 360 _lock; \ 361 }) 362 #else 363 #define pa_index(pa) ((pa) >> PDRSHIFT) 364 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 365 366 #define NPV_LIST_LOCKS MAXCPU 367 368 #define PHYS_TO_PV_LIST_LOCK(pa) \ 369 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 370 #endif 371 372 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 373 struct rwlock **_lockp = (lockp); \ 374 struct rwlock *_new_lock; \ 375 \ 376 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 377 if (_new_lock != *_lockp) { \ 378 if (*_lockp != NULL) \ 379 rw_wunlock(*_lockp); \ 380 *_lockp = _new_lock; \ 381 rw_wlock(*_lockp); \ 382 } \ 383 } while (0) 384 385 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 386 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 387 388 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 389 struct rwlock **_lockp = (lockp); \ 390 \ 391 if (*_lockp != NULL) { \ 392 rw_wunlock(*_lockp); \ 393 *_lockp = NULL; \ 394 } \ 395 } while (0) 396 397 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 398 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 399 400 /* 401 * Statically allocate kernel pmap memory. However, memory for 402 * pm_pcids is obtained after the dynamic allocator is operational. 403 * Initialize it with a non-canonical pointer to catch early accesses 404 * regardless of the active mapping. 
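* Since the pointer is non-canonical, any dereference of it faults immediately on amd64, regardless of which page tables are active at the time.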
405 */ 406 struct pmap kernel_pmap_store = { 407 .pm_pcidp = (void *)0xdeadbeefdeadbeef, 408 }; 409 410 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 411 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 412 413 int nkpt; 414 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 415 "Number of kernel page table pages allocated on bootup"); 416 417 static int ndmpdp; 418 vm_paddr_t dmaplimit; 419 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 420 pt_entry_t pg_nx; 421 422 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 423 "VM/pmap parameters"); 424 425 static int __read_frequently pg_ps_enabled = 1; 426 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 427 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 428 429 int __read_frequently la57 = 0; 430 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 431 &la57, 0, 432 "5-level paging for host is enabled"); 433 434 static bool 435 pmap_is_la57(pmap_t pmap) 436 { 437 if (pmap->pm_type == PT_X86) 438 return (la57); 439 return (false); /* XXXKIB handle EPT */ 440 } 441 442 #define PAT_INDEX_SIZE 8 443 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 444 445 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 446 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 447 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 448 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 449 u_int64_t KPML5phys; /* phys addr of kernel level 5, 450 if supported */ 451 452 #ifdef KASAN 453 static uint64_t KASANPDPphys; 454 #endif 455 #ifdef KMSAN 456 static uint64_t KMSANSHADPDPphys; 457 static uint64_t KMSANORIGPDPphys; 458 459 /* 460 * To support systems with large amounts of memory, it is necessary to extend 461 * the maximum size of the direct map. This could eat into the space reserved 462 * for the shadow map. 463 */ 464 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); 465 #endif 466 467 static pml4_entry_t *kernel_pml4; 468 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 469 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 470 static int ndmpdpphys; /* number of DMPDPphys pages */ 471 472 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ 473 vm_paddr_t KERNend; /* and the end */ 474 475 /* 476 * pmap_mapdev support pre initialization (i.e. console) 477 */ 478 #define PMAP_PREINIT_MAPPING_COUNT 8 479 static struct pmap_preinit_mapping { 480 vm_paddr_t pa; 481 vm_offset_t va; 482 vm_size_t sz; 483 int mode; 484 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 485 static int pmap_initialized; 486 487 /* 488 * Data for the pv entry allocation mechanism. 489 * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
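* (Under NUMA the generation counter lives in the per-2M-range pmap_large_md_page; otherwise it is kept in the pv_invl_gen[] array indexed by pa_index() modulo NPV_LIST_LOCKS; see pmap_delayed_invl_genp().)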
490 */ 491 #ifdef NUMA 492 static __inline int 493 pc_to_domain(struct pv_chunk *pc) 494 { 495 496 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 497 } 498 #else 499 static __inline int 500 pc_to_domain(struct pv_chunk *pc __unused) 501 { 502 503 return (0); 504 } 505 #endif 506 507 struct pv_chunks_list { 508 struct mtx pvc_lock; 509 TAILQ_HEAD(pch, pv_chunk) pvc_list; 510 int active_reclaims; 511 } __aligned(CACHE_LINE_SIZE); 512 513 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 514 515 #ifdef NUMA 516 struct pmap_large_md_page { 517 struct rwlock pv_lock; 518 struct md_page pv_page; 519 u_long pv_invl_gen; 520 }; 521 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 522 #define pv_dummy pv_dummy_large.pv_page 523 __read_mostly static struct pmap_large_md_page *pv_table; 524 __read_mostly vm_paddr_t pmap_last_pa; 525 #else 526 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 527 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 528 static struct md_page *pv_table; 529 static struct md_page pv_dummy; 530 #endif 531 532 /* 533 * All those kernel PT submaps that BSD is so fond of 534 */ 535 pt_entry_t *CMAP1 = NULL; 536 caddr_t CADDR1 = 0; 537 static vm_offset_t qframe = 0; 538 static struct mtx qframe_mtx; 539 540 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 541 542 static vmem_t *large_vmem; 543 static u_int lm_ents; 544 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 545 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 546 547 int pmap_pcid_enabled = 1; 548 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 549 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 550 int invpcid_works = 0; 551 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 552 "Is the invpcid instruction available ?"); 553 int pmap_pcid_invlpg_workaround = 0; 554 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, 555 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 556 &pmap_pcid_invlpg_workaround, 0, 557 "Enable small core PCID/INVLPG workaround"); 558 int pmap_pcid_invlpg_workaround_uena = 1; 559 560 int __read_frequently pti = 0; 561 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 562 &pti, 0, 563 "Page Table Isolation enabled"); 564 static vm_object_t pti_obj; 565 static pml4_entry_t *pti_pml4; 566 static vm_pindex_t pti_pg_idx; 567 static bool pti_finalized; 568 569 struct pmap_pkru_range { 570 struct rs_el pkru_rs_el; 571 u_int pkru_keyidx; 572 int pkru_flags; 573 }; 574 575 static uma_zone_t pmap_pkru_ranges_zone; 576 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 577 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 578 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 579 static void *pkru_dup_range(void *ctx, void *data); 580 static void pkru_free_range(void *ctx, void *node); 581 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 582 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 583 static void pmap_pkru_deassign_all(pmap_t pmap); 584 585 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); 586 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, 587 &pcid_save_cnt, "Count of saved TLB context on switch"); 588 589 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 590 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 591 static struct mtx invl_gen_mtx; 592 /* Fake lock object to satisfy 
turnstiles interface. */ 593 static struct lock_object invl_gen_ts = { 594 .lo_name = "invlts", 595 }; 596 static struct pmap_invl_gen pmap_invl_gen_head = { 597 .gen = 1, 598 .next = NULL, 599 }; 600 static u_long pmap_invl_gen = 1; 601 static int pmap_invl_waiters; 602 static struct callout pmap_invl_callout; 603 static bool pmap_invl_callout_inited; 604 605 #define PMAP_ASSERT_NOT_IN_DI() \ 606 KASSERT(pmap_not_in_di(), ("DI already started")) 607 608 static bool 609 pmap_di_locked(void) 610 { 611 int tun; 612 613 if ((cpu_feature2 & CPUID2_CX16) == 0) 614 return (true); 615 tun = 0; 616 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 617 return (tun != 0); 618 } 619 620 static int 621 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 622 { 623 int locked; 624 625 locked = pmap_di_locked(); 626 return (sysctl_handle_int(oidp, &locked, 0, req)); 627 } 628 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 629 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 630 "Locked delayed invalidation"); 631 632 static bool pmap_not_in_di_l(void); 633 static bool pmap_not_in_di_u(void); 634 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 635 { 636 637 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 638 } 639 640 static bool 641 pmap_not_in_di_l(void) 642 { 643 struct pmap_invl_gen *invl_gen; 644 645 invl_gen = &curthread->td_md.md_invl_gen; 646 return (invl_gen->gen == 0); 647 } 648 649 static void 650 pmap_thread_init_invl_gen_l(struct thread *td) 651 { 652 struct pmap_invl_gen *invl_gen; 653 654 invl_gen = &td->td_md.md_invl_gen; 655 invl_gen->gen = 0; 656 } 657 658 static void 659 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 660 { 661 struct turnstile *ts; 662 663 ts = turnstile_trywait(&invl_gen_ts); 664 if (*m_gen > atomic_load_long(invl_gen)) 665 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 666 else 667 turnstile_cancel(ts); 668 } 669 670 static void 671 pmap_delayed_invl_finish_unblock(u_long new_gen) 672 { 673 struct turnstile *ts; 674 675 turnstile_chain_lock(&invl_gen_ts); 676 ts = turnstile_lookup(&invl_gen_ts); 677 if (new_gen != 0) 678 pmap_invl_gen = new_gen; 679 if (ts != NULL) { 680 turnstile_broadcast(ts, TS_SHARED_QUEUE); 681 turnstile_unpend(ts); 682 } 683 turnstile_chain_unlock(&invl_gen_ts); 684 } 685 686 /* 687 * Start a new Delayed Invalidation (DI) block of code, executed by 688 * the current thread. Within a DI block, the current thread may 689 * destroy both the page table and PV list entries for a mapping and 690 * then release the corresponding PV list lock before ensuring that 691 * the mapping is flushed from the TLBs of any processors with the 692 * pmap active. 693 */ 694 static void 695 pmap_delayed_invl_start_l(void) 696 { 697 struct pmap_invl_gen *invl_gen; 698 u_long currgen; 699 700 invl_gen = &curthread->td_md.md_invl_gen; 701 PMAP_ASSERT_NOT_IN_DI(); 702 mtx_lock(&invl_gen_mtx); 703 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 704 currgen = pmap_invl_gen; 705 else 706 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 707 invl_gen->gen = currgen + 1; 708 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 709 mtx_unlock(&invl_gen_mtx); 710 } 711 712 /* 713 * Finish the DI block, previously started by the current thread. All 714 * required TLB flushes for the pages marked by 715 * pmap_delayed_invl_page() must be finished before this function is 716 * called. 
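* As a concrete example of the rule described below: if a DI block with generation 5 is still pending when a block with generation 6 finishes, the global generation may not be advanced to 6, since that would falsely report generation 5's invalidations as complete; instead the pending block inherits generation 6 and advances the global number when it finishes.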
717 * 718 * This function works by bumping the global DI generation number to 719 * the generation number of the current thread's DI, unless there is a 720 * pending DI that started earlier. In the latter case, bumping the 721 * global DI generation number would incorrectly signal that the 722 * earlier DI had finished. Instead, this function bumps the earlier 723 * DI's generation number to match the generation number of the 724 * current thread's DI. 725 */ 726 static void 727 pmap_delayed_invl_finish_l(void) 728 { 729 struct pmap_invl_gen *invl_gen, *next; 730 731 invl_gen = &curthread->td_md.md_invl_gen; 732 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 733 mtx_lock(&invl_gen_mtx); 734 next = LIST_NEXT(invl_gen, link); 735 if (next == NULL) 736 pmap_delayed_invl_finish_unblock(invl_gen->gen); 737 else 738 next->gen = invl_gen->gen; 739 LIST_REMOVE(invl_gen, link); 740 mtx_unlock(&invl_gen_mtx); 741 invl_gen->gen = 0; 742 } 743 744 static bool 745 pmap_not_in_di_u(void) 746 { 747 struct pmap_invl_gen *invl_gen; 748 749 invl_gen = &curthread->td_md.md_invl_gen; 750 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 751 } 752 753 static void 754 pmap_thread_init_invl_gen_u(struct thread *td) 755 { 756 struct pmap_invl_gen *invl_gen; 757 758 invl_gen = &td->td_md.md_invl_gen; 759 invl_gen->gen = 0; 760 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 761 } 762 763 static bool 764 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 765 { 766 uint64_t new_high, new_low, old_high, old_low; 767 char res; 768 769 old_low = new_low = 0; 770 old_high = new_high = (uintptr_t)0; 771 772 __asm volatile("lock;cmpxchg16b\t%1" 773 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 774 : "b"(new_low), "c" (new_high) 775 : "memory", "cc"); 776 if (res == 0) { 777 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 778 return (false); 779 out->gen = old_low; 780 out->next = (void *)old_high; 781 } else { 782 out->gen = new_low; 783 out->next = (void *)new_high; 784 } 785 return (true); 786 } 787 788 static bool 789 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 790 struct pmap_invl_gen *new_val) 791 { 792 uint64_t new_high, new_low, old_high, old_low; 793 char res; 794 795 new_low = new_val->gen; 796 new_high = (uintptr_t)new_val->next; 797 old_low = old_val->gen; 798 old_high = (uintptr_t)old_val->next; 799 800 __asm volatile("lock;cmpxchg16b\t%1" 801 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 802 : "b"(new_low), "c" (new_high) 803 : "memory", "cc"); 804 return (res); 805 } 806 807 static COUNTER_U64_DEFINE_EARLY(pv_page_count); 808 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, 809 &pv_page_count, "Current number of allocated pv pages"); 810 811 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); 812 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, 813 &user_pt_page_count, 814 "Current number of allocated page table pages for userspace"); 815 816 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); 817 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, 818 &kernel_pt_page_count, 819 "Current number of allocated page table pages for the kernel"); 820 821 #ifdef PV_STATS 822 823 static COUNTER_U64_DEFINE_EARLY(invl_start_restart); 824 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, 825 CTLFLAG_RD, &invl_start_restart, 826 "Number of delayed TLB invalidation request restarts"); 827 828 static 
COUNTER_U64_DEFINE_EARLY(invl_finish_restart); 829 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 830 &invl_finish_restart, 831 "Number of delayed TLB invalidation completion restarts"); 832 833 static int invl_max_qlen; 834 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 835 &invl_max_qlen, 0, 836 "Maximum delayed TLB invalidation request queue length"); 837 #endif 838 839 #define di_delay locks_delay 840 841 static void 842 pmap_delayed_invl_start_u(void) 843 { 844 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 845 struct thread *td; 846 struct lock_delay_arg lda; 847 uintptr_t prevl; 848 u_char pri; 849 #ifdef PV_STATS 850 int i, ii; 851 #endif 852 853 td = curthread; 854 invl_gen = &td->td_md.md_invl_gen; 855 PMAP_ASSERT_NOT_IN_DI(); 856 lock_delay_arg_init(&lda, &di_delay); 857 invl_gen->saved_pri = 0; 858 pri = td->td_base_pri; 859 if (pri > PVM) { 860 thread_lock(td); 861 pri = td->td_base_pri; 862 if (pri > PVM) { 863 invl_gen->saved_pri = pri; 864 sched_prio(td, PVM); 865 } 866 thread_unlock(td); 867 } 868 again: 869 PV_STAT(i = 0); 870 for (p = &pmap_invl_gen_head;; p = prev.next) { 871 PV_STAT(i++); 872 prevl = (uintptr_t)atomic_load_ptr(&p->next); 873 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 874 PV_STAT(counter_u64_add(invl_start_restart, 1)); 875 lock_delay(&lda); 876 goto again; 877 } 878 if (prevl == 0) 879 break; 880 prev.next = (void *)prevl; 881 } 882 #ifdef PV_STATS 883 if ((ii = invl_max_qlen) < i) 884 atomic_cmpset_int(&invl_max_qlen, ii, i); 885 #endif 886 887 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 888 PV_STAT(counter_u64_add(invl_start_restart, 1)); 889 lock_delay(&lda); 890 goto again; 891 } 892 893 new_prev.gen = prev.gen; 894 new_prev.next = invl_gen; 895 invl_gen->gen = prev.gen + 1; 896 897 /* Formal fence between the store to invl_gen->gen and updating *p. */ 898 atomic_thread_fence_rel(); 899 900 /* 901 * After inserting an invl_gen element with the invalid bit set, 902 * this thread blocks any other thread trying to enter the 903 * delayed invalidation block. Do not allow us to be removed from 904 * the CPU, because that causes starvation for other threads. 905 */ 906 critical_enter(); 907 908 /* 909 * ABA for *p is not possible here, since p->gen can only 910 * increase. So if the *p thread finished its DI, then 911 * started a new one and got inserted into the list at the 912 * same place, its gen will appear greater than the previously 913 * read gen. 914 */ 915 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 916 critical_exit(); 917 PV_STAT(counter_u64_add(invl_start_restart, 1)); 918 lock_delay(&lda); 919 goto again; 920 } 921 922 /* 923 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in 924 * invl_gen->next, allowing other threads to iterate past us. 925 * pmap_di_store_invl() provides a fence between the generation 926 * write and the update of next. 927 */ 928 invl_gen->next = NULL; 929 critical_exit(); 930 } 931 932 static bool 933 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 934 struct pmap_invl_gen *p) 935 { 936 struct pmap_invl_gen prev, new_prev; 937 u_long mygen; 938 939 /* 940 * Load invl_gen->gen after setting PMAP_INVL_GEN_NEXT_INVALID in 941 * invl_gen->next. This prevents larger 942 * generations from propagating to our invl_gen->gen. The lock prefix 943 * in atomic_set_ptr() works as a seq_cst fence.
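* Without that ordering, a successor finishing concurrently could still raise our generation after we had sampled it, and the larger value would then never be handed on to our predecessor.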
944 */ 945 mygen = atomic_load_long(&invl_gen->gen); 946 947 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 948 return (false); 949 950 KASSERT(prev.gen < mygen, 951 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 952 new_prev.gen = mygen; 953 new_prev.next = (void *)((uintptr_t)invl_gen->next & 954 ~PMAP_INVL_GEN_NEXT_INVALID); 955 956 /* Formal fence between load of prev and storing update to it. */ 957 atomic_thread_fence_rel(); 958 959 return (pmap_di_store_invl(p, &prev, &new_prev)); 960 } 961 962 static void 963 pmap_delayed_invl_finish_u(void) 964 { 965 struct pmap_invl_gen *invl_gen, *p; 966 struct thread *td; 967 struct lock_delay_arg lda; 968 uintptr_t prevl; 969 970 td = curthread; 971 invl_gen = &td->td_md.md_invl_gen; 972 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 973 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 974 ("missed invl_start: INVALID")); 975 lock_delay_arg_init(&lda, &di_delay); 976 977 again: 978 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 979 prevl = (uintptr_t)atomic_load_ptr(&p->next); 980 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 981 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 982 lock_delay(&lda); 983 goto again; 984 } 985 if ((void *)prevl == invl_gen) 986 break; 987 } 988 989 /* 990 * It is legitimate to not find ourself on the list if a 991 * thread before us finished its DI and started it again. 992 */ 993 if (__predict_false(p == NULL)) { 994 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 995 lock_delay(&lda); 996 goto again; 997 } 998 999 critical_enter(); 1000 atomic_set_ptr((uintptr_t *)&invl_gen->next, 1001 PMAP_INVL_GEN_NEXT_INVALID); 1002 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 1003 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 1004 PMAP_INVL_GEN_NEXT_INVALID); 1005 critical_exit(); 1006 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 1007 lock_delay(&lda); 1008 goto again; 1009 } 1010 critical_exit(); 1011 if (atomic_load_int(&pmap_invl_waiters) > 0) 1012 pmap_delayed_invl_finish_unblock(0); 1013 if (invl_gen->saved_pri != 0) { 1014 thread_lock(td); 1015 sched_prio(td, invl_gen->saved_pri); 1016 thread_unlock(td); 1017 } 1018 } 1019 1020 #ifdef DDB 1021 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 1022 { 1023 struct pmap_invl_gen *p, *pn; 1024 struct thread *td; 1025 uintptr_t nextl; 1026 bool first; 1027 1028 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 1029 first = false) { 1030 nextl = (uintptr_t)atomic_load_ptr(&p->next); 1031 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 1032 td = first ? NULL : __containerof(p, struct thread, 1033 td_md.md_invl_gen); 1034 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 1035 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 1036 td != NULL ? 
td->td_tid : -1); 1037 } 1038 } 1039 #endif 1040 1041 #ifdef PV_STATS 1042 static COUNTER_U64_DEFINE_EARLY(invl_wait); 1043 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, 1044 CTLFLAG_RD, &invl_wait, 1045 "Number of times DI invalidation blocked pmap_remove_all/write"); 1046 1047 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); 1048 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, 1049 &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); 1050 1051 #endif 1052 1053 #ifdef NUMA 1054 static u_long * 1055 pmap_delayed_invl_genp(vm_page_t m) 1056 { 1057 vm_paddr_t pa; 1058 u_long *gen; 1059 1060 pa = VM_PAGE_TO_PHYS(m); 1061 if (__predict_false((pa) > pmap_last_pa)) 1062 gen = &pv_dummy_large.pv_invl_gen; 1063 else 1064 gen = &(pa_to_pmdp(pa)->pv_invl_gen); 1065 1066 return (gen); 1067 } 1068 #else 1069 static u_long * 1070 pmap_delayed_invl_genp(vm_page_t m) 1071 { 1072 1073 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 1074 } 1075 #endif 1076 1077 static void 1078 pmap_delayed_invl_callout_func(void *arg __unused) 1079 { 1080 1081 if (atomic_load_int(&pmap_invl_waiters) == 0) 1082 return; 1083 pmap_delayed_invl_finish_unblock(0); 1084 } 1085 1086 static void 1087 pmap_delayed_invl_callout_init(void *arg __unused) 1088 { 1089 1090 if (pmap_di_locked()) 1091 return; 1092 callout_init(&pmap_invl_callout, 1); 1093 pmap_invl_callout_inited = true; 1094 } 1095 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 1096 pmap_delayed_invl_callout_init, NULL); 1097 1098 /* 1099 * Ensure that all currently executing DI blocks, that need to flush 1100 * TLB for the given page m, actually flushed the TLB at the time the 1101 * function returned. If the page m has an empty PV list and we call 1102 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 1103 * valid mapping for the page m in either its page table or TLB. 1104 * 1105 * This function works by blocking until the global DI generation 1106 * number catches up with the generation number associated with the 1107 * given page m and its PV list. Since this function's callers 1108 * typically own an object lock and sometimes own a page lock, it 1109 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 1110 * processor. 1111 */ 1112 static void 1113 pmap_delayed_invl_wait_l(vm_page_t m) 1114 { 1115 u_long *m_gen; 1116 #ifdef PV_STATS 1117 bool accounted = false; 1118 #endif 1119 1120 m_gen = pmap_delayed_invl_genp(m); 1121 while (*m_gen > pmap_invl_gen) { 1122 #ifdef PV_STATS 1123 if (!accounted) { 1124 counter_u64_add(invl_wait, 1); 1125 accounted = true; 1126 } 1127 #endif 1128 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 1129 } 1130 } 1131 1132 static void 1133 pmap_delayed_invl_wait_u(vm_page_t m) 1134 { 1135 u_long *m_gen; 1136 struct lock_delay_arg lda; 1137 bool fast; 1138 1139 fast = true; 1140 m_gen = pmap_delayed_invl_genp(m); 1141 lock_delay_arg_init(&lda, &di_delay); 1142 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 1143 if (fast || !pmap_invl_callout_inited) { 1144 PV_STAT(counter_u64_add(invl_wait, 1)); 1145 lock_delay(&lda); 1146 fast = false; 1147 } else { 1148 /* 1149 * The page's invalidation generation number 1150 * is still below the current thread's number. 1151 * Prepare to block so that we do not waste 1152 * CPU cycles or worse, suffer livelock. 
1153 * 1154 * Since it is impossible to block without 1155 * racing with pmap_delayed_invl_finish_u(), 1156 * prepare for the race by incrementing 1157 * pmap_invl_waiters and arming a 1-tick 1158 * callout which will unblock us if we lose 1159 * the race. 1160 */ 1161 atomic_add_int(&pmap_invl_waiters, 1); 1162 1163 /* 1164 * Re-check the current thread's invalidation 1165 * generation after incrementing 1166 * pmap_invl_waiters, so that there is no race 1167 * with pmap_delayed_invl_finish_u() setting 1168 * the page generation and checking 1169 * pmap_invl_waiters. The only race allowed 1170 * is for a missed unblock, which is handled 1171 * by the callout. 1172 */ 1173 if (*m_gen > 1174 atomic_load_long(&pmap_invl_gen_head.gen)) { 1175 callout_reset(&pmap_invl_callout, 1, 1176 pmap_delayed_invl_callout_func, NULL); 1177 PV_STAT(counter_u64_add(invl_wait_slow, 1)); 1178 pmap_delayed_invl_wait_block(m_gen, 1179 &pmap_invl_gen_head.gen); 1180 } 1181 atomic_add_int(&pmap_invl_waiters, -1); 1182 } 1183 } 1184 } 1185 1186 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1187 { 1188 1189 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1190 pmap_thread_init_invl_gen_u); 1191 } 1192 1193 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1194 { 1195 1196 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1197 pmap_delayed_invl_start_u); 1198 } 1199 1200 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1201 { 1202 1203 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1204 pmap_delayed_invl_finish_u); 1205 } 1206 1207 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1208 { 1209 1210 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1211 pmap_delayed_invl_wait_u); 1212 } 1213 1214 /* 1215 * Mark the page m's PV list as participating in the current thread's 1216 * DI block. Any threads concurrently using m's PV list to remove or 1217 * restrict all mappings to m will wait for the current thread's DI 1218 * block to complete before proceeding. 1219 * 1220 * The function works by setting the DI generation number for m's PV 1221 * list to at least the DI generation number of the current thread. 1222 * This forces a caller of pmap_delayed_invl_wait() to block until 1223 * current thread calls pmap_delayed_invl_finish(). 1224 */ 1225 static void 1226 pmap_delayed_invl_page(vm_page_t m) 1227 { 1228 u_long gen, *m_gen; 1229 1230 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1231 gen = curthread->td_md.md_invl_gen.gen; 1232 if (gen == 0) 1233 return; 1234 m_gen = pmap_delayed_invl_genp(m); 1235 if (*m_gen < gen) 1236 *m_gen = gen; 1237 } 1238 1239 /* 1240 * Crashdump maps. 1241 */ 1242 static caddr_t crashdumpmap; 1243 1244 /* 1245 * Internal flags for pmap_enter()'s helper functions. 1246 */ 1247 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1248 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1249 1250 /* 1251 * Internal flags for pmap_mapdev_internal() and 1252 * pmap_change_props_locked(). 1253 */ 1254 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ 1255 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ 1256 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. 
*/ 1257 1258 TAILQ_HEAD(pv_chunklist, pv_chunk); 1259 1260 static void free_pv_chunk(struct pv_chunk *pc); 1261 static void free_pv_chunk_batch(struct pv_chunklist *batch); 1262 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1263 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1264 static int popcnt_pc_map_pq(uint64_t *map); 1265 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1266 static void reserve_pv_entries(pmap_t pmap, int needed, 1267 struct rwlock **lockp); 1268 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1269 struct rwlock **lockp); 1270 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1271 u_int flags, struct rwlock **lockp); 1272 #if VM_NRESERVLEVEL > 0 1273 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1274 struct rwlock **lockp); 1275 #endif 1276 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1277 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1278 vm_offset_t va); 1279 1280 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 1281 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 1282 vm_prot_t prot, int mode, int flags); 1283 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1284 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1285 vm_offset_t va, struct rwlock **lockp); 1286 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1287 vm_offset_t va); 1288 static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1289 vm_prot_t prot, struct rwlock **lockp); 1290 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1291 u_int flags, vm_page_t m, struct rwlock **lockp); 1292 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1293 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1294 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1295 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1296 bool allpte_PG_A_set); 1297 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1298 vm_offset_t eva); 1299 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1300 vm_offset_t eva); 1301 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1302 pd_entry_t pde); 1303 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1304 static vm_page_t pmap_large_map_getptp_unlocked(void); 1305 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1306 #if VM_NRESERVLEVEL > 0 1307 static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1308 vm_page_t mpte, struct rwlock **lockp); 1309 #endif 1310 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1311 vm_prot_t prot); 1312 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); 1313 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1314 bool exec); 1315 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1316 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1317 static void pmap_pti_wire_pte(void *pte); 1318 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1319 struct spglist *free, struct rwlock **lockp); 1320 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1321 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1322 static vm_page_t 
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 1323 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1324 struct spglist *free); 1325 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1326 pd_entry_t *pde, struct spglist *free, 1327 struct rwlock **lockp); 1328 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1329 vm_page_t m, struct rwlock **lockp); 1330 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1331 pd_entry_t newpde); 1332 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1333 1334 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 1335 struct rwlock **lockp); 1336 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, 1337 struct rwlock **lockp, vm_offset_t va); 1338 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, 1339 struct rwlock **lockp, vm_offset_t va); 1340 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1341 struct rwlock **lockp); 1342 1343 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1344 struct spglist *free); 1345 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1346 1347 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); 1348 static void pmap_free_pt_page(pmap_t, vm_page_t, bool); 1349 1350 /********************/ 1351 /* Inline functions */ 1352 /********************/ 1353 1354 /* 1355 * Return a non-clipped indexes for a given VA, which are page table 1356 * pages indexes at the corresponding level. 1357 */ 1358 static __inline vm_pindex_t 1359 pmap_pde_pindex(vm_offset_t va) 1360 { 1361 return (va >> PDRSHIFT); 1362 } 1363 1364 static __inline vm_pindex_t 1365 pmap_pdpe_pindex(vm_offset_t va) 1366 { 1367 return (NUPDE + (va >> PDPSHIFT)); 1368 } 1369 1370 static __inline vm_pindex_t 1371 pmap_pml4e_pindex(vm_offset_t va) 1372 { 1373 return (NUPDE + NUPDPE + (va >> PML4SHIFT)); 1374 } 1375 1376 static __inline vm_pindex_t 1377 pmap_pml5e_pindex(vm_offset_t va) 1378 { 1379 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); 1380 } 1381 1382 static __inline pml4_entry_t * 1383 pmap_pml5e(pmap_t pmap, vm_offset_t va) 1384 { 1385 1386 MPASS(pmap_is_la57(pmap)); 1387 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); 1388 } 1389 1390 static __inline pml4_entry_t * 1391 pmap_pml5e_u(pmap_t pmap, vm_offset_t va) 1392 { 1393 1394 MPASS(pmap_is_la57(pmap)); 1395 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); 1396 } 1397 1398 static __inline pml4_entry_t * 1399 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) 1400 { 1401 pml4_entry_t *pml4e; 1402 1403 /* XXX MPASS(pmap_is_la57(pmap); */ 1404 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1405 return (&pml4e[pmap_pml4e_index(va)]); 1406 } 1407 1408 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1409 static __inline pml4_entry_t * 1410 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1411 { 1412 pml5_entry_t *pml5e; 1413 pml4_entry_t *pml4e; 1414 pt_entry_t PG_V; 1415 1416 if (pmap_is_la57(pmap)) { 1417 pml5e = pmap_pml5e(pmap, va); 1418 PG_V = pmap_valid_bit(pmap); 1419 if ((*pml5e & PG_V) == 0) 1420 return (NULL); 1421 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1422 } else { 1423 pml4e = pmap->pm_pmltop; 1424 } 1425 return (&pml4e[pmap_pml4e_index(va)]); 1426 } 1427 1428 static __inline pml4_entry_t * 1429 pmap_pml4e_u(pmap_t pmap, vm_offset_t va) 1430 { 1431 MPASS(!pmap_is_la57(pmap)); 1432 return 
(&pmap->pm_pmltopu[pmap_pml4e_index(va)]); 1433 } 1434 1435 /* Return a pointer to the PDP slot that corresponds to a VA */ 1436 static __inline pdp_entry_t * 1437 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1438 { 1439 pdp_entry_t *pdpe; 1440 1441 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1442 return (&pdpe[pmap_pdpe_index(va)]); 1443 } 1444 1445 /* Return a pointer to the PDP slot that corresponds to a VA */ 1446 static __inline pdp_entry_t * 1447 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1448 { 1449 pml4_entry_t *pml4e; 1450 pt_entry_t PG_V; 1451 1452 PG_V = pmap_valid_bit(pmap); 1453 pml4e = pmap_pml4e(pmap, va); 1454 if (pml4e == NULL || (*pml4e & PG_V) == 0) 1455 return (NULL); 1456 return (pmap_pml4e_to_pdpe(pml4e, va)); 1457 } 1458 1459 /* Return a pointer to the PD slot that corresponds to a VA */ 1460 static __inline pd_entry_t * 1461 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1462 { 1463 pd_entry_t *pde; 1464 1465 KASSERT((*pdpe & PG_PS) == 0, 1466 ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); 1467 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1468 return (&pde[pmap_pde_index(va)]); 1469 } 1470 1471 /* Return a pointer to the PD slot that corresponds to a VA */ 1472 static __inline pd_entry_t * 1473 pmap_pde(pmap_t pmap, vm_offset_t va) 1474 { 1475 pdp_entry_t *pdpe; 1476 pt_entry_t PG_V; 1477 1478 PG_V = pmap_valid_bit(pmap); 1479 pdpe = pmap_pdpe(pmap, va); 1480 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1481 return (NULL); 1482 KASSERT((*pdpe & PG_PS) == 0, 1483 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); 1484 return (pmap_pdpe_to_pde(pdpe, va)); 1485 } 1486 1487 /* Return a pointer to the PT slot that corresponds to a VA */ 1488 static __inline pt_entry_t * 1489 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1490 { 1491 pt_entry_t *pte; 1492 1493 KASSERT((*pde & PG_PS) == 0, 1494 ("%s: pde %#lx is a leaf", __func__, *pde)); 1495 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1496 return (&pte[pmap_pte_index(va)]); 1497 } 1498 1499 /* Return a pointer to the PT slot that corresponds to a VA */ 1500 static __inline pt_entry_t * 1501 pmap_pte(pmap_t pmap, vm_offset_t va) 1502 { 1503 pd_entry_t *pde; 1504 pt_entry_t PG_V; 1505 1506 PG_V = pmap_valid_bit(pmap); 1507 pde = pmap_pde(pmap, va); 1508 if (pde == NULL || (*pde & PG_V) == 0) 1509 return (NULL); 1510 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1511 return ((pt_entry_t *)pde); 1512 return (pmap_pde_to_pte(pde, va)); 1513 } 1514 1515 static __inline void 1516 pmap_resident_count_adj(pmap_t pmap, int count) 1517 { 1518 1519 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1520 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1521 ("pmap %p resident count underflow %ld %d", pmap, 1522 pmap->pm_stats.resident_count, count)); 1523 pmap->pm_stats.resident_count += count; 1524 } 1525 1526 static __inline void 1527 pmap_pt_page_count_pinit(pmap_t pmap, int count) 1528 { 1529 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1530 ("pmap %p resident count underflow %ld %d", pmap, 1531 pmap->pm_stats.resident_count, count)); 1532 pmap->pm_stats.resident_count += count; 1533 } 1534 1535 static __inline void 1536 pmap_pt_page_count_adj(pmap_t pmap, int count) 1537 { 1538 if (pmap == kernel_pmap) 1539 counter_u64_add(kernel_pt_page_count, count); 1540 else { 1541 if (pmap != NULL) 1542 pmap_resident_count_adj(pmap, count); 1543 counter_u64_add(user_pt_page_count, count); 1544 } 1545 } 1546 1547 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 1548 
NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3; 1549 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap; 1550 1551 pt_entry_t * 1552 vtopte(vm_offset_t va) 1553 { 1554 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 1555 1556 return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem))); 1557 } 1558 1559 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1560 NPML4EPGSHIFT)) - 1) << 3; 1561 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap; 1562 1563 static __inline pd_entry_t * 1564 vtopde(vm_offset_t va) 1565 { 1566 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1567 1568 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem))); 1569 } 1570 1571 static u_int64_t 1572 allocpages(vm_paddr_t *firstaddr, int n) 1573 { 1574 u_int64_t ret; 1575 1576 ret = *firstaddr; 1577 bzero((void *)ret, n * PAGE_SIZE); 1578 *firstaddr += n * PAGE_SIZE; 1579 return (ret); 1580 } 1581 1582 CTASSERT(powerof2(NDMPML4E)); 1583 1584 /* number of kernel PDP slots */ 1585 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1586 1587 static void 1588 nkpt_init(vm_paddr_t addr) 1589 { 1590 int pt_pages; 1591 1592 #ifdef NKPT 1593 pt_pages = NKPT; 1594 #else 1595 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1596 pt_pages += NKPDPE(pt_pages); 1597 1598 /* 1599 * Add some slop beyond the bare minimum required for bootstrapping 1600 * the kernel. 1601 * 1602 * This is quite important when allocating KVA for kernel modules. 1603 * The modules are required to be linked in the negative 2GB of 1604 * the address space. If we run out of KVA in this region then 1605 * pmap_growkernel() will need to allocate page table pages to map 1606 * the entire 512GB of KVA space which is an unnecessary tax on 1607 * physical memory. 1608 * 1609 * Secondly, device memory mapped as part of setting up the low- 1610 * level console(s) is taken from KVA, starting at virtual_avail. 1611 * This is because cninit() is called after pmap_bootstrap() but 1612 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1613 * not uncommon. 1614 */ 1615 pt_pages += 32; /* 64MB additional slop. */ 1616 #endif 1617 nkpt = pt_pages; 1618 } 1619 1620 /* 1621 * Returns the proper write/execute permission for a physical page that is 1622 * part of the initial boot allocations. 1623 * 1624 * If the page has kernel text, it is marked as read-only. If the page has 1625 * kernel read-only data, it is marked as read-only/not-executable. If the 1626 * page has only read-write data, it is marked as read-write/not-executable. 1627 * If the page is below/above the kernel range, it is marked as read-write. 1628 * 1629 * This function operates on 2M pages, since we map the kernel space that 1630 * way. 1631 */ 1632 static inline pt_entry_t 1633 bootaddr_rwx(vm_paddr_t pa) 1634 { 1635 /* 1636 * The kernel is loaded at a 2MB-aligned address, and memory below that 1637 * need not be executable. The .bss section is padded to a 2MB 1638 * boundary, so memory following the kernel need not be executable 1639 * either. Preloaded kernel modules have their mapping permissions 1640 * fixed up by the linker. 1641 */ 1642 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1643 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1644 return (X86_PG_RW | pg_nx); 1645 1646 /* 1647 * The linker should ensure that the read-only and read-write 1648 * portions don't share the same 2M page, so this shouldn't 1649 * impact read-only data. 
However, in any case, any page with 1650 * read-write data needs to be read-write. 1651 */ 1652 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1653 return (X86_PG_RW | pg_nx); 1654 1655 /* 1656 * Mark any 2M page containing kernel text as read-only. Mark 1657 * other pages with read-only data as read-only and not executable. 1658 * (It is likely a small portion of the read-only data section will 1659 * be marked as read-only, but executable. This should be acceptable 1660 * since the read-only protection will keep the data from changing.) 1661 * Note that fixups to the .text section will still work until we 1662 * set CR0.WP. 1663 */ 1664 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1665 return (0); 1666 return (pg_nx); 1667 } 1668 1669 static void 1670 create_pagetables(vm_paddr_t *firstaddr) 1671 { 1672 pd_entry_t *pd_p; 1673 pdp_entry_t *pdp_p; 1674 pml4_entry_t *p4_p; 1675 uint64_t DMPDkernphys; 1676 vm_paddr_t pax; 1677 #ifdef KASAN 1678 pt_entry_t *pt_p; 1679 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1680 vm_offset_t kasankernbase; 1681 int kasankpdpi, kasankpdi, nkasanpte; 1682 #endif 1683 int i, j, ndm1g, nkpdpe, nkdmpde; 1684 1685 TSENTER(); 1686 /* Allocate page table pages for the direct map */ 1687 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1688 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1689 ndmpdp = 4; 1690 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1691 if (ndmpdpphys > NDMPML4E) { 1692 /* 1693 * Each NDMPML4E allows 512 GB, so limit to that, 1694 * and then readjust ndmpdp and ndmpdpphys. 1695 */ 1696 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1697 Maxmem = atop(NDMPML4E * NBPML4); 1698 ndmpdpphys = NDMPML4E; 1699 ndmpdp = NDMPML4E * NPDEPG; 1700 } 1701 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1702 ndm1g = 0; 1703 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1704 /* 1705 * Calculate the number of 1G pages that will fully fit in 1706 * Maxmem. 1707 */ 1708 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1709 1710 /* 1711 * Allocate 2M pages for the kernel. These will be used in 1712 * place of the one or more 1G pages from ndm1g that maps 1713 * kernel memory into DMAP. 1714 */ 1715 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1716 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1717 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1718 } 1719 if (ndm1g < ndmpdp) 1720 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1721 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1722 1723 /* Allocate pages. */ 1724 KPML4phys = allocpages(firstaddr, 1); 1725 KPDPphys = allocpages(firstaddr, NKPML4E); 1726 #ifdef KASAN 1727 KASANPDPphys = allocpages(firstaddr, NKASANPML4E); 1728 KASANPDphys = allocpages(firstaddr, 1); 1729 #endif 1730 #ifdef KMSAN 1731 /* 1732 * The KMSAN shadow maps are initially left unpopulated, since there is 1733 * no need to shadow memory above KERNBASE. 1734 */ 1735 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); 1736 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); 1737 #endif 1738 1739 /* 1740 * Allocate the initial number of kernel page table pages required to 1741 * bootstrap. We defer this until after all memory-size dependent 1742 * allocations are done (e.g. direct map), so that we don't have to 1743 * build in too much slop in our estimate. 1744 * 1745 * Note that when NKPML4E > 1, we have an empty page underneath 1746 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1747 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
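* Each PD page allocated below holds NPDEPG (512) entries and thus maps up to 512 PT pages, i.e. 1GB of KVA, which is why nkpdpe = NKPDPE(nkpt) = howmany(nkpt, NPDEPG).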
1748 */ 1749 nkpt_init(*firstaddr); 1750 nkpdpe = NKPDPE(nkpt); 1751 1752 KPTphys = allocpages(firstaddr, nkpt); 1753 KPDphys = allocpages(firstaddr, nkpdpe); 1754 1755 #ifdef KASAN 1756 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); 1757 KASANPTphys = allocpages(firstaddr, nkasanpte); 1758 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); 1759 #endif 1760 1761 /* 1762 * Connect the zero-filled PT pages to their PD entries. This 1763 * implicitly maps the PT pages at their correct locations within 1764 * the PTmap. 1765 */ 1766 pd_p = (pd_entry_t *)KPDphys; 1767 for (i = 0; i < nkpt; i++) 1768 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1769 1770 /* 1771 * Map from start of the kernel in physical memory (staging 1772 * area) to the end of loader preallocated memory using 2MB 1773 * pages. This replaces some of the PD entries created above. 1774 * For compatibility, identity map 2M at the start. 1775 */ 1776 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | 1777 X86_PG_RW | pg_nx; 1778 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { 1779 /* Preset PG_M and PG_A because demotion expects it. */ 1780 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1781 X86_PG_A | bootaddr_rwx(pax); 1782 } 1783 1784 /* 1785 * Because we map the physical blocks in 2M pages, adjust firstaddr 1786 * to record the physical blocks we've actually mapped into kernel 1787 * virtual address space. 1788 */ 1789 if (*firstaddr < round_2mpage(KERNend)) 1790 *firstaddr = round_2mpage(KERNend); 1791 1792 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1793 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1794 for (i = 0; i < nkpdpe; i++) 1795 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1796 1797 #ifdef KASAN 1798 kasankernbase = kasan_md_addr_to_shad(KERNBASE); 1799 kasankpdpi = pmap_pdpe_index(kasankernbase); 1800 kasankpdi = pmap_pde_index(kasankernbase); 1801 1802 pdp_p = (pdp_entry_t *)KASANPDPphys; 1803 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); 1804 1805 pd_p = (pd_entry_t *)KASANPDphys; 1806 for (i = 0; i < nkasanpte; i++) 1807 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | 1808 X86_PG_V | pg_nx; 1809 1810 pt_p = (pt_entry_t *)KASANPTphys; 1811 for (i = 0; i < nkasanpte * NPTEPG; i++) 1812 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 1813 X86_PG_M | X86_PG_A | pg_nx; 1814 #endif 1815 1816 /* 1817 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1818 * the end of physical memory is not aligned to a 1GB page boundary, 1819 * then the residual physical memory is mapped with 2MB pages. Later, 1820 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1821 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1822 * that are partially used. 1823 */ 1824 pd_p = (pd_entry_t *)DMPDphys; 1825 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1826 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1827 /* Preset PG_M and PG_A because demotion expects it. */ 1828 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1829 X86_PG_M | X86_PG_A | pg_nx; 1830 } 1831 pdp_p = (pdp_entry_t *)DMPDPphys; 1832 for (i = 0; i < ndm1g; i++) { 1833 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1834 /* Preset PG_M and PG_A because demotion expects it. 
*/ 1835 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1836 X86_PG_M | X86_PG_A | pg_nx; 1837 } 1838 for (j = 0; i < ndmpdp; i++, j++) { 1839 pdp_p[i] = DMPDphys + ptoa(j); 1840 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1841 } 1842 1843 /* 1844 * Instead of using a 1G page for the memory containing the kernel, 1845 * use 2M pages with read-only and no-execute permissions. (If using 1G 1846 * pages, this will partially overwrite the PDPEs above.) 1847 */ 1848 if (ndm1g > 0) { 1849 pd_p = (pd_entry_t *)DMPDkernphys; 1850 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1851 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1852 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1853 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1854 } 1855 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1856 for (i = 0; i < nkdmpde; i++) { 1857 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1858 X86_PG_RW | X86_PG_V | pg_nx; 1859 } 1860 } 1861 1862 /* And recursively map PML4 to itself in order to get PTmap */ 1863 p4_p = (pml4_entry_t *)KPML4phys; 1864 p4_p[PML4PML4I] = KPML4phys; 1865 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1866 1867 #ifdef KASAN 1868 /* Connect the KASAN shadow map slots up to the PML4. */ 1869 for (i = 0; i < NKASANPML4E; i++) { 1870 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1871 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1872 } 1873 #endif 1874 1875 #ifdef KMSAN 1876 /* Connect the KMSAN shadow map slots up to the PML4. */ 1877 for (i = 0; i < NKMSANSHADPML4E; i++) { 1878 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1879 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1880 } 1881 1882 /* Connect the KMSAN origin map slots up to the PML4. */ 1883 for (i = 0; i < NKMSANORIGPML4E; i++) { 1884 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1885 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1886 } 1887 #endif 1888 1889 /* Connect the Direct Map slots up to the PML4. */ 1890 for (i = 0; i < ndmpdpphys; i++) { 1891 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1892 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1893 } 1894 1895 /* Connect the KVA slots up to the PML4 */ 1896 for (i = 0; i < NKPML4E; i++) { 1897 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1898 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1899 } 1900 1901 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1902 TSEXIT(); 1903 } 1904 1905 /* 1906 * Bootstrap the system enough to run with virtual memory. 1907 * 1908 * On amd64 this is called after mapping has already been enabled 1909 * and just syncs the pmap module with what has already been done. 1910 * [We can't call it easily with mapping off since the kernel is not 1911 * mapped with PA == VA, hence we would have to relocate every address 1912 * from the linked base (virtual) address "KERNBASE" to the actual 1913 * (physical) address starting relative to 0] 1914 */ 1915 void 1916 pmap_bootstrap(vm_paddr_t *firstaddr) 1917 { 1918 vm_offset_t va; 1919 pt_entry_t *pte, *pcpu_pte; 1920 struct region_descriptor r_gdt; 1921 uint64_t cr4, pcpu0_phys; 1922 u_long res; 1923 int i; 1924 1925 TSENTER(); 1926 KERNend = *firstaddr; 1927 res = atop(KERNend - (vm_paddr_t)kernphys); 1928 1929 if (!pti) 1930 pg_g = X86_PG_G; 1931 1932 /* 1933 * Create an initial set of page tables to run the kernel in. 
1934 */ 1935 create_pagetables(firstaddr); 1936 1937 pcpu0_phys = allocpages(firstaddr, 1); 1938 1939 /* 1940 * Add a physical memory segment (vm_phys_seg) corresponding to the 1941 * preallocated kernel page table pages so that vm_page structures 1942 * representing these pages will be created. The vm_page structures 1943 * are required for promotion of the corresponding kernel virtual 1944 * addresses to superpage mappings. 1945 */ 1946 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1947 1948 /* 1949 * Account for the virtual addresses mapped by create_pagetables(). 1950 */ 1951 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1952 (vm_paddr_t)kernphys); 1953 virtual_end = VM_MAX_KERNEL_ADDRESS; 1954 1955 /* 1956 * Enable PG_G global pages, then switch to the kernel page 1957 * table from the bootstrap page table. After the switch, it 1958 * is possible to enable SMEP and SMAP since PG_U bits are 1959 * correct now. 1960 */ 1961 cr4 = rcr4(); 1962 cr4 |= CR4_PGE; 1963 load_cr4(cr4); 1964 load_cr3(KPML4phys); 1965 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1966 cr4 |= CR4_SMEP; 1967 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1968 cr4 |= CR4_SMAP; 1969 load_cr4(cr4); 1970 1971 /* 1972 * Initialize the kernel pmap (which is statically allocated). 1973 * Count bootstrap data as being resident in case any of this data is 1974 * later unmapped (using pmap_remove()) and freed. 1975 */ 1976 PMAP_LOCK_INIT(kernel_pmap); 1977 kernel_pmap->pm_pmltop = kernel_pml4; 1978 kernel_pmap->pm_cr3 = KPML4phys; 1979 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1980 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1981 kernel_pmap->pm_stats.resident_count = res; 1982 vm_radix_init(&kernel_pmap->pm_root); 1983 kernel_pmap->pm_flags = pmap_flags; 1984 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 1985 rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range, 1986 pkru_free_range, kernel_pmap, M_NOWAIT); 1987 } 1988 1989 /* 1990 * The kernel pmap is always active on all CPUs. Once CPUs are 1991 * enumerated, the mask will be set equal to all_cpus. 1992 */ 1993 CPU_FILL(&kernel_pmap->pm_active); 1994 1995 /* 1996 * Initialize the TLB invalidations generation number lock. 1997 */ 1998 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1999 2000 /* 2001 * Reserve some special page table entries/VA space for temporary 2002 * mapping of pages. 2003 */ 2004 #define SYSMAP(c, p, v, n) \ 2005 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 2006 2007 va = virtual_avail; 2008 pte = vtopte(va); 2009 2010 /* 2011 * Crashdump maps. The first page is reused as CMAP1 for the 2012 * memory test. 2013 */ 2014 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 2015 CADDR1 = crashdumpmap; 2016 2017 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 2018 virtual_avail = va; 2019 2020 /* 2021 * Map the BSP PCPU now, the rest of the PCPUs are mapped by 2022 * amd64_mp_alloc_pcpu()/start_all_aps() when we know the 2023 * number of CPUs and NUMA affinity. 2024 */ 2025 pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx | 2026 X86_PG_M | X86_PG_A; 2027 for (i = 1; i < MAXCPU; i++) 2028 pcpu_pte[i] = 0; 2029 2030 /* 2031 * Re-initialize PCPU area for BSP after switching. 2032 * Make hardware use gdt and common_tss from the new PCPU. 
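 * MSR_GSBASE is written twice below: once up front so that PCPU
 * accessors work while the new per-CPU area is being initialized, and
 * again after lgdt(), presumably because reloading the segment
 * registers discards the GS base that was programmed earlier.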
2033 */ 2034 STAILQ_INIT(&cpuhead); 2035 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2036 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2037 amd64_bsp_pcpu_init1(&__pcpu[0]); 2038 amd64_bsp_ist_init(&__pcpu[0]); 2039 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2040 IOPERM_BITMAP_SIZE; 2041 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2042 sizeof(struct user_segment_descriptor)); 2043 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2044 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2045 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2046 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2047 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2048 lgdt(&r_gdt); 2049 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2050 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2051 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2052 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2053 2054 /* 2055 * Initialize the PAT MSR. 2056 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2057 * side-effect, invalidates stale PG_G TLB entries that might 2058 * have been created in our pre-boot environment. 2059 */ 2060 pmap_init_pat(); 2061 2062 /* Initialize TLB Context Id. */ 2063 if (pmap_pcid_enabled) { 2064 kernel_pmap->pm_pcidp = (void *)(uintptr_t) 2065 offsetof(struct pcpu, pc_kpmap_store); 2066 2067 PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN); 2068 PCPU_SET(kpmap_store.pm_gen, 1); 2069 2070 /* 2071 * PMAP_PCID_KERN + 1 is used for initialization of 2072 * proc0 pmap. The pmap' pcid state might be used by 2073 * EFIRT entry before first context switch, so it 2074 * needs to be valid. 2075 */ 2076 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2077 PCPU_SET(pcid_gen, 1); 2078 2079 /* 2080 * pcpu area for APs is zeroed during AP startup. 2081 * pc_pcid_next and pc_pcid_gen are initialized by AP 2082 * during pcpu setup. 2083 */ 2084 load_cr4(rcr4() | CR4_PCIDE); 2085 } 2086 TSEXIT(); 2087 } 2088 2089 /* 2090 * Setup the PAT MSR. 2091 */ 2092 void 2093 pmap_init_pat(void) 2094 { 2095 uint64_t pat_msr; 2096 u_long cr0, cr4; 2097 int i; 2098 2099 /* Bail if this CPU doesn't implement PAT. */ 2100 if ((cpu_feature & CPUID_PAT) == 0) 2101 panic("no PAT??"); 2102 2103 /* Set default PAT index table. */ 2104 for (i = 0; i < PAT_INDEX_SIZE; i++) 2105 pat_index[i] = -1; 2106 pat_index[PAT_WRITE_BACK] = 0; 2107 pat_index[PAT_WRITE_THROUGH] = 1; 2108 pat_index[PAT_UNCACHEABLE] = 3; 2109 pat_index[PAT_WRITE_COMBINING] = 6; 2110 pat_index[PAT_WRITE_PROTECTED] = 5; 2111 pat_index[PAT_UNCACHED] = 2; 2112 2113 /* 2114 * Initialize default PAT entries. 2115 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2116 * Program 5 and 6 as WP and WC. 2117 * 2118 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2119 * mapping for a 2M page uses a PAT value with the bit 3 set due 2120 * to its overload with PG_PS. 2121 */ 2122 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2123 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2124 PAT_VALUE(2, PAT_UNCACHED) | 2125 PAT_VALUE(3, PAT_UNCACHEABLE) | 2126 PAT_VALUE(4, PAT_WRITE_BACK) | 2127 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2128 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2129 PAT_VALUE(7, PAT_UNCACHEABLE); 2130 2131 /* Disable PGE. */ 2132 cr4 = rcr4(); 2133 load_cr4(cr4 & ~CR4_PGE); 2134 2135 /* Disable caches (CD = 1, NW = 0). */ 2136 cr0 = rcr0(); 2137 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2138 2139 /* Flushes caches and TLBs. */ 2140 wbinvd(); 2141 invltlb(); 2142 2143 /* Update PAT and index table. 
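 * The layout being written, matching pat_msr as built above, is:
 *   index 0=WB  1=WT  2=UC-  3=UC  4=WB  5=WP  6=WC  7=UC
 * and pat_index[] maps each supported memory attribute to one of
 * these slots.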
*/ 2144 wrmsr(MSR_PAT, pat_msr); 2145 2146 /* Flush caches and TLBs again. */ 2147 wbinvd(); 2148 invltlb(); 2149 2150 /* Restore caches and PGE. */ 2151 load_cr0(cr0); 2152 load_cr4(cr4); 2153 } 2154 2155 vm_page_t 2156 pmap_page_alloc_below_4g(bool zeroed) 2157 { 2158 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2159 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2160 } 2161 2162 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2163 la57_trampoline_gdt[], la57_trampoline_end[]; 2164 2165 static void 2166 pmap_bootstrap_la57(void *arg __unused) 2167 { 2168 char *v_code; 2169 pml5_entry_t *v_pml5; 2170 pml4_entry_t *v_pml4; 2171 pdp_entry_t *v_pdp; 2172 pd_entry_t *v_pd; 2173 pt_entry_t *v_pt; 2174 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2175 void (*la57_tramp)(uint64_t pml5); 2176 struct region_descriptor r_gdt; 2177 2178 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2179 return; 2180 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2181 if (!la57) 2182 return; 2183 2184 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2185 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2186 2187 m_code = pmap_page_alloc_below_4g(true); 2188 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2189 m_pml5 = pmap_page_alloc_below_4g(true); 2190 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2191 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2192 m_pml4 = pmap_page_alloc_below_4g(true); 2193 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2194 m_pdp = pmap_page_alloc_below_4g(true); 2195 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2196 m_pd = pmap_page_alloc_below_4g(true); 2197 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2198 m_pt = pmap_page_alloc_below_4g(true); 2199 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2200 2201 /* 2202 * Map m_code 1:1, it appears below 4G in KVA due to physical 2203 * address being below 4G. Since kernel KVA is in upper half, 2204 * the pml4e should be zero and free for temporary use. 2205 */ 2206 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2207 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2208 X86_PG_M; 2209 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2210 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2211 X86_PG_M; 2212 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2213 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2214 X86_PG_M; 2215 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2216 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2217 X86_PG_M; 2218 2219 /* 2220 * Add pml5 entry at top of KVA pointing to existing pml4 table, 2221 * entering all existing kernel mappings into level 5 table. 2222 */ 2223 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2224 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; 2225 2226 /* 2227 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. 2228 */ 2229 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2230 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2231 X86_PG_M; 2232 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2233 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2234 X86_PG_M; 2235 2236 /* 2237 * Copy and call the 48->57 trampoline, hope we return there, alive. 
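 * (Sketch of why the detour through low memory is needed, inferred
 * rather than documented here: CR4.LA57 may only be changed while
 * paging is disabled, so the trampoline has to run from an
 * identity-mapped page below 4G with its own temporary GDT while it
 * turns paging off, flips LA57 and the %cr3 root, and turns paging
 * back on before returning.)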
2238 */ 2239 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2240 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2241 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2242 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2243 invlpg((vm_offset_t)la57_tramp); 2244 la57_tramp(KPML5phys); 2245 2246 /* 2247 * gdt was necessary reset, switch back to our gdt. 2248 */ 2249 lgdt(&r_gdt); 2250 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2251 load_ds(_udatasel); 2252 load_es(_udatasel); 2253 load_fs(_ufssel); 2254 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2255 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2256 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2257 2258 /* 2259 * Now unmap the trampoline, and free the pages. 2260 * Clear pml5 entry used for 1:1 trampoline mapping. 2261 */ 2262 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2263 invlpg((vm_offset_t)v_code); 2264 vm_page_free(m_code); 2265 vm_page_free(m_pdp); 2266 vm_page_free(m_pd); 2267 vm_page_free(m_pt); 2268 2269 /* 2270 * Recursively map PML5 to itself in order to get PTmap and 2271 * PDmap. 2272 */ 2273 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2274 2275 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 2276 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2277 PTmap = (vm_offset_t)P5Tmap; 2278 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 2279 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2280 PDmap = (vm_offset_t)P5Dmap; 2281 2282 kernel_pmap->pm_cr3 = KPML5phys; 2283 kernel_pmap->pm_pmltop = v_pml5; 2284 pmap_pt_page_count_adj(kernel_pmap, 1); 2285 } 2286 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2287 2288 /* 2289 * Initialize a vm_page's machine-dependent fields. 2290 */ 2291 void 2292 pmap_page_init(vm_page_t m) 2293 { 2294 2295 TAILQ_INIT(&m->md.pv_list); 2296 m->md.pat_mode = PAT_WRITE_BACK; 2297 } 2298 2299 static int pmap_allow_2m_x_ept; 2300 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2301 &pmap_allow_2m_x_ept, 0, 2302 "Allow executable superpage mappings in EPT"); 2303 2304 void 2305 pmap_allow_2m_x_ept_recalculate(void) 2306 { 2307 /* 2308 * SKL002, SKL012S. Since the EPT format is only used by 2309 * Intel CPUs, the vendor check is merely a formality. 
2310 */ 2311 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2312 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2313 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2314 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2315 CPUID_TO_MODEL(cpu_id) == 0x27 || 2316 CPUID_TO_MODEL(cpu_id) == 0x35 || 2317 CPUID_TO_MODEL(cpu_id) == 0x36 || 2318 CPUID_TO_MODEL(cpu_id) == 0x37 || 2319 CPUID_TO_MODEL(cpu_id) == 0x86 || 2320 CPUID_TO_MODEL(cpu_id) == 0x1c || 2321 CPUID_TO_MODEL(cpu_id) == 0x4a || 2322 CPUID_TO_MODEL(cpu_id) == 0x4c || 2323 CPUID_TO_MODEL(cpu_id) == 0x4d || 2324 CPUID_TO_MODEL(cpu_id) == 0x5a || 2325 CPUID_TO_MODEL(cpu_id) == 0x5c || 2326 CPUID_TO_MODEL(cpu_id) == 0x5d || 2327 CPUID_TO_MODEL(cpu_id) == 0x5f || 2328 CPUID_TO_MODEL(cpu_id) == 0x6e || 2329 CPUID_TO_MODEL(cpu_id) == 0x7a || 2330 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2331 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2332 pmap_allow_2m_x_ept = 1; 2333 #ifndef BURN_BRIDGES 2334 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2335 #endif 2336 TUNABLE_INT_FETCH("vm.pmap.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2337 } 2338 2339 static bool 2340 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2341 { 2342 2343 return (pmap->pm_type != PT_EPT || !executable || 2344 !pmap_allow_2m_x_ept); 2345 } 2346 2347 #ifdef NUMA 2348 static void 2349 pmap_init_pv_table(void) 2350 { 2351 struct pmap_large_md_page *pvd; 2352 vm_size_t s; 2353 long start, end, highest, pv_npg; 2354 int domain, i, j, pages; 2355 2356 /* 2357 * For correctness we depend on the size being evenly divisible into a 2358 * page. As a tradeoff between performance and total memory use, the 2359 * entry is 64 bytes (aka one cacheline) in size. Not being smaller 2360 * avoids false-sharing, but not being 128 bytes potentially allows for 2361 * avoidable traffic due to adjacent cacheline prefetcher. 2362 * 2363 * Assert the size so that accidental changes fail to compile. 2364 */ 2365 CTASSERT((sizeof(*pvd) == 64)); 2366 2367 /* 2368 * Calculate the size of the array. 2369 */ 2370 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2371 pv_npg = howmany(pmap_last_pa, NBPDR); 2372 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2373 s = round_page(s); 2374 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2375 if (pv_table == NULL) 2376 panic("%s: kva_alloc failed\n", __func__); 2377 2378 /* 2379 * Iterate physical segments to allocate space for respective pages. 
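 * The backing pages for each slice of the table are allocated from the
 * same NUMA domain as the physical segment that those entries
 * describe, so the PV metadata stays local to the memory it tracks.
 * For scale: at one 64-byte entry per 2MB of physical address space, a
 * machine whose highest physical address is 64GB needs 32768 entries,
 * i.e. roughly 2MB of pv_table.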
2380 */ 2381 highest = -1; 2382 s = 0; 2383 for (i = 0; i < vm_phys_nsegs; i++) { 2384 end = vm_phys_segs[i].end / NBPDR; 2385 domain = vm_phys_segs[i].domain; 2386 2387 if (highest >= end) 2388 continue; 2389 2390 start = highest + 1; 2391 pvd = &pv_table[start]; 2392 2393 pages = end - start + 1; 2394 s = round_page(pages * sizeof(*pvd)); 2395 highest = start + (s / sizeof(*pvd)) - 1; 2396 2397 for (j = 0; j < s; j += PAGE_SIZE) { 2398 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2399 if (m == NULL) 2400 panic("failed to allocate PV table page"); 2401 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2402 } 2403 2404 for (j = 0; j < s / sizeof(*pvd); j++) { 2405 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2406 TAILQ_INIT(&pvd->pv_page.pv_list); 2407 pvd->pv_page.pv_gen = 0; 2408 pvd->pv_page.pat_mode = 0; 2409 pvd->pv_invl_gen = 0; 2410 pvd++; 2411 } 2412 } 2413 pvd = &pv_dummy_large; 2414 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2415 TAILQ_INIT(&pvd->pv_page.pv_list); 2416 pvd->pv_page.pv_gen = 0; 2417 pvd->pv_page.pat_mode = 0; 2418 pvd->pv_invl_gen = 0; 2419 } 2420 #else 2421 static void 2422 pmap_init_pv_table(void) 2423 { 2424 vm_size_t s; 2425 long i, pv_npg; 2426 2427 /* 2428 * Initialize the pool of pv list locks. 2429 */ 2430 for (i = 0; i < NPV_LIST_LOCKS; i++) 2431 rw_init(&pv_list_locks[i], "pmap pv list"); 2432 2433 /* 2434 * Calculate the size of the pv head table for superpages. 2435 */ 2436 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2437 2438 /* 2439 * Allocate memory for the pv head table for superpages. 2440 */ 2441 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2442 s = round_page(s); 2443 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 2444 for (i = 0; i < pv_npg; i++) 2445 TAILQ_INIT(&pv_table[i].pv_list); 2446 TAILQ_INIT(&pv_dummy.pv_list); 2447 } 2448 #endif 2449 2450 /* 2451 * Initialize the pmap module. 2452 * Called by vm_init, to initialize any structures that the pmap 2453 * system needs to map virtual memory. 2454 */ 2455 void 2456 pmap_init(void) 2457 { 2458 struct pmap_preinit_mapping *ppim; 2459 vm_page_t m, mpte; 2460 int error, i, ret, skz63; 2461 2462 /* L1TF, reserve page @0 unconditionally */ 2463 vm_page_blacklist_add(0, bootverbose); 2464 2465 /* Detect bare-metal Skylake Server and Skylake-X. */ 2466 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2467 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2468 /* 2469 * Skylake-X errata SKZ63. Processor May Hang When 2470 * Executing Code In an HLE Transaction Region between 2471 * 40000000H and 403FFFFFH. 2472 * 2473 * Mark the pages in the range as preallocated. It 2474 * seems to be impossible to distinguish between 2475 * Skylake Server and Skylake X. 2476 */ 2477 skz63 = 1; 2478 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2479 if (skz63 != 0) { 2480 if (bootverbose) 2481 printf("SKZ63: skipping 4M RAM starting " 2482 "at physical 1G\n"); 2483 for (i = 0; i < atop(0x400000); i++) { 2484 ret = vm_page_blacklist_add(0x40000000 + 2485 ptoa(i), FALSE); 2486 if (!ret && bootverbose) 2487 printf("page at %#lx already used\n", 2488 0x40000000 + ptoa(i)); 2489 } 2490 } 2491 } 2492 2493 /* IFU */ 2494 pmap_allow_2m_x_ept_recalculate(); 2495 2496 /* 2497 * Initialize the vm page array entries for the kernel pmap's 2498 * page table pages. 
2499 */ 2500 PMAP_LOCK(kernel_pmap); 2501 for (i = 0; i < nkpt; i++) { 2502 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2503 KASSERT(mpte >= vm_page_array && 2504 mpte < &vm_page_array[vm_page_array_size], 2505 ("pmap_init: page table page is out of range")); 2506 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2507 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2508 mpte->ref_count = 1; 2509 2510 /* 2511 * Collect the page table pages that were replaced by a 2MB 2512 * page in create_pagetables(). They are zero filled. 2513 */ 2514 if ((i == 0 || 2515 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2516 pmap_insert_pt_page(kernel_pmap, mpte, false, false)) 2517 panic("pmap_init: pmap_insert_pt_page failed"); 2518 } 2519 PMAP_UNLOCK(kernel_pmap); 2520 vm_wire_add(nkpt); 2521 2522 /* 2523 * If the kernel is running on a virtual machine, then it must assume 2524 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2525 * be prepared for the hypervisor changing the vendor and family that 2526 * are reported by CPUID. Consequently, the workaround for AMD Family 2527 * 10h Erratum 383 is enabled if the processor's feature set does not 2528 * include at least one feature that is only supported by older Intel 2529 * or newer AMD processors. 2530 */ 2531 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2532 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2533 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2534 AMDID2_FMA4)) == 0) 2535 workaround_erratum383 = 1; 2536 2537 /* 2538 * Are large page mappings enabled? 2539 */ 2540 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2541 if (pg_ps_enabled) { 2542 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2543 ("pmap_init: can't assign to pagesizes[1]")); 2544 pagesizes[1] = NBPDR; 2545 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2546 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2547 ("pmap_init: can't assign to pagesizes[2]")); 2548 pagesizes[2] = NBPDP; 2549 } 2550 } 2551 2552 /* 2553 * Initialize pv chunk lists. 
2554 */ 2555 for (i = 0; i < PMAP_MEMDOM; i++) { 2556 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2557 TAILQ_INIT(&pv_chunks[i].pvc_list); 2558 } 2559 pmap_init_pv_table(); 2560 2561 pmap_initialized = 1; 2562 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2563 ppim = pmap_preinit_mapping + i; 2564 if (ppim->va == 0) 2565 continue; 2566 /* Make the direct map consistent */ 2567 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2568 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2569 ppim->sz, ppim->mode); 2570 } 2571 if (!bootverbose) 2572 continue; 2573 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2574 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2575 } 2576 2577 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2578 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2579 (vmem_addr_t *)&qframe); 2580 if (error != 0) 2581 panic("qframe allocation failed"); 2582 2583 lm_ents = 8; 2584 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2585 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2586 lm_ents = LMEPML4I - LMSPML4I + 1; 2587 #ifdef KMSAN 2588 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2589 printf( 2590 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2591 lm_ents, KMSANORIGPML4I - LMSPML4I); 2592 lm_ents = KMSANORIGPML4I - LMSPML4I; 2593 } 2594 #endif 2595 if (bootverbose) 2596 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2597 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2598 if (lm_ents != 0) { 2599 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2600 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2601 if (large_vmem == NULL) { 2602 printf("pmap: cannot create large map\n"); 2603 lm_ents = 0; 2604 } 2605 for (i = 0; i < lm_ents; i++) { 2606 m = pmap_large_map_getptp_unlocked(); 2607 /* XXXKIB la57 */ 2608 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2609 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2610 VM_PAGE_TO_PHYS(m); 2611 } 2612 } 2613 } 2614 2615 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2616 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2617 "Maximum number of PML4 entries for use by large map (tunable). " 2618 "Each entry corresponds to 512GB of address space."); 2619 2620 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2621 "2MB page mapping counters"); 2622 2623 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2624 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2625 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2626 2627 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2628 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2629 &pmap_pde_mappings, "2MB page mappings"); 2630 2631 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2632 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2633 &pmap_pde_p_failures, "2MB page promotion failures"); 2634 2635 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2636 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2637 &pmap_pde_promotions, "2MB page promotions"); 2638 2639 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2640 "1GB page mapping counters"); 2641 2642 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2643 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2644 &pmap_pdpe_demotions, "1GB page demotions"); 2645 2646 /*************************************************** 2647 * Low level helper routines..... 
2648 ***************************************************/ 2649 2650 static pt_entry_t 2651 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2652 { 2653 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2654 2655 switch (pmap->pm_type) { 2656 case PT_X86: 2657 case PT_RVI: 2658 /* Verify that both PAT bits are not set at the same time */ 2659 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2660 ("Invalid PAT bits in entry %#lx", entry)); 2661 2662 /* Swap the PAT bits if one of them is set */ 2663 if ((entry & x86_pat_bits) != 0) 2664 entry ^= x86_pat_bits; 2665 break; 2666 case PT_EPT: 2667 /* 2668 * Nothing to do - the memory attributes are represented 2669 * the same way for regular pages and superpages. 2670 */ 2671 break; 2672 default: 2673 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2674 } 2675 2676 return (entry); 2677 } 2678 2679 boolean_t 2680 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2681 { 2682 2683 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2684 pat_index[(int)mode] >= 0); 2685 } 2686 2687 /* 2688 * Determine the appropriate bits to set in a PTE or PDE for a specified 2689 * caching mode. 2690 */ 2691 int 2692 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 2693 { 2694 int cache_bits, pat_flag, pat_idx; 2695 2696 if (!pmap_is_valid_memattr(pmap, mode)) 2697 panic("Unknown caching mode %d\n", mode); 2698 2699 switch (pmap->pm_type) { 2700 case PT_X86: 2701 case PT_RVI: 2702 /* The PAT bit is different for PTE's and PDE's. */ 2703 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2704 2705 /* Map the caching mode to a PAT index. */ 2706 pat_idx = pat_index[mode]; 2707 2708 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2709 cache_bits = 0; 2710 if (pat_idx & 0x4) 2711 cache_bits |= pat_flag; 2712 if (pat_idx & 0x2) 2713 cache_bits |= PG_NC_PCD; 2714 if (pat_idx & 0x1) 2715 cache_bits |= PG_NC_PWT; 2716 break; 2717 2718 case PT_EPT: 2719 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2720 break; 2721 2722 default: 2723 panic("unsupported pmap type %d", pmap->pm_type); 2724 } 2725 2726 return (cache_bits); 2727 } 2728 2729 static int 2730 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 2731 { 2732 int mask; 2733 2734 switch (pmap->pm_type) { 2735 case PT_X86: 2736 case PT_RVI: 2737 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2738 break; 2739 case PT_EPT: 2740 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2741 break; 2742 default: 2743 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2744 } 2745 2746 return (mask); 2747 } 2748 2749 static int 2750 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2751 { 2752 int pat_flag, pat_idx; 2753 2754 pat_idx = 0; 2755 switch (pmap->pm_type) { 2756 case PT_X86: 2757 case PT_RVI: 2758 /* The PAT bit is different for PTE's and PDE's. */ 2759 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2760 2761 if ((pte & pat_flag) != 0) 2762 pat_idx |= 0x4; 2763 if ((pte & PG_NC_PCD) != 0) 2764 pat_idx |= 0x2; 2765 if ((pte & PG_NC_PWT) != 0) 2766 pat_idx |= 0x1; 2767 break; 2768 case PT_EPT: 2769 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2770 panic("EPT PTE %#lx has no PAT memory type", pte); 2771 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2772 break; 2773 } 2774 2775 /* See pmap_init_pat(). 
*/ 2776 if (pat_idx == 4) 2777 pat_idx = 0; 2778 if (pat_idx == 7) 2779 pat_idx = 3; 2780 2781 return (pat_idx); 2782 } 2783 2784 bool 2785 pmap_ps_enabled(pmap_t pmap) 2786 { 2787 2788 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2789 } 2790 2791 static void 2792 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2793 { 2794 2795 switch (pmap->pm_type) { 2796 case PT_X86: 2797 break; 2798 case PT_RVI: 2799 case PT_EPT: 2800 /* 2801 * XXX 2802 * This is a little bogus since the generation number is 2803 * supposed to be bumped up when a region of the address 2804 * space is invalidated in the page tables. 2805 * 2806 * In this case the old PDE entry is valid but yet we want 2807 * to make sure that any mappings using the old entry are 2808 * invalidated in the TLB. 2809 * 2810 * The reason this works as expected is because we rendezvous 2811 * "all" host cpus and force any vcpu context to exit as a 2812 * side-effect. 2813 */ 2814 atomic_add_long(&pmap->pm_eptgen, 1); 2815 break; 2816 default: 2817 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2818 } 2819 pde_store(pde, newpde); 2820 } 2821 2822 /* 2823 * After changing the page size for the specified virtual address in the page 2824 * table, flush the corresponding entries from the processor's TLB. Only the 2825 * calling processor's TLB is affected. 2826 * 2827 * The calling thread must be pinned to a processor. 2828 */ 2829 static void 2830 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2831 { 2832 pt_entry_t PG_G; 2833 2834 if (pmap_type_guest(pmap)) 2835 return; 2836 2837 KASSERT(pmap->pm_type == PT_X86, 2838 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2839 2840 PG_G = pmap_global_bit(pmap); 2841 2842 if ((newpde & PG_PS) == 0) 2843 /* Demotion: flush a specific 2MB page mapping. */ 2844 pmap_invlpg(pmap, va); 2845 else if ((newpde & PG_G) == 0) 2846 /* 2847 * Promotion: flush every 4KB page mapping from the TLB 2848 * because there are too many to flush individually. 2849 */ 2850 invltlb(); 2851 else { 2852 /* 2853 * Promotion: flush every 4KB page mapping from the TLB, 2854 * including any global (PG_G) mappings. 2855 */ 2856 invltlb_glob(); 2857 } 2858 } 2859 2860 /* 2861 * The amd64 pmap uses different approaches to TLB invalidation 2862 * depending on the kernel configuration, available hardware features, 2863 * and known hardware errata. The kernel configuration option that 2864 * has the greatest operational impact on TLB invalidation is PTI, 2865 * which is enabled automatically on affected Intel CPUs. The most 2866 * impactful hardware features are first PCID, and then INVPCID 2867 * instruction presence. PCID usage is quite different for PTI 2868 * vs. non-PTI. 2869 * 2870 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2871 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2872 * space is served by two page tables, user and kernel. The user 2873 * page table only maps user space and a kernel trampoline. The 2874 * kernel trampoline includes the entirety of the kernel text but 2875 * only the kernel data that is needed to switch from user to kernel 2876 * mode. The kernel page table maps the user and kernel address 2877 * spaces in their entirety. It is identical to the per-process 2878 * page table used in non-PTI mode. 2879 * 2880 * User page tables are only used when the CPU is in user mode. 
2881 * Consequently, some TLB invalidations can be postponed until the 2882 * switch from kernel to user mode. In contrast, the user 2883 * space part of the kernel page table is used for copyout(9), so 2884 * TLB invalidations on this page table cannot be similarly postponed. 2885 * 2886 * The existence of a user mode page table for the given pmap is 2887 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2888 * which case pm_ucr3 contains the %cr3 register value for the user 2889 * mode page table's root. 2890 * 2891 * * The pm_active bitmask indicates which CPUs currently have the 2892 * pmap active. A CPU's bit is set on context switch to the pmap, and 2893 * cleared on switching off this CPU. For the kernel page table, 2894 * the pm_active field is immutable and contains all CPUs. The 2895 * kernel page table is always logically active on every processor, 2896 * but not necessarily in use by the hardware, e.g., in PTI mode. 2897 * 2898 * When requesting invalidation of virtual addresses with 2899 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2900 * all CPUs recorded as active in pm_active. Updates to and reads 2901 * from pm_active are not synchronized, and so they may race with 2902 * each other. Shootdown handlers are prepared to handle the race. 2903 * 2904 * * PCID is an optional feature of the long mode x86 MMU where TLB 2905 * entries are tagged with the 'Process ID' of the address space 2906 * they belong to. This feature provides a limited namespace for 2907 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2908 * total. 2909 * 2910 * Allocation of a PCID to a pmap is done by an algorithm described 2911 * in section 15.12, "Other TLB Consistency Algorithms", of 2912 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2913 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2914 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when 2915 * the CPU is about to start caching TLB entries from a pmap, 2916 * i.e., on the context switch that activates the pmap on the CPU. 2917 * 2918 * The PCID allocator maintains a per-CPU, per-pmap generation 2919 * count, pm_gen, which is incremented each time a new PCID is 2920 * allocated. On TLB invalidation, the generation counters for the 2921 * pmap are zeroed, which signals the context switch code that the 2922 * previously allocated PCID is no longer valid. Effectively, 2923 * zeroing any of these counters triggers a TLB shootdown for the 2924 * given CPU/address space, due to the allocation of a new PCID. 2925 * 2926 * Zeroing can be performed remotely. Consequently, if a pmap is 2927 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2928 * be initiated by an ordinary memory access to reset the target 2929 * CPU's generation count within the pmap. The CPU initiating the 2930 * TLB shootdown does not need to send an IPI to the target CPU. 2931 * 2932 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2933 * for complete (kernel) page tables, and PCIDs for user mode page 2934 * tables. A user PCID value is obtained from the kernel PCID value 2935 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2936 * 2937 * User space page tables are activated on return to user mode, by 2938 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2939 * clearing bit 63 of the loaded ucr3, this effectively causes 2940 * complete invalidation of the user mode TLB entries for the 2941 * current pmap. 
In which case, local invalidations of individual 2942 * pages in the user page table are skipped. 2943 * 2944 * * Local invalidation, all modes. If the requested invalidation is 2945 * for a specific address or the total invalidation of a currently 2946 * active pmap, then the TLB is flushed using INVLPG for a kernel 2947 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a 2948 * user space page table(s). 2949 * 2950 * If the INVPCID instruction is available, it is used to flush user 2951 * entries from the kernel page table. 2952 * 2953 * When PCID is enabled, the INVLPG instruction invalidates all TLB 2954 * entries for the given page that either match the current PCID or 2955 * are global. Since TLB entries for the same page under different 2956 * PCIDs are unaffected, kernel pages which reside in all address 2957 * spaces could be problematic. We avoid the problem by creating 2958 * all kernel PTEs with the global flag (PG_G) set, when PTI is 2959 * disabled. 2960 * 2961 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its 2962 * address space, all other 4095 PCIDs are used for user mode spaces 2963 * as described above. A context switch allocates a new PCID if 2964 * the recorded PCID is zero or the recorded generation does not match 2965 * the CPU's generation, effectively flushing the TLB for this address space. 2966 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2967 * local user page: INVLPG 2968 * local kernel page: INVLPG 2969 * local user total: INVPCID(CTX) 2970 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2971 * remote user page, inactive pmap: zero pm_gen 2972 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2973 * (Both actions are required to handle the aforementioned pm_active races.) 2974 * remote kernel page: IPI:INVLPG 2975 * remote user total, inactive pmap: zero pm_gen 2976 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2977 * reload %cr3) 2978 * (See note above about pm_active races.) 2979 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2980 * 2981 * PTI enabled, PCID present. 2982 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2983 * for upt 2984 * local kernel page: INVLPG 2985 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 2986 * on loading UCR3 into %cr3 for upt 2987 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2988 * remote user page, inactive pmap: zero pm_gen 2989 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 2990 * INVPCID(ADDR) for upt) 2991 * remote kernel page: IPI:INVLPG 2992 * remote user total, inactive pmap: zero pm_gen 2993 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 2994 * clear PCID_SAVE on loading UCR3 into $cr3 for upt) 2995 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2996 * 2997 * No PCID. 2998 * local user page: INVLPG 2999 * local kernel page: INVLPG 3000 * local user total: reload %cr3 3001 * local kernel total: invltlb_glob() 3002 * remote user page, inactive pmap: - 3003 * remote user page, active pmap: IPI:INVLPG 3004 * remote kernel page: IPI:INVLPG 3005 * remote user total, inactive pmap: - 3006 * remote user total, active pmap: IPI:(reload %cr3) 3007 * remote kernel total: IPI:invltlb_glob() 3008 * Since on return to user mode, the reload of %cr3 with ucr3 causes 3009 * TLB invalidation, no specific action is required for user page table. 3010 * 3011 * EPT. 
EPT pmaps do not map KVA, all mappings are userspace. 3012 * XXX TODO 3013 */ 3014 3015 #ifdef SMP 3016 /* 3017 * Interrupt the cpus that are executing in the guest context. 3018 * This will force the vcpu to exit and the cached EPT mappings 3019 * will be invalidated by the host before the next vmresume. 3020 */ 3021 static __inline void 3022 pmap_invalidate_ept(pmap_t pmap) 3023 { 3024 smr_seq_t goal; 3025 int ipinum; 3026 3027 sched_pin(); 3028 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 3029 ("pmap_invalidate_ept: absurd pm_active")); 3030 3031 /* 3032 * The TLB mappings associated with a vcpu context are not 3033 * flushed each time a different vcpu is chosen to execute. 3034 * 3035 * This is in contrast with a process's vtop mappings that 3036 * are flushed from the TLB on each context switch. 3037 * 3038 * Therefore we need to do more than just a TLB shootdown on 3039 * the active cpus in 'pmap->pm_active'. To do this we keep 3040 * track of the number of invalidations performed on this pmap. 3041 * 3042 * Each vcpu keeps a cache of this counter and compares it 3043 * just before a vmresume. If the counter is out-of-date an 3044 * invept will be done to flush stale mappings from the TLB. 3045 * 3046 * To ensure that all vCPU threads have observed the new counter 3047 * value before returning, we use SMR. Ordering is important here: 3048 * the VMM enters an SMR read section before loading the counter 3049 * and after updating the pm_active bit set. Thus, pm_active is 3050 * a superset of active readers, and any reader that has observed 3051 * the goal has observed the new counter value. 3052 */ 3053 atomic_add_long(&pmap->pm_eptgen, 1); 3054 3055 goal = smr_advance(pmap->pm_eptsmr); 3056 3057 /* 3058 * Force the vcpu to exit and trap back into the hypervisor. 3059 */ 3060 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3061 ipi_selected(pmap->pm_active, ipinum); 3062 sched_unpin(); 3063 3064 /* 3065 * Ensure that all active vCPUs will observe the new generation counter 3066 * value before executing any more guest instructions. 3067 */ 3068 smr_wait(pmap->pm_eptsmr, goal); 3069 } 3070 3071 static inline void 3072 pmap_invalidate_preipi_pcid(pmap_t pmap) 3073 { 3074 struct pmap_pcid *pcidp; 3075 u_int cpuid, i; 3076 3077 sched_pin(); 3078 3079 cpuid = PCPU_GET(cpuid); 3080 if (pmap != PCPU_GET(curpmap)) 3081 cpuid = 0xffffffff; /* An impossible value */ 3082 3083 CPU_FOREACH(i) { 3084 if (cpuid != i) { 3085 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 3086 pcidp->pm_gen = 0; 3087 } 3088 } 3089 3090 /* 3091 * The fence is between stores to pm_gen and the read of the 3092 * pm_active mask. We need to ensure that it is impossible 3093 * for us to miss the bit update in pm_active and 3094 * simultaneously observe a non-zero pm_gen in 3095 * pmap_activate_sw(), otherwise TLB update is missed. 3096 * Without the fence, IA32 allows such an outcome. Note that 3097 * pm_active is updated by a locked operation, which provides 3098 * the reciprocal fence. 3099 */ 3100 atomic_thread_fence_seq_cst(); 3101 } 3102 3103 static void 3104 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3105 { 3106 sched_pin(); 3107 } 3108 3109 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3110 { 3111 return (pmap_pcid_enabled ? 
pmap_invalidate_preipi_pcid : 3112 pmap_invalidate_preipi_nopcid); 3113 } 3114 3115 static inline void 3116 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3117 const bool invpcid_works1) 3118 { 3119 struct invpcid_descr d; 3120 uint64_t kcr3, ucr3; 3121 uint32_t pcid; 3122 3123 /* 3124 * Because pm_pcid is recalculated on a context switch, we 3125 * must ensure there is no preemption, not just pinning. 3126 * Otherwise, we might use a stale value below. 3127 */ 3128 CRITICAL_ASSERT(curthread); 3129 3130 /* 3131 * No need to do anything with user page tables invalidation 3132 * if there is no user page table, or invalidation is deferred 3133 * until the return to userspace. ucr3_load_mask is stable 3134 * because we have preemption disabled. 3135 */ 3136 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3137 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3138 return; 3139 3140 pcid = pmap_get_pcid(pmap); 3141 if (invpcid_works1) { 3142 d.pcid = pcid | PMAP_PCID_USER_PT; 3143 d.pad = 0; 3144 d.addr = va; 3145 invpcid(&d, INVPCID_ADDR); 3146 } else { 3147 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3148 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3149 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3150 } 3151 } 3152 3153 static void 3154 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3155 { 3156 pmap_invalidate_page_pcid_cb(pmap, va, true); 3157 } 3158 3159 static void 3160 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3161 { 3162 pmap_invalidate_page_pcid_cb(pmap, va, false); 3163 } 3164 3165 static void 3166 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3167 { 3168 } 3169 3170 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3171 { 3172 if (pmap_pcid_enabled) 3173 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : 3174 pmap_invalidate_page_pcid_noinvpcid_cb); 3175 return (pmap_invalidate_page_nopcid_cb); 3176 } 3177 3178 static void 3179 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3180 vm_offset_t addr2 __unused) 3181 { 3182 if (pmap == kernel_pmap) { 3183 pmap_invlpg(kernel_pmap, va); 3184 } else if (pmap == PCPU_GET(curpmap)) { 3185 invlpg(va); 3186 pmap_invalidate_page_cb(pmap, va); 3187 } 3188 } 3189 3190 void 3191 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3192 { 3193 if (pmap_type_guest(pmap)) { 3194 pmap_invalidate_ept(pmap); 3195 return; 3196 } 3197 3198 KASSERT(pmap->pm_type == PT_X86, 3199 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3200 3201 pmap_invalidate_preipi(pmap); 3202 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb); 3203 } 3204 3205 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3206 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3207 3208 static void 3209 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3210 const bool invpcid_works1) 3211 { 3212 struct invpcid_descr d; 3213 uint64_t kcr3, ucr3; 3214 uint32_t pcid; 3215 3216 CRITICAL_ASSERT(curthread); 3217 3218 if (pmap != PCPU_GET(curpmap) || 3219 pmap->pm_ucr3 == PMAP_NO_CR3 || 3220 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3221 return; 3222 3223 pcid = pmap_get_pcid(pmap); 3224 if (invpcid_works1) { 3225 d.pcid = pcid | PMAP_PCID_USER_PT; 3226 d.pad = 0; 3227 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3228 invpcid(&d, INVPCID_ADDR); 3229 } else { 3230 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3231 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3232 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3233 } 3234 } 3235 3236 static void 3237 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3238 vm_offset_t eva) 3239 { 3240 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3241 } 3242 3243 static void 3244 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3245 vm_offset_t eva) 3246 { 3247 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3248 } 3249 3250 static void 3251 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3252 vm_offset_t eva __unused) 3253 { 3254 } 3255 3256 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3257 vm_offset_t)) 3258 { 3259 if (pmap_pcid_enabled) 3260 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : 3261 pmap_invalidate_range_pcid_noinvpcid_cb); 3262 return (pmap_invalidate_range_nopcid_cb); 3263 } 3264 3265 static void 3266 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3267 { 3268 vm_offset_t addr; 3269 3270 if (pmap == kernel_pmap) { 3271 if (PCPU_GET(pcid_invlpg_workaround)) { 3272 struct invpcid_descr d = { 0 }; 3273 3274 invpcid(&d, INVPCID_CTXGLOB); 3275 } else { 3276 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3277 invlpg(addr); 3278 } 3279 } else if (pmap == PCPU_GET(curpmap)) { 3280 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3281 invlpg(addr); 3282 pmap_invalidate_range_cb(pmap, sva, eva); 3283 } 3284 } 3285 3286 void 3287 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3288 { 3289 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3290 pmap_invalidate_all(pmap); 3291 return; 3292 } 3293 3294 if (pmap_type_guest(pmap)) { 3295 pmap_invalidate_ept(pmap); 3296 return; 3297 } 3298 3299 KASSERT(pmap->pm_type == PT_X86, 3300 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3301 3302 pmap_invalidate_preipi(pmap); 3303 smp_masked_invlpg_range(sva, eva, pmap, 3304 pmap_invalidate_range_curcpu_cb); 3305 } 3306 3307 static inline void 3308 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3309 { 3310 struct invpcid_descr d; 3311 uint64_t kcr3; 3312 uint32_t pcid; 3313 3314 if (pmap == kernel_pmap) { 3315 if (invpcid_works1) { 3316 bzero(&d, sizeof(d)); 3317 invpcid(&d, INVPCID_CTXGLOB); 3318 } else { 3319 invltlb_glob(); 3320 } 3321 } else if (pmap == PCPU_GET(curpmap)) { 3322 CRITICAL_ASSERT(curthread); 3323 3324 pcid = pmap_get_pcid(pmap); 3325 if (invpcid_works1) { 3326 d.pcid = pcid; 3327 d.pad = 0; 3328 d.addr = 0; 3329 invpcid(&d, INVPCID_CTX); 3330 } else { 3331 kcr3 = pmap->pm_cr3 | pcid; 3332 load_cr3(kcr3); 3333 } 3334 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3335 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3336 } 3337 } 3338 3339 static void 3340 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3341 { 3342 pmap_invalidate_all_pcid_cb(pmap, true); 3343 } 3344 3345 static void 3346 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3347 { 3348 pmap_invalidate_all_pcid_cb(pmap, false); 3349 } 3350 3351 static void 3352 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3353 { 3354 if (pmap == kernel_pmap) 3355 invltlb_glob(); 3356 else if (pmap == PCPU_GET(curpmap)) 3357 invltlb(); 3358 } 3359 3360 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3361 { 3362 if (pmap_pcid_enabled) 3363 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3364 pmap_invalidate_all_pcid_noinvpcid_cb); 3365 return (pmap_invalidate_all_nopcid_cb); 3366 } 3367 3368 static void 3369 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3370 vm_offset_t addr2 __unused) 3371 { 3372 pmap_invalidate_all_cb(pmap); 3373 } 3374 3375 void 3376 pmap_invalidate_all(pmap_t pmap) 3377 { 3378 if (pmap_type_guest(pmap)) { 3379 pmap_invalidate_ept(pmap); 3380 return; 3381 } 3382 3383 KASSERT(pmap->pm_type == PT_X86, 3384 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3385 3386 pmap_invalidate_preipi(pmap); 3387 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb); 3388 } 3389 3390 static void 3391 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3392 vm_offset_t addr2 __unused) 3393 { 3394 wbinvd(); 3395 } 3396 3397 void 3398 pmap_invalidate_cache(void) 3399 { 3400 sched_pin(); 3401 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3402 } 3403 3404 struct pde_action { 3405 cpuset_t invalidate; /* processors that invalidate their TLB */ 3406 pmap_t pmap; 3407 vm_offset_t va; 3408 pd_entry_t *pde; 3409 pd_entry_t newpde; 3410 u_int store; /* processor that updates the PDE */ 3411 }; 3412 3413 static void 3414 pmap_update_pde_action(void *arg) 3415 { 3416 struct pde_action *act = arg; 3417 3418 if (act->store == PCPU_GET(cpuid)) 3419 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3420 } 3421 3422 static void 3423 pmap_update_pde_teardown(void *arg) 3424 { 3425 struct pde_action *act = arg; 3426 3427 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3428 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3429 } 3430 3431 /* 3432 * Change the page size for the specified virtual address in a way that 3433 * prevents any possibility of the TLB ever having two entries that map the 3434 * same virtual address using different page sizes. This is the recommended 3435 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3436 * machine check exception for a TLB state that is improperly diagnosed as a 3437 * hardware error. 3438 */ 3439 static void 3440 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3441 { 3442 struct pde_action act; 3443 cpuset_t active, other_cpus; 3444 u_int cpuid; 3445 3446 sched_pin(); 3447 cpuid = PCPU_GET(cpuid); 3448 other_cpus = all_cpus; 3449 CPU_CLR(cpuid, &other_cpus); 3450 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3451 active = all_cpus; 3452 else { 3453 active = pmap->pm_active; 3454 } 3455 if (CPU_OVERLAP(&active, &other_cpus)) { 3456 act.store = cpuid; 3457 act.invalidate = active; 3458 act.va = va; 3459 act.pmap = pmap; 3460 act.pde = pde; 3461 act.newpde = newpde; 3462 CPU_SET(cpuid, &active); 3463 smp_rendezvous_cpus(active, 3464 smp_no_rendezvous_barrier, pmap_update_pde_action, 3465 pmap_update_pde_teardown, &act); 3466 } else { 3467 pmap_update_pde_store(pmap, pde, newpde); 3468 if (CPU_ISSET(cpuid, &active)) 3469 pmap_update_pde_invalidate(pmap, va, newpde); 3470 } 3471 sched_unpin(); 3472 } 3473 #else /* !SMP */ 3474 /* 3475 * Normal, non-SMP, invalidation functions. 
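 * These mirror the SMP versions above but only have the local CPU to
 * deal with: guest (EPT/RVI) pmaps just bump pm_eptgen, and a pmap
 * that is not current merely has its per-CPU PCID generation cleared
 * so that the next activation allocates a fresh PCID.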
3476 */ 3477 void 3478 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3479 { 3480 struct invpcid_descr d; 3481 struct pmap_pcid *pcidp; 3482 uint64_t kcr3, ucr3; 3483 uint32_t pcid; 3484 3485 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3486 pmap->pm_eptgen++; 3487 return; 3488 } 3489 KASSERT(pmap->pm_type == PT_X86, 3490 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3491 3492 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3493 invlpg(va); 3494 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3495 pmap->pm_ucr3 != PMAP_NO_CR3) { 3496 critical_enter(); 3497 pcid = pmap_get_pcid(pmap); 3498 if (invpcid_works) { 3499 d.pcid = pcid | PMAP_PCID_USER_PT; 3500 d.pad = 0; 3501 d.addr = va; 3502 invpcid(&d, INVPCID_ADDR); 3503 } else { 3504 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3505 ucr3 = pmap->pm_ucr3 | pcid | 3506 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3507 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3508 } 3509 critical_exit(); 3510 } 3511 } else if (pmap_pcid_enabled) { 3512 pcidp = zpcpu_get(pmap->pm_pcidp); 3513 pcidp->pm_gen = 0; 3514 } 3515 } 3516 3517 void 3518 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3519 { 3520 struct invpcid_descr d; 3521 struct pmap_pcid *pcidp; 3522 vm_offset_t addr; 3523 uint64_t kcr3, ucr3; 3524 uint32_t pcid; 3525 3526 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3527 pmap->pm_eptgen++; 3528 return; 3529 } 3530 KASSERT(pmap->pm_type == PT_X86, 3531 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3532 3533 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3534 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3535 invlpg(addr); 3536 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3537 pmap->pm_ucr3 != PMAP_NO_CR3) { 3538 critical_enter(); 3539 pcid = pmap_get_pcid(pmap); 3540 if (invpcid_works) { 3541 d.pcid = pcid | PMAP_PCID_USER_PT; 3542 d.pad = 0; 3543 d.addr = sva; 3544 for (; d.addr < eva; d.addr += PAGE_SIZE) 3545 invpcid(&d, INVPCID_ADDR); 3546 } else { 3547 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3548 ucr3 = pmap->pm_ucr3 | pcid | 3549 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3550 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3551 } 3552 critical_exit(); 3553 } 3554 } else if (pmap_pcid_enabled) { 3555 pcidp = zpcpu_get(pmap->pm_pcidp); 3556 pcidp->pm_gen = 0; 3557 } 3558 } 3559 3560 void 3561 pmap_invalidate_all(pmap_t pmap) 3562 { 3563 struct invpcid_descr d; 3564 struct pmap_pcid *pcidp; 3565 uint64_t kcr3, ucr3; 3566 uint32_t pcid; 3567 3568 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3569 pmap->pm_eptgen++; 3570 return; 3571 } 3572 KASSERT(pmap->pm_type == PT_X86, 3573 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3574 3575 if (pmap == kernel_pmap) { 3576 if (pmap_pcid_enabled && invpcid_works) { 3577 bzero(&d, sizeof(d)); 3578 invpcid(&d, INVPCID_CTXGLOB); 3579 } else { 3580 invltlb_glob(); 3581 } 3582 } else if (pmap == PCPU_GET(curpmap)) { 3583 if (pmap_pcid_enabled) { 3584 critical_enter(); 3585 pcid = pmap_get_pcid(pmap); 3586 if (invpcid_works) { 3587 d.pcid = pcid; 3588 d.pad = 0; 3589 d.addr = 0; 3590 invpcid(&d, INVPCID_CTX); 3591 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3592 d.pcid |= PMAP_PCID_USER_PT; 3593 invpcid(&d, INVPCID_CTX); 3594 } 3595 } else { 3596 kcr3 = pmap->pm_cr3 | pcid; 3597 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3598 ucr3 = pmap->pm_ucr3 | pcid | 3599 PMAP_PCID_USER_PT; 3600 pmap_pti_pcid_invalidate(ucr3, kcr3); 3601 } else 3602 load_cr3(kcr3); 3603 } 3604 critical_exit(); 3605 } else { 3606 invltlb(); 3607 
} 3608 } else if (pmap_pcid_enabled) { 3609 pcidp = zpcpu_get(pmap->pm_pcidp); 3610 pcidp->pm_gen = 0; 3611 } 3612 } 3613 3614 void 3615 pmap_invalidate_cache(void) 3616 { 3617 3618 wbinvd(); 3619 } 3620 3621 static void 3622 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3623 { 3624 struct pmap_pcid *pcidp; 3625 3626 pmap_update_pde_store(pmap, pde, newpde); 3627 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3628 pmap_update_pde_invalidate(pmap, va, newpde); 3629 else { 3630 pcidp = zpcpu_get(pmap->pm_pcidp); 3631 pcidp->pm_gen = 0; 3632 } 3633 } 3634 #endif /* !SMP */ 3635 3636 static void 3637 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3638 { 3639 3640 /* 3641 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3642 * by a promotion that did not invalidate the 512 4KB page mappings 3643 * that might exist in the TLB. Consequently, at this point, the TLB 3644 * may hold both 4KB and 2MB page mappings for the address range [va, 3645 * va + NBPDR). Therefore, the entire range must be invalidated here. 3646 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3647 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3648 * single INVLPG suffices to invalidate the 2MB page mapping from the 3649 * TLB. 3650 */ 3651 if ((pde & PG_PROMOTED) != 0) 3652 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3653 else 3654 pmap_invalidate_page(pmap, va); 3655 } 3656 3657 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3658 (vm_offset_t sva, vm_offset_t eva)) 3659 { 3660 3661 if ((cpu_feature & CPUID_SS) != 0) 3662 return (pmap_invalidate_cache_range_selfsnoop); 3663 if ((cpu_feature & CPUID_CLFSH) != 0) 3664 return (pmap_force_invalidate_cache_range); 3665 return (pmap_invalidate_cache_range_all); 3666 } 3667 3668 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3669 3670 static void 3671 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3672 { 3673 3674 KASSERT((sva & PAGE_MASK) == 0, 3675 ("pmap_invalidate_cache_range: sva not page-aligned")); 3676 KASSERT((eva & PAGE_MASK) == 0, 3677 ("pmap_invalidate_cache_range: eva not page-aligned")); 3678 } 3679 3680 static void 3681 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3682 { 3683 3684 pmap_invalidate_cache_range_check_align(sva, eva); 3685 } 3686 3687 void 3688 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3689 { 3690 3691 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3692 3693 /* 3694 * XXX: Some CPUs fault, hang, or trash the local APIC 3695 * registers if we use CLFLUSH on the local APIC range. The 3696 * local APIC is always uncached, so we don't need to flush 3697 * for that range anyway. 3698 */ 3699 if (pmap_kextract(sva) == lapic_paddr) 3700 return; 3701 3702 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3703 /* 3704 * Do per-cache line flush. Use a locked 3705 * instruction to insure that previous stores are 3706 * included in the write-back. The processor 3707 * propagates flush to other processors in the cache 3708 * coherence domain. 3709 */ 3710 atomic_thread_fence_seq_cst(); 3711 for (; sva < eva; sva += cpu_clflush_line_size) 3712 clflushopt(sva); 3713 atomic_thread_fence_seq_cst(); 3714 } else { 3715 /* 3716 * Writes are ordered by CLFLUSH on Intel CPUs. 
3717 */ 3718 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3719 mfence(); 3720 for (; sva < eva; sva += cpu_clflush_line_size) 3721 clflush(sva); 3722 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3723 mfence(); 3724 } 3725 } 3726 3727 static void 3728 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3729 { 3730 3731 pmap_invalidate_cache_range_check_align(sva, eva); 3732 pmap_invalidate_cache(); 3733 } 3734 3735 /* 3736 * Remove the specified set of pages from the data and instruction caches. 3737 * 3738 * In contrast to pmap_invalidate_cache_range(), this function does not 3739 * rely on the CPU's self-snoop feature, because it is intended for use 3740 * when moving pages into a different cache domain. 3741 */ 3742 void 3743 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3744 { 3745 vm_offset_t daddr, eva; 3746 int i; 3747 bool useclflushopt; 3748 3749 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3750 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3751 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3752 pmap_invalidate_cache(); 3753 else { 3754 if (useclflushopt) 3755 atomic_thread_fence_seq_cst(); 3756 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3757 mfence(); 3758 for (i = 0; i < count; i++) { 3759 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3760 eva = daddr + PAGE_SIZE; 3761 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3762 if (useclflushopt) 3763 clflushopt(daddr); 3764 else 3765 clflush(daddr); 3766 } 3767 } 3768 if (useclflushopt) 3769 atomic_thread_fence_seq_cst(); 3770 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3771 mfence(); 3772 } 3773 } 3774 3775 void 3776 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3777 { 3778 3779 pmap_invalidate_cache_range_check_align(sva, eva); 3780 3781 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3782 pmap_force_invalidate_cache_range(sva, eva); 3783 return; 3784 } 3785 3786 /* See comment in pmap_force_invalidate_cache_range(). */ 3787 if (pmap_kextract(sva) == lapic_paddr) 3788 return; 3789 3790 atomic_thread_fence_seq_cst(); 3791 for (; sva < eva; sva += cpu_clflush_line_size) 3792 clwb(sva); 3793 atomic_thread_fence_seq_cst(); 3794 } 3795 3796 void 3797 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3798 { 3799 pt_entry_t *pte; 3800 vm_offset_t vaddr; 3801 int error __diagused; 3802 int pte_bits; 3803 3804 KASSERT((spa & PAGE_MASK) == 0, 3805 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3806 KASSERT((epa & PAGE_MASK) == 0, 3807 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3808 3809 if (spa < dmaplimit) { 3810 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3811 dmaplimit, epa))); 3812 if (dmaplimit >= epa) 3813 return; 3814 spa = dmaplimit; 3815 } 3816 3817 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 3818 X86_PG_V; 3819 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3820 &vaddr); 3821 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3822 pte = vtopte(vaddr); 3823 for (; spa < epa; spa += PAGE_SIZE) { 3824 sched_pin(); 3825 pte_store(pte, spa | pte_bits); 3826 pmap_invlpg(kernel_pmap, vaddr); 3827 /* XXXKIB atomic inside flush_cache_range are excessive */ 3828 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3829 sched_unpin(); 3830 } 3831 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3832 } 3833 3834 /* 3835 * Routine: pmap_extract 3836 * Function: 3837 * Extract the physical page address associated 3838 * with the given map/virtual_address pair. 
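 *
 * A minimal usage sketch (hypothetical caller; a return value of 0
 * means that no valid mapping currently exists for "va"):
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(pmap, va);
 *	if (pa == 0)
 *		... handle the unmapped address ...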
3839 */ 3840 vm_paddr_t 3841 pmap_extract(pmap_t pmap, vm_offset_t va) 3842 { 3843 pdp_entry_t *pdpe; 3844 pd_entry_t *pde; 3845 pt_entry_t *pte, PG_V; 3846 vm_paddr_t pa; 3847 3848 pa = 0; 3849 PG_V = pmap_valid_bit(pmap); 3850 PMAP_LOCK(pmap); 3851 pdpe = pmap_pdpe(pmap, va); 3852 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3853 if ((*pdpe & PG_PS) != 0) 3854 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3855 else { 3856 pde = pmap_pdpe_to_pde(pdpe, va); 3857 if ((*pde & PG_V) != 0) { 3858 if ((*pde & PG_PS) != 0) { 3859 pa = (*pde & PG_PS_FRAME) | 3860 (va & PDRMASK); 3861 } else { 3862 pte = pmap_pde_to_pte(pde, va); 3863 pa = (*pte & PG_FRAME) | 3864 (va & PAGE_MASK); 3865 } 3866 } 3867 } 3868 } 3869 PMAP_UNLOCK(pmap); 3870 return (pa); 3871 } 3872 3873 /* 3874 * Routine: pmap_extract_and_hold 3875 * Function: 3876 * Atomically extract and hold the physical page 3877 * with the given pmap and virtual address pair 3878 * if that mapping permits the given protection. 3879 */ 3880 vm_page_t 3881 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3882 { 3883 pdp_entry_t pdpe, *pdpep; 3884 pd_entry_t pde, *pdep; 3885 pt_entry_t pte, PG_RW, PG_V; 3886 vm_page_t m; 3887 3888 m = NULL; 3889 PG_RW = pmap_rw_bit(pmap); 3890 PG_V = pmap_valid_bit(pmap); 3891 PMAP_LOCK(pmap); 3892 3893 pdpep = pmap_pdpe(pmap, va); 3894 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3895 goto out; 3896 if ((pdpe & PG_PS) != 0) { 3897 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3898 goto out; 3899 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3900 goto check_page; 3901 } 3902 3903 pdep = pmap_pdpe_to_pde(pdpep, va); 3904 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3905 goto out; 3906 if ((pde & PG_PS) != 0) { 3907 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3908 goto out; 3909 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3910 goto check_page; 3911 } 3912 3913 pte = *pmap_pde_to_pte(pdep, va); 3914 if ((pte & PG_V) == 0 || 3915 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3916 goto out; 3917 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3918 3919 check_page: 3920 if (m != NULL && !vm_page_wire_mapped(m)) 3921 m = NULL; 3922 out: 3923 PMAP_UNLOCK(pmap); 3924 return (m); 3925 } 3926 3927 /* 3928 * Routine: pmap_kextract 3929 * Function: 3930 * Extract the physical page address associated with the given kernel 3931 * virtual address. 3932 */ 3933 vm_paddr_t 3934 pmap_kextract(vm_offset_t va) 3935 { 3936 pd_entry_t pde; 3937 vm_paddr_t pa; 3938 3939 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3940 pa = DMAP_TO_PHYS(va); 3941 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3942 pa = pmap_large_map_kextract(va); 3943 } else { 3944 pde = *vtopde(va); 3945 if (pde & PG_PS) { 3946 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3947 } else { 3948 /* 3949 * Beware of a concurrent promotion that changes the 3950 * PDE at this point! For example, vtopte() must not 3951 * be used to access the PTE because it would use the 3952 * new PDE. It is, however, safe to use the old PDE 3953 * because the page table page is preserved by the 3954 * promotion. 3955 */ 3956 pa = *pmap_pde_to_pte(&pde, va); 3957 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3958 } 3959 } 3960 return (pa); 3961 } 3962 3963 /*************************************************** 3964 * Low level mapping routines..... 3965 ***************************************************/ 3966 3967 /* 3968 * Add a wired page to the kva. 3969 * Note: not SMP coherent. 
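 *
 * A minimal usage sketch (assumes "va" is a page of KVA obtained
 * elsewhere, e.g. from kva_alloc(), and "pa" is the physical address
 * to map; because this routine is not SMP coherent, the caller
 * performs whatever TLB invalidation it needs):
 *
 *	pmap_kenter(va, pa);
 *	... access the page through va ...
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);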
3970 */ 3971 void 3972 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3973 { 3974 pt_entry_t *pte; 3975 3976 pte = vtopte(va); 3977 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3978 X86_PG_RW | X86_PG_V); 3979 } 3980 3981 static __inline void 3982 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3983 { 3984 pt_entry_t *pte; 3985 int cache_bits; 3986 3987 pte = vtopte(va); 3988 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 3989 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3990 X86_PG_RW | X86_PG_V | cache_bits); 3991 } 3992 3993 /* 3994 * Remove a page from the kernel pagetables. 3995 * Note: not SMP coherent. 3996 */ 3997 void 3998 pmap_kremove(vm_offset_t va) 3999 { 4000 pt_entry_t *pte; 4001 4002 pte = vtopte(va); 4003 pte_clear(pte); 4004 } 4005 4006 /* 4007 * Used to map a range of physical addresses into kernel 4008 * virtual address space. 4009 * 4010 * The value passed in '*virt' is a suggested virtual address for 4011 * the mapping. Architectures which can support a direct-mapped 4012 * physical to virtual region can return the appropriate address 4013 * within that region, leaving '*virt' unchanged. Other 4014 * architectures should map the pages starting at '*virt' and 4015 * update '*virt' with the first usable address after the mapped 4016 * region. 4017 */ 4018 vm_offset_t 4019 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 4020 { 4021 return PHYS_TO_DMAP(start); 4022 } 4023 4024 /* 4025 * Add a list of wired pages to the kva 4026 * this routine is only used for temporary 4027 * kernel mappings that do not need to have 4028 * page modification or references recorded. 4029 * Note that old mappings are simply written 4030 * over. The page *must* be wired. 4031 * Note: SMP coherent. Uses a ranged shootdown IPI. 4032 */ 4033 void 4034 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4035 { 4036 pt_entry_t *endpte, oldpte, pa, *pte; 4037 vm_page_t m; 4038 int cache_bits; 4039 4040 oldpte = 0; 4041 pte = vtopte(sva); 4042 endpte = pte + count; 4043 while (pte < endpte) { 4044 m = *ma++; 4045 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 4046 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 4047 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 4048 oldpte |= *pte; 4049 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | 4050 X86_PG_M | X86_PG_RW | X86_PG_V); 4051 } 4052 pte++; 4053 } 4054 if (__predict_false((oldpte & X86_PG_V) != 0)) 4055 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4056 PAGE_SIZE); 4057 } 4058 4059 /* 4060 * This routine tears out page mappings from the 4061 * kernel -- it is meant only for temporary mappings. 4062 * Note: SMP coherent. Uses a ranged shootdown IPI. 4063 */ 4064 void 4065 pmap_qremove(vm_offset_t sva, int count) 4066 { 4067 vm_offset_t va; 4068 4069 va = sva; 4070 while (count-- > 0) { 4071 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 4072 pmap_kremove(va); 4073 va += PAGE_SIZE; 4074 } 4075 pmap_invalidate_range(kernel_pmap, sva, va); 4076 } 4077 4078 /*************************************************** 4079 * Page table page management routines..... 4080 ***************************************************/ 4081 /* 4082 * Schedule the specified unused page table page to be freed. Specifically, 4083 * add the page to the specified list of pages that will be released to the 4084 * physical memory manager after the TLB has been updated. 
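 *
 * In outline, callers follow this pattern (a sketch; the caller has
 * already cleared the paging-structure entries that referenced "m"):
 *
 *	SLIST_INIT(&free);
 *	pmap_add_delayed_free_list(m, &free, TRUE);
 *	... invalidate the TLB for the affected range ...
 *	vm_page_free_pages_toq(&free, true);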
4085 */ 4086 static __inline void 4087 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4088 boolean_t set_PG_ZERO) 4089 { 4090 4091 if (set_PG_ZERO) 4092 m->flags |= PG_ZERO; 4093 else 4094 m->flags &= ~PG_ZERO; 4095 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4096 } 4097 4098 /* 4099 * Inserts the specified page table page into the specified pmap's collection 4100 * of idle page table pages. Each of a pmap's page table pages is responsible 4101 * for mapping a distinct range of virtual addresses. The pmap's collection is 4102 * ordered by this virtual address range. 4103 * 4104 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4105 * "mpte"'s valid field will be set to 0. 4106 * 4107 * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must 4108 * contain valid mappings with identical attributes except for PG_A; "mpte"'s 4109 * valid field will be set to 1. 4110 * 4111 * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain 4112 * valid mappings with identical attributes including PG_A; "mpte"'s valid 4113 * field will be set to VM_PAGE_BITS_ALL. 4114 */ 4115 static __inline int 4116 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4117 bool allpte_PG_A_set) 4118 { 4119 4120 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4121 KASSERT(promoted || !allpte_PG_A_set, 4122 ("a zero-filled PTP can't have PG_A set in every PTE")); 4123 mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 4124 return (vm_radix_insert(&pmap->pm_root, mpte)); 4125 } 4126 4127 /* 4128 * Removes the page table page mapping the specified virtual address from the 4129 * specified pmap's collection of idle page table pages, and returns it. 4130 * Otherwise, returns NULL if there is no page table page corresponding to the 4131 * specified virtual address. 4132 */ 4133 static __inline vm_page_t 4134 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4135 { 4136 4137 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4138 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4139 } 4140 4141 /* 4142 * Decrements a page table page's reference count, which is used to record the 4143 * number of valid page table entries within the page. If the reference count 4144 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4145 * page table page was unmapped and FALSE otherwise. 
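 *
 * A typical caller looks like the following sketch (mirroring
 * pmap_abort_ptp() below); the TLB and paging-structure caches are
 * invalidated only when the page table page was actually freed:
 *
 *	SLIST_INIT(&free);
 *	if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 *		pmap_invalidate_page(pmap, va);
 *		vm_page_free_pages_toq(&free, true);
 *	}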
4146 */ 4147 static inline boolean_t 4148 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4149 { 4150 4151 --m->ref_count; 4152 if (m->ref_count == 0) { 4153 _pmap_unwire_ptp(pmap, va, m, free); 4154 return (TRUE); 4155 } else 4156 return (FALSE); 4157 } 4158 4159 static void 4160 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4161 { 4162 pml5_entry_t *pml5; 4163 pml4_entry_t *pml4; 4164 pdp_entry_t *pdp; 4165 pd_entry_t *pd; 4166 vm_page_t pdpg, pdppg, pml4pg; 4167 4168 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4169 4170 /* 4171 * unmap the page table page 4172 */ 4173 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4174 /* PML4 page */ 4175 MPASS(pmap_is_la57(pmap)); 4176 pml5 = pmap_pml5e(pmap, va); 4177 *pml5 = 0; 4178 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4179 pml5 = pmap_pml5e_u(pmap, va); 4180 *pml5 = 0; 4181 } 4182 } else if (m->pindex >= NUPDE + NUPDPE) { 4183 /* PDP page */ 4184 pml4 = pmap_pml4e(pmap, va); 4185 *pml4 = 0; 4186 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4187 va <= VM_MAXUSER_ADDRESS) { 4188 pml4 = pmap_pml4e_u(pmap, va); 4189 *pml4 = 0; 4190 } 4191 } else if (m->pindex >= NUPDE) { 4192 /* PD page */ 4193 pdp = pmap_pdpe(pmap, va); 4194 *pdp = 0; 4195 } else { 4196 /* PTE page */ 4197 pd = pmap_pde(pmap, va); 4198 *pd = 0; 4199 } 4200 if (m->pindex < NUPDE) { 4201 /* We just released a PT, unhold the matching PD */ 4202 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4203 pmap_unwire_ptp(pmap, va, pdpg, free); 4204 } else if (m->pindex < NUPDE + NUPDPE) { 4205 /* We just released a PD, unhold the matching PDP */ 4206 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4207 pmap_unwire_ptp(pmap, va, pdppg, free); 4208 } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4209 /* We just released a PDP, unhold the matching PML4 */ 4210 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4211 pmap_unwire_ptp(pmap, va, pml4pg, free); 4212 } 4213 4214 pmap_pt_page_count_adj(pmap, -1); 4215 4216 /* 4217 * Put page on a list so that it is released after 4218 * *ALL* TLB shootdown is done 4219 */ 4220 pmap_add_delayed_free_list(m, free, TRUE); 4221 } 4222 4223 /* 4224 * After removing a page table entry, this routine is used to 4225 * conditionally free the page, and manage the reference count. 4226 */ 4227 static int 4228 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4229 struct spglist *free) 4230 { 4231 vm_page_t mpte; 4232 4233 if (va >= VM_MAXUSER_ADDRESS) 4234 return (0); 4235 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4236 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4237 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4238 } 4239 4240 /* 4241 * Release a page table page reference after a failed attempt to create a 4242 * mapping. 4243 */ 4244 static void 4245 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4246 { 4247 struct spglist free; 4248 4249 SLIST_INIT(&free); 4250 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4251 /* 4252 * Although "va" was never mapped, paging-structure caches 4253 * could nonetheless have entries that refer to the freed 4254 * page table pages. Invalidate those entries. 
4255 */ 4256 pmap_invalidate_page(pmap, va); 4257 vm_page_free_pages_toq(&free, true); 4258 } 4259 } 4260 4261 static void 4262 pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen) 4263 { 4264 struct pmap_pcid *pcidp; 4265 int i; 4266 4267 CPU_FOREACH(i) { 4268 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 4269 pcidp->pm_pcid = pcid; 4270 pcidp->pm_gen = gen; 4271 } 4272 } 4273 4274 void 4275 pmap_pinit0(pmap_t pmap) 4276 { 4277 struct proc *p; 4278 struct thread *td; 4279 4280 PMAP_LOCK_INIT(pmap); 4281 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4282 pmap->pm_pmltopu = NULL; 4283 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4284 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4285 pmap->pm_ucr3 = PMAP_NO_CR3; 4286 vm_radix_init(&pmap->pm_root); 4287 CPU_ZERO(&pmap->pm_active); 4288 TAILQ_INIT(&pmap->pm_pvchunk); 4289 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4290 pmap->pm_flags = pmap_flags; 4291 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK); 4292 pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1); 4293 pmap_activate_boot(pmap); 4294 td = curthread; 4295 if (pti) { 4296 p = td->td_proc; 4297 PROC_LOCK(p); 4298 p->p_md.md_flags |= P_MD_KPTI; 4299 PROC_UNLOCK(p); 4300 } 4301 pmap_thread_init_invl_gen(td); 4302 4303 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4304 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4305 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4306 UMA_ALIGN_PTR, 0); 4307 } 4308 } 4309 4310 void 4311 pmap_pinit_pml4(vm_page_t pml4pg) 4312 { 4313 pml4_entry_t *pm_pml4; 4314 int i; 4315 4316 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4317 4318 /* Wire in kernel global address entries. */ 4319 for (i = 0; i < NKPML4E; i++) { 4320 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4321 X86_PG_V; 4322 } 4323 #ifdef KASAN 4324 for (i = 0; i < NKASANPML4E; i++) { 4325 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4326 X86_PG_V | pg_nx; 4327 } 4328 #endif 4329 #ifdef KMSAN 4330 for (i = 0; i < NKMSANSHADPML4E; i++) { 4331 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4332 X86_PG_RW | X86_PG_V | pg_nx; 4333 } 4334 for (i = 0; i < NKMSANORIGPML4E; i++) { 4335 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4336 X86_PG_RW | X86_PG_V | pg_nx; 4337 } 4338 #endif 4339 for (i = 0; i < ndmpdpphys; i++) { 4340 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4341 X86_PG_V; 4342 } 4343 4344 /* install self-referential address mapping entry(s) */ 4345 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4346 X86_PG_A | X86_PG_M; 4347 4348 /* install large map entries if configured */ 4349 for (i = 0; i < lm_ents; i++) 4350 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4351 } 4352 4353 void 4354 pmap_pinit_pml5(vm_page_t pml5pg) 4355 { 4356 pml5_entry_t *pm_pml5; 4357 4358 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4359 4360 /* 4361 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4362 * entering all existing kernel mappings into level 5 table. 4363 */ 4364 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4365 X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4366 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4367 4368 /* 4369 * Install self-referential address mapping entry. 
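 *
 * As a rough sketch (not the exact amd64 macros), the recursive entry
 * makes every page-table page of this pmap visible through a fixed
 * window of virtual addresses, so the PTE for a given "va" can be
 * located without an explicit walk from the root; recursive_window_base
 * below is a placeholder name, not a real symbol:
 *
 *	pte = recursive_window_base + (va >> PAGE_SHIFT);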
4370 */ 4371 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4372 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | 4373 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4374 } 4375 4376 static void 4377 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4378 { 4379 pml4_entry_t *pm_pml4u; 4380 int i; 4381 4382 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4383 for (i = 0; i < NPML4EPG; i++) 4384 pm_pml4u[i] = pti_pml4[i]; 4385 } 4386 4387 static void 4388 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4389 { 4390 pml5_entry_t *pm_pml5u; 4391 4392 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4393 pagezero(pm_pml5u); 4394 4395 /* 4396 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4397 * table, entering all kernel mappings needed for usermode 4398 * into level 5 table. 4399 */ 4400 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4401 pmap_kextract((vm_offset_t)pti_pml4) | 4402 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | 4403 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); 4404 } 4405 4406 /* Allocate a page table page and do related bookkeeping */ 4407 static vm_page_t 4408 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4409 { 4410 vm_page_t m; 4411 4412 m = vm_page_alloc_noobj(flags); 4413 if (__predict_false(m == NULL)) 4414 return (NULL); 4415 m->pindex = pindex; 4416 pmap_pt_page_count_adj(pmap, 1); 4417 return (m); 4418 } 4419 4420 static void 4421 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4422 { 4423 /* 4424 * This function assumes the page will need to be unwired, 4425 * even though the counterpart allocation in pmap_alloc_pt_page() 4426 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4427 * of pmap_free_pt_page() require unwiring. The case in which 4428 * a PT page doesn't require unwiring because its ref_count has 4429 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4430 */ 4431 vm_page_unwire_noq(m); 4432 if (zerofilled) 4433 vm_page_free_zero(m); 4434 else 4435 vm_page_free(m); 4436 4437 pmap_pt_page_count_adj(pmap, -1); 4438 } 4439 4440 _Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp"); 4441 4442 /* 4443 * Initialize a preallocated and zeroed pmap structure, 4444 * such as one in a vmspace structure. 4445 */ 4446 int 4447 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4448 { 4449 vm_page_t pmltop_pg, pmltop_pgu; 4450 vm_paddr_t pmltop_phys; 4451 4452 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4453 4454 /* 4455 * Allocate the page directory page. Pass NULL instead of a 4456 * pointer to the pmap here to avoid calling 4457 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4458 * since that requires pmap lock. Instead do the accounting 4459 * manually. 4460 * 4461 * Note that final call to pmap_remove() optimization that 4462 * checks for zero resident_count is basically disabled by 4463 * accounting for top-level page. But the optimization was 4464 * not effective since we started using non-managed mapping of 4465 * the shared page. 
4466 */ 4467 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4468 VM_ALLOC_WAITOK); 4469 pmap_pt_page_count_pinit(pmap, 1); 4470 4471 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4472 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4473 4474 if (pmap_pcid_enabled) { 4475 if (pmap->pm_pcidp == NULL) 4476 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, 4477 M_WAITOK); 4478 pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0); 4479 } 4480 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4481 pmap->pm_ucr3 = PMAP_NO_CR3; 4482 pmap->pm_pmltopu = NULL; 4483 4484 pmap->pm_type = pm_type; 4485 4486 /* 4487 * Do not install the host kernel mappings in the nested page 4488 * tables. These mappings are meaningless in the guest physical 4489 * address space. 4490 * Install minimal kernel mappings in PTI case. 4491 */ 4492 switch (pm_type) { 4493 case PT_X86: 4494 pmap->pm_cr3 = pmltop_phys; 4495 if (pmap_is_la57(pmap)) 4496 pmap_pinit_pml5(pmltop_pg); 4497 else 4498 pmap_pinit_pml4(pmltop_pg); 4499 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4500 /* 4501 * As with pmltop_pg, pass NULL instead of a 4502 * pointer to the pmap to ensure that the PTI 4503 * page counted explicitly. 4504 */ 4505 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4506 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4507 pmap_pt_page_count_pinit(pmap, 1); 4508 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4509 VM_PAGE_TO_PHYS(pmltop_pgu)); 4510 if (pmap_is_la57(pmap)) 4511 pmap_pinit_pml5_pti(pmltop_pgu); 4512 else 4513 pmap_pinit_pml4_pti(pmltop_pgu); 4514 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4515 } 4516 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4517 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4518 pkru_free_range, pmap, M_NOWAIT); 4519 } 4520 break; 4521 case PT_EPT: 4522 case PT_RVI: 4523 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4524 break; 4525 } 4526 4527 vm_radix_init(&pmap->pm_root); 4528 CPU_ZERO(&pmap->pm_active); 4529 TAILQ_INIT(&pmap->pm_pvchunk); 4530 pmap->pm_flags = flags; 4531 pmap->pm_eptgen = 0; 4532 4533 return (1); 4534 } 4535 4536 int 4537 pmap_pinit(pmap_t pmap) 4538 { 4539 4540 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4541 } 4542 4543 static void 4544 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4545 { 4546 vm_page_t mpg; 4547 struct spglist free; 4548 4549 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4550 if (mpg->ref_count != 0) 4551 return; 4552 SLIST_INIT(&free); 4553 _pmap_unwire_ptp(pmap, va, mpg, &free); 4554 pmap_invalidate_page(pmap, va); 4555 vm_page_free_pages_toq(&free, true); 4556 } 4557 4558 static pml4_entry_t * 4559 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4560 bool addref) 4561 { 4562 vm_pindex_t pml5index; 4563 pml5_entry_t *pml5; 4564 pml4_entry_t *pml4; 4565 vm_page_t pml4pg; 4566 pt_entry_t PG_V; 4567 bool allocated; 4568 4569 if (!pmap_is_la57(pmap)) 4570 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4571 4572 PG_V = pmap_valid_bit(pmap); 4573 pml5index = pmap_pml5e_index(va); 4574 pml5 = &pmap->pm_pmltop[pml5index]; 4575 if ((*pml5 & PG_V) == 0) { 4576 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4577 va) == NULL) 4578 return (NULL); 4579 allocated = true; 4580 } else { 4581 allocated = false; 4582 } 4583 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4584 pml4 = &pml4[pmap_pml4e_index(va)]; 4585 if ((*pml4 & PG_V) == 0) { 4586 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4587 if (allocated && !addref) 4588 pml4pg->ref_count--; 4589 else if 
(!allocated && addref)
4590 pml4pg->ref_count++;
4591 }
4592 return (pml4);
4593 }
4594
4595 static pdp_entry_t *
4596 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
4597 bool addref)
4598 {
4599 vm_page_t pdppg;
4600 pml4_entry_t *pml4;
4601 pdp_entry_t *pdp;
4602 pt_entry_t PG_V;
4603 bool allocated;
4604
4605 PG_V = pmap_valid_bit(pmap);
4606
4607 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
4608 if (pml4 == NULL)
4609 return (NULL);
4610
4611 if ((*pml4 & PG_V) == 0) {
4612 /* Have to allocate a new pdp, recurse */
4613 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp,
4614 va) == NULL) {
4615 if (pmap_is_la57(pmap))
4616 pmap_allocpte_free_unref(pmap, va,
4617 pmap_pml5e(pmap, va));
4618 return (NULL);
4619 }
4620 allocated = true;
4621 } else {
4622 allocated = false;
4623 }
4624 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
4625 pdp = &pdp[pmap_pdpe_index(va)];
4626 if ((*pdp & PG_V) == 0) {
4627 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
4628 if (allocated && !addref)
4629 pdppg->ref_count--;
4630 else if (!allocated && addref)
4631 pdppg->ref_count++;
4632 }
4633 return (pdp);
4634 }
4635
4636 /*
4637 * The ptepindexes, i.e. page indices, of the page table pages encountered
4638 * while translating virtual address va are defined as follows:
4639 * - for the page table page (last level),
4640 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
4641 * in other words, it is just the index of the PDE that maps the page
4642 * table page.
4643 * - for the page directory page,
4644 * ptepindex = NUPDE (number of userland PD entries) +
4645 * (pmap_pde_index(va) >> NPDEPGSHIFT)
4646 * i.e. index of PDPE is put after the last index of PDE,
4647 * - for the page directory pointer page,
4648 * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
4649 * NPML4EPGSHIFT)),
4650 * i.e. index of pml4e is put after the last index of PDPE,
4651 * - for the PML4 page (if LA57 mode is enabled),
4652 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
4653 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)),
4654 * i.e. index of pml5e is put after the last index of PML4E.
4655 *
4656 * Define an order on the paging entries, where all entries of the
4657 * same height are put together, then heights are put from deepest to
4658 * root. Then ptepindex is the sequential number of the
4659 * corresponding paging entry in this order.
4660 *
4661 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of
4662 * LA57 paging structures even in LA48 paging mode. Moreover, the
4663 * ptepindexes are calculated as if the paging structures were 5-level
4664 * regardless of the actual mode of operation.
4665 *
4666 * The root page at PML4/PML5 does not participate in this indexing scheme,
4667 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte().
4668 */
4669 static vm_page_t
4670 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
4671 vm_offset_t va)
4672 {
4673 vm_pindex_t pml5index, pml4index;
4674 pml5_entry_t *pml5, *pml5u;
4675 pml4_entry_t *pml4, *pml4u;
4676 pdp_entry_t *pdp;
4677 pd_entry_t *pd;
4678 vm_page_t m, pdpg;
4679 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4680
4681 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4682
4683 PG_A = pmap_accessed_bit(pmap);
4684 PG_M = pmap_modified_bit(pmap);
4685 PG_V = pmap_valid_bit(pmap);
4686 PG_RW = pmap_rw_bit(pmap);
4687
4688 /*
4689 * Allocate a page table page.
4690 */ 4691 m = pmap_alloc_pt_page(pmap, ptepindex, 4692 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4693 if (m == NULL) 4694 return (NULL); 4695 4696 /* 4697 * Map the pagetable page into the process address space, if 4698 * it isn't already there. 4699 */ 4700 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4701 MPASS(pmap_is_la57(pmap)); 4702 4703 pml5index = pmap_pml5e_index(va); 4704 pml5 = &pmap->pm_pmltop[pml5index]; 4705 KASSERT((*pml5 & PG_V) == 0, 4706 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4707 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4708 4709 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4710 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4711 *pml5 |= pg_nx; 4712 4713 pml5u = &pmap->pm_pmltopu[pml5index]; 4714 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4715 PG_A | PG_M; 4716 } 4717 } else if (ptepindex >= NUPDE + NUPDPE) { 4718 pml4index = pmap_pml4e_index(va); 4719 /* Wire up a new PDPE page */ 4720 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4721 if (pml4 == NULL) { 4722 pmap_free_pt_page(pmap, m, true); 4723 return (NULL); 4724 } 4725 KASSERT((*pml4 & PG_V) == 0, 4726 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4727 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4728 4729 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4730 pml4index < NUPML4E) { 4731 /* 4732 * PTI: Make all user-space mappings in the 4733 * kernel-mode page table no-execute so that 4734 * we detect any programming errors that leave 4735 * the kernel-mode page table active on return 4736 * to user space. 4737 */ 4738 if (pmap->pm_ucr3 != PMAP_NO_CR3) 4739 *pml4 |= pg_nx; 4740 4741 pml4u = &pmap->pm_pmltopu[pml4index]; 4742 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4743 PG_A | PG_M; 4744 } 4745 } else if (ptepindex >= NUPDE) { 4746 /* Wire up a new PDE page */ 4747 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4748 if (pdp == NULL) { 4749 pmap_free_pt_page(pmap, m, true); 4750 return (NULL); 4751 } 4752 KASSERT((*pdp & PG_V) == 0, 4753 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4754 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4755 } else { 4756 /* Wire up a new PTE page */ 4757 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4758 if (pdp == NULL) { 4759 pmap_free_pt_page(pmap, m, true); 4760 return (NULL); 4761 } 4762 if ((*pdp & PG_V) == 0) { 4763 /* Have to allocate a new pd, recurse */ 4764 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4765 lockp, va) == NULL) { 4766 pmap_allocpte_free_unref(pmap, va, 4767 pmap_pml4e(pmap, va)); 4768 pmap_free_pt_page(pmap, m, true); 4769 return (NULL); 4770 } 4771 } else { 4772 /* Add reference to the pd page */ 4773 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4774 pdpg->ref_count++; 4775 } 4776 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4777 4778 /* Now we know where the page directory page is */ 4779 pd = &pd[pmap_pde_index(va)]; 4780 KASSERT((*pd & PG_V) == 0, 4781 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4782 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4783 } 4784 4785 return (m); 4786 } 4787 4788 /* 4789 * This routine is called if the desired page table page does not exist. 4790 * 4791 * If page table page allocation fails, this routine may sleep before 4792 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4793 * occurs right before returning to the caller. This way, we never 4794 * drop pmap lock to sleep while a page table page has ref_count == 0, 4795 * which prevents the page from being freed under us. 
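 *
 * Callers that pass a lock pointer therefore retry after the sleep,
 * e.g. (a sketch of the pattern used by pmap_allocpte() below):
 *
 *	retry:
 *		m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
 *		if (m == NULL && lockp != NULL)
 *			goto retry;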
4796 */ 4797 static vm_page_t 4798 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4799 vm_offset_t va) 4800 { 4801 vm_page_t m; 4802 4803 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4804 if (m == NULL && lockp != NULL) { 4805 RELEASE_PV_LIST_LOCK(lockp); 4806 PMAP_UNLOCK(pmap); 4807 PMAP_ASSERT_NOT_IN_DI(); 4808 vm_wait(NULL); 4809 PMAP_LOCK(pmap); 4810 } 4811 return (m); 4812 } 4813 4814 static pd_entry_t * 4815 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4816 struct rwlock **lockp) 4817 { 4818 pdp_entry_t *pdpe, PG_V; 4819 pd_entry_t *pde; 4820 vm_page_t pdpg; 4821 vm_pindex_t pdpindex; 4822 4823 PG_V = pmap_valid_bit(pmap); 4824 4825 retry: 4826 pdpe = pmap_pdpe(pmap, va); 4827 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4828 pde = pmap_pdpe_to_pde(pdpe, va); 4829 if (va < VM_MAXUSER_ADDRESS) { 4830 /* Add a reference to the pd page. */ 4831 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4832 pdpg->ref_count++; 4833 } else 4834 pdpg = NULL; 4835 } else if (va < VM_MAXUSER_ADDRESS) { 4836 /* Allocate a pd page. */ 4837 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4838 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4839 if (pdpg == NULL) { 4840 if (lockp != NULL) 4841 goto retry; 4842 else 4843 return (NULL); 4844 } 4845 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4846 pde = &pde[pmap_pde_index(va)]; 4847 } else 4848 panic("pmap_alloc_pde: missing page table page for va %#lx", 4849 va); 4850 *pdpgp = pdpg; 4851 return (pde); 4852 } 4853 4854 static vm_page_t 4855 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4856 { 4857 vm_pindex_t ptepindex; 4858 pd_entry_t *pd, PG_V; 4859 vm_page_t m; 4860 4861 PG_V = pmap_valid_bit(pmap); 4862 4863 /* 4864 * Calculate pagetable page index 4865 */ 4866 ptepindex = pmap_pde_pindex(va); 4867 retry: 4868 /* 4869 * Get the page directory entry 4870 */ 4871 pd = pmap_pde(pmap, va); 4872 4873 /* 4874 * This supports switching from a 2MB page to a 4875 * normal 4K page. 4876 */ 4877 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4878 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4879 /* 4880 * Invalidation of the 2MB page mapping may have caused 4881 * the deallocation of the underlying PD page. 4882 */ 4883 pd = NULL; 4884 } 4885 } 4886 4887 /* 4888 * If the page table page is mapped, we just increment the 4889 * hold count, and activate it. 4890 */ 4891 if (pd != NULL && (*pd & PG_V) != 0) { 4892 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4893 m->ref_count++; 4894 } else { 4895 /* 4896 * Here if the pte page isn't mapped, or if it has been 4897 * deallocated. 4898 */ 4899 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4900 if (m == NULL && lockp != NULL) 4901 goto retry; 4902 } 4903 return (m); 4904 } 4905 4906 /*************************************************** 4907 * Pmap allocation/deallocation routines. 4908 ***************************************************/ 4909 4910 /* 4911 * Release any resources held by the given physical map. 4912 * Called when a pmap initialized by pmap_pinit is being released. 4913 * Should only be called if the map contains no valid mappings. 
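 *
 * Typical lifecycle, in outline (a sketch; vmspace teardown is the
 * usual caller, and all mappings must already have been removed, e.g.
 * by pmap_remove_pages()):
 *
 *	pmap_pinit(pmap);
 *	... pmap_enter(), pmap_remove(), etc. ...
 *	pmap_remove_pages(pmap);
 *	pmap_release(pmap);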
4914 */ 4915 void 4916 pmap_release(pmap_t pmap) 4917 { 4918 vm_page_t m; 4919 int i; 4920 4921 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4922 ("pmap_release: pmap %p has reserved page table page(s)", 4923 pmap)); 4924 KASSERT(CPU_EMPTY(&pmap->pm_active), 4925 ("releasing active pmap %p", pmap)); 4926 4927 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4928 4929 if (pmap_is_la57(pmap)) { 4930 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4931 pmap->pm_pmltop[PML5PML5I] = 0; 4932 } else { 4933 for (i = 0; i < NKPML4E; i++) /* KVA */ 4934 pmap->pm_pmltop[KPML4BASE + i] = 0; 4935 #ifdef KASAN 4936 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4937 pmap->pm_pmltop[KASANPML4I + i] = 0; 4938 #endif 4939 #ifdef KMSAN 4940 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4941 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4942 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ 4943 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4944 #endif 4945 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4946 pmap->pm_pmltop[DMPML4I + i] = 0; 4947 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4948 for (i = 0; i < lm_ents; i++) /* Large Map */ 4949 pmap->pm_pmltop[LMSPML4I + i] = 0; 4950 } 4951 4952 pmap_free_pt_page(NULL, m, true); 4953 pmap_pt_page_count_pinit(pmap, -1); 4954 4955 if (pmap->pm_pmltopu != NULL) { 4956 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4957 pm_pmltopu)); 4958 pmap_free_pt_page(NULL, m, false); 4959 pmap_pt_page_count_pinit(pmap, -1); 4960 } 4961 if (pmap->pm_type == PT_X86 && 4962 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4963 rangeset_fini(&pmap->pm_pkru); 4964 4965 KASSERT(pmap->pm_stats.resident_count == 0, 4966 ("pmap_release: pmap %p resident count %ld != 0", 4967 pmap, pmap->pm_stats.resident_count)); 4968 } 4969 4970 static int 4971 kvm_size(SYSCTL_HANDLER_ARGS) 4972 { 4973 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4974 4975 return sysctl_handle_long(oidp, &ksize, 0, req); 4976 } 4977 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4978 0, 0, kvm_size, "LU", 4979 "Size of KVM"); 4980 4981 static int 4982 kvm_free(SYSCTL_HANDLER_ARGS) 4983 { 4984 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 4985 4986 return sysctl_handle_long(oidp, &kfree, 0, req); 4987 } 4988 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 4989 0, 0, kvm_free, "LU", 4990 "Amount of KVM free"); 4991 4992 #ifdef KMSAN 4993 static void 4994 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 4995 { 4996 pdp_entry_t *pdpe; 4997 pd_entry_t *pde; 4998 pt_entry_t *pte; 4999 vm_paddr_t dummypa, dummypd, dummypt; 5000 int i, npde, npdpg; 5001 5002 npdpg = howmany(size, NBPDP); 5003 npde = size / NBPDR; 5004 5005 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 5006 pagezero((void *)PHYS_TO_DMAP(dummypa)); 5007 5008 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 5009 pagezero((void *)PHYS_TO_DMAP(dummypt)); 5010 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 5011 for (i = 0; i < npdpg; i++) 5012 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 5013 5014 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 5015 for (i = 0; i < NPTEPG; i++) 5016 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 5017 X86_PG_A | X86_PG_M | pg_nx); 5018 5019 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 5020 for (i = 0; i < npde; i++) 5021 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 5022 5023 pdpe = (pdp_entry_t 
*)PHYS_TO_DMAP(pdppa); 5024 for (i = 0; i < npdpg; i++) 5025 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 5026 X86_PG_RW | pg_nx); 5027 } 5028 5029 static void 5030 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 5031 { 5032 vm_size_t size; 5033 5034 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 5035 5036 /* 5037 * The end of the page array's KVA region is 2MB aligned, see 5038 * kmem_init(). 5039 */ 5040 size = round_2mpage(end) - start; 5041 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 5042 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 5043 } 5044 #endif 5045 5046 /* 5047 * Allocate physical memory for the vm_page array and map it into KVA, 5048 * attempting to back the vm_pages with domain-local memory. 5049 */ 5050 void 5051 pmap_page_array_startup(long pages) 5052 { 5053 pdp_entry_t *pdpe; 5054 pd_entry_t *pde, newpdir; 5055 vm_offset_t va, start, end; 5056 vm_paddr_t pa; 5057 long pfn; 5058 int domain, i; 5059 5060 vm_page_array_size = pages; 5061 5062 start = VM_MIN_KERNEL_ADDRESS; 5063 end = start + pages * sizeof(struct vm_page); 5064 for (va = start; va < end; va += NBPDR) { 5065 pfn = first_page + (va - start) / sizeof(struct vm_page); 5066 domain = vm_phys_domain(ptoa(pfn)); 5067 pdpe = pmap_pdpe(kernel_pmap, va); 5068 if ((*pdpe & X86_PG_V) == 0) { 5069 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 5070 dump_add_page(pa); 5071 pagezero((void *)PHYS_TO_DMAP(pa)); 5072 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 5073 X86_PG_A | X86_PG_M); 5074 } 5075 pde = pmap_pdpe_to_pde(pdpe, va); 5076 if ((*pde & X86_PG_V) != 0) 5077 panic("Unexpected pde"); 5078 pa = vm_phys_early_alloc(domain, NBPDR); 5079 for (i = 0; i < NPDEPG; i++) 5080 dump_add_page(pa + i * PAGE_SIZE); 5081 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 5082 X86_PG_M | PG_PS | pg_g | pg_nx); 5083 pde_store(pde, newpdir); 5084 } 5085 vm_page_array = (vm_page_t)start; 5086 5087 #ifdef KMSAN 5088 pmap_kmsan_page_array_startup(start, end); 5089 #endif 5090 } 5091 5092 /* 5093 * grow the number of kernel page table entries, if needed 5094 */ 5095 void 5096 pmap_growkernel(vm_offset_t addr) 5097 { 5098 vm_paddr_t paddr; 5099 vm_page_t nkpg; 5100 pd_entry_t *pde, newpdir; 5101 pdp_entry_t *pdpe; 5102 vm_offset_t end; 5103 5104 TSENTER(); 5105 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5106 5107 /* 5108 * The kernel map covers two distinct regions of KVA: that used 5109 * for dynamic kernel memory allocations, and the uppermost 2GB 5110 * of the virtual address space. The latter is used to map the 5111 * kernel and loadable kernel modules. This scheme enables the 5112 * use of a special code generation model for kernel code which 5113 * takes advantage of compact addressing modes in machine code. 5114 * 5115 * Both regions grow upwards; to avoid wasting memory, the gap 5116 * in between is unmapped. If "addr" is above "KERNBASE", the 5117 * kernel's region is grown, otherwise the kmem region is grown. 5118 * 5119 * The correctness of this action is based on the following 5120 * argument: vm_map_insert() allocates contiguous ranges of the 5121 * kernel virtual address space. It calls this function if a range 5122 * ends after "kernel_vm_end". If the kernel is mapped between 5123 * "kernel_vm_end" and "addr", then the range cannot begin at 5124 * "kernel_vm_end". In fact, its beginning address cannot be less 5125 * than the kernel. 
Thus, there is no immediate need to allocate 5126 * any new kernel page table pages between "kernel_vm_end" and 5127 * "KERNBASE". 5128 */ 5129 if (KERNBASE < addr) { 5130 end = KERNBASE + nkpt * NBPDR; 5131 if (end == 0) { 5132 TSEXIT(); 5133 return; 5134 } 5135 } else { 5136 end = kernel_vm_end; 5137 } 5138 5139 addr = roundup2(addr, NBPDR); 5140 if (addr - 1 >= vm_map_max(kernel_map)) 5141 addr = vm_map_max(kernel_map); 5142 if (addr <= end) { 5143 /* 5144 * The grown region is already mapped, so there is 5145 * nothing to do. 5146 */ 5147 TSEXIT(); 5148 return; 5149 } 5150 5151 kasan_shadow_map(end, addr - end); 5152 kmsan_shadow_map(end, addr - end); 5153 while (end < addr) { 5154 pdpe = pmap_pdpe(kernel_pmap, end); 5155 if ((*pdpe & X86_PG_V) == 0) { 5156 nkpg = pmap_alloc_pt_page(kernel_pmap, 5157 pmap_pdpe_pindex(end), VM_ALLOC_WIRED | 5158 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5159 if (nkpg == NULL) 5160 panic("pmap_growkernel: no memory to grow kernel"); 5161 paddr = VM_PAGE_TO_PHYS(nkpg); 5162 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5163 X86_PG_A | X86_PG_M); 5164 continue; /* try again */ 5165 } 5166 pde = pmap_pdpe_to_pde(pdpe, end); 5167 if ((*pde & X86_PG_V) != 0) { 5168 end = (end + NBPDR) & ~PDRMASK; 5169 if (end - 1 >= vm_map_max(kernel_map)) { 5170 end = vm_map_max(kernel_map); 5171 break; 5172 } 5173 continue; 5174 } 5175 5176 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), 5177 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); 5178 if (nkpg == NULL) 5179 panic("pmap_growkernel: no memory to grow kernel"); 5180 paddr = VM_PAGE_TO_PHYS(nkpg); 5181 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5182 pde_store(pde, newpdir); 5183 5184 end = (end + NBPDR) & ~PDRMASK; 5185 if (end - 1 >= vm_map_max(kernel_map)) { 5186 end = vm_map_max(kernel_map); 5187 break; 5188 } 5189 } 5190 5191 if (end <= KERNBASE) 5192 kernel_vm_end = end; 5193 else 5194 nkpt = howmany(end - KERNBASE, NBPDR); 5195 TSEXIT(); 5196 } 5197 5198 /*************************************************** 5199 * page management routines. 5200 ***************************************************/ 5201 5202 static const uint64_t pc_freemask[_NPCM] = { 5203 [0 ... 
_NPCM - 2] = PC_FREEN,
5204 [_NPCM - 1] = PC_FREEL
5205 };
5206
5207 #ifdef PV_STATS
5208
5209 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count);
5210 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
5211 &pc_chunk_count, "Current number of pv entry chunks");
5212
5213 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs);
5214 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
5215 &pc_chunk_allocs, "Total number of pv entry chunks allocated");
5216
5217 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees);
5218 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
5219 &pc_chunk_frees, "Total number of pv entry chunks freed");
5220
5221 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail);
5222 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
5223 &pc_chunk_tryfail,
5224 "Number of failed attempts to get a pv entry chunk page");
5225
5226 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees);
5227 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
5228 &pv_entry_frees, "Total number of pv entries freed");
5229
5230 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs);
5231 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
5232 &pv_entry_allocs, "Total number of pv entries allocated");
5233
5234 static COUNTER_U64_DEFINE_EARLY(pv_entry_count);
5235 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
5236 &pv_entry_count, "Current number of pv entries");
5237
5238 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare);
5239 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
5240 &pv_entry_spare, "Current number of spare pv entries");
5241 #endif
5242
5243 static void
5244 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
5245 {
5246
5247 if (pmap == NULL)
5248 return;
5249 pmap_invalidate_all(pmap);
5250 if (pmap != locked_pmap)
5251 PMAP_UNLOCK(pmap);
5252 if (start_di)
5253 pmap_delayed_invl_finish();
5254 }
5255
5256 /*
5257 * We are in a serious low memory condition. Resort to
5258 * drastic measures to free some pages so we can allocate
5259 * another pv entry chunk.
5260 *
5261 * Returns NULL if PV entries were reclaimed from the specified pmap.
5262 *
5263 * We do not, however, unmap 2mpages because subsequent accesses will
5264 * allocate per-page pv entries until repromotion occurs, thereby
5265 * exacerbating the shortage of free pv entries.
5266 */
5267 static vm_page_t
5268 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
5269 {
5270 struct pv_chunks_list *pvc;
5271 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
5272 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
5273 struct md_page *pvh;
5274 pd_entry_t *pde;
5275 pmap_t next_pmap, pmap;
5276 pt_entry_t *pte, tpte;
5277 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
5278 pv_entry_t pv;
5279 vm_offset_t va;
5280 vm_page_t m, m_pc;
5281 struct spglist free;
5282 uint64_t inuse;
5283 int bit, field, freed;
5284 bool start_di, restart;
5285
5286 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
5287 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
5288 pmap = NULL;
5289 m_pc = NULL;
5290 PG_G = PG_A = PG_M = PG_RW = 0;
5291 SLIST_INIT(&free);
5292 bzero(&pc_marker_b, sizeof(pc_marker_b));
5293 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
5294 pc_marker = (struct pv_chunk *)&pc_marker_b;
5295 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
5296
5297 /*
5298 * A delayed invalidation block should already be active if
5299 * pmap_advise() or pmap_remove() called this function by way
5300 * of pmap_demote_pde_locked().
5301 */
5302 start_di = pmap_not_in_di();
5303
5304 pvc = &pv_chunks[domain];
5305 mtx_lock(&pvc->pvc_lock);
5306 pvc->active_reclaims++;
5307 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
5308 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
5309 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
5310 SLIST_EMPTY(&free)) {
5311 next_pmap = pc->pc_pmap;
5312 if (next_pmap == NULL) {
5313 /*
5314 * The next chunk is a marker. However, it is
5315 * not our marker, so active_reclaims must be
5316 * > 1. Consequently, the next_chunk code
5317 * will not rotate the pv_chunks list.
5318 */
5319 goto next_chunk;
5320 }
5321 mtx_unlock(&pvc->pvc_lock);
5322
5323 /*
5324 * A pv_chunk can only be removed from the pc_lru list
5325 * when both the per-domain pvc->pvc_lock is owned and the
5326 * corresponding pmap is locked.
5327 */
5328 if (pmap != next_pmap) {
5329 restart = false;
5330 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
5331 start_di);
5332 pmap = next_pmap;
5333 /* Avoid deadlock and lock recursion. */
5334 if (pmap > locked_pmap) {
5335 RELEASE_PV_LIST_LOCK(lockp);
5336 PMAP_LOCK(pmap);
5337 if (start_di)
5338 pmap_delayed_invl_start();
5339 mtx_lock(&pvc->pvc_lock);
5340 restart = true;
5341 } else if (pmap != locked_pmap) {
5342 if (PMAP_TRYLOCK(pmap)) {
5343 if (start_di)
5344 pmap_delayed_invl_start();
5345 mtx_lock(&pvc->pvc_lock);
5346 restart = true;
5347 } else {
5348 pmap = NULL; /* pmap is not locked */
5349 mtx_lock(&pvc->pvc_lock);
5350 pc = TAILQ_NEXT(pc_marker, pc_lru);
5351 if (pc == NULL ||
5352 pc->pc_pmap != next_pmap)
5353 continue;
5354 goto next_chunk;
5355 }
5356 } else if (start_di)
5357 pmap_delayed_invl_start();
5358 PG_G = pmap_global_bit(pmap);
5359 PG_A = pmap_accessed_bit(pmap);
5360 PG_M = pmap_modified_bit(pmap);
5361 PG_RW = pmap_rw_bit(pmap);
5362 if (restart)
5363 continue;
5364 }
5365
5366 /*
5367 * Destroy every non-wired, 4 KB page mapping in the chunk.
5368 */ 5369 freed = 0; 5370 for (field = 0; field < _NPCM; field++) { 5371 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5372 inuse != 0; inuse &= ~(1UL << bit)) { 5373 bit = bsfq(inuse); 5374 pv = &pc->pc_pventry[field * 64 + bit]; 5375 va = pv->pv_va; 5376 pde = pmap_pde(pmap, va); 5377 if ((*pde & PG_PS) != 0) 5378 continue; 5379 pte = pmap_pde_to_pte(pde, va); 5380 if ((*pte & PG_W) != 0) 5381 continue; 5382 tpte = pte_load_clear(pte); 5383 if ((tpte & PG_G) != 0) 5384 pmap_invalidate_page(pmap, va); 5385 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5386 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5387 vm_page_dirty(m); 5388 if ((tpte & PG_A) != 0) 5389 vm_page_aflag_set(m, PGA_REFERENCED); 5390 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5391 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5392 m->md.pv_gen++; 5393 if (TAILQ_EMPTY(&m->md.pv_list) && 5394 (m->flags & PG_FICTITIOUS) == 0) { 5395 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5396 if (TAILQ_EMPTY(&pvh->pv_list)) { 5397 vm_page_aflag_clear(m, 5398 PGA_WRITEABLE); 5399 } 5400 } 5401 pmap_delayed_invl_page(m); 5402 pc->pc_map[field] |= 1UL << bit; 5403 pmap_unuse_pt(pmap, va, *pde, &free); 5404 freed++; 5405 } 5406 } 5407 if (freed == 0) { 5408 mtx_lock(&pvc->pvc_lock); 5409 goto next_chunk; 5410 } 5411 /* Every freed mapping is for a 4 KB page. */ 5412 pmap_resident_count_adj(pmap, -freed); 5413 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5414 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5415 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5416 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5417 if (pc_is_free(pc)) { 5418 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5419 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5420 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5421 /* Entire chunk is free; return it. */ 5422 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5423 dump_drop_page(m_pc->phys_addr); 5424 mtx_lock(&pvc->pvc_lock); 5425 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5426 break; 5427 } 5428 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5429 mtx_lock(&pvc->pvc_lock); 5430 /* One freed pv entry in locked_pmap is sufficient. */ 5431 if (pmap == locked_pmap) 5432 break; 5433 next_chunk: 5434 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5435 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5436 if (pvc->active_reclaims == 1 && pmap != NULL) { 5437 /* 5438 * Rotate the pv chunks list so that we do not 5439 * scan the same pv chunks that could not be 5440 * freed (because they contained a wired 5441 * and/or superpage mapping) on every 5442 * invocation of reclaim_pv_chunk(). 5443 */ 5444 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5445 MPASS(pc->pc_pmap != NULL); 5446 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5447 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5448 } 5449 } 5450 } 5451 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5452 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5453 pvc->active_reclaims--; 5454 mtx_unlock(&pvc->pvc_lock); 5455 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5456 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5457 m_pc = SLIST_FIRST(&free); 5458 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5459 /* Recycle a freed page table page. 
*/ 5460 m_pc->ref_count = 1; 5461 } 5462 vm_page_free_pages_toq(&free, true); 5463 return (m_pc); 5464 } 5465 5466 static vm_page_t 5467 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5468 { 5469 vm_page_t m; 5470 int i, domain; 5471 5472 domain = PCPU_GET(domain); 5473 for (i = 0; i < vm_ndomains; i++) { 5474 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5475 if (m != NULL) 5476 break; 5477 domain = (domain + 1) % vm_ndomains; 5478 } 5479 5480 return (m); 5481 } 5482 5483 /* 5484 * free the pv_entry back to the free list 5485 */ 5486 static void 5487 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5488 { 5489 struct pv_chunk *pc; 5490 int idx, field, bit; 5491 5492 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5493 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5494 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5495 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5496 pc = pv_to_chunk(pv); 5497 idx = pv - &pc->pc_pventry[0]; 5498 field = idx / 64; 5499 bit = idx % 64; 5500 pc->pc_map[field] |= 1ul << bit; 5501 if (!pc_is_free(pc)) { 5502 /* 98% of the time, pc is already at the head of the list. */ 5503 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5504 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5505 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5506 } 5507 return; 5508 } 5509 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5510 free_pv_chunk(pc); 5511 } 5512 5513 static void 5514 free_pv_chunk_dequeued(struct pv_chunk *pc) 5515 { 5516 vm_page_t m; 5517 5518 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5519 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5520 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5521 counter_u64_add(pv_page_count, -1); 5522 /* entire chunk is free, return it */ 5523 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5524 dump_drop_page(m->phys_addr); 5525 vm_page_unwire_noq(m); 5526 vm_page_free(m); 5527 } 5528 5529 static void 5530 free_pv_chunk(struct pv_chunk *pc) 5531 { 5532 struct pv_chunks_list *pvc; 5533 5534 pvc = &pv_chunks[pc_to_domain(pc)]; 5535 mtx_lock(&pvc->pvc_lock); 5536 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5537 mtx_unlock(&pvc->pvc_lock); 5538 free_pv_chunk_dequeued(pc); 5539 } 5540 5541 static void 5542 free_pv_chunk_batch(struct pv_chunklist *batch) 5543 { 5544 struct pv_chunks_list *pvc; 5545 struct pv_chunk *pc, *npc; 5546 int i; 5547 5548 for (i = 0; i < vm_ndomains; i++) { 5549 if (TAILQ_EMPTY(&batch[i])) 5550 continue; 5551 pvc = &pv_chunks[i]; 5552 mtx_lock(&pvc->pvc_lock); 5553 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5554 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5555 } 5556 mtx_unlock(&pvc->pvc_lock); 5557 } 5558 5559 for (i = 0; i < vm_ndomains; i++) { 5560 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5561 free_pv_chunk_dequeued(pc); 5562 } 5563 } 5564 } 5565 5566 /* 5567 * Returns a new PV entry, allocating a new PV chunk from the system when 5568 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5569 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5570 * returned. 5571 * 5572 * The given PV list lock may be released. 
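 * In particular, a failed chunk allocation may fall back to
 * reclaim_pv_chunk(), which can drop the PV list lock while it scans
 * other pmaps, so callers should not rely on state protected only by
 * that lock across this call.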
5573 */ 5574 static pv_entry_t 5575 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5576 { 5577 struct pv_chunks_list *pvc; 5578 int bit, field; 5579 pv_entry_t pv; 5580 struct pv_chunk *pc; 5581 vm_page_t m; 5582 5583 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5584 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5585 retry: 5586 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5587 if (pc != NULL) { 5588 for (field = 0; field < _NPCM; field++) { 5589 if (pc->pc_map[field]) { 5590 bit = bsfq(pc->pc_map[field]); 5591 break; 5592 } 5593 } 5594 if (field < _NPCM) { 5595 pv = &pc->pc_pventry[field * 64 + bit]; 5596 pc->pc_map[field] &= ~(1ul << bit); 5597 /* If this was the last item, move it to tail */ 5598 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 5599 pc->pc_map[2] == 0) { 5600 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5601 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5602 pc_list); 5603 } 5604 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5605 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5606 return (pv); 5607 } 5608 } 5609 /* No free items, allocate another chunk */ 5610 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5611 if (m == NULL) { 5612 if (lockp == NULL) { 5613 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5614 return (NULL); 5615 } 5616 m = reclaim_pv_chunk(pmap, lockp); 5617 if (m == NULL) 5618 goto retry; 5619 } else 5620 counter_u64_add(pv_page_count, 1); 5621 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5622 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5623 dump_add_page(m->phys_addr); 5624 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5625 pc->pc_pmap = pmap; 5626 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 5627 pc->pc_map[1] = PC_FREEN; 5628 pc->pc_map[2] = PC_FREEL; 5629 pvc = &pv_chunks[vm_page_domain(m)]; 5630 mtx_lock(&pvc->pvc_lock); 5631 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5632 mtx_unlock(&pvc->pvc_lock); 5633 pv = &pc->pc_pventry[0]; 5634 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5635 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5636 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5637 return (pv); 5638 } 5639 5640 /* 5641 * Returns the number of one bits within the given PV chunk map. 5642 * 5643 * The erratas for Intel processors state that "POPCNT Instruction May 5644 * Take Longer to Execute Than Expected". It is believed that the 5645 * issue is the spurious dependency on the destination register. 5646 * Provide a hint to the register rename logic that the destination 5647 * value is overwritten, by clearing it, as suggested in the 5648 * optimization manual. It should be cheap for unaffected processors 5649 * as well. 5650 * 5651 * Reference numbers for erratas are 5652 * 4th Gen Core: HSD146 5653 * 5th Gen Core: BDM85 5654 * 6th Gen Core: SKL029 5655 */ 5656 static int 5657 popcnt_pc_map_pq(uint64_t *map) 5658 { 5659 u_long result, tmp; 5660 5661 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5662 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5663 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5664 : "=&r" (result), "=&r" (tmp) 5665 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5666 return (result); 5667 } 5668 5669 /* 5670 * Ensure that the number of spare PV entries in the specified pmap meets or 5671 * exceeds the given count, "needed". 5672 * 5673 * The given PV list lock may be released. 
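 * For example, pmap_demote_pde_locked() reserves NPTEPG - 1 entries
 * before demoting a managed 2MB mapping, so that the subsequent
 * pmap_pv_demote_pde() cannot fail for lack of spare pv entries.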
5674 */ 5675 static void 5676 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5677 { 5678 struct pv_chunks_list *pvc; 5679 struct pch new_tail[PMAP_MEMDOM]; 5680 struct pv_chunk *pc; 5681 vm_page_t m; 5682 int avail, free, i; 5683 bool reclaimed; 5684 5685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5686 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5687 5688 /* 5689 * Newly allocated PV chunks must be stored in a private list until 5690 * the required number of PV chunks have been allocated. Otherwise, 5691 * reclaim_pv_chunk() could recycle one of these chunks. In 5692 * contrast, these chunks must be added to the pmap upon allocation. 5693 */ 5694 for (i = 0; i < PMAP_MEMDOM; i++) 5695 TAILQ_INIT(&new_tail[i]); 5696 retry: 5697 avail = 0; 5698 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5699 #ifndef __POPCNT__ 5700 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5701 bit_count((bitstr_t *)pc->pc_map, 0, 5702 sizeof(pc->pc_map) * NBBY, &free); 5703 else 5704 #endif 5705 free = popcnt_pc_map_pq(pc->pc_map); 5706 if (free == 0) 5707 break; 5708 avail += free; 5709 if (avail >= needed) 5710 break; 5711 } 5712 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5713 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5714 if (m == NULL) { 5715 m = reclaim_pv_chunk(pmap, lockp); 5716 if (m == NULL) 5717 goto retry; 5718 reclaimed = true; 5719 } else 5720 counter_u64_add(pv_page_count, 1); 5721 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5722 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5723 dump_add_page(m->phys_addr); 5724 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5725 pc->pc_pmap = pmap; 5726 pc->pc_map[0] = PC_FREEN; 5727 pc->pc_map[1] = PC_FREEN; 5728 pc->pc_map[2] = PC_FREEL; 5729 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5730 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5731 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5732 5733 /* 5734 * The reclaim might have freed a chunk from the current pmap. 5735 * If that chunk contained available entries, we need to 5736 * re-count the number of available entries. 5737 */ 5738 if (reclaimed) 5739 goto retry; 5740 } 5741 for (i = 0; i < vm_ndomains; i++) { 5742 if (TAILQ_EMPTY(&new_tail[i])) 5743 continue; 5744 pvc = &pv_chunks[i]; 5745 mtx_lock(&pvc->pvc_lock); 5746 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5747 mtx_unlock(&pvc->pvc_lock); 5748 } 5749 } 5750 5751 /* 5752 * First find and then remove the pv entry for the specified pmap and virtual 5753 * address from the specified pv list. Returns the pv entry if found and NULL 5754 * otherwise. This operation can be performed on pv lists for either 4KB or 5755 * 2MB page mappings. 5756 */ 5757 static __inline pv_entry_t 5758 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5759 { 5760 pv_entry_t pv; 5761 5762 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5763 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5764 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5765 pvh->pv_gen++; 5766 break; 5767 } 5768 } 5769 return (pv); 5770 } 5771 5772 /* 5773 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5774 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5775 * entries for each of the 4KB page mappings. 
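 * The 2MB mapping's own pv entry is reused for the first 4KB page;
 * the other NPTEPG - 1 entries are taken from the spare entries that
 * the caller is expected to have reserved with reserve_pv_entries().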
5776 */ 5777 static void 5778 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5779 struct rwlock **lockp) 5780 { 5781 struct md_page *pvh; 5782 struct pv_chunk *pc; 5783 pv_entry_t pv; 5784 vm_offset_t va_last; 5785 vm_page_t m; 5786 int bit, field; 5787 5788 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5789 KASSERT((pa & PDRMASK) == 0, 5790 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5791 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5792 5793 /* 5794 * Transfer the 2mpage's pv entry for this mapping to the first 5795 * page's pv list. Once this transfer begins, the pv list lock 5796 * must not be released until the last pv entry is reinstantiated. 5797 */ 5798 pvh = pa_to_pvh(pa); 5799 va = trunc_2mpage(va); 5800 pv = pmap_pvh_remove(pvh, pmap, va); 5801 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5802 m = PHYS_TO_VM_PAGE(pa); 5803 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5804 m->md.pv_gen++; 5805 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5806 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5807 va_last = va + NBPDR - PAGE_SIZE; 5808 for (;;) { 5809 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5810 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 5811 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 5812 for (field = 0; field < _NPCM; field++) { 5813 while (pc->pc_map[field]) { 5814 bit = bsfq(pc->pc_map[field]); 5815 pc->pc_map[field] &= ~(1ul << bit); 5816 pv = &pc->pc_pventry[field * 64 + bit]; 5817 va += PAGE_SIZE; 5818 pv->pv_va = va; 5819 m++; 5820 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5821 ("pmap_pv_demote_pde: page %p is not managed", m)); 5822 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5823 m->md.pv_gen++; 5824 if (va == va_last) 5825 goto out; 5826 } 5827 } 5828 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5829 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5830 } 5831 out: 5832 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 5833 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5834 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5835 } 5836 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5837 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5838 } 5839 5840 #if VM_NRESERVLEVEL > 0 5841 /* 5842 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5843 * replace the many pv entries for the 4KB page mappings by a single pv entry 5844 * for the 2MB page mapping. 5845 */ 5846 static void 5847 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5848 struct rwlock **lockp) 5849 { 5850 struct md_page *pvh; 5851 pv_entry_t pv; 5852 vm_offset_t va_last; 5853 vm_page_t m; 5854 5855 KASSERT((pa & PDRMASK) == 0, 5856 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5857 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5858 5859 /* 5860 * Transfer the first page's pv entry for this mapping to the 2mpage's 5861 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5862 * a transfer avoids the possibility that get_pv_entry() calls 5863 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5864 * mappings that is being promoted. 5865 */ 5866 m = PHYS_TO_VM_PAGE(pa); 5867 va = trunc_2mpage(va); 5868 pv = pmap_pvh_remove(&m->md, pmap, va); 5869 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5870 pvh = pa_to_pvh(pa); 5871 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5872 pvh->pv_gen++; 5873 /* Free the remaining NPTEPG - 1 pv entries. 
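 * (With 4KB base pages and 2MB superpages, NPTEPG is 512, so 511 pv
 * entries are freed here, one for each constituent page after the
 * first.)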
*/ 5874 va_last = va + NBPDR - PAGE_SIZE; 5875 do { 5876 m++; 5877 va += PAGE_SIZE; 5878 pmap_pvh_free(&m->md, pmap, va); 5879 } while (va < va_last); 5880 } 5881 #endif /* VM_NRESERVLEVEL > 0 */ 5882 5883 /* 5884 * First find and then destroy the pv entry for the specified pmap and virtual 5885 * address. This operation can be performed on pv lists for either 4KB or 2MB 5886 * page mappings. 5887 */ 5888 static void 5889 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5890 { 5891 pv_entry_t pv; 5892 5893 pv = pmap_pvh_remove(pvh, pmap, va); 5894 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5895 free_pv_entry(pmap, pv); 5896 } 5897 5898 /* 5899 * Conditionally create the PV entry for a 4KB page mapping if the required 5900 * memory can be allocated without resorting to reclamation. 5901 */ 5902 static boolean_t 5903 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5904 struct rwlock **lockp) 5905 { 5906 pv_entry_t pv; 5907 5908 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5909 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5910 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5911 pv->pv_va = va; 5912 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5913 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5914 m->md.pv_gen++; 5915 return (TRUE); 5916 } else 5917 return (FALSE); 5918 } 5919 5920 /* 5921 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5922 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5923 * false if the PV entry cannot be allocated without resorting to reclamation. 5924 */ 5925 static bool 5926 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5927 struct rwlock **lockp) 5928 { 5929 struct md_page *pvh; 5930 pv_entry_t pv; 5931 vm_paddr_t pa; 5932 5933 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5934 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5935 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5936 NULL : lockp)) == NULL) 5937 return (false); 5938 pv->pv_va = va; 5939 pa = pde & PG_PS_FRAME; 5940 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5941 pvh = pa_to_pvh(pa); 5942 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5943 pvh->pv_gen++; 5944 return (true); 5945 } 5946 5947 /* 5948 * Fills a page table page with mappings to consecutive physical pages. 5949 */ 5950 static void 5951 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5952 { 5953 pt_entry_t *pte; 5954 5955 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5956 *pte = newpte; 5957 newpte += PAGE_SIZE; 5958 } 5959 } 5960 5961 /* 5962 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5963 * mapping is invalidated. 
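 * This wrapper supplies a transient PV list lock pointer to
 * pmap_demote_pde_locked() and releases whatever lock the demotion
 * acquired before returning.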
5964 */ 5965 static boolean_t 5966 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5967 { 5968 struct rwlock *lock; 5969 boolean_t rv; 5970 5971 lock = NULL; 5972 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5973 if (lock != NULL) 5974 rw_wunlock(lock); 5975 return (rv); 5976 } 5977 5978 static void 5979 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 5980 { 5981 #ifdef INVARIANTS 5982 #ifdef DIAGNOSTIC 5983 pt_entry_t *xpte, *ypte; 5984 5985 for (xpte = firstpte; xpte < firstpte + NPTEPG; 5986 xpte++, newpte += PAGE_SIZE) { 5987 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 5988 printf("pmap_demote_pde: xpte %zd and newpte map " 5989 "different pages: found %#lx, expected %#lx\n", 5990 xpte - firstpte, *xpte, newpte); 5991 printf("page table dump\n"); 5992 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 5993 printf("%zd %#lx\n", ypte - firstpte, *ypte); 5994 panic("firstpte"); 5995 } 5996 } 5997 #else 5998 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 5999 ("pmap_demote_pde: firstpte and newpte map different physical" 6000 " addresses")); 6001 #endif 6002 #endif 6003 } 6004 6005 static void 6006 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6007 pd_entry_t oldpde, struct rwlock **lockp) 6008 { 6009 struct spglist free; 6010 vm_offset_t sva; 6011 6012 SLIST_INIT(&free); 6013 sva = trunc_2mpage(va); 6014 pmap_remove_pde(pmap, pde, sva, &free, lockp); 6015 if ((oldpde & pmap_global_bit(pmap)) == 0) 6016 pmap_invalidate_pde_page(pmap, sva, oldpde); 6017 vm_page_free_pages_toq(&free, true); 6018 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 6019 va, pmap); 6020 } 6021 6022 static boolean_t 6023 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 6024 struct rwlock **lockp) 6025 { 6026 pd_entry_t newpde, oldpde; 6027 pt_entry_t *firstpte, newpte; 6028 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6029 vm_paddr_t mptepa; 6030 vm_page_t mpte; 6031 int PG_PTE_CACHE; 6032 bool in_kernel; 6033 6034 PG_A = pmap_accessed_bit(pmap); 6035 PG_G = pmap_global_bit(pmap); 6036 PG_M = pmap_modified_bit(pmap); 6037 PG_RW = pmap_rw_bit(pmap); 6038 PG_V = pmap_valid_bit(pmap); 6039 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6040 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6041 6042 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6043 in_kernel = va >= VM_MAXUSER_ADDRESS; 6044 oldpde = *pde; 6045 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 6046 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 6047 6048 /* 6049 * Invalidate the 2MB page mapping and return "failure" if the 6050 * mapping was never accessed. 6051 */ 6052 if ((oldpde & PG_A) == 0) { 6053 KASSERT((oldpde & PG_W) == 0, 6054 ("pmap_demote_pde: a wired mapping is missing PG_A")); 6055 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6056 return (FALSE); 6057 } 6058 6059 mpte = pmap_remove_pt_page(pmap, va); 6060 if (mpte == NULL) { 6061 KASSERT((oldpde & PG_W) == 0, 6062 ("pmap_demote_pde: page table page for a wired mapping" 6063 " is missing")); 6064 6065 /* 6066 * If the page table page is missing and the mapping 6067 * is for a kernel address, the mapping must belong to 6068 * the direct map. Page table pages are preallocated 6069 * for every other part of the kernel address space, 6070 * so the direct map region is the only part of the 6071 * kernel address space that must be handled here. 
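 * Direct map superpage mappings are created without backing 4KB page
 * table pages, which is why one must be allocated on demand below.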
6072 */ 6073 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 6074 va < DMAP_MAX_ADDRESS), 6075 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 6076 6077 /* 6078 * If the 2MB page mapping belongs to the direct map 6079 * region of the kernel's address space, then the page 6080 * allocation request specifies the highest possible 6081 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6082 * priority is normal. 6083 */ 6084 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 6085 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 6086 6087 /* 6088 * If the allocation of the new page table page fails, 6089 * invalidate the 2MB page mapping and return "failure". 6090 */ 6091 if (mpte == NULL) { 6092 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6093 return (FALSE); 6094 } 6095 6096 if (!in_kernel) 6097 mpte->ref_count = NPTEPG; 6098 } 6099 mptepa = VM_PAGE_TO_PHYS(mpte); 6100 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 6101 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 6102 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 6103 ("pmap_demote_pde: oldpde is missing PG_M")); 6104 newpte = oldpde & ~PG_PS; 6105 newpte = pmap_swap_pat(pmap, newpte); 6106 6107 /* 6108 * If the PTP is not leftover from an earlier promotion or it does not 6109 * have PG_A set in every PTE, then fill it. The new PTEs will all 6110 * have PG_A set. 6111 */ 6112 if (!vm_page_all_valid(mpte)) 6113 pmap_fill_ptp(firstpte, newpte); 6114 6115 pmap_demote_pde_check(firstpte, newpte); 6116 6117 /* 6118 * If the mapping has changed attributes, update the PTEs. 6119 */ 6120 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6121 pmap_fill_ptp(firstpte, newpte); 6122 6123 /* 6124 * The spare PV entries must be reserved prior to demoting the 6125 * mapping, that is, prior to changing the PDE. Otherwise, the state 6126 * of the PDE and the PV lists will be inconsistent, which can result 6127 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6128 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6129 * PV entry for the 2MB page mapping that is being demoted. 6130 */ 6131 if ((oldpde & PG_MANAGED) != 0) 6132 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6133 6134 /* 6135 * Demote the mapping. This pmap is locked. The old PDE has 6136 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6137 * set. Thus, there is no danger of a race with another 6138 * processor changing the setting of PG_A and/or PG_M between 6139 * the read above and the store below. 6140 */ 6141 if (workaround_erratum383) 6142 pmap_update_pde(pmap, va, pde, newpde); 6143 else 6144 pde_store(pde, newpde); 6145 6146 /* 6147 * Invalidate a stale recursive mapping of the page table page. 6148 */ 6149 if (in_kernel) 6150 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6151 6152 /* 6153 * Demote the PV entry. 6154 */ 6155 if ((oldpde & PG_MANAGED) != 0) 6156 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6157 6158 counter_u64_add(pmap_pde_demotions, 1); 6159 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6160 va, pmap); 6161 return (TRUE); 6162 } 6163 6164 /* 6165 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
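 * The kernel PDE is not simply cleared; it is repointed at a zeroed
 * page table page so that the affected kernel address range remains
 * covered by an (empty) 4KB page table.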
6166 */ 6167 static void 6168 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6169 { 6170 pd_entry_t newpde; 6171 vm_paddr_t mptepa; 6172 vm_page_t mpte; 6173 6174 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6175 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6176 mpte = pmap_remove_pt_page(pmap, va); 6177 if (mpte == NULL) 6178 panic("pmap_remove_kernel_pde: Missing pt page."); 6179 6180 mptepa = VM_PAGE_TO_PHYS(mpte); 6181 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6182 6183 /* 6184 * If this page table page was unmapped by a promotion, then it 6185 * contains valid mappings. Zero it to invalidate those mappings. 6186 */ 6187 if (vm_page_any_valid(mpte)) 6188 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6189 6190 /* 6191 * Demote the mapping. 6192 */ 6193 if (workaround_erratum383) 6194 pmap_update_pde(pmap, va, pde, newpde); 6195 else 6196 pde_store(pde, newpde); 6197 6198 /* 6199 * Invalidate a stale recursive mapping of the page table page. 6200 */ 6201 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6202 } 6203 6204 /* 6205 * pmap_remove_pde: do the things to unmap a superpage in a process 6206 */ 6207 static int 6208 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6209 struct spglist *free, struct rwlock **lockp) 6210 { 6211 struct md_page *pvh; 6212 pd_entry_t oldpde; 6213 vm_offset_t eva, va; 6214 vm_page_t m, mpte; 6215 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6216 6217 PG_G = pmap_global_bit(pmap); 6218 PG_A = pmap_accessed_bit(pmap); 6219 PG_M = pmap_modified_bit(pmap); 6220 PG_RW = pmap_rw_bit(pmap); 6221 6222 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6223 KASSERT((sva & PDRMASK) == 0, 6224 ("pmap_remove_pde: sva is not 2mpage aligned")); 6225 oldpde = pte_load_clear(pdq); 6226 if (oldpde & PG_W) 6227 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6228 if ((oldpde & PG_G) != 0) 6229 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6230 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6231 if (oldpde & PG_MANAGED) { 6232 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6233 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6234 pmap_pvh_free(pvh, pmap, sva); 6235 eva = sva + NBPDR; 6236 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6237 va < eva; va += PAGE_SIZE, m++) { 6238 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6239 vm_page_dirty(m); 6240 if (oldpde & PG_A) 6241 vm_page_aflag_set(m, PGA_REFERENCED); 6242 if (TAILQ_EMPTY(&m->md.pv_list) && 6243 TAILQ_EMPTY(&pvh->pv_list)) 6244 vm_page_aflag_clear(m, PGA_WRITEABLE); 6245 pmap_delayed_invl_page(m); 6246 } 6247 } 6248 if (pmap == kernel_pmap) { 6249 pmap_remove_kernel_pde(pmap, pdq, sva); 6250 } else { 6251 mpte = pmap_remove_pt_page(pmap, sva); 6252 if (mpte != NULL) { 6253 KASSERT(vm_page_any_valid(mpte), 6254 ("pmap_remove_pde: pte page not promoted")); 6255 pmap_pt_page_count_adj(pmap, -1); 6256 KASSERT(mpte->ref_count == NPTEPG, 6257 ("pmap_remove_pde: pte page ref count error")); 6258 mpte->ref_count = 0; 6259 pmap_add_delayed_free_list(mpte, free, FALSE); 6260 } 6261 } 6262 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6263 } 6264 6265 /* 6266 * pmap_remove_pte: do the things to unmap a page in a process 6267 */ 6268 static int 6269 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6270 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6271 { 6272 struct md_page *pvh; 6273 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6274 vm_page_t m; 6275 6276 PG_A = pmap_accessed_bit(pmap); 6277 PG_M = 
pmap_modified_bit(pmap); 6278 PG_RW = pmap_rw_bit(pmap); 6279 6280 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6281 oldpte = pte_load_clear(ptq); 6282 if (oldpte & PG_W) 6283 pmap->pm_stats.wired_count -= 1; 6284 pmap_resident_count_adj(pmap, -1); 6285 if (oldpte & PG_MANAGED) { 6286 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6287 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6288 vm_page_dirty(m); 6289 if (oldpte & PG_A) 6290 vm_page_aflag_set(m, PGA_REFERENCED); 6291 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6292 pmap_pvh_free(&m->md, pmap, va); 6293 if (TAILQ_EMPTY(&m->md.pv_list) && 6294 (m->flags & PG_FICTITIOUS) == 0) { 6295 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6296 if (TAILQ_EMPTY(&pvh->pv_list)) 6297 vm_page_aflag_clear(m, PGA_WRITEABLE); 6298 } 6299 pmap_delayed_invl_page(m); 6300 } 6301 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6302 } 6303 6304 /* 6305 * Remove a single page from a process address space 6306 */ 6307 static void 6308 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6309 struct spglist *free) 6310 { 6311 struct rwlock *lock; 6312 pt_entry_t *pte, PG_V; 6313 6314 PG_V = pmap_valid_bit(pmap); 6315 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6316 if ((*pde & PG_V) == 0) 6317 return; 6318 pte = pmap_pde_to_pte(pde, va); 6319 if ((*pte & PG_V) == 0) 6320 return; 6321 lock = NULL; 6322 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6323 if (lock != NULL) 6324 rw_wunlock(lock); 6325 pmap_invalidate_page(pmap, va); 6326 } 6327 6328 /* 6329 * Removes the specified range of addresses from the page table page. 6330 */ 6331 static bool 6332 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6333 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6334 { 6335 pt_entry_t PG_G, *pte; 6336 vm_offset_t va; 6337 bool anyvalid; 6338 6339 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6340 PG_G = pmap_global_bit(pmap); 6341 anyvalid = false; 6342 va = eva; 6343 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6344 sva += PAGE_SIZE) { 6345 if (*pte == 0) { 6346 if (va != eva) { 6347 pmap_invalidate_range(pmap, va, sva); 6348 va = eva; 6349 } 6350 continue; 6351 } 6352 if ((*pte & PG_G) == 0) 6353 anyvalid = true; 6354 else if (va == eva) 6355 va = sva; 6356 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6357 sva += PAGE_SIZE; 6358 break; 6359 } 6360 } 6361 if (va != eva) 6362 pmap_invalidate_range(pmap, va, sva); 6363 return (anyvalid); 6364 } 6365 6366 static void 6367 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 6368 { 6369 struct rwlock *lock; 6370 vm_page_t mt; 6371 vm_offset_t va_next; 6372 pml5_entry_t *pml5e; 6373 pml4_entry_t *pml4e; 6374 pdp_entry_t *pdpe; 6375 pd_entry_t ptpaddr, *pde; 6376 pt_entry_t PG_G, PG_V; 6377 struct spglist free; 6378 int anyvalid; 6379 6380 PG_G = pmap_global_bit(pmap); 6381 PG_V = pmap_valid_bit(pmap); 6382 6383 /* 6384 * If there are no resident pages besides the top level page 6385 * table page(s), there is nothing to do. Kernel pmap always 6386 * accounts whole preloaded area as resident, which makes its 6387 * resident count > 2. 6388 * Perform an unsynchronized read. This is, however, safe. 6389 */ 6390 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 6391 1 : 0)) 6392 return; 6393 6394 anyvalid = 0; 6395 SLIST_INIT(&free); 6396 6397 pmap_delayed_invl_start(); 6398 PMAP_LOCK(pmap); 6399 if (map_delete) 6400 pmap_pkru_on_remove(pmap, sva, eva); 6401 6402 /* 6403 * special handling of removing one page. 
a very 6404 * common operation and easy to short circuit some 6405 * code. 6406 */ 6407 if (sva + PAGE_SIZE == eva) { 6408 pde = pmap_pde(pmap, sva); 6409 if (pde && (*pde & PG_PS) == 0) { 6410 pmap_remove_page(pmap, sva, pde, &free); 6411 goto out; 6412 } 6413 } 6414 6415 lock = NULL; 6416 for (; sva < eva; sva = va_next) { 6417 if (pmap->pm_stats.resident_count == 0) 6418 break; 6419 6420 if (pmap_is_la57(pmap)) { 6421 pml5e = pmap_pml5e(pmap, sva); 6422 if ((*pml5e & PG_V) == 0) { 6423 va_next = (sva + NBPML5) & ~PML5MASK; 6424 if (va_next < sva) 6425 va_next = eva; 6426 continue; 6427 } 6428 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6429 } else { 6430 pml4e = pmap_pml4e(pmap, sva); 6431 } 6432 if ((*pml4e & PG_V) == 0) { 6433 va_next = (sva + NBPML4) & ~PML4MASK; 6434 if (va_next < sva) 6435 va_next = eva; 6436 continue; 6437 } 6438 6439 va_next = (sva + NBPDP) & ~PDPMASK; 6440 if (va_next < sva) 6441 va_next = eva; 6442 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6443 if ((*pdpe & PG_V) == 0) 6444 continue; 6445 if ((*pdpe & PG_PS) != 0) { 6446 KASSERT(va_next <= eva, 6447 ("partial update of non-transparent 1G mapping " 6448 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6449 *pdpe, sva, eva, va_next)); 6450 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6451 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6452 anyvalid = 1; 6453 *pdpe = 0; 6454 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6455 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6456 pmap_unwire_ptp(pmap, sva, mt, &free); 6457 continue; 6458 } 6459 6460 /* 6461 * Calculate index for next page table. 6462 */ 6463 va_next = (sva + NBPDR) & ~PDRMASK; 6464 if (va_next < sva) 6465 va_next = eva; 6466 6467 pde = pmap_pdpe_to_pde(pdpe, sva); 6468 ptpaddr = *pde; 6469 6470 /* 6471 * Weed out invalid mappings. 6472 */ 6473 if (ptpaddr == 0) 6474 continue; 6475 6476 /* 6477 * Check for large page. 6478 */ 6479 if ((ptpaddr & PG_PS) != 0) { 6480 /* 6481 * Are we removing the entire large page? If not, 6482 * demote the mapping and fall through. 6483 */ 6484 if (sva + NBPDR == va_next && eva >= va_next) { 6485 /* 6486 * The TLB entry for a PG_G mapping is 6487 * invalidated by pmap_remove_pde(). 6488 */ 6489 if ((ptpaddr & PG_G) == 0) 6490 anyvalid = 1; 6491 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6492 continue; 6493 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6494 &lock)) { 6495 /* The large page mapping was destroyed. */ 6496 continue; 6497 } else 6498 ptpaddr = *pde; 6499 } 6500 6501 /* 6502 * Limit our scan to either the end of the va represented 6503 * by the current page table page, or to the end of the 6504 * range being removed. 6505 */ 6506 if (va_next > eva) 6507 va_next = eva; 6508 6509 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6510 anyvalid = 1; 6511 } 6512 if (lock != NULL) 6513 rw_wunlock(lock); 6514 out: 6515 if (anyvalid) 6516 pmap_invalidate_all(pmap); 6517 PMAP_UNLOCK(pmap); 6518 pmap_delayed_invl_finish(); 6519 vm_page_free_pages_toq(&free, true); 6520 } 6521 6522 /* 6523 * Remove the given range of addresses from the specified map. 6524 * 6525 * It is assumed that the start and end are properly 6526 * rounded to the page size. 6527 */ 6528 void 6529 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6530 { 6531 pmap_remove1(pmap, sva, eva, false); 6532 } 6533 6534 /* 6535 * Remove the given range of addresses as part of a logical unmap 6536 * operation. 
This has the effect of calling pmap_remove(), but 6537 * also clears any metadata that should persist for the lifetime 6538 * of a logical mapping. 6539 */ 6540 void 6541 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6542 { 6543 pmap_remove1(pmap, sva, eva, true); 6544 } 6545 6546 /* 6547 * Routine: pmap_remove_all 6548 * Function: 6549 * Removes this physical page from 6550 * all physical maps in which it resides. 6551 * Reflects back modify bits to the pager. 6552 * 6553 * Notes: 6554 * Original versions of this routine were very 6555 * inefficient because they iteratively called 6556 * pmap_remove (slow...) 6557 */ 6558 6559 void 6560 pmap_remove_all(vm_page_t m) 6561 { 6562 struct md_page *pvh; 6563 pv_entry_t pv; 6564 pmap_t pmap; 6565 struct rwlock *lock; 6566 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6567 pd_entry_t *pde; 6568 vm_offset_t va; 6569 struct spglist free; 6570 int pvh_gen, md_gen; 6571 6572 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6573 ("pmap_remove_all: page %p is not managed", m)); 6574 SLIST_INIT(&free); 6575 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6576 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6577 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6578 rw_wlock(lock); 6579 retry: 6580 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6581 pmap = PV_PMAP(pv); 6582 if (!PMAP_TRYLOCK(pmap)) { 6583 pvh_gen = pvh->pv_gen; 6584 rw_wunlock(lock); 6585 PMAP_LOCK(pmap); 6586 rw_wlock(lock); 6587 if (pvh_gen != pvh->pv_gen) { 6588 PMAP_UNLOCK(pmap); 6589 goto retry; 6590 } 6591 } 6592 va = pv->pv_va; 6593 pde = pmap_pde(pmap, va); 6594 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6595 PMAP_UNLOCK(pmap); 6596 } 6597 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6598 pmap = PV_PMAP(pv); 6599 if (!PMAP_TRYLOCK(pmap)) { 6600 pvh_gen = pvh->pv_gen; 6601 md_gen = m->md.pv_gen; 6602 rw_wunlock(lock); 6603 PMAP_LOCK(pmap); 6604 rw_wlock(lock); 6605 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6606 PMAP_UNLOCK(pmap); 6607 goto retry; 6608 } 6609 } 6610 PG_A = pmap_accessed_bit(pmap); 6611 PG_M = pmap_modified_bit(pmap); 6612 PG_RW = pmap_rw_bit(pmap); 6613 pmap_resident_count_adj(pmap, -1); 6614 pde = pmap_pde(pmap, pv->pv_va); 6615 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6616 " a 2mpage in page %p's pv list", m)); 6617 pte = pmap_pde_to_pte(pde, pv->pv_va); 6618 tpte = pte_load_clear(pte); 6619 if (tpte & PG_W) 6620 pmap->pm_stats.wired_count--; 6621 if (tpte & PG_A) 6622 vm_page_aflag_set(m, PGA_REFERENCED); 6623 6624 /* 6625 * Update the vm_page_t clean and reference bits. 
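 * A PTE with both PG_M and PG_RW set records a write that has not yet
 * been reflected in the vm_page, so it is transferred here with
 * vm_page_dirty().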
6626 */ 6627 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6628 vm_page_dirty(m); 6629 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6630 pmap_invalidate_page(pmap, pv->pv_va); 6631 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6632 m->md.pv_gen++; 6633 free_pv_entry(pmap, pv); 6634 PMAP_UNLOCK(pmap); 6635 } 6636 vm_page_aflag_clear(m, PGA_WRITEABLE); 6637 rw_wunlock(lock); 6638 pmap_delayed_invl_wait(m); 6639 vm_page_free_pages_toq(&free, true); 6640 } 6641 6642 /* 6643 * pmap_protect_pde: do the things to protect a 2mpage in a process 6644 */ 6645 static boolean_t 6646 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6647 { 6648 pd_entry_t newpde, oldpde; 6649 vm_page_t m, mt; 6650 boolean_t anychanged; 6651 pt_entry_t PG_G, PG_M, PG_RW; 6652 6653 PG_G = pmap_global_bit(pmap); 6654 PG_M = pmap_modified_bit(pmap); 6655 PG_RW = pmap_rw_bit(pmap); 6656 6657 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6658 KASSERT((sva & PDRMASK) == 0, 6659 ("pmap_protect_pde: sva is not 2mpage aligned")); 6660 anychanged = FALSE; 6661 retry: 6662 oldpde = newpde = *pde; 6663 if ((prot & VM_PROT_WRITE) == 0) { 6664 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6665 (PG_MANAGED | PG_M | PG_RW)) { 6666 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6667 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6668 vm_page_dirty(mt); 6669 } 6670 newpde &= ~(PG_RW | PG_M); 6671 } 6672 if ((prot & VM_PROT_EXECUTE) == 0) 6673 newpde |= pg_nx; 6674 if (newpde != oldpde) { 6675 /* 6676 * As an optimization to future operations on this PDE, clear 6677 * PG_PROMOTED. The impending invalidation will remove any 6678 * lingering 4KB page mappings from the TLB. 6679 */ 6680 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6681 goto retry; 6682 if ((oldpde & PG_G) != 0) 6683 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6684 else 6685 anychanged = TRUE; 6686 } 6687 return (anychanged); 6688 } 6689 6690 /* 6691 * Set the physical protection on the 6692 * specified range of this map as requested. 6693 */ 6694 void 6695 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6696 { 6697 vm_page_t m; 6698 vm_offset_t va_next; 6699 pml4_entry_t *pml4e; 6700 pdp_entry_t *pdpe; 6701 pd_entry_t ptpaddr, *pde; 6702 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6703 pt_entry_t obits, pbits; 6704 boolean_t anychanged; 6705 6706 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6707 if (prot == VM_PROT_NONE) { 6708 pmap_remove(pmap, sva, eva); 6709 return; 6710 } 6711 6712 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6713 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6714 return; 6715 6716 PG_G = pmap_global_bit(pmap); 6717 PG_M = pmap_modified_bit(pmap); 6718 PG_V = pmap_valid_bit(pmap); 6719 PG_RW = pmap_rw_bit(pmap); 6720 anychanged = FALSE; 6721 6722 /* 6723 * Although this function delays and batches the invalidation 6724 * of stale TLB entries, it does not need to call 6725 * pmap_delayed_invl_start() and 6726 * pmap_delayed_invl_finish(), because it does not 6727 * ordinarily destroy mappings. Stale TLB entries from 6728 * protection-only changes need only be invalidated before the 6729 * pmap lock is released, because protection-only changes do 6730 * not destroy PV entries. Even operations that iterate over 6731 * a physical page's PV list of mappings, like 6732 * pmap_remove_write(), acquire the pmap lock for each 6733 * mapping. Consequently, for protection-only changes, the 6734 * pmap lock suffices to synchronize both page table and TLB 6735 * updates. 
6736 * 6737 * This function only destroys a mapping if pmap_demote_pde() 6738 * fails. In that case, stale TLB entries are immediately 6739 * invalidated. 6740 */ 6741 6742 PMAP_LOCK(pmap); 6743 for (; sva < eva; sva = va_next) { 6744 pml4e = pmap_pml4e(pmap, sva); 6745 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6746 va_next = (sva + NBPML4) & ~PML4MASK; 6747 if (va_next < sva) 6748 va_next = eva; 6749 continue; 6750 } 6751 6752 va_next = (sva + NBPDP) & ~PDPMASK; 6753 if (va_next < sva) 6754 va_next = eva; 6755 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6756 if ((*pdpe & PG_V) == 0) 6757 continue; 6758 if ((*pdpe & PG_PS) != 0) { 6759 KASSERT(va_next <= eva, 6760 ("partial update of non-transparent 1G mapping " 6761 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6762 *pdpe, sva, eva, va_next)); 6763 retry_pdpe: 6764 obits = pbits = *pdpe; 6765 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6766 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6767 if ((prot & VM_PROT_WRITE) == 0) 6768 pbits &= ~(PG_RW | PG_M); 6769 if ((prot & VM_PROT_EXECUTE) == 0) 6770 pbits |= pg_nx; 6771 6772 if (pbits != obits) { 6773 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6774 /* PG_PS cannot be cleared under us, */ 6775 goto retry_pdpe; 6776 anychanged = TRUE; 6777 } 6778 continue; 6779 } 6780 6781 va_next = (sva + NBPDR) & ~PDRMASK; 6782 if (va_next < sva) 6783 va_next = eva; 6784 6785 pde = pmap_pdpe_to_pde(pdpe, sva); 6786 ptpaddr = *pde; 6787 6788 /* 6789 * Weed out invalid mappings. 6790 */ 6791 if (ptpaddr == 0) 6792 continue; 6793 6794 /* 6795 * Check for large page. 6796 */ 6797 if ((ptpaddr & PG_PS) != 0) { 6798 /* 6799 * Are we protecting the entire large page? If not, 6800 * demote the mapping and fall through. 6801 */ 6802 if (sva + NBPDR == va_next && eva >= va_next) { 6803 /* 6804 * The TLB entry for a PG_G mapping is 6805 * invalidated by pmap_protect_pde(). 6806 */ 6807 if (pmap_protect_pde(pmap, pde, sva, prot)) 6808 anychanged = TRUE; 6809 continue; 6810 } else if (!pmap_demote_pde(pmap, pde, sva)) { 6811 /* 6812 * The large page mapping was destroyed. 6813 */ 6814 continue; 6815 } 6816 } 6817 6818 if (va_next > eva) 6819 va_next = eva; 6820 6821 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6822 sva += PAGE_SIZE) { 6823 retry: 6824 obits = pbits = *pte; 6825 if ((pbits & PG_V) == 0) 6826 continue; 6827 6828 if ((prot & VM_PROT_WRITE) == 0) { 6829 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6830 (PG_MANAGED | PG_M | PG_RW)) { 6831 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6832 vm_page_dirty(m); 6833 } 6834 pbits &= ~(PG_RW | PG_M); 6835 } 6836 if ((prot & VM_PROT_EXECUTE) == 0) 6837 pbits |= pg_nx; 6838 6839 if (pbits != obits) { 6840 if (!atomic_cmpset_long(pte, obits, pbits)) 6841 goto retry; 6842 if (obits & PG_G) 6843 pmap_invalidate_page(pmap, sva); 6844 else 6845 anychanged = TRUE; 6846 } 6847 } 6848 } 6849 if (anychanged) 6850 pmap_invalidate_all(pmap); 6851 PMAP_UNLOCK(pmap); 6852 } 6853 6854 static bool 6855 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6856 { 6857 6858 if (pmap->pm_type != PT_EPT) 6859 return (false); 6860 return ((pde & EPT_PG_EXECUTE) != 0); 6861 } 6862 6863 #if VM_NRESERVLEVEL > 0 6864 /* 6865 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6866 * single page table page (PTP) to a single 2MB page mapping. For promotion 6867 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6868 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6869 * identical characteristics. 
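 * "Identical characteristics" is judged by comparing the PTEs under
 * the PG_PTE_PROMOTE mask; the physical frame and PG_A are checked
 * separately, the latter via the allpte_PG_A accumulation below.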
6870 */ 6871 static bool 6872 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, 6873 struct rwlock **lockp) 6874 { 6875 pd_entry_t newpde; 6876 pt_entry_t *firstpte, oldpte, pa, *pte; 6877 pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6878 int PG_PTE_CACHE; 6879 6880 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6881 if (!pmap_ps_enabled(pmap)) 6882 return (false); 6883 6884 PG_A = pmap_accessed_bit(pmap); 6885 PG_G = pmap_global_bit(pmap); 6886 PG_M = pmap_modified_bit(pmap); 6887 PG_V = pmap_valid_bit(pmap); 6888 PG_RW = pmap_rw_bit(pmap); 6889 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6890 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 6891 6892 /* 6893 * Examine the first PTE in the specified PTP. Abort if this PTE is 6894 * ineligible for promotion due to hardware errata, invalid, or does 6895 * not map the first 4KB physical page within a 2MB page. 6896 */ 6897 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6898 newpde = *firstpte; 6899 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) 6900 return (false); 6901 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 6902 counter_u64_add(pmap_pde_p_failures, 1); 6903 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6904 " in pmap %p", va, pmap); 6905 return (false); 6906 } 6907 6908 /* 6909 * Both here and in the below "for" loop, to allow for repromotion 6910 * after MADV_FREE, conditionally write protect a clean PTE before 6911 * possibly aborting the promotion due to other PTE attributes. Why? 6912 * Suppose that MADV_FREE is applied to a part of a superpage, the 6913 * address range [S, E). pmap_advise() will demote the superpage 6914 * mapping, destroy the 4KB page mapping at the end of [S, E), and 6915 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 6916 * imagine that the memory in [S, E) is recycled, but the last 4KB 6917 * page in [S, E) is not the last to be rewritten, or simply accessed. 6918 * In other words, there is still a 4KB page in [S, E), call it P, 6919 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 6920 * we write protect P before aborting the promotion, if and when P is 6921 * finally rewritten, there won't be a page fault to trigger 6922 * repromotion. 6923 */ 6924 setpde: 6925 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6926 /* 6927 * When PG_M is already clear, PG_RW can be cleared without 6928 * a TLB invalidation. 6929 */ 6930 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6931 goto setpde; 6932 newpde &= ~PG_RW; 6933 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6934 " in pmap %p", va & ~PDRMASK, pmap); 6935 } 6936 6937 /* 6938 * Examine each of the other PTEs in the specified PTP. Abort if this 6939 * PTE maps an unexpected 4KB physical page or does not have identical 6940 * characteristics to the first PTE. 6941 */ 6942 allpte_PG_A = newpde & PG_A; 6943 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 6944 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6945 oldpte = *pte; 6946 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 6947 counter_u64_add(pmap_pde_p_failures, 1); 6948 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6949 " in pmap %p", va, pmap); 6950 return (false); 6951 } 6952 setpte: 6953 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6954 /* 6955 * When PG_M is already clear, PG_RW can be cleared 6956 * without a TLB invalidation. 
6957 */ 6958 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6959 goto setpte; 6960 oldpte &= ~PG_RW; 6961 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6962 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6963 (va & ~PDRMASK), pmap); 6964 } 6965 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 6966 counter_u64_add(pmap_pde_p_failures, 1); 6967 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6968 " in pmap %p", va, pmap); 6969 return (false); 6970 } 6971 allpte_PG_A &= oldpte; 6972 pa -= PAGE_SIZE; 6973 } 6974 6975 /* 6976 * Unless all PTEs have PG_A set, clear it from the superpage mapping, 6977 * so that promotions triggered by speculative mappings, such as 6978 * pmap_enter_quick(), don't automatically mark the underlying pages 6979 * as referenced. 6980 */ 6981 newpde &= ~PG_A | allpte_PG_A; 6982 6983 /* 6984 * EPT PTEs with PG_M set and PG_A clear are not supported by early 6985 * MMUs supporting EPT. 6986 */ 6987 KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde), 6988 ("unsupported EPT PTE")); 6989 6990 /* 6991 * Save the PTP in its current state until the PDE mapping the 6992 * superpage is demoted by pmap_demote_pde() or destroyed by 6993 * pmap_remove_pde(). If PG_A is not set in every PTE, then request 6994 * that the PTP be refilled on demotion. 6995 */ 6996 if (mpte == NULL) 6997 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6998 KASSERT(mpte >= vm_page_array && 6999 mpte < &vm_page_array[vm_page_array_size], 7000 ("pmap_promote_pde: page table page is out of range")); 7001 KASSERT(mpte->pindex == pmap_pde_pindex(va), 7002 ("pmap_promote_pde: page table page's pindex is wrong " 7003 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 7004 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 7005 if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) { 7006 counter_u64_add(pmap_pde_p_failures, 1); 7007 CTR2(KTR_PMAP, 7008 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 7009 pmap); 7010 return (false); 7011 } 7012 7013 /* 7014 * Promote the pv entries. 7015 */ 7016 if ((newpde & PG_MANAGED) != 0) 7017 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 7018 7019 /* 7020 * Propagate the PAT index to its proper position. 7021 */ 7022 newpde = pmap_swap_pat(pmap, newpde); 7023 7024 /* 7025 * Map the superpage. 
7026 */ 7027 if (workaround_erratum383) 7028 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 7029 else 7030 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 7031 7032 counter_u64_add(pmap_pde_promotions, 1); 7033 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 7034 " in pmap %p", va, pmap); 7035 return (true); 7036 } 7037 #endif /* VM_NRESERVLEVEL > 0 */ 7038 7039 static int 7040 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 7041 int psind) 7042 { 7043 vm_page_t mp; 7044 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 7045 7046 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7047 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 7048 ("psind %d unexpected", psind)); 7049 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 7050 ("unaligned phys address %#lx newpte %#lx psind %d", 7051 newpte & PG_FRAME, newpte, psind)); 7052 KASSERT((va & (pagesizes[psind] - 1)) == 0, 7053 ("unaligned va %#lx psind %d", va, psind)); 7054 KASSERT(va < VM_MAXUSER_ADDRESS, 7055 ("kernel mode non-transparent superpage")); /* XXXKIB */ 7056 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 7057 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 7058 7059 PG_V = pmap_valid_bit(pmap); 7060 7061 restart: 7062 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) 7063 return (KERN_PROTECTION_FAILURE); 7064 pten = newpte; 7065 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7066 pten |= pmap_pkru_get(pmap, va); 7067 7068 if (psind == 2) { /* 1G */ 7069 pml4e = pmap_pml4e(pmap, va); 7070 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7071 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 7072 NULL, va); 7073 if (mp == NULL) 7074 goto allocf; 7075 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7076 pdpe = &pdpe[pmap_pdpe_index(va)]; 7077 origpte = *pdpe; 7078 MPASS(origpte == 0); 7079 } else { 7080 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 7081 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 7082 origpte = *pdpe; 7083 if ((origpte & PG_V) == 0) { 7084 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7085 mp->ref_count++; 7086 } 7087 } 7088 *pdpe = pten; 7089 } else /* (psind == 1) */ { /* 2M */ 7090 pde = pmap_pde(pmap, va); 7091 if (pde == NULL) { 7092 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 7093 NULL, va); 7094 if (mp == NULL) 7095 goto allocf; 7096 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7097 pde = &pde[pmap_pde_index(va)]; 7098 origpte = *pde; 7099 MPASS(origpte == 0); 7100 } else { 7101 origpte = *pde; 7102 if ((origpte & PG_V) == 0) { 7103 pdpe = pmap_pdpe(pmap, va); 7104 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); 7105 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 7106 mp->ref_count++; 7107 } 7108 } 7109 *pde = pten; 7110 } 7111 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 7112 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 7113 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 7114 va, psind == 2 ? 
"1G" : "2M", origpte, pten)); 7115 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 7116 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 7117 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 7118 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 7119 if ((origpte & PG_V) == 0) 7120 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 7121 7122 return (KERN_SUCCESS); 7123 7124 allocf: 7125 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 7126 return (KERN_RESOURCE_SHORTAGE); 7127 PMAP_UNLOCK(pmap); 7128 vm_wait(NULL); 7129 PMAP_LOCK(pmap); 7130 goto restart; 7131 } 7132 7133 /* 7134 * Insert the given physical page (p) at 7135 * the specified virtual address (v) in the 7136 * target physical map with the protection requested. 7137 * 7138 * If specified, the page will be wired down, meaning 7139 * that the related pte can not be reclaimed. 7140 * 7141 * NB: This is the only routine which MAY NOT lazy-evaluate 7142 * or lose information. That is, this routine must actually 7143 * insert this page into the given map NOW. 7144 * 7145 * When destroying both a page table and PV entry, this function 7146 * performs the TLB invalidation before releasing the PV list 7147 * lock, so we do not need pmap_delayed_invl_page() calls here. 7148 */ 7149 int 7150 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7151 u_int flags, int8_t psind) 7152 { 7153 struct rwlock *lock; 7154 pd_entry_t *pde; 7155 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 7156 pt_entry_t newpte, origpte; 7157 pv_entry_t pv; 7158 vm_paddr_t opa, pa; 7159 vm_page_t mpte, om; 7160 int rv; 7161 boolean_t nosleep; 7162 7163 PG_A = pmap_accessed_bit(pmap); 7164 PG_G = pmap_global_bit(pmap); 7165 PG_M = pmap_modified_bit(pmap); 7166 PG_V = pmap_valid_bit(pmap); 7167 PG_RW = pmap_rw_bit(pmap); 7168 7169 va = trunc_page(va); 7170 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7171 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7172 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7173 va)); 7174 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7175 ("pmap_enter: managed mapping within the clean submap")); 7176 if ((m->oflags & VPO_UNMANAGED) == 0) 7177 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7178 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7179 ("pmap_enter: flags %u has reserved bits set", flags)); 7180 pa = VM_PAGE_TO_PHYS(m); 7181 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7182 if ((flags & VM_PROT_WRITE) != 0) 7183 newpte |= PG_M; 7184 if ((prot & VM_PROT_WRITE) != 0) 7185 newpte |= PG_RW; 7186 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7187 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7188 if ((prot & VM_PROT_EXECUTE) == 0) 7189 newpte |= pg_nx; 7190 if ((flags & PMAP_ENTER_WIRED) != 0) 7191 newpte |= PG_W; 7192 if (va < VM_MAXUSER_ADDRESS) 7193 newpte |= PG_U; 7194 if (pmap == kernel_pmap) 7195 newpte |= PG_G; 7196 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7197 7198 /* 7199 * Set modified bit gratuitously for writeable mappings if 7200 * the page is unmanaged. We do not want to take a fault 7201 * to do the dirty bit accounting for these mappings. 
7202 */ 7203 if ((m->oflags & VPO_UNMANAGED) != 0) { 7204 if ((newpte & PG_RW) != 0) 7205 newpte |= PG_M; 7206 } else 7207 newpte |= PG_MANAGED; 7208 7209 lock = NULL; 7210 PMAP_LOCK(pmap); 7211 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7212 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7213 ("managed largepage va %#lx flags %#x", va, flags)); 7214 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7215 psind); 7216 goto out; 7217 } 7218 if (psind == 1) { 7219 /* Assert the required virtual and physical alignment. */ 7220 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7221 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7222 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7223 goto out; 7224 } 7225 mpte = NULL; 7226 7227 /* 7228 * In the case that a page table page is not 7229 * resident, we are creating it here. 7230 */ 7231 retry: 7232 pde = pmap_pde(pmap, va); 7233 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7234 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7235 pte = pmap_pde_to_pte(pde, va); 7236 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7237 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7238 mpte->ref_count++; 7239 } 7240 } else if (va < VM_MAXUSER_ADDRESS) { 7241 /* 7242 * Here if the pte page isn't mapped, or if it has been 7243 * deallocated. 7244 */ 7245 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7246 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7247 nosleep ? NULL : &lock, va); 7248 if (mpte == NULL && nosleep) { 7249 rv = KERN_RESOURCE_SHORTAGE; 7250 goto out; 7251 } 7252 goto retry; 7253 } else 7254 panic("pmap_enter: invalid page directory va=%#lx", va); 7255 7256 origpte = *pte; 7257 pv = NULL; 7258 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7259 newpte |= pmap_pkru_get(pmap, va); 7260 7261 /* 7262 * Is the specified virtual address already mapped? 7263 */ 7264 if ((origpte & PG_V) != 0) { 7265 /* 7266 * Wiring change, just update stats. We don't worry about 7267 * wiring PT pages as they remain resident as long as there 7268 * are valid mappings in them. Hence, if a user page is wired, 7269 * the PT page will be also. 7270 */ 7271 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7272 pmap->pm_stats.wired_count++; 7273 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7274 pmap->pm_stats.wired_count--; 7275 7276 /* 7277 * Remove the extra PT page reference. 7278 */ 7279 if (mpte != NULL) { 7280 mpte->ref_count--; 7281 KASSERT(mpte->ref_count > 0, 7282 ("pmap_enter: missing reference to page table page," 7283 " va: 0x%lx", va)); 7284 } 7285 7286 /* 7287 * Has the physical page changed? 7288 */ 7289 opa = origpte & PG_FRAME; 7290 if (opa == pa) { 7291 /* 7292 * No, might be a protection or wiring change. 7293 */ 7294 if ((origpte & PG_MANAGED) != 0 && 7295 (newpte & PG_RW) != 0) 7296 vm_page_aflag_set(m, PGA_WRITEABLE); 7297 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7298 goto unchanged; 7299 goto validate; 7300 } 7301 7302 /* 7303 * The physical page has changed. Temporarily invalidate 7304 * the mapping. This ensures that all threads sharing the 7305 * pmap keep a consistent view of the mapping, which is 7306 * necessary for the correct handling of COW faults. It 7307 * also permits reuse of the old mapping's PV entry, 7308 * avoiding an allocation. 7309 * 7310 * For consistency, handle unmanaged mappings the same way. 
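 * After the old mapping is torn down, origpte is reset to zero so
 * that the code below installs the new PTE as though no mapping had
 * existed at this address.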
7311 */ 7312 origpte = pte_load_clear(pte); 7313 KASSERT((origpte & PG_FRAME) == opa, 7314 ("pmap_enter: unexpected pa update for %#lx", va)); 7315 if ((origpte & PG_MANAGED) != 0) { 7316 om = PHYS_TO_VM_PAGE(opa); 7317 7318 /* 7319 * The pmap lock is sufficient to synchronize with 7320 * concurrent calls to pmap_page_test_mappings() and 7321 * pmap_ts_referenced(). 7322 */ 7323 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7324 vm_page_dirty(om); 7325 if ((origpte & PG_A) != 0) { 7326 pmap_invalidate_page(pmap, va); 7327 vm_page_aflag_set(om, PGA_REFERENCED); 7328 } 7329 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7330 pv = pmap_pvh_remove(&om->md, pmap, va); 7331 KASSERT(pv != NULL, 7332 ("pmap_enter: no PV entry for %#lx", va)); 7333 if ((newpte & PG_MANAGED) == 0) 7334 free_pv_entry(pmap, pv); 7335 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7336 TAILQ_EMPTY(&om->md.pv_list) && 7337 ((om->flags & PG_FICTITIOUS) != 0 || 7338 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7339 vm_page_aflag_clear(om, PGA_WRITEABLE); 7340 } else { 7341 /* 7342 * Since this mapping is unmanaged, assume that PG_A 7343 * is set. 7344 */ 7345 pmap_invalidate_page(pmap, va); 7346 } 7347 origpte = 0; 7348 } else { 7349 /* 7350 * Increment the counters. 7351 */ 7352 if ((newpte & PG_W) != 0) 7353 pmap->pm_stats.wired_count++; 7354 pmap_resident_count_adj(pmap, 1); 7355 } 7356 7357 /* 7358 * Enter on the PV list if part of our managed memory. 7359 */ 7360 if ((newpte & PG_MANAGED) != 0) { 7361 if (pv == NULL) { 7362 pv = get_pv_entry(pmap, &lock); 7363 pv->pv_va = va; 7364 } 7365 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7366 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7367 m->md.pv_gen++; 7368 if ((newpte & PG_RW) != 0) 7369 vm_page_aflag_set(m, PGA_WRITEABLE); 7370 } 7371 7372 /* 7373 * Update the PTE. 7374 */ 7375 if ((origpte & PG_V) != 0) { 7376 validate: 7377 origpte = pte_load_store(pte, newpte); 7378 KASSERT((origpte & PG_FRAME) == pa, 7379 ("pmap_enter: unexpected pa update for %#lx", va)); 7380 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7381 (PG_M | PG_RW)) { 7382 if ((origpte & PG_MANAGED) != 0) 7383 vm_page_dirty(m); 7384 7385 /* 7386 * Although the PTE may still have PG_RW set, TLB 7387 * invalidation may nonetheless be required because 7388 * the PTE no longer has PG_M set. 7389 */ 7390 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7391 /* 7392 * This PTE change does not require TLB invalidation. 7393 */ 7394 goto unchanged; 7395 } 7396 if ((origpte & PG_A) != 0) 7397 pmap_invalidate_page(pmap, va); 7398 } else 7399 pte_store(pte, newpte); 7400 7401 unchanged: 7402 7403 #if VM_NRESERVLEVEL > 0 7404 /* 7405 * If both the page table page and the reservation are fully 7406 * populated, then attempt promotion. 7407 */ 7408 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7409 (m->flags & PG_FICTITIOUS) == 0 && 7410 vm_reserv_level_iffullpop(m) == 0) 7411 (void)pmap_promote_pde(pmap, pde, va, mpte, &lock); 7412 #endif 7413 7414 rv = KERN_SUCCESS; 7415 out: 7416 if (lock != NULL) 7417 rw_wunlock(lock); 7418 PMAP_UNLOCK(pmap); 7419 return (rv); 7420 } 7421 7422 /* 7423 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 7424 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 7425 * value. See pmap_enter_pde() for the possible error values when "no sleep", 7426 * "no replace", and "no reclaim" are specified. 
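 * For example, pmap_enter_object() below only attempts this when the
 * virtual address is 2MB aligned, the entire 2MB fits within the
 * requested range, the page is part of a fully populated superpage
 * reservation (m->psind == 1), and pmap_ps_enabled() permits it:
 *
 *	rv = pmap_enter_2mpage(pmap, va, m, prot, &lock);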
7427 */ 7428 static int 7429 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7430 struct rwlock **lockp) 7431 { 7432 pd_entry_t newpde; 7433 pt_entry_t PG_V; 7434 7435 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7436 PG_V = pmap_valid_bit(pmap); 7437 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 7438 PG_PS | PG_V; 7439 if ((m->oflags & VPO_UNMANAGED) == 0) 7440 newpde |= PG_MANAGED; 7441 if ((prot & VM_PROT_EXECUTE) == 0) 7442 newpde |= pg_nx; 7443 if (va < VM_MAXUSER_ADDRESS) 7444 newpde |= PG_U; 7445 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7446 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 7447 } 7448 7449 /* 7450 * Returns true if every page table entry in the specified page table page is 7451 * zero. 7452 */ 7453 static bool 7454 pmap_every_pte_zero(vm_paddr_t pa) 7455 { 7456 pt_entry_t *pt_end, *pte; 7457 7458 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7459 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7460 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7461 if (*pte != 0) 7462 return (false); 7463 } 7464 return (true); 7465 } 7466 7467 /* 7468 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7469 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 7470 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. Returns 7471 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB 7472 * page mapping already exists within the 2MB virtual address range starting 7473 * at the specified virtual address or (2) the requested 2MB page mapping is 7474 * not supported due to hardware errata. Returns KERN_NO_SPACE if 7475 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at 7476 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU 7477 * settings are not the same across the 2MB virtual address range starting at 7478 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either 7479 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation 7480 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation 7481 * failed. 7482 * 7483 * The parameter "m" is only used when creating a managed, writeable mapping. 7484 */ 7485 static int 7486 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7487 vm_page_t m, struct rwlock **lockp) 7488 { 7489 struct spglist free; 7490 pd_entry_t oldpde, *pde; 7491 pt_entry_t PG_G, PG_RW, PG_V; 7492 vm_page_t mt, pdpg; 7493 vm_page_t uwptpg; 7494 7495 PG_G = pmap_global_bit(pmap); 7496 PG_RW = pmap_rw_bit(pmap); 7497 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7498 ("pmap_enter_pde: newpde is missing PG_M")); 7499 PG_V = pmap_valid_bit(pmap); 7500 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7501 7502 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7503 newpde))) { 7504 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7505 " in pmap %p", va, pmap); 7506 return (KERN_FAILURE); 7507 } 7508 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7509 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7510 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7511 " in pmap %p", va, pmap); 7512 return (KERN_RESOURCE_SHORTAGE); 7513 } 7514 7515 /* 7516 * If pkru is not same for the whole pde range, return failure 7517 * and let vm_fault() cope. Check after pde allocation, since 7518 * it could sleep. 
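 * For example, if the two halves of the 2MB range carry different
 * protection keys, a single 2MB mapping could not honor both, so the
 * mapping is refused here and vm_fault() falls back to 4KB mappings.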
7519 */ 7520 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 7521 pmap_abort_ptp(pmap, va, pdpg); 7522 return (KERN_PROTECTION_FAILURE); 7523 } 7524 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 7525 newpde &= ~X86_PG_PKU_MASK; 7526 newpde |= pmap_pkru_get(pmap, va); 7527 } 7528 7529 /* 7530 * If there are existing mappings, either abort or remove them. 7531 */ 7532 oldpde = *pde; 7533 if ((oldpde & PG_V) != 0) { 7534 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7535 ("pmap_enter_pde: pdpg's reference count is too low")); 7536 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 7537 if ((oldpde & PG_PS) != 0) { 7538 if (pdpg != NULL) 7539 pdpg->ref_count--; 7540 CTR2(KTR_PMAP, 7541 "pmap_enter_pde: no space for va %#lx" 7542 " in pmap %p", va, pmap); 7543 return (KERN_NO_SPACE); 7544 } else if (va < VM_MAXUSER_ADDRESS || 7545 !pmap_every_pte_zero(oldpde & PG_FRAME)) { 7546 if (pdpg != NULL) 7547 pdpg->ref_count--; 7548 CTR2(KTR_PMAP, 7549 "pmap_enter_pde: failure for va %#lx" 7550 " in pmap %p", va, pmap); 7551 return (KERN_FAILURE); 7552 } 7553 } 7554 /* Break the existing mapping(s). */ 7555 SLIST_INIT(&free); 7556 if ((oldpde & PG_PS) != 0) { 7557 /* 7558 * The reference to the PD page that was acquired by 7559 * pmap_alloc_pde() ensures that it won't be freed. 7560 * However, if the PDE resulted from a promotion, then 7561 * a reserved PT page could be freed. 7562 */ 7563 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7564 if ((oldpde & PG_G) == 0) 7565 pmap_invalidate_pde_page(pmap, va, oldpde); 7566 } else { 7567 pmap_delayed_invl_start(); 7568 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7569 lockp)) 7570 pmap_invalidate_all(pmap); 7571 pmap_delayed_invl_finish(); 7572 } 7573 if (va < VM_MAXUSER_ADDRESS) { 7574 vm_page_free_pages_toq(&free, true); 7575 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7576 pde)); 7577 } else { 7578 KASSERT(SLIST_EMPTY(&free), 7579 ("pmap_enter_pde: freed kernel page table page")); 7580 7581 /* 7582 * Both pmap_remove_pde() and pmap_remove_ptes() will 7583 * leave the kernel page table page zero filled. 7584 */ 7585 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7586 if (pmap_insert_pt_page(pmap, mt, false, false)) 7587 panic("pmap_enter_pde: trie insert failed"); 7588 } 7589 } 7590 7591 /* 7592 * Allocate leaf ptpage for wired userspace pages. 7593 */ 7594 uwptpg = NULL; 7595 if ((newpde & PG_W) != 0 && pmap != kernel_pmap) { 7596 uwptpg = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 7597 VM_ALLOC_WIRED); 7598 if (uwptpg == NULL) 7599 return (KERN_RESOURCE_SHORTAGE); 7600 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 7601 pmap_free_pt_page(pmap, uwptpg, false); 7602 return (KERN_RESOURCE_SHORTAGE); 7603 } 7604 7605 uwptpg->ref_count = NPTEPG; 7606 } 7607 if ((newpde & PG_MANAGED) != 0) { 7608 /* 7609 * Abort this mapping if its PV entry could not be created. 7610 */ 7611 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7612 if (pdpg != NULL) 7613 pmap_abort_ptp(pmap, va, pdpg); 7614 if (uwptpg != NULL) { 7615 mt = pmap_remove_pt_page(pmap, va); 7616 KASSERT(mt == uwptpg, 7617 ("removed pt page %p, expected %p", mt, 7618 uwptpg)); 7619 uwptpg->ref_count = 1; 7620 pmap_free_pt_page(pmap, uwptpg, false); 7621 } 7622 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7623 " in pmap %p", va, pmap); 7624 return (KERN_RESOURCE_SHORTAGE); 7625 } 7626 if ((newpde & PG_RW) != 0) { 7627 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7628 vm_page_aflag_set(mt, PGA_WRITEABLE); 7629 } 7630 } 7631 7632 /* 7633 * Increment counters. 
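 * (With 4KB base pages, NBPDR / PAGE_SIZE is 512, i.e. the wired and
 * resident counts are maintained in units of base pages.)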
7634 */ 7635 if ((newpde & PG_W) != 0) 7636 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7637 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7638 7639 /* 7640 * Map the superpage. (This is not a promoted mapping; there will not 7641 * be any lingering 4KB page mappings in the TLB.) 7642 */ 7643 pde_store(pde, newpde); 7644 7645 counter_u64_add(pmap_pde_mappings, 1); 7646 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7647 va, pmap); 7648 return (KERN_SUCCESS); 7649 } 7650 7651 /* 7652 * Maps a sequence of resident pages belonging to the same object. 7653 * The sequence begins with the given page m_start. This page is 7654 * mapped at the given virtual address start. Each subsequent page is 7655 * mapped at a virtual address that is offset from start by the same 7656 * amount as the page is offset from m_start within the object. The 7657 * last page in the sequence is the page with the largest offset from 7658 * m_start that can be mapped at a virtual address less than the given 7659 * virtual address end. Not every virtual page between start and end 7660 * is mapped; only those for which a resident page exists with the 7661 * corresponding offset from m_start are mapped. 7662 */ 7663 void 7664 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7665 vm_page_t m_start, vm_prot_t prot) 7666 { 7667 struct rwlock *lock; 7668 vm_offset_t va; 7669 vm_page_t m, mpte; 7670 vm_pindex_t diff, psize; 7671 int rv; 7672 7673 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7674 7675 psize = atop(end - start); 7676 mpte = NULL; 7677 m = m_start; 7678 lock = NULL; 7679 PMAP_LOCK(pmap); 7680 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7681 va = start + ptoa(diff); 7682 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7683 m->psind == 1 && pmap_ps_enabled(pmap) && 7684 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 7685 KERN_SUCCESS || rv == KERN_NO_SPACE)) 7686 m = &m[NBPDR / PAGE_SIZE - 1]; 7687 else 7688 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7689 mpte, &lock); 7690 m = TAILQ_NEXT(m, listq); 7691 } 7692 if (lock != NULL) 7693 rw_wunlock(lock); 7694 PMAP_UNLOCK(pmap); 7695 } 7696 7697 /* 7698 * this code makes some *MAJOR* assumptions: 7699 * 1. Current pmap & pmap exists. 7700 * 2. Not wired. 7701 * 3. Read access. 7702 * 4. No page table pages. 7703 * but is *MUCH* faster than pmap_enter... 7704 */ 7705 7706 void 7707 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7708 { 7709 struct rwlock *lock; 7710 7711 lock = NULL; 7712 PMAP_LOCK(pmap); 7713 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7714 if (lock != NULL) 7715 rw_wunlock(lock); 7716 PMAP_UNLOCK(pmap); 7717 } 7718 7719 static vm_page_t 7720 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7721 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7722 { 7723 pd_entry_t *pde; 7724 pt_entry_t newpte, *pte, PG_V; 7725 7726 KASSERT(!VA_IS_CLEANMAP(va) || 7727 (m->oflags & VPO_UNMANAGED) != 0, 7728 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7729 PG_V = pmap_valid_bit(pmap); 7730 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7731 pde = NULL; 7732 7733 /* 7734 * In the case that a page table page is not 7735 * resident, we are creating it here. 
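 * The page table page found or allocated here is returned to the
 * caller, and pmap_enter_object() passes it back in as "mpte" on the
 * next call, so consecutive pages within the same 2MB region avoid
 * repeating the lookup.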
7736 */ 7737 if (va < VM_MAXUSER_ADDRESS) { 7738 pdp_entry_t *pdpe; 7739 vm_pindex_t ptepindex; 7740 7741 /* 7742 * Calculate pagetable page index 7743 */ 7744 ptepindex = pmap_pde_pindex(va); 7745 if (mpte && (mpte->pindex == ptepindex)) { 7746 mpte->ref_count++; 7747 } else { 7748 /* 7749 * If the page table page is mapped, we just increment 7750 * the hold count, and activate it. Otherwise, we 7751 * attempt to allocate a page table page, passing NULL 7752 * instead of the PV list lock pointer because we don't 7753 * intend to sleep. If this attempt fails, we don't 7754 * retry. Instead, we give up. 7755 */ 7756 pdpe = pmap_pdpe(pmap, va); 7757 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 7758 if ((*pdpe & PG_PS) != 0) 7759 return (NULL); 7760 pde = pmap_pdpe_to_pde(pdpe, va); 7761 if ((*pde & PG_V) != 0) { 7762 if ((*pde & PG_PS) != 0) 7763 return (NULL); 7764 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7765 mpte->ref_count++; 7766 } else { 7767 mpte = pmap_allocpte_alloc(pmap, 7768 ptepindex, NULL, va); 7769 if (mpte == NULL) 7770 return (NULL); 7771 } 7772 } else { 7773 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7774 NULL, va); 7775 if (mpte == NULL) 7776 return (NULL); 7777 } 7778 } 7779 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7780 pte = &pte[pmap_pte_index(va)]; 7781 } else { 7782 mpte = NULL; 7783 pte = vtopte(va); 7784 } 7785 if (*pte) { 7786 if (mpte != NULL) 7787 mpte->ref_count--; 7788 return (NULL); 7789 } 7790 7791 /* 7792 * Enter on the PV list if part of our managed memory. 7793 */ 7794 if ((m->oflags & VPO_UNMANAGED) == 0 && 7795 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7796 if (mpte != NULL) 7797 pmap_abort_ptp(pmap, va, mpte); 7798 return (NULL); 7799 } 7800 7801 /* 7802 * Increment counters 7803 */ 7804 pmap_resident_count_adj(pmap, 1); 7805 7806 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7807 pmap_cache_bits(pmap, m->md.pat_mode, 0); 7808 if ((m->oflags & VPO_UNMANAGED) == 0) 7809 newpte |= PG_MANAGED; 7810 if ((prot & VM_PROT_EXECUTE) == 0) 7811 newpte |= pg_nx; 7812 if (va < VM_MAXUSER_ADDRESS) 7813 newpte |= PG_U | pmap_pkru_get(pmap, va); 7814 pte_store(pte, newpte); 7815 7816 #if VM_NRESERVLEVEL > 0 7817 /* 7818 * If both the PTP and the reservation are fully populated, then 7819 * attempt promotion. 7820 */ 7821 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7822 (m->flags & PG_FICTITIOUS) == 0 && 7823 vm_reserv_level_iffullpop(m) == 0) { 7824 if (pde == NULL) 7825 pde = pmap_pde(pmap, va); 7826 7827 /* 7828 * If promotion succeeds, then the next call to this function 7829 * should not be given the unmapped PTP as a hint. 7830 */ 7831 if (pmap_promote_pde(pmap, pde, va, mpte, lockp)) 7832 mpte = NULL; 7833 } 7834 #endif 7835 7836 return (mpte); 7837 } 7838 7839 /* 7840 * Make a temporary mapping for a physical address. This is only intended 7841 * to be used for panic dumps. 7842 */ 7843 void * 7844 pmap_kenter_temporary(vm_paddr_t pa, int i) 7845 { 7846 vm_offset_t va; 7847 7848 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7849 pmap_kenter(va, pa); 7850 pmap_invlpg(kernel_pmap, va); 7851 return ((void *)crashdumpmap); 7852 } 7853 7854 /* 7855 * This code maps large physical mmap regions into the 7856 * processor address space. Note that some shortcuts 7857 * are taken, but the code works. 
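 * For example, a physically contiguous, 2MB-aligned device object (such
 * as a framebuffer) mapped at a 2MB-aligned address with a size that is
 * a multiple of 2MB is entered with 2MB mappings up front; any request
 * that fails these checks is silently skipped and left to be satisfied
 * by ordinary page faults.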
7858 */ 7859 void 7860 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7861 vm_pindex_t pindex, vm_size_t size) 7862 { 7863 pd_entry_t *pde; 7864 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7865 vm_paddr_t pa, ptepa; 7866 vm_page_t p, pdpg; 7867 int pat_mode; 7868 7869 PG_A = pmap_accessed_bit(pmap); 7870 PG_M = pmap_modified_bit(pmap); 7871 PG_V = pmap_valid_bit(pmap); 7872 PG_RW = pmap_rw_bit(pmap); 7873 7874 VM_OBJECT_ASSERT_WLOCKED(object); 7875 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7876 ("pmap_object_init_pt: non-device object")); 7877 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7878 if (!pmap_ps_enabled(pmap)) 7879 return; 7880 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7881 return; 7882 p = vm_page_lookup(object, pindex); 7883 KASSERT(vm_page_all_valid(p), 7884 ("pmap_object_init_pt: invalid page %p", p)); 7885 pat_mode = p->md.pat_mode; 7886 7887 /* 7888 * Abort the mapping if the first page is not physically 7889 * aligned to a 2MB page boundary. 7890 */ 7891 ptepa = VM_PAGE_TO_PHYS(p); 7892 if (ptepa & (NBPDR - 1)) 7893 return; 7894 7895 /* 7896 * Skip the first page. Abort the mapping if the rest of 7897 * the pages are not physically contiguous or have differing 7898 * memory attributes. 7899 */ 7900 p = TAILQ_NEXT(p, listq); 7901 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7902 pa += PAGE_SIZE) { 7903 KASSERT(vm_page_all_valid(p), 7904 ("pmap_object_init_pt: invalid page %p", p)); 7905 if (pa != VM_PAGE_TO_PHYS(p) || 7906 pat_mode != p->md.pat_mode) 7907 return; 7908 p = TAILQ_NEXT(p, listq); 7909 } 7910 7911 /* 7912 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7913 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7914 * will not affect the termination of this loop. 7915 */ 7916 PMAP_LOCK(pmap); 7917 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 7918 pa < ptepa + size; pa += NBPDR) { 7919 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7920 if (pde == NULL) { 7921 /* 7922 * The creation of mappings below is only an 7923 * optimization. If a page directory page 7924 * cannot be allocated without blocking, 7925 * continue on to the next mapping rather than 7926 * blocking. 7927 */ 7928 addr += NBPDR; 7929 continue; 7930 } 7931 if ((*pde & PG_V) == 0) { 7932 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7933 PG_U | PG_RW | PG_V); 7934 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7935 counter_u64_add(pmap_pde_mappings, 1); 7936 } else { 7937 /* Continue on if the PDE is already valid. */ 7938 pdpg->ref_count--; 7939 KASSERT(pdpg->ref_count > 0, 7940 ("pmap_object_init_pt: missing reference " 7941 "to page directory page, va: 0x%lx", addr)); 7942 } 7943 addr += NBPDR; 7944 } 7945 PMAP_UNLOCK(pmap); 7946 } 7947 } 7948 7949 /* 7950 * Clear the wired attribute from the mappings for the specified range of 7951 * addresses in the given pmap. Every valid mapping within that range 7952 * must have the wired attribute set. In contrast, invalid mappings 7953 * cannot have the wired attribute set, so they are ignored. 7954 * 7955 * The wired attribute of the page table entry is not a hardware 7956 * feature, so there is no need to invalidate any TLB entries. 7957 * Since pmap_demote_pde() for the wired entry must never fail, 7958 * pmap_delayed_invl_start()/finish() calls around the 7959 * function are not needed. 
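 * For example, unwiring only part of a wired 2MB mapping first demotes
 * it to 4KB mappings and then clears PG_W on the individual PTEs,
 * whereas unwiring the entire 2MB range simply clears PG_W in the PDE.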
7960 */ 7961 void 7962 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7963 { 7964 vm_offset_t va_next; 7965 pml4_entry_t *pml4e; 7966 pdp_entry_t *pdpe; 7967 pd_entry_t *pde; 7968 pt_entry_t *pte, PG_V, PG_G __diagused; 7969 7970 PG_V = pmap_valid_bit(pmap); 7971 PG_G = pmap_global_bit(pmap); 7972 PMAP_LOCK(pmap); 7973 for (; sva < eva; sva = va_next) { 7974 pml4e = pmap_pml4e(pmap, sva); 7975 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7976 va_next = (sva + NBPML4) & ~PML4MASK; 7977 if (va_next < sva) 7978 va_next = eva; 7979 continue; 7980 } 7981 7982 va_next = (sva + NBPDP) & ~PDPMASK; 7983 if (va_next < sva) 7984 va_next = eva; 7985 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7986 if ((*pdpe & PG_V) == 0) 7987 continue; 7988 if ((*pdpe & PG_PS) != 0) { 7989 KASSERT(va_next <= eva, 7990 ("partial update of non-transparent 1G mapping " 7991 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 7992 *pdpe, sva, eva, va_next)); 7993 MPASS(pmap != kernel_pmap); /* XXXKIB */ 7994 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 7995 atomic_clear_long(pdpe, PG_W); 7996 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 7997 continue; 7998 } 7999 8000 va_next = (sva + NBPDR) & ~PDRMASK; 8001 if (va_next < sva) 8002 va_next = eva; 8003 pde = pmap_pdpe_to_pde(pdpe, sva); 8004 if ((*pde & PG_V) == 0) 8005 continue; 8006 if ((*pde & PG_PS) != 0) { 8007 if ((*pde & PG_W) == 0) 8008 panic("pmap_unwire: pde %#jx is missing PG_W", 8009 (uintmax_t)*pde); 8010 8011 /* 8012 * Are we unwiring the entire large page? If not, 8013 * demote the mapping and fall through. 8014 */ 8015 if (sva + NBPDR == va_next && eva >= va_next) { 8016 atomic_clear_long(pde, PG_W); 8017 pmap->pm_stats.wired_count -= NBPDR / 8018 PAGE_SIZE; 8019 continue; 8020 } else if (!pmap_demote_pde(pmap, pde, sva)) 8021 panic("pmap_unwire: demotion failed"); 8022 } 8023 if (va_next > eva) 8024 va_next = eva; 8025 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 8026 sva += PAGE_SIZE) { 8027 if ((*pte & PG_V) == 0) 8028 continue; 8029 if ((*pte & PG_W) == 0) 8030 panic("pmap_unwire: pte %#jx is missing PG_W", 8031 (uintmax_t)*pte); 8032 8033 /* 8034 * PG_W must be cleared atomically. Although the pmap 8035 * lock synchronizes access to PG_W, another processor 8036 * could be setting PG_M and/or PG_A concurrently. 8037 */ 8038 atomic_clear_long(pte, PG_W); 8039 pmap->pm_stats.wired_count--; 8040 } 8041 } 8042 PMAP_UNLOCK(pmap); 8043 } 8044 8045 /* 8046 * Copy the range specified by src_addr/len 8047 * from the source map to the range dst_addr/len 8048 * in the destination map. 8049 * 8050 * This routine is only advisory and need not do anything. 8051 */ 8052 void 8053 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 8054 vm_offset_t src_addr) 8055 { 8056 struct rwlock *lock; 8057 pml4_entry_t *pml4e; 8058 pdp_entry_t *pdpe; 8059 pd_entry_t *pde, srcptepaddr; 8060 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 8061 vm_offset_t addr, end_addr, va_next; 8062 vm_page_t dst_pdpg, dstmpte, srcmpte; 8063 8064 if (dst_addr != src_addr) 8065 return; 8066 8067 if (dst_pmap->pm_type != src_pmap->pm_type) 8068 return; 8069 8070 /* 8071 * EPT page table entries that require emulation of A/D bits are 8072 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 8073 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 8074 * (aka EPT_PG_EXECUTE) could still be set. 
Since some EPT 8075 * implementations flag an EPT misconfiguration for exec-only 8076 * mappings we skip this function entirely for emulated pmaps. 8077 */ 8078 if (pmap_emulate_ad_bits(dst_pmap)) 8079 return; 8080 8081 end_addr = src_addr + len; 8082 lock = NULL; 8083 if (dst_pmap < src_pmap) { 8084 PMAP_LOCK(dst_pmap); 8085 PMAP_LOCK(src_pmap); 8086 } else { 8087 PMAP_LOCK(src_pmap); 8088 PMAP_LOCK(dst_pmap); 8089 } 8090 8091 PG_A = pmap_accessed_bit(dst_pmap); 8092 PG_M = pmap_modified_bit(dst_pmap); 8093 PG_V = pmap_valid_bit(dst_pmap); 8094 8095 for (addr = src_addr; addr < end_addr; addr = va_next) { 8096 KASSERT(addr < UPT_MIN_ADDRESS, 8097 ("pmap_copy: invalid to pmap_copy page tables")); 8098 8099 pml4e = pmap_pml4e(src_pmap, addr); 8100 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 8101 va_next = (addr + NBPML4) & ~PML4MASK; 8102 if (va_next < addr) 8103 va_next = end_addr; 8104 continue; 8105 } 8106 8107 va_next = (addr + NBPDP) & ~PDPMASK; 8108 if (va_next < addr) 8109 va_next = end_addr; 8110 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 8111 if ((*pdpe & PG_V) == 0) 8112 continue; 8113 if ((*pdpe & PG_PS) != 0) { 8114 KASSERT(va_next <= end_addr, 8115 ("partial update of non-transparent 1G mapping " 8116 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8117 *pdpe, addr, end_addr, va_next)); 8118 MPASS((addr & PDPMASK) == 0); 8119 MPASS((*pdpe & PG_MANAGED) == 0); 8120 srcptepaddr = *pdpe; 8121 pdpe = pmap_pdpe(dst_pmap, addr); 8122 if (pdpe == NULL) { 8123 if (pmap_allocpte_alloc(dst_pmap, 8124 pmap_pml4e_pindex(addr), NULL, addr) == 8125 NULL) 8126 break; 8127 pdpe = pmap_pdpe(dst_pmap, addr); 8128 } else { 8129 pml4e = pmap_pml4e(dst_pmap, addr); 8130 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 8131 dst_pdpg->ref_count++; 8132 } 8133 KASSERT(*pdpe == 0, 8134 ("1G mapping present in dst pmap " 8135 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8136 *pdpe, addr, end_addr, va_next)); 8137 *pdpe = srcptepaddr & ~PG_W; 8138 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 8139 continue; 8140 } 8141 8142 va_next = (addr + NBPDR) & ~PDRMASK; 8143 if (va_next < addr) 8144 va_next = end_addr; 8145 8146 pde = pmap_pdpe_to_pde(pdpe, addr); 8147 srcptepaddr = *pde; 8148 if (srcptepaddr == 0) 8149 continue; 8150 8151 if (srcptepaddr & PG_PS) { 8152 /* 8153 * We can only virtual copy whole superpages. 8154 */ 8155 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 8156 continue; 8157 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 8158 if (pde == NULL) 8159 break; 8160 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 8161 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 8162 PMAP_ENTER_NORECLAIM, &lock))) { 8163 /* 8164 * We leave the dirty bit unchanged because 8165 * managed read/write superpage mappings are 8166 * required to be dirty. However, managed 8167 * superpage mappings are not required to 8168 * have their accessed bit set, so we clear 8169 * it because we don't know if this mapping 8170 * will be used. 
8171 */ 8172 srcptepaddr &= ~PG_W; 8173 if ((srcptepaddr & PG_MANAGED) != 0) 8174 srcptepaddr &= ~PG_A; 8175 *pde = srcptepaddr; 8176 pmap_resident_count_adj(dst_pmap, NBPDR / 8177 PAGE_SIZE); 8178 counter_u64_add(pmap_pde_mappings, 1); 8179 } else 8180 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 8181 continue; 8182 } 8183 8184 srcptepaddr &= PG_FRAME; 8185 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 8186 KASSERT(srcmpte->ref_count > 0, 8187 ("pmap_copy: source page table page is unused")); 8188 8189 if (va_next > end_addr) 8190 va_next = end_addr; 8191 8192 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 8193 src_pte = &src_pte[pmap_pte_index(addr)]; 8194 dstmpte = NULL; 8195 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 8196 ptetemp = *src_pte; 8197 8198 /* 8199 * We only virtual copy managed pages. 8200 */ 8201 if ((ptetemp & PG_MANAGED) == 0) 8202 continue; 8203 8204 if (dstmpte != NULL) { 8205 KASSERT(dstmpte->pindex == 8206 pmap_pde_pindex(addr), 8207 ("dstmpte pindex/addr mismatch")); 8208 dstmpte->ref_count++; 8209 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 8210 NULL)) == NULL) 8211 goto out; 8212 dst_pte = (pt_entry_t *) 8213 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 8214 dst_pte = &dst_pte[pmap_pte_index(addr)]; 8215 if (*dst_pte == 0 && 8216 pmap_try_insert_pv_entry(dst_pmap, addr, 8217 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 8218 /* 8219 * Clear the wired, modified, and accessed 8220 * (referenced) bits during the copy. 8221 */ 8222 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 8223 pmap_resident_count_adj(dst_pmap, 1); 8224 } else { 8225 pmap_abort_ptp(dst_pmap, addr, dstmpte); 8226 goto out; 8227 } 8228 /* Have we copied all of the valid mappings? */ 8229 if (dstmpte->ref_count >= srcmpte->ref_count) 8230 break; 8231 } 8232 } 8233 out: 8234 if (lock != NULL) 8235 rw_wunlock(lock); 8236 PMAP_UNLOCK(src_pmap); 8237 PMAP_UNLOCK(dst_pmap); 8238 } 8239 8240 int 8241 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8242 { 8243 int error; 8244 8245 if (dst_pmap->pm_type != src_pmap->pm_type || 8246 dst_pmap->pm_type != PT_X86 || 8247 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8248 return (0); 8249 for (;;) { 8250 if (dst_pmap < src_pmap) { 8251 PMAP_LOCK(dst_pmap); 8252 PMAP_LOCK(src_pmap); 8253 } else { 8254 PMAP_LOCK(src_pmap); 8255 PMAP_LOCK(dst_pmap); 8256 } 8257 error = pmap_pkru_copy(dst_pmap, src_pmap); 8258 /* Clean up partial copy on failure due to no memory. */ 8259 if (error == ENOMEM) 8260 pmap_pkru_deassign_all(dst_pmap); 8261 PMAP_UNLOCK(src_pmap); 8262 PMAP_UNLOCK(dst_pmap); 8263 if (error != ENOMEM) 8264 break; 8265 vm_wait(NULL); 8266 } 8267 return (error); 8268 } 8269 8270 /* 8271 * Zero the specified hardware page. 8272 */ 8273 void 8274 pmap_zero_page(vm_page_t m) 8275 { 8276 vm_offset_t va; 8277 8278 #ifdef TSLOG_PAGEZERO 8279 TSENTER(); 8280 #endif 8281 va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8282 pagezero((void *)va); 8283 #ifdef TSLOG_PAGEZERO 8284 TSEXIT(); 8285 #endif 8286 } 8287 8288 /* 8289 * Zero an area within a single hardware page. off and size must not 8290 * cover an area beyond a single hardware page. 8291 */ 8292 void 8293 pmap_zero_page_area(vm_page_t m, int off, int size) 8294 { 8295 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8296 8297 if (off == 0 && size == PAGE_SIZE) 8298 pagezero((void *)va); 8299 else 8300 bzero((char *)va + off, size); 8301 } 8302 8303 /* 8304 * Copy 1 specified hardware page to another. 
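 * Both pages are addressed through the direct map, so no transient
 * kernel mappings need to be created or invalidated.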
8305 */ 8306 void 8307 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8308 { 8309 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8310 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8311 8312 pagecopy((void *)src, (void *)dst); 8313 } 8314 8315 int unmapped_buf_allowed = 1; 8316 8317 void 8318 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8319 vm_offset_t b_offset, int xfersize) 8320 { 8321 void *a_cp, *b_cp; 8322 vm_page_t pages[2]; 8323 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8324 int cnt; 8325 boolean_t mapped; 8326 8327 while (xfersize > 0) { 8328 a_pg_offset = a_offset & PAGE_MASK; 8329 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8330 b_pg_offset = b_offset & PAGE_MASK; 8331 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8332 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8333 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8334 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 8335 a_cp = (char *)vaddr[0] + a_pg_offset; 8336 b_cp = (char *)vaddr[1] + b_pg_offset; 8337 bcopy(a_cp, b_cp, cnt); 8338 if (__predict_false(mapped)) 8339 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 8340 a_offset += cnt; 8341 b_offset += cnt; 8342 xfersize -= cnt; 8343 } 8344 } 8345 8346 /* 8347 * Returns true if the pmap's pv is one of the first 8348 * 16 pvs linked to from this page. This count may 8349 * be changed upwards or downwards in the future; it 8350 * is only necessary that true be returned for a small 8351 * subset of pmaps for proper page aging. 8352 */ 8353 boolean_t 8354 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8355 { 8356 struct md_page *pvh; 8357 struct rwlock *lock; 8358 pv_entry_t pv; 8359 int loops = 0; 8360 boolean_t rv; 8361 8362 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8363 ("pmap_page_exists_quick: page %p is not managed", m)); 8364 rv = FALSE; 8365 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8366 rw_rlock(lock); 8367 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8368 if (PV_PMAP(pv) == pmap) { 8369 rv = TRUE; 8370 break; 8371 } 8372 loops++; 8373 if (loops >= 16) 8374 break; 8375 } 8376 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8377 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8378 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8379 if (PV_PMAP(pv) == pmap) { 8380 rv = TRUE; 8381 break; 8382 } 8383 loops++; 8384 if (loops >= 16) 8385 break; 8386 } 8387 } 8388 rw_runlock(lock); 8389 return (rv); 8390 } 8391 8392 /* 8393 * pmap_page_wired_mappings: 8394 * 8395 * Return the number of managed mappings to the given physical page 8396 * that are wired. 
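 * The try-lock and generation-count pattern below is how a pmap lock is
 * acquired while the PV list lock is already held: if the try-lock
 * fails, both locks are dropped and retaken in the safe order, and a
 * changed generation count restarts the scan.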
8397 */ 8398 int 8399 pmap_page_wired_mappings(vm_page_t m) 8400 { 8401 struct rwlock *lock; 8402 struct md_page *pvh; 8403 pmap_t pmap; 8404 pt_entry_t *pte; 8405 pv_entry_t pv; 8406 int count, md_gen, pvh_gen; 8407 8408 if ((m->oflags & VPO_UNMANAGED) != 0) 8409 return (0); 8410 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8411 rw_rlock(lock); 8412 restart: 8413 count = 0; 8414 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8415 pmap = PV_PMAP(pv); 8416 if (!PMAP_TRYLOCK(pmap)) { 8417 md_gen = m->md.pv_gen; 8418 rw_runlock(lock); 8419 PMAP_LOCK(pmap); 8420 rw_rlock(lock); 8421 if (md_gen != m->md.pv_gen) { 8422 PMAP_UNLOCK(pmap); 8423 goto restart; 8424 } 8425 } 8426 pte = pmap_pte(pmap, pv->pv_va); 8427 if ((*pte & PG_W) != 0) 8428 count++; 8429 PMAP_UNLOCK(pmap); 8430 } 8431 if ((m->flags & PG_FICTITIOUS) == 0) { 8432 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8433 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8434 pmap = PV_PMAP(pv); 8435 if (!PMAP_TRYLOCK(pmap)) { 8436 md_gen = m->md.pv_gen; 8437 pvh_gen = pvh->pv_gen; 8438 rw_runlock(lock); 8439 PMAP_LOCK(pmap); 8440 rw_rlock(lock); 8441 if (md_gen != m->md.pv_gen || 8442 pvh_gen != pvh->pv_gen) { 8443 PMAP_UNLOCK(pmap); 8444 goto restart; 8445 } 8446 } 8447 pte = pmap_pde(pmap, pv->pv_va); 8448 if ((*pte & PG_W) != 0) 8449 count++; 8450 PMAP_UNLOCK(pmap); 8451 } 8452 } 8453 rw_runlock(lock); 8454 return (count); 8455 } 8456 8457 /* 8458 * Returns TRUE if the given page is mapped individually or as part of 8459 * a 2mpage. Otherwise, returns FALSE. 8460 */ 8461 boolean_t 8462 pmap_page_is_mapped(vm_page_t m) 8463 { 8464 struct rwlock *lock; 8465 boolean_t rv; 8466 8467 if ((m->oflags & VPO_UNMANAGED) != 0) 8468 return (FALSE); 8469 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8470 rw_rlock(lock); 8471 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8472 ((m->flags & PG_FICTITIOUS) == 0 && 8473 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8474 rw_runlock(lock); 8475 return (rv); 8476 } 8477 8478 /* 8479 * Destroy all managed, non-wired mappings in the given user-space 8480 * pmap. This pmap cannot be active on any processor besides the 8481 * caller. 8482 * 8483 * This function cannot be applied to the kernel pmap. Moreover, it 8484 * is not intended for general use. It is only to be used during 8485 * process termination. Consequently, it can be implemented in ways 8486 * that make it faster than pmap_remove(). First, it can more quickly 8487 * destroy mappings by iterating over the pmap's collection of PV 8488 * entries, rather than searching the page table. Second, it doesn't 8489 * have to test and clear the page table entries atomically, because 8490 * no processor is currently accessing the user address space. In 8491 * particular, a page table entry's dirty bit won't change state once 8492 * this function starts. 8493 * 8494 * Although this function destroys all of the pmap's managed, 8495 * non-wired mappings, it can delay and batch the invalidation of TLB 8496 * entries without calling pmap_delayed_invl_start() and 8497 * pmap_delayed_invl_finish(). Because the pmap is not active on 8498 * any other processor, none of these TLB entries will ever be used 8499 * before their eventual invalidation. Consequently, there is no need 8500 * for either pmap_remove_all() or pmap_remove_write() to wait for 8501 * that eventual TLB invalidation. 
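 * The loop below walks each pv_chunk's allocated entries using its free
 * bitmap: for example, if pc_map[0] == ~3UL, then only entries 0 and 1
 * of that chunk are in use, bsfq() selects entry 0 first, and
 * idx = field * 64 + bit recovers the entry's index within the chunk.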
8502 */ 8503 void 8504 pmap_remove_pages(pmap_t pmap) 8505 { 8506 pd_entry_t ptepde; 8507 pt_entry_t *pte, tpte; 8508 pt_entry_t PG_M, PG_RW, PG_V; 8509 struct spglist free; 8510 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8511 vm_page_t m, mpte, mt; 8512 pv_entry_t pv; 8513 struct md_page *pvh; 8514 struct pv_chunk *pc, *npc; 8515 struct rwlock *lock; 8516 int64_t bit; 8517 uint64_t inuse, bitmask; 8518 int allfree, field, i, idx; 8519 #ifdef PV_STATS 8520 int freed; 8521 #endif 8522 boolean_t superpage; 8523 vm_paddr_t pa; 8524 8525 /* 8526 * Assert that the given pmap is only active on the current 8527 * CPU. Unfortunately, we cannot block another CPU from 8528 * activating the pmap while this function is executing. 8529 */ 8530 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8531 #ifdef INVARIANTS 8532 { 8533 cpuset_t other_cpus; 8534 8535 other_cpus = all_cpus; 8536 critical_enter(); 8537 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8538 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 8539 critical_exit(); 8540 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8541 } 8542 #endif 8543 8544 lock = NULL; 8545 PG_M = pmap_modified_bit(pmap); 8546 PG_V = pmap_valid_bit(pmap); 8547 PG_RW = pmap_rw_bit(pmap); 8548 8549 for (i = 0; i < PMAP_MEMDOM; i++) 8550 TAILQ_INIT(&free_chunks[i]); 8551 SLIST_INIT(&free); 8552 PMAP_LOCK(pmap); 8553 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8554 allfree = 1; 8555 #ifdef PV_STATS 8556 freed = 0; 8557 #endif 8558 for (field = 0; field < _NPCM; field++) { 8559 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8560 while (inuse != 0) { 8561 bit = bsfq(inuse); 8562 bitmask = 1UL << bit; 8563 idx = field * 64 + bit; 8564 pv = &pc->pc_pventry[idx]; 8565 inuse &= ~bitmask; 8566 8567 pte = pmap_pdpe(pmap, pv->pv_va); 8568 ptepde = *pte; 8569 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8570 tpte = *pte; 8571 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8572 superpage = FALSE; 8573 ptepde = tpte; 8574 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8575 PG_FRAME); 8576 pte = &pte[pmap_pte_index(pv->pv_va)]; 8577 tpte = *pte; 8578 } else { 8579 /* 8580 * Keep track whether 'tpte' is a 8581 * superpage explicitly instead of 8582 * relying on PG_PS being set. 8583 * 8584 * This is because PG_PS is numerically 8585 * identical to PG_PTE_PAT and thus a 8586 * regular page could be mistaken for 8587 * a superpage. 8588 */ 8589 superpage = TRUE; 8590 } 8591 8592 if ((tpte & PG_V) == 0) { 8593 panic("bad pte va %lx pte %lx", 8594 pv->pv_va, tpte); 8595 } 8596 8597 /* 8598 * We cannot remove wired pages from a process' mapping at this time 8599 */ 8600 if (tpte & PG_W) { 8601 allfree = 0; 8602 continue; 8603 } 8604 8605 /* Mark free */ 8606 pc->pc_map[field] |= bitmask; 8607 8608 /* 8609 * Because this pmap is not active on other 8610 * processors, the dirty bit cannot have 8611 * changed state since we last loaded pte. 8612 */ 8613 pte_clear(pte); 8614 8615 if (superpage) 8616 pa = tpte & PG_PS_FRAME; 8617 else 8618 pa = tpte & PG_FRAME; 8619 8620 m = PHYS_TO_VM_PAGE(pa); 8621 KASSERT(m->phys_addr == pa, 8622 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8623 m, (uintmax_t)m->phys_addr, 8624 (uintmax_t)tpte)); 8625 8626 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8627 m < &vm_page_array[vm_page_array_size], 8628 ("pmap_remove_pages: bad tpte %#jx", 8629 (uintmax_t)tpte)); 8630 8631 /* 8632 * Update the vm_page_t clean/reference bits. 
8633 */ 8634 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8635 if (superpage) { 8636 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8637 vm_page_dirty(mt); 8638 } else 8639 vm_page_dirty(m); 8640 } 8641 8642 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8643 8644 if (superpage) { 8645 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8646 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8647 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8648 pvh->pv_gen++; 8649 if (TAILQ_EMPTY(&pvh->pv_list)) { 8650 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8651 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8652 TAILQ_EMPTY(&mt->md.pv_list)) 8653 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8654 } 8655 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8656 if (mpte != NULL) { 8657 KASSERT(vm_page_any_valid(mpte), 8658 ("pmap_remove_pages: pte page not promoted")); 8659 pmap_pt_page_count_adj(pmap, -1); 8660 KASSERT(mpte->ref_count == NPTEPG, 8661 ("pmap_remove_pages: pte page reference count error")); 8662 mpte->ref_count = 0; 8663 pmap_add_delayed_free_list(mpte, &free, FALSE); 8664 } 8665 } else { 8666 pmap_resident_count_adj(pmap, -1); 8667 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8668 m->md.pv_gen++; 8669 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8670 TAILQ_EMPTY(&m->md.pv_list) && 8671 (m->flags & PG_FICTITIOUS) == 0) { 8672 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8673 if (TAILQ_EMPTY(&pvh->pv_list)) 8674 vm_page_aflag_clear(m, PGA_WRITEABLE); 8675 } 8676 } 8677 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8678 #ifdef PV_STATS 8679 freed++; 8680 #endif 8681 } 8682 } 8683 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8684 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8685 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8686 if (allfree) { 8687 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8688 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8689 } 8690 } 8691 if (lock != NULL) 8692 rw_wunlock(lock); 8693 pmap_invalidate_all(pmap); 8694 pmap_pkru_deassign_all(pmap); 8695 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8696 PMAP_UNLOCK(pmap); 8697 vm_page_free_pages_toq(&free, true); 8698 } 8699 8700 static boolean_t 8701 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 8702 { 8703 struct rwlock *lock; 8704 pv_entry_t pv; 8705 struct md_page *pvh; 8706 pt_entry_t *pte, mask; 8707 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8708 pmap_t pmap; 8709 int md_gen, pvh_gen; 8710 boolean_t rv; 8711 8712 rv = FALSE; 8713 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8714 rw_rlock(lock); 8715 restart: 8716 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8717 pmap = PV_PMAP(pv); 8718 if (!PMAP_TRYLOCK(pmap)) { 8719 md_gen = m->md.pv_gen; 8720 rw_runlock(lock); 8721 PMAP_LOCK(pmap); 8722 rw_rlock(lock); 8723 if (md_gen != m->md.pv_gen) { 8724 PMAP_UNLOCK(pmap); 8725 goto restart; 8726 } 8727 } 8728 pte = pmap_pte(pmap, pv->pv_va); 8729 mask = 0; 8730 if (modified) { 8731 PG_M = pmap_modified_bit(pmap); 8732 PG_RW = pmap_rw_bit(pmap); 8733 mask |= PG_RW | PG_M; 8734 } 8735 if (accessed) { 8736 PG_A = pmap_accessed_bit(pmap); 8737 PG_V = pmap_valid_bit(pmap); 8738 mask |= PG_V | PG_A; 8739 } 8740 rv = (*pte & mask) == mask; 8741 PMAP_UNLOCK(pmap); 8742 if (rv) 8743 goto out; 8744 } 8745 if ((m->flags & PG_FICTITIOUS) == 0) { 8746 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8747 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8748 pmap = PV_PMAP(pv); 8749 if (!PMAP_TRYLOCK(pmap)) { 8750 md_gen = m->md.pv_gen; 8751 pvh_gen = pvh->pv_gen; 8752 rw_runlock(lock); 8753 PMAP_LOCK(pmap); 8754 rw_rlock(lock); 8755 if 
(md_gen != m->md.pv_gen || 8756 pvh_gen != pvh->pv_gen) { 8757 PMAP_UNLOCK(pmap); 8758 goto restart; 8759 } 8760 } 8761 pte = pmap_pde(pmap, pv->pv_va); 8762 mask = 0; 8763 if (modified) { 8764 PG_M = pmap_modified_bit(pmap); 8765 PG_RW = pmap_rw_bit(pmap); 8766 mask |= PG_RW | PG_M; 8767 } 8768 if (accessed) { 8769 PG_A = pmap_accessed_bit(pmap); 8770 PG_V = pmap_valid_bit(pmap); 8771 mask |= PG_V | PG_A; 8772 } 8773 rv = (*pte & mask) == mask; 8774 PMAP_UNLOCK(pmap); 8775 if (rv) 8776 goto out; 8777 } 8778 } 8779 out: 8780 rw_runlock(lock); 8781 return (rv); 8782 } 8783 8784 /* 8785 * pmap_is_modified: 8786 * 8787 * Return whether or not the specified physical page was modified 8788 * in any physical maps. 8789 */ 8790 boolean_t 8791 pmap_is_modified(vm_page_t m) 8792 { 8793 8794 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8795 ("pmap_is_modified: page %p is not managed", m)); 8796 8797 /* 8798 * If the page is not busied then this check is racy. 8799 */ 8800 if (!pmap_page_is_write_mapped(m)) 8801 return (FALSE); 8802 return (pmap_page_test_mappings(m, FALSE, TRUE)); 8803 } 8804 8805 /* 8806 * pmap_is_prefaultable: 8807 * 8808 * Return whether or not the specified virtual address is eligible 8809 * for prefault. 8810 */ 8811 boolean_t 8812 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8813 { 8814 pd_entry_t *pde; 8815 pt_entry_t *pte, PG_V; 8816 boolean_t rv; 8817 8818 PG_V = pmap_valid_bit(pmap); 8819 8820 /* 8821 * Return TRUE if and only if the PTE for the specified virtual 8822 * address is allocated but invalid. 8823 */ 8824 rv = FALSE; 8825 PMAP_LOCK(pmap); 8826 pde = pmap_pde(pmap, addr); 8827 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8828 pte = pmap_pde_to_pte(pde, addr); 8829 rv = (*pte & PG_V) == 0; 8830 } 8831 PMAP_UNLOCK(pmap); 8832 return (rv); 8833 } 8834 8835 /* 8836 * pmap_is_referenced: 8837 * 8838 * Return whether or not the specified physical page was referenced 8839 * in any physical maps. 8840 */ 8841 boolean_t 8842 pmap_is_referenced(vm_page_t m) 8843 { 8844 8845 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8846 ("pmap_is_referenced: page %p is not managed", m)); 8847 return (pmap_page_test_mappings(m, TRUE, FALSE)); 8848 } 8849 8850 /* 8851 * Clear the write and modified bits in each of the given page's mappings. 8852 */ 8853 void 8854 pmap_remove_write(vm_page_t m) 8855 { 8856 struct md_page *pvh; 8857 pmap_t pmap; 8858 struct rwlock *lock; 8859 pv_entry_t next_pv, pv; 8860 pd_entry_t *pde; 8861 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8862 vm_offset_t va; 8863 int pvh_gen, md_gen; 8864 8865 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8866 ("pmap_remove_write: page %p is not managed", m)); 8867 8868 vm_page_assert_busied(m); 8869 if (!pmap_page_is_write_mapped(m)) 8870 return; 8871 8872 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8873 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8874 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8875 rw_wlock(lock); 8876 retry: 8877 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8878 pmap = PV_PMAP(pv); 8879 if (!PMAP_TRYLOCK(pmap)) { 8880 pvh_gen = pvh->pv_gen; 8881 rw_wunlock(lock); 8882 PMAP_LOCK(pmap); 8883 rw_wlock(lock); 8884 if (pvh_gen != pvh->pv_gen) { 8885 PMAP_UNLOCK(pmap); 8886 goto retry; 8887 } 8888 } 8889 PG_RW = pmap_rw_bit(pmap); 8890 va = pv->pv_va; 8891 pde = pmap_pde(pmap, va); 8892 if ((*pde & PG_RW) != 0) 8893 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8894 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8895 ("inconsistent pv lock %p %p for page %p", 8896 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8897 PMAP_UNLOCK(pmap); 8898 } 8899 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8900 pmap = PV_PMAP(pv); 8901 if (!PMAP_TRYLOCK(pmap)) { 8902 pvh_gen = pvh->pv_gen; 8903 md_gen = m->md.pv_gen; 8904 rw_wunlock(lock); 8905 PMAP_LOCK(pmap); 8906 rw_wlock(lock); 8907 if (pvh_gen != pvh->pv_gen || 8908 md_gen != m->md.pv_gen) { 8909 PMAP_UNLOCK(pmap); 8910 goto retry; 8911 } 8912 } 8913 PG_M = pmap_modified_bit(pmap); 8914 PG_RW = pmap_rw_bit(pmap); 8915 pde = pmap_pde(pmap, pv->pv_va); 8916 KASSERT((*pde & PG_PS) == 0, 8917 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8918 m)); 8919 pte = pmap_pde_to_pte(pde, pv->pv_va); 8920 oldpte = *pte; 8921 if (oldpte & PG_RW) { 8922 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8923 ~(PG_RW | PG_M))) 8924 cpu_spinwait(); 8925 if ((oldpte & PG_M) != 0) 8926 vm_page_dirty(m); 8927 pmap_invalidate_page(pmap, pv->pv_va); 8928 } 8929 PMAP_UNLOCK(pmap); 8930 } 8931 rw_wunlock(lock); 8932 vm_page_aflag_clear(m, PGA_WRITEABLE); 8933 pmap_delayed_invl_wait(m); 8934 } 8935 8936 /* 8937 * pmap_ts_referenced: 8938 * 8939 * Return a count of reference bits for a page, clearing those bits. 8940 * It is not necessary for every reference bit to be cleared, but it 8941 * is necessary that 0 only be returned when there are truly no 8942 * reference bits set. 8943 * 8944 * As an optimization, update the page's dirty field if a modified bit is 8945 * found while counting reference bits. This opportunistic update can be 8946 * performed at low cost and can eliminate the need for some future calls 8947 * to pmap_is_modified(). However, since this function stops after 8948 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8949 * dirty pages. Those dirty pages will only be detected by a future call 8950 * to pmap_is_modified(). 8951 * 8952 * A DI block is not needed within this function, because 8953 * invalidations are performed before the PV list lock is 8954 * released. 8955 */ 8956 int 8957 pmap_ts_referenced(vm_page_t m) 8958 { 8959 struct md_page *pvh; 8960 pv_entry_t pv, pvf; 8961 pmap_t pmap; 8962 struct rwlock *lock; 8963 pd_entry_t oldpde, *pde; 8964 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8965 vm_offset_t va; 8966 vm_paddr_t pa; 8967 int cleared, md_gen, not_cleared, pvh_gen; 8968 struct spglist free; 8969 boolean_t demoted; 8970 8971 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8972 ("pmap_ts_referenced: page %p is not managed", m)); 8973 SLIST_INIT(&free); 8974 cleared = 0; 8975 pa = VM_PAGE_TO_PHYS(m); 8976 lock = PHYS_TO_PV_LIST_LOCK(pa); 8977 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 8978 rw_wlock(lock); 8979 retry: 8980 not_cleared = 0; 8981 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 8982 goto small_mappings; 8983 pv = pvf; 8984 do { 8985 if (pvf == NULL) 8986 pvf = pv; 8987 pmap = PV_PMAP(pv); 8988 if (!PMAP_TRYLOCK(pmap)) { 8989 pvh_gen = pvh->pv_gen; 8990 rw_wunlock(lock); 8991 PMAP_LOCK(pmap); 8992 rw_wlock(lock); 8993 if (pvh_gen != pvh->pv_gen) { 8994 PMAP_UNLOCK(pmap); 8995 goto retry; 8996 } 8997 } 8998 PG_A = pmap_accessed_bit(pmap); 8999 PG_M = pmap_modified_bit(pmap); 9000 PG_RW = pmap_rw_bit(pmap); 9001 va = pv->pv_va; 9002 pde = pmap_pde(pmap, pv->pv_va); 9003 oldpde = *pde; 9004 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9005 /* 9006 * Although "oldpde" is mapping a 2MB page, because 9007 * this function is called at a 4KB page granularity, 9008 * we only update the 4KB page under test. 9009 */ 9010 vm_page_dirty(m); 9011 } 9012 if ((oldpde & PG_A) != 0) { 9013 /* 9014 * Since this reference bit is shared by 512 4KB 9015 * pages, it should not be cleared every time it is 9016 * tested. Apply a simple "hash" function on the 9017 * physical page number, the virtual superpage number, 9018 * and the pmap address to select one 4KB page out of 9019 * the 512 on which testing the reference bit will 9020 * result in clearing that reference bit. This 9021 * function is designed to avoid the selection of the 9022 * same 4KB page for every 2MB page mapping. 9023 * 9024 * On demotion, a mapping that hasn't been referenced 9025 * is simply destroyed. To avoid the possibility of a 9026 * subsequent page fault on a demoted wired mapping, 9027 * always leave its reference bit set. Moreover, 9028 * since the superpage is wired, the current state of 9029 * its reference bit won't affect page replacement. 9030 */ 9031 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 9032 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 9033 (oldpde & PG_W) == 0) { 9034 if (safe_to_clear_referenced(pmap, oldpde)) { 9035 atomic_clear_long(pde, PG_A); 9036 pmap_invalidate_page(pmap, pv->pv_va); 9037 demoted = FALSE; 9038 } else if (pmap_demote_pde_locked(pmap, pde, 9039 pv->pv_va, &lock)) { 9040 /* 9041 * Remove the mapping to a single page 9042 * so that a subsequent access may 9043 * repromote. Since the underlying 9044 * page table page is fully populated, 9045 * this removal never frees a page 9046 * table page. 9047 */ 9048 demoted = TRUE; 9049 va += VM_PAGE_TO_PHYS(m) - (oldpde & 9050 PG_PS_FRAME); 9051 pte = pmap_pde_to_pte(pde, va); 9052 pmap_remove_pte(pmap, pte, va, *pde, 9053 NULL, &lock); 9054 pmap_invalidate_page(pmap, va); 9055 } else 9056 demoted = TRUE; 9057 9058 if (demoted) { 9059 /* 9060 * The superpage mapping was removed 9061 * entirely and therefore 'pv' is no 9062 * longer valid. 9063 */ 9064 if (pvf == pv) 9065 pvf = NULL; 9066 pv = NULL; 9067 } 9068 cleared++; 9069 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9070 ("inconsistent pv lock %p %p for page %p", 9071 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9072 } else 9073 not_cleared++; 9074 } 9075 PMAP_UNLOCK(pmap); 9076 /* Rotate the PV list if it has more than one entry. 
*/ 9077 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9078 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 9079 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 9080 pvh->pv_gen++; 9081 } 9082 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 9083 goto out; 9084 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 9085 small_mappings: 9086 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 9087 goto out; 9088 pv = pvf; 9089 do { 9090 if (pvf == NULL) 9091 pvf = pv; 9092 pmap = PV_PMAP(pv); 9093 if (!PMAP_TRYLOCK(pmap)) { 9094 pvh_gen = pvh->pv_gen; 9095 md_gen = m->md.pv_gen; 9096 rw_wunlock(lock); 9097 PMAP_LOCK(pmap); 9098 rw_wlock(lock); 9099 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9100 PMAP_UNLOCK(pmap); 9101 goto retry; 9102 } 9103 } 9104 PG_A = pmap_accessed_bit(pmap); 9105 PG_M = pmap_modified_bit(pmap); 9106 PG_RW = pmap_rw_bit(pmap); 9107 pde = pmap_pde(pmap, pv->pv_va); 9108 KASSERT((*pde & PG_PS) == 0, 9109 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 9110 m)); 9111 pte = pmap_pde_to_pte(pde, pv->pv_va); 9112 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9113 vm_page_dirty(m); 9114 if ((*pte & PG_A) != 0) { 9115 if (safe_to_clear_referenced(pmap, *pte)) { 9116 atomic_clear_long(pte, PG_A); 9117 pmap_invalidate_page(pmap, pv->pv_va); 9118 cleared++; 9119 } else if ((*pte & PG_W) == 0) { 9120 /* 9121 * Wired pages cannot be paged out so 9122 * doing accessed bit emulation for 9123 * them is wasted effort. We do the 9124 * hard work for unwired pages only. 9125 */ 9126 pmap_remove_pte(pmap, pte, pv->pv_va, 9127 *pde, &free, &lock); 9128 pmap_invalidate_page(pmap, pv->pv_va); 9129 cleared++; 9130 if (pvf == pv) 9131 pvf = NULL; 9132 pv = NULL; 9133 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9134 ("inconsistent pv lock %p %p for page %p", 9135 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9136 } else 9137 not_cleared++; 9138 } 9139 PMAP_UNLOCK(pmap); 9140 /* Rotate the PV list if it has more than one entry. */ 9141 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9142 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 9143 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 9144 m->md.pv_gen++; 9145 } 9146 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 9147 not_cleared < PMAP_TS_REFERENCED_MAX); 9148 out: 9149 rw_wunlock(lock); 9150 vm_page_free_pages_toq(&free, true); 9151 return (cleared + not_cleared); 9152 } 9153 9154 /* 9155 * Apply the given advice to the specified range of addresses within the 9156 * given pmap. Depending on the advice, clear the referenced and/or 9157 * modified flags in each mapping and set the mapped page's dirty field. 9158 */ 9159 void 9160 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 9161 { 9162 struct rwlock *lock; 9163 pml4_entry_t *pml4e; 9164 pdp_entry_t *pdpe; 9165 pd_entry_t oldpde, *pde; 9166 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 9167 vm_offset_t va, va_next; 9168 vm_page_t m; 9169 bool anychanged; 9170 9171 if (advice != MADV_DONTNEED && advice != MADV_FREE) 9172 return; 9173 9174 /* 9175 * A/D bit emulation requires an alternate code path when clearing 9176 * the modified and accessed bits below. Since this function is 9177 * advisory in nature we skip it entirely for pmaps that require 9178 * A/D bit emulation. 
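 * Skipping them entirely is safe because MADV_DONTNEED and MADV_FREE
 * are only hints; leaving the accessed and modified bits untouched
 * costs some efficiency but never correctness.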
9179 */ 9180 if (pmap_emulate_ad_bits(pmap)) 9181 return; 9182 9183 PG_A = pmap_accessed_bit(pmap); 9184 PG_G = pmap_global_bit(pmap); 9185 PG_M = pmap_modified_bit(pmap); 9186 PG_V = pmap_valid_bit(pmap); 9187 PG_RW = pmap_rw_bit(pmap); 9188 anychanged = false; 9189 pmap_delayed_invl_start(); 9190 PMAP_LOCK(pmap); 9191 for (; sva < eva; sva = va_next) { 9192 pml4e = pmap_pml4e(pmap, sva); 9193 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 9194 va_next = (sva + NBPML4) & ~PML4MASK; 9195 if (va_next < sva) 9196 va_next = eva; 9197 continue; 9198 } 9199 9200 va_next = (sva + NBPDP) & ~PDPMASK; 9201 if (va_next < sva) 9202 va_next = eva; 9203 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 9204 if ((*pdpe & PG_V) == 0) 9205 continue; 9206 if ((*pdpe & PG_PS) != 0) 9207 continue; 9208 9209 va_next = (sva + NBPDR) & ~PDRMASK; 9210 if (va_next < sva) 9211 va_next = eva; 9212 pde = pmap_pdpe_to_pde(pdpe, sva); 9213 oldpde = *pde; 9214 if ((oldpde & PG_V) == 0) 9215 continue; 9216 else if ((oldpde & PG_PS) != 0) { 9217 if ((oldpde & PG_MANAGED) == 0) 9218 continue; 9219 lock = NULL; 9220 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9221 if (lock != NULL) 9222 rw_wunlock(lock); 9223 9224 /* 9225 * The large page mapping was destroyed. 9226 */ 9227 continue; 9228 } 9229 9230 /* 9231 * Unless the page mappings are wired, remove the 9232 * mapping to a single page so that a subsequent 9233 * access may repromote. Choosing the last page 9234 * within the address range [sva, min(va_next, eva)) 9235 * generally results in more repromotions. Since the 9236 * underlying page table page is fully populated, this 9237 * removal never frees a page table page. 9238 */ 9239 if ((oldpde & PG_W) == 0) { 9240 va = eva; 9241 if (va > va_next) 9242 va = va_next; 9243 va -= PAGE_SIZE; 9244 KASSERT(va >= sva, 9245 ("pmap_advise: no address gap")); 9246 pte = pmap_pde_to_pte(pde, va); 9247 KASSERT((*pte & PG_V) != 0, 9248 ("pmap_advise: invalid PTE")); 9249 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9250 &lock); 9251 anychanged = true; 9252 } 9253 if (lock != NULL) 9254 rw_wunlock(lock); 9255 } 9256 if (va_next > eva) 9257 va_next = eva; 9258 va = va_next; 9259 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9260 sva += PAGE_SIZE) { 9261 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9262 goto maybe_invlrng; 9263 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9264 if (advice == MADV_DONTNEED) { 9265 /* 9266 * Future calls to pmap_is_modified() 9267 * can be avoided by making the page 9268 * dirty now. 9269 */ 9270 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9271 vm_page_dirty(m); 9272 } 9273 atomic_clear_long(pte, PG_M | PG_A); 9274 } else if ((*pte & PG_A) != 0) 9275 atomic_clear_long(pte, PG_A); 9276 else 9277 goto maybe_invlrng; 9278 9279 if ((*pte & PG_G) != 0) { 9280 if (va == va_next) 9281 va = sva; 9282 } else 9283 anychanged = true; 9284 continue; 9285 maybe_invlrng: 9286 if (va != va_next) { 9287 pmap_invalidate_range(pmap, va, sva); 9288 va = va_next; 9289 } 9290 } 9291 if (va != va_next) 9292 pmap_invalidate_range(pmap, va, sva); 9293 } 9294 if (anychanged) 9295 pmap_invalidate_all(pmap); 9296 PMAP_UNLOCK(pmap); 9297 pmap_delayed_invl_finish(); 9298 } 9299 9300 /* 9301 * Clear the modify bits on the specified physical page. 
9302 */ 9303 void 9304 pmap_clear_modify(vm_page_t m) 9305 { 9306 struct md_page *pvh; 9307 pmap_t pmap; 9308 pv_entry_t next_pv, pv; 9309 pd_entry_t oldpde, *pde; 9310 pt_entry_t *pte, PG_M, PG_RW; 9311 struct rwlock *lock; 9312 vm_offset_t va; 9313 int md_gen, pvh_gen; 9314 9315 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9316 ("pmap_clear_modify: page %p is not managed", m)); 9317 vm_page_assert_busied(m); 9318 9319 if (!pmap_page_is_write_mapped(m)) 9320 return; 9321 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9322 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9323 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9324 rw_wlock(lock); 9325 restart: 9326 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9327 pmap = PV_PMAP(pv); 9328 if (!PMAP_TRYLOCK(pmap)) { 9329 pvh_gen = pvh->pv_gen; 9330 rw_wunlock(lock); 9331 PMAP_LOCK(pmap); 9332 rw_wlock(lock); 9333 if (pvh_gen != pvh->pv_gen) { 9334 PMAP_UNLOCK(pmap); 9335 goto restart; 9336 } 9337 } 9338 PG_M = pmap_modified_bit(pmap); 9339 PG_RW = pmap_rw_bit(pmap); 9340 va = pv->pv_va; 9341 pde = pmap_pde(pmap, va); 9342 oldpde = *pde; 9343 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9344 if ((oldpde & PG_RW) != 0 && 9345 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9346 (oldpde & PG_W) == 0) { 9347 /* 9348 * Write protect the mapping to a single page so that 9349 * a subsequent write access may repromote. 9350 */ 9351 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9352 pte = pmap_pde_to_pte(pde, va); 9353 atomic_clear_long(pte, PG_M | PG_RW); 9354 vm_page_dirty(m); 9355 pmap_invalidate_page(pmap, va); 9356 } 9357 PMAP_UNLOCK(pmap); 9358 } 9359 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9360 pmap = PV_PMAP(pv); 9361 if (!PMAP_TRYLOCK(pmap)) { 9362 md_gen = m->md.pv_gen; 9363 pvh_gen = pvh->pv_gen; 9364 rw_wunlock(lock); 9365 PMAP_LOCK(pmap); 9366 rw_wlock(lock); 9367 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9368 PMAP_UNLOCK(pmap); 9369 goto restart; 9370 } 9371 } 9372 PG_M = pmap_modified_bit(pmap); 9373 PG_RW = pmap_rw_bit(pmap); 9374 pde = pmap_pde(pmap, pv->pv_va); 9375 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9376 " a 2mpage in page %p's pv list", m)); 9377 pte = pmap_pde_to_pte(pde, pv->pv_va); 9378 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9379 atomic_clear_long(pte, PG_M); 9380 pmap_invalidate_page(pmap, pv->pv_va); 9381 } 9382 PMAP_UNLOCK(pmap); 9383 } 9384 rw_wunlock(lock); 9385 } 9386 9387 /* 9388 * Miscellaneous support routines follow 9389 */ 9390 9391 /* Adjust the properties for a leaf page table entry. */ 9392 static __inline void 9393 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9394 { 9395 u_long opte, npte; 9396 9397 opte = *(u_long *)pte; 9398 do { 9399 npte = opte & ~mask; 9400 npte |= bits; 9401 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9402 npte)); 9403 } 9404 9405 /* 9406 * Map a set of physical memory pages into the kernel virtual 9407 * address space. Return a pointer to where it is mapped. This 9408 * routine is intended to be used for mapping device memory, 9409 * NOT real memory. 
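 *
 * For illustration only, a driver typically maps a register window
 * through the pmap_mapdev() wrapper below and releases it with
 * pmap_unmapdev():
 *
 *	regs = pmap_mapdev(bar_pa, bar_size);
 *	...
 *	pmap_unmapdev(regs, bar_size);
 *
 * where "regs", "bar_pa", and "bar_size" are placeholders for the
 * caller's own pointer, physical address, and length.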
9410 */ 9411 static void * 9412 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9413 { 9414 struct pmap_preinit_mapping *ppim; 9415 vm_offset_t va, offset; 9416 vm_size_t tmpsize; 9417 int i; 9418 9419 offset = pa & PAGE_MASK; 9420 size = round_page(offset + size); 9421 pa = trunc_page(pa); 9422 9423 if (!pmap_initialized) { 9424 va = 0; 9425 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9426 ppim = pmap_preinit_mapping + i; 9427 if (ppim->va == 0) { 9428 ppim->pa = pa; 9429 ppim->sz = size; 9430 ppim->mode = mode; 9431 ppim->va = virtual_avail; 9432 virtual_avail += size; 9433 va = ppim->va; 9434 break; 9435 } 9436 } 9437 if (va == 0) 9438 panic("%s: too many preinit mappings", __func__); 9439 } else { 9440 /* 9441 * If we have a preinit mapping, re-use it. 9442 */ 9443 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9444 ppim = pmap_preinit_mapping + i; 9445 if (ppim->pa == pa && ppim->sz == size && 9446 (ppim->mode == mode || 9447 (flags & MAPDEV_SETATTR) == 0)) 9448 return ((void *)(ppim->va + offset)); 9449 } 9450 /* 9451 * If the specified range of physical addresses fits within 9452 * the direct map window, use the direct map. 9453 */ 9454 if (pa < dmaplimit && pa + size <= dmaplimit) { 9455 va = PHYS_TO_DMAP(pa); 9456 if ((flags & MAPDEV_SETATTR) != 0) { 9457 PMAP_LOCK(kernel_pmap); 9458 i = pmap_change_props_locked(va, size, 9459 PROT_NONE, mode, flags); 9460 PMAP_UNLOCK(kernel_pmap); 9461 } else 9462 i = 0; 9463 if (!i) 9464 return ((void *)(va + offset)); 9465 } 9466 va = kva_alloc(size); 9467 if (va == 0) 9468 panic("%s: Couldn't allocate KVA", __func__); 9469 } 9470 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9471 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9472 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9473 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9474 pmap_invalidate_cache_range(va, va + tmpsize); 9475 return ((void *)(va + offset)); 9476 } 9477 9478 void * 9479 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9480 { 9481 9482 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9483 MAPDEV_SETATTR)); 9484 } 9485 9486 void * 9487 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9488 { 9489 9490 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9491 } 9492 9493 void * 9494 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9495 { 9496 9497 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9498 MAPDEV_SETATTR)); 9499 } 9500 9501 void * 9502 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9503 { 9504 9505 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9506 MAPDEV_FLUSHCACHE)); 9507 } 9508 9509 void 9510 pmap_unmapdev(void *p, vm_size_t size) 9511 { 9512 struct pmap_preinit_mapping *ppim; 9513 vm_offset_t offset, va; 9514 int i; 9515 9516 va = (vm_offset_t)p; 9517 9518 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9519 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9520 return; 9521 offset = va & PAGE_MASK; 9522 size = round_page(offset + size); 9523 va = trunc_page(va); 9524 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9525 ppim = pmap_preinit_mapping + i; 9526 if (ppim->va == va && ppim->sz == size) { 9527 if (pmap_initialized) 9528 return; 9529 ppim->pa = 0; 9530 ppim->va = 0; 9531 ppim->sz = 0; 9532 ppim->mode = 0; 9533 if (va + size == virtual_avail) 9534 virtual_avail = va; 9535 return; 9536 } 9537 } 9538 if (pmap_initialized) { 9539 pmap_qremove(va, atop(size)); 9540 kva_free(va, size); 9541 } 9542 } 9543 9544 /* 9545 * Tries to demote a 1GB page 
mapping. 9546 */ 9547 static boolean_t 9548 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9549 { 9550 pdp_entry_t newpdpe, oldpdpe; 9551 pd_entry_t *firstpde, newpde, *pde; 9552 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9553 vm_paddr_t pdpgpa; 9554 vm_page_t pdpg; 9555 9556 PG_A = pmap_accessed_bit(pmap); 9557 PG_M = pmap_modified_bit(pmap); 9558 PG_V = pmap_valid_bit(pmap); 9559 PG_RW = pmap_rw_bit(pmap); 9560 9561 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9562 oldpdpe = *pdpe; 9563 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9564 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9565 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9566 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9567 if (pdpg == NULL) { 9568 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9569 " in pmap %p", va, pmap); 9570 return (FALSE); 9571 } 9572 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9573 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9574 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9575 KASSERT((oldpdpe & PG_A) != 0, 9576 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9577 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9578 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9579 newpde = oldpdpe; 9580 9581 /* 9582 * Initialize the page directory page. 9583 */ 9584 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9585 *pde = newpde; 9586 newpde += NBPDR; 9587 } 9588 9589 /* 9590 * Demote the mapping. 9591 */ 9592 *pdpe = newpdpe; 9593 9594 /* 9595 * Invalidate a stale recursive mapping of the page directory page. 9596 */ 9597 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9598 9599 counter_u64_add(pmap_pdpe_demotions, 1); 9600 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9601 " in pmap %p", va, pmap); 9602 return (TRUE); 9603 } 9604 9605 /* 9606 * Sets the memory attribute for the specified page. 9607 */ 9608 void 9609 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9610 { 9611 9612 m->md.pat_mode = ma; 9613 9614 /* 9615 * If "m" is a normal page, update its direct mapping. This update 9616 * can be relied upon to perform any cache operations that are 9617 * required for data coherence. 9618 */ 9619 if ((m->flags & PG_FICTITIOUS) == 0 && 9620 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9621 m->md.pat_mode)) 9622 panic("memory attribute change on the direct map failed"); 9623 } 9624 9625 void 9626 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9627 { 9628 int error; 9629 9630 m->md.pat_mode = ma; 9631 9632 if ((m->flags & PG_FICTITIOUS) != 0) 9633 return; 9634 PMAP_LOCK(kernel_pmap); 9635 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9636 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9637 PMAP_UNLOCK(kernel_pmap); 9638 if (error != 0) 9639 panic("memory attribute change on the direct map failed"); 9640 } 9641 9642 /* 9643 * Changes the specified virtual address range's memory type to that given by 9644 * the parameter "mode". The specified virtual address range must be 9645 * completely contained within either the direct map or the kernel map. If 9646 * the virtual address range is contained within the kernel map, then the 9647 * memory type for each of the corresponding ranges of the direct map is also 9648 * changed. (The corresponding ranges of the direct map are those ranges that 9649 * map the same physical pages as the specified virtual address range.) 
These 9650 * changes to the direct map are necessary because Intel describes the 9651 * behavior of their processors as "undefined" if two or more mappings to the 9652 * same physical page have different memory types. 9653 * 9654 * Returns zero if the change completed successfully, and either EINVAL or 9655 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9656 * of the virtual address range was not mapped, and ENOMEM is returned if 9657 * there was insufficient memory available to complete the change. In the 9658 * latter case, the memory type may have been changed on some part of the 9659 * virtual address range or the direct map. 9660 */ 9661 int 9662 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9663 { 9664 int error; 9665 9666 PMAP_LOCK(kernel_pmap); 9667 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9668 MAPDEV_FLUSHCACHE); 9669 PMAP_UNLOCK(kernel_pmap); 9670 return (error); 9671 } 9672 9673 /* 9674 * Changes the specified virtual address range's protections to those 9675 * specified by "prot". Like pmap_change_attr(), protections for aliases 9676 * in the direct map are updated as well. Protections on aliasing mappings may 9677 * be a subset of the requested protections; for example, mappings in the direct 9678 * map are never executable. 9679 */ 9680 int 9681 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9682 { 9683 int error; 9684 9685 /* Only supported within the kernel map. */ 9686 if (va < VM_MIN_KERNEL_ADDRESS) 9687 return (EINVAL); 9688 9689 PMAP_LOCK(kernel_pmap); 9690 error = pmap_change_props_locked(va, size, prot, -1, 9691 MAPDEV_ASSERTVALID); 9692 PMAP_UNLOCK(kernel_pmap); 9693 return (error); 9694 } 9695 9696 static int 9697 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9698 int mode, int flags) 9699 { 9700 vm_offset_t base, offset, tmpva; 9701 vm_paddr_t pa_start, pa_end, pa_end1; 9702 pdp_entry_t *pdpe; 9703 pd_entry_t *pde, pde_bits, pde_mask; 9704 pt_entry_t *pte, pte_bits, pte_mask; 9705 int error; 9706 bool changed; 9707 9708 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9709 base = trunc_page(va); 9710 offset = va & PAGE_MASK; 9711 size = round_page(offset + size); 9712 9713 /* 9714 * Only supported on kernel virtual addresses, including the direct 9715 * map but excluding the recursive map. 9716 */ 9717 if (base < DMAP_MIN_ADDRESS) 9718 return (EINVAL); 9719 9720 /* 9721 * Construct our flag sets and masks. "bits" is the subset of 9722 * "mask" that will be set in each modified PTE. 9723 * 9724 * Mappings in the direct map are never allowed to be executable. 9725 */ 9726 pde_bits = pte_bits = 0; 9727 pde_mask = pte_mask = 0; 9728 if (mode != -1) { 9729 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9730 pde_mask |= X86_PG_PDE_CACHE; 9731 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9732 pte_mask |= X86_PG_PTE_CACHE; 9733 } 9734 if (prot != VM_PROT_NONE) { 9735 if ((prot & VM_PROT_WRITE) != 0) { 9736 pde_bits |= X86_PG_RW; 9737 pte_bits |= X86_PG_RW; 9738 } 9739 if ((prot & VM_PROT_EXECUTE) == 0 || 9740 va < VM_MIN_KERNEL_ADDRESS) { 9741 pde_bits |= pg_nx; 9742 pte_bits |= pg_nx; 9743 } 9744 pde_mask |= X86_PG_RW | pg_nx; 9745 pte_mask |= X86_PG_RW | pg_nx; 9746 } 9747 9748 /* 9749 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9750 * into 4KB pages if required. 
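 * Similarly, 1GB pages are broken down into 2MB pages when the
 * requested range does not cover an entire, properly aligned 1GB
 * mapping.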
9751 */ 9752 for (tmpva = base; tmpva < base + size; ) { 9753 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9754 if (pdpe == NULL || *pdpe == 0) { 9755 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9756 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9757 return (EINVAL); 9758 } 9759 if (*pdpe & PG_PS) { 9760 /* 9761 * If the current 1GB page already has the required 9762 * properties, then we need not demote this page. Just 9763 * increment tmpva to the next 1GB page frame. 9764 */ 9765 if ((*pdpe & pde_mask) == pde_bits) { 9766 tmpva = trunc_1gpage(tmpva) + NBPDP; 9767 continue; 9768 } 9769 9770 /* 9771 * If the current offset aligns with a 1GB page frame 9772 * and there is at least 1GB left within the range, then 9773 * we need not break down this page into 2MB pages. 9774 */ 9775 if ((tmpva & PDPMASK) == 0 && 9776 tmpva + PDPMASK < base + size) { 9777 tmpva += NBPDP; 9778 continue; 9779 } 9780 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9781 return (ENOMEM); 9782 } 9783 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9784 if (*pde == 0) { 9785 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9786 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9787 return (EINVAL); 9788 } 9789 if (*pde & PG_PS) { 9790 /* 9791 * If the current 2MB page already has the required 9792 * properties, then we need not demote this page. Just 9793 * increment tmpva to the next 2MB page frame. 9794 */ 9795 if ((*pde & pde_mask) == pde_bits) { 9796 tmpva = trunc_2mpage(tmpva) + NBPDR; 9797 continue; 9798 } 9799 9800 /* 9801 * If the current offset aligns with a 2MB page frame 9802 * and there is at least 2MB left within the range, then 9803 * we need not break down this page into 4KB pages. 9804 */ 9805 if ((tmpva & PDRMASK) == 0 && 9806 tmpva + PDRMASK < base + size) { 9807 tmpva += NBPDR; 9808 continue; 9809 } 9810 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9811 return (ENOMEM); 9812 } 9813 pte = pmap_pde_to_pte(pde, tmpva); 9814 if (*pte == 0) { 9815 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9816 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9817 return (EINVAL); 9818 } 9819 tmpva += PAGE_SIZE; 9820 } 9821 error = 0; 9822 9823 /* 9824 * Ok, all the pages exist, so run through them updating their 9825 * properties if required. 9826 */ 9827 changed = false; 9828 pa_start = pa_end = 0; 9829 for (tmpva = base; tmpva < base + size; ) { 9830 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9831 if (*pdpe & PG_PS) { 9832 if ((*pdpe & pde_mask) != pde_bits) { 9833 pmap_pte_props(pdpe, pde_bits, pde_mask); 9834 changed = true; 9835 } 9836 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9837 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9838 if (pa_start == pa_end) { 9839 /* Start physical address run. */ 9840 pa_start = *pdpe & PG_PS_FRAME; 9841 pa_end = pa_start + NBPDP; 9842 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9843 pa_end += NBPDP; 9844 else { 9845 /* Run ended, update direct map. */ 9846 error = pmap_change_props_locked( 9847 PHYS_TO_DMAP(pa_start), 9848 pa_end - pa_start, prot, mode, 9849 flags); 9850 if (error != 0) 9851 break; 9852 /* Start physical address run. */ 9853 pa_start = *pdpe & PG_PS_FRAME; 9854 pa_end = pa_start + NBPDP; 9855 } 9856 } 9857 tmpva = trunc_1gpage(tmpva) + NBPDP; 9858 continue; 9859 } 9860 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9861 if (*pde & PG_PS) { 9862 if ((*pde & pde_mask) != pde_bits) { 9863 pmap_pte_props(pde, pde_bits, pde_mask); 9864 changed = true; 9865 } 9866 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9867 (*pde & PG_PS_FRAME) < dmaplimit) { 9868 if (pa_start == pa_end) { 9869 /* Start physical address run. 
*/ 9870 pa_start = *pde & PG_PS_FRAME; 9871 pa_end = pa_start + NBPDR; 9872 } else if (pa_end == (*pde & PG_PS_FRAME)) 9873 pa_end += NBPDR; 9874 else { 9875 /* Run ended, update direct map. */ 9876 error = pmap_change_props_locked( 9877 PHYS_TO_DMAP(pa_start), 9878 pa_end - pa_start, prot, mode, 9879 flags); 9880 if (error != 0) 9881 break; 9882 /* Start physical address run. */ 9883 pa_start = *pde & PG_PS_FRAME; 9884 pa_end = pa_start + NBPDR; 9885 } 9886 } 9887 tmpva = trunc_2mpage(tmpva) + NBPDR; 9888 } else { 9889 pte = pmap_pde_to_pte(pde, tmpva); 9890 if ((*pte & pte_mask) != pte_bits) { 9891 pmap_pte_props(pte, pte_bits, pte_mask); 9892 changed = true; 9893 } 9894 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9895 (*pte & PG_FRAME) < dmaplimit) { 9896 if (pa_start == pa_end) { 9897 /* Start physical address run. */ 9898 pa_start = *pte & PG_FRAME; 9899 pa_end = pa_start + PAGE_SIZE; 9900 } else if (pa_end == (*pte & PG_FRAME)) 9901 pa_end += PAGE_SIZE; 9902 else { 9903 /* Run ended, update direct map. */ 9904 error = pmap_change_props_locked( 9905 PHYS_TO_DMAP(pa_start), 9906 pa_end - pa_start, prot, mode, 9907 flags); 9908 if (error != 0) 9909 break; 9910 /* Start physical address run. */ 9911 pa_start = *pte & PG_FRAME; 9912 pa_end = pa_start + PAGE_SIZE; 9913 } 9914 } 9915 tmpva += PAGE_SIZE; 9916 } 9917 } 9918 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9919 pa_end1 = MIN(pa_end, dmaplimit); 9920 if (pa_start != pa_end1) 9921 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9922 pa_end1 - pa_start, prot, mode, flags); 9923 } 9924 9925 /* 9926 * Flush CPU caches if required to make sure any data isn't cached that 9927 * shouldn't be, etc. 9928 */ 9929 if (changed) { 9930 pmap_invalidate_range(kernel_pmap, base, tmpva); 9931 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9932 pmap_invalidate_cache_range(base, tmpva); 9933 } 9934 return (error); 9935 } 9936 9937 /* 9938 * Demotes any mapping within the direct map region that covers more than the 9939 * specified range of physical addresses. This range's size must be a power 9940 * of two and its starting address must be a multiple of its size. Since the 9941 * demotion does not change any attributes of the mapping, a TLB invalidation 9942 * is not mandatory. The caller may, however, request a TLB invalidation. 
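 * Demotion is performed only when the given range is smaller than the
 * existing mapping; a request covering a whole, aligned 1GB or 2MB
 * mapping leaves that mapping intact.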
9943 */ 9944 void 9945 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 9946 { 9947 pdp_entry_t *pdpe; 9948 pd_entry_t *pde; 9949 vm_offset_t va; 9950 boolean_t changed; 9951 9952 if (len == 0) 9953 return; 9954 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9955 KASSERT((base & (len - 1)) == 0, 9956 ("pmap_demote_DMAP: base is not a multiple of len")); 9957 if (len < NBPDP && base < dmaplimit) { 9958 va = PHYS_TO_DMAP(base); 9959 changed = FALSE; 9960 PMAP_LOCK(kernel_pmap); 9961 pdpe = pmap_pdpe(kernel_pmap, va); 9962 if ((*pdpe & X86_PG_V) == 0) 9963 panic("pmap_demote_DMAP: invalid PDPE"); 9964 if ((*pdpe & PG_PS) != 0) { 9965 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 9966 panic("pmap_demote_DMAP: PDPE failed"); 9967 changed = TRUE; 9968 } 9969 if (len < NBPDR) { 9970 pde = pmap_pdpe_to_pde(pdpe, va); 9971 if ((*pde & X86_PG_V) == 0) 9972 panic("pmap_demote_DMAP: invalid PDE"); 9973 if ((*pde & PG_PS) != 0) { 9974 if (!pmap_demote_pde(kernel_pmap, pde, va)) 9975 panic("pmap_demote_DMAP: PDE failed"); 9976 changed = TRUE; 9977 } 9978 } 9979 if (changed && invalidate) 9980 pmap_invalidate_page(kernel_pmap, va); 9981 PMAP_UNLOCK(kernel_pmap); 9982 } 9983 } 9984 9985 /* 9986 * Perform the pmap work for mincore(2). If the page is not both referenced and 9987 * modified by this pmap, returns its physical address so that the caller can 9988 * find other mappings. 9989 */ 9990 int 9991 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9992 { 9993 pdp_entry_t *pdpe; 9994 pd_entry_t *pdep; 9995 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 9996 vm_paddr_t pa; 9997 int val; 9998 9999 PG_A = pmap_accessed_bit(pmap); 10000 PG_M = pmap_modified_bit(pmap); 10001 PG_V = pmap_valid_bit(pmap); 10002 PG_RW = pmap_rw_bit(pmap); 10003 10004 PMAP_LOCK(pmap); 10005 pte = 0; 10006 pa = 0; 10007 val = 0; 10008 pdpe = pmap_pdpe(pmap, addr); 10009 if (pdpe == NULL) 10010 goto out; 10011 if ((*pdpe & PG_V) != 0) { 10012 if ((*pdpe & PG_PS) != 0) { 10013 pte = *pdpe; 10014 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 10015 PG_FRAME; 10016 val = MINCORE_PSIND(2); 10017 } else { 10018 pdep = pmap_pde(pmap, addr); 10019 if (pdep != NULL && (*pdep & PG_V) != 0) { 10020 if ((*pdep & PG_PS) != 0) { 10021 pte = *pdep; 10022 /* Compute the physical address of the 4KB page. */ 10023 pa = ((pte & PG_PS_FRAME) | (addr & 10024 PDRMASK)) & PG_FRAME; 10025 val = MINCORE_PSIND(1); 10026 } else { 10027 pte = *pmap_pde_to_pte(pdep, addr); 10028 pa = pte & PG_FRAME; 10029 val = 0; 10030 } 10031 } 10032 } 10033 } 10034 if ((pte & PG_V) != 0) { 10035 val |= MINCORE_INCORE; 10036 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 10037 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 10038 if ((pte & PG_A) != 0) 10039 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 10040 } 10041 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 10042 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 10043 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 10044 *pap = pa; 10045 } 10046 out: 10047 PMAP_UNLOCK(pmap); 10048 return (val); 10049 } 10050 10051 static uint64_t 10052 pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp) 10053 { 10054 uint32_t gen, new_gen, pcid_next; 10055 10056 CRITICAL_ASSERT(curthread); 10057 gen = PCPU_GET(pcid_gen); 10058 if (pcidp->pm_pcid == PMAP_PCID_KERN) 10059 return (pti ? 
0 : CR3_PCID_SAVE); 10060 if (pcidp->pm_gen == gen) 10061 return (CR3_PCID_SAVE); 10062 pcid_next = PCPU_GET(pcid_next); 10063 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 10064 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 10065 ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next)); 10066 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 10067 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 10068 new_gen = gen + 1; 10069 if (new_gen == 0) 10070 new_gen = 1; 10071 PCPU_SET(pcid_gen, new_gen); 10072 pcid_next = PMAP_PCID_KERN + 1; 10073 } else { 10074 new_gen = gen; 10075 } 10076 pcidp->pm_pcid = pcid_next; 10077 pcidp->pm_gen = new_gen; 10078 PCPU_SET(pcid_next, pcid_next + 1); 10079 return (0); 10080 } 10081 10082 static uint64_t 10083 pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp) 10084 { 10085 uint64_t cached; 10086 10087 cached = pmap_pcid_alloc(pmap, pcidp); 10088 KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX, 10089 ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); 10090 KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, 10091 ("non-kernel pmap pmap %p cpu %d pcid %#x", 10092 pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); 10093 return (cached); 10094 } 10095 10096 static void 10097 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 10098 { 10099 10100 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 10101 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; 10102 } 10103 10104 static void 10105 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 10106 { 10107 pmap_t old_pmap; 10108 struct pmap_pcid *pcidp, *old_pcidp; 10109 uint64_t cached, cr3, kcr3, ucr3; 10110 10111 KASSERT((read_rflags() & PSL_I) == 0, 10112 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10113 10114 /* See the comment in pmap_invalidate_page_pcid(). 
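 * If the outgoing pmap deferred a flush of its user page table by
 * trimming ucr3_load_mask, restore the mask and zero the old pmap's
 * PCID generation so that the deferred invalidation is not lost.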
*/ 10115 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 10116 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 10117 old_pmap = PCPU_GET(curpmap); 10118 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 10119 old_pcidp = zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid); 10120 old_pcidp->pm_gen = 0; 10121 } 10122 10123 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10124 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10125 cr3 = rcr3(); 10126 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10127 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid); 10128 PCPU_SET(curpmap, pmap); 10129 kcr3 = pmap->pm_cr3 | pcidp->pm_pcid; 10130 ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT; 10131 10132 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 10133 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 10134 10135 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 10136 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 10137 if (cached) 10138 counter_u64_add(pcid_save_cnt, 1); 10139 10140 pmap_activate_sw_pti_post(td, pmap); 10141 } 10142 10143 static void 10144 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 10145 u_int cpuid) 10146 { 10147 struct pmap_pcid *pcidp; 10148 uint64_t cached, cr3; 10149 10150 KASSERT((read_rflags() & PSL_I) == 0, 10151 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10152 10153 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10154 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10155 cr3 = rcr3(); 10156 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10157 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached); 10158 PCPU_SET(curpmap, pmap); 10159 if (cached) 10160 counter_u64_add(pcid_save_cnt, 1); 10161 } 10162 10163 static void 10164 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 10165 u_int cpuid __unused) 10166 { 10167 10168 load_cr3(pmap->pm_cr3); 10169 PCPU_SET(curpmap, pmap); 10170 } 10171 10172 static void 10173 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 10174 u_int cpuid __unused) 10175 { 10176 10177 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 10178 PCPU_SET(kcr3, pmap->pm_cr3); 10179 PCPU_SET(ucr3, pmap->pm_ucr3); 10180 pmap_activate_sw_pti_post(td, pmap); 10181 } 10182 10183 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 10184 u_int)) 10185 { 10186 10187 if (pmap_pcid_enabled && pti) 10188 return (pmap_activate_sw_pcid_pti); 10189 else if (pmap_pcid_enabled && !pti) 10190 return (pmap_activate_sw_pcid_nopti); 10191 else if (!pmap_pcid_enabled && pti) 10192 return (pmap_activate_sw_nopcid_pti); 10193 else /* if (!pmap_pcid_enabled && !pti) */ 10194 return (pmap_activate_sw_nopcid_nopti); 10195 } 10196 10197 void 10198 pmap_activate_sw(struct thread *td) 10199 { 10200 pmap_t oldpmap, pmap; 10201 u_int cpuid; 10202 10203 oldpmap = PCPU_GET(curpmap); 10204 pmap = vmspace_pmap(td->td_proc->p_vmspace); 10205 if (oldpmap == pmap) { 10206 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10207 mfence(); 10208 return; 10209 } 10210 cpuid = PCPU_GET(cpuid); 10211 #ifdef SMP 10212 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10213 #else 10214 CPU_SET(cpuid, &pmap->pm_active); 10215 #endif 10216 pmap_activate_sw_mode(td, pmap, cpuid); 10217 #ifdef SMP 10218 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 10219 #else 10220 CPU_CLR(cpuid, &oldpmap->pm_active); 10221 #endif 10222 } 10223 10224 void 10225 pmap_activate(struct thread *td) 10226 { 10227 /* 10228 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10229 * invalidate_all IPI, which checks for curpmap == 10230 * smp_tlb_pmap. 
The below sequence of operations has a 10231 * window where %CR3 is loaded with the new pmap's PML4 10232 * address, but the curpmap value has not yet been updated. 10233 * This causes the invltlb IPI handler, which is called 10234 * between the updates, to execute as a NOP, which leaves 10235 * stale TLB entries. 10236 * 10237 * Note that the most common use of pmap_activate_sw(), from 10238 * a context switch, is immune to this race, because 10239 * interrupts are disabled (while the thread lock is owned), 10240 * so the IPI is delayed until after curpmap is updated. Protect 10241 * other callers in a similar way, by disabling interrupts 10242 * around the %cr3 register reload and curpmap assignment. 10243 */ 10244 spinlock_enter(); 10245 pmap_activate_sw(td); 10246 spinlock_exit(); 10247 } 10248 10249 void 10250 pmap_activate_boot(pmap_t pmap) 10251 { 10252 uint64_t kcr3; 10253 u_int cpuid; 10254 10255 /* 10256 * kernel_pmap must be never deactivated, and we ensure that 10257 * by never activating it at all. 10258 */ 10259 MPASS(pmap != kernel_pmap); 10260 10261 cpuid = PCPU_GET(cpuid); 10262 #ifdef SMP 10263 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10264 #else 10265 CPU_SET(cpuid, &pmap->pm_active); 10266 #endif 10267 PCPU_SET(curpmap, pmap); 10268 if (pti) { 10269 kcr3 = pmap->pm_cr3; 10270 if (pmap_pcid_enabled) 10271 kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE; 10272 } else { 10273 kcr3 = PMAP_NO_CR3; 10274 } 10275 PCPU_SET(kcr3, kcr3); 10276 PCPU_SET(ucr3, PMAP_NO_CR3); 10277 } 10278 10279 void 10280 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 10281 { 10282 *res = pmap->pm_active; 10283 } 10284 10285 void 10286 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 10287 { 10288 } 10289 10290 /* 10291 * Increase the starting virtual address of the given mapping if a 10292 * different alignment might result in more superpage mappings. 
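 * The address is adjusted so that it has the same offset within a 2MB
 * superpage as "offset" (including the object's color, if any), which
 * allows the bulk of the mapping to be created with 2MB page mappings.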
10293 */ 10294 void 10295 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10296 vm_offset_t *addr, vm_size_t size) 10297 { 10298 vm_offset_t superpage_offset; 10299 10300 if (size < NBPDR) 10301 return; 10302 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10303 offset += ptoa(object->pg_color); 10304 superpage_offset = offset & PDRMASK; 10305 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10306 (*addr & PDRMASK) == superpage_offset) 10307 return; 10308 if ((*addr & PDRMASK) < superpage_offset) 10309 *addr = (*addr & ~PDRMASK) + superpage_offset; 10310 else 10311 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10312 } 10313 10314 #ifdef INVARIANTS 10315 static unsigned long num_dirty_emulations; 10316 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10317 &num_dirty_emulations, 0, NULL); 10318 10319 static unsigned long num_accessed_emulations; 10320 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10321 &num_accessed_emulations, 0, NULL); 10322 10323 static unsigned long num_superpage_accessed_emulations; 10324 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10325 &num_superpage_accessed_emulations, 0, NULL); 10326 10327 static unsigned long ad_emulation_superpage_promotions; 10328 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10329 &ad_emulation_superpage_promotions, 0, NULL); 10330 #endif /* INVARIANTS */ 10331 10332 int 10333 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10334 { 10335 int rv; 10336 struct rwlock *lock; 10337 #if VM_NRESERVLEVEL > 0 10338 vm_page_t m, mpte; 10339 #endif 10340 pd_entry_t *pde; 10341 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10342 10343 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10344 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10345 10346 if (!pmap_emulate_ad_bits(pmap)) 10347 return (-1); 10348 10349 PG_A = pmap_accessed_bit(pmap); 10350 PG_M = pmap_modified_bit(pmap); 10351 PG_V = pmap_valid_bit(pmap); 10352 PG_RW = pmap_rw_bit(pmap); 10353 10354 rv = -1; 10355 lock = NULL; 10356 PMAP_LOCK(pmap); 10357 10358 pde = pmap_pde(pmap, va); 10359 if (pde == NULL || (*pde & PG_V) == 0) 10360 goto done; 10361 10362 if ((*pde & PG_PS) != 0) { 10363 if (ftype == VM_PROT_READ) { 10364 #ifdef INVARIANTS 10365 atomic_add_long(&num_superpage_accessed_emulations, 1); 10366 #endif 10367 *pde |= PG_A; 10368 rv = 0; 10369 } 10370 goto done; 10371 } 10372 10373 pte = pmap_pde_to_pte(pde, va); 10374 if ((*pte & PG_V) == 0) 10375 goto done; 10376 10377 if (ftype == VM_PROT_WRITE) { 10378 if ((*pte & PG_RW) == 0) 10379 goto done; 10380 /* 10381 * Set the modified and accessed bits simultaneously. 10382 * 10383 * Intel EPT PTEs that do software emulation of A/D bits map 10384 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10385 * An EPT misconfiguration is triggered if the PTE is writable 10386 * but not readable (WR=10). This is avoided by setting PG_A 10387 * and PG_M simultaneously. 
10388 */ 10389 *pte |= PG_M | PG_A; 10390 } else { 10391 *pte |= PG_A; 10392 } 10393 10394 #if VM_NRESERVLEVEL > 0 10395 /* try to promote the mapping */ 10396 if (va < VM_MAXUSER_ADDRESS) 10397 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10398 else 10399 mpte = NULL; 10400 10401 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10402 10403 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10404 (m->flags & PG_FICTITIOUS) == 0 && 10405 vm_reserv_level_iffullpop(m) == 0 && 10406 pmap_promote_pde(pmap, pde, va, mpte, &lock)) { 10407 #ifdef INVARIANTS 10408 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10409 #endif 10410 } 10411 #endif 10412 10413 #ifdef INVARIANTS 10414 if (ftype == VM_PROT_WRITE) 10415 atomic_add_long(&num_dirty_emulations, 1); 10416 else 10417 atomic_add_long(&num_accessed_emulations, 1); 10418 #endif 10419 rv = 0; /* success */ 10420 done: 10421 if (lock != NULL) 10422 rw_wunlock(lock); 10423 PMAP_UNLOCK(pmap); 10424 return (rv); 10425 } 10426 10427 void 10428 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10429 { 10430 pml4_entry_t *pml4; 10431 pdp_entry_t *pdp; 10432 pd_entry_t *pde; 10433 pt_entry_t *pte, PG_V; 10434 int idx; 10435 10436 idx = 0; 10437 PG_V = pmap_valid_bit(pmap); 10438 PMAP_LOCK(pmap); 10439 10440 pml4 = pmap_pml4e(pmap, va); 10441 if (pml4 == NULL) 10442 goto done; 10443 ptr[idx++] = *pml4; 10444 if ((*pml4 & PG_V) == 0) 10445 goto done; 10446 10447 pdp = pmap_pml4e_to_pdpe(pml4, va); 10448 ptr[idx++] = *pdp; 10449 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10450 goto done; 10451 10452 pde = pmap_pdpe_to_pde(pdp, va); 10453 ptr[idx++] = *pde; 10454 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10455 goto done; 10456 10457 pte = pmap_pde_to_pte(pde, va); 10458 ptr[idx++] = *pte; 10459 10460 done: 10461 PMAP_UNLOCK(pmap); 10462 *num = idx; 10463 } 10464 10465 /** 10466 * Get the kernel virtual address of a set of physical pages. If there are 10467 * physical addresses not covered by the DMAP perform a transient mapping 10468 * that will be removed when calling pmap_unmap_io_transient. 10469 * 10470 * \param page The pages the caller wishes to obtain the virtual 10471 * address on the kernel memory map. 10472 * \param vaddr On return contains the kernel virtual memory address 10473 * of the pages passed in the page parameter. 10474 * \param count Number of pages passed in. 10475 * \param can_fault true if the thread using the mapped pages can take 10476 * page faults, false otherwise. 10477 * 10478 * \returns true if the caller must call pmap_unmap_io_transient when 10479 * finished or false otherwise. 10480 * 10481 */ 10482 bool 10483 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10484 bool can_fault) 10485 { 10486 vm_paddr_t paddr; 10487 bool needs_mapping; 10488 pt_entry_t *pte; 10489 int cache_bits, error __unused, i; 10490 10491 /* 10492 * Allocate any KVA space that we need, this is done in a separate 10493 * loop to prevent calling vmem_alloc while pinned. 
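 * (The allocation is done with M_WAITOK and may sleep, so it must
 * happen before the thread is pinned by sched_pin() below.)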
10494 */ 10495 needs_mapping = false; 10496 for (i = 0; i < count; i++) { 10497 paddr = VM_PAGE_TO_PHYS(page[i]); 10498 if (__predict_false(paddr >= dmaplimit)) { 10499 error = vmem_alloc(kernel_arena, PAGE_SIZE, 10500 M_BESTFIT | M_WAITOK, &vaddr[i]); 10501 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 10502 needs_mapping = true; 10503 } else { 10504 vaddr[i] = PHYS_TO_DMAP(paddr); 10505 } 10506 } 10507 10508 /* Exit early if everything is covered by the DMAP */ 10509 if (!needs_mapping) 10510 return (false); 10511 10512 /* 10513 * NB: The sequence of updating a page table followed by accesses 10514 * to the corresponding pages used in the !DMAP case is subject to 10515 * the situation described in the "AMD64 Architecture Programmer's 10516 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 10517 * Coherency Considerations". Therefore, issuing the INVLPG right 10518 * after modifying the PTE bits is crucial. 10519 */ 10520 if (!can_fault) 10521 sched_pin(); 10522 for (i = 0; i < count; i++) { 10523 paddr = VM_PAGE_TO_PHYS(page[i]); 10524 if (paddr >= dmaplimit) { 10525 if (can_fault) { 10526 /* 10527 * Slow path, since we can get page faults 10528 * while mappings are active don't pin the 10529 * thread to the CPU and instead add a global 10530 * mapping visible to all CPUs. 10531 */ 10532 pmap_qenter(vaddr[i], &page[i], 1); 10533 } else { 10534 pte = vtopte(vaddr[i]); 10535 cache_bits = pmap_cache_bits(kernel_pmap, 10536 page[i]->md.pat_mode, false); 10537 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 10538 cache_bits); 10539 pmap_invlpg(kernel_pmap, vaddr[i]); 10540 } 10541 } 10542 } 10543 10544 return (needs_mapping); 10545 } 10546 10547 void 10548 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10549 bool can_fault) 10550 { 10551 vm_paddr_t paddr; 10552 int i; 10553 10554 if (!can_fault) 10555 sched_unpin(); 10556 for (i = 0; i < count; i++) { 10557 paddr = VM_PAGE_TO_PHYS(page[i]); 10558 if (paddr >= dmaplimit) { 10559 if (can_fault) 10560 pmap_qremove(vaddr[i], 1); 10561 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 10562 } 10563 } 10564 } 10565 10566 vm_offset_t 10567 pmap_quick_enter_page(vm_page_t m) 10568 { 10569 vm_paddr_t paddr; 10570 10571 paddr = VM_PAGE_TO_PHYS(m); 10572 if (paddr < dmaplimit) 10573 return (PHYS_TO_DMAP(paddr)); 10574 mtx_lock_spin(&qframe_mtx); 10575 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 10576 10577 /* 10578 * Since qframe is exclusively mapped by us, and we do not set 10579 * PG_G, we can use INVLPG here. 10580 */ 10581 invlpg(qframe); 10582 10583 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10584 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 10585 return (qframe); 10586 } 10587 10588 void 10589 pmap_quick_remove_page(vm_offset_t addr) 10590 { 10591 10592 if (addr != qframe) 10593 return; 10594 pte_store(vtopte(qframe), 0); 10595 mtx_unlock_spin(&qframe_mtx); 10596 } 10597 10598 /* 10599 * Pdp pages from the large map are managed differently from either 10600 * kernel or user page table pages. They are permanently allocated at 10601 * initialization time, and their reference count is permanently set to 10602 * zero. The pml4 entries pointing to those pages are copied into 10603 * each allocated pmap. 10604 * 10605 * In contrast, pd and pt pages are managed like user page table 10606 * pages. They are dynamically allocated, and their reference count 10607 * represents the number of valid entries within the page. 
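 *
 * For illustration only, a consumer of the large map typically does:
 *
 *	error = pmap_large_map(spa, len, &va, VM_MEMATTR_WRITE_BACK);
 *	...
 *	pmap_large_map_wb(va, len);
 *	pmap_large_unmap(va, len);
 *
 * where "spa", "len", and "va" are placeholders for the caller's
 * physical range and the returned kernel virtual address.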
10608 */ 10609 static vm_page_t 10610 pmap_large_map_getptp_unlocked(void) 10611 { 10612 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10613 } 10614 10615 static vm_page_t 10616 pmap_large_map_getptp(void) 10617 { 10618 vm_page_t m; 10619 10620 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10621 m = pmap_large_map_getptp_unlocked(); 10622 if (m == NULL) { 10623 PMAP_UNLOCK(kernel_pmap); 10624 vm_wait(NULL); 10625 PMAP_LOCK(kernel_pmap); 10626 /* Callers retry. */ 10627 } 10628 return (m); 10629 } 10630 10631 static pdp_entry_t * 10632 pmap_large_map_pdpe(vm_offset_t va) 10633 { 10634 vm_pindex_t pml4_idx; 10635 vm_paddr_t mphys; 10636 10637 pml4_idx = pmap_pml4e_index(va); 10638 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10639 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10640 "%#jx lm_ents %d", 10641 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10642 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10643 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10644 "LMSPML4I %#jx lm_ents %d", 10645 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10646 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10647 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10648 } 10649 10650 static pd_entry_t * 10651 pmap_large_map_pde(vm_offset_t va) 10652 { 10653 pdp_entry_t *pdpe; 10654 vm_page_t m; 10655 vm_paddr_t mphys; 10656 10657 retry: 10658 pdpe = pmap_large_map_pdpe(va); 10659 if (*pdpe == 0) { 10660 m = pmap_large_map_getptp(); 10661 if (m == NULL) 10662 goto retry; 10663 mphys = VM_PAGE_TO_PHYS(m); 10664 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10665 } else { 10666 MPASS((*pdpe & X86_PG_PS) == 0); 10667 mphys = *pdpe & PG_FRAME; 10668 } 10669 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10670 } 10671 10672 static pt_entry_t * 10673 pmap_large_map_pte(vm_offset_t va) 10674 { 10675 pd_entry_t *pde; 10676 vm_page_t m; 10677 vm_paddr_t mphys; 10678 10679 retry: 10680 pde = pmap_large_map_pde(va); 10681 if (*pde == 0) { 10682 m = pmap_large_map_getptp(); 10683 if (m == NULL) 10684 goto retry; 10685 mphys = VM_PAGE_TO_PHYS(m); 10686 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10687 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10688 } else { 10689 MPASS((*pde & X86_PG_PS) == 0); 10690 mphys = *pde & PG_FRAME; 10691 } 10692 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10693 } 10694 10695 static vm_paddr_t 10696 pmap_large_map_kextract(vm_offset_t va) 10697 { 10698 pdp_entry_t *pdpe, pdp; 10699 pd_entry_t *pde, pd; 10700 pt_entry_t *pte, pt; 10701 10702 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10703 ("not largemap range %#lx", (u_long)va)); 10704 pdpe = pmap_large_map_pdpe(va); 10705 pdp = *pdpe; 10706 KASSERT((pdp & X86_PG_V) != 0, 10707 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10708 (u_long)pdpe, pdp)); 10709 if ((pdp & X86_PG_PS) != 0) { 10710 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10711 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10712 (u_long)pdpe, pdp)); 10713 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10714 } 10715 pde = pmap_pdpe_to_pde(pdpe, va); 10716 pd = *pde; 10717 KASSERT((pd & X86_PG_V) != 0, 10718 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10719 if ((pd & X86_PG_PS) != 0) 10720 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10721 pte = pmap_pde_to_pte(pde, va); 10722 pt = *pte; 10723 KASSERT((pt & X86_PG_V) != 0, 10724 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 
10725 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 10726 } 10727 10728 static int 10729 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 10730 vmem_addr_t *vmem_res) 10731 { 10732 10733 /* 10734 * Large mappings are all but static. Consequently, there 10735 * is no point in waiting for an earlier allocation to be 10736 * freed. 10737 */ 10738 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 10739 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 10740 } 10741 10742 int 10743 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 10744 vm_memattr_t mattr) 10745 { 10746 pdp_entry_t *pdpe; 10747 pd_entry_t *pde; 10748 pt_entry_t *pte; 10749 vm_offset_t va, inc; 10750 vmem_addr_t vmem_res; 10751 vm_paddr_t pa; 10752 int error; 10753 10754 if (len == 0 || spa + len < spa) 10755 return (EINVAL); 10756 10757 /* See if DMAP can serve. */ 10758 if (spa + len <= dmaplimit) { 10759 va = PHYS_TO_DMAP(spa); 10760 *addr = (void *)va; 10761 return (pmap_change_attr(va, len, mattr)); 10762 } 10763 10764 /* 10765 * No, allocate KVA. Fit the address with best possible 10766 * alignment for superpages. Fall back to worse align if 10767 * failed. 10768 */ 10769 error = ENOMEM; 10770 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 10771 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 10772 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 10773 &vmem_res); 10774 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 10775 NBPDR) + NBPDR) 10776 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 10777 &vmem_res); 10778 if (error != 0) 10779 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 10780 if (error != 0) 10781 return (error); 10782 10783 /* 10784 * Fill pagetable. PG_M is not pre-set, we scan modified bits 10785 * in the pagetable to minimize flushing. No need to 10786 * invalidate TLB, since we only update invalid entries. 
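 * Each mapping is created with the largest page size that the physical
 * address, virtual address, and remaining length allow: 1GB pages
 * where the CPU supports them, then 2MB pages, then 4KB pages.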
10787 */ 10788 PMAP_LOCK(kernel_pmap); 10789 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10790 len -= inc) { 10791 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10792 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10793 pdpe = pmap_large_map_pdpe(va); 10794 MPASS(*pdpe == 0); 10795 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10796 X86_PG_V | X86_PG_A | pg_nx | 10797 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10798 inc = NBPDP; 10799 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10800 (va & PDRMASK) == 0) { 10801 pde = pmap_large_map_pde(va); 10802 MPASS(*pde == 0); 10803 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10804 X86_PG_V | X86_PG_A | pg_nx | 10805 pmap_cache_bits(kernel_pmap, mattr, TRUE); 10806 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10807 ref_count++; 10808 inc = NBPDR; 10809 } else { 10810 pte = pmap_large_map_pte(va); 10811 MPASS(*pte == 0); 10812 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10813 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10814 mattr, FALSE); 10815 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10816 ref_count++; 10817 inc = PAGE_SIZE; 10818 } 10819 } 10820 PMAP_UNLOCK(kernel_pmap); 10821 MPASS(len == 0); 10822 10823 *addr = (void *)vmem_res; 10824 return (0); 10825 } 10826 10827 void 10828 pmap_large_unmap(void *svaa, vm_size_t len) 10829 { 10830 vm_offset_t sva, va; 10831 vm_size_t inc; 10832 pdp_entry_t *pdpe, pdp; 10833 pd_entry_t *pde, pd; 10834 pt_entry_t *pte; 10835 vm_page_t m; 10836 struct spglist spgf; 10837 10838 sva = (vm_offset_t)svaa; 10839 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10840 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10841 return; 10842 10843 SLIST_INIT(&spgf); 10844 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10845 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10846 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10847 PMAP_LOCK(kernel_pmap); 10848 for (va = sva; va < sva + len; va += inc) { 10849 pdpe = pmap_large_map_pdpe(va); 10850 pdp = *pdpe; 10851 KASSERT((pdp & X86_PG_V) != 0, 10852 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10853 (u_long)pdpe, pdp)); 10854 if ((pdp & X86_PG_PS) != 0) { 10855 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10856 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10857 (u_long)pdpe, pdp)); 10858 KASSERT((va & PDPMASK) == 0, 10859 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10860 (u_long)pdpe, pdp)); 10861 KASSERT(va + NBPDP <= sva + len, 10862 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10863 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10864 (u_long)pdpe, pdp, len)); 10865 *pdpe = 0; 10866 inc = NBPDP; 10867 continue; 10868 } 10869 pde = pmap_pdpe_to_pde(pdpe, va); 10870 pd = *pde; 10871 KASSERT((pd & X86_PG_V) != 0, 10872 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10873 (u_long)pde, pd)); 10874 if ((pd & X86_PG_PS) != 0) { 10875 KASSERT((va & PDRMASK) == 0, 10876 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10877 (u_long)pde, pd)); 10878 KASSERT(va + NBPDR <= sva + len, 10879 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10880 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10881 pd, len)); 10882 pde_store(pde, 0); 10883 inc = NBPDR; 10884 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10885 m->ref_count--; 10886 if (m->ref_count == 0) { 10887 *pdpe = 0; 10888 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10889 } 10890 continue; 10891 } 10892 pte = pmap_pde_to_pte(pde, va); 10893 KASSERT((*pte & X86_PG_V) != 0, 10894 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10895 (u_long)pte, *pte)); 
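	/*
	 * Clear the 4KB mapping and drop the reference on its page
	 * table page; if that was the page's last entry, free it and
	 * release the corresponding reference on the page directory
	 * page as well.
	 */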
10896 pte_clear(pte); 10897 inc = PAGE_SIZE; 10898 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10899 m->ref_count--; 10900 if (m->ref_count == 0) { 10901 *pde = 0; 10902 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10903 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10904 m->ref_count--; 10905 if (m->ref_count == 0) { 10906 *pdpe = 0; 10907 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10908 } 10909 } 10910 } 10911 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10912 PMAP_UNLOCK(kernel_pmap); 10913 vm_page_free_pages_toq(&spgf, false); 10914 vmem_free(large_vmem, sva, len); 10915 } 10916 10917 static void 10918 pmap_large_map_wb_fence_mfence(void) 10919 { 10920 10921 mfence(); 10922 } 10923 10924 static void 10925 pmap_large_map_wb_fence_atomic(void) 10926 { 10927 10928 atomic_thread_fence_seq_cst(); 10929 } 10930 10931 static void 10932 pmap_large_map_wb_fence_nop(void) 10933 { 10934 } 10935 10936 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10937 { 10938 10939 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10940 return (pmap_large_map_wb_fence_mfence); 10941 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10942 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10943 return (pmap_large_map_wb_fence_atomic); 10944 else 10945 /* clflush is strongly enough ordered */ 10946 return (pmap_large_map_wb_fence_nop); 10947 } 10948 10949 static void 10950 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10951 { 10952 10953 for (; len > 0; len -= cpu_clflush_line_size, 10954 va += cpu_clflush_line_size) 10955 clwb(va); 10956 } 10957 10958 static void 10959 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10960 { 10961 10962 for (; len > 0; len -= cpu_clflush_line_size, 10963 va += cpu_clflush_line_size) 10964 clflushopt(va); 10965 } 10966 10967 static void 10968 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10969 { 10970 10971 for (; len > 0; len -= cpu_clflush_line_size, 10972 va += cpu_clflush_line_size) 10973 clflush(va); 10974 } 10975 10976 static void 10977 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 10978 { 10979 } 10980 10981 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 10982 { 10983 10984 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 10985 return (pmap_large_map_flush_range_clwb); 10986 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 10987 return (pmap_large_map_flush_range_clflushopt); 10988 else if ((cpu_feature & CPUID_CLFSH) != 0) 10989 return (pmap_large_map_flush_range_clflush); 10990 else 10991 return (pmap_large_map_flush_range_nop); 10992 } 10993 10994 static void 10995 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 10996 { 10997 volatile u_long *pe; 10998 u_long p; 10999 vm_offset_t va; 11000 vm_size_t inc; 11001 bool seen_other; 11002 11003 for (va = sva; va < eva; va += inc) { 11004 inc = 0; 11005 if ((amd_feature & AMDID_PAGE1GB) != 0) { 11006 pe = (volatile u_long *)pmap_large_map_pdpe(va); 11007 p = *pe; 11008 if ((p & X86_PG_PS) != 0) 11009 inc = NBPDP; 11010 } 11011 if (inc == 0) { 11012 pe = (volatile u_long *)pmap_large_map_pde(va); 11013 p = *pe; 11014 if ((p & X86_PG_PS) != 0) 11015 inc = NBPDR; 11016 } 11017 if (inc == 0) { 11018 pe = (volatile u_long *)pmap_large_map_pte(va); 11019 p = *pe; 11020 inc = PAGE_SIZE; 11021 } 11022 seen_other = false; 11023 for (;;) { 11024 if ((p & X86_PG_AVAIL1) != 0) { 11025 /* 11026 * Spin-wait for the end of a parallel 11027 * write-back. 
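 * Another caller has set X86_PG_AVAIL1 in this entry and is flushing
 * the corresponding range; wait for it to finish and re-read the
 * entry.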
11028 */
11029 cpu_spinwait();
11030 p = *pe;
11031
11032 /*
11033 * If we saw other write-back
11034 * occurring, we cannot rely on PG_M to
11035 * indicate state of the cache. The
11036 * PG_M bit is cleared before the
11037 * flush to avoid ignoring new writes,
11038 * and writes which are relevant for
11039 * us might happen after.
11040 */
11041 seen_other = true;
11042 continue;
11043 }
11044
11045 if ((p & X86_PG_M) != 0 || seen_other) {
11046 if (!atomic_fcmpset_long(pe, &p,
11047 (p & ~X86_PG_M) | X86_PG_AVAIL1))
11048 /*
11049 * If we saw PG_M without
11050 * PG_AVAIL1, and then on the
11051 * next attempt we do not
11052 * observe either PG_M or
11053 * PG_AVAIL1, the other
11054 * write-back started after us
11055 * and finished before us. We
11056 * can rely on it doing our
11057 * work.
11058 */
11059 continue;
11060 pmap_large_map_flush_range(va, inc);
11061 atomic_clear_long(pe, X86_PG_AVAIL1);
11062 }
11063 break;
11064 }
11065 maybe_yield();
11066 }
11067 }
11068
11069 /*
11070 * Write-back cache lines for the given address range.
11071 *
11072 * Must be called only on the range or sub-range returned from
11073 * pmap_large_map(). Must not be called on the coalesced ranges.
11074 *
11075 * Does nothing on CPUs that support none of the CLWB, CLFLUSHOPT,
11076 * or CLFLUSH instructions.
11077 */
11078 void
11079 pmap_large_map_wb(void *svap, vm_size_t len)
11080 {
11081 vm_offset_t eva, sva;
11082
11083 sva = (vm_offset_t)svap;
11084 eva = sva + len;
11085 pmap_large_map_wb_fence();
11086 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
11087 pmap_large_map_flush_range(sva, len);
11088 } else {
11089 KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
11090 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
11091 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
11092 pmap_large_map_wb_large(sva, eva);
11093 }
11094 pmap_large_map_wb_fence();
11095 }
11096
11097 static vm_page_t
11098 pmap_pti_alloc_page(void)
11099 {
11100 vm_page_t m;
11101
11102 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11103 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO);
11104 return (m);
11105 }
11106
11107 static bool
11108 pmap_pti_free_page(vm_page_t m)
11109 {
11110 if (!vm_page_unwire_noq(m))
11111 return (false);
11112 vm_page_xbusy_claim(m);
11113 vm_page_free_zero(m);
11114 return (true);
11115 }
11116
11117 static void
11118 pmap_pti_init(void)
11119 {
11120 vm_page_t pml4_pg;
11121 pdp_entry_t *pdpe;
11122 vm_offset_t va;
11123 int i;
11124
11125 if (!pti)
11126 return;
11127 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
11128 VM_OBJECT_WLOCK(pti_obj);
11129 pml4_pg = pmap_pti_alloc_page();
11130 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
11131 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
11132 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
11133 pdpe = pmap_pti_pdpe(va);
11134 pmap_pti_wire_pte(pdpe);
11135 }
11136 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
11137 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
11138 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
11139 sizeof(struct gate_descriptor) * NIDT, false);
11140 CPU_FOREACH(i) {
11141 /* Doublefault stack IST 1 */
11142 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
11143 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
11144 /* NMI stack IST 2 */
11145 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
11146 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE,
va, false); 11147 /* MC# stack IST 3 */ 11148 va = __pcpu[i].pc_common_tss.tss_ist3 + 11149 sizeof(struct nmi_pcpu); 11150 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 11151 /* DB# stack IST 4 */ 11152 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 11153 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 11154 } 11155 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 11156 true); 11157 pti_finalized = true; 11158 VM_OBJECT_WUNLOCK(pti_obj); 11159 } 11160 11161 static void 11162 pmap_cpu_init(void *arg __unused) 11163 { 11164 CPU_COPY(&all_cpus, &kernel_pmap->pm_active); 11165 pmap_pti_init(); 11166 } 11167 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL); 11168 11169 static pdp_entry_t * 11170 pmap_pti_pdpe(vm_offset_t va) 11171 { 11172 pml4_entry_t *pml4e; 11173 pdp_entry_t *pdpe; 11174 vm_page_t m; 11175 vm_pindex_t pml4_idx; 11176 vm_paddr_t mphys; 11177 11178 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11179 11180 pml4_idx = pmap_pml4e_index(va); 11181 pml4e = &pti_pml4[pml4_idx]; 11182 m = NULL; 11183 if (*pml4e == 0) { 11184 if (pti_finalized) 11185 panic("pml4 alloc after finalization\n"); 11186 m = pmap_pti_alloc_page(); 11187 if (*pml4e != 0) { 11188 pmap_pti_free_page(m); 11189 mphys = *pml4e & ~PAGE_MASK; 11190 } else { 11191 mphys = VM_PAGE_TO_PHYS(m); 11192 *pml4e = mphys | X86_PG_RW | X86_PG_V; 11193 } 11194 } else { 11195 mphys = *pml4e & ~PAGE_MASK; 11196 } 11197 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 11198 return (pdpe); 11199 } 11200 11201 static void 11202 pmap_pti_wire_pte(void *pte) 11203 { 11204 vm_page_t m; 11205 11206 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11207 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11208 m->ref_count++; 11209 } 11210 11211 static void 11212 pmap_pti_unwire_pde(void *pde, bool only_ref) 11213 { 11214 vm_page_t m; 11215 11216 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11217 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 11218 MPASS(only_ref || m->ref_count > 1); 11219 pmap_pti_free_page(m); 11220 } 11221 11222 static void 11223 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 11224 { 11225 vm_page_t m; 11226 pd_entry_t *pde; 11227 11228 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11229 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11230 if (pmap_pti_free_page(m)) { 11231 pde = pmap_pti_pde(va); 11232 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 11233 *pde = 0; 11234 pmap_pti_unwire_pde(pde, false); 11235 } 11236 } 11237 11238 static pd_entry_t * 11239 pmap_pti_pde(vm_offset_t va) 11240 { 11241 pdp_entry_t *pdpe; 11242 pd_entry_t *pde; 11243 vm_page_t m; 11244 vm_pindex_t pd_idx; 11245 vm_paddr_t mphys; 11246 11247 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11248 11249 pdpe = pmap_pti_pdpe(va); 11250 if (*pdpe == 0) { 11251 m = pmap_pti_alloc_page(); 11252 if (*pdpe != 0) { 11253 pmap_pti_free_page(m); 11254 MPASS((*pdpe & X86_PG_PS) == 0); 11255 mphys = *pdpe & ~PAGE_MASK; 11256 } else { 11257 mphys = VM_PAGE_TO_PHYS(m); 11258 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11259 } 11260 } else { 11261 MPASS((*pdpe & X86_PG_PS) == 0); 11262 mphys = *pdpe & ~PAGE_MASK; 11263 } 11264 11265 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11266 pd_idx = pmap_pde_index(va); 11267 pde += pd_idx; 11268 return (pde); 11269 } 11270 11271 static pt_entry_t * 11272 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11273 { 11274 pd_entry_t *pde; 11275 pt_entry_t *pte; 11276 vm_page_t m; 11277 vm_paddr_t mphys; 11278 11279 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11280 11281 pde = pmap_pti_pde(va); 
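	/*
	 * Optionally take a transient reference on the pd page so that
	 * it cannot be freed while *pde is examined.  The caller drops
	 * the reference unless a new page table page is installed
	 * below, in which case it is kept to account for the new entry.
	 */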
11282 if (unwire_pde != NULL) { 11283 *unwire_pde = true; 11284 pmap_pti_wire_pte(pde); 11285 } 11286 if (*pde == 0) { 11287 m = pmap_pti_alloc_page(); 11288 if (*pde != 0) { 11289 pmap_pti_free_page(m); 11290 MPASS((*pde & X86_PG_PS) == 0); 11291 mphys = *pde & ~(PAGE_MASK | pg_nx); 11292 } else { 11293 mphys = VM_PAGE_TO_PHYS(m); 11294 *pde = mphys | X86_PG_RW | X86_PG_V; 11295 if (unwire_pde != NULL) 11296 *unwire_pde = false; 11297 } 11298 } else { 11299 MPASS((*pde & X86_PG_PS) == 0); 11300 mphys = *pde & ~(PAGE_MASK | pg_nx); 11301 } 11302 11303 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11304 pte += pmap_pte_index(va); 11305 11306 return (pte); 11307 } 11308 11309 static void 11310 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11311 { 11312 vm_paddr_t pa; 11313 pd_entry_t *pde; 11314 pt_entry_t *pte, ptev; 11315 bool unwire_pde; 11316 11317 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11318 11319 sva = trunc_page(sva); 11320 MPASS(sva > VM_MAXUSER_ADDRESS); 11321 eva = round_page(eva); 11322 MPASS(sva < eva); 11323 for (; sva < eva; sva += PAGE_SIZE) { 11324 pte = pmap_pti_pte(sva, &unwire_pde); 11325 pa = pmap_kextract(sva); 11326 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11327 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11328 VM_MEMATTR_DEFAULT, FALSE); 11329 if (*pte == 0) { 11330 pte_store(pte, ptev); 11331 pmap_pti_wire_pte(pte); 11332 } else { 11333 KASSERT(!pti_finalized, 11334 ("pti overlap after fin %#lx %#lx %#lx", 11335 sva, *pte, ptev)); 11336 KASSERT(*pte == ptev, 11337 ("pti non-identical pte after fin %#lx %#lx %#lx", 11338 sva, *pte, ptev)); 11339 } 11340 if (unwire_pde) { 11341 pde = pmap_pti_pde(sva); 11342 pmap_pti_unwire_pde(pde, true); 11343 } 11344 } 11345 } 11346 11347 void 11348 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11349 { 11350 11351 if (!pti) 11352 return; 11353 VM_OBJECT_WLOCK(pti_obj); 11354 pmap_pti_add_kva_locked(sva, eva, exec); 11355 VM_OBJECT_WUNLOCK(pti_obj); 11356 } 11357 11358 void 11359 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11360 { 11361 pt_entry_t *pte; 11362 vm_offset_t va; 11363 11364 if (!pti) 11365 return; 11366 sva = rounddown2(sva, PAGE_SIZE); 11367 MPASS(sva > VM_MAXUSER_ADDRESS); 11368 eva = roundup2(eva, PAGE_SIZE); 11369 MPASS(sva < eva); 11370 VM_OBJECT_WLOCK(pti_obj); 11371 for (va = sva; va < eva; va += PAGE_SIZE) { 11372 pte = pmap_pti_pte(va, NULL); 11373 KASSERT((*pte & X86_PG_V) != 0, 11374 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11375 (u_long)pte, *pte)); 11376 pte_clear(pte); 11377 pmap_pti_unwire_pte(pte, va); 11378 } 11379 pmap_invalidate_range(kernel_pmap, sva, eva); 11380 VM_OBJECT_WUNLOCK(pti_obj); 11381 } 11382 11383 static void * 11384 pkru_dup_range(void *ctx __unused, void *data) 11385 { 11386 struct pmap_pkru_range *node, *new_node; 11387 11388 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11389 if (new_node == NULL) 11390 return (NULL); 11391 node = data; 11392 memcpy(new_node, node, sizeof(*node)); 11393 return (new_node); 11394 } 11395 11396 static void 11397 pkru_free_range(void *ctx __unused, void *node) 11398 { 11399 11400 uma_zfree(pmap_pkru_ranges_zone, node); 11401 } 11402 11403 static int 11404 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11405 int flags) 11406 { 11407 struct pmap_pkru_range *ppr; 11408 int error; 11409 11410 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11411 MPASS(pmap->pm_type == PT_X86); 11412 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11413 if ((flags & 
AMD64_PKRU_EXCL) != 0 && 11414 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11415 return (EBUSY); 11416 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11417 if (ppr == NULL) 11418 return (ENOMEM); 11419 ppr->pkru_keyidx = keyidx; 11420 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11421 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11422 if (error != 0) 11423 uma_zfree(pmap_pkru_ranges_zone, ppr); 11424 return (error); 11425 } 11426 11427 static int 11428 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11429 { 11430 11431 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11432 MPASS(pmap->pm_type == PT_X86); 11433 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11434 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11435 } 11436 11437 static void 11438 pmap_pkru_deassign_all(pmap_t pmap) 11439 { 11440 11441 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11442 if (pmap->pm_type == PT_X86 && 11443 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11444 rangeset_remove_all(&pmap->pm_pkru); 11445 } 11446 11447 static bool 11448 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11449 { 11450 struct pmap_pkru_range *ppr, *prev_ppr; 11451 vm_offset_t va; 11452 11453 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11454 if (pmap->pm_type != PT_X86 || 11455 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11456 sva >= VM_MAXUSER_ADDRESS) 11457 return (true); 11458 MPASS(eva <= VM_MAXUSER_ADDRESS); 11459 for (va = sva; va < eva; prev_ppr = ppr) { 11460 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11461 if (va == sva) 11462 prev_ppr = ppr; 11463 else if ((ppr == NULL) ^ (prev_ppr == NULL)) 11464 return (false); 11465 if (ppr == NULL) { 11466 va += PAGE_SIZE; 11467 continue; 11468 } 11469 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 11470 return (false); 11471 va = ppr->pkru_rs_el.re_end; 11472 } 11473 return (true); 11474 } 11475 11476 static pt_entry_t 11477 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11478 { 11479 struct pmap_pkru_range *ppr; 11480 11481 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11482 if (pmap->pm_type != PT_X86 || 11483 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11484 va >= VM_MAXUSER_ADDRESS) 11485 return (0); 11486 ppr = rangeset_lookup(&pmap->pm_pkru, va); 11487 if (ppr != NULL) 11488 return (X86_PG_PKU(ppr->pkru_keyidx)); 11489 return (0); 11490 } 11491 11492 static bool 11493 pred_pkru_on_remove(void *ctx __unused, void *r) 11494 { 11495 struct pmap_pkru_range *ppr; 11496 11497 ppr = r; 11498 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11499 } 11500 11501 static void 11502 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11503 { 11504 11505 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11506 if (pmap->pm_type == PT_X86 && 11507 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11508 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11509 pred_pkru_on_remove); 11510 } 11511 } 11512 11513 static int 11514 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11515 { 11516 11517 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11518 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11519 MPASS(dst_pmap->pm_type == PT_X86); 11520 MPASS(src_pmap->pm_type == PT_X86); 11521 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11522 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11523 return (0); 11524 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11525 } 11526 11527 static void 11528 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11529 u_int keyidx) 11530 { 11531 pml4_entry_t *pml4e; 11532 pdp_entry_t *pdpe; 11533 pd_entry_t newpde, ptpaddr, *pde; 
11534 pt_entry_t newpte, *ptep, pte; 11535 vm_offset_t va, va_next; 11536 bool changed; 11537 11538 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11539 MPASS(pmap->pm_type == PT_X86); 11540 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 11541 11542 for (changed = false, va = sva; va < eva; va = va_next) { 11543 pml4e = pmap_pml4e(pmap, va); 11544 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { 11545 va_next = (va + NBPML4) & ~PML4MASK; 11546 if (va_next < va) 11547 va_next = eva; 11548 continue; 11549 } 11550 11551 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 11552 if ((*pdpe & X86_PG_V) == 0) { 11553 va_next = (va + NBPDP) & ~PDPMASK; 11554 if (va_next < va) 11555 va_next = eva; 11556 continue; 11557 } 11558 11559 va_next = (va + NBPDR) & ~PDRMASK; 11560 if (va_next < va) 11561 va_next = eva; 11562 11563 pde = pmap_pdpe_to_pde(pdpe, va); 11564 ptpaddr = *pde; 11565 if (ptpaddr == 0) 11566 continue; 11567 11568 MPASS((ptpaddr & X86_PG_V) != 0); 11569 if ((ptpaddr & PG_PS) != 0) { 11570 if (va + NBPDR == va_next && eva >= va_next) { 11571 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 11572 X86_PG_PKU(keyidx); 11573 if (newpde != ptpaddr) { 11574 *pde = newpde; 11575 changed = true; 11576 } 11577 continue; 11578 } else if (!pmap_demote_pde(pmap, pde, va)) { 11579 continue; 11580 } 11581 } 11582 11583 if (va_next > eva) 11584 va_next = eva; 11585 11586 for (ptep = pmap_pde_to_pte(pde, va); va != va_next; 11587 ptep++, va += PAGE_SIZE) { 11588 pte = *ptep; 11589 if ((pte & X86_PG_V) == 0) 11590 continue; 11591 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); 11592 if (newpte != pte) { 11593 *ptep = newpte; 11594 changed = true; 11595 } 11596 } 11597 } 11598 if (changed) 11599 pmap_invalidate_range(pmap, sva, eva); 11600 } 11601 11602 static int 11603 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11604 u_int keyidx, int flags) 11605 { 11606 11607 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || 11608 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) 11609 return (EINVAL); 11610 if (eva <= sva || eva > VM_MAXUSER_ADDRESS) 11611 return (EFAULT); 11612 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 11613 return (ENOTSUP); 11614 return (0); 11615 } 11616 11617 int 11618 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11619 int flags) 11620 { 11621 int error; 11622 11623 sva = trunc_page(sva); 11624 eva = round_page(eva); 11625 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); 11626 if (error != 0) 11627 return (error); 11628 for (;;) { 11629 PMAP_LOCK(pmap); 11630 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); 11631 if (error == 0) 11632 pmap_pkru_update_range(pmap, sva, eva, keyidx); 11633 PMAP_UNLOCK(pmap); 11634 if (error != ENOMEM) 11635 break; 11636 vm_wait(NULL); 11637 } 11638 return (error); 11639 } 11640 11641 int 11642 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11643 { 11644 int error; 11645 11646 sva = trunc_page(sva); 11647 eva = round_page(eva); 11648 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); 11649 if (error != 0) 11650 return (error); 11651 for (;;) { 11652 PMAP_LOCK(pmap); 11653 error = pmap_pkru_deassign(pmap, sva, eva); 11654 if (error == 0) 11655 pmap_pkru_update_range(pmap, sva, eva, 0); 11656 PMAP_UNLOCK(pmap); 11657 if (error != ENOMEM) 11658 break; 11659 vm_wait(NULL); 11660 } 11661 return (error); 11662 } 11663 11664 #if defined(KASAN) || defined(KMSAN) 11665 11666 /* 11667 * Reserve enough memory to: 11668 * 1) allocate PDP pages for the shadow map(s), 11669 * 2) shadow the boot 
stack of KSTACK_PAGES pages,
11670 * so we need one PD page, one or two PT pages, and KSTACK_PAGES shadow pages
11671 * per shadow map.
11672 */
11673 #ifdef KASAN
11674 #define SAN_EARLY_PAGES \
11675 (NKASANPML4E + 1 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE))
11676 #else
11677 #define SAN_EARLY_PAGES \
11678 (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (1 + 2 + KSTACK_PAGES))
11679 #endif
11680
11681 static uint64_t __nosanitizeaddress __nosanitizememory
11682 pmap_san_enter_early_alloc_4k(uint64_t pabase)
11683 {
11684 static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
11685 static size_t offset = 0;
11686 uint64_t pa;
11687
11688 if (offset == sizeof(data)) {
11689 panic("%s: ran out of memory for the bootstrap shadow map",
11690 __func__);
11691 }
11692
11693 pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
11694 offset += PAGE_SIZE;
11695 return (pa);
11696 }
11697
11698 /*
11699 * Map a shadow page before the kernel has bootstrapped its page tables. This
11700 * is currently only used to shadow the temporary boot stack set up by locore.
11701 */
11702 static void __nosanitizeaddress __nosanitizememory
11703 pmap_san_enter_early(vm_offset_t va)
11704 {
11705 static bool first = true;
11706 pml4_entry_t *pml4e;
11707 pdp_entry_t *pdpe;
11708 pd_entry_t *pde;
11709 pt_entry_t *pte;
11710 uint64_t cr3, pa, base;
11711 int i;
11712
11713 base = amd64_loadaddr();
11714 cr3 = rcr3();
11715
11716 if (first) {
11717 /*
11718 * If this is the first call, we need to allocate new PML4Es for
11719 * the bootstrap shadow map(s). We don't know how the PML4 page
11720 * was initialized by the boot loader, so we can't simply test
11721 * whether the shadow map's PML4Es are zero.
11722 */
11723 first = false;
11724 #ifdef KASAN
11725 for (i = 0; i < NKASANPML4E; i++) {
11726 pa = pmap_san_enter_early_alloc_4k(base);
11727
11728 pml4e = (pml4_entry_t *)cr3 +
11729 pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
11730 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11731 }
11732 #else
11733 for (i = 0; i < NKMSANORIGPML4E; i++) {
11734 pa = pmap_san_enter_early_alloc_4k(base);
11735
11736 pml4e = (pml4_entry_t *)cr3 +
11737 pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
11738 i * NBPML4);
11739 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11740 }
11741 for (i = 0; i < NKMSANSHADPML4E; i++) {
11742 pa = pmap_san_enter_early_alloc_4k(base);
11743
11744 pml4e = (pml4_entry_t *)cr3 +
11745 pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
11746 i * NBPML4);
11747 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11748 }
11749 #endif
11750 }
11751 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
11752 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
11753 if (*pdpe == 0) {
11754 pa = pmap_san_enter_early_alloc_4k(base);
11755 *pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
11756 }
11757 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
11758 if (*pde == 0) {
11759 pa = pmap_san_enter_early_alloc_4k(base);
11760 *pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
11761 }
11762 pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
11763 if (*pte != 0)
11764 panic("%s: PTE for %#lx is already initialized", __func__, va);
11765 pa = pmap_san_enter_early_alloc_4k(base);
11766 *pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
11767 }
11768
11769 static vm_page_t
11770 pmap_san_enter_alloc_4k(void)
11771 {
11772 vm_page_t m;
11773
11774 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
11775 VM_ALLOC_ZERO);
11776 if (m
== NULL) 11777 panic("%s: no memory to grow shadow map", __func__); 11778 return (m); 11779 } 11780 11781 static vm_page_t 11782 pmap_san_enter_alloc_2m(void) 11783 { 11784 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 11785 NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT)); 11786 } 11787 11788 /* 11789 * Grow a shadow map by at least one 4KB page at the specified address. Use 2MB 11790 * pages when possible. 11791 */ 11792 void __nosanitizeaddress __nosanitizememory 11793 pmap_san_enter(vm_offset_t va) 11794 { 11795 pdp_entry_t *pdpe; 11796 pd_entry_t *pde; 11797 pt_entry_t *pte; 11798 vm_page_t m; 11799 11800 if (kernphys == 0) { 11801 /* 11802 * We're creating a temporary shadow map for the boot stack. 11803 */ 11804 pmap_san_enter_early(va); 11805 return; 11806 } 11807 11808 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 11809 11810 pdpe = pmap_pdpe(kernel_pmap, va); 11811 if ((*pdpe & X86_PG_V) == 0) { 11812 m = pmap_san_enter_alloc_4k(); 11813 *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11814 X86_PG_V | pg_nx); 11815 } 11816 pde = pmap_pdpe_to_pde(pdpe, va); 11817 if ((*pde & X86_PG_V) == 0) { 11818 m = pmap_san_enter_alloc_2m(); 11819 if (m != NULL) { 11820 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11821 X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); 11822 } else { 11823 m = pmap_san_enter_alloc_4k(); 11824 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11825 X86_PG_V | pg_nx); 11826 } 11827 } 11828 if ((*pde & X86_PG_PS) != 0) 11829 return; 11830 pte = pmap_pde_to_pte(pde, va); 11831 if ((*pte & X86_PG_V) != 0) 11832 return; 11833 m = pmap_san_enter_alloc_4k(); 11834 *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | 11835 X86_PG_M | X86_PG_A | pg_nx); 11836 } 11837 #endif 11838 11839 /* 11840 * Track a range of the kernel's virtual address space that is contiguous 11841 * in various mapping attributes. 11842 */ 11843 struct pmap_kernel_map_range { 11844 vm_offset_t sva; 11845 pt_entry_t attrs; 11846 int ptes; 11847 int pdes; 11848 int pdpes; 11849 }; 11850 11851 static void 11852 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 11853 vm_offset_t eva) 11854 { 11855 const char *mode; 11856 int i, pat_idx; 11857 11858 if (eva <= range->sva) 11859 return; 11860 11861 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); 11862 for (i = 0; i < PAT_INDEX_SIZE; i++) 11863 if (pat_index[i] == pat_idx) 11864 break; 11865 11866 switch (i) { 11867 case PAT_WRITE_BACK: 11868 mode = "WB"; 11869 break; 11870 case PAT_WRITE_THROUGH: 11871 mode = "WT"; 11872 break; 11873 case PAT_UNCACHEABLE: 11874 mode = "UC"; 11875 break; 11876 case PAT_UNCACHED: 11877 mode = "U-"; 11878 break; 11879 case PAT_WRITE_PROTECTED: 11880 mode = "WP"; 11881 break; 11882 case PAT_WRITE_COMBINING: 11883 mode = "WC"; 11884 break; 11885 default: 11886 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", 11887 __func__, pat_idx, range->sva, eva); 11888 mode = "??"; 11889 break; 11890 } 11891 11892 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 11893 range->sva, eva, 11894 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', 11895 (range->attrs & pg_nx) != 0 ? '-' : 'x', 11896 (range->attrs & X86_PG_U) != 0 ? 'u' : 's', 11897 (range->attrs & X86_PG_G) != 0 ? 'g' : '-', 11898 mode, range->pdpes, range->pdes, range->ptes); 11899 11900 /* Reset to sentinel value. */ 11901 range->sva = la57 ? 
KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11902 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11903 NPDEPG - 1, NPTEPG - 1); 11904 } 11905 11906 /* 11907 * Determine whether the attributes specified by a page table entry match those 11908 * being tracked by the current range. This is not quite as simple as a direct 11909 * flag comparison since some PAT modes have multiple representations. 11910 */ 11911 static bool 11912 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 11913 { 11914 pt_entry_t diff, mask; 11915 11916 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; 11917 diff = (range->attrs ^ attrs) & mask; 11918 if (diff == 0) 11919 return (true); 11920 if ((diff & ~X86_PG_PDE_PAT) == 0 && 11921 pmap_pat_index(kernel_pmap, range->attrs, true) == 11922 pmap_pat_index(kernel_pmap, attrs, true)) 11923 return (true); 11924 return (false); 11925 } 11926 11927 static void 11928 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 11929 pt_entry_t attrs) 11930 { 11931 11932 memset(range, 0, sizeof(*range)); 11933 range->sva = va; 11934 range->attrs = attrs; 11935 } 11936 11937 /* 11938 * Given a leaf PTE, derive the mapping's attributes. If they do not match 11939 * those of the current run, dump the address range and its attributes, and 11940 * begin a new run. 11941 */ 11942 static void 11943 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 11944 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, 11945 pt_entry_t pte) 11946 { 11947 pt_entry_t attrs; 11948 11949 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); 11950 11951 attrs |= pdpe & pg_nx; 11952 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); 11953 if ((pdpe & PG_PS) != 0) { 11954 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); 11955 } else if (pde != 0) { 11956 attrs |= pde & pg_nx; 11957 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); 11958 } 11959 if ((pde & PG_PS) != 0) { 11960 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); 11961 } else if (pte != 0) { 11962 attrs |= pte & pg_nx; 11963 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); 11964 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); 11965 11966 /* Canonicalize by always using the PDE PAT bit. */ 11967 if ((attrs & X86_PG_PTE_PAT) != 0) 11968 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; 11969 } 11970 11971 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 11972 sysctl_kmaps_dump(sb, range, va); 11973 sysctl_kmaps_reinit(range, va, attrs); 11974 } 11975 } 11976 11977 static int 11978 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 11979 { 11980 struct pmap_kernel_map_range range; 11981 struct sbuf sbuf, *sb; 11982 pml4_entry_t pml4e; 11983 pdp_entry_t *pdp, pdpe; 11984 pd_entry_t *pd, pde; 11985 pt_entry_t *pt, pte; 11986 vm_offset_t sva; 11987 vm_paddr_t pa; 11988 int error, i, j, k, l; 11989 11990 error = sysctl_wire_old_buffer(req, 0); 11991 if (error != 0) 11992 return (error); 11993 sb = &sbuf; 11994 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 11995 11996 /* Sentinel value. */ 11997 range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11998 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11999 NPDEPG - 1, NPTEPG - 1); 12000 12001 /* 12002 * Iterate over the kernel page tables without holding the kernel pmap 12003 * lock. Outside of the large map, kernel page table pages are never 12004 * freed, so at worst we will observe inconsistencies in the output. 
12005 * Within the large map, ensure that PDP and PD page addresses are 12006 * valid before descending. 12007 */ 12008 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { 12009 switch (i) { 12010 case PML4PML4I: 12011 sbuf_printf(sb, "\nRecursive map:\n"); 12012 break; 12013 case DMPML4I: 12014 sbuf_printf(sb, "\nDirect map:\n"); 12015 break; 12016 #ifdef KASAN 12017 case KASANPML4I: 12018 sbuf_printf(sb, "\nKASAN shadow map:\n"); 12019 break; 12020 #endif 12021 #ifdef KMSAN 12022 case KMSANSHADPML4I: 12023 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 12024 break; 12025 case KMSANORIGPML4I: 12026 sbuf_printf(sb, "\nKMSAN origin map:\n"); 12027 break; 12028 #endif 12029 case KPML4BASE: 12030 sbuf_printf(sb, "\nKernel map:\n"); 12031 break; 12032 case LMSPML4I: 12033 sbuf_printf(sb, "\nLarge map:\n"); 12034 break; 12035 } 12036 12037 /* Convert to canonical form. */ 12038 if (sva == 1ul << 47) 12039 sva |= -1ul << 48; 12040 12041 restart: 12042 pml4e = kernel_pml4[i]; 12043 if ((pml4e & X86_PG_V) == 0) { 12044 sva = rounddown2(sva, NBPML4); 12045 sysctl_kmaps_dump(sb, &range, sva); 12046 sva += NBPML4; 12047 continue; 12048 } 12049 pa = pml4e & PG_FRAME; 12050 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); 12051 12052 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { 12053 pdpe = pdp[j]; 12054 if ((pdpe & X86_PG_V) == 0) { 12055 sva = rounddown2(sva, NBPDP); 12056 sysctl_kmaps_dump(sb, &range, sva); 12057 sva += NBPDP; 12058 continue; 12059 } 12060 pa = pdpe & PG_FRAME; 12061 if ((pdpe & PG_PS) != 0) { 12062 sva = rounddown2(sva, NBPDP); 12063 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 12064 0, 0); 12065 range.pdpes++; 12066 sva += NBPDP; 12067 continue; 12068 } 12069 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12070 vm_phys_paddr_to_vm_page(pa) == NULL) { 12071 /* 12072 * Page table pages for the large map may be 12073 * freed. Validate the next-level address 12074 * before descending. 12075 */ 12076 goto restart; 12077 } 12078 pd = (pd_entry_t *)PHYS_TO_DMAP(pa); 12079 12080 for (k = pmap_pde_index(sva); k < NPDEPG; k++) { 12081 pde = pd[k]; 12082 if ((pde & X86_PG_V) == 0) { 12083 sva = rounddown2(sva, NBPDR); 12084 sysctl_kmaps_dump(sb, &range, sva); 12085 sva += NBPDR; 12086 continue; 12087 } 12088 pa = pde & PG_FRAME; 12089 if ((pde & PG_PS) != 0) { 12090 sva = rounddown2(sva, NBPDR); 12091 sysctl_kmaps_check(sb, &range, sva, 12092 pml4e, pdpe, pde, 0); 12093 range.pdes++; 12094 sva += NBPDR; 12095 continue; 12096 } 12097 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12098 vm_phys_paddr_to_vm_page(pa) == NULL) { 12099 /* 12100 * Page table pages for the large map 12101 * may be freed. Validate the 12102 * next-level address before descending. 
12103 */ 12104 goto restart; 12105 } 12106 pt = (pt_entry_t *)PHYS_TO_DMAP(pa); 12107 12108 for (l = pmap_pte_index(sva); l < NPTEPG; l++, 12109 sva += PAGE_SIZE) { 12110 pte = pt[l]; 12111 if ((pte & X86_PG_V) == 0) { 12112 sysctl_kmaps_dump(sb, &range, 12113 sva); 12114 continue; 12115 } 12116 sysctl_kmaps_check(sb, &range, sva, 12117 pml4e, pdpe, pde, pte); 12118 range.ptes++; 12119 } 12120 } 12121 } 12122 } 12123 12124 error = sbuf_finish(sb); 12125 sbuf_delete(sb); 12126 return (error); 12127 } 12128 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 12129 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 12130 NULL, 0, sysctl_kmaps, "A", 12131 "Dump kernel address layout"); 12132 12133 #ifdef DDB 12134 DB_SHOW_COMMAND(pte, pmap_print_pte) 12135 { 12136 pmap_t pmap; 12137 pml5_entry_t *pml5; 12138 pml4_entry_t *pml4; 12139 pdp_entry_t *pdp; 12140 pd_entry_t *pde; 12141 pt_entry_t *pte, PG_V; 12142 vm_offset_t va; 12143 12144 if (!have_addr) { 12145 db_printf("show pte addr\n"); 12146 return; 12147 } 12148 va = (vm_offset_t)addr; 12149 12150 if (kdb_thread != NULL) 12151 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 12152 else 12153 pmap = PCPU_GET(curpmap); 12154 12155 PG_V = pmap_valid_bit(pmap); 12156 db_printf("VA 0x%016lx", va); 12157 12158 if (pmap_is_la57(pmap)) { 12159 pml5 = pmap_pml5e(pmap, va); 12160 db_printf(" pml5e 0x%016lx", *pml5); 12161 if ((*pml5 & PG_V) == 0) { 12162 db_printf("\n"); 12163 return; 12164 } 12165 pml4 = pmap_pml5e_to_pml4e(pml5, va); 12166 } else { 12167 pml4 = pmap_pml4e(pmap, va); 12168 } 12169 db_printf(" pml4e 0x%016lx", *pml4); 12170 if ((*pml4 & PG_V) == 0) { 12171 db_printf("\n"); 12172 return; 12173 } 12174 pdp = pmap_pml4e_to_pdpe(pml4, va); 12175 db_printf(" pdpe 0x%016lx", *pdp); 12176 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 12177 db_printf("\n"); 12178 return; 12179 } 12180 pde = pmap_pdpe_to_pde(pdp, va); 12181 db_printf(" pde 0x%016lx", *pde); 12182 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 12183 db_printf("\n"); 12184 return; 12185 } 12186 pte = pmap_pde_to_pte(pde, va); 12187 db_printf(" pte 0x%016lx\n", *pte); 12188 } 12189 12190 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 12191 { 12192 vm_paddr_t a; 12193 12194 if (have_addr) { 12195 a = (vm_paddr_t)addr; 12196 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 12197 } else { 12198 db_printf("show phys2dmap addr\n"); 12199 } 12200 } 12201 12202 static void 12203 ptpages_show_page(int level, int idx, vm_page_t pg) 12204 { 12205 db_printf("l %d i %d pg %p phys %#lx ref %x\n", 12206 level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); 12207 } 12208 12209 static void 12210 ptpages_show_complain(int level, int idx, uint64_t pte) 12211 { 12212 db_printf("l %d i %d pte %#lx\n", level, idx, pte); 12213 } 12214 12215 static void 12216 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) 12217 { 12218 vm_page_t pg3, pg2, pg1; 12219 pml4_entry_t *pml4; 12220 pdp_entry_t *pdp; 12221 pd_entry_t *pd; 12222 int i4, i3, i2; 12223 12224 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); 12225 for (i4 = 0; i4 < num_entries; i4++) { 12226 if ((pml4[i4] & PG_V) == 0) 12227 continue; 12228 pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); 12229 if (pg3 == NULL) { 12230 ptpages_show_complain(3, i4, pml4[i4]); 12231 continue; 12232 } 12233 ptpages_show_page(3, i4, pg3); 12234 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); 12235 for (i3 = 0; i3 < NPDPEPG; i3++) { 12236 if ((pdp[i3] & PG_V) == 0) 12237 continue; 12238 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); 
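/*
 * PHYS_TO_VM_PAGE() returns NULL when the entry's frame is not backed
 * by a vm_page (for example, a stale or malformed entry); complain and
 * skip it rather than dereferencing a NULL page below.
 */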
12239 if (pg2 == NULL) {
12240 ptpages_show_complain(2, i3, pdp[i3]);
12241 continue;
12242 }
12243 ptpages_show_page(2, i3, pg2);
12244 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
12245 for (i2 = 0; i2 < NPDEPG; i2++) {
12246 if ((pd[i2] & PG_V) == 0)
12247 continue;
12248 pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
12249 if (pg1 == NULL) {
12250 ptpages_show_complain(1, i2, pd[i2]);
12251 continue;
12252 }
12253 ptpages_show_page(1, i2, pg1);
12254 }
12255 }
12256 }
12257 }
12258
12259 DB_SHOW_COMMAND(ptpages, pmap_ptpages)
12260 {
12261 pmap_t pmap;
12262 vm_page_t pg;
12263 pml5_entry_t *pml5;
12264 uint64_t PG_V;
12265 int i5;
12266
12267 if (have_addr)
12268 pmap = (pmap_t)addr;
12269 else
12270 pmap = PCPU_GET(curpmap);
12271
12272 PG_V = pmap_valid_bit(pmap);
12273
12274 if (pmap_is_la57(pmap)) {
12275 pml5 = pmap->pm_pmltop;
12276 for (i5 = 0; i5 < NUPML5E; i5++) {
12277 if ((pml5[i5] & PG_V) == 0)
12278 continue;
12279 pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
12280 if (pg == NULL) {
12281 ptpages_show_complain(4, i5, pml5[i5]);
12282 continue;
12283 }
12284 ptpages_show_page(4, i5, pg);
12285 ptpages_show_pml4(pg, NPML4EPG, PG_V);
12286 }
12287 } else {
12288 ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
12289 (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
12290 }
12291 }
12292 #endif
12293
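/*
 * Illustrative DDB usage of the commands defined above; the addresses shown
 * are placeholders, not actual output:
 *
 *	db> show pte 0xffffffff81000000
 *	db> show phys2dmap 0x100000
 *	db> show ptpages
 *
 * "show ptpages" accepts an optional pmap address; without one it walks the
 * page table pages of the current pmap.
 */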