/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * Copyright (c) 2014-2019 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <x86/ifunc.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/sysarch.h>
#include <machine/tss.h>

static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static pt_entry_t pg_g;

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = pg_g;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_pku_mask_bit(pmap_t pmap)
{

	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
}

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */

/*
 * pmap_mapdev support pre initialization (i.e. console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv_list_locks[]
 * elements, but reads are not.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx __exclusive_cache_line pv_chunks_mutex;
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

static vmem_t *large_vmem;
static u_int lm_ents;

int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

int __read_frequently pti = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pti, 0,
    "Page Table Isolation enabled");
static vm_object_t pti_obj;
static pml4_entry_t *pti_pml4;
static vm_pindex_t pti_pg_idx;
static bool pti_finalized;

struct pmap_pkru_range {
	struct rs_el	pkru_rs_el;
	u_int		pkru_keyidx;
	int		pkru_flags;
};

static uma_zone_t pmap_pkru_ranges_zone;
static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *pkru_dup_range(void *ctx, void *data);
static void pkru_free_range(void *ctx, void *node);
static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void pmap_pkru_deassign_all(pmap_t pmap);

static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
	int i;
	uint64_t res;

	res = 0;
	CPU_FOREACH(i) {
		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
	}
	return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
};

static bool
pmap_not_in_di(void)
{

	return (curthread->td_md.md_invl_gen.gen == 0);
}

#define	PMAP_ASSERT_NOT_IN_DI() \
    KASSERT(pmap_not_in_di(), ("DI already started"))

/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 */
static void
pmap_delayed_invl_started(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finished(void)
{
	struct pmap_invl_gen *invl_gen, *next;
	struct turnstile *ts;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL) {
		turnstile_chain_lock(&invl_gen_ts);
		ts = turnstile_lookup(&invl_gen_ts);
		pmap_invl_gen = invl_gen->gen;
		if (ts != NULL) {
			turnstile_broadcast(ts, TS_SHARED_QUEUE);
			turnstile_unpend(ts);
		}
		turnstile_chain_unlock(&invl_gen_ts);
	} else {
		next->gen = invl_gen->gen;
	}
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
#endif

static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}
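
/*
 * A sketch of the typical delayed invalidation sequence, as used by the
 * mapping removal paths later in this file (the real callers also
 * interleave the pmap and PV list locking that is omitted here):
 *
 *	pmap_delayed_invl_started();
 *	... remove PTEs and PV entries, calling
 *	    pmap_delayed_invl_page(m) for each affected page m ...
 *	pmap_invalidate_page/range/all(pmap, ...);
 *	pmap_delayed_invl_finished();
 *
 * pmap_delayed_invl_wait() is the reader side: it is called with a page
 * whose PV list has been emptied, to wait out any such block still in
 * progress.
 */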
/*
 * Ensure that all currently executing DI blocks, that need to flush
 * TLB for the given page m, actually flushed the TLB at the time the
 * function returned.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait(vm_page_t m)
{
	struct turnstile *ts;
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;
#endif

	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			atomic_add_long(&invl_wait, 1);
			accounted = true;
		}
#endif
		ts = turnstile_trywait(&invl_gen_ts);
		if (*m_gen > pmap_invl_gen)
			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
		else
			turnstile_cancel(ts);
	}
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * current thread calls pmap_delayed_invl_finished().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	if (*m_gen < gen)
		*m_gen = gen;
}

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
		    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode,
    bool noflush);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    vm_prot_t prot, struct rwlock **lockp);
static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
    vm_offset_t eva);
static void pmap_invalidate_cache_range_all(vm_offset_t sva,
    vm_offset_t eva);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
    pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_large_map_getptp_unlocked(void);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
#if VM_NRESERVLEVEL > 0
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
    bool exec);
static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
		    pd_entry_t *pde, struct spglist *free,
		    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}


/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)

static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr, 1 << PDRSHIFT);
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 *
	 * Secondly, device memory mapped as part of setting up the low-
	 * level console(s) is taken from KVA, starting at virtual_avail.
	 * This is because cninit() is called after pmap_bootstrap() but
	 * before vm_init() and pmap_init().  20MB for a frame buffer is
	 * not uncommon.
	 */
	pt_pages += 32;		/* 64MB additional slop. */
#endif
	nkpt = pt_pages;
}
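
/*
 * Purely as an illustration of the arithmetic above: if the preloaded
 * kernel and bootstrap data end at roughly 100MB, pt_pages starts at
 * howmany(addr, 2MB) = 50, NKPDPE(50) = howmany(50, 512) = 1 raises it
 * to 51, and the fixed slop of 32 gives nkpt = 83 bootstrap page table
 * pages.
 */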
/*
 * Returns the proper write/execute permission for a physical page that is
 * part of the initial boot allocations.
 *
 * If the page has kernel text, it is marked as read-only. If the page has
 * kernel read-only data, it is marked as read-only/not-executable. If the
 * page has only read-write data, it is marked as read-write/not-executable.
 * If the page is below/above the kernel range, it is marked as read-write.
 *
 * This function operates on 2M pages, since we map the kernel space that
 * way.
 *
 * Note that this doesn't currently provide any protection for modules.
 */
static inline pt_entry_t
bootaddr_rwx(vm_paddr_t pa)
{

	/*
	 * Everything in the same 2M page as the start of the kernel
	 * should be static.  On the other hand, things in the same 2M
	 * page as the end of the kernel could be read-write/executable,
	 * as the kernel image is not guaranteed to end on a 2M boundary.
	 */
	if (pa < trunc_2mpage(btext - KERNBASE) ||
	    pa >= trunc_2mpage(_end - KERNBASE))
		return (X86_PG_RW);
	/*
	 * The linker should ensure that the read-only and read-write
	 * portions don't share the same 2M page, so this shouldn't
	 * impact read-only data.  However, in any case, any page with
	 * read-write data needs to be read-write.
	 */
	if (pa >= trunc_2mpage(brwsection - KERNBASE))
		return (X86_PG_RW | pg_nx);
	/*
	 * Mark any 2M page containing kernel text as read-only.  Mark
	 * other pages with read-only data as read-only and not executable.
	 * (It is likely a small portion of the read-only data section will
	 * be marked as read-only, but executable.  This should be acceptable
	 * since the read-only protection will keep the data from changing.)
	 * Note that fixups to the .text section will still work until we
	 * set CR0.WP.
	 */
	if (pa < round_2mpage(etext - KERNBASE))
		return (0);
	return (pg_nx);
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g, nkpdpe, nkdmpde;
	pt_entry_t *pt_p;
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;
	uint64_t DMPDkernphys;

	/* Allocate page table pages for the direct map */
	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0) {
		/*
		 * Calculate the number of 1G pages that will fully fit in
		 * Maxmem.
		 */
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;

		/*
		 * Allocate 2M pages for the kernel. These will be used in
		 * place of the first one or more 1G pages from ndm1g.
		 */
		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
		DMPDkernphys = allocpages(firstaddr, nkdmpde);
	}
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

	/* Fill in the underlying page table pages */
	/* XXX not fully used, underneath 2M pages */
	pt_p = (pt_entry_t *)KPTphys;
	for (i = 0; ptoa(i) < *firstaddr; i++)
		pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i));

	/* Now map the page tables at their location within PTmap */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
		    X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);

	/*
	 * Because we map the physical blocks in 2M pages, adjust firstaddr
	 * to record the physical blocks we've actually mapped into kernel
	 * virtual address space.
	 */
	*firstaddr = round_2mpage(*firstaddr);

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
		    X86_PG_M | X86_PG_A | pg_nx;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
		    X86_PG_M | X86_PG_A | pg_nx;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V;
	}

	/*
	 * Instead of using a 1G page for the memory containing the kernel,
	 * use 2M pages with appropriate permissions. (If using 1G pages,
	 * this will partially overwrite the PDPEs above.)
	 */
	if (ndm1g) {
		pd_p = (pd_entry_t *)DMPDkernphys;
		for (i = 0; i < (NPDEPG * nkdmpde); i++)
			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
			    X86_PG_M | X86_PG_A | pg_nx |
			    bootaddr_rwx(i << PDRSHIFT);
		for (i = 0; i < nkdmpde; i++)
			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
			    X86_PG_V;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
	}
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	uint64_t cr4;
	u_long res;
	int i;

	KERNend = *firstaddr;
	res = atop(KERNend - (vm_paddr_t)kernphys);

	if (!pti)
		pg_g = X86_PG_G;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Enable PG_G global pages, then switch to the kernel page
	 * table from the bootstrap page table.  After the switch, it
	 * is possible to enable SMEP and SMAP since PG_U bits are
	 * correct now.
	 */
	cr4 = rcr4();
	cr4 |= CR4_PGE;
	load_cr4(cr4);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		cr4 |= CR4_SMEP;
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cr4 |= CR4_SMAP;
	load_cr4(cr4);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 * Count bootstrap data as being resident in case any of this data is
	 * later unmapped (using pmap_remove()) and freed.
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_cr3 = KPML4phys;
	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_stats.resident_count = res;
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * Initialize the TLB invalidations generation number lock.
	 */
	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	CADDR1 = crashdumpmap;

	virtual_avail = va;

	/*
	 * Initialize the PAT MSR.
	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
	 * side-effect, invalidates stale PG_G TLB entries that might
	 * have been created in our pre-boot environment.
	 */
	pmap_init_pat();

	/* Initialize TLB Context Id. */
	if (pmap_pcid_enabled) {
		for (i = 0; i < MAXCPU; i++) {
			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
			kernel_pmap->pm_pcids[i].pm_gen = 1;
		}

		/*
		 * PMAP_PCID_KERN + 1 is used for initialization of
		 * proc0 pmap.  The pmap's pcid state might be used by
		 * EFIRT entry before first context switch, so it
		 * needs to be valid.
		 */
		PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
		PCPU_SET(pcid_gen, 1);

		/*
		 * pcpu area for APs is zeroed during AP startup.
		 * pc_pcid_next and pc_pcid_gen are initialized by AP
		 * during pcpu setup.
		 */
		load_cr4(rcr4() | CR4_PCIDE);
	}
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0)
		panic("no PAT??");

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = -1;
	pat_index[PAT_WRITE_BACK] = 0;
	pat_index[PAT_WRITE_THROUGH] = 1;
	pat_index[PAT_UNCACHEABLE] = 3;
	pat_index[PAT_WRITE_COMBINING] = 6;
	pat_index[PAT_WRITE_PROTECTED] = 5;
	pat_index[PAT_UNCACHED] = 2;

	/*
	 * Initialize default PAT entries.
	 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
	 * Program 5 and 6 as WP and WC.
	 *
	 * Leave 4 and 7 as WB and UC.  Note that a recursive page table
	 * mapping for a 2M page uses a PAT value with the bit 3 set due
	 * to its overload with PG_PS.
	 */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_PROTECTED) |
	    PAT_VALUE(6, PAT_WRITE_COMBINING) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t m, mpte;
	vm_size_t s;
	int error, i, pv_npg, ret, skz63;

	/* L1TF, reserve page @0 unconditionally */
	vm_page_blacklist_add(0, bootverbose);

	/* Detect bare-metal Skylake Server and Skylake-X. */
	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
		/*
		 * Skylake-X errata SKZ63. Processor May Hang When
		 * Executing Code In an HLE Transaction Region between
		 * 40000000H and 403FFFFFH.
		 *
		 * Mark the pages in the range as preallocated.  It
		 * seems to be impossible to distinguish between
		 * Skylake Server and Skylake X.
		 */
		skz63 = 1;
		TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
		if (skz63 != 0) {
			if (bootverbose)
				printf("SKZ63: skipping 4M RAM starting "
				    "at physical 1G\n");
			for (i = 0; i < atop(0x400000); i++) {
				ret = vm_page_blacklist_add(0x40000000 +
				    ptoa(i), FALSE);
				if (!ret && bootverbose)
					printf("page at %#lx already used\n",
					    0x40000000 + ptoa(i));
			}
		}
	}

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	PMAP_LOCK(kernel_pmap);
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
		mpte->wire_count = 1;
		if (i << PDRSHIFT < KERNend &&
		    pmap_insert_pt_page(kernel_pmap, mpte))
			panic("pmap_init: pmap_insert_pt_page failed");
	}
	PMAP_UNLOCK(kernel_pmap);
	vm_wire_add(nkpt);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	pmap_initialized = 1;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		/* Make the direct map consistent */
		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
			    ppim->sz, ppim->mode);
		}
		if (!bootverbose)
			continue;
		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}

	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
	    (vmem_addr_t *)&qframe);
	if (error != 0)
		panic("qframe allocation failed");

	lm_ents = 8;
	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
	if (lm_ents > LMEPML4I - LMSPML4I + 1)
		lm_ents = LMEPML4I - LMSPML4I + 1;
	if (bootverbose)
		printf("pmap: large map %u PML4 slots (%lu Gb)\n",
		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
	if (lm_ents != 0) {
		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
		if (large_vmem == NULL) {
			printf("pmap: cannot create large map\n");
			lm_ents = 0;
		}
		for (i = 0; i < lm_ents; i++) {
			m = pmap_large_map_getptp_unlocked();
			kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
			    VM_PAGE_TO_PHYS(m);
		}
	}
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");
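
/*
 * The counters above are exported as read-only sysctls under the
 * vm.pmap.pde and vm.pmap.pdpe nodes; for example,
 * "sysctl vm.pmap.pde.promotions" reports the number of 2MB page
 * promotions performed so far.
 */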
/***************************************************
 * Low level helper routines.....
 ***************************************************/

static pt_entry_t
pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
{
	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* Verify that both PAT bits are not set at the same time */
		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
		    ("Invalid PAT bits in entry %#lx", entry));

		/* Swap the PAT bits if one of them is set */
		if ((entry & x86_pat_bits) != 0)
			entry ^= x86_pat_bits;
		break;
	case PT_EPT:
		/*
		 * Nothing to do - the memory attributes are represented
		 * the same way for regular pages and superpages.
		 */
		break;
	default:
		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
	}

	return (entry);
}

boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= 0 && mode < PAT_INDEX_SIZE &&
	    pat_index[(int)mode] >= 0);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (!pmap_is_valid_memattr(pmap, mode))
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}

static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}
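
/*
 * A worked example of the PAT index encoding above (an illustration,
 * assuming the boot-time PAT programming done in pmap_init_pat()):
 * for PAT_WRITE_COMBINING, pat_index[] holds 6 (binary 110), so for a
 * regular PT_X86 4KB mapping pmap_cache_bits() returns
 * X86_PG_PTE_PAT | PG_NC_PCD while the PWT bit stays clear; a 2MB
 * mapping encodes the same index using X86_PG_PDE_PAT instead.
 */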
bool
pmap_ps_enabled(pmap_t pmap)
{

	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

static void
pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
{

	switch (pmap->pm_type) {
	case PT_X86:
		break;
	case PT_RVI:
	case PT_EPT:
		/*
		 * XXX
		 * This is a little bogus since the generation number is
		 * supposed to be bumped up when a region of the address
		 * space is invalidated in the page tables.
		 *
		 * In this case the old PDE entry is valid but yet we want
		 * to make sure that any mappings using the old entry are
		 * invalidated in the TLB.
		 *
		 * The reason this works as expected is because we rendezvous
		 * "all" host cpus and force any vcpu context to exit as a
		 * side-effect.
		 */
		atomic_add_acq_long(&pmap->pm_eptgen, 1);
		break;
	default:
		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
	}
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
	pt_entry_t PG_G;

	if (pmap_type_guest(pmap))
		return;

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

	PG_G = pmap_global_bit(pmap);

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		invltlb_glob();
	}
}
#ifdef SMP

/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */

/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
 */
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
	int ipinum;

	sched_pin();
	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("pmap_invalidate_ept: absurd pm_active"));

	/*
	 * The TLB mappings associated with a vcpu context are not
	 * flushed each time a different vcpu is chosen to execute.
	 *
	 * This is in contrast with a process's vtop mappings that
	 * are flushed from the TLB on each context switch.
	 *
	 * Therefore we need to do more than just a TLB shootdown on
	 * the active cpus in 'pmap->pm_active'.  To do this we keep
	 * track of the number of invalidations performed on this pmap.
	 *
	 * Each vcpu keeps a cache of this counter and compares it
	 * just before a vmresume.  If the counter is out-of-date an
	 * invept will be done to flush stale mappings from the TLB.
	 */
	atomic_add_acq_long(&pmap->pm_eptgen, 1);

	/*
	 * Force the vcpu to exit and trap back into the hypervisor.
	 */
	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
	ipi_selected(pmap->pm_active, ipinum);
	sched_unpin();
}

static cpuset_t
pmap_invalidate_cpu_mask(pmap_t pmap)
{

	return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
}

static inline void
pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
    const bool invpcid_works1)
{
	struct invpcid_descr d;
	uint64_t kcr3, ucr3;
	uint32_t pcid;
	u_int cpuid, i;

	cpuid = PCPU_GET(cpuid);
	if (pmap == PCPU_GET(curpmap)) {
		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
			/*
			 * Because pm_pcid is recalculated on a
			 * context switch, we must disable switching.
			 * Otherwise, we might use a stale value
			 * below.
			 */
			critical_enter();
			pcid = pmap->pm_pcids[cpuid].pm_pcid;
			if (invpcid_works1) {
				d.pcid = pcid | PMAP_PCID_USER_PT;
				d.pad = 0;
				d.addr = va;
				invpcid(&d, INVPCID_ADDR);
			} else {
				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
				ucr3 = pmap->pm_ucr3 | pcid |
				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
			}
			critical_exit();
		}
	} else
		pmap->pm_pcids[cpuid].pm_gen = 0;

	CPU_FOREACH(i) {
		if (cpuid != i)
			pmap->pm_pcids[i].pm_gen = 0;
	}

	/*
	 * The fence is between stores to pm_gen and the read of the
	 * pm_active mask.  We need to ensure that it is impossible
	 * for us to miss the bit update in pm_active and
	 * simultaneously observe a non-zero pm_gen in
	 * pmap_activate_sw(), otherwise TLB update is missed.
	 * Without the fence, IA32 allows such an outcome.  Note that
	 * pm_active is updated by a locked operation, which provides
	 * the reciprocal fence.
	 */
	atomic_thread_fence_seq_cst();
}

static void
pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
{

	pmap_invalidate_page_pcid(pmap, va, true);
}

static void
pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
{

	pmap_invalidate_page_pcid(pmap, va, false);
}

static void
pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
{
}
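
/*
 * The pmap_invalidate_*_mode handlers below are resolved once at boot
 * via DEFINE_IFUNC: depending on whether PCIDs are in use and whether
 * the invpcid instruction is available, calls bind directly to the
 * matching _pcid_invpcid, _pcid_noinvpcid, or _nopcid variant, so no
 * per-call branching on these flags is needed afterwards.
 */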
pmap_invalidate_page_pcid_invpcid : 1828 pmap_invalidate_page_pcid_noinvpcid); 1829 return (pmap_invalidate_page_nopcid); 1830 } 1831 1832 void 1833 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1834 { 1835 1836 if (pmap_type_guest(pmap)) { 1837 pmap_invalidate_ept(pmap); 1838 return; 1839 } 1840 1841 KASSERT(pmap->pm_type == PT_X86, 1842 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 1843 1844 sched_pin(); 1845 if (pmap == kernel_pmap) { 1846 invlpg(va); 1847 } else { 1848 if (pmap == PCPU_GET(curpmap)) 1849 invlpg(va); 1850 pmap_invalidate_page_mode(pmap, va); 1851 } 1852 smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); 1853 sched_unpin(); 1854 } 1855 1856 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 1857 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 1858 1859 static void 1860 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1861 const bool invpcid_works1) 1862 { 1863 struct invpcid_descr d; 1864 uint64_t kcr3, ucr3; 1865 uint32_t pcid; 1866 u_int cpuid, i; 1867 1868 cpuid = PCPU_GET(cpuid); 1869 if (pmap == PCPU_GET(curpmap)) { 1870 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 1871 critical_enter(); 1872 pcid = pmap->pm_pcids[cpuid].pm_pcid; 1873 if (invpcid_works1) { 1874 d.pcid = pcid | PMAP_PCID_USER_PT; 1875 d.pad = 0; 1876 d.addr = sva; 1877 for (; d.addr < eva; d.addr += PAGE_SIZE) 1878 invpcid(&d, INVPCID_ADDR); 1879 } else { 1880 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 1881 ucr3 = pmap->pm_ucr3 | pcid | 1882 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 1883 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 1884 } 1885 critical_exit(); 1886 } 1887 } else 1888 pmap->pm_pcids[cpuid].pm_gen = 0; 1889 1890 CPU_FOREACH(i) { 1891 if (cpuid != i) 1892 pmap->pm_pcids[i].pm_gen = 0; 1893 } 1894 /* See the comment in pmap_invalidate_page_pcid(). */ 1895 atomic_thread_fence_seq_cst(); 1896 } 1897 1898 static void 1899 pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, 1900 vm_offset_t eva) 1901 { 1902 1903 pmap_invalidate_range_pcid(pmap, sva, eva, true); 1904 } 1905 1906 static void 1907 pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, 1908 vm_offset_t eva) 1909 { 1910 1911 pmap_invalidate_range_pcid(pmap, sva, eva, false); 1912 } 1913 1914 static void 1915 pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1916 { 1917 } 1918 1919 DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, 1920 vm_offset_t), static) 1921 { 1922 1923 if (pmap_pcid_enabled) 1924 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid : 1925 pmap_invalidate_range_pcid_noinvpcid); 1926 return (pmap_invalidate_range_nopcid); 1927 } 1928 1929 void 1930 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1931 { 1932 vm_offset_t addr; 1933 1934 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 1935 pmap_invalidate_all(pmap); 1936 return; 1937 } 1938 1939 if (pmap_type_guest(pmap)) { 1940 pmap_invalidate_ept(pmap); 1941 return; 1942 } 1943 1944 KASSERT(pmap->pm_type == PT_X86, 1945 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 1946 1947 sched_pin(); 1948 if (pmap == kernel_pmap) { 1949 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1950 invlpg(addr); 1951 } else { 1952 if (pmap == PCPU_GET(curpmap)) { 1953 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1954 invlpg(addr); 1955 } 1956 pmap_invalidate_range_mode(pmap, sva, eva); 1957 } 1958 smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); 1959 sched_unpin(); 1960 } 1961 1962 static inline void 1963 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) 1964 { 1965 struct invpcid_descr d; 1966 uint64_t kcr3, ucr3; 1967 uint32_t pcid; 1968 u_int cpuid, i; 1969 1970 if (pmap == kernel_pmap) { 1971 if (invpcid_works1) { 1972 bzero(&d, sizeof(d)); 1973 invpcid(&d, INVPCID_CTXGLOB); 1974 } else { 1975 invltlb_glob(); 1976 } 1977 } else { 1978 cpuid = PCPU_GET(cpuid); 1979 if (pmap == PCPU_GET(curpmap)) { 1980 critical_enter(); 1981 pcid = pmap->pm_pcids[cpuid].pm_pcid; 1982 if (invpcid_works1) { 1983 d.pcid = pcid; 1984 d.pad = 0; 1985 d.addr = 0; 1986 invpcid(&d, INVPCID_CTX); 1987 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 1988 d.pcid |= PMAP_PCID_USER_PT; 1989 invpcid(&d, INVPCID_CTX); 1990 } 1991 } else { 1992 kcr3 = pmap->pm_cr3 | pcid; 1993 ucr3 = pmap->pm_ucr3; 1994 if (ucr3 != PMAP_NO_CR3) { 1995 ucr3 |= pcid | PMAP_PCID_USER_PT; 1996 pmap_pti_pcid_invalidate(ucr3, kcr3); 1997 } else { 1998 load_cr3(kcr3); 1999 } 2000 } 2001 critical_exit(); 2002 } else 2003 pmap->pm_pcids[cpuid].pm_gen = 0; 2004 CPU_FOREACH(i) { 2005 if (cpuid != i) 2006 pmap->pm_pcids[i].pm_gen = 0; 2007 } 2008 } 2009 /* See the comment in pmap_invalidate_page_pcid(). */ 2010 atomic_thread_fence_seq_cst(); 2011 } 2012 2013 static void 2014 pmap_invalidate_all_pcid_invpcid(pmap_t pmap) 2015 { 2016 2017 pmap_invalidate_all_pcid(pmap, true); 2018 } 2019 2020 static void 2021 pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) 2022 { 2023 2024 pmap_invalidate_all_pcid(pmap, false); 2025 } 2026 2027 static void 2028 pmap_invalidate_all_nopcid(pmap_t pmap) 2029 { 2030 2031 if (pmap == kernel_pmap) 2032 invltlb_glob(); 2033 else if (pmap == PCPU_GET(curpmap)) 2034 invltlb(); 2035 } 2036 2037 DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static) 2038 { 2039 2040 if (pmap_pcid_enabled) 2041 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid : 2042 pmap_invalidate_all_pcid_noinvpcid); 2043 return (pmap_invalidate_all_nopcid); 2044 } 2045 2046 void 2047 pmap_invalidate_all(pmap_t pmap) 2048 { 2049 2050 if (pmap_type_guest(pmap)) { 2051 pmap_invalidate_ept(pmap); 2052 return; 2053 } 2054 2055 KASSERT(pmap->pm_type == PT_X86, 2056 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 2057 2058 sched_pin(); 2059 pmap_invalidate_all_mode(pmap); 2060 smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); 2061 sched_unpin(); 2062 } 2063 2064 void 2065 pmap_invalidate_cache(void) 2066 { 2067 2068 sched_pin(); 2069 wbinvd(); 2070 smp_cache_flush(); 2071 sched_unpin(); 2072 } 2073 2074 struct pde_action { 2075 cpuset_t invalidate; /* processors that invalidate their TLB */ 2076 pmap_t pmap; 2077 vm_offset_t va; 2078 pd_entry_t *pde; 2079 pd_entry_t newpde; 2080 u_int store; /* processor that updates the PDE */ 2081 }; 2082 2083 static void 2084 pmap_update_pde_action(void *arg) 2085 { 2086 struct pde_action *act = arg; 2087 2088 if (act->store == PCPU_GET(cpuid)) 2089 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 2090 } 2091 2092 static void 2093 pmap_update_pde_teardown(void *arg) 2094 { 2095 struct pde_action *act = arg; 2096 2097 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 2098 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 2099 } 2100 2101 /* 2102 * Change the page size for the specified virtual address in a way that 2103 * prevents any possibility of the TLB ever having two entries that map the 2104 * same virtual address using different page sizes. This is the recommended 2105 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 2106 * machine check exception for a TLB state that is improperly diagnosed as a 2107 * hardware error. 2108 */ 2109 static void 2110 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 2111 { 2112 struct pde_action act; 2113 cpuset_t active, other_cpus; 2114 u_int cpuid; 2115 2116 sched_pin(); 2117 cpuid = PCPU_GET(cpuid); 2118 other_cpus = all_cpus; 2119 CPU_CLR(cpuid, &other_cpus); 2120 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 2121 active = all_cpus; 2122 else { 2123 active = pmap->pm_active; 2124 } 2125 if (CPU_OVERLAP(&active, &other_cpus)) { 2126 act.store = cpuid; 2127 act.invalidate = active; 2128 act.va = va; 2129 act.pmap = pmap; 2130 act.pde = pde; 2131 act.newpde = newpde; 2132 CPU_SET(cpuid, &active); 2133 smp_rendezvous_cpus(active, 2134 smp_no_rendezvous_barrier, pmap_update_pde_action, 2135 pmap_update_pde_teardown, &act); 2136 } else { 2137 pmap_update_pde_store(pmap, pde, newpde); 2138 if (CPU_ISSET(cpuid, &active)) 2139 pmap_update_pde_invalidate(pmap, va, newpde); 2140 } 2141 sched_unpin(); 2142 } 2143 #else /* !SMP */ 2144 /* 2145 * Normal, non-SMP, invalidation functions. 
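 *
 * With no other processors to notify, these reduce to the local
 * INVLPG/INVLTLB (or INVPCID) work done by the SMP versions above,
 * plus maintenance of the single pm_pcids[0] slot when PCIDs are in
 * use.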
2146 */ 2147 void 2148 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 2149 { 2150 struct invpcid_descr d; 2151 uint64_t kcr3, ucr3; 2152 uint32_t pcid; 2153 2154 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2155 pmap->pm_eptgen++; 2156 return; 2157 } 2158 KASSERT(pmap->pm_type == PT_X86, 2159 ("pmap_invalidate_page: unknown type %d", pmap->pm_type)); 2160 2161 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 2162 invlpg(va); 2163 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 2164 pmap->pm_ucr3 != PMAP_NO_CR3) { 2165 critical_enter(); 2166 pcid = pmap->pm_pcids[0].pm_pcid; 2167 if (invpcid_works) { 2168 d.pcid = pcid | PMAP_PCID_USER_PT; 2169 d.pad = 0; 2170 d.addr = va; 2171 invpcid(&d, INVPCID_ADDR); 2172 } else { 2173 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 2174 ucr3 = pmap->pm_ucr3 | pcid | 2175 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2176 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 2177 } 2178 critical_exit(); 2179 } 2180 } else if (pmap_pcid_enabled) 2181 pmap->pm_pcids[0].pm_gen = 0; 2182 } 2183 2184 void 2185 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2186 { 2187 struct invpcid_descr d; 2188 vm_offset_t addr; 2189 uint64_t kcr3, ucr3; 2190 2191 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2192 pmap->pm_eptgen++; 2193 return; 2194 } 2195 KASSERT(pmap->pm_type == PT_X86, 2196 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 2197 2198 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 2199 for (addr = sva; addr < eva; addr += PAGE_SIZE) 2200 invlpg(addr); 2201 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 2202 pmap->pm_ucr3 != PMAP_NO_CR3) { 2203 critical_enter(); 2204 if (invpcid_works) { 2205 d.pcid = pmap->pm_pcids[0].pm_pcid | 2206 PMAP_PCID_USER_PT; 2207 d.pad = 0; 2208 d.addr = sva; 2209 for (; d.addr < eva; d.addr += PAGE_SIZE) 2210 invpcid(&d, INVPCID_ADDR); 2211 } else { 2212 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 2213 pm_pcid | CR3_PCID_SAVE; 2214 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
2215 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2216 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 2217 } 2218 critical_exit(); 2219 } 2220 } else if (pmap_pcid_enabled) { 2221 pmap->pm_pcids[0].pm_gen = 0; 2222 } 2223 } 2224 2225 void 2226 pmap_invalidate_all(pmap_t pmap) 2227 { 2228 struct invpcid_descr d; 2229 uint64_t kcr3, ucr3; 2230 2231 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2232 pmap->pm_eptgen++; 2233 return; 2234 } 2235 KASSERT(pmap->pm_type == PT_X86, 2236 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 2237 2238 if (pmap == kernel_pmap) { 2239 if (pmap_pcid_enabled && invpcid_works) { 2240 bzero(&d, sizeof(d)); 2241 invpcid(&d, INVPCID_CTXGLOB); 2242 } else { 2243 invltlb_glob(); 2244 } 2245 } else if (pmap == PCPU_GET(curpmap)) { 2246 if (pmap_pcid_enabled) { 2247 critical_enter(); 2248 if (invpcid_works) { 2249 d.pcid = pmap->pm_pcids[0].pm_pcid; 2250 d.pad = 0; 2251 d.addr = 0; 2252 invpcid(&d, INVPCID_CTX); 2253 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2254 d.pcid |= PMAP_PCID_USER_PT; 2255 invpcid(&d, INVPCID_CTX); 2256 } 2257 } else { 2258 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 2259 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2260 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 2261 0].pm_pcid | PMAP_PCID_USER_PT; 2262 pmap_pti_pcid_invalidate(ucr3, kcr3); 2263 } else 2264 load_cr3(kcr3); 2265 } 2266 critical_exit(); 2267 } else { 2268 invltlb(); 2269 } 2270 } else if (pmap_pcid_enabled) { 2271 pmap->pm_pcids[0].pm_gen = 0; 2272 } 2273 } 2274 2275 PMAP_INLINE void 2276 pmap_invalidate_cache(void) 2277 { 2278 2279 wbinvd(); 2280 } 2281 2282 static void 2283 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 2284 { 2285 2286 pmap_update_pde_store(pmap, pde, newpde); 2287 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 2288 pmap_update_pde_invalidate(pmap, va, newpde); 2289 else 2290 pmap->pm_pcids[0].pm_gen = 0; 2291 } 2292 #endif /* !SMP */ 2293 2294 static void 2295 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 2296 { 2297 2298 /* 2299 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 2300 * by a promotion that did not invalidate the 512 4KB page mappings 2301 * that might exist in the TLB. Consequently, at this point, the TLB 2302 * may hold both 4KB and 2MB page mappings for the address range [va, 2303 * va + NBPDR). Therefore, the entire range must be invalidated here. 2304 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 2305 * 4KB page mappings for the address range [va, va + NBPDR), and so a 2306 * single INVLPG suffices to invalidate the 2MB page mapping from the 2307 * TLB. 
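 *
 * Note that NBPDR (2MB) is far below PMAP_INVLPG_THRESHOLD (16MB), so
 * the ranged invalidation below invalidates the 512 4KB entries
 * individually rather than degrading to a full TLB flush.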
2308 */ 2309 if ((pde & PG_PROMOTED) != 0) 2310 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 2311 else 2312 pmap_invalidate_page(pmap, va); 2313 } 2314 2315 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 2316 (vm_offset_t sva, vm_offset_t eva), static) 2317 { 2318 2319 if ((cpu_feature & CPUID_SS) != 0) 2320 return (pmap_invalidate_cache_range_selfsnoop); 2321 if ((cpu_feature & CPUID_CLFSH) != 0) 2322 return (pmap_force_invalidate_cache_range); 2323 return (pmap_invalidate_cache_range_all); 2324 } 2325 2326 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 2327 2328 static void 2329 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 2330 { 2331 2332 KASSERT((sva & PAGE_MASK) == 0, 2333 ("pmap_invalidate_cache_range: sva not page-aligned")); 2334 KASSERT((eva & PAGE_MASK) == 0, 2335 ("pmap_invalidate_cache_range: eva not page-aligned")); 2336 } 2337 2338 static void 2339 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 2340 { 2341 2342 pmap_invalidate_cache_range_check_align(sva, eva); 2343 } 2344 2345 void 2346 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 2347 { 2348 2349 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 2350 2351 /* 2352 * XXX: Some CPUs fault, hang, or trash the local APIC 2353 * registers if we use CLFLUSH on the local APIC range. The 2354 * local APIC is always uncached, so we don't need to flush 2355 * for that range anyway. 2356 */ 2357 if (pmap_kextract(sva) == lapic_paddr) 2358 return; 2359 2360 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 2361 /* 2362 * Do per-cache line flush. Use the sfence 2363 * instruction to insure that previous stores are 2364 * included in the write-back. The processor 2365 * propagates flush to other processors in the cache 2366 * coherence domain. 2367 */ 2368 sfence(); 2369 for (; sva < eva; sva += cpu_clflush_line_size) 2370 clflushopt(sva); 2371 sfence(); 2372 } else { 2373 /* 2374 * Writes are ordered by CLFLUSH on Intel CPUs. 2375 */ 2376 if (cpu_vendor_id != CPU_VENDOR_INTEL) 2377 mfence(); 2378 for (; sva < eva; sva += cpu_clflush_line_size) 2379 clflush(sva); 2380 if (cpu_vendor_id != CPU_VENDOR_INTEL) 2381 mfence(); 2382 } 2383 } 2384 2385 static void 2386 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 2387 { 2388 2389 pmap_invalidate_cache_range_check_align(sva, eva); 2390 pmap_invalidate_cache(); 2391 } 2392 2393 /* 2394 * Remove the specified set of pages from the data and instruction caches. 2395 * 2396 * In contrast to pmap_invalidate_cache_range(), this function does not 2397 * rely on the CPU's self-snoop feature, because it is intended for use 2398 * when moving pages into a different cache domain. 
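 *
 * With PMAP_CLFLUSH_THRESHOLD (2MB) and 4KB pages, the per-line
 * CLFLUSH/CLFLUSHOPT path below is used only for fewer than 512
 * pages; larger requests, or CPUs with neither instruction, fall back
 * to a full cache flush via pmap_invalidate_cache().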
2399 */ 2400 void 2401 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 2402 { 2403 vm_offset_t daddr, eva; 2404 int i; 2405 bool useclflushopt; 2406 2407 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 2408 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 2409 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 2410 pmap_invalidate_cache(); 2411 else { 2412 if (useclflushopt) 2413 sfence(); 2414 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 2415 mfence(); 2416 for (i = 0; i < count; i++) { 2417 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 2418 eva = daddr + PAGE_SIZE; 2419 for (; daddr < eva; daddr += cpu_clflush_line_size) { 2420 if (useclflushopt) 2421 clflushopt(daddr); 2422 else 2423 clflush(daddr); 2424 } 2425 } 2426 if (useclflushopt) 2427 sfence(); 2428 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 2429 mfence(); 2430 } 2431 } 2432 2433 void 2434 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 2435 { 2436 2437 pmap_invalidate_cache_range_check_align(sva, eva); 2438 2439 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 2440 pmap_force_invalidate_cache_range(sva, eva); 2441 return; 2442 } 2443 2444 /* See comment in pmap_force_invalidate_cache_range(). */ 2445 if (pmap_kextract(sva) == lapic_paddr) 2446 return; 2447 2448 sfence(); 2449 for (; sva < eva; sva += cpu_clflush_line_size) 2450 clwb(sva); 2451 sfence(); 2452 } 2453 2454 void 2455 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 2456 { 2457 pt_entry_t *pte; 2458 vm_offset_t vaddr; 2459 int error, pte_bits; 2460 2461 KASSERT((spa & PAGE_MASK) == 0, 2462 ("pmap_flush_cache_phys_range: spa not page-aligned")); 2463 KASSERT((epa & PAGE_MASK) == 0, 2464 ("pmap_flush_cache_phys_range: epa not page-aligned")); 2465 2466 if (spa < dmaplimit) { 2467 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 2468 dmaplimit, epa))); 2469 if (dmaplimit >= epa) 2470 return; 2471 spa = dmaplimit; 2472 } 2473 2474 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 2475 X86_PG_V; 2476 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2477 &vaddr); 2478 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 2479 pte = vtopte(vaddr); 2480 for (; spa < epa; spa += PAGE_SIZE) { 2481 sched_pin(); 2482 pte_store(pte, spa | pte_bits); 2483 invlpg(vaddr); 2484 /* XXXKIB sfences inside flush_cache_range are excessive */ 2485 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 2486 sched_unpin(); 2487 } 2488 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 2489 } 2490 2491 /* 2492 * Routine: pmap_extract 2493 * Function: 2494 * Extract the physical page address associated 2495 * with the given map/virtual_address pair. 
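 *
 *	The walk below checks the PDPE and PDE levels explicitly, so
 *	1GB (PG_PS in the PDPE) and 2MB (PG_PS in the PDE) mappings are
 *	handled as well as 4KB PTEs; zero is returned if no valid
 *	mapping exists.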
2496 */ 2497 vm_paddr_t 2498 pmap_extract(pmap_t pmap, vm_offset_t va) 2499 { 2500 pdp_entry_t *pdpe; 2501 pd_entry_t *pde; 2502 pt_entry_t *pte, PG_V; 2503 vm_paddr_t pa; 2504 2505 pa = 0; 2506 PG_V = pmap_valid_bit(pmap); 2507 PMAP_LOCK(pmap); 2508 pdpe = pmap_pdpe(pmap, va); 2509 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2510 if ((*pdpe & PG_PS) != 0) 2511 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 2512 else { 2513 pde = pmap_pdpe_to_pde(pdpe, va); 2514 if ((*pde & PG_V) != 0) { 2515 if ((*pde & PG_PS) != 0) { 2516 pa = (*pde & PG_PS_FRAME) | 2517 (va & PDRMASK); 2518 } else { 2519 pte = pmap_pde_to_pte(pde, va); 2520 pa = (*pte & PG_FRAME) | 2521 (va & PAGE_MASK); 2522 } 2523 } 2524 } 2525 } 2526 PMAP_UNLOCK(pmap); 2527 return (pa); 2528 } 2529 2530 /* 2531 * Routine: pmap_extract_and_hold 2532 * Function: 2533 * Atomically extract and hold the physical page 2534 * with the given pmap and virtual address pair 2535 * if that mapping permits the given protection. 2536 */ 2537 vm_page_t 2538 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2539 { 2540 pd_entry_t pde, *pdep; 2541 pt_entry_t pte, PG_RW, PG_V; 2542 vm_paddr_t pa; 2543 vm_page_t m; 2544 2545 pa = 0; 2546 m = NULL; 2547 PG_RW = pmap_rw_bit(pmap); 2548 PG_V = pmap_valid_bit(pmap); 2549 PMAP_LOCK(pmap); 2550 retry: 2551 pdep = pmap_pde(pmap, va); 2552 if (pdep != NULL && (pde = *pdep)) { 2553 if (pde & PG_PS) { 2554 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 2555 if (vm_page_pa_tryrelock(pmap, (pde & 2556 PG_PS_FRAME) | (va & PDRMASK), &pa)) 2557 goto retry; 2558 m = PHYS_TO_VM_PAGE(pa); 2559 } 2560 } else { 2561 pte = *pmap_pde_to_pte(pdep, va); 2562 if ((pte & PG_V) && 2563 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 2564 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 2565 &pa)) 2566 goto retry; 2567 m = PHYS_TO_VM_PAGE(pa); 2568 } 2569 } 2570 if (m != NULL) 2571 vm_page_hold(m); 2572 } 2573 PA_UNLOCK_COND(pa); 2574 PMAP_UNLOCK(pmap); 2575 return (m); 2576 } 2577 2578 vm_paddr_t 2579 pmap_kextract(vm_offset_t va) 2580 { 2581 pd_entry_t pde; 2582 vm_paddr_t pa; 2583 2584 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 2585 pa = DMAP_TO_PHYS(va); 2586 } else { 2587 pde = *vtopde(va); 2588 if (pde & PG_PS) { 2589 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 2590 } else { 2591 /* 2592 * Beware of a concurrent promotion that changes the 2593 * PDE at this point! For example, vtopte() must not 2594 * be used to access the PTE because it would use the 2595 * new PDE. It is, however, safe to use the old PDE 2596 * because the page table page is preserved by the 2597 * promotion. 2598 */ 2599 pa = *pmap_pde_to_pte(&pde, va); 2600 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 2601 } 2602 } 2603 return (pa); 2604 } 2605 2606 /*************************************************** 2607 * Low level mapping routines..... 2608 ***************************************************/ 2609 2610 /* 2611 * Add a wired page to the kva. 2612 * Note: not SMP coherent. 2613 */ 2614 PMAP_INLINE void 2615 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 2616 { 2617 pt_entry_t *pte; 2618 2619 pte = vtopte(va); 2620 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); 2621 } 2622 2623 static __inline void 2624 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 2625 { 2626 pt_entry_t *pte; 2627 int cache_bits; 2628 2629 pte = vtopte(va); 2630 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 2631 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); 2632 } 2633 2634 /* 2635 * Remove a page from the kernel pagetables. 
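 * Only the PTE is cleared; TLB invalidation is left to the caller.
 * pmap_qremove() below, for example, clears a run of PTEs this way
 * and then issues a single ranged shootdown with
 * pmap_invalidate_range().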
2636 * Note: not SMP coherent. 2637 */ 2638 PMAP_INLINE void 2639 pmap_kremove(vm_offset_t va) 2640 { 2641 pt_entry_t *pte; 2642 2643 pte = vtopte(va); 2644 pte_clear(pte); 2645 } 2646 2647 /* 2648 * Used to map a range of physical addresses into kernel 2649 * virtual address space. 2650 * 2651 * The value passed in '*virt' is a suggested virtual address for 2652 * the mapping. Architectures which can support a direct-mapped 2653 * physical to virtual region can return the appropriate address 2654 * within that region, leaving '*virt' unchanged. Other 2655 * architectures should map the pages starting at '*virt' and 2656 * update '*virt' with the first usable address after the mapped 2657 * region. 2658 */ 2659 vm_offset_t 2660 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2661 { 2662 return PHYS_TO_DMAP(start); 2663 } 2664 2665 2666 /* 2667 * Add a list of wired pages to the kva 2668 * this routine is only used for temporary 2669 * kernel mappings that do not need to have 2670 * page modification or references recorded. 2671 * Note that old mappings are simply written 2672 * over. The page *must* be wired. 2673 * Note: SMP coherent. Uses a ranged shootdown IPI. 2674 */ 2675 void 2676 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2677 { 2678 pt_entry_t *endpte, oldpte, pa, *pte; 2679 vm_page_t m; 2680 int cache_bits; 2681 2682 oldpte = 0; 2683 pte = vtopte(sva); 2684 endpte = pte + count; 2685 while (pte < endpte) { 2686 m = *ma++; 2687 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 2688 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 2689 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 2690 oldpte |= *pte; 2691 pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); 2692 } 2693 pte++; 2694 } 2695 if (__predict_false((oldpte & X86_PG_V) != 0)) 2696 pmap_invalidate_range(kernel_pmap, sva, sva + count * 2697 PAGE_SIZE); 2698 } 2699 2700 /* 2701 * This routine tears out page mappings from the 2702 * kernel -- it is meant only for temporary mappings. 2703 * Note: SMP coherent. Uses a ranged shootdown IPI. 2704 */ 2705 void 2706 pmap_qremove(vm_offset_t sva, int count) 2707 { 2708 vm_offset_t va; 2709 2710 va = sva; 2711 while (count-- > 0) { 2712 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2713 pmap_kremove(va); 2714 va += PAGE_SIZE; 2715 } 2716 pmap_invalidate_range(kernel_pmap, sva, va); 2717 } 2718 2719 /*************************************************** 2720 * Page table page management routines..... 2721 ***************************************************/ 2722 /* 2723 * Schedule the specified unused page table page to be freed. Specifically, 2724 * add the page to the specified list of pages that will be released to the 2725 * physical memory manager after the TLB has been updated. 2726 */ 2727 static __inline void 2728 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2729 boolean_t set_PG_ZERO) 2730 { 2731 2732 if (set_PG_ZERO) 2733 m->flags |= PG_ZERO; 2734 else 2735 m->flags &= ~PG_ZERO; 2736 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2737 } 2738 2739 /* 2740 * Inserts the specified page table page into the specified pmap's collection 2741 * of idle page table pages. Each of a pmap's page table pages is responsible 2742 * for mapping a distinct range of virtual addresses. The pmap's collection is 2743 * ordered by this virtual address range. 
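 *
 * The collection is a radix trie rooted at pm_root and keyed by the
 * page's pindex, i.e. by pmap_pde_pindex() of the 2MB region that the
 * page table page maps.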
2744 */ 2745 static __inline int 2746 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2747 { 2748 2749 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2750 return (vm_radix_insert(&pmap->pm_root, mpte)); 2751 } 2752 2753 /* 2754 * Removes the page table page mapping the specified virtual address from the 2755 * specified pmap's collection of idle page table pages, and returns it. 2756 * Otherwise, returns NULL if there is no page table page corresponding to the 2757 * specified virtual address. 2758 */ 2759 static __inline vm_page_t 2760 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 2761 { 2762 2763 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2764 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 2765 } 2766 2767 /* 2768 * Decrements a page table page's wire count, which is used to record the 2769 * number of valid page table entries within the page. If the wire count 2770 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2771 * page table page was unmapped and FALSE otherwise. 2772 */ 2773 static inline boolean_t 2774 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2775 { 2776 2777 --m->wire_count; 2778 if (m->wire_count == 0) { 2779 _pmap_unwire_ptp(pmap, va, m, free); 2780 return (TRUE); 2781 } else 2782 return (FALSE); 2783 } 2784 2785 static void 2786 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2787 { 2788 2789 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2790 /* 2791 * unmap the page table page 2792 */ 2793 if (m->pindex >= (NUPDE + NUPDPE)) { 2794 /* PDP page */ 2795 pml4_entry_t *pml4; 2796 pml4 = pmap_pml4e(pmap, va); 2797 *pml4 = 0; 2798 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { 2799 pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; 2800 *pml4 = 0; 2801 } 2802 } else if (m->pindex >= NUPDE) { 2803 /* PD page */ 2804 pdp_entry_t *pdp; 2805 pdp = pmap_pdpe(pmap, va); 2806 *pdp = 0; 2807 } else { 2808 /* PTE page */ 2809 pd_entry_t *pd; 2810 pd = pmap_pde(pmap, va); 2811 *pd = 0; 2812 } 2813 pmap_resident_count_dec(pmap, 1); 2814 if (m->pindex < NUPDE) { 2815 /* We just released a PT, unhold the matching PD */ 2816 vm_page_t pdpg; 2817 2818 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2819 pmap_unwire_ptp(pmap, va, pdpg, free); 2820 } 2821 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2822 /* We just released a PD, unhold the matching PDP */ 2823 vm_page_t pdppg; 2824 2825 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2826 pmap_unwire_ptp(pmap, va, pdppg, free); 2827 } 2828 2829 /* 2830 * Put page on a list so that it is released after 2831 * *ALL* TLB shootdown is done 2832 */ 2833 pmap_add_delayed_free_list(m, free, TRUE); 2834 } 2835 2836 /* 2837 * After removing a page table entry, this routine is used to 2838 * conditionally free the page, and manage the hold/wire counts. 
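 *
 * Kernel mappings (va >= VM_MAXUSER_ADDRESS) are ignored here, since
 * kernel page table pages are never freed.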
2839 */ 2840 static int 2841 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2842 struct spglist *free) 2843 { 2844 vm_page_t mpte; 2845 2846 if (va >= VM_MAXUSER_ADDRESS) 2847 return (0); 2848 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2849 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2850 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2851 } 2852 2853 void 2854 pmap_pinit0(pmap_t pmap) 2855 { 2856 int i; 2857 2858 PMAP_LOCK_INIT(pmap); 2859 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2860 pmap->pm_pml4u = NULL; 2861 pmap->pm_cr3 = KPML4phys; 2862 /* hack to keep pmap_pti_pcid_invalidate() alive */ 2863 pmap->pm_ucr3 = PMAP_NO_CR3; 2864 pmap->pm_root.rt_root = 0; 2865 CPU_ZERO(&pmap->pm_active); 2866 TAILQ_INIT(&pmap->pm_pvchunk); 2867 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2868 pmap->pm_flags = pmap_flags; 2869 CPU_FOREACH(i) { 2870 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 2871 pmap->pm_pcids[i].pm_gen = 1; 2872 } 2873 pmap_activate_boot(pmap); 2874 2875 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 2876 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 2877 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 2878 UMA_ALIGN_PTR, 0); 2879 } 2880 } 2881 2882 void 2883 pmap_pinit_pml4(vm_page_t pml4pg) 2884 { 2885 pml4_entry_t *pm_pml4; 2886 int i; 2887 2888 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2889 2890 /* Wire in kernel global address entries. */ 2891 for (i = 0; i < NKPML4E; i++) { 2892 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 2893 X86_PG_V; 2894 } 2895 for (i = 0; i < ndmpdpphys; i++) { 2896 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 2897 X86_PG_V; 2898 } 2899 2900 /* install self-referential address mapping entry(s) */ 2901 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 2902 X86_PG_A | X86_PG_M; 2903 2904 /* install large map entries if configured */ 2905 for (i = 0; i < lm_ents; i++) 2906 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; 2907 } 2908 2909 static void 2910 pmap_pinit_pml4_pti(vm_page_t pml4pg) 2911 { 2912 pml4_entry_t *pm_pml4; 2913 int i; 2914 2915 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2916 for (i = 0; i < NPML4EPG; i++) 2917 pm_pml4[i] = pti_pml4[i]; 2918 } 2919 2920 /* 2921 * Initialize a preallocated and zeroed pmap structure, 2922 * such as one in a vmspace structure. 2923 */ 2924 int 2925 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2926 { 2927 vm_page_t pml4pg, pml4pgu; 2928 vm_paddr_t pml4phys; 2929 int i; 2930 2931 /* 2932 * allocate the page directory page 2933 */ 2934 pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2935 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); 2936 2937 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2938 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2939 CPU_FOREACH(i) { 2940 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2941 pmap->pm_pcids[i].pm_gen = 0; 2942 } 2943 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 2944 pmap->pm_ucr3 = PMAP_NO_CR3; 2945 pmap->pm_pml4u = NULL; 2946 2947 pmap->pm_type = pm_type; 2948 if ((pml4pg->flags & PG_ZERO) == 0) 2949 pagezero(pmap->pm_pml4); 2950 2951 /* 2952 * Do not install the host kernel mappings in the nested page 2953 * tables. These mappings are meaningless in the guest physical 2954 * address space. 2955 * Install minimal kernel mappings in PTI case. 
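 *
 * For PT_X86 pmaps the new PML4 page becomes pm_cr3 and is filled in
 * by pmap_pinit_pml4(); when PTI is enabled a second, user-visible
 * PML4 (pm_pml4u / pm_ucr3) is allocated and initialized from
 * pti_pml4.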
2956 */ 2957 if (pm_type == PT_X86) { 2958 pmap->pm_cr3 = pml4phys; 2959 pmap_pinit_pml4(pml4pg); 2960 if (pti) { 2961 pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2962 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 2963 pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( 2964 VM_PAGE_TO_PHYS(pml4pgu)); 2965 pmap_pinit_pml4_pti(pml4pgu); 2966 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); 2967 } 2968 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 2969 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 2970 pkru_free_range, pmap, M_NOWAIT); 2971 } 2972 } 2973 2974 pmap->pm_root.rt_root = 0; 2975 CPU_ZERO(&pmap->pm_active); 2976 TAILQ_INIT(&pmap->pm_pvchunk); 2977 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2978 pmap->pm_flags = flags; 2979 pmap->pm_eptgen = 0; 2980 2981 return (1); 2982 } 2983 2984 int 2985 pmap_pinit(pmap_t pmap) 2986 { 2987 2988 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2989 } 2990 2991 /* 2992 * This routine is called if the desired page table page does not exist. 2993 * 2994 * If page table page allocation fails, this routine may sleep before 2995 * returning NULL. It sleeps only if a lock pointer was given. 2996 * 2997 * Note: If a page allocation fails at page table level two or three, 2998 * one or two pages may be held during the wait, only to be released 2999 * afterwards. This conservative approach is easily argued to avoid 3000 * race conditions. 3001 */ 3002 static vm_page_t 3003 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 3004 { 3005 vm_page_t m, pdppg, pdpg; 3006 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 3007 3008 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3009 3010 PG_A = pmap_accessed_bit(pmap); 3011 PG_M = pmap_modified_bit(pmap); 3012 PG_V = pmap_valid_bit(pmap); 3013 PG_RW = pmap_rw_bit(pmap); 3014 3015 /* 3016 * Allocate a page table page. 3017 */ 3018 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 3019 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 3020 if (lockp != NULL) { 3021 RELEASE_PV_LIST_LOCK(lockp); 3022 PMAP_UNLOCK(pmap); 3023 PMAP_ASSERT_NOT_IN_DI(); 3024 vm_wait(NULL); 3025 PMAP_LOCK(pmap); 3026 } 3027 3028 /* 3029 * Indicate the need to retry. While waiting, the page table 3030 * page may have been allocated. 3031 */ 3032 return (NULL); 3033 } 3034 if ((m->flags & PG_ZERO) == 0) 3035 pmap_zero_page(m); 3036 3037 /* 3038 * Map the pagetable page into the process address space, if 3039 * it isn't already there. 3040 */ 3041 3042 if (ptepindex >= (NUPDE + NUPDPE)) { 3043 pml4_entry_t *pml4, *pml4u; 3044 vm_pindex_t pml4index; 3045 3046 /* Wire up a new PDPE page */ 3047 pml4index = ptepindex - (NUPDE + NUPDPE); 3048 pml4 = &pmap->pm_pml4[pml4index]; 3049 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 3050 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { 3051 /* 3052 * PTI: Make all user-space mappings in the 3053 * kernel-mode page table no-execute so that 3054 * we detect any programming errors that leave 3055 * the kernel-mode page table active on return 3056 * to user space. 
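 *
 * The matching entry written to the user-mode table (pm_pml4u) below
 * is left without pg_nx, since user mappings must remain executable
 * under the user page table.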
3057 */ 3058 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3059 *pml4 |= pg_nx; 3060 3061 pml4u = &pmap->pm_pml4u[pml4index]; 3062 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 3063 PG_A | PG_M; 3064 } 3065 3066 } else if (ptepindex >= NUPDE) { 3067 vm_pindex_t pml4index; 3068 vm_pindex_t pdpindex; 3069 pml4_entry_t *pml4; 3070 pdp_entry_t *pdp; 3071 3072 /* Wire up a new PDE page */ 3073 pdpindex = ptepindex - NUPDE; 3074 pml4index = pdpindex >> NPML4EPGSHIFT; 3075 3076 pml4 = &pmap->pm_pml4[pml4index]; 3077 if ((*pml4 & PG_V) == 0) { 3078 /* Have to allocate a new pdp, recurse */ 3079 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 3080 lockp) == NULL) { 3081 vm_page_unwire_noq(m); 3082 vm_page_free_zero(m); 3083 return (NULL); 3084 } 3085 } else { 3086 /* Add reference to pdp page */ 3087 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 3088 pdppg->wire_count++; 3089 } 3090 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 3091 3092 /* Now find the pdp page */ 3093 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 3094 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 3095 3096 } else { 3097 vm_pindex_t pml4index; 3098 vm_pindex_t pdpindex; 3099 pml4_entry_t *pml4; 3100 pdp_entry_t *pdp; 3101 pd_entry_t *pd; 3102 3103 /* Wire up a new PTE page */ 3104 pdpindex = ptepindex >> NPDPEPGSHIFT; 3105 pml4index = pdpindex >> NPML4EPGSHIFT; 3106 3107 /* First, find the pdp and check that its valid. */ 3108 pml4 = &pmap->pm_pml4[pml4index]; 3109 if ((*pml4 & PG_V) == 0) { 3110 /* Have to allocate a new pd, recurse */ 3111 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 3112 lockp) == NULL) { 3113 vm_page_unwire_noq(m); 3114 vm_page_free_zero(m); 3115 return (NULL); 3116 } 3117 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 3118 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 3119 } else { 3120 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 3121 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 3122 if ((*pdp & PG_V) == 0) { 3123 /* Have to allocate a new pd, recurse */ 3124 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 3125 lockp) == NULL) { 3126 vm_page_unwire_noq(m); 3127 vm_page_free_zero(m); 3128 return (NULL); 3129 } 3130 } else { 3131 /* Add reference to the pd page */ 3132 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 3133 pdpg->wire_count++; 3134 } 3135 } 3136 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 3137 3138 /* Now we know where the page directory page is */ 3139 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 3140 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 3141 } 3142 3143 pmap_resident_count_inc(pmap, 1); 3144 3145 return (m); 3146 } 3147 3148 static vm_page_t 3149 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 3150 { 3151 vm_pindex_t pdpindex, ptepindex; 3152 pdp_entry_t *pdpe, PG_V; 3153 vm_page_t pdpg; 3154 3155 PG_V = pmap_valid_bit(pmap); 3156 3157 retry: 3158 pdpe = pmap_pdpe(pmap, va); 3159 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3160 /* Add a reference to the pd page. */ 3161 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 3162 pdpg->wire_count++; 3163 } else { 3164 /* Allocate a pd page. 
*/ 3165 ptepindex = pmap_pde_pindex(va); 3166 pdpindex = ptepindex >> NPDPEPGSHIFT; 3167 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 3168 if (pdpg == NULL && lockp != NULL) 3169 goto retry; 3170 } 3171 return (pdpg); 3172 } 3173 3174 static vm_page_t 3175 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 3176 { 3177 vm_pindex_t ptepindex; 3178 pd_entry_t *pd, PG_V; 3179 vm_page_t m; 3180 3181 PG_V = pmap_valid_bit(pmap); 3182 3183 /* 3184 * Calculate pagetable page index 3185 */ 3186 ptepindex = pmap_pde_pindex(va); 3187 retry: 3188 /* 3189 * Get the page directory entry 3190 */ 3191 pd = pmap_pde(pmap, va); 3192 3193 /* 3194 * This supports switching from a 2MB page to a 3195 * normal 4K page. 3196 */ 3197 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 3198 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 3199 /* 3200 * Invalidation of the 2MB page mapping may have caused 3201 * the deallocation of the underlying PD page. 3202 */ 3203 pd = NULL; 3204 } 3205 } 3206 3207 /* 3208 * If the page table page is mapped, we just increment the 3209 * hold count, and activate it. 3210 */ 3211 if (pd != NULL && (*pd & PG_V) != 0) { 3212 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 3213 m->wire_count++; 3214 } else { 3215 /* 3216 * Here if the pte page isn't mapped, or if it has been 3217 * deallocated. 3218 */ 3219 m = _pmap_allocpte(pmap, ptepindex, lockp); 3220 if (m == NULL && lockp != NULL) 3221 goto retry; 3222 } 3223 return (m); 3224 } 3225 3226 3227 /*************************************************** 3228 * Pmap allocation/deallocation routines. 3229 ***************************************************/ 3230 3231 /* 3232 * Release any resources held by the given physical map. 3233 * Called when a pmap initialized by pmap_pinit is being released. 3234 * Should only be called if the map contains no valid mappings. 
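 *
 * Only the top-level entries wired in by pmap_pinit_pml4() (kernel
 * KVA, direct map, the recursive slot, and any large map slots) are
 * cleared here before the PML4 page(s) are freed; all lower-level
 * page table pages must already have been released, as the
 * resident-count assertion below checks.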
3235 */ 3236 void 3237 pmap_release(pmap_t pmap) 3238 { 3239 vm_page_t m; 3240 int i; 3241 3242 KASSERT(pmap->pm_stats.resident_count == 0, 3243 ("pmap_release: pmap resident count %ld != 0", 3244 pmap->pm_stats.resident_count)); 3245 KASSERT(vm_radix_is_empty(&pmap->pm_root), 3246 ("pmap_release: pmap has reserved page table page(s)")); 3247 KASSERT(CPU_EMPTY(&pmap->pm_active), 3248 ("releasing active pmap %p", pmap)); 3249 3250 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 3251 3252 for (i = 0; i < NKPML4E; i++) /* KVA */ 3253 pmap->pm_pml4[KPML4BASE + i] = 0; 3254 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 3255 pmap->pm_pml4[DMPML4I + i] = 0; 3256 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 3257 for (i = 0; i < lm_ents; i++) /* Large Map */ 3258 pmap->pm_pml4[LMSPML4I + i] = 0; 3259 3260 vm_page_unwire_noq(m); 3261 vm_page_free_zero(m); 3262 3263 if (pmap->pm_pml4u != NULL) { 3264 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); 3265 vm_page_unwire_noq(m); 3266 vm_page_free(m); 3267 } 3268 if (pmap->pm_type == PT_X86 && 3269 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 3270 rangeset_fini(&pmap->pm_pkru); 3271 } 3272 3273 static int 3274 kvm_size(SYSCTL_HANDLER_ARGS) 3275 { 3276 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 3277 3278 return sysctl_handle_long(oidp, &ksize, 0, req); 3279 } 3280 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 3281 0, 0, kvm_size, "LU", "Size of KVM"); 3282 3283 static int 3284 kvm_free(SYSCTL_HANDLER_ARGS) 3285 { 3286 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 3287 3288 return sysctl_handle_long(oidp, &kfree, 0, req); 3289 } 3290 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 3291 0, 0, kvm_free, "LU", "Amount of KVM free"); 3292 3293 /* 3294 * grow the number of kernel page table entries, if needed 3295 */ 3296 void 3297 pmap_growkernel(vm_offset_t addr) 3298 { 3299 vm_paddr_t paddr; 3300 vm_page_t nkpg; 3301 pd_entry_t *pde, newpdir; 3302 pdp_entry_t *pdpe; 3303 3304 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 3305 3306 /* 3307 * Return if "addr" is within the range of kernel page table pages 3308 * that were preallocated during pmap bootstrap. Moreover, leave 3309 * "kernel_vm_end" and the kernel page table as they were. 3310 * 3311 * The correctness of this action is based on the following 3312 * argument: vm_map_insert() allocates contiguous ranges of the 3313 * kernel virtual address space. It calls this function if a range 3314 * ends after "kernel_vm_end". If the kernel is mapped between 3315 * "kernel_vm_end" and "addr", then the range cannot begin at 3316 * "kernel_vm_end". In fact, its beginning address cannot be less 3317 * than the kernel. Thus, there is no immediate need to allocate 3318 * any new kernel page table pages between "kernel_vm_end" and 3319 * "KERNBASE". 
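 *
 * Growth below proceeds one PDE (NBPDR, 2MB of KVA) at a time,
 * allocating a fresh page table page for each new PDE and a fresh
 * page directory page whenever the governing PDP entry is still
 * empty.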
3320 */ 3321 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 3322 return; 3323 3324 addr = roundup2(addr, NBPDR); 3325 if (addr - 1 >= vm_map_max(kernel_map)) 3326 addr = vm_map_max(kernel_map); 3327 while (kernel_vm_end < addr) { 3328 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 3329 if ((*pdpe & X86_PG_V) == 0) { 3330 /* We need a new PDP entry */ 3331 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 3332 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3333 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3334 if (nkpg == NULL) 3335 panic("pmap_growkernel: no memory to grow kernel"); 3336 if ((nkpg->flags & PG_ZERO) == 0) 3337 pmap_zero_page(nkpg); 3338 paddr = VM_PAGE_TO_PHYS(nkpg); 3339 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 3340 X86_PG_A | X86_PG_M); 3341 continue; /* try again */ 3342 } 3343 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 3344 if ((*pde & X86_PG_V) != 0) { 3345 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 3346 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3347 kernel_vm_end = vm_map_max(kernel_map); 3348 break; 3349 } 3350 continue; 3351 } 3352 3353 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 3354 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3355 VM_ALLOC_ZERO); 3356 if (nkpg == NULL) 3357 panic("pmap_growkernel: no memory to grow kernel"); 3358 if ((nkpg->flags & PG_ZERO) == 0) 3359 pmap_zero_page(nkpg); 3360 paddr = VM_PAGE_TO_PHYS(nkpg); 3361 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 3362 pde_store(pde, newpdir); 3363 3364 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 3365 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3366 kernel_vm_end = vm_map_max(kernel_map); 3367 break; 3368 } 3369 } 3370 } 3371 3372 3373 /*************************************************** 3374 * page management routines. 
3375 ***************************************************/ 3376 3377 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 3378 CTASSERT(_NPCM == 3); 3379 CTASSERT(_NPCPV == 168); 3380 3381 static __inline struct pv_chunk * 3382 pv_to_chunk(pv_entry_t pv) 3383 { 3384 3385 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 3386 } 3387 3388 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 3389 3390 #define PC_FREE0 0xfffffffffffffffful 3391 #define PC_FREE1 0xfffffffffffffffful 3392 #define PC_FREE2 0x000000fffffffffful 3393 3394 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 3395 3396 #ifdef PV_STATS 3397 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 3398 3399 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 3400 "Current number of pv entry chunks"); 3401 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 3402 "Current number of pv entry chunks allocated"); 3403 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 3404 "Current number of pv entry chunks frees"); 3405 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 3406 "Number of times tried to get a chunk page but failed."); 3407 3408 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 3409 static int pv_entry_spare; 3410 3411 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 3412 "Current number of pv entry frees"); 3413 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 3414 "Current number of pv entry allocs"); 3415 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 3416 "Current number of pv entries"); 3417 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3418 "Current number of spare pv entries"); 3419 #endif 3420 3421 static void 3422 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 3423 { 3424 3425 if (pmap == NULL) 3426 return; 3427 pmap_invalidate_all(pmap); 3428 if (pmap != locked_pmap) 3429 PMAP_UNLOCK(pmap); 3430 if (start_di) 3431 pmap_delayed_invl_finished(); 3432 } 3433 3434 /* 3435 * We are in a serious low memory condition. Resort to 3436 * drastic measures to free some pages so we can allocate 3437 * another pv entry chunk. 3438 * 3439 * Returns NULL if PV entries were reclaimed from the specified pmap. 3440 * 3441 * We do not, however, unmap 2mpages because subsequent accesses will 3442 * allocate per-page pv entries until repromotion occurs, thereby 3443 * exacerbating the shortage of free pv entries. 
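 *
 * The scan walks the global pv_chunks LRU list between two private
 * marker chunks, freeing non-wired 4KB mappings chunk by chunk, until
 * either an entire chunk page (or a freed page table page) can be
 * returned to the caller or a pv entry has been freed in locked_pmap.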
3444 */ 3445 static vm_page_t 3446 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3447 { 3448 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 3449 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 3450 struct md_page *pvh; 3451 pd_entry_t *pde; 3452 pmap_t next_pmap, pmap; 3453 pt_entry_t *pte, tpte; 3454 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3455 pv_entry_t pv; 3456 vm_offset_t va; 3457 vm_page_t m, m_pc; 3458 struct spglist free; 3459 uint64_t inuse; 3460 int bit, field, freed; 3461 bool start_di; 3462 static int active_reclaims = 0; 3463 3464 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 3465 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 3466 pmap = NULL; 3467 m_pc = NULL; 3468 PG_G = PG_A = PG_M = PG_RW = 0; 3469 SLIST_INIT(&free); 3470 bzero(&pc_marker_b, sizeof(pc_marker_b)); 3471 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 3472 pc_marker = (struct pv_chunk *)&pc_marker_b; 3473 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 3474 3475 /* 3476 * A delayed invalidation block should already be active if 3477 * pmap_advise() or pmap_remove() called this function by way 3478 * of pmap_demote_pde_locked(). 3479 */ 3480 start_di = pmap_not_in_di(); 3481 3482 mtx_lock(&pv_chunks_mutex); 3483 active_reclaims++; 3484 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 3485 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 3486 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 3487 SLIST_EMPTY(&free)) { 3488 next_pmap = pc->pc_pmap; 3489 if (next_pmap == NULL) { 3490 /* 3491 * The next chunk is a marker. However, it is 3492 * not our marker, so active_reclaims must be 3493 * > 1. Consequently, the next_chunk code 3494 * will not rotate the pv_chunks list. 3495 */ 3496 goto next_chunk; 3497 } 3498 mtx_unlock(&pv_chunks_mutex); 3499 3500 /* 3501 * A pv_chunk can only be removed from the pc_lru list 3502 * when both pc_chunks_mutex is owned and the 3503 * corresponding pmap is locked. 3504 */ 3505 if (pmap != next_pmap) { 3506 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 3507 start_di); 3508 pmap = next_pmap; 3509 /* Avoid deadlock and lock recursion. */ 3510 if (pmap > locked_pmap) { 3511 RELEASE_PV_LIST_LOCK(lockp); 3512 PMAP_LOCK(pmap); 3513 if (start_di) 3514 pmap_delayed_invl_started(); 3515 mtx_lock(&pv_chunks_mutex); 3516 continue; 3517 } else if (pmap != locked_pmap) { 3518 if (PMAP_TRYLOCK(pmap)) { 3519 if (start_di) 3520 pmap_delayed_invl_started(); 3521 mtx_lock(&pv_chunks_mutex); 3522 continue; 3523 } else { 3524 pmap = NULL; /* pmap is not locked */ 3525 mtx_lock(&pv_chunks_mutex); 3526 pc = TAILQ_NEXT(pc_marker, pc_lru); 3527 if (pc == NULL || 3528 pc->pc_pmap != next_pmap) 3529 continue; 3530 goto next_chunk; 3531 } 3532 } else if (start_di) 3533 pmap_delayed_invl_started(); 3534 PG_G = pmap_global_bit(pmap); 3535 PG_A = pmap_accessed_bit(pmap); 3536 PG_M = pmap_modified_bit(pmap); 3537 PG_RW = pmap_rw_bit(pmap); 3538 } 3539 3540 /* 3541 * Destroy every non-wired, 4 KB page mapping in the chunk. 
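 * (A set bit in pc_map marks a free slot, so "~pc->pc_map[field] &
 * pc_freemask[field]" enumerates the in-use pv entries that the
 * bsfq() loop visits.)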
3542 */ 3543 freed = 0; 3544 for (field = 0; field < _NPCM; field++) { 3545 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 3546 inuse != 0; inuse &= ~(1UL << bit)) { 3547 bit = bsfq(inuse); 3548 pv = &pc->pc_pventry[field * 64 + bit]; 3549 va = pv->pv_va; 3550 pde = pmap_pde(pmap, va); 3551 if ((*pde & PG_PS) != 0) 3552 continue; 3553 pte = pmap_pde_to_pte(pde, va); 3554 if ((*pte & PG_W) != 0) 3555 continue; 3556 tpte = pte_load_clear(pte); 3557 if ((tpte & PG_G) != 0) 3558 pmap_invalidate_page(pmap, va); 3559 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3560 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3561 vm_page_dirty(m); 3562 if ((tpte & PG_A) != 0) 3563 vm_page_aflag_set(m, PGA_REFERENCED); 3564 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3565 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3566 m->md.pv_gen++; 3567 if (TAILQ_EMPTY(&m->md.pv_list) && 3568 (m->flags & PG_FICTITIOUS) == 0) { 3569 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3570 if (TAILQ_EMPTY(&pvh->pv_list)) { 3571 vm_page_aflag_clear(m, 3572 PGA_WRITEABLE); 3573 } 3574 } 3575 pmap_delayed_invl_page(m); 3576 pc->pc_map[field] |= 1UL << bit; 3577 pmap_unuse_pt(pmap, va, *pde, &free); 3578 freed++; 3579 } 3580 } 3581 if (freed == 0) { 3582 mtx_lock(&pv_chunks_mutex); 3583 goto next_chunk; 3584 } 3585 /* Every freed mapping is for a 4 KB page. */ 3586 pmap_resident_count_dec(pmap, freed); 3587 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3588 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3589 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3590 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3591 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 3592 pc->pc_map[2] == PC_FREE2) { 3593 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3594 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3595 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3596 /* Entire chunk is free; return it. */ 3597 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3598 dump_drop_page(m_pc->phys_addr); 3599 mtx_lock(&pv_chunks_mutex); 3600 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3601 break; 3602 } 3603 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3604 mtx_lock(&pv_chunks_mutex); 3605 /* One freed pv entry in locked_pmap is sufficient. */ 3606 if (pmap == locked_pmap) 3607 break; 3608 next_chunk: 3609 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 3610 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 3611 if (active_reclaims == 1 && pmap != NULL) { 3612 /* 3613 * Rotate the pv chunks list so that we do not 3614 * scan the same pv chunks that could not be 3615 * freed (because they contained a wired 3616 * and/or superpage mapping) on every 3617 * invocation of reclaim_pv_chunk(). 3618 */ 3619 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 3620 MPASS(pc->pc_pmap != NULL); 3621 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3622 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3623 } 3624 } 3625 } 3626 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 3627 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 3628 active_reclaims--; 3629 mtx_unlock(&pv_chunks_mutex); 3630 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 3631 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3632 m_pc = SLIST_FIRST(&free); 3633 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3634 /* Recycle a freed page table page. 
*/ 3635 m_pc->wire_count = 1; 3636 } 3637 vm_page_free_pages_toq(&free, true); 3638 return (m_pc); 3639 } 3640 3641 /* 3642 * free the pv_entry back to the free list 3643 */ 3644 static void 3645 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3646 { 3647 struct pv_chunk *pc; 3648 int idx, field, bit; 3649 3650 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3651 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3652 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3653 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3654 pc = pv_to_chunk(pv); 3655 idx = pv - &pc->pc_pventry[0]; 3656 field = idx / 64; 3657 bit = idx % 64; 3658 pc->pc_map[field] |= 1ul << bit; 3659 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 3660 pc->pc_map[2] != PC_FREE2) { 3661 /* 98% of the time, pc is already at the head of the list. */ 3662 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3663 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3664 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3665 } 3666 return; 3667 } 3668 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3669 free_pv_chunk(pc); 3670 } 3671 3672 static void 3673 free_pv_chunk(struct pv_chunk *pc) 3674 { 3675 vm_page_t m; 3676 3677 mtx_lock(&pv_chunks_mutex); 3678 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3679 mtx_unlock(&pv_chunks_mutex); 3680 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3681 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3682 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3683 /* entire chunk is free, return it */ 3684 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3685 dump_drop_page(m->phys_addr); 3686 vm_page_unwire(m, PQ_NONE); 3687 vm_page_free(m); 3688 } 3689 3690 /* 3691 * Returns a new PV entry, allocating a new PV chunk from the system when 3692 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3693 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3694 * returned. 3695 * 3696 * The given PV list lock may be released. 
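 *
 * Each chunk holds _NPCPV (168) entries, tracked by the three-word
 * pc_map free bitmap: PC_FREE0 and PC_FREE1 are full 64-bit masks and
 * PC_FREE2 has only its low 40 bits set (64 + 64 + 40 = 168).
 * Allocation simply claims the first set bit found by bsfq().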
3697 */ 3698 static pv_entry_t 3699 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3700 { 3701 int bit, field; 3702 pv_entry_t pv; 3703 struct pv_chunk *pc; 3704 vm_page_t m; 3705 3706 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3707 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3708 retry: 3709 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3710 if (pc != NULL) { 3711 for (field = 0; field < _NPCM; field++) { 3712 if (pc->pc_map[field]) { 3713 bit = bsfq(pc->pc_map[field]); 3714 break; 3715 } 3716 } 3717 if (field < _NPCM) { 3718 pv = &pc->pc_pventry[field * 64 + bit]; 3719 pc->pc_map[field] &= ~(1ul << bit); 3720 /* If this was the last item, move it to tail */ 3721 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 3722 pc->pc_map[2] == 0) { 3723 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3724 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3725 pc_list); 3726 } 3727 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3728 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3729 return (pv); 3730 } 3731 } 3732 /* No free items, allocate another chunk */ 3733 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3734 VM_ALLOC_WIRED); 3735 if (m == NULL) { 3736 if (lockp == NULL) { 3737 PV_STAT(pc_chunk_tryfail++); 3738 return (NULL); 3739 } 3740 m = reclaim_pv_chunk(pmap, lockp); 3741 if (m == NULL) 3742 goto retry; 3743 } 3744 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3745 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3746 dump_add_page(m->phys_addr); 3747 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3748 pc->pc_pmap = pmap; 3749 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 3750 pc->pc_map[1] = PC_FREE1; 3751 pc->pc_map[2] = PC_FREE2; 3752 mtx_lock(&pv_chunks_mutex); 3753 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3754 mtx_unlock(&pv_chunks_mutex); 3755 pv = &pc->pc_pventry[0]; 3756 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3757 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3758 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3759 return (pv); 3760 } 3761 3762 /* 3763 * Returns the number of one bits within the given PV chunk map. 3764 * 3765 * The erratas for Intel processors state that "POPCNT Instruction May 3766 * Take Longer to Execute Than Expected". It is believed that the 3767 * issue is the spurious dependency on the destination register. 3768 * Provide a hint to the register rename logic that the destination 3769 * value is overwritten, by clearing it, as suggested in the 3770 * optimization manual. It should be cheap for unaffected processors 3771 * as well. 3772 * 3773 * Reference numbers for erratas are 3774 * 4th Gen Core: HSD146 3775 * 5th Gen Core: BDM85 3776 * 6th Gen Core: SKL029 3777 */ 3778 static int 3779 popcnt_pc_map_pq(uint64_t *map) 3780 { 3781 u_long result, tmp; 3782 3783 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 3784 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 3785 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 3786 : "=&r" (result), "=&r" (tmp) 3787 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 3788 return (result); 3789 } 3790 3791 /* 3792 * Ensure that the number of spare PV entries in the specified pmap meets or 3793 * exceeds the given count, "needed". 3794 * 3795 * The given PV list lock may be released. 
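 *
 * For example, pmap_pv_demote_pde() below consumes NPTEPG - 1 (511)
 * spare entries for a single 2MB demotion, which requires free slots
 * in at least four chunks of 168 entries each.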
3796 */ 3797 static void 3798 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3799 { 3800 struct pch new_tail; 3801 struct pv_chunk *pc; 3802 vm_page_t m; 3803 int avail, free; 3804 bool reclaimed; 3805 3806 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3807 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3808 3809 /* 3810 * Newly allocated PV chunks must be stored in a private list until 3811 * the required number of PV chunks have been allocated. Otherwise, 3812 * reclaim_pv_chunk() could recycle one of these chunks. In 3813 * contrast, these chunks must be added to the pmap upon allocation. 3814 */ 3815 TAILQ_INIT(&new_tail); 3816 retry: 3817 avail = 0; 3818 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3819 #ifndef __POPCNT__ 3820 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 3821 bit_count((bitstr_t *)pc->pc_map, 0, 3822 sizeof(pc->pc_map) * NBBY, &free); 3823 else 3824 #endif 3825 free = popcnt_pc_map_pq(pc->pc_map); 3826 if (free == 0) 3827 break; 3828 avail += free; 3829 if (avail >= needed) 3830 break; 3831 } 3832 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3833 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3834 VM_ALLOC_WIRED); 3835 if (m == NULL) { 3836 m = reclaim_pv_chunk(pmap, lockp); 3837 if (m == NULL) 3838 goto retry; 3839 reclaimed = true; 3840 } 3841 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3842 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3843 dump_add_page(m->phys_addr); 3844 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3845 pc->pc_pmap = pmap; 3846 pc->pc_map[0] = PC_FREE0; 3847 pc->pc_map[1] = PC_FREE1; 3848 pc->pc_map[2] = PC_FREE2; 3849 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3850 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3851 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3852 3853 /* 3854 * The reclaim might have freed a chunk from the current pmap. 3855 * If that chunk contained available entries, we need to 3856 * re-count the number of available entries. 3857 */ 3858 if (reclaimed) 3859 goto retry; 3860 } 3861 if (!TAILQ_EMPTY(&new_tail)) { 3862 mtx_lock(&pv_chunks_mutex); 3863 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3864 mtx_unlock(&pv_chunks_mutex); 3865 } 3866 } 3867 3868 /* 3869 * First find and then remove the pv entry for the specified pmap and virtual 3870 * address from the specified pv list. Returns the pv entry if found and NULL 3871 * otherwise. This operation can be performed on pv lists for either 4KB or 3872 * 2MB page mappings. 3873 */ 3874 static __inline pv_entry_t 3875 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3876 { 3877 pv_entry_t pv; 3878 3879 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3880 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3881 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3882 pvh->pv_gen++; 3883 break; 3884 } 3885 } 3886 return (pv); 3887 } 3888 3889 /* 3890 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3891 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3892 * entries for each of the 4KB page mappings. 
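 *
 * Only NPTEPG - 1 new entries are needed: the former 2MB mapping's pv
 * entry is transferred to the first 4KB page's pv list and reused, and
 * the rest are drawn from the spares set aside by reserve_pv_entries(),
 * so this function itself cannot fail.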
3893 */ 3894 static void 3895 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3896 struct rwlock **lockp) 3897 { 3898 struct md_page *pvh; 3899 struct pv_chunk *pc; 3900 pv_entry_t pv; 3901 vm_offset_t va_last; 3902 vm_page_t m; 3903 int bit, field; 3904 3905 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3906 KASSERT((pa & PDRMASK) == 0, 3907 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3908 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3909 3910 /* 3911 * Transfer the 2mpage's pv entry for this mapping to the first 3912 * page's pv list. Once this transfer begins, the pv list lock 3913 * must not be released until the last pv entry is reinstantiated. 3914 */ 3915 pvh = pa_to_pvh(pa); 3916 va = trunc_2mpage(va); 3917 pv = pmap_pvh_remove(pvh, pmap, va); 3918 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3919 m = PHYS_TO_VM_PAGE(pa); 3920 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3921 m->md.pv_gen++; 3922 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 3923 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3924 va_last = va + NBPDR - PAGE_SIZE; 3925 for (;;) { 3926 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3927 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3928 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3929 for (field = 0; field < _NPCM; field++) { 3930 while (pc->pc_map[field]) { 3931 bit = bsfq(pc->pc_map[field]); 3932 pc->pc_map[field] &= ~(1ul << bit); 3933 pv = &pc->pc_pventry[field * 64 + bit]; 3934 va += PAGE_SIZE; 3935 pv->pv_va = va; 3936 m++; 3937 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3938 ("pmap_pv_demote_pde: page %p is not managed", m)); 3939 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3940 m->md.pv_gen++; 3941 if (va == va_last) 3942 goto out; 3943 } 3944 } 3945 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3946 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3947 } 3948 out: 3949 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3950 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3951 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3952 } 3953 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3954 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3955 } 3956 3957 #if VM_NRESERVLEVEL > 0 3958 /* 3959 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3960 * replace the many pv entries for the 4KB page mappings by a single pv entry 3961 * for the 2MB page mapping. 3962 */ 3963 static void 3964 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3965 struct rwlock **lockp) 3966 { 3967 struct md_page *pvh; 3968 pv_entry_t pv; 3969 vm_offset_t va_last; 3970 vm_page_t m; 3971 3972 KASSERT((pa & PDRMASK) == 0, 3973 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3974 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3975 3976 /* 3977 * Transfer the first page's pv entry for this mapping to the 2mpage's 3978 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3979 * a transfer avoids the possibility that get_pv_entry() calls 3980 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3981 * mappings that is being promoted. 3982 */ 3983 m = PHYS_TO_VM_PAGE(pa); 3984 va = trunc_2mpage(va); 3985 pv = pmap_pvh_remove(&m->md, pmap, va); 3986 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3987 pvh = pa_to_pvh(pa); 3988 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3989 pvh->pv_gen++; 3990 /* Free the remaining NPTEPG - 1 pv entries. 
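 * Each one is looked up by (pmap, va) on the corresponding 4KB page's pv
 * list and released back to its chunk as the loop below walks the rest of
 * the 2MB range.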
*/ 3991 va_last = va + NBPDR - PAGE_SIZE; 3992 do { 3993 m++; 3994 va += PAGE_SIZE; 3995 pmap_pvh_free(&m->md, pmap, va); 3996 } while (va < va_last); 3997 } 3998 #endif /* VM_NRESERVLEVEL > 0 */ 3999 4000 /* 4001 * First find and then destroy the pv entry for the specified pmap and virtual 4002 * address. This operation can be performed on pv lists for either 4KB or 2MB 4003 * page mappings. 4004 */ 4005 static void 4006 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 4007 { 4008 pv_entry_t pv; 4009 4010 pv = pmap_pvh_remove(pvh, pmap, va); 4011 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 4012 free_pv_entry(pmap, pv); 4013 } 4014 4015 /* 4016 * Conditionally create the PV entry for a 4KB page mapping if the required 4017 * memory can be allocated without resorting to reclamation. 4018 */ 4019 static boolean_t 4020 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 4021 struct rwlock **lockp) 4022 { 4023 pv_entry_t pv; 4024 4025 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4026 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4027 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 4028 pv->pv_va = va; 4029 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4030 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4031 m->md.pv_gen++; 4032 return (TRUE); 4033 } else 4034 return (FALSE); 4035 } 4036 4037 /* 4038 * Create the PV entry for a 2MB page mapping. Always returns true unless the 4039 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 4040 * false if the PV entry cannot be allocated without resorting to reclamation. 4041 */ 4042 static bool 4043 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 4044 struct rwlock **lockp) 4045 { 4046 struct md_page *pvh; 4047 pv_entry_t pv; 4048 vm_paddr_t pa; 4049 4050 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4051 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4052 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 4053 NULL : lockp)) == NULL) 4054 return (false); 4055 pv->pv_va = va; 4056 pa = pde & PG_PS_FRAME; 4057 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4058 pvh = pa_to_pvh(pa); 4059 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4060 pvh->pv_gen++; 4061 return (true); 4062 } 4063 4064 /* 4065 * Fills a page table page with mappings to consecutive physical pages. 4066 */ 4067 static void 4068 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 4069 { 4070 pt_entry_t *pte; 4071 4072 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 4073 *pte = newpte; 4074 newpte += PAGE_SIZE; 4075 } 4076 } 4077 4078 /* 4079 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 4080 * mapping is invalidated. 
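 *
 * Demotion replaces the single PDE with a pointer to a page table page
 * whose NPTEPG entries map the same 2MB range with the same attributes;
 * pmap_fill_ptp() above produces those entries by stepping the physical
 * address, in effect
 *
 *	pte[i] = newpte + i * PAGE_SIZE;	(i = 0 .. NPTEPG - 1)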
4081 */ 4082 static boolean_t 4083 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 4084 { 4085 struct rwlock *lock; 4086 boolean_t rv; 4087 4088 lock = NULL; 4089 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 4090 if (lock != NULL) 4091 rw_wunlock(lock); 4092 return (rv); 4093 } 4094 4095 static boolean_t 4096 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4097 struct rwlock **lockp) 4098 { 4099 pd_entry_t newpde, oldpde; 4100 pt_entry_t *firstpte, newpte; 4101 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 4102 vm_paddr_t mptepa; 4103 vm_page_t mpte; 4104 struct spglist free; 4105 vm_offset_t sva; 4106 int PG_PTE_CACHE; 4107 4108 PG_G = pmap_global_bit(pmap); 4109 PG_A = pmap_accessed_bit(pmap); 4110 PG_M = pmap_modified_bit(pmap); 4111 PG_RW = pmap_rw_bit(pmap); 4112 PG_V = pmap_valid_bit(pmap); 4113 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4114 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 4115 4116 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4117 oldpde = *pde; 4118 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 4119 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 4120 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 4121 NULL) { 4122 KASSERT((oldpde & PG_W) == 0, 4123 ("pmap_demote_pde: page table page for a wired mapping" 4124 " is missing")); 4125 4126 /* 4127 * Invalidate the 2MB page mapping and return "failure" if the 4128 * mapping was never accessed or the allocation of the new 4129 * page table page fails. If the 2MB page mapping belongs to 4130 * the direct map region of the kernel's address space, then 4131 * the page allocation request specifies the highest possible 4132 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 4133 * normal. Page table pages are preallocated for every other 4134 * part of the kernel address space, so the direct map region 4135 * is the only part of the kernel address space that must be 4136 * handled here. 4137 */ 4138 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 4139 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 4140 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 4141 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 4142 SLIST_INIT(&free); 4143 sva = trunc_2mpage(va); 4144 pmap_remove_pde(pmap, pde, sva, &free, lockp); 4145 if ((oldpde & PG_G) == 0) 4146 pmap_invalidate_pde_page(pmap, sva, oldpde); 4147 vm_page_free_pages_toq(&free, true); 4148 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 4149 " in pmap %p", va, pmap); 4150 return (FALSE); 4151 } 4152 if (va < VM_MAXUSER_ADDRESS) 4153 pmap_resident_count_inc(pmap, 1); 4154 } 4155 mptepa = VM_PAGE_TO_PHYS(mpte); 4156 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 4157 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 4158 KASSERT((oldpde & PG_A) != 0, 4159 ("pmap_demote_pde: oldpde is missing PG_A")); 4160 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 4161 ("pmap_demote_pde: oldpde is missing PG_M")); 4162 newpte = oldpde & ~PG_PS; 4163 newpte = pmap_swap_pat(pmap, newpte); 4164 4165 /* 4166 * If the page table page is new, initialize it. 4167 */ 4168 if (mpte->wire_count == 1) { 4169 mpte->wire_count = NPTEPG; 4170 pmap_fill_ptp(firstpte, newpte); 4171 } 4172 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 4173 ("pmap_demote_pde: firstpte and newpte map different physical" 4174 " addresses")); 4175 4176 /* 4177 * If the mapping has changed attributes, update the page table 4178 * entries. 
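 * (PG_PTE_PROMOTE is, roughly, the set of attribute bits other than the
 * physical page frame, so the preserved page table page is refilled only
 * when the protection or cache attributes no longer match the PDE being
 * demoted.)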
4179 */ 4180 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 4181 pmap_fill_ptp(firstpte, newpte); 4182 4183 /* 4184 * The spare PV entries must be reserved prior to demoting the 4185 * mapping, that is, prior to changing the PDE. Otherwise, the state 4186 * of the PDE and the PV lists will be inconsistent, which can result 4187 * in reclaim_pv_chunk() attempting to remove a PV entry from the 4188 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 4189 * PV entry for the 2MB page mapping that is being demoted. 4190 */ 4191 if ((oldpde & PG_MANAGED) != 0) 4192 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 4193 4194 /* 4195 * Demote the mapping. This pmap is locked. The old PDE has 4196 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 4197 * set. Thus, there is no danger of a race with another 4198 * processor changing the setting of PG_A and/or PG_M between 4199 * the read above and the store below. 4200 */ 4201 if (workaround_erratum383) 4202 pmap_update_pde(pmap, va, pde, newpde); 4203 else 4204 pde_store(pde, newpde); 4205 4206 /* 4207 * Invalidate a stale recursive mapping of the page table page. 4208 */ 4209 if (va >= VM_MAXUSER_ADDRESS) 4210 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 4211 4212 /* 4213 * Demote the PV entry. 4214 */ 4215 if ((oldpde & PG_MANAGED) != 0) 4216 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 4217 4218 atomic_add_long(&pmap_pde_demotions, 1); 4219 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 4220 " in pmap %p", va, pmap); 4221 return (TRUE); 4222 } 4223 4224 /* 4225 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 4226 */ 4227 static void 4228 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 4229 { 4230 pd_entry_t newpde; 4231 vm_paddr_t mptepa; 4232 vm_page_t mpte; 4233 4234 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4235 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4236 mpte = pmap_remove_pt_page(pmap, va); 4237 if (mpte == NULL) 4238 panic("pmap_remove_kernel_pde: Missing pt page."); 4239 4240 mptepa = VM_PAGE_TO_PHYS(mpte); 4241 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 4242 4243 /* 4244 * Initialize the page table page. 4245 */ 4246 pagezero((void *)PHYS_TO_DMAP(mptepa)); 4247 4248 /* 4249 * Demote the mapping. 4250 */ 4251 if (workaround_erratum383) 4252 pmap_update_pde(pmap, va, pde, newpde); 4253 else 4254 pde_store(pde, newpde); 4255 4256 /* 4257 * Invalidate a stale recursive mapping of the page table page. 
4258 */ 4259 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 4260 } 4261 4262 /* 4263 * pmap_remove_pde: do the things to unmap a superpage in a process 4264 */ 4265 static int 4266 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 4267 struct spglist *free, struct rwlock **lockp) 4268 { 4269 struct md_page *pvh; 4270 pd_entry_t oldpde; 4271 vm_offset_t eva, va; 4272 vm_page_t m, mpte; 4273 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 4274 4275 PG_G = pmap_global_bit(pmap); 4276 PG_A = pmap_accessed_bit(pmap); 4277 PG_M = pmap_modified_bit(pmap); 4278 PG_RW = pmap_rw_bit(pmap); 4279 4280 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4281 KASSERT((sva & PDRMASK) == 0, 4282 ("pmap_remove_pde: sva is not 2mpage aligned")); 4283 oldpde = pte_load_clear(pdq); 4284 if (oldpde & PG_W) 4285 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 4286 if ((oldpde & PG_G) != 0) 4287 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4288 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 4289 if (oldpde & PG_MANAGED) { 4290 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 4291 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 4292 pmap_pvh_free(pvh, pmap, sva); 4293 eva = sva + NBPDR; 4294 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4295 va < eva; va += PAGE_SIZE, m++) { 4296 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4297 vm_page_dirty(m); 4298 if (oldpde & PG_A) 4299 vm_page_aflag_set(m, PGA_REFERENCED); 4300 if (TAILQ_EMPTY(&m->md.pv_list) && 4301 TAILQ_EMPTY(&pvh->pv_list)) 4302 vm_page_aflag_clear(m, PGA_WRITEABLE); 4303 pmap_delayed_invl_page(m); 4304 } 4305 } 4306 if (pmap == kernel_pmap) { 4307 pmap_remove_kernel_pde(pmap, pdq, sva); 4308 } else { 4309 mpte = pmap_remove_pt_page(pmap, sva); 4310 if (mpte != NULL) { 4311 pmap_resident_count_dec(pmap, 1); 4312 KASSERT(mpte->wire_count == NPTEPG, 4313 ("pmap_remove_pde: pte page wire count error")); 4314 mpte->wire_count = 0; 4315 pmap_add_delayed_free_list(mpte, free, FALSE); 4316 } 4317 } 4318 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 4319 } 4320 4321 /* 4322 * pmap_remove_pte: do the things to unmap a page in a process 4323 */ 4324 static int 4325 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 4326 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 4327 { 4328 struct md_page *pvh; 4329 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 4330 vm_page_t m; 4331 4332 PG_A = pmap_accessed_bit(pmap); 4333 PG_M = pmap_modified_bit(pmap); 4334 PG_RW = pmap_rw_bit(pmap); 4335 4336 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4337 oldpte = pte_load_clear(ptq); 4338 if (oldpte & PG_W) 4339 pmap->pm_stats.wired_count -= 1; 4340 pmap_resident_count_dec(pmap, 1); 4341 if (oldpte & PG_MANAGED) { 4342 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 4343 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4344 vm_page_dirty(m); 4345 if (oldpte & PG_A) 4346 vm_page_aflag_set(m, PGA_REFERENCED); 4347 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4348 pmap_pvh_free(&m->md, pmap, va); 4349 if (TAILQ_EMPTY(&m->md.pv_list) && 4350 (m->flags & PG_FICTITIOUS) == 0) { 4351 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4352 if (TAILQ_EMPTY(&pvh->pv_list)) 4353 vm_page_aflag_clear(m, PGA_WRITEABLE); 4354 } 4355 pmap_delayed_invl_page(m); 4356 } 4357 return (pmap_unuse_pt(pmap, va, ptepde, free)); 4358 } 4359 4360 /* 4361 * Remove a single page from a process address space 4362 */ 4363 static void 4364 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 4365 struct spglist *free) 4366 { 4367 struct rwlock *lock; 4368 pt_entry_t *pte, PG_V; 4369 4370 
PG_V = pmap_valid_bit(pmap); 4371 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4372 if ((*pde & PG_V) == 0) 4373 return; 4374 pte = pmap_pde_to_pte(pde, va); 4375 if ((*pte & PG_V) == 0) 4376 return; 4377 lock = NULL; 4378 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 4379 if (lock != NULL) 4380 rw_wunlock(lock); 4381 pmap_invalidate_page(pmap, va); 4382 } 4383 4384 /* 4385 * Removes the specified range of addresses from the page table page. 4386 */ 4387 static bool 4388 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 4389 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 4390 { 4391 pt_entry_t PG_G, *pte; 4392 vm_offset_t va; 4393 bool anyvalid; 4394 4395 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4396 PG_G = pmap_global_bit(pmap); 4397 anyvalid = false; 4398 va = eva; 4399 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 4400 sva += PAGE_SIZE) { 4401 if (*pte == 0) { 4402 if (va != eva) { 4403 pmap_invalidate_range(pmap, va, sva); 4404 va = eva; 4405 } 4406 continue; 4407 } 4408 if ((*pte & PG_G) == 0) 4409 anyvalid = true; 4410 else if (va == eva) 4411 va = sva; 4412 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 4413 sva += PAGE_SIZE; 4414 break; 4415 } 4416 } 4417 if (va != eva) 4418 pmap_invalidate_range(pmap, va, sva); 4419 return (anyvalid); 4420 } 4421 4422 /* 4423 * Remove the given range of addresses from the specified map. 4424 * 4425 * It is assumed that the start and end are properly 4426 * rounded to the page size. 4427 */ 4428 void 4429 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4430 { 4431 struct rwlock *lock; 4432 vm_offset_t va_next; 4433 pml4_entry_t *pml4e; 4434 pdp_entry_t *pdpe; 4435 pd_entry_t ptpaddr, *pde; 4436 pt_entry_t PG_G, PG_V; 4437 struct spglist free; 4438 int anyvalid; 4439 4440 PG_G = pmap_global_bit(pmap); 4441 PG_V = pmap_valid_bit(pmap); 4442 4443 /* 4444 * Perform an unsynchronized read. This is, however, safe. 4445 */ 4446 if (pmap->pm_stats.resident_count == 0) 4447 return; 4448 4449 anyvalid = 0; 4450 SLIST_INIT(&free); 4451 4452 pmap_delayed_invl_started(); 4453 PMAP_LOCK(pmap); 4454 4455 /* 4456 * special handling of removing one page. a very 4457 * common operation and easy to short circuit some 4458 * code. 4459 */ 4460 if (sva + PAGE_SIZE == eva) { 4461 pde = pmap_pde(pmap, sva); 4462 if (pde && (*pde & PG_PS) == 0) { 4463 pmap_remove_page(pmap, sva, pde, &free); 4464 goto out; 4465 } 4466 } 4467 4468 lock = NULL; 4469 for (; sva < eva; sva = va_next) { 4470 4471 if (pmap->pm_stats.resident_count == 0) 4472 break; 4473 4474 pml4e = pmap_pml4e(pmap, sva); 4475 if ((*pml4e & PG_V) == 0) { 4476 va_next = (sva + NBPML4) & ~PML4MASK; 4477 if (va_next < sva) 4478 va_next = eva; 4479 continue; 4480 } 4481 4482 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4483 if ((*pdpe & PG_V) == 0) { 4484 va_next = (sva + NBPDP) & ~PDPMASK; 4485 if (va_next < sva) 4486 va_next = eva; 4487 continue; 4488 } 4489 4490 /* 4491 * Calculate index for next page table. 4492 */ 4493 va_next = (sva + NBPDR) & ~PDRMASK; 4494 if (va_next < sva) 4495 va_next = eva; 4496 4497 pde = pmap_pdpe_to_pde(pdpe, sva); 4498 ptpaddr = *pde; 4499 4500 /* 4501 * Weed out invalid mappings. 4502 */ 4503 if (ptpaddr == 0) 4504 continue; 4505 4506 /* 4507 * Check for large page. 4508 */ 4509 if ((ptpaddr & PG_PS) != 0) { 4510 /* 4511 * Are we removing the entire large page? If not, 4512 * demote the mapping and fall through. 
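 * The test below, "sva + NBPDR == va_next && eva >= va_next", holds only
 * when sva is 2MB-aligned and the whole [sva, sva + NBPDR) range lies
 * within the range being removed; otherwise only part of the superpage
 * is affected and it must first be broken into 4KB mappings.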
4513 */ 4514 if (sva + NBPDR == va_next && eva >= va_next) { 4515 /* 4516 * The TLB entry for a PG_G mapping is 4517 * invalidated by pmap_remove_pde(). 4518 */ 4519 if ((ptpaddr & PG_G) == 0) 4520 anyvalid = 1; 4521 pmap_remove_pde(pmap, pde, sva, &free, &lock); 4522 continue; 4523 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 4524 &lock)) { 4525 /* The large page mapping was destroyed. */ 4526 continue; 4527 } else 4528 ptpaddr = *pde; 4529 } 4530 4531 /* 4532 * Limit our scan to either the end of the va represented 4533 * by the current page table page, or to the end of the 4534 * range being removed. 4535 */ 4536 if (va_next > eva) 4537 va_next = eva; 4538 4539 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 4540 anyvalid = 1; 4541 } 4542 if (lock != NULL) 4543 rw_wunlock(lock); 4544 out: 4545 if (anyvalid) 4546 pmap_invalidate_all(pmap); 4547 pmap_pkru_on_remove(pmap, sva, eva); 4548 PMAP_UNLOCK(pmap); 4549 pmap_delayed_invl_finished(); 4550 vm_page_free_pages_toq(&free, true); 4551 } 4552 4553 /* 4554 * Routine: pmap_remove_all 4555 * Function: 4556 * Removes this physical page from 4557 * all physical maps in which it resides. 4558 * Reflects back modify bits to the pager. 4559 * 4560 * Notes: 4561 * Original versions of this routine were very 4562 * inefficient because they iteratively called 4563 * pmap_remove (slow...) 4564 */ 4565 4566 void 4567 pmap_remove_all(vm_page_t m) 4568 { 4569 struct md_page *pvh; 4570 pv_entry_t pv; 4571 pmap_t pmap; 4572 struct rwlock *lock; 4573 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 4574 pd_entry_t *pde; 4575 vm_offset_t va; 4576 struct spglist free; 4577 int pvh_gen, md_gen; 4578 4579 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4580 ("pmap_remove_all: page %p is not managed", m)); 4581 SLIST_INIT(&free); 4582 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4583 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4584 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4585 retry: 4586 rw_wlock(lock); 4587 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4588 pmap = PV_PMAP(pv); 4589 if (!PMAP_TRYLOCK(pmap)) { 4590 pvh_gen = pvh->pv_gen; 4591 rw_wunlock(lock); 4592 PMAP_LOCK(pmap); 4593 rw_wlock(lock); 4594 if (pvh_gen != pvh->pv_gen) { 4595 rw_wunlock(lock); 4596 PMAP_UNLOCK(pmap); 4597 goto retry; 4598 } 4599 } 4600 va = pv->pv_va; 4601 pde = pmap_pde(pmap, va); 4602 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 4603 PMAP_UNLOCK(pmap); 4604 } 4605 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4606 pmap = PV_PMAP(pv); 4607 if (!PMAP_TRYLOCK(pmap)) { 4608 pvh_gen = pvh->pv_gen; 4609 md_gen = m->md.pv_gen; 4610 rw_wunlock(lock); 4611 PMAP_LOCK(pmap); 4612 rw_wlock(lock); 4613 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4614 rw_wunlock(lock); 4615 PMAP_UNLOCK(pmap); 4616 goto retry; 4617 } 4618 } 4619 PG_A = pmap_accessed_bit(pmap); 4620 PG_M = pmap_modified_bit(pmap); 4621 PG_RW = pmap_rw_bit(pmap); 4622 pmap_resident_count_dec(pmap, 1); 4623 pde = pmap_pde(pmap, pv->pv_va); 4624 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 4625 " a 2mpage in page %p's pv list", m)); 4626 pte = pmap_pde_to_pte(pde, pv->pv_va); 4627 tpte = pte_load_clear(pte); 4628 if (tpte & PG_W) 4629 pmap->pm_stats.wired_count--; 4630 if (tpte & PG_A) 4631 vm_page_aflag_set(m, PGA_REFERENCED); 4632 4633 /* 4634 * Update the vm_page_t clean and reference bits. 
4635 */ 4636 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4637 vm_page_dirty(m); 4638 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 4639 pmap_invalidate_page(pmap, pv->pv_va); 4640 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4641 m->md.pv_gen++; 4642 free_pv_entry(pmap, pv); 4643 PMAP_UNLOCK(pmap); 4644 } 4645 vm_page_aflag_clear(m, PGA_WRITEABLE); 4646 rw_wunlock(lock); 4647 pmap_delayed_invl_wait(m); 4648 vm_page_free_pages_toq(&free, true); 4649 } 4650 4651 /* 4652 * pmap_protect_pde: do the things to protect a 2mpage in a process 4653 */ 4654 static boolean_t 4655 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 4656 { 4657 pd_entry_t newpde, oldpde; 4658 vm_offset_t eva, va; 4659 vm_page_t m; 4660 boolean_t anychanged; 4661 pt_entry_t PG_G, PG_M, PG_RW; 4662 4663 PG_G = pmap_global_bit(pmap); 4664 PG_M = pmap_modified_bit(pmap); 4665 PG_RW = pmap_rw_bit(pmap); 4666 4667 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4668 KASSERT((sva & PDRMASK) == 0, 4669 ("pmap_protect_pde: sva is not 2mpage aligned")); 4670 anychanged = FALSE; 4671 retry: 4672 oldpde = newpde = *pde; 4673 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4674 (PG_MANAGED | PG_M | PG_RW)) { 4675 eva = sva + NBPDR; 4676 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4677 va < eva; va += PAGE_SIZE, m++) 4678 vm_page_dirty(m); 4679 } 4680 if ((prot & VM_PROT_WRITE) == 0) 4681 newpde &= ~(PG_RW | PG_M); 4682 if ((prot & VM_PROT_EXECUTE) == 0) 4683 newpde |= pg_nx; 4684 if (newpde != oldpde) { 4685 /* 4686 * As an optimization to future operations on this PDE, clear 4687 * PG_PROMOTED. The impending invalidation will remove any 4688 * lingering 4KB page mappings from the TLB. 4689 */ 4690 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 4691 goto retry; 4692 if ((oldpde & PG_G) != 0) 4693 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4694 else 4695 anychanged = TRUE; 4696 } 4697 return (anychanged); 4698 } 4699 4700 /* 4701 * Set the physical protection on the 4702 * specified range of this map as requested. 4703 */ 4704 void 4705 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4706 { 4707 vm_offset_t va_next; 4708 pml4_entry_t *pml4e; 4709 pdp_entry_t *pdpe; 4710 pd_entry_t ptpaddr, *pde; 4711 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 4712 boolean_t anychanged; 4713 4714 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4715 if (prot == VM_PROT_NONE) { 4716 pmap_remove(pmap, sva, eva); 4717 return; 4718 } 4719 4720 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4721 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4722 return; 4723 4724 PG_G = pmap_global_bit(pmap); 4725 PG_M = pmap_modified_bit(pmap); 4726 PG_V = pmap_valid_bit(pmap); 4727 PG_RW = pmap_rw_bit(pmap); 4728 anychanged = FALSE; 4729 4730 /* 4731 * Although this function delays and batches the invalidation 4732 * of stale TLB entries, it does not need to call 4733 * pmap_delayed_invl_started() and 4734 * pmap_delayed_invl_finished(), because it does not 4735 * ordinarily destroy mappings. Stale TLB entries from 4736 * protection-only changes need only be invalidated before the 4737 * pmap lock is released, because protection-only changes do 4738 * not destroy PV entries. Even operations that iterate over 4739 * a physical page's PV list of mappings, like 4740 * pmap_remove_write(), acquire the pmap lock for each 4741 * mapping. Consequently, for protection-only changes, the 4742 * pmap lock suffices to synchronize both page table and TLB 4743 * updates. 
4744 * 4745 * This function only destroys a mapping if pmap_demote_pde() 4746 * fails. In that case, stale TLB entries are immediately 4747 * invalidated. 4748 */ 4749 4750 PMAP_LOCK(pmap); 4751 for (; sva < eva; sva = va_next) { 4752 4753 pml4e = pmap_pml4e(pmap, sva); 4754 if ((*pml4e & PG_V) == 0) { 4755 va_next = (sva + NBPML4) & ~PML4MASK; 4756 if (va_next < sva) 4757 va_next = eva; 4758 continue; 4759 } 4760 4761 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4762 if ((*pdpe & PG_V) == 0) { 4763 va_next = (sva + NBPDP) & ~PDPMASK; 4764 if (va_next < sva) 4765 va_next = eva; 4766 continue; 4767 } 4768 4769 va_next = (sva + NBPDR) & ~PDRMASK; 4770 if (va_next < sva) 4771 va_next = eva; 4772 4773 pde = pmap_pdpe_to_pde(pdpe, sva); 4774 ptpaddr = *pde; 4775 4776 /* 4777 * Weed out invalid mappings. 4778 */ 4779 if (ptpaddr == 0) 4780 continue; 4781 4782 /* 4783 * Check for large page. 4784 */ 4785 if ((ptpaddr & PG_PS) != 0) { 4786 /* 4787 * Are we protecting the entire large page? If not, 4788 * demote the mapping and fall through. 4789 */ 4790 if (sva + NBPDR == va_next && eva >= va_next) { 4791 /* 4792 * The TLB entry for a PG_G mapping is 4793 * invalidated by pmap_protect_pde(). 4794 */ 4795 if (pmap_protect_pde(pmap, pde, sva, prot)) 4796 anychanged = TRUE; 4797 continue; 4798 } else if (!pmap_demote_pde(pmap, pde, sva)) { 4799 /* 4800 * The large page mapping was destroyed. 4801 */ 4802 continue; 4803 } 4804 } 4805 4806 if (va_next > eva) 4807 va_next = eva; 4808 4809 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4810 sva += PAGE_SIZE) { 4811 pt_entry_t obits, pbits; 4812 vm_page_t m; 4813 4814 retry: 4815 obits = pbits = *pte; 4816 if ((pbits & PG_V) == 0) 4817 continue; 4818 4819 if ((prot & VM_PROT_WRITE) == 0) { 4820 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4821 (PG_MANAGED | PG_M | PG_RW)) { 4822 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4823 vm_page_dirty(m); 4824 } 4825 pbits &= ~(PG_RW | PG_M); 4826 } 4827 if ((prot & VM_PROT_EXECUTE) == 0) 4828 pbits |= pg_nx; 4829 4830 if (pbits != obits) { 4831 if (!atomic_cmpset_long(pte, obits, pbits)) 4832 goto retry; 4833 if (obits & PG_G) 4834 pmap_invalidate_page(pmap, sva); 4835 else 4836 anychanged = TRUE; 4837 } 4838 } 4839 } 4840 if (anychanged) 4841 pmap_invalidate_all(pmap); 4842 PMAP_UNLOCK(pmap); 4843 } 4844 4845 #if VM_NRESERVLEVEL > 0 4846 /* 4847 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4848 * single page table page (PTP) to a single 2MB page mapping. For promotion 4849 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4850 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4851 * identical characteristics. 4852 */ 4853 static void 4854 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4855 struct rwlock **lockp) 4856 { 4857 pd_entry_t newpde; 4858 pt_entry_t *firstpte, oldpte, pa, *pte; 4859 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; 4860 vm_page_t mpte; 4861 int PG_PTE_CACHE; 4862 4863 PG_A = pmap_accessed_bit(pmap); 4864 PG_G = pmap_global_bit(pmap); 4865 PG_M = pmap_modified_bit(pmap); 4866 PG_V = pmap_valid_bit(pmap); 4867 PG_RW = pmap_rw_bit(pmap); 4868 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 4869 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4870 4871 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4872 4873 /* 4874 * Examine the first PTE in the specified PTP. Abort if this PTE is 4875 * either invalid, unused, or does not map the first 4KB physical page 4876 * within a 2MB page. 
4877 */ 4878 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4879 setpde: 4880 newpde = *firstpte; 4881 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4882 atomic_add_long(&pmap_pde_p_failures, 1); 4883 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4884 " in pmap %p", va, pmap); 4885 return; 4886 } 4887 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4888 /* 4889 * When PG_M is already clear, PG_RW can be cleared without 4890 * a TLB invalidation. 4891 */ 4892 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4893 goto setpde; 4894 newpde &= ~PG_RW; 4895 } 4896 4897 /* 4898 * Examine each of the other PTEs in the specified PTP. Abort if this 4899 * PTE maps an unexpected 4KB physical page or does not have identical 4900 * characteristics to the first PTE. 4901 */ 4902 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4903 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4904 setpte: 4905 oldpte = *pte; 4906 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4907 atomic_add_long(&pmap_pde_p_failures, 1); 4908 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4909 " in pmap %p", va, pmap); 4910 return; 4911 } 4912 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4913 /* 4914 * When PG_M is already clear, PG_RW can be cleared 4915 * without a TLB invalidation. 4916 */ 4917 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4918 goto setpte; 4919 oldpte &= ~PG_RW; 4920 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4921 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 4922 (va & ~PDRMASK), pmap); 4923 } 4924 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4925 atomic_add_long(&pmap_pde_p_failures, 1); 4926 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4927 " in pmap %p", va, pmap); 4928 return; 4929 } 4930 pa -= PAGE_SIZE; 4931 } 4932 4933 /* 4934 * Save the page table page in its current state until the PDE 4935 * mapping the superpage is demoted by pmap_demote_pde() or 4936 * destroyed by pmap_remove_pde(). 4937 */ 4938 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4939 KASSERT(mpte >= vm_page_array && 4940 mpte < &vm_page_array[vm_page_array_size], 4941 ("pmap_promote_pde: page table page is out of range")); 4942 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4943 ("pmap_promote_pde: page table page's pindex is wrong")); 4944 if (pmap_insert_pt_page(pmap, mpte)) { 4945 atomic_add_long(&pmap_pde_p_failures, 1); 4946 CTR2(KTR_PMAP, 4947 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4948 pmap); 4949 return; 4950 } 4951 4952 /* 4953 * Promote the pv entries. 4954 */ 4955 if ((newpde & PG_MANAGED) != 0) 4956 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4957 4958 /* 4959 * Propagate the PAT index to its proper position. 4960 */ 4961 newpde = pmap_swap_pat(pmap, newpde); 4962 4963 /* 4964 * Map the superpage. 4965 */ 4966 if (workaround_erratum383) 4967 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4968 else 4969 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 4970 4971 atomic_add_long(&pmap_pde_promotions, 1); 4972 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4973 " in pmap %p", va, pmap); 4974 } 4975 #endif /* VM_NRESERVLEVEL > 0 */ 4976 4977 /* 4978 * Insert the given physical page (p) at 4979 * the specified virtual address (v) in the 4980 * target physical map with the protection requested. 4981 * 4982 * If specified, the page will be wired down, meaning 4983 * that the related pte can not be reclaimed. 
4984 * 4985 * NB: This is the only routine which MAY NOT lazy-evaluate 4986 * or lose information. That is, this routine must actually 4987 * insert this page into the given map NOW. 4988 * 4989 * When destroying both a page table and PV entry, this function 4990 * performs the TLB invalidation before releasing the PV list 4991 * lock, so we do not need pmap_delayed_invl_page() calls here. 4992 */ 4993 int 4994 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4995 u_int flags, int8_t psind) 4996 { 4997 struct rwlock *lock; 4998 pd_entry_t *pde; 4999 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 5000 pt_entry_t newpte, origpte; 5001 pv_entry_t pv; 5002 vm_paddr_t opa, pa; 5003 vm_page_t mpte, om; 5004 int rv; 5005 boolean_t nosleep; 5006 5007 PG_A = pmap_accessed_bit(pmap); 5008 PG_G = pmap_global_bit(pmap); 5009 PG_M = pmap_modified_bit(pmap); 5010 PG_V = pmap_valid_bit(pmap); 5011 PG_RW = pmap_rw_bit(pmap); 5012 5013 va = trunc_page(va); 5014 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 5015 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 5016 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 5017 va)); 5018 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 5019 va >= kmi.clean_eva, 5020 ("pmap_enter: managed mapping within the clean submap")); 5021 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 5022 VM_OBJECT_ASSERT_LOCKED(m->object); 5023 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 5024 ("pmap_enter: flags %u has reserved bits set", flags)); 5025 pa = VM_PAGE_TO_PHYS(m); 5026 newpte = (pt_entry_t)(pa | PG_A | PG_V); 5027 if ((flags & VM_PROT_WRITE) != 0) 5028 newpte |= PG_M; 5029 if ((prot & VM_PROT_WRITE) != 0) 5030 newpte |= PG_RW; 5031 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 5032 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 5033 if ((prot & VM_PROT_EXECUTE) == 0) 5034 newpte |= pg_nx; 5035 if ((flags & PMAP_ENTER_WIRED) != 0) 5036 newpte |= PG_W; 5037 if (va < VM_MAXUSER_ADDRESS) 5038 newpte |= PG_U; 5039 if (pmap == kernel_pmap) 5040 newpte |= PG_G; 5041 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 5042 5043 /* 5044 * Set modified bit gratuitously for writeable mappings if 5045 * the page is unmanaged. We do not want to take a fault 5046 * to do the dirty bit accounting for these mappings. 5047 */ 5048 if ((m->oflags & VPO_UNMANAGED) != 0) { 5049 if ((newpte & PG_RW) != 0) 5050 newpte |= PG_M; 5051 } else 5052 newpte |= PG_MANAGED; 5053 5054 lock = NULL; 5055 PMAP_LOCK(pmap); 5056 if (psind == 1) { 5057 /* Assert the required virtual and physical alignment. */ 5058 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 5059 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 5060 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 5061 goto out; 5062 } 5063 mpte = NULL; 5064 5065 /* 5066 * In the case that a page table page is not 5067 * resident, we are creating it here. 5068 */ 5069 retry: 5070 pde = pmap_pde(pmap, va); 5071 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 5072 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 5073 pte = pmap_pde_to_pte(pde, va); 5074 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 5075 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 5076 mpte->wire_count++; 5077 } 5078 } else if (va < VM_MAXUSER_ADDRESS) { 5079 /* 5080 * Here if the pte page isn't mapped, or if it has been 5081 * deallocated. 
5082 */ 5083 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 5084 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 5085 nosleep ? NULL : &lock); 5086 if (mpte == NULL && nosleep) { 5087 rv = KERN_RESOURCE_SHORTAGE; 5088 goto out; 5089 } 5090 goto retry; 5091 } else 5092 panic("pmap_enter: invalid page directory va=%#lx", va); 5093 5094 origpte = *pte; 5095 pv = NULL; 5096 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 5097 newpte |= pmap_pkru_get(pmap, va); 5098 5099 /* 5100 * Is the specified virtual address already mapped? 5101 */ 5102 if ((origpte & PG_V) != 0) { 5103 /* 5104 * Wiring change, just update stats. We don't worry about 5105 * wiring PT pages as they remain resident as long as there 5106 * are valid mappings in them. Hence, if a user page is wired, 5107 * the PT page will be also. 5108 */ 5109 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 5110 pmap->pm_stats.wired_count++; 5111 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 5112 pmap->pm_stats.wired_count--; 5113 5114 /* 5115 * Remove the extra PT page reference. 5116 */ 5117 if (mpte != NULL) { 5118 mpte->wire_count--; 5119 KASSERT(mpte->wire_count > 0, 5120 ("pmap_enter: missing reference to page table page," 5121 " va: 0x%lx", va)); 5122 } 5123 5124 /* 5125 * Has the physical page changed? 5126 */ 5127 opa = origpte & PG_FRAME; 5128 if (opa == pa) { 5129 /* 5130 * No, might be a protection or wiring change. 5131 */ 5132 if ((origpte & PG_MANAGED) != 0 && 5133 (newpte & PG_RW) != 0) 5134 vm_page_aflag_set(m, PGA_WRITEABLE); 5135 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 5136 goto unchanged; 5137 goto validate; 5138 } 5139 5140 /* 5141 * The physical page has changed. Temporarily invalidate 5142 * the mapping. This ensures that all threads sharing the 5143 * pmap keep a consistent view of the mapping, which is 5144 * necessary for the correct handling of COW faults. It 5145 * also permits reuse of the old mapping's PV entry, 5146 * avoiding an allocation. 5147 * 5148 * For consistency, handle unmanaged mappings the same way. 5149 */ 5150 origpte = pte_load_clear(pte); 5151 KASSERT((origpte & PG_FRAME) == opa, 5152 ("pmap_enter: unexpected pa update for %#lx", va)); 5153 if ((origpte & PG_MANAGED) != 0) { 5154 om = PHYS_TO_VM_PAGE(opa); 5155 5156 /* 5157 * The pmap lock is sufficient to synchronize with 5158 * concurrent calls to pmap_page_test_mappings() and 5159 * pmap_ts_referenced(). 5160 */ 5161 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5162 vm_page_dirty(om); 5163 if ((origpte & PG_A) != 0) 5164 vm_page_aflag_set(om, PGA_REFERENCED); 5165 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 5166 pv = pmap_pvh_remove(&om->md, pmap, va); 5167 KASSERT(pv != NULL, 5168 ("pmap_enter: no PV entry for %#lx", va)); 5169 if ((newpte & PG_MANAGED) == 0) 5170 free_pv_entry(pmap, pv); 5171 if ((om->aflags & PGA_WRITEABLE) != 0 && 5172 TAILQ_EMPTY(&om->md.pv_list) && 5173 ((om->flags & PG_FICTITIOUS) != 0 || 5174 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 5175 vm_page_aflag_clear(om, PGA_WRITEABLE); 5176 } 5177 if ((origpte & PG_A) != 0) 5178 pmap_invalidate_page(pmap, va); 5179 origpte = 0; 5180 } else { 5181 /* 5182 * Increment the counters. 5183 */ 5184 if ((newpte & PG_W) != 0) 5185 pmap->pm_stats.wired_count++; 5186 pmap_resident_count_inc(pmap, 1); 5187 } 5188 5189 /* 5190 * Enter on the PV list if part of our managed memory. 
5191 */ 5192 if ((newpte & PG_MANAGED) != 0) { 5193 if (pv == NULL) { 5194 pv = get_pv_entry(pmap, &lock); 5195 pv->pv_va = va; 5196 } 5197 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 5198 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5199 m->md.pv_gen++; 5200 if ((newpte & PG_RW) != 0) 5201 vm_page_aflag_set(m, PGA_WRITEABLE); 5202 } 5203 5204 /* 5205 * Update the PTE. 5206 */ 5207 if ((origpte & PG_V) != 0) { 5208 validate: 5209 origpte = pte_load_store(pte, newpte); 5210 KASSERT((origpte & PG_FRAME) == pa, 5211 ("pmap_enter: unexpected pa update for %#lx", va)); 5212 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 5213 (PG_M | PG_RW)) { 5214 if ((origpte & PG_MANAGED) != 0) 5215 vm_page_dirty(m); 5216 5217 /* 5218 * Although the PTE may still have PG_RW set, TLB 5219 * invalidation may nonetheless be required because 5220 * the PTE no longer has PG_M set. 5221 */ 5222 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 5223 /* 5224 * This PTE change does not require TLB invalidation. 5225 */ 5226 goto unchanged; 5227 } 5228 if ((origpte & PG_A) != 0) 5229 pmap_invalidate_page(pmap, va); 5230 } else 5231 pte_store(pte, newpte); 5232 5233 unchanged: 5234 5235 #if VM_NRESERVLEVEL > 0 5236 /* 5237 * If both the page table page and the reservation are fully 5238 * populated, then attempt promotion. 5239 */ 5240 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 5241 pmap_ps_enabled(pmap) && 5242 (m->flags & PG_FICTITIOUS) == 0 && 5243 vm_reserv_level_iffullpop(m) == 0) 5244 pmap_promote_pde(pmap, pde, va, &lock); 5245 #endif 5246 5247 rv = KERN_SUCCESS; 5248 out: 5249 if (lock != NULL) 5250 rw_wunlock(lock); 5251 PMAP_UNLOCK(pmap); 5252 return (rv); 5253 } 5254 5255 /* 5256 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 5257 * if successful. Returns false if (1) a page table page cannot be allocated 5258 * without sleeping, (2) a mapping already exists at the specified virtual 5259 * address, or (3) a PV entry cannot be allocated without reclaiming another 5260 * PV entry. 5261 */ 5262 static bool 5263 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5264 struct rwlock **lockp) 5265 { 5266 pd_entry_t newpde; 5267 pt_entry_t PG_V; 5268 5269 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5270 PG_V = pmap_valid_bit(pmap); 5271 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 5272 PG_PS | PG_V; 5273 if ((m->oflags & VPO_UNMANAGED) == 0) 5274 newpde |= PG_MANAGED; 5275 if ((prot & VM_PROT_EXECUTE) == 0) 5276 newpde |= pg_nx; 5277 if (va < VM_MAXUSER_ADDRESS) 5278 newpde |= PG_U; 5279 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 5280 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 5281 KERN_SUCCESS); 5282 } 5283 5284 /* 5285 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 5286 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 5287 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 5288 * a mapping already exists at the specified virtual address. Returns 5289 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 5290 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 5291 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 5292 * 5293 * The parameter "m" is only used when creating a managed, writeable mapping. 
5294 */ 5295 static int 5296 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 5297 vm_page_t m, struct rwlock **lockp) 5298 { 5299 struct spglist free; 5300 pd_entry_t oldpde, *pde; 5301 pt_entry_t PG_G, PG_RW, PG_V; 5302 vm_page_t mt, pdpg; 5303 5304 PG_G = pmap_global_bit(pmap); 5305 PG_RW = pmap_rw_bit(pmap); 5306 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 5307 ("pmap_enter_pde: newpde is missing PG_M")); 5308 PG_V = pmap_valid_bit(pmap); 5309 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5310 5311 if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 5312 NULL : lockp)) == NULL) { 5313 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5314 " in pmap %p", va, pmap); 5315 return (KERN_RESOURCE_SHORTAGE); 5316 } 5317 5318 /* 5319 * If pkru is not same for the whole pde range, return failure 5320 * and let vm_fault() cope. Check after pde allocation, since 5321 * it could sleep. 5322 */ 5323 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 5324 SLIST_INIT(&free); 5325 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 5326 pmap_invalidate_page(pmap, va); 5327 vm_page_free_pages_toq(&free, true); 5328 } 5329 return (KERN_FAILURE); 5330 } 5331 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 5332 newpde &= ~X86_PG_PKU_MASK; 5333 newpde |= pmap_pkru_get(pmap, va); 5334 } 5335 5336 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 5337 pde = &pde[pmap_pde_index(va)]; 5338 oldpde = *pde; 5339 if ((oldpde & PG_V) != 0) { 5340 KASSERT(pdpg->wire_count > 1, 5341 ("pmap_enter_pde: pdpg's wire count is too low")); 5342 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5343 pdpg->wire_count--; 5344 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5345 " in pmap %p", va, pmap); 5346 return (KERN_FAILURE); 5347 } 5348 /* Break the existing mapping(s). */ 5349 SLIST_INIT(&free); 5350 if ((oldpde & PG_PS) != 0) { 5351 /* 5352 * The reference to the PD page that was acquired by 5353 * pmap_allocpde() ensures that it won't be freed. 5354 * However, if the PDE resulted from a promotion, then 5355 * a reserved PT page could be freed. 5356 */ 5357 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 5358 if ((oldpde & PG_G) == 0) 5359 pmap_invalidate_pde_page(pmap, va, oldpde); 5360 } else { 5361 pmap_delayed_invl_started(); 5362 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 5363 lockp)) 5364 pmap_invalidate_all(pmap); 5365 pmap_delayed_invl_finished(); 5366 } 5367 vm_page_free_pages_toq(&free, true); 5368 if (va >= VM_MAXUSER_ADDRESS) { 5369 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 5370 if (pmap_insert_pt_page(pmap, mt)) { 5371 /* 5372 * XXX Currently, this can't happen because 5373 * we do not perform pmap_enter(psind == 1) 5374 * on the kernel pmap. 5375 */ 5376 panic("pmap_enter_pde: trie insert failed"); 5377 } 5378 } else 5379 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 5380 pde)); 5381 } 5382 if ((newpde & PG_MANAGED) != 0) { 5383 /* 5384 * Abort this mapping if its PV entry could not be created. 5385 */ 5386 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 5387 SLIST_INIT(&free); 5388 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 5389 /* 5390 * Although "va" is not mapped, paging- 5391 * structure caches could nonetheless have 5392 * entries that refer to the freed page table 5393 * pages. Invalidate those entries. 
5394 */ 5395 pmap_invalidate_page(pmap, va); 5396 vm_page_free_pages_toq(&free, true); 5397 } 5398 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5399 " in pmap %p", va, pmap); 5400 return (KERN_RESOURCE_SHORTAGE); 5401 } 5402 if ((newpde & PG_RW) != 0) { 5403 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5404 vm_page_aflag_set(mt, PGA_WRITEABLE); 5405 } 5406 } 5407 5408 /* 5409 * Increment counters. 5410 */ 5411 if ((newpde & PG_W) != 0) 5412 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 5413 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 5414 5415 /* 5416 * Map the superpage. (This is not a promoted mapping; there will not 5417 * be any lingering 4KB page mappings in the TLB.) 5418 */ 5419 pde_store(pde, newpde); 5420 5421 atomic_add_long(&pmap_pde_mappings, 1); 5422 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 5423 " in pmap %p", va, pmap); 5424 return (KERN_SUCCESS); 5425 } 5426 5427 /* 5428 * Maps a sequence of resident pages belonging to the same object. 5429 * The sequence begins with the given page m_start. This page is 5430 * mapped at the given virtual address start. Each subsequent page is 5431 * mapped at a virtual address that is offset from start by the same 5432 * amount as the page is offset from m_start within the object. The 5433 * last page in the sequence is the page with the largest offset from 5434 * m_start that can be mapped at a virtual address less than the given 5435 * virtual address end. Not every virtual page between start and end 5436 * is mapped; only those for which a resident page exists with the 5437 * corresponding offset from m_start are mapped. 5438 */ 5439 void 5440 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 5441 vm_page_t m_start, vm_prot_t prot) 5442 { 5443 struct rwlock *lock; 5444 vm_offset_t va; 5445 vm_page_t m, mpte; 5446 vm_pindex_t diff, psize; 5447 5448 VM_OBJECT_ASSERT_LOCKED(m_start->object); 5449 5450 psize = atop(end - start); 5451 mpte = NULL; 5452 m = m_start; 5453 lock = NULL; 5454 PMAP_LOCK(pmap); 5455 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 5456 va = start + ptoa(diff); 5457 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 5458 m->psind == 1 && pmap_ps_enabled(pmap) && 5459 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 5460 m = &m[NBPDR / PAGE_SIZE - 1]; 5461 else 5462 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 5463 mpte, &lock); 5464 m = TAILQ_NEXT(m, listq); 5465 } 5466 if (lock != NULL) 5467 rw_wunlock(lock); 5468 PMAP_UNLOCK(pmap); 5469 } 5470 5471 /* 5472 * this code makes some *MAJOR* assumptions: 5473 * 1. Current pmap & pmap exists. 5474 * 2. Not wired. 5475 * 3. Read access. 5476 * 4. No page table pages. 5477 * but is *MUCH* faster than pmap_enter... 
5478 */ 5479 5480 void 5481 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 5482 { 5483 struct rwlock *lock; 5484 5485 lock = NULL; 5486 PMAP_LOCK(pmap); 5487 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 5488 if (lock != NULL) 5489 rw_wunlock(lock); 5490 PMAP_UNLOCK(pmap); 5491 } 5492 5493 static vm_page_t 5494 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 5495 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 5496 { 5497 struct spglist free; 5498 pt_entry_t newpte, *pte, PG_V; 5499 5500 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 5501 (m->oflags & VPO_UNMANAGED) != 0, 5502 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 5503 PG_V = pmap_valid_bit(pmap); 5504 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5505 5506 /* 5507 * In the case that a page table page is not 5508 * resident, we are creating it here. 5509 */ 5510 if (va < VM_MAXUSER_ADDRESS) { 5511 vm_pindex_t ptepindex; 5512 pd_entry_t *ptepa; 5513 5514 /* 5515 * Calculate pagetable page index 5516 */ 5517 ptepindex = pmap_pde_pindex(va); 5518 if (mpte && (mpte->pindex == ptepindex)) { 5519 mpte->wire_count++; 5520 } else { 5521 /* 5522 * Get the page directory entry 5523 */ 5524 ptepa = pmap_pde(pmap, va); 5525 5526 /* 5527 * If the page table page is mapped, we just increment 5528 * the hold count, and activate it. Otherwise, we 5529 * attempt to allocate a page table page. If this 5530 * attempt fails, we don't retry. Instead, we give up. 5531 */ 5532 if (ptepa && (*ptepa & PG_V) != 0) { 5533 if (*ptepa & PG_PS) 5534 return (NULL); 5535 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 5536 mpte->wire_count++; 5537 } else { 5538 /* 5539 * Pass NULL instead of the PV list lock 5540 * pointer, because we don't intend to sleep. 5541 */ 5542 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 5543 if (mpte == NULL) 5544 return (mpte); 5545 } 5546 } 5547 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 5548 pte = &pte[pmap_pte_index(va)]; 5549 } else { 5550 mpte = NULL; 5551 pte = vtopte(va); 5552 } 5553 if (*pte) { 5554 if (mpte != NULL) { 5555 mpte->wire_count--; 5556 mpte = NULL; 5557 } 5558 return (mpte); 5559 } 5560 5561 /* 5562 * Enter on the PV list if part of our managed memory. 5563 */ 5564 if ((m->oflags & VPO_UNMANAGED) == 0 && 5565 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 5566 if (mpte != NULL) { 5567 SLIST_INIT(&free); 5568 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 5569 /* 5570 * Although "va" is not mapped, paging- 5571 * structure caches could nonetheless have 5572 * entries that refer to the freed page table 5573 * pages. Invalidate those entries. 5574 */ 5575 pmap_invalidate_page(pmap, va); 5576 vm_page_free_pages_toq(&free, true); 5577 } 5578 mpte = NULL; 5579 } 5580 return (mpte); 5581 } 5582 5583 /* 5584 * Increment counters 5585 */ 5586 pmap_resident_count_inc(pmap, 1); 5587 5588 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 5589 pmap_cache_bits(pmap, m->md.pat_mode, 0); 5590 if ((m->oflags & VPO_UNMANAGED) == 0) 5591 newpte |= PG_MANAGED; 5592 if ((prot & VM_PROT_EXECUTE) == 0) 5593 newpte |= pg_nx; 5594 if (va < VM_MAXUSER_ADDRESS) 5595 newpte |= PG_U | pmap_pkru_get(pmap, va); 5596 pte_store(pte, newpte); 5597 return (mpte); 5598 } 5599 5600 /* 5601 * Make a temporary mapping for a physical address. This is only intended 5602 * to be used for panic dumps. 
5603 */ 5604 void * 5605 pmap_kenter_temporary(vm_paddr_t pa, int i) 5606 { 5607 vm_offset_t va; 5608 5609 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 5610 pmap_kenter(va, pa); 5611 invlpg(va); 5612 return ((void *)crashdumpmap); 5613 } 5614 5615 /* 5616 * This code maps large physical mmap regions into the 5617 * processor address space. Note that some shortcuts 5618 * are taken, but the code works. 5619 */ 5620 void 5621 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 5622 vm_pindex_t pindex, vm_size_t size) 5623 { 5624 pd_entry_t *pde; 5625 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5626 vm_paddr_t pa, ptepa; 5627 vm_page_t p, pdpg; 5628 int pat_mode; 5629 5630 PG_A = pmap_accessed_bit(pmap); 5631 PG_M = pmap_modified_bit(pmap); 5632 PG_V = pmap_valid_bit(pmap); 5633 PG_RW = pmap_rw_bit(pmap); 5634 5635 VM_OBJECT_ASSERT_WLOCKED(object); 5636 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 5637 ("pmap_object_init_pt: non-device object")); 5638 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 5639 if (!pmap_ps_enabled(pmap)) 5640 return; 5641 if (!vm_object_populate(object, pindex, pindex + atop(size))) 5642 return; 5643 p = vm_page_lookup(object, pindex); 5644 KASSERT(p->valid == VM_PAGE_BITS_ALL, 5645 ("pmap_object_init_pt: invalid page %p", p)); 5646 pat_mode = p->md.pat_mode; 5647 5648 /* 5649 * Abort the mapping if the first page is not physically 5650 * aligned to a 2MB page boundary. 5651 */ 5652 ptepa = VM_PAGE_TO_PHYS(p); 5653 if (ptepa & (NBPDR - 1)) 5654 return; 5655 5656 /* 5657 * Skip the first page. Abort the mapping if the rest of 5658 * the pages are not physically contiguous or have differing 5659 * memory attributes. 5660 */ 5661 p = TAILQ_NEXT(p, listq); 5662 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 5663 pa += PAGE_SIZE) { 5664 KASSERT(p->valid == VM_PAGE_BITS_ALL, 5665 ("pmap_object_init_pt: invalid page %p", p)); 5666 if (pa != VM_PAGE_TO_PHYS(p) || 5667 pat_mode != p->md.pat_mode) 5668 return; 5669 p = TAILQ_NEXT(p, listq); 5670 } 5671 5672 /* 5673 * Map using 2MB pages. Since "ptepa" is 2M aligned and 5674 * "size" is a multiple of 2M, adding the PAT setting to "pa" 5675 * will not affect the termination of this loop. 5676 */ 5677 PMAP_LOCK(pmap); 5678 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 5679 pa < ptepa + size; pa += NBPDR) { 5680 pdpg = pmap_allocpde(pmap, addr, NULL); 5681 if (pdpg == NULL) { 5682 /* 5683 * The creation of mappings below is only an 5684 * optimization. If a page directory page 5685 * cannot be allocated without blocking, 5686 * continue on to the next mapping rather than 5687 * blocking. 5688 */ 5689 addr += NBPDR; 5690 continue; 5691 } 5692 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 5693 pde = &pde[pmap_pde_index(addr)]; 5694 if ((*pde & PG_V) == 0) { 5695 pde_store(pde, pa | PG_PS | PG_M | PG_A | 5696 PG_U | PG_RW | PG_V); 5697 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 5698 atomic_add_long(&pmap_pde_mappings, 1); 5699 } else { 5700 /* Continue on if the PDE is already valid. */ 5701 pdpg->wire_count--; 5702 KASSERT(pdpg->wire_count > 0, 5703 ("pmap_object_init_pt: missing reference " 5704 "to page directory page, va: 0x%lx", addr)); 5705 } 5706 addr += NBPDR; 5707 } 5708 PMAP_UNLOCK(pmap); 5709 } 5710 } 5711 5712 /* 5713 * Clear the wired attribute from the mappings for the specified range of 5714 * addresses in the given pmap. Every valid mapping within that range 5715 * must have the wired attribute set. 
In contrast, invalid mappings 5716 * cannot have the wired attribute set, so they are ignored. 5717 * 5718 * The wired attribute of the page table entry is not a hardware 5719 * feature, so there is no need to invalidate any TLB entries. 5720 * Since pmap_demote_pde() for the wired entry must never fail, 5721 * pmap_delayed_invl_started()/finished() calls around the 5722 * function are not needed. 5723 */ 5724 void 5725 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5726 { 5727 vm_offset_t va_next; 5728 pml4_entry_t *pml4e; 5729 pdp_entry_t *pdpe; 5730 pd_entry_t *pde; 5731 pt_entry_t *pte, PG_V; 5732 5733 PG_V = pmap_valid_bit(pmap); 5734 PMAP_LOCK(pmap); 5735 for (; sva < eva; sva = va_next) { 5736 pml4e = pmap_pml4e(pmap, sva); 5737 if ((*pml4e & PG_V) == 0) { 5738 va_next = (sva + NBPML4) & ~PML4MASK; 5739 if (va_next < sva) 5740 va_next = eva; 5741 continue; 5742 } 5743 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 5744 if ((*pdpe & PG_V) == 0) { 5745 va_next = (sva + NBPDP) & ~PDPMASK; 5746 if (va_next < sva) 5747 va_next = eva; 5748 continue; 5749 } 5750 va_next = (sva + NBPDR) & ~PDRMASK; 5751 if (va_next < sva) 5752 va_next = eva; 5753 pde = pmap_pdpe_to_pde(pdpe, sva); 5754 if ((*pde & PG_V) == 0) 5755 continue; 5756 if ((*pde & PG_PS) != 0) { 5757 if ((*pde & PG_W) == 0) 5758 panic("pmap_unwire: pde %#jx is missing PG_W", 5759 (uintmax_t)*pde); 5760 5761 /* 5762 * Are we unwiring the entire large page? If not, 5763 * demote the mapping and fall through. 5764 */ 5765 if (sva + NBPDR == va_next && eva >= va_next) { 5766 atomic_clear_long(pde, PG_W); 5767 pmap->pm_stats.wired_count -= NBPDR / 5768 PAGE_SIZE; 5769 continue; 5770 } else if (!pmap_demote_pde(pmap, pde, sva)) 5771 panic("pmap_unwire: demotion failed"); 5772 } 5773 if (va_next > eva) 5774 va_next = eva; 5775 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 5776 sva += PAGE_SIZE) { 5777 if ((*pte & PG_V) == 0) 5778 continue; 5779 if ((*pte & PG_W) == 0) 5780 panic("pmap_unwire: pte %#jx is missing PG_W", 5781 (uintmax_t)*pte); 5782 5783 /* 5784 * PG_W must be cleared atomically. Although the pmap 5785 * lock synchronizes access to PG_W, another processor 5786 * could be setting PG_M and/or PG_A concurrently. 5787 */ 5788 atomic_clear_long(pte, PG_W); 5789 pmap->pm_stats.wired_count--; 5790 } 5791 } 5792 PMAP_UNLOCK(pmap); 5793 } 5794 5795 /* 5796 * Copy the range specified by src_addr/len 5797 * from the source map to the range dst_addr/len 5798 * in the destination map. 5799 * 5800 * This routine is only advisory and need not do anything. 5801 */ 5802 5803 void 5804 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5805 vm_offset_t src_addr) 5806 { 5807 struct rwlock *lock; 5808 struct spglist free; 5809 vm_offset_t addr; 5810 vm_offset_t end_addr = src_addr + len; 5811 vm_offset_t va_next; 5812 vm_page_t dst_pdpg, dstmpte, srcmpte; 5813 pt_entry_t PG_A, PG_M, PG_V; 5814 5815 if (dst_addr != src_addr) 5816 return; 5817 5818 if (dst_pmap->pm_type != src_pmap->pm_type) 5819 return; 5820 5821 /* 5822 * EPT page table entries that require emulation of A/D bits are 5823 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 5824 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 5825 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 5826 * implementations flag an EPT misconfiguration for exec-only 5827 * mappings we skip this function entirely for emulated pmaps. 
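 *
 * For example, copying a readable and executable mapping (XWR = 101)
 * and then clearing its accessed bit (EPT_PG_READ) would leave an
 * exec-only entry (XWR = 100), which such implementations reject.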
5828 */ 5829 if (pmap_emulate_ad_bits(dst_pmap)) 5830 return; 5831 5832 lock = NULL; 5833 if (dst_pmap < src_pmap) { 5834 PMAP_LOCK(dst_pmap); 5835 PMAP_LOCK(src_pmap); 5836 } else { 5837 PMAP_LOCK(src_pmap); 5838 PMAP_LOCK(dst_pmap); 5839 } 5840 5841 PG_A = pmap_accessed_bit(dst_pmap); 5842 PG_M = pmap_modified_bit(dst_pmap); 5843 PG_V = pmap_valid_bit(dst_pmap); 5844 5845 for (addr = src_addr; addr < end_addr; addr = va_next) { 5846 pt_entry_t *src_pte, *dst_pte; 5847 pml4_entry_t *pml4e; 5848 pdp_entry_t *pdpe; 5849 pd_entry_t srcptepaddr, *pde; 5850 5851 KASSERT(addr < UPT_MIN_ADDRESS, 5852 ("pmap_copy: invalid to pmap_copy page tables")); 5853 5854 pml4e = pmap_pml4e(src_pmap, addr); 5855 if ((*pml4e & PG_V) == 0) { 5856 va_next = (addr + NBPML4) & ~PML4MASK; 5857 if (va_next < addr) 5858 va_next = end_addr; 5859 continue; 5860 } 5861 5862 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 5863 if ((*pdpe & PG_V) == 0) { 5864 va_next = (addr + NBPDP) & ~PDPMASK; 5865 if (va_next < addr) 5866 va_next = end_addr; 5867 continue; 5868 } 5869 5870 va_next = (addr + NBPDR) & ~PDRMASK; 5871 if (va_next < addr) 5872 va_next = end_addr; 5873 5874 pde = pmap_pdpe_to_pde(pdpe, addr); 5875 srcptepaddr = *pde; 5876 if (srcptepaddr == 0) 5877 continue; 5878 5879 if (srcptepaddr & PG_PS) { 5880 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 5881 continue; 5882 dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); 5883 if (dst_pdpg == NULL) 5884 break; 5885 pde = (pd_entry_t *) 5886 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 5887 pde = &pde[pmap_pde_index(addr)]; 5888 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 5889 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 5890 PMAP_ENTER_NORECLAIM, &lock))) { 5891 *pde = srcptepaddr & ~PG_W; 5892 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 5893 atomic_add_long(&pmap_pde_mappings, 1); 5894 } else 5895 dst_pdpg->wire_count--; 5896 continue; 5897 } 5898 5899 srcptepaddr &= PG_FRAME; 5900 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 5901 KASSERT(srcmpte->wire_count > 0, 5902 ("pmap_copy: source page table page is unused")); 5903 5904 if (va_next > end_addr) 5905 va_next = end_addr; 5906 5907 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 5908 src_pte = &src_pte[pmap_pte_index(addr)]; 5909 dstmpte = NULL; 5910 while (addr < va_next) { 5911 pt_entry_t ptetemp; 5912 ptetemp = *src_pte; 5913 /* 5914 * we only virtual copy managed pages 5915 */ 5916 if ((ptetemp & PG_MANAGED) != 0) { 5917 if (dstmpte != NULL && 5918 dstmpte->pindex == pmap_pde_pindex(addr)) 5919 dstmpte->wire_count++; 5920 else if ((dstmpte = pmap_allocpte(dst_pmap, 5921 addr, NULL)) == NULL) 5922 goto out; 5923 dst_pte = (pt_entry_t *) 5924 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5925 dst_pte = &dst_pte[pmap_pte_index(addr)]; 5926 if (*dst_pte == 0 && 5927 pmap_try_insert_pv_entry(dst_pmap, addr, 5928 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 5929 &lock)) { 5930 /* 5931 * Clear the wired, modified, and 5932 * accessed (referenced) bits 5933 * during the copy. 5934 */ 5935 *dst_pte = ptetemp & ~(PG_W | PG_M | 5936 PG_A); 5937 pmap_resident_count_inc(dst_pmap, 1); 5938 } else { 5939 SLIST_INIT(&free); 5940 if (pmap_unwire_ptp(dst_pmap, addr, 5941 dstmpte, &free)) { 5942 /* 5943 * Although "addr" is not 5944 * mapped, paging-structure 5945 * caches could nonetheless 5946 * have entries that refer to 5947 * the freed page table pages. 5948 * Invalidate those entries. 
5949 */ 5950 pmap_invalidate_page(dst_pmap, 5951 addr); 5952 vm_page_free_pages_toq(&free, 5953 true); 5954 } 5955 goto out; 5956 } 5957 if (dstmpte->wire_count >= srcmpte->wire_count) 5958 break; 5959 } 5960 addr += PAGE_SIZE; 5961 src_pte++; 5962 } 5963 } 5964 out: 5965 if (lock != NULL) 5966 rw_wunlock(lock); 5967 PMAP_UNLOCK(src_pmap); 5968 PMAP_UNLOCK(dst_pmap); 5969 } 5970 5971 int 5972 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 5973 { 5974 int error; 5975 5976 if (dst_pmap->pm_type != src_pmap->pm_type || 5977 dst_pmap->pm_type != PT_X86 || 5978 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 5979 return (0); 5980 for (;;) { 5981 if (dst_pmap < src_pmap) { 5982 PMAP_LOCK(dst_pmap); 5983 PMAP_LOCK(src_pmap); 5984 } else { 5985 PMAP_LOCK(src_pmap); 5986 PMAP_LOCK(dst_pmap); 5987 } 5988 error = pmap_pkru_copy(dst_pmap, src_pmap); 5989 /* Clean up partial copy on failure due to no memory. */ 5990 if (error == ENOMEM) 5991 pmap_pkru_deassign_all(dst_pmap); 5992 PMAP_UNLOCK(src_pmap); 5993 PMAP_UNLOCK(dst_pmap); 5994 if (error != ENOMEM) 5995 break; 5996 vm_wait(NULL); 5997 } 5998 return (error); 5999 } 6000 6001 /* 6002 * Zero the specified hardware page. 6003 */ 6004 void 6005 pmap_zero_page(vm_page_t m) 6006 { 6007 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6008 6009 pagezero((void *)va); 6010 } 6011 6012 /* 6013 * Zero an an area within a single hardware page. off and size must not 6014 * cover an area beyond a single hardware page. 6015 */ 6016 void 6017 pmap_zero_page_area(vm_page_t m, int off, int size) 6018 { 6019 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6020 6021 if (off == 0 && size == PAGE_SIZE) 6022 pagezero((void *)va); 6023 else 6024 bzero((char *)va + off, size); 6025 } 6026 6027 /* 6028 * Copy 1 specified hardware page to another. 6029 */ 6030 void 6031 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 6032 { 6033 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 6034 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 6035 6036 pagecopy((void *)src, (void *)dst); 6037 } 6038 6039 int unmapped_buf_allowed = 1; 6040 6041 void 6042 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 6043 vm_offset_t b_offset, int xfersize) 6044 { 6045 void *a_cp, *b_cp; 6046 vm_page_t pages[2]; 6047 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 6048 int cnt; 6049 boolean_t mapped; 6050 6051 while (xfersize > 0) { 6052 a_pg_offset = a_offset & PAGE_MASK; 6053 pages[0] = ma[a_offset >> PAGE_SHIFT]; 6054 b_pg_offset = b_offset & PAGE_MASK; 6055 pages[1] = mb[b_offset >> PAGE_SHIFT]; 6056 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 6057 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 6058 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 6059 a_cp = (char *)vaddr[0] + a_pg_offset; 6060 b_cp = (char *)vaddr[1] + b_pg_offset; 6061 bcopy(a_cp, b_cp, cnt); 6062 if (__predict_false(mapped)) 6063 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 6064 a_offset += cnt; 6065 b_offset += cnt; 6066 xfersize -= cnt; 6067 } 6068 } 6069 6070 /* 6071 * Returns true if the pmap's pv is one of the first 6072 * 16 pvs linked to from this page. This count may 6073 * be changed upwards or downwards in the future; it 6074 * is only necessary that true be returned for a small 6075 * subset of pmaps for proper page aging. 
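 *
 * A hedged sketch of the intended use (hypothetical caller): code that
 * only needs a cheap, approximate "is this page mapped by this pmap?"
 * answer can do
 *
 *	if (pmap_page_exists_quick(pmap, m))
 *		age_or_deactivate(m);	(hypothetical helper)
 *
 * and must tolerate a false negative when the pmap's pv entry happens
 * to sit beyond the first 16 entries on the page's pv lists.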
6076 */ 6077 boolean_t 6078 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 6079 { 6080 struct md_page *pvh; 6081 struct rwlock *lock; 6082 pv_entry_t pv; 6083 int loops = 0; 6084 boolean_t rv; 6085 6086 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6087 ("pmap_page_exists_quick: page %p is not managed", m)); 6088 rv = FALSE; 6089 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6090 rw_rlock(lock); 6091 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6092 if (PV_PMAP(pv) == pmap) { 6093 rv = TRUE; 6094 break; 6095 } 6096 loops++; 6097 if (loops >= 16) 6098 break; 6099 } 6100 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 6101 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6102 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6103 if (PV_PMAP(pv) == pmap) { 6104 rv = TRUE; 6105 break; 6106 } 6107 loops++; 6108 if (loops >= 16) 6109 break; 6110 } 6111 } 6112 rw_runlock(lock); 6113 return (rv); 6114 } 6115 6116 /* 6117 * pmap_page_wired_mappings: 6118 * 6119 * Return the number of managed mappings to the given physical page 6120 * that are wired. 6121 */ 6122 int 6123 pmap_page_wired_mappings(vm_page_t m) 6124 { 6125 struct rwlock *lock; 6126 struct md_page *pvh; 6127 pmap_t pmap; 6128 pt_entry_t *pte; 6129 pv_entry_t pv; 6130 int count, md_gen, pvh_gen; 6131 6132 if ((m->oflags & VPO_UNMANAGED) != 0) 6133 return (0); 6134 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6135 rw_rlock(lock); 6136 restart: 6137 count = 0; 6138 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6139 pmap = PV_PMAP(pv); 6140 if (!PMAP_TRYLOCK(pmap)) { 6141 md_gen = m->md.pv_gen; 6142 rw_runlock(lock); 6143 PMAP_LOCK(pmap); 6144 rw_rlock(lock); 6145 if (md_gen != m->md.pv_gen) { 6146 PMAP_UNLOCK(pmap); 6147 goto restart; 6148 } 6149 } 6150 pte = pmap_pte(pmap, pv->pv_va); 6151 if ((*pte & PG_W) != 0) 6152 count++; 6153 PMAP_UNLOCK(pmap); 6154 } 6155 if ((m->flags & PG_FICTITIOUS) == 0) { 6156 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6157 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6158 pmap = PV_PMAP(pv); 6159 if (!PMAP_TRYLOCK(pmap)) { 6160 md_gen = m->md.pv_gen; 6161 pvh_gen = pvh->pv_gen; 6162 rw_runlock(lock); 6163 PMAP_LOCK(pmap); 6164 rw_rlock(lock); 6165 if (md_gen != m->md.pv_gen || 6166 pvh_gen != pvh->pv_gen) { 6167 PMAP_UNLOCK(pmap); 6168 goto restart; 6169 } 6170 } 6171 pte = pmap_pde(pmap, pv->pv_va); 6172 if ((*pte & PG_W) != 0) 6173 count++; 6174 PMAP_UNLOCK(pmap); 6175 } 6176 } 6177 rw_runlock(lock); 6178 return (count); 6179 } 6180 6181 /* 6182 * Returns TRUE if the given page is mapped individually or as part of 6183 * a 2mpage. Otherwise, returns FALSE. 6184 */ 6185 boolean_t 6186 pmap_page_is_mapped(vm_page_t m) 6187 { 6188 struct rwlock *lock; 6189 boolean_t rv; 6190 6191 if ((m->oflags & VPO_UNMANAGED) != 0) 6192 return (FALSE); 6193 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6194 rw_rlock(lock); 6195 rv = !TAILQ_EMPTY(&m->md.pv_list) || 6196 ((m->flags & PG_FICTITIOUS) == 0 && 6197 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 6198 rw_runlock(lock); 6199 return (rv); 6200 } 6201 6202 /* 6203 * Destroy all managed, non-wired mappings in the given user-space 6204 * pmap. This pmap cannot be active on any processor besides the 6205 * caller. 6206 * 6207 * This function cannot be applied to the kernel pmap. Moreover, it 6208 * is not intended for general use. It is only to be used during 6209 * process termination. Consequently, it can be implemented in ways 6210 * that make it faster than pmap_remove(). 
First, it can more quickly 6211 * destroy mappings by iterating over the pmap's collection of PV 6212 * entries, rather than searching the page table. Second, it doesn't 6213 * have to test and clear the page table entries atomically, because 6214 * no processor is currently accessing the user address space. In 6215 * particular, a page table entry's dirty bit won't change state once 6216 * this function starts. 6217 * 6218 * Although this function destroys all of the pmap's managed, 6219 * non-wired mappings, it can delay and batch the invalidation of TLB 6220 * entries without calling pmap_delayed_invl_started() and 6221 * pmap_delayed_invl_finished(). Because the pmap is not active on 6222 * any other processor, none of these TLB entries will ever be used 6223 * before their eventual invalidation. Consequently, there is no need 6224 * for either pmap_remove_all() or pmap_remove_write() to wait for 6225 * that eventual TLB invalidation. 6226 */ 6227 void 6228 pmap_remove_pages(pmap_t pmap) 6229 { 6230 pd_entry_t ptepde; 6231 pt_entry_t *pte, tpte; 6232 pt_entry_t PG_M, PG_RW, PG_V; 6233 struct spglist free; 6234 vm_page_t m, mpte, mt; 6235 pv_entry_t pv; 6236 struct md_page *pvh; 6237 struct pv_chunk *pc, *npc; 6238 struct rwlock *lock; 6239 int64_t bit; 6240 uint64_t inuse, bitmask; 6241 int allfree, field, freed, idx; 6242 boolean_t superpage; 6243 vm_paddr_t pa; 6244 6245 /* 6246 * Assert that the given pmap is only active on the current 6247 * CPU. Unfortunately, we cannot block another CPU from 6248 * activating the pmap while this function is executing. 6249 */ 6250 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 6251 #ifdef INVARIANTS 6252 { 6253 cpuset_t other_cpus; 6254 6255 other_cpus = all_cpus; 6256 critical_enter(); 6257 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 6258 CPU_AND(&other_cpus, &pmap->pm_active); 6259 critical_exit(); 6260 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 6261 } 6262 #endif 6263 6264 lock = NULL; 6265 PG_M = pmap_modified_bit(pmap); 6266 PG_V = pmap_valid_bit(pmap); 6267 PG_RW = pmap_rw_bit(pmap); 6268 6269 SLIST_INIT(&free); 6270 PMAP_LOCK(pmap); 6271 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 6272 allfree = 1; 6273 freed = 0; 6274 for (field = 0; field < _NPCM; field++) { 6275 inuse = ~pc->pc_map[field] & pc_freemask[field]; 6276 while (inuse != 0) { 6277 bit = bsfq(inuse); 6278 bitmask = 1UL << bit; 6279 idx = field * 64 + bit; 6280 pv = &pc->pc_pventry[idx]; 6281 inuse &= ~bitmask; 6282 6283 pte = pmap_pdpe(pmap, pv->pv_va); 6284 ptepde = *pte; 6285 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 6286 tpte = *pte; 6287 if ((tpte & (PG_PS | PG_V)) == PG_V) { 6288 superpage = FALSE; 6289 ptepde = tpte; 6290 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 6291 PG_FRAME); 6292 pte = &pte[pmap_pte_index(pv->pv_va)]; 6293 tpte = *pte; 6294 } else { 6295 /* 6296 * Keep track whether 'tpte' is a 6297 * superpage explicitly instead of 6298 * relying on PG_PS being set. 6299 * 6300 * This is because PG_PS is numerically 6301 * identical to PG_PTE_PAT and thus a 6302 * regular page could be mistaken for 6303 * a superpage. 
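 *
 * (In ordinary x86 page tables both flags are bit 7, 0x80: in a
 * PDE that bit means PG_PS, while in a 4KB PTE the same bit is
 * the PAT selector, so testing PG_PS on a leaf PTE would be
 * meaningless.)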
6304 */ 6305 superpage = TRUE; 6306 } 6307 6308 if ((tpte & PG_V) == 0) { 6309 panic("bad pte va %lx pte %lx", 6310 pv->pv_va, tpte); 6311 } 6312 6313 /* 6314 * We cannot remove wired pages from a process' mapping at this time 6315 */ 6316 if (tpte & PG_W) { 6317 allfree = 0; 6318 continue; 6319 } 6320 6321 if (superpage) 6322 pa = tpte & PG_PS_FRAME; 6323 else 6324 pa = tpte & PG_FRAME; 6325 6326 m = PHYS_TO_VM_PAGE(pa); 6327 KASSERT(m->phys_addr == pa, 6328 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 6329 m, (uintmax_t)m->phys_addr, 6330 (uintmax_t)tpte)); 6331 6332 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 6333 m < &vm_page_array[vm_page_array_size], 6334 ("pmap_remove_pages: bad tpte %#jx", 6335 (uintmax_t)tpte)); 6336 6337 pte_clear(pte); 6338 6339 /* 6340 * Update the vm_page_t clean/reference bits. 6341 */ 6342 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6343 if (superpage) { 6344 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6345 vm_page_dirty(mt); 6346 } else 6347 vm_page_dirty(m); 6348 } 6349 6350 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 6351 6352 /* Mark free */ 6353 pc->pc_map[field] |= bitmask; 6354 if (superpage) { 6355 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 6356 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 6357 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6358 pvh->pv_gen++; 6359 if (TAILQ_EMPTY(&pvh->pv_list)) { 6360 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6361 if ((mt->aflags & PGA_WRITEABLE) != 0 && 6362 TAILQ_EMPTY(&mt->md.pv_list)) 6363 vm_page_aflag_clear(mt, PGA_WRITEABLE); 6364 } 6365 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 6366 if (mpte != NULL) { 6367 pmap_resident_count_dec(pmap, 1); 6368 KASSERT(mpte->wire_count == NPTEPG, 6369 ("pmap_remove_pages: pte page wire count error")); 6370 mpte->wire_count = 0; 6371 pmap_add_delayed_free_list(mpte, &free, FALSE); 6372 } 6373 } else { 6374 pmap_resident_count_dec(pmap, 1); 6375 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6376 m->md.pv_gen++; 6377 if ((m->aflags & PGA_WRITEABLE) != 0 && 6378 TAILQ_EMPTY(&m->md.pv_list) && 6379 (m->flags & PG_FICTITIOUS) == 0) { 6380 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6381 if (TAILQ_EMPTY(&pvh->pv_list)) 6382 vm_page_aflag_clear(m, PGA_WRITEABLE); 6383 } 6384 } 6385 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 6386 freed++; 6387 } 6388 } 6389 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 6390 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 6391 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 6392 if (allfree) { 6393 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 6394 free_pv_chunk(pc); 6395 } 6396 } 6397 if (lock != NULL) 6398 rw_wunlock(lock); 6399 pmap_invalidate_all(pmap); 6400 pmap_pkru_deassign_all(pmap); 6401 PMAP_UNLOCK(pmap); 6402 vm_page_free_pages_toq(&free, true); 6403 } 6404 6405 static boolean_t 6406 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 6407 { 6408 struct rwlock *lock; 6409 pv_entry_t pv; 6410 struct md_page *pvh; 6411 pt_entry_t *pte, mask; 6412 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6413 pmap_t pmap; 6414 int md_gen, pvh_gen; 6415 boolean_t rv; 6416 6417 rv = FALSE; 6418 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6419 rw_rlock(lock); 6420 restart: 6421 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6422 pmap = PV_PMAP(pv); 6423 if (!PMAP_TRYLOCK(pmap)) { 6424 md_gen = m->md.pv_gen; 6425 rw_runlock(lock); 6426 PMAP_LOCK(pmap); 6427 rw_rlock(lock); 6428 if (md_gen != m->md.pv_gen) { 6429 PMAP_UNLOCK(pmap); 6430 goto restart; 6431 } 6432 } 6433 pte = pmap_pte(pmap, pv->pv_va); 6434 mask = 0; 6435 if (modified) { 
6436 PG_M = pmap_modified_bit(pmap); 6437 PG_RW = pmap_rw_bit(pmap); 6438 mask |= PG_RW | PG_M; 6439 } 6440 if (accessed) { 6441 PG_A = pmap_accessed_bit(pmap); 6442 PG_V = pmap_valid_bit(pmap); 6443 mask |= PG_V | PG_A; 6444 } 6445 rv = (*pte & mask) == mask; 6446 PMAP_UNLOCK(pmap); 6447 if (rv) 6448 goto out; 6449 } 6450 if ((m->flags & PG_FICTITIOUS) == 0) { 6451 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6452 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6453 pmap = PV_PMAP(pv); 6454 if (!PMAP_TRYLOCK(pmap)) { 6455 md_gen = m->md.pv_gen; 6456 pvh_gen = pvh->pv_gen; 6457 rw_runlock(lock); 6458 PMAP_LOCK(pmap); 6459 rw_rlock(lock); 6460 if (md_gen != m->md.pv_gen || 6461 pvh_gen != pvh->pv_gen) { 6462 PMAP_UNLOCK(pmap); 6463 goto restart; 6464 } 6465 } 6466 pte = pmap_pde(pmap, pv->pv_va); 6467 mask = 0; 6468 if (modified) { 6469 PG_M = pmap_modified_bit(pmap); 6470 PG_RW = pmap_rw_bit(pmap); 6471 mask |= PG_RW | PG_M; 6472 } 6473 if (accessed) { 6474 PG_A = pmap_accessed_bit(pmap); 6475 PG_V = pmap_valid_bit(pmap); 6476 mask |= PG_V | PG_A; 6477 } 6478 rv = (*pte & mask) == mask; 6479 PMAP_UNLOCK(pmap); 6480 if (rv) 6481 goto out; 6482 } 6483 } 6484 out: 6485 rw_runlock(lock); 6486 return (rv); 6487 } 6488 6489 /* 6490 * pmap_is_modified: 6491 * 6492 * Return whether or not the specified physical page was modified 6493 * in any physical maps. 6494 */ 6495 boolean_t 6496 pmap_is_modified(vm_page_t m) 6497 { 6498 6499 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6500 ("pmap_is_modified: page %p is not managed", m)); 6501 6502 /* 6503 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 6504 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 6505 * is clear, no PTEs can have PG_M set. 6506 */ 6507 VM_OBJECT_ASSERT_WLOCKED(m->object); 6508 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 6509 return (FALSE); 6510 return (pmap_page_test_mappings(m, FALSE, TRUE)); 6511 } 6512 6513 /* 6514 * pmap_is_prefaultable: 6515 * 6516 * Return whether or not the specified virtual address is eligible 6517 * for prefault. 6518 */ 6519 boolean_t 6520 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 6521 { 6522 pd_entry_t *pde; 6523 pt_entry_t *pte, PG_V; 6524 boolean_t rv; 6525 6526 PG_V = pmap_valid_bit(pmap); 6527 rv = FALSE; 6528 PMAP_LOCK(pmap); 6529 pde = pmap_pde(pmap, addr); 6530 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 6531 pte = pmap_pde_to_pte(pde, addr); 6532 rv = (*pte & PG_V) == 0; 6533 } 6534 PMAP_UNLOCK(pmap); 6535 return (rv); 6536 } 6537 6538 /* 6539 * pmap_is_referenced: 6540 * 6541 * Return whether or not the specified physical page was referenced 6542 * in any physical maps. 6543 */ 6544 boolean_t 6545 pmap_is_referenced(vm_page_t m) 6546 { 6547 6548 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6549 ("pmap_is_referenced: page %p is not managed", m)); 6550 return (pmap_page_test_mappings(m, TRUE, FALSE)); 6551 } 6552 6553 /* 6554 * Clear the write and modified bits in each of the given page's mappings. 6555 */ 6556 void 6557 pmap_remove_write(vm_page_t m) 6558 { 6559 struct md_page *pvh; 6560 pmap_t pmap; 6561 struct rwlock *lock; 6562 pv_entry_t next_pv, pv; 6563 pd_entry_t *pde; 6564 pt_entry_t oldpte, *pte, PG_M, PG_RW; 6565 vm_offset_t va; 6566 int pvh_gen, md_gen; 6567 6568 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6569 ("pmap_remove_write: page %p is not managed", m)); 6570 6571 /* 6572 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 6573 * set by another thread while the object is locked. 
Thus, 6574 * if PGA_WRITEABLE is clear, no page table entries need updating. 6575 */ 6576 VM_OBJECT_ASSERT_WLOCKED(m->object); 6577 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 6578 return; 6579 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6580 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6581 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6582 retry_pv_loop: 6583 rw_wlock(lock); 6584 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6585 pmap = PV_PMAP(pv); 6586 if (!PMAP_TRYLOCK(pmap)) { 6587 pvh_gen = pvh->pv_gen; 6588 rw_wunlock(lock); 6589 PMAP_LOCK(pmap); 6590 rw_wlock(lock); 6591 if (pvh_gen != pvh->pv_gen) { 6592 PMAP_UNLOCK(pmap); 6593 rw_wunlock(lock); 6594 goto retry_pv_loop; 6595 } 6596 } 6597 PG_RW = pmap_rw_bit(pmap); 6598 va = pv->pv_va; 6599 pde = pmap_pde(pmap, va); 6600 if ((*pde & PG_RW) != 0) 6601 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6602 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6603 ("inconsistent pv lock %p %p for page %p", 6604 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6605 PMAP_UNLOCK(pmap); 6606 } 6607 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6608 pmap = PV_PMAP(pv); 6609 if (!PMAP_TRYLOCK(pmap)) { 6610 pvh_gen = pvh->pv_gen; 6611 md_gen = m->md.pv_gen; 6612 rw_wunlock(lock); 6613 PMAP_LOCK(pmap); 6614 rw_wlock(lock); 6615 if (pvh_gen != pvh->pv_gen || 6616 md_gen != m->md.pv_gen) { 6617 PMAP_UNLOCK(pmap); 6618 rw_wunlock(lock); 6619 goto retry_pv_loop; 6620 } 6621 } 6622 PG_M = pmap_modified_bit(pmap); 6623 PG_RW = pmap_rw_bit(pmap); 6624 pde = pmap_pde(pmap, pv->pv_va); 6625 KASSERT((*pde & PG_PS) == 0, 6626 ("pmap_remove_write: found a 2mpage in page %p's pv list", 6627 m)); 6628 pte = pmap_pde_to_pte(pde, pv->pv_va); 6629 retry: 6630 oldpte = *pte; 6631 if (oldpte & PG_RW) { 6632 if (!atomic_cmpset_long(pte, oldpte, oldpte & 6633 ~(PG_RW | PG_M))) 6634 goto retry; 6635 if ((oldpte & PG_M) != 0) 6636 vm_page_dirty(m); 6637 pmap_invalidate_page(pmap, pv->pv_va); 6638 } 6639 PMAP_UNLOCK(pmap); 6640 } 6641 rw_wunlock(lock); 6642 vm_page_aflag_clear(m, PGA_WRITEABLE); 6643 pmap_delayed_invl_wait(m); 6644 } 6645 6646 static __inline boolean_t 6647 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 6648 { 6649 6650 if (!pmap_emulate_ad_bits(pmap)) 6651 return (TRUE); 6652 6653 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 6654 6655 /* 6656 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 6657 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 6658 * if the EPT_PG_WRITE bit is set. 6659 */ 6660 if ((pte & EPT_PG_WRITE) != 0) 6661 return (FALSE); 6662 6663 /* 6664 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 6665 */ 6666 if ((pte & EPT_PG_EXECUTE) == 0 || 6667 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 6668 return (TRUE); 6669 else 6670 return (FALSE); 6671 } 6672 6673 /* 6674 * pmap_ts_referenced: 6675 * 6676 * Return a count of reference bits for a page, clearing those bits. 6677 * It is not necessary for every reference bit to be cleared, but it 6678 * is necessary that 0 only be returned when there are truly no 6679 * reference bits set. 6680 * 6681 * As an optimization, update the page's dirty field if a modified bit is 6682 * found while counting reference bits. This opportunistic update can be 6683 * performed at low cost and can eliminate the need for some future calls 6684 * to pmap_is_modified(). However, since this function stops after 6685 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 6686 * dirty pages. 
Those dirty pages will only be detected by a future call 6687 * to pmap_is_modified(). 6688 * 6689 * A DI block is not needed within this function, because 6690 * invalidations are performed before the PV list lock is 6691 * released. 6692 */ 6693 int 6694 pmap_ts_referenced(vm_page_t m) 6695 { 6696 struct md_page *pvh; 6697 pv_entry_t pv, pvf; 6698 pmap_t pmap; 6699 struct rwlock *lock; 6700 pd_entry_t oldpde, *pde; 6701 pt_entry_t *pte, PG_A, PG_M, PG_RW; 6702 vm_offset_t va; 6703 vm_paddr_t pa; 6704 int cleared, md_gen, not_cleared, pvh_gen; 6705 struct spglist free; 6706 boolean_t demoted; 6707 6708 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6709 ("pmap_ts_referenced: page %p is not managed", m)); 6710 SLIST_INIT(&free); 6711 cleared = 0; 6712 pa = VM_PAGE_TO_PHYS(m); 6713 lock = PHYS_TO_PV_LIST_LOCK(pa); 6714 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 6715 rw_wlock(lock); 6716 retry: 6717 not_cleared = 0; 6718 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 6719 goto small_mappings; 6720 pv = pvf; 6721 do { 6722 if (pvf == NULL) 6723 pvf = pv; 6724 pmap = PV_PMAP(pv); 6725 if (!PMAP_TRYLOCK(pmap)) { 6726 pvh_gen = pvh->pv_gen; 6727 rw_wunlock(lock); 6728 PMAP_LOCK(pmap); 6729 rw_wlock(lock); 6730 if (pvh_gen != pvh->pv_gen) { 6731 PMAP_UNLOCK(pmap); 6732 goto retry; 6733 } 6734 } 6735 PG_A = pmap_accessed_bit(pmap); 6736 PG_M = pmap_modified_bit(pmap); 6737 PG_RW = pmap_rw_bit(pmap); 6738 va = pv->pv_va; 6739 pde = pmap_pde(pmap, pv->pv_va); 6740 oldpde = *pde; 6741 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6742 /* 6743 * Although "oldpde" is mapping a 2MB page, because 6744 * this function is called at a 4KB page granularity, 6745 * we only update the 4KB page under test. 6746 */ 6747 vm_page_dirty(m); 6748 } 6749 if ((oldpde & PG_A) != 0) { 6750 /* 6751 * Since this reference bit is shared by 512 4KB 6752 * pages, it should not be cleared every time it is 6753 * tested. Apply a simple "hash" function on the 6754 * physical page number, the virtual superpage number, 6755 * and the pmap address to select one 4KB page out of 6756 * the 512 on which testing the reference bit will 6757 * result in clearing that reference bit. This 6758 * function is designed to avoid the selection of the 6759 * same 4KB page for every 2MB page mapping. 6760 * 6761 * On demotion, a mapping that hasn't been referenced 6762 * is simply destroyed. To avoid the possibility of a 6763 * subsequent page fault on a demoted wired mapping, 6764 * always leave its reference bit set. Moreover, 6765 * since the superpage is wired, the current state of 6766 * its reference bit won't affect page replacement. 6767 */ 6768 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 6769 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 6770 (oldpde & PG_W) == 0) { 6771 if (safe_to_clear_referenced(pmap, oldpde)) { 6772 atomic_clear_long(pde, PG_A); 6773 pmap_invalidate_page(pmap, pv->pv_va); 6774 demoted = FALSE; 6775 } else if (pmap_demote_pde_locked(pmap, pde, 6776 pv->pv_va, &lock)) { 6777 /* 6778 * Remove the mapping to a single page 6779 * so that a subsequent access may 6780 * repromote. Since the underlying 6781 * page table page is fully populated, 6782 * this removal never frees a page 6783 * table page. 
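 *
 * The page removed is the 4KB page under test: "va" is
 * adjusted below from the superpage base to the offset of
 * "m", so at most one of the 512 PTEs is destroyed here.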
6784 */ 6785 demoted = TRUE; 6786 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6787 PG_PS_FRAME); 6788 pte = pmap_pde_to_pte(pde, va); 6789 pmap_remove_pte(pmap, pte, va, *pde, 6790 NULL, &lock); 6791 pmap_invalidate_page(pmap, va); 6792 } else 6793 demoted = TRUE; 6794 6795 if (demoted) { 6796 /* 6797 * The superpage mapping was removed 6798 * entirely and therefore 'pv' is no 6799 * longer valid. 6800 */ 6801 if (pvf == pv) 6802 pvf = NULL; 6803 pv = NULL; 6804 } 6805 cleared++; 6806 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6807 ("inconsistent pv lock %p %p for page %p", 6808 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6809 } else 6810 not_cleared++; 6811 } 6812 PMAP_UNLOCK(pmap); 6813 /* Rotate the PV list if it has more than one entry. */ 6814 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6815 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6816 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 6817 pvh->pv_gen++; 6818 } 6819 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 6820 goto out; 6821 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 6822 small_mappings: 6823 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 6824 goto out; 6825 pv = pvf; 6826 do { 6827 if (pvf == NULL) 6828 pvf = pv; 6829 pmap = PV_PMAP(pv); 6830 if (!PMAP_TRYLOCK(pmap)) { 6831 pvh_gen = pvh->pv_gen; 6832 md_gen = m->md.pv_gen; 6833 rw_wunlock(lock); 6834 PMAP_LOCK(pmap); 6835 rw_wlock(lock); 6836 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6837 PMAP_UNLOCK(pmap); 6838 goto retry; 6839 } 6840 } 6841 PG_A = pmap_accessed_bit(pmap); 6842 PG_M = pmap_modified_bit(pmap); 6843 PG_RW = pmap_rw_bit(pmap); 6844 pde = pmap_pde(pmap, pv->pv_va); 6845 KASSERT((*pde & PG_PS) == 0, 6846 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 6847 m)); 6848 pte = pmap_pde_to_pte(pde, pv->pv_va); 6849 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6850 vm_page_dirty(m); 6851 if ((*pte & PG_A) != 0) { 6852 if (safe_to_clear_referenced(pmap, *pte)) { 6853 atomic_clear_long(pte, PG_A); 6854 pmap_invalidate_page(pmap, pv->pv_va); 6855 cleared++; 6856 } else if ((*pte & PG_W) == 0) { 6857 /* 6858 * Wired pages cannot be paged out so 6859 * doing accessed bit emulation for 6860 * them is wasted effort. We do the 6861 * hard work for unwired pages only. 6862 */ 6863 pmap_remove_pte(pmap, pte, pv->pv_va, 6864 *pde, &free, &lock); 6865 pmap_invalidate_page(pmap, pv->pv_va); 6866 cleared++; 6867 if (pvf == pv) 6868 pvf = NULL; 6869 pv = NULL; 6870 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6871 ("inconsistent pv lock %p %p for page %p", 6872 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6873 } else 6874 not_cleared++; 6875 } 6876 PMAP_UNLOCK(pmap); 6877 /* Rotate the PV list if it has more than one entry. */ 6878 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6879 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6880 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 6881 m->md.pv_gen++; 6882 } 6883 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 6884 not_cleared < PMAP_TS_REFERENCED_MAX); 6885 out: 6886 rw_wunlock(lock); 6887 vm_page_free_pages_toq(&free, true); 6888 return (cleared + not_cleared); 6889 } 6890 6891 /* 6892 * Apply the given advice to the specified range of addresses within the 6893 * given pmap. Depending on the advice, clear the referenced and/or 6894 * modified flags in each mapping and set the mapped page's dirty field. 
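 *
 * A hedged illustration of how this is typically reached (hypothetical
 * user-level code):
 *
 *	madvise(addr, len, MADV_FREE);
 *
 * arrives here by way of vm_map_madvise(). For MADV_DONTNEED the
 * modified bit is first transferred to the vm_page (vm_page_dirty())
 * and then cleared; for MADV_FREE it is simply cleared along with the
 * accessed bit.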
6895 */ 6896 void 6897 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 6898 { 6899 struct rwlock *lock; 6900 pml4_entry_t *pml4e; 6901 pdp_entry_t *pdpe; 6902 pd_entry_t oldpde, *pde; 6903 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 6904 vm_offset_t va, va_next; 6905 vm_page_t m; 6906 boolean_t anychanged; 6907 6908 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6909 return; 6910 6911 /* 6912 * A/D bit emulation requires an alternate code path when clearing 6913 * the modified and accessed bits below. Since this function is 6914 * advisory in nature we skip it entirely for pmaps that require 6915 * A/D bit emulation. 6916 */ 6917 if (pmap_emulate_ad_bits(pmap)) 6918 return; 6919 6920 PG_A = pmap_accessed_bit(pmap); 6921 PG_G = pmap_global_bit(pmap); 6922 PG_M = pmap_modified_bit(pmap); 6923 PG_V = pmap_valid_bit(pmap); 6924 PG_RW = pmap_rw_bit(pmap); 6925 anychanged = FALSE; 6926 pmap_delayed_invl_started(); 6927 PMAP_LOCK(pmap); 6928 for (; sva < eva; sva = va_next) { 6929 pml4e = pmap_pml4e(pmap, sva); 6930 if ((*pml4e & PG_V) == 0) { 6931 va_next = (sva + NBPML4) & ~PML4MASK; 6932 if (va_next < sva) 6933 va_next = eva; 6934 continue; 6935 } 6936 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6937 if ((*pdpe & PG_V) == 0) { 6938 va_next = (sva + NBPDP) & ~PDPMASK; 6939 if (va_next < sva) 6940 va_next = eva; 6941 continue; 6942 } 6943 va_next = (sva + NBPDR) & ~PDRMASK; 6944 if (va_next < sva) 6945 va_next = eva; 6946 pde = pmap_pdpe_to_pde(pdpe, sva); 6947 oldpde = *pde; 6948 if ((oldpde & PG_V) == 0) 6949 continue; 6950 else if ((oldpde & PG_PS) != 0) { 6951 if ((oldpde & PG_MANAGED) == 0) 6952 continue; 6953 lock = NULL; 6954 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 6955 if (lock != NULL) 6956 rw_wunlock(lock); 6957 6958 /* 6959 * The large page mapping was destroyed. 6960 */ 6961 continue; 6962 } 6963 6964 /* 6965 * Unless the page mappings are wired, remove the 6966 * mapping to a single page so that a subsequent 6967 * access may repromote. Since the underlying page 6968 * table page is fully populated, this removal never 6969 * frees a page table page. 6970 */ 6971 if ((oldpde & PG_W) == 0) { 6972 pte = pmap_pde_to_pte(pde, sva); 6973 KASSERT((*pte & PG_V) != 0, 6974 ("pmap_advise: invalid PTE")); 6975 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 6976 &lock); 6977 anychanged = TRUE; 6978 } 6979 if (lock != NULL) 6980 rw_wunlock(lock); 6981 } 6982 if (va_next > eva) 6983 va_next = eva; 6984 va = va_next; 6985 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6986 sva += PAGE_SIZE) { 6987 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 6988 goto maybe_invlrng; 6989 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6990 if (advice == MADV_DONTNEED) { 6991 /* 6992 * Future calls to pmap_is_modified() 6993 * can be avoided by making the page 6994 * dirty now. 
6995 */ 6996 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6997 vm_page_dirty(m); 6998 } 6999 atomic_clear_long(pte, PG_M | PG_A); 7000 } else if ((*pte & PG_A) != 0) 7001 atomic_clear_long(pte, PG_A); 7002 else 7003 goto maybe_invlrng; 7004 7005 if ((*pte & PG_G) != 0) { 7006 if (va == va_next) 7007 va = sva; 7008 } else 7009 anychanged = TRUE; 7010 continue; 7011 maybe_invlrng: 7012 if (va != va_next) { 7013 pmap_invalidate_range(pmap, va, sva); 7014 va = va_next; 7015 } 7016 } 7017 if (va != va_next) 7018 pmap_invalidate_range(pmap, va, sva); 7019 } 7020 if (anychanged) 7021 pmap_invalidate_all(pmap); 7022 PMAP_UNLOCK(pmap); 7023 pmap_delayed_invl_finished(); 7024 } 7025 7026 /* 7027 * Clear the modify bits on the specified physical page. 7028 */ 7029 void 7030 pmap_clear_modify(vm_page_t m) 7031 { 7032 struct md_page *pvh; 7033 pmap_t pmap; 7034 pv_entry_t next_pv, pv; 7035 pd_entry_t oldpde, *pde; 7036 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 7037 struct rwlock *lock; 7038 vm_offset_t va; 7039 int md_gen, pvh_gen; 7040 7041 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7042 ("pmap_clear_modify: page %p is not managed", m)); 7043 VM_OBJECT_ASSERT_WLOCKED(m->object); 7044 KASSERT(!vm_page_xbusied(m), 7045 ("pmap_clear_modify: page %p is exclusive busied", m)); 7046 7047 /* 7048 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 7049 * If the object containing the page is locked and the page is not 7050 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 7051 */ 7052 if ((m->aflags & PGA_WRITEABLE) == 0) 7053 return; 7054 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 7055 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 7056 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7057 rw_wlock(lock); 7058 restart: 7059 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7060 pmap = PV_PMAP(pv); 7061 if (!PMAP_TRYLOCK(pmap)) { 7062 pvh_gen = pvh->pv_gen; 7063 rw_wunlock(lock); 7064 PMAP_LOCK(pmap); 7065 rw_wlock(lock); 7066 if (pvh_gen != pvh->pv_gen) { 7067 PMAP_UNLOCK(pmap); 7068 goto restart; 7069 } 7070 } 7071 PG_M = pmap_modified_bit(pmap); 7072 PG_V = pmap_valid_bit(pmap); 7073 PG_RW = pmap_rw_bit(pmap); 7074 va = pv->pv_va; 7075 pde = pmap_pde(pmap, va); 7076 oldpde = *pde; 7077 if ((oldpde & PG_RW) != 0) { 7078 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 7079 if ((oldpde & PG_W) == 0) { 7080 /* 7081 * Write protect the mapping to a 7082 * single page so that a subsequent 7083 * write access may repromote. 
7084 */ 7085 va += VM_PAGE_TO_PHYS(m) - (oldpde & 7086 PG_PS_FRAME); 7087 pte = pmap_pde_to_pte(pde, va); 7088 oldpte = *pte; 7089 if ((oldpte & PG_V) != 0) { 7090 while (!atomic_cmpset_long(pte, 7091 oldpte, 7092 oldpte & ~(PG_M | PG_RW))) 7093 oldpte = *pte; 7094 vm_page_dirty(m); 7095 pmap_invalidate_page(pmap, va); 7096 } 7097 } 7098 } 7099 } 7100 PMAP_UNLOCK(pmap); 7101 } 7102 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7103 pmap = PV_PMAP(pv); 7104 if (!PMAP_TRYLOCK(pmap)) { 7105 md_gen = m->md.pv_gen; 7106 pvh_gen = pvh->pv_gen; 7107 rw_wunlock(lock); 7108 PMAP_LOCK(pmap); 7109 rw_wlock(lock); 7110 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7111 PMAP_UNLOCK(pmap); 7112 goto restart; 7113 } 7114 } 7115 PG_M = pmap_modified_bit(pmap); 7116 PG_RW = pmap_rw_bit(pmap); 7117 pde = pmap_pde(pmap, pv->pv_va); 7118 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 7119 " a 2mpage in page %p's pv list", m)); 7120 pte = pmap_pde_to_pte(pde, pv->pv_va); 7121 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 7122 atomic_clear_long(pte, PG_M); 7123 pmap_invalidate_page(pmap, pv->pv_va); 7124 } 7125 PMAP_UNLOCK(pmap); 7126 } 7127 rw_wunlock(lock); 7128 } 7129 7130 /* 7131 * Miscellaneous support routines follow 7132 */ 7133 7134 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 7135 static __inline void 7136 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 7137 { 7138 u_int opte, npte; 7139 7140 /* 7141 * The cache mode bits are all in the low 32-bits of the 7142 * PTE, so we can just spin on updating the low 32-bits. 7143 */ 7144 do { 7145 opte = *(u_int *)pte; 7146 npte = opte & ~mask; 7147 npte |= cache_bits; 7148 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 7149 } 7150 7151 /* Adjust the cache mode for a 2MB page mapped via a PDE. */ 7152 static __inline void 7153 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 7154 { 7155 u_int opde, npde; 7156 7157 /* 7158 * The cache mode bits are all in the low 32-bits of the 7159 * PDE, so we can just spin on updating the low 32-bits. 7160 */ 7161 do { 7162 opde = *(u_int *)pde; 7163 npde = opde & ~mask; 7164 npde |= cache_bits; 7165 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 7166 } 7167 7168 /* 7169 * Map a set of physical memory pages into the kernel virtual 7170 * address space. Return a pointer to where it is mapped. This 7171 * routine is intended to be used for mapping device memory, 7172 * NOT real memory. 7173 */ 7174 static void * 7175 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, bool noflush) 7176 { 7177 struct pmap_preinit_mapping *ppim; 7178 vm_offset_t va, offset; 7179 vm_size_t tmpsize; 7180 int i; 7181 7182 offset = pa & PAGE_MASK; 7183 size = round_page(offset + size); 7184 pa = trunc_page(pa); 7185 7186 if (!pmap_initialized) { 7187 va = 0; 7188 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7189 ppim = pmap_preinit_mapping + i; 7190 if (ppim->va == 0) { 7191 ppim->pa = pa; 7192 ppim->sz = size; 7193 ppim->mode = mode; 7194 ppim->va = virtual_avail; 7195 virtual_avail += size; 7196 va = ppim->va; 7197 break; 7198 } 7199 } 7200 if (va == 0) 7201 panic("%s: too many preinit mappings", __func__); 7202 } else { 7203 /* 7204 * If we have a preinit mapping, re-use it. 
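 *
 * Preinit mappings are the ones recorded in pmap_preinit_mapping[] by
 * the !pmap_initialized branch above; a request with the same physical
 * address, size, and mode simply returns the existing virtual address.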
7205 */ 7206 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7207 ppim = pmap_preinit_mapping + i; 7208 if (ppim->pa == pa && ppim->sz == size && 7209 ppim->mode == mode) 7210 return ((void *)(ppim->va + offset)); 7211 } 7212 /* 7213 * If the specified range of physical addresses fits within 7214 * the direct map window, use the direct map. 7215 */ 7216 if (pa < dmaplimit && pa + size <= dmaplimit) { 7217 va = PHYS_TO_DMAP(pa); 7218 PMAP_LOCK(kernel_pmap); 7219 i = pmap_change_attr_locked(va, size, mode, noflush); 7220 PMAP_UNLOCK(kernel_pmap); 7221 if (!i) 7222 return ((void *)(va + offset)); 7223 } 7224 va = kva_alloc(size); 7225 if (va == 0) 7226 panic("%s: Couldn't allocate KVA", __func__); 7227 } 7228 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 7229 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 7230 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 7231 if (!noflush) 7232 pmap_invalidate_cache_range(va, va + tmpsize); 7233 return ((void *)(va + offset)); 7234 } 7235 7236 void * 7237 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 7238 { 7239 7240 return (pmap_mapdev_internal(pa, size, mode, false)); 7241 } 7242 7243 void * 7244 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 7245 { 7246 7247 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, false)); 7248 } 7249 7250 void * 7251 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 7252 { 7253 7254 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, true)); 7255 } 7256 7257 void * 7258 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 7259 { 7260 7261 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, false)); 7262 } 7263 7264 void 7265 pmap_unmapdev(vm_offset_t va, vm_size_t size) 7266 { 7267 struct pmap_preinit_mapping *ppim; 7268 vm_offset_t offset; 7269 int i; 7270 7271 /* If we gave a direct map region in pmap_mapdev, do nothing */ 7272 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 7273 return; 7274 offset = va & PAGE_MASK; 7275 size = round_page(offset + size); 7276 va = trunc_page(va); 7277 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7278 ppim = pmap_preinit_mapping + i; 7279 if (ppim->va == va && ppim->sz == size) { 7280 if (pmap_initialized) 7281 return; 7282 ppim->pa = 0; 7283 ppim->va = 0; 7284 ppim->sz = 0; 7285 ppim->mode = 0; 7286 if (va + size == virtual_avail) 7287 virtual_avail = va; 7288 return; 7289 } 7290 } 7291 if (pmap_initialized) 7292 kva_free(va, size); 7293 } 7294 7295 /* 7296 * Tries to demote a 1GB page mapping. 
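 *
 * Demotion replaces the single 1GB mapping with a newly allocated page
 * directory page holding NPDEPG (512) 2MB mappings that cover the same
 * physical range (512 * NBPDR == NBPDP). If that page cannot be
 * allocated, the 1GB mapping is left intact and FALSE is returned.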
7297 */ 7298 static boolean_t 7299 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 7300 { 7301 pdp_entry_t newpdpe, oldpdpe; 7302 pd_entry_t *firstpde, newpde, *pde; 7303 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7304 vm_paddr_t pdpgpa; 7305 vm_page_t pdpg; 7306 7307 PG_A = pmap_accessed_bit(pmap); 7308 PG_M = pmap_modified_bit(pmap); 7309 PG_V = pmap_valid_bit(pmap); 7310 PG_RW = pmap_rw_bit(pmap); 7311 7312 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7313 oldpdpe = *pdpe; 7314 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 7315 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 7316 if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 7317 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 7318 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 7319 " in pmap %p", va, pmap); 7320 return (FALSE); 7321 } 7322 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 7323 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 7324 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 7325 KASSERT((oldpdpe & PG_A) != 0, 7326 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 7327 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 7328 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 7329 newpde = oldpdpe; 7330 7331 /* 7332 * Initialize the page directory page. 7333 */ 7334 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 7335 *pde = newpde; 7336 newpde += NBPDR; 7337 } 7338 7339 /* 7340 * Demote the mapping. 7341 */ 7342 *pdpe = newpdpe; 7343 7344 /* 7345 * Invalidate a stale recursive mapping of the page directory page. 7346 */ 7347 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 7348 7349 pmap_pdpe_demotions++; 7350 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 7351 " in pmap %p", va, pmap); 7352 return (TRUE); 7353 } 7354 7355 /* 7356 * Sets the memory attribute for the specified page. 7357 */ 7358 void 7359 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 7360 { 7361 7362 m->md.pat_mode = ma; 7363 7364 /* 7365 * If "m" is a normal page, update its direct mapping. This update 7366 * can be relied upon to perform any cache operations that are 7367 * required for data coherence. 7368 */ 7369 if ((m->flags & PG_FICTITIOUS) == 0 && 7370 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 7371 m->md.pat_mode)) 7372 panic("memory attribute change on the direct map failed"); 7373 } 7374 7375 /* 7376 * Changes the specified virtual address range's memory type to that given by 7377 * the parameter "mode". The specified virtual address range must be 7378 * completely contained within either the direct map or the kernel map. If 7379 * the virtual address range is contained within the kernel map, then the 7380 * memory type for each of the corresponding ranges of the direct map is also 7381 * changed. (The corresponding ranges of the direct map are those ranges that 7382 * map the same physical pages as the specified virtual address range.) These 7383 * changes to the direct map are necessary because Intel describes the 7384 * behavior of their processors as "undefined" if two or more mappings to the 7385 * same physical page have different memory types. 7386 * 7387 * Returns zero if the change completed successfully, and either EINVAL or 7388 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 7389 * of the virtual address range was not mapped, and ENOMEM is returned if 7390 * there was insufficient memory available to complete the change. 
In the 7391 * latter case, the memory type may have been changed on some part of the 7392 * virtual address range or the direct map. 7393 */ 7394 int 7395 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 7396 { 7397 int error; 7398 7399 PMAP_LOCK(kernel_pmap); 7400 error = pmap_change_attr_locked(va, size, mode, false); 7401 PMAP_UNLOCK(kernel_pmap); 7402 return (error); 7403 } 7404 7405 static int 7406 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool noflush) 7407 { 7408 vm_offset_t base, offset, tmpva; 7409 vm_paddr_t pa_start, pa_end, pa_end1; 7410 pdp_entry_t *pdpe; 7411 pd_entry_t *pde; 7412 pt_entry_t *pte; 7413 int cache_bits_pte, cache_bits_pde, error; 7414 boolean_t changed; 7415 7416 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 7417 base = trunc_page(va); 7418 offset = va & PAGE_MASK; 7419 size = round_page(offset + size); 7420 7421 /* 7422 * Only supported on kernel virtual addresses, including the direct 7423 * map but excluding the recursive map. 7424 */ 7425 if (base < DMAP_MIN_ADDRESS) 7426 return (EINVAL); 7427 7428 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 7429 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 7430 changed = FALSE; 7431 7432 /* 7433 * Pages that aren't mapped aren't supported. Also break down 2MB pages 7434 * into 4KB pages if required. 7435 */ 7436 for (tmpva = base; tmpva < base + size; ) { 7437 pdpe = pmap_pdpe(kernel_pmap, tmpva); 7438 if (pdpe == NULL || *pdpe == 0) 7439 return (EINVAL); 7440 if (*pdpe & PG_PS) { 7441 /* 7442 * If the current 1GB page already has the required 7443 * memory type, then we need not demote this page. Just 7444 * increment tmpva to the next 1GB page frame. 7445 */ 7446 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 7447 tmpva = trunc_1gpage(tmpva) + NBPDP; 7448 continue; 7449 } 7450 7451 /* 7452 * If the current offset aligns with a 1GB page frame 7453 * and there is at least 1GB left within the range, then 7454 * we need not break down this page into 2MB pages. 7455 */ 7456 if ((tmpva & PDPMASK) == 0 && 7457 tmpva + PDPMASK < base + size) { 7458 tmpva += NBPDP; 7459 continue; 7460 } 7461 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 7462 return (ENOMEM); 7463 } 7464 pde = pmap_pdpe_to_pde(pdpe, tmpva); 7465 if (*pde == 0) 7466 return (EINVAL); 7467 if (*pde & PG_PS) { 7468 /* 7469 * If the current 2MB page already has the required 7470 * memory type, then we need not demote this page. Just 7471 * increment tmpva to the next 2MB page frame. 7472 */ 7473 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 7474 tmpva = trunc_2mpage(tmpva) + NBPDR; 7475 continue; 7476 } 7477 7478 /* 7479 * If the current offset aligns with a 2MB page frame 7480 * and there is at least 2MB left within the range, then 7481 * we need not break down this page into 4KB pages. 7482 */ 7483 if ((tmpva & PDRMASK) == 0 && 7484 tmpva + PDRMASK < base + size) { 7485 tmpva += NBPDR; 7486 continue; 7487 } 7488 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 7489 return (ENOMEM); 7490 } 7491 pte = pmap_pde_to_pte(pde, tmpva); 7492 if (*pte == 0) 7493 return (EINVAL); 7494 tmpva += PAGE_SIZE; 7495 } 7496 error = 0; 7497 7498 /* 7499 * Ok, all the pages exist, so run through them updating their 7500 * cache mode if required. 
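 *
 * While walking, physically contiguous pages reached through the kernel
 * map are accumulated into a run [pa_start, pa_end); whenever a run ends
 * (and once more after the loop), the same attribute change is applied
 * to the matching direct map range with a recursive call of the form
 *
 *	pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 *	    pa_end - pa_start, mode, noflush);
 *
 * so that no physical page is left with conflicting memory types.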
7501 */ 7502 pa_start = pa_end = 0; 7503 for (tmpva = base; tmpva < base + size; ) { 7504 pdpe = pmap_pdpe(kernel_pmap, tmpva); 7505 if (*pdpe & PG_PS) { 7506 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 7507 pmap_pde_attr(pdpe, cache_bits_pde, 7508 X86_PG_PDE_CACHE); 7509 changed = TRUE; 7510 } 7511 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 7512 (*pdpe & PG_PS_FRAME) < dmaplimit) { 7513 if (pa_start == pa_end) { 7514 /* Start physical address run. */ 7515 pa_start = *pdpe & PG_PS_FRAME; 7516 pa_end = pa_start + NBPDP; 7517 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 7518 pa_end += NBPDP; 7519 else { 7520 /* Run ended, update direct map. */ 7521 error = pmap_change_attr_locked( 7522 PHYS_TO_DMAP(pa_start), 7523 pa_end - pa_start, mode, noflush); 7524 if (error != 0) 7525 break; 7526 /* Start physical address run. */ 7527 pa_start = *pdpe & PG_PS_FRAME; 7528 pa_end = pa_start + NBPDP; 7529 } 7530 } 7531 tmpva = trunc_1gpage(tmpva) + NBPDP; 7532 continue; 7533 } 7534 pde = pmap_pdpe_to_pde(pdpe, tmpva); 7535 if (*pde & PG_PS) { 7536 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 7537 pmap_pde_attr(pde, cache_bits_pde, 7538 X86_PG_PDE_CACHE); 7539 changed = TRUE; 7540 } 7541 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 7542 (*pde & PG_PS_FRAME) < dmaplimit) { 7543 if (pa_start == pa_end) { 7544 /* Start physical address run. */ 7545 pa_start = *pde & PG_PS_FRAME; 7546 pa_end = pa_start + NBPDR; 7547 } else if (pa_end == (*pde & PG_PS_FRAME)) 7548 pa_end += NBPDR; 7549 else { 7550 /* Run ended, update direct map. */ 7551 error = pmap_change_attr_locked( 7552 PHYS_TO_DMAP(pa_start), 7553 pa_end - pa_start, mode, noflush); 7554 if (error != 0) 7555 break; 7556 /* Start physical address run. */ 7557 pa_start = *pde & PG_PS_FRAME; 7558 pa_end = pa_start + NBPDR; 7559 } 7560 } 7561 tmpva = trunc_2mpage(tmpva) + NBPDR; 7562 } else { 7563 pte = pmap_pde_to_pte(pde, tmpva); 7564 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 7565 pmap_pte_attr(pte, cache_bits_pte, 7566 X86_PG_PTE_CACHE); 7567 changed = TRUE; 7568 } 7569 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 7570 (*pte & PG_FRAME) < dmaplimit) { 7571 if (pa_start == pa_end) { 7572 /* Start physical address run. */ 7573 pa_start = *pte & PG_FRAME; 7574 pa_end = pa_start + PAGE_SIZE; 7575 } else if (pa_end == (*pte & PG_FRAME)) 7576 pa_end += PAGE_SIZE; 7577 else { 7578 /* Run ended, update direct map. */ 7579 error = pmap_change_attr_locked( 7580 PHYS_TO_DMAP(pa_start), 7581 pa_end - pa_start, mode, noflush); 7582 if (error != 0) 7583 break; 7584 /* Start physical address run. */ 7585 pa_start = *pte & PG_FRAME; 7586 pa_end = pa_start + PAGE_SIZE; 7587 } 7588 } 7589 tmpva += PAGE_SIZE; 7590 } 7591 } 7592 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 7593 pa_end1 = MIN(pa_end, dmaplimit); 7594 if (pa_start != pa_end1) 7595 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 7596 pa_end1 - pa_start, mode, noflush); 7597 } 7598 7599 /* 7600 * Flush CPU caches if required to make sure any data isn't cached that 7601 * shouldn't be, etc. 7602 */ 7603 if (changed) { 7604 pmap_invalidate_range(kernel_pmap, base, tmpva); 7605 if (!noflush) 7606 pmap_invalidate_cache_range(base, tmpva); 7607 } 7608 return (error); 7609 } 7610 7611 /* 7612 * Demotes any mapping within the direct map region that covers more than the 7613 * specified range of physical addresses. This range's size must be a power 7614 * of two and its starting address must be a multiple of its size. 
Since the 7615 * demotion does not change any attributes of the mapping, a TLB invalidation 7616 * is not mandatory. The caller may, however, request a TLB invalidation. 7617 */ 7618 void 7619 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 7620 { 7621 pdp_entry_t *pdpe; 7622 pd_entry_t *pde; 7623 vm_offset_t va; 7624 boolean_t changed; 7625 7626 if (len == 0) 7627 return; 7628 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 7629 KASSERT((base & (len - 1)) == 0, 7630 ("pmap_demote_DMAP: base is not a multiple of len")); 7631 if (len < NBPDP && base < dmaplimit) { 7632 va = PHYS_TO_DMAP(base); 7633 changed = FALSE; 7634 PMAP_LOCK(kernel_pmap); 7635 pdpe = pmap_pdpe(kernel_pmap, va); 7636 if ((*pdpe & X86_PG_V) == 0) 7637 panic("pmap_demote_DMAP: invalid PDPE"); 7638 if ((*pdpe & PG_PS) != 0) { 7639 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 7640 panic("pmap_demote_DMAP: PDPE failed"); 7641 changed = TRUE; 7642 } 7643 if (len < NBPDR) { 7644 pde = pmap_pdpe_to_pde(pdpe, va); 7645 if ((*pde & X86_PG_V) == 0) 7646 panic("pmap_demote_DMAP: invalid PDE"); 7647 if ((*pde & PG_PS) != 0) { 7648 if (!pmap_demote_pde(kernel_pmap, pde, va)) 7649 panic("pmap_demote_DMAP: PDE failed"); 7650 changed = TRUE; 7651 } 7652 } 7653 if (changed && invalidate) 7654 pmap_invalidate_page(kernel_pmap, va); 7655 PMAP_UNLOCK(kernel_pmap); 7656 } 7657 } 7658 7659 /* 7660 * perform the pmap work for mincore 7661 */ 7662 int 7663 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 7664 { 7665 pd_entry_t *pdep; 7666 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 7667 vm_paddr_t pa; 7668 int val; 7669 7670 PG_A = pmap_accessed_bit(pmap); 7671 PG_M = pmap_modified_bit(pmap); 7672 PG_V = pmap_valid_bit(pmap); 7673 PG_RW = pmap_rw_bit(pmap); 7674 7675 PMAP_LOCK(pmap); 7676 retry: 7677 pdep = pmap_pde(pmap, addr); 7678 if (pdep != NULL && (*pdep & PG_V)) { 7679 if (*pdep & PG_PS) { 7680 pte = *pdep; 7681 /* Compute the physical address of the 4KB page. */ 7682 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 7683 PG_FRAME; 7684 val = MINCORE_SUPER; 7685 } else { 7686 pte = *pmap_pde_to_pte(pdep, addr); 7687 pa = pte & PG_FRAME; 7688 val = 0; 7689 } 7690 } else { 7691 pte = 0; 7692 pa = 0; 7693 val = 0; 7694 } 7695 if ((pte & PG_V) != 0) { 7696 val |= MINCORE_INCORE; 7697 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7698 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 7699 if ((pte & PG_A) != 0) 7700 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 7701 } 7702 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 7703 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 7704 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 7705 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 7706 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 7707 goto retry; 7708 } else 7709 PA_UNLOCK_COND(*locked_pa); 7710 PMAP_UNLOCK(pmap); 7711 return (val); 7712 } 7713 7714 static uint64_t 7715 pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 7716 { 7717 uint32_t gen, new_gen, pcid_next; 7718 7719 CRITICAL_ASSERT(curthread); 7720 gen = PCPU_GET(pcid_gen); 7721 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) 7722 return (pti ? 
0 : CR3_PCID_SAVE); 7723 if (pmap->pm_pcids[cpuid].pm_gen == gen) 7724 return (CR3_PCID_SAVE); 7725 pcid_next = PCPU_GET(pcid_next); 7726 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 7727 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 7728 ("cpu %d pcid_next %#x", cpuid, pcid_next)); 7729 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 7730 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 7731 new_gen = gen + 1; 7732 if (new_gen == 0) 7733 new_gen = 1; 7734 PCPU_SET(pcid_gen, new_gen); 7735 pcid_next = PMAP_PCID_KERN + 1; 7736 } else { 7737 new_gen = gen; 7738 } 7739 pmap->pm_pcids[cpuid].pm_pcid = pcid_next; 7740 pmap->pm_pcids[cpuid].pm_gen = new_gen; 7741 PCPU_SET(pcid_next, pcid_next + 1); 7742 return (0); 7743 } 7744 7745 static uint64_t 7746 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) 7747 { 7748 uint64_t cached; 7749 7750 cached = pmap_pcid_alloc(pmap, cpuid); 7751 KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, 7752 ("pmap %p cpu %d pcid %#x", pmap, cpuid, 7753 pmap->pm_pcids[cpuid].pm_pcid)); 7754 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || 7755 pmap == kernel_pmap, 7756 ("non-kernel pmap pmap %p cpu %d pcid %#x", 7757 pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); 7758 return (cached); 7759 } 7760 7761 static void 7762 pmap_activate_sw_pti_post(pmap_t pmap) 7763 { 7764 7765 if (pmap->pm_ucr3 != PMAP_NO_CR3) 7766 PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) + 7767 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful; 7768 } 7769 7770 static void inline 7771 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) 7772 { 7773 struct invpcid_descr d; 7774 uint64_t cached, cr3, kcr3, ucr3; 7775 7776 cached = pmap_pcid_alloc_checked(pmap, cpuid); 7777 cr3 = rcr3(); 7778 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 7779 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); 7780 PCPU_SET(curpmap, pmap); 7781 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; 7782 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | 7783 PMAP_PCID_USER_PT; 7784 7785 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { 7786 /* 7787 * Explicitly invalidate translations cached from the 7788 * user page table. They are not automatically 7789 * flushed by reload of cr3 with the kernel page table 7790 * pointer above. 7791 * 7792 * Note that the if() condition is resolved statically 7793 * by using the function argument instead of 7794 * runtime-evaluated invpcid_works value. 7795 */ 7796 if (invpcid_works1) { 7797 d.pcid = PMAP_PCID_USER_PT | 7798 pmap->pm_pcids[cpuid].pm_pcid; 7799 d.pad = 0; 7800 d.addr = 0; 7801 invpcid(&d, INVPCID_CTX); 7802 } else { 7803 pmap_pti_pcid_invalidate(ucr3, kcr3); 7804 } 7805 } 7806 7807 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 7808 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 7809 if (cached) 7810 PCPU_INC(pm_save_cnt); 7811 } 7812 7813 static void 7814 pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid) 7815 { 7816 7817 pmap_activate_sw_pcid_pti(pmap, cpuid, true); 7818 pmap_activate_sw_pti_post(pmap); 7819 } 7820 7821 static void 7822 pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid) 7823 { 7824 register_t rflags; 7825 7826 /* 7827 * If the INVPCID instruction is not available, 7828 * invltlb_pcid_handler() is used to handle an invalidate_all 7829 * IPI, which checks for curpmap == smp_tlb_pmap. The below 7830 * sequence of operations has a window where %CR3 is loaded 7831 * with the new pmap's PML4 address, but the curpmap value has 7832 * not yet been updated. 
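 * If an invalidate-all IPI targeting the new pmap is delivered inside
 * that window, the handler still finds the old pmap in curpmap, so the
 * curpmap == smp_tlb_pmap check fails.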
This causes the invltlb IPI handler, 7833 * which is called between the updates, to execute as a NOP, 7834 * which leaves stale TLB entries. 7835 * 7836 * Note that the most typical use of pmap_activate_sw(), from 7837 * the context switch, is immune to this race, because 7838 * interrupts are disabled (while the thread lock is owned), 7839 * and the IPI happens after curpmap is updated. Protect 7840 * other callers in a similar way, by disabling interrupts 7841 * around the %cr3 register reload and curpmap assignment. 7842 */ 7843 rflags = intr_disable(); 7844 pmap_activate_sw_pcid_pti(pmap, cpuid, false); 7845 intr_restore(rflags); 7846 pmap_activate_sw_pti_post(pmap); 7847 } 7848 7849 static void 7850 pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid) 7851 { 7852 uint64_t cached, cr3; 7853 7854 cached = pmap_pcid_alloc_checked(pmap, cpuid); 7855 cr3 = rcr3(); 7856 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 7857 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 7858 cached); 7859 PCPU_SET(curpmap, pmap); 7860 if (cached) 7861 PCPU_INC(pm_save_cnt); 7862 } 7863 7864 static void 7865 pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid) 7866 { 7867 register_t rflags; 7868 7869 rflags = intr_disable(); 7870 pmap_activate_sw_pcid_nopti(pmap, cpuid); 7871 intr_restore(rflags); 7872 } 7873 7874 static void 7875 pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused) 7876 { 7877 7878 load_cr3(pmap->pm_cr3); 7879 PCPU_SET(curpmap, pmap); 7880 } 7881 7882 static void 7883 pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused) 7884 { 7885 7886 pmap_activate_sw_nopcid_nopti(pmap, cpuid); 7887 PCPU_SET(kcr3, pmap->pm_cr3); 7888 PCPU_SET(ucr3, pmap->pm_ucr3); 7889 pmap_activate_sw_pti_post(pmap); 7890 } 7891 7892 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static) 7893 { 7894 7895 if (pmap_pcid_enabled && pti && invpcid_works) 7896 return (pmap_activate_sw_pcid_invpcid_pti); 7897 else if (pmap_pcid_enabled && pti && !invpcid_works) 7898 return (pmap_activate_sw_pcid_noinvpcid_pti); 7899 else if (pmap_pcid_enabled && !pti && invpcid_works) 7900 return (pmap_activate_sw_pcid_nopti); 7901 else if (pmap_pcid_enabled && !pti && !invpcid_works) 7902 return (pmap_activate_sw_pcid_noinvpcid_nopti); 7903 else if (!pmap_pcid_enabled && pti) 7904 return (pmap_activate_sw_nopcid_pti); 7905 else /* if (!pmap_pcid_enabled && !pti) */ 7906 return (pmap_activate_sw_nopcid_nopti); 7907 } 7908 7909 void 7910 pmap_activate_sw(struct thread *td) 7911 { 7912 pmap_t oldpmap, pmap; 7913 u_int cpuid; 7914 7915 oldpmap = PCPU_GET(curpmap); 7916 pmap = vmspace_pmap(td->td_proc->p_vmspace); 7917 if (oldpmap == pmap) 7918 return; 7919 cpuid = PCPU_GET(cpuid); 7920 #ifdef SMP 7921 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 7922 #else 7923 CPU_SET(cpuid, &pmap->pm_active); 7924 #endif 7925 pmap_activate_sw_mode(pmap, cpuid); 7926 #ifdef SMP 7927 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 7928 #else 7929 CPU_CLR(cpuid, &oldpmap->pm_active); 7930 #endif 7931 } 7932 7933 void 7934 pmap_activate(struct thread *td) 7935 { 7936 7937 critical_enter(); 7938 pmap_activate_sw(td); 7939 critical_exit(); 7940 } 7941 7942 void 7943 pmap_activate_boot(pmap_t pmap) 7944 { 7945 uint64_t kcr3; 7946 u_int cpuid; 7947 7948 /* 7949 * kernel_pmap must be never deactivated, and we ensure that 7950 * by never activating it at all. 
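 * The kernel address space stays reachable regardless, because every
 * pmap shares the kernel portion of the page tables through copies of
 * the kernel PML4 entries.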
7951 */ 7952 MPASS(pmap != kernel_pmap); 7953 7954 cpuid = PCPU_GET(cpuid); 7955 #ifdef SMP 7956 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 7957 #else 7958 CPU_SET(cpuid, &pmap->pm_active); 7959 #endif 7960 PCPU_SET(curpmap, pmap); 7961 if (pti) { 7962 kcr3 = pmap->pm_cr3; 7963 if (pmap_pcid_enabled) 7964 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; 7965 } else { 7966 kcr3 = PMAP_NO_CR3; 7967 } 7968 PCPU_SET(kcr3, kcr3); 7969 PCPU_SET(ucr3, PMAP_NO_CR3); 7970 } 7971 7972 void 7973 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 7974 { 7975 } 7976 7977 /* 7978 * Increase the starting virtual address of the given mapping if a 7979 * different alignment might result in more superpage mappings. 7980 */ 7981 void 7982 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 7983 vm_offset_t *addr, vm_size_t size) 7984 { 7985 vm_offset_t superpage_offset; 7986 7987 if (size < NBPDR) 7988 return; 7989 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 7990 offset += ptoa(object->pg_color); 7991 superpage_offset = offset & PDRMASK; 7992 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 7993 (*addr & PDRMASK) == superpage_offset) 7994 return; 7995 if ((*addr & PDRMASK) < superpage_offset) 7996 *addr = (*addr & ~PDRMASK) + superpage_offset; 7997 else 7998 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 7999 } 8000 8001 #ifdef INVARIANTS 8002 static unsigned long num_dirty_emulations; 8003 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 8004 &num_dirty_emulations, 0, NULL); 8005 8006 static unsigned long num_accessed_emulations; 8007 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 8008 &num_accessed_emulations, 0, NULL); 8009 8010 static unsigned long num_superpage_accessed_emulations; 8011 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 8012 &num_superpage_accessed_emulations, 0, NULL); 8013 8014 static unsigned long ad_emulation_superpage_promotions; 8015 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 8016 &ad_emulation_superpage_promotions, 0, NULL); 8017 #endif /* INVARIANTS */ 8018 8019 int 8020 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 8021 { 8022 int rv; 8023 struct rwlock *lock; 8024 #if VM_NRESERVLEVEL > 0 8025 vm_page_t m, mpte; 8026 #endif 8027 pd_entry_t *pde; 8028 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 8029 8030 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 8031 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 8032 8033 if (!pmap_emulate_ad_bits(pmap)) 8034 return (-1); 8035 8036 PG_A = pmap_accessed_bit(pmap); 8037 PG_M = pmap_modified_bit(pmap); 8038 PG_V = pmap_valid_bit(pmap); 8039 PG_RW = pmap_rw_bit(pmap); 8040 8041 rv = -1; 8042 lock = NULL; 8043 PMAP_LOCK(pmap); 8044 8045 pde = pmap_pde(pmap, va); 8046 if (pde == NULL || (*pde & PG_V) == 0) 8047 goto done; 8048 8049 if ((*pde & PG_PS) != 0) { 8050 if (ftype == VM_PROT_READ) { 8051 #ifdef INVARIANTS 8052 atomic_add_long(&num_superpage_accessed_emulations, 1); 8053 #endif 8054 *pde |= PG_A; 8055 rv = 0; 8056 } 8057 goto done; 8058 } 8059 8060 pte = pmap_pde_to_pte(pde, va); 8061 if ((*pte & PG_V) == 0) 8062 goto done; 8063 8064 if (ftype == VM_PROT_WRITE) { 8065 if ((*pte & PG_RW) == 0) 8066 goto done; 8067 /* 8068 * Set the modified and accessed bits simultaneously. 8069 * 8070 * Intel EPT PTEs that do software emulation of A/D bits map 8071 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 
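 * Setting only PG_M here would therefore produce a PTE that is writable
 * but not readable.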
8072 * An EPT misconfiguration is triggered if the PTE is writable 8073 * but not readable (WR=10). This is avoided by setting PG_A 8074 * and PG_M simultaneously. 8075 */ 8076 *pte |= PG_M | PG_A; 8077 } else { 8078 *pte |= PG_A; 8079 } 8080 8081 #if VM_NRESERVLEVEL > 0 8082 /* try to promote the mapping */ 8083 if (va < VM_MAXUSER_ADDRESS) 8084 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 8085 else 8086 mpte = NULL; 8087 8088 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 8089 8090 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 8091 pmap_ps_enabled(pmap) && 8092 (m->flags & PG_FICTITIOUS) == 0 && 8093 vm_reserv_level_iffullpop(m) == 0) { 8094 pmap_promote_pde(pmap, pde, va, &lock); 8095 #ifdef INVARIANTS 8096 atomic_add_long(&ad_emulation_superpage_promotions, 1); 8097 #endif 8098 } 8099 #endif 8100 8101 #ifdef INVARIANTS 8102 if (ftype == VM_PROT_WRITE) 8103 atomic_add_long(&num_dirty_emulations, 1); 8104 else 8105 atomic_add_long(&num_accessed_emulations, 1); 8106 #endif 8107 rv = 0; /* success */ 8108 done: 8109 if (lock != NULL) 8110 rw_wunlock(lock); 8111 PMAP_UNLOCK(pmap); 8112 return (rv); 8113 } 8114 8115 void 8116 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 8117 { 8118 pml4_entry_t *pml4; 8119 pdp_entry_t *pdp; 8120 pd_entry_t *pde; 8121 pt_entry_t *pte, PG_V; 8122 int idx; 8123 8124 idx = 0; 8125 PG_V = pmap_valid_bit(pmap); 8126 PMAP_LOCK(pmap); 8127 8128 pml4 = pmap_pml4e(pmap, va); 8129 ptr[idx++] = *pml4; 8130 if ((*pml4 & PG_V) == 0) 8131 goto done; 8132 8133 pdp = pmap_pml4e_to_pdpe(pml4, va); 8134 ptr[idx++] = *pdp; 8135 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 8136 goto done; 8137 8138 pde = pmap_pdpe_to_pde(pdp, va); 8139 ptr[idx++] = *pde; 8140 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 8141 goto done; 8142 8143 pte = pmap_pde_to_pte(pde, va); 8144 ptr[idx++] = *pte; 8145 8146 done: 8147 PMAP_UNLOCK(pmap); 8148 *num = idx; 8149 } 8150 8151 /** 8152 * Get the kernel virtual address of a set of physical pages. If there are 8153 * physical addresses not covered by the DMAP perform a transient mapping 8154 * that will be removed when calling pmap_unmap_io_transient. 8155 * 8156 * \param page The pages the caller wishes to obtain the virtual 8157 * address on the kernel memory map. 8158 * \param vaddr On return contains the kernel virtual memory address 8159 * of the pages passed in the page parameter. 8160 * \param count Number of pages passed in. 8161 * \param can_fault TRUE if the thread using the mapped pages can take 8162 * page faults, FALSE otherwise. 8163 * 8164 * \returns TRUE if the caller must call pmap_unmap_io_transient when 8165 * finished or FALSE otherwise. 8166 * 8167 */ 8168 boolean_t 8169 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 8170 boolean_t can_fault) 8171 { 8172 vm_paddr_t paddr; 8173 boolean_t needs_mapping; 8174 pt_entry_t *pte; 8175 int cache_bits, error __unused, i; 8176 8177 /* 8178 * Allocate any KVA space that we need, this is done in a separate 8179 * loop to prevent calling vmem_alloc while pinned. 
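 * (vmem_alloc() is invoked with M_WAITOK and may sleep; we prefer not
 * to sleep while pinned to a CPU by sched_pin().)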
8180 */ 8181 needs_mapping = FALSE; 8182 for (i = 0; i < count; i++) { 8183 paddr = VM_PAGE_TO_PHYS(page[i]); 8184 if (__predict_false(paddr >= dmaplimit)) { 8185 error = vmem_alloc(kernel_arena, PAGE_SIZE, 8186 M_BESTFIT | M_WAITOK, &vaddr[i]); 8187 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 8188 needs_mapping = TRUE; 8189 } else { 8190 vaddr[i] = PHYS_TO_DMAP(paddr); 8191 } 8192 } 8193 8194 /* Exit early if everything is covered by the DMAP */ 8195 if (!needs_mapping) 8196 return (FALSE); 8197 8198 /* 8199 * NB: The sequence of updating a page table followed by accesses 8200 * to the corresponding pages used in the !DMAP case is subject to 8201 * the situation described in the "AMD64 Architecture Programmer's 8202 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 8203 * Coherency Considerations". Therefore, issuing the INVLPG right 8204 * after modifying the PTE bits is crucial. 8205 */ 8206 if (!can_fault) 8207 sched_pin(); 8208 for (i = 0; i < count; i++) { 8209 paddr = VM_PAGE_TO_PHYS(page[i]); 8210 if (paddr >= dmaplimit) { 8211 if (can_fault) { 8212 /* 8213 * Slow path, since we can get page faults 8214 * while mappings are active don't pin the 8215 * thread to the CPU and instead add a global 8216 * mapping visible to all CPUs. 8217 */ 8218 pmap_qenter(vaddr[i], &page[i], 1); 8219 } else { 8220 pte = vtopte(vaddr[i]); 8221 cache_bits = pmap_cache_bits(kernel_pmap, 8222 page[i]->md.pat_mode, 0); 8223 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 8224 cache_bits); 8225 invlpg(vaddr[i]); 8226 } 8227 } 8228 } 8229 8230 return (needs_mapping); 8231 } 8232 8233 void 8234 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 8235 boolean_t can_fault) 8236 { 8237 vm_paddr_t paddr; 8238 int i; 8239 8240 if (!can_fault) 8241 sched_unpin(); 8242 for (i = 0; i < count; i++) { 8243 paddr = VM_PAGE_TO_PHYS(page[i]); 8244 if (paddr >= dmaplimit) { 8245 if (can_fault) 8246 pmap_qremove(vaddr[i], 1); 8247 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 8248 } 8249 } 8250 } 8251 8252 vm_offset_t 8253 pmap_quick_enter_page(vm_page_t m) 8254 { 8255 vm_paddr_t paddr; 8256 8257 paddr = VM_PAGE_TO_PHYS(m); 8258 if (paddr < dmaplimit) 8259 return (PHYS_TO_DMAP(paddr)); 8260 mtx_lock_spin(&qframe_mtx); 8261 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 8262 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 8263 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 8264 return (qframe); 8265 } 8266 8267 void 8268 pmap_quick_remove_page(vm_offset_t addr) 8269 { 8270 8271 if (addr != qframe) 8272 return; 8273 pte_store(vtopte(qframe), 0); 8274 invlpg(qframe); 8275 mtx_unlock_spin(&qframe_mtx); 8276 } 8277 8278 /* 8279 * Pdp pages from the large map are managed differently from either 8280 * kernel or user page table pages. They are permanently allocated at 8281 * initialization time, and their wire count is permanently set to 8282 * zero. The pml4 entries pointing to those pages are copied into 8283 * each allocated pmap. 8284 * 8285 * In contrast, pd and pt pages are managed like user page table 8286 * pages. They are dynamically allocated, and their wire count 8287 * represents the number of valid entries within the page. 
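 * When that wire count drops to zero, the page is freed and the entry
 * pointing to it at the next higher level is cleared; see
 * pmap_large_unmap().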
8288 */ 8289 static vm_page_t 8290 pmap_large_map_getptp_unlocked(void) 8291 { 8292 vm_page_t m; 8293 8294 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 8295 VM_ALLOC_ZERO); 8296 if (m != NULL && (m->flags & PG_ZERO) == 0) 8297 pmap_zero_page(m); 8298 return (m); 8299 } 8300 8301 static vm_page_t 8302 pmap_large_map_getptp(void) 8303 { 8304 vm_page_t m; 8305 8306 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 8307 m = pmap_large_map_getptp_unlocked(); 8308 if (m == NULL) { 8309 PMAP_UNLOCK(kernel_pmap); 8310 vm_wait(NULL); 8311 PMAP_LOCK(kernel_pmap); 8312 /* Callers retry. */ 8313 } 8314 return (m); 8315 } 8316 8317 static pdp_entry_t * 8318 pmap_large_map_pdpe(vm_offset_t va) 8319 { 8320 vm_pindex_t pml4_idx; 8321 vm_paddr_t mphys; 8322 8323 pml4_idx = pmap_pml4e_index(va); 8324 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 8325 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 8326 "%#jx lm_ents %d", 8327 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 8328 KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, 8329 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 8330 "LMSPML4I %#jx lm_ents %d", 8331 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 8332 mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; 8333 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 8334 } 8335 8336 static pd_entry_t * 8337 pmap_large_map_pde(vm_offset_t va) 8338 { 8339 pdp_entry_t *pdpe; 8340 vm_page_t m; 8341 vm_paddr_t mphys; 8342 8343 retry: 8344 pdpe = pmap_large_map_pdpe(va); 8345 if (*pdpe == 0) { 8346 m = pmap_large_map_getptp(); 8347 if (m == NULL) 8348 goto retry; 8349 mphys = VM_PAGE_TO_PHYS(m); 8350 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 8351 } else { 8352 MPASS((*pdpe & X86_PG_PS) == 0); 8353 mphys = *pdpe & PG_FRAME; 8354 } 8355 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 8356 } 8357 8358 static pt_entry_t * 8359 pmap_large_map_pte(vm_offset_t va) 8360 { 8361 pd_entry_t *pde; 8362 vm_page_t m; 8363 vm_paddr_t mphys; 8364 8365 retry: 8366 pde = pmap_large_map_pde(va); 8367 if (*pde == 0) { 8368 m = pmap_large_map_getptp(); 8369 if (m == NULL) 8370 goto retry; 8371 mphys = VM_PAGE_TO_PHYS(m); 8372 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 8373 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; 8374 } else { 8375 MPASS((*pde & X86_PG_PS) == 0); 8376 mphys = *pde & PG_FRAME; 8377 } 8378 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 8379 } 8380 8381 static int 8382 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 8383 vmem_addr_t *vmem_res) 8384 { 8385 8386 /* 8387 * Large mappings are all but static. Consequently, there 8388 * is no point in waiting for an earlier allocation to be 8389 * freed. 8390 */ 8391 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 8392 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 8393 } 8394 8395 int 8396 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 8397 vm_memattr_t mattr) 8398 { 8399 pdp_entry_t *pdpe; 8400 pd_entry_t *pde; 8401 pt_entry_t *pte; 8402 vm_offset_t va, inc; 8403 vmem_addr_t vmem_res; 8404 vm_paddr_t pa; 8405 int error; 8406 8407 if (len == 0 || spa + len < spa) 8408 return (EINVAL); 8409 8410 /* See if DMAP can serve. */ 8411 if (spa + len <= dmaplimit) { 8412 va = PHYS_TO_DMAP(spa); 8413 *addr = (void *)va; 8414 return (pmap_change_attr(va, len, mattr)); 8415 } 8416 8417 /* 8418 * No, allocate KVA. 
Fit the address with best possible 8419 * alignment for superpages. Fall back to worse align if 8420 * failed. 8421 */ 8422 error = ENOMEM; 8423 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 8424 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 8425 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 8426 &vmem_res); 8427 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 8428 NBPDR) + NBPDR) 8429 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 8430 &vmem_res); 8431 if (error != 0) 8432 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 8433 if (error != 0) 8434 return (error); 8435 8436 /* 8437 * Fill pagetable. PG_M is not pre-set, we scan modified bits 8438 * in the pagetable to minimize flushing. No need to 8439 * invalidate TLB, since we only update invalid entries. 8440 */ 8441 PMAP_LOCK(kernel_pmap); 8442 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 8443 len -= inc) { 8444 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 8445 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 8446 pdpe = pmap_large_map_pdpe(va); 8447 MPASS(*pdpe == 0); 8448 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 8449 X86_PG_V | X86_PG_A | pg_nx | 8450 pmap_cache_bits(kernel_pmap, mattr, TRUE); 8451 inc = NBPDP; 8452 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 8453 (va & PDRMASK) == 0) { 8454 pde = pmap_large_map_pde(va); 8455 MPASS(*pde == 0); 8456 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 8457 X86_PG_V | X86_PG_A | pg_nx | 8458 pmap_cache_bits(kernel_pmap, mattr, TRUE); 8459 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 8460 wire_count++; 8461 inc = NBPDR; 8462 } else { 8463 pte = pmap_large_map_pte(va); 8464 MPASS(*pte == 0); 8465 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 8466 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 8467 mattr, FALSE); 8468 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 8469 wire_count++; 8470 inc = PAGE_SIZE; 8471 } 8472 } 8473 PMAP_UNLOCK(kernel_pmap); 8474 MPASS(len == 0); 8475 8476 *addr = (void *)vmem_res; 8477 return (0); 8478 } 8479 8480 void 8481 pmap_large_unmap(void *svaa, vm_size_t len) 8482 { 8483 vm_offset_t sva, va; 8484 vm_size_t inc; 8485 pdp_entry_t *pdpe, pdp; 8486 pd_entry_t *pde, pd; 8487 pt_entry_t *pte; 8488 vm_page_t m; 8489 struct spglist spgf; 8490 8491 sva = (vm_offset_t)svaa; 8492 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 8493 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 8494 return; 8495 8496 SLIST_INIT(&spgf); 8497 KASSERT(LARGEMAP_MIN_ADDRESS <= sva && sva + len <= 8498 LARGEMAP_MAX_ADDRESS + NBPML4 * (u_long)lm_ents, 8499 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 8500 PMAP_LOCK(kernel_pmap); 8501 for (va = sva; va < sva + len; va += inc) { 8502 pdpe = pmap_large_map_pdpe(va); 8503 pdp = *pdpe; 8504 KASSERT((pdp & X86_PG_V) != 0, 8505 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 8506 (u_long)pdpe, pdp)); 8507 if ((pdp & X86_PG_PS) != 0) { 8508 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 8509 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 8510 (u_long)pdpe, pdp)); 8511 KASSERT((va & PDPMASK) == 0, 8512 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 8513 (u_long)pdpe, pdp)); 8514 KASSERT(va + NBPDP <= sva + len, 8515 ("unmap covers partial 1GB page, sva %#lx va %#lx " 8516 "pdpe %#lx pdp %#lx len %#lx", sva, va, 8517 (u_long)pdpe, pdp, len)); 8518 *pdpe = 0; 8519 inc = NBPDP; 8520 continue; 8521 } 8522 pde = pmap_pdpe_to_pde(pdpe, va); 8523 pd = *pde; 8524 KASSERT((pd & X86_PG_V) != 0, 8525 ("invalid pd va %#lx pde %#lx pd %#lx", 
va, 8526 (u_long)pde, pd)); 8527 if ((pd & X86_PG_PS) != 0) { 8528 KASSERT((va & PDRMASK) == 0, 8529 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 8530 (u_long)pde, pd)); 8531 KASSERT(va + NBPDR <= sva + len, 8532 ("unmap covers partial 2MB page, sva %#lx va %#lx " 8533 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 8534 pd, len)); 8535 pde_store(pde, 0); 8536 inc = NBPDR; 8537 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 8538 m->wire_count--; 8539 if (m->wire_count == 0) { 8540 *pdpe = 0; 8541 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 8542 } 8543 continue; 8544 } 8545 pte = pmap_pde_to_pte(pde, va); 8546 KASSERT((*pte & X86_PG_V) != 0, 8547 ("invalid pte va %#lx pte %#lx pt %#lx", va, 8548 (u_long)pte, *pte)); 8549 pte_clear(pte); 8550 inc = PAGE_SIZE; 8551 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 8552 m->wire_count--; 8553 if (m->wire_count == 0) { 8554 *pde = 0; 8555 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 8556 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 8557 m->wire_count--; 8558 if (m->wire_count == 0) { 8559 *pdpe = 0; 8560 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 8561 } 8562 } 8563 } 8564 pmap_invalidate_range(kernel_pmap, sva, sva + len); 8565 PMAP_UNLOCK(kernel_pmap); 8566 vm_page_free_pages_toq(&spgf, false); 8567 vmem_free(large_vmem, sva, len); 8568 } 8569 8570 static void 8571 pmap_large_map_wb_fence_mfence(void) 8572 { 8573 8574 mfence(); 8575 } 8576 8577 static void 8578 pmap_large_map_wb_fence_sfence(void) 8579 { 8580 8581 sfence(); 8582 } 8583 8584 static void 8585 pmap_large_map_wb_fence_nop(void) 8586 { 8587 } 8588 8589 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void), static) 8590 { 8591 8592 if (cpu_vendor_id != CPU_VENDOR_INTEL) 8593 return (pmap_large_map_wb_fence_mfence); 8594 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 8595 CPUID_STDEXT_CLFLUSHOPT)) == 0) 8596 return (pmap_large_map_wb_fence_sfence); 8597 else 8598 /* clflush is strongly enough ordered */ 8599 return (pmap_large_map_wb_fence_nop); 8600 } 8601 8602 static void 8603 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 8604 { 8605 8606 for (; len > 0; len -= cpu_clflush_line_size, 8607 va += cpu_clflush_line_size) 8608 clwb(va); 8609 } 8610 8611 static void 8612 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 8613 { 8614 8615 for (; len > 0; len -= cpu_clflush_line_size, 8616 va += cpu_clflush_line_size) 8617 clflushopt(va); 8618 } 8619 8620 static void 8621 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 8622 { 8623 8624 for (; len > 0; len -= cpu_clflush_line_size, 8625 va += cpu_clflush_line_size) 8626 clflush(va); 8627 } 8628 8629 static void 8630 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 8631 { 8632 } 8633 8634 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t), 8635 static) 8636 { 8637 8638 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 8639 return (pmap_large_map_flush_range_clwb); 8640 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 8641 return (pmap_large_map_flush_range_clflushopt); 8642 else if ((cpu_feature & CPUID_CLFSH) != 0) 8643 return (pmap_large_map_flush_range_clflush); 8644 else 8645 return (pmap_large_map_flush_range_nop); 8646 } 8647 8648 static void 8649 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 8650 { 8651 volatile u_long *pe; 8652 u_long p; 8653 vm_offset_t va; 8654 vm_size_t inc; 8655 bool seen_other; 8656 8657 for (va = sva; va < eva; va += inc) { 8658 inc = 0; 8659 if 
((amd_feature & AMDID_PAGE1GB) != 0) { 8660 pe = (volatile u_long *)pmap_large_map_pdpe(va); 8661 p = *pe; 8662 if ((p & X86_PG_PS) != 0) 8663 inc = NBPDP; 8664 } 8665 if (inc == 0) { 8666 pe = (volatile u_long *)pmap_large_map_pde(va); 8667 p = *pe; 8668 if ((p & X86_PG_PS) != 0) 8669 inc = NBPDR; 8670 } 8671 if (inc == 0) { 8672 pe = (volatile u_long *)pmap_large_map_pte(va); 8673 p = *pe; 8674 inc = PAGE_SIZE; 8675 } 8676 seen_other = false; 8677 for (;;) { 8678 if ((p & X86_PG_AVAIL1) != 0) { 8679 /* 8680 * Spin-wait for the end of a parallel 8681 * write-back. 8682 */ 8683 cpu_spinwait(); 8684 p = *pe; 8685 8686 /* 8687 * If we saw another write-back 8688 * occurring, we cannot rely on PG_M to 8689 * indicate the state of the cache. The 8690 * PG_M bit is cleared before the 8691 * flush to avoid ignoring new writes, 8692 * and writes that are relevant to 8693 * us might happen afterwards. 8694 */ 8695 seen_other = true; 8696 continue; 8697 } 8698 8699 if ((p & X86_PG_M) != 0 || seen_other) { 8700 if (!atomic_fcmpset_long(pe, &p, 8701 (p & ~X86_PG_M) | X86_PG_AVAIL1)) 8702 /* 8703 * If we saw PG_M without 8704 * PG_AVAIL1, and then on the 8705 * next attempt we do not 8706 * observe either PG_M or 8707 * PG_AVAIL1, the other 8708 * write-back started after us 8709 * and finished before us. We 8710 * can rely on it doing our 8711 * work. 8712 */ 8713 continue; 8714 pmap_large_map_flush_range(va, inc); 8715 atomic_clear_long(pe, X86_PG_AVAIL1); 8716 } 8717 break; 8718 } 8719 maybe_yield(); 8720 } 8721 } 8722 8723 /* 8724 * Write-back cache lines for the given address range. 8725 * 8726 * Must be called only on the range or sub-range returned from 8727 * pmap_large_map(). Must not be called on coalesced ranges. 8728 * 8729 * Does nothing on CPUs that lack CLWB, CLFLUSHOPT, and CLFLUSH 8730 * instruction support.
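 *
 * An illustrative usage sketch (hypothetical, not taken from this file;
 * "phys_base", "region_len", and "src" are placeholder names) for a
 * physical region that lies above the DMAP:
 *
 *	void *va;
 *
 *	if (pmap_large_map(phys_base, region_len, &va,
 *	    VM_MEMATTR_WRITE_BACK) == 0) {
 *		memcpy(va, src, region_len);
 *		pmap_large_map_wb(va, region_len);
 *		pmap_large_unmap(va, region_len);
 *	}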
8731 */ 8732 void 8733 pmap_large_map_wb(void *svap, vm_size_t len) 8734 { 8735 vm_offset_t eva, sva; 8736 8737 sva = (vm_offset_t)svap; 8738 eva = sva + len; 8739 pmap_large_map_wb_fence(); 8740 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { 8741 pmap_large_map_flush_range(sva, len); 8742 } else { 8743 KASSERT(sva >= LARGEMAP_MIN_ADDRESS && 8744 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, 8745 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); 8746 pmap_large_map_wb_large(sva, eva); 8747 } 8748 pmap_large_map_wb_fence(); 8749 } 8750 8751 static vm_page_t 8752 pmap_pti_alloc_page(void) 8753 { 8754 vm_page_t m; 8755 8756 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8757 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | 8758 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 8759 return (m); 8760 } 8761 8762 static bool 8763 pmap_pti_free_page(vm_page_t m) 8764 { 8765 8766 KASSERT(m->wire_count > 0, ("page %p not wired", m)); 8767 if (!vm_page_unwire_noq(m)) 8768 return (false); 8769 vm_page_free_zero(m); 8770 return (true); 8771 } 8772 8773 static void 8774 pmap_pti_init(void) 8775 { 8776 vm_page_t pml4_pg; 8777 pdp_entry_t *pdpe; 8778 vm_offset_t va; 8779 int i; 8780 8781 if (!pti) 8782 return; 8783 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 8784 VM_OBJECT_WLOCK(pti_obj); 8785 pml4_pg = pmap_pti_alloc_page(); 8786 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 8787 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 8788 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 8789 pdpe = pmap_pti_pdpe(va); 8790 pmap_pti_wire_pte(pdpe); 8791 } 8792 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 8793 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 8794 pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + 8795 sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); 8796 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 8797 sizeof(struct gate_descriptor) * NIDT, false); 8798 pmap_pti_add_kva_locked((vm_offset_t)common_tss, 8799 (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); 8800 CPU_FOREACH(i) { 8801 /* Doublefault stack IST 1 */ 8802 va = common_tss[i].tss_ist1; 8803 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 8804 /* NMI stack IST 2 */ 8805 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); 8806 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 8807 /* MC# stack IST 3 */ 8808 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); 8809 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 8810 /* DB# stack IST 4 */ 8811 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); 8812 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 8813 } 8814 pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, 8815 (vm_offset_t)etext, true); 8816 pti_finalized = true; 8817 VM_OBJECT_WUNLOCK(pti_obj); 8818 } 8819 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); 8820 8821 static pdp_entry_t * 8822 pmap_pti_pdpe(vm_offset_t va) 8823 { 8824 pml4_entry_t *pml4e; 8825 pdp_entry_t *pdpe; 8826 vm_page_t m; 8827 vm_pindex_t pml4_idx; 8828 vm_paddr_t mphys; 8829 8830 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8831 8832 pml4_idx = pmap_pml4e_index(va); 8833 pml4e = &pti_pml4[pml4_idx]; 8834 m = NULL; 8835 if (*pml4e == 0) { 8836 if (pti_finalized) 8837 panic("pml4 alloc after finalization\n"); 8838 m = pmap_pti_alloc_page(); 8839 if (*pml4e != 0) { 8840 pmap_pti_free_page(m); 8841 mphys = *pml4e & ~PAGE_MASK; 8842 } else { 8843 mphys = 
VM_PAGE_TO_PHYS(m); 8844 *pml4e = mphys | X86_PG_RW | X86_PG_V; 8845 } 8846 } else { 8847 mphys = *pml4e & ~PAGE_MASK; 8848 } 8849 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 8850 return (pdpe); 8851 } 8852 8853 static void 8854 pmap_pti_wire_pte(void *pte) 8855 { 8856 vm_page_t m; 8857 8858 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8859 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 8860 m->wire_count++; 8861 } 8862 8863 static void 8864 pmap_pti_unwire_pde(void *pde, bool only_ref) 8865 { 8866 vm_page_t m; 8867 8868 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8869 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 8870 MPASS(m->wire_count > 0); 8871 MPASS(only_ref || m->wire_count > 1); 8872 pmap_pti_free_page(m); 8873 } 8874 8875 static void 8876 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 8877 { 8878 vm_page_t m; 8879 pd_entry_t *pde; 8880 8881 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8882 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 8883 MPASS(m->wire_count > 0); 8884 if (pmap_pti_free_page(m)) { 8885 pde = pmap_pti_pde(va); 8886 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 8887 *pde = 0; 8888 pmap_pti_unwire_pde(pde, false); 8889 } 8890 } 8891 8892 static pd_entry_t * 8893 pmap_pti_pde(vm_offset_t va) 8894 { 8895 pdp_entry_t *pdpe; 8896 pd_entry_t *pde; 8897 vm_page_t m; 8898 vm_pindex_t pd_idx; 8899 vm_paddr_t mphys; 8900 8901 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8902 8903 pdpe = pmap_pti_pdpe(va); 8904 if (*pdpe == 0) { 8905 m = pmap_pti_alloc_page(); 8906 if (*pdpe != 0) { 8907 pmap_pti_free_page(m); 8908 MPASS((*pdpe & X86_PG_PS) == 0); 8909 mphys = *pdpe & ~PAGE_MASK; 8910 } else { 8911 mphys = VM_PAGE_TO_PHYS(m); 8912 *pdpe = mphys | X86_PG_RW | X86_PG_V; 8913 } 8914 } else { 8915 MPASS((*pdpe & X86_PG_PS) == 0); 8916 mphys = *pdpe & ~PAGE_MASK; 8917 } 8918 8919 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 8920 pd_idx = pmap_pde_index(va); 8921 pde += pd_idx; 8922 return (pde); 8923 } 8924 8925 static pt_entry_t * 8926 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 8927 { 8928 pd_entry_t *pde; 8929 pt_entry_t *pte; 8930 vm_page_t m; 8931 vm_paddr_t mphys; 8932 8933 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8934 8935 pde = pmap_pti_pde(va); 8936 if (unwire_pde != NULL) { 8937 *unwire_pde = true; 8938 pmap_pti_wire_pte(pde); 8939 } 8940 if (*pde == 0) { 8941 m = pmap_pti_alloc_page(); 8942 if (*pde != 0) { 8943 pmap_pti_free_page(m); 8944 MPASS((*pde & X86_PG_PS) == 0); 8945 mphys = *pde & ~(PAGE_MASK | pg_nx); 8946 } else { 8947 mphys = VM_PAGE_TO_PHYS(m); 8948 *pde = mphys | X86_PG_RW | X86_PG_V; 8949 if (unwire_pde != NULL) 8950 *unwire_pde = false; 8951 } 8952 } else { 8953 MPASS((*pde & X86_PG_PS) == 0); 8954 mphys = *pde & ~(PAGE_MASK | pg_nx); 8955 } 8956 8957 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 8958 pte += pmap_pte_index(va); 8959 8960 return (pte); 8961 } 8962 8963 static void 8964 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 8965 { 8966 vm_paddr_t pa; 8967 pd_entry_t *pde; 8968 pt_entry_t *pte, ptev; 8969 bool unwire_pde; 8970 8971 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8972 8973 sva = trunc_page(sva); 8974 MPASS(sva > VM_MAXUSER_ADDRESS); 8975 eva = round_page(eva); 8976 MPASS(sva < eva); 8977 for (; sva < eva; sva += PAGE_SIZE) { 8978 pte = pmap_pti_pte(sva, &unwire_pde); 8979 pa = pmap_kextract(sva); 8980 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 8981 (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, 8982 VM_MEMATTR_DEFAULT, FALSE); 8983 if (*pte == 0) { 8984 pte_store(pte, ptev); 8985 pmap_pti_wire_pte(pte); 8986 } else { 8987 KASSERT(!pti_finalized, 8988 ("pti overlap after fin %#lx %#lx %#lx", 8989 sva, *pte, ptev)); 8990 KASSERT(*pte == ptev, 8991 ("pti non-identical pte after fin %#lx %#lx %#lx", 8992 sva, *pte, ptev)); 8993 } 8994 if (unwire_pde) { 8995 pde = pmap_pti_pde(sva); 8996 pmap_pti_unwire_pde(pde, true); 8997 } 8998 } 8999 } 9000 9001 void 9002 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 9003 { 9004 9005 if (!pti) 9006 return; 9007 VM_OBJECT_WLOCK(pti_obj); 9008 pmap_pti_add_kva_locked(sva, eva, exec); 9009 VM_OBJECT_WUNLOCK(pti_obj); 9010 } 9011 9012 void 9013 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 9014 { 9015 pt_entry_t *pte; 9016 vm_offset_t va; 9017 9018 if (!pti) 9019 return; 9020 sva = rounddown2(sva, PAGE_SIZE); 9021 MPASS(sva > VM_MAXUSER_ADDRESS); 9022 eva = roundup2(eva, PAGE_SIZE); 9023 MPASS(sva < eva); 9024 VM_OBJECT_WLOCK(pti_obj); 9025 for (va = sva; va < eva; va += PAGE_SIZE) { 9026 pte = pmap_pti_pte(va, NULL); 9027 KASSERT((*pte & X86_PG_V) != 0, 9028 ("invalid pte va %#lx pte %#lx pt %#lx", va, 9029 (u_long)pte, *pte)); 9030 pte_clear(pte); 9031 pmap_pti_unwire_pte(pte, va); 9032 } 9033 pmap_invalidate_range(kernel_pmap, sva, eva); 9034 VM_OBJECT_WUNLOCK(pti_obj); 9035 } 9036 9037 static void * 9038 pkru_dup_range(void *ctx __unused, void *data) 9039 { 9040 struct pmap_pkru_range *node, *new_node; 9041 9042 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 9043 if (new_node == NULL) 9044 return (NULL); 9045 node = data; 9046 memcpy(new_node, node, sizeof(*node)); 9047 return (new_node); 9048 } 9049 9050 static void 9051 pkru_free_range(void *ctx __unused, void *node) 9052 { 9053 9054 uma_zfree(pmap_pkru_ranges_zone, node); 9055 } 9056 9057 static int 9058 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 9059 int flags) 9060 { 9061 struct pmap_pkru_range *ppr; 9062 int error; 9063 9064 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9065 MPASS(pmap->pm_type == PT_X86); 9066 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 9067 if ((flags & AMD64_PKRU_EXCL) != 0 && 9068 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 9069 return (EBUSY); 9070 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 9071 if (ppr == NULL) 9072 return (ENOMEM); 9073 ppr->pkru_keyidx = keyidx; 9074 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 9075 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 9076 if (error != 0) 9077 uma_zfree(pmap_pkru_ranges_zone, ppr); 9078 return (error); 9079 } 9080 9081 static int 9082 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9083 { 9084 9085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9086 MPASS(pmap->pm_type == PT_X86); 9087 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 9088 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 9089 } 9090 9091 static void 9092 pmap_pkru_deassign_all(pmap_t pmap) 9093 { 9094 9095 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9096 if (pmap->pm_type == PT_X86 && 9097 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 9098 rangeset_remove_all(&pmap->pm_pkru); 9099 } 9100 9101 static bool 9102 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9103 { 9104 struct pmap_pkru_range *ppr, *prev_ppr; 9105 vm_offset_t va; 9106 9107 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9108 if (pmap->pm_type != PT_X86 || 9109 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 9110 sva >= VM_MAXUSER_ADDRESS) 9111 return 
(true); 9112 MPASS(eva <= VM_MAXUSER_ADDRESS); 9113 for (va = sva, prev_ppr = NULL; va < eva;) { 9114 ppr = rangeset_lookup(&pmap->pm_pkru, va); 9115 if ((ppr == NULL) ^ (prev_ppr == NULL)) 9116 return (false); 9117 if (ppr == NULL) { 9118 va += PAGE_SIZE; 9119 continue; 9120 } 9121 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) 9122 return (false); 9123 va = ppr->pkru_rs_el.re_end; 9124 } 9125 return (true); 9126 } 9127 9128 static pt_entry_t 9129 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 9130 { 9131 struct pmap_pkru_range *ppr; 9132 9133 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9134 if (pmap->pm_type != PT_X86 || 9135 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 9136 va >= VM_MAXUSER_ADDRESS) 9137 return (0); 9138 ppr = rangeset_lookup(&pmap->pm_pkru, va); 9139 if (ppr != NULL) 9140 return (X86_PG_PKU(ppr->pkru_keyidx)); 9141 return (0); 9142 } 9143 9144 static bool 9145 pred_pkru_on_remove(void *ctx __unused, void *r) 9146 { 9147 struct pmap_pkru_range *ppr; 9148 9149 ppr = r; 9150 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 9151 } 9152 9153 static void 9154 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9155 { 9156 9157 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9158 if (pmap->pm_type == PT_X86 && 9159 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 9160 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 9161 pred_pkru_on_remove); 9162 } 9163 } 9164 9165 static int 9166 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 9167 { 9168 9169 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 9170 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 9171 MPASS(dst_pmap->pm_type == PT_X86); 9172 MPASS(src_pmap->pm_type == PT_X86); 9173 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 9174 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 9175 return (0); 9176 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 9177 } 9178 9179 static void 9180 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 9181 u_int keyidx) 9182 { 9183 pml4_entry_t *pml4e; 9184 pdp_entry_t *pdpe; 9185 pd_entry_t newpde, ptpaddr, *pde; 9186 pt_entry_t newpte, *ptep, pte; 9187 vm_offset_t va, va_next; 9188 bool changed; 9189 9190 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9191 MPASS(pmap->pm_type == PT_X86); 9192 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 9193 9194 for (changed = false, va = sva; va < eva; va = va_next) { 9195 pml4e = pmap_pml4e(pmap, va); 9196 if ((*pml4e & X86_PG_V) == 0) { 9197 va_next = (va + NBPML4) & ~PML4MASK; 9198 if (va_next < va) 9199 va_next = eva; 9200 continue; 9201 } 9202 9203 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 9204 if ((*pdpe & X86_PG_V) == 0) { 9205 va_next = (va + NBPDP) & ~PDPMASK; 9206 if (va_next < va) 9207 va_next = eva; 9208 continue; 9209 } 9210 9211 va_next = (va + NBPDR) & ~PDRMASK; 9212 if (va_next < va) 9213 va_next = eva; 9214 9215 pde = pmap_pdpe_to_pde(pdpe, va); 9216 ptpaddr = *pde; 9217 if (ptpaddr == 0) 9218 continue; 9219 9220 MPASS((ptpaddr & X86_PG_V) != 0); 9221 if ((ptpaddr & PG_PS) != 0) { 9222 if (va + NBPDR == va_next && eva >= va_next) { 9223 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 9224 X86_PG_PKU(keyidx); 9225 if (newpde != ptpaddr) { 9226 *pde = newpde; 9227 changed = true; 9228 } 9229 continue; 9230 } else if (!pmap_demote_pde(pmap, pde, va)) { 9231 continue; 9232 } 9233 } 9234 9235 if (va_next > eva) 9236 va_next = eva; 9237 9238 for (ptep = pmap_pde_to_pte(pde, va); va != va_next; 9239 ptep++, va += PAGE_SIZE) { 9240 pte = *ptep; 9241 if ((pte & X86_PG_V) == 0) 9242 continue; 9243 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); 9244 if (newpte != 
pte) { 9245 *ptep = newpte; 9246 changed = true; 9247 } 9248 } 9249 } 9250 if (changed) 9251 pmap_invalidate_range(pmap, sva, eva); 9252 } 9253 9254 static int 9255 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 9256 u_int keyidx, int flags) 9257 { 9258 9259 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || 9260 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) 9261 return (EINVAL); 9262 if (eva <= sva || eva > VM_MAXUSER_ADDRESS) 9263 return (EFAULT); 9264 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 9265 return (ENOTSUP); 9266 return (0); 9267 } 9268 9269 int 9270 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 9271 int flags) 9272 { 9273 int error; 9274 9275 sva = trunc_page(sva); 9276 eva = round_page(eva); 9277 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); 9278 if (error != 0) 9279 return (error); 9280 for (;;) { 9281 PMAP_LOCK(pmap); 9282 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); 9283 if (error == 0) 9284 pmap_pkru_update_range(pmap, sva, eva, keyidx); 9285 PMAP_UNLOCK(pmap); 9286 if (error != ENOMEM) 9287 break; 9288 vm_wait(NULL); 9289 } 9290 return (error); 9291 } 9292 9293 int 9294 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9295 { 9296 int error; 9297 9298 sva = trunc_page(sva); 9299 eva = round_page(eva); 9300 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); 9301 if (error != 0) 9302 return (error); 9303 for (;;) { 9304 PMAP_LOCK(pmap); 9305 error = pmap_pkru_deassign(pmap, sva, eva); 9306 if (error == 0) 9307 pmap_pkru_update_range(pmap, sva, eva, 0); 9308 PMAP_UNLOCK(pmap); 9309 if (error != ENOMEM) 9310 break; 9311 vm_wait(NULL); 9312 } 9313 return (error); 9314 } 9315 9316 #include "opt_ddb.h" 9317 #ifdef DDB 9318 #include <sys/kdb.h> 9319 #include <ddb/ddb.h> 9320 9321 DB_SHOW_COMMAND(pte, pmap_print_pte) 9322 { 9323 pmap_t pmap; 9324 pml4_entry_t *pml4; 9325 pdp_entry_t *pdp; 9326 pd_entry_t *pde; 9327 pt_entry_t *pte, PG_V; 9328 vm_offset_t va; 9329 9330 if (!have_addr) { 9331 db_printf("show pte addr\n"); 9332 return; 9333 } 9334 va = (vm_offset_t)addr; 9335 9336 if (kdb_thread != NULL) 9337 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 9338 else 9339 pmap = PCPU_GET(curpmap); 9340 9341 PG_V = pmap_valid_bit(pmap); 9342 pml4 = pmap_pml4e(pmap, va); 9343 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 9344 if ((*pml4 & PG_V) == 0) { 9345 db_printf("\n"); 9346 return; 9347 } 9348 pdp = pmap_pml4e_to_pdpe(pml4, va); 9349 db_printf(" pdpe %#016lx", *pdp); 9350 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 9351 db_printf("\n"); 9352 return; 9353 } 9354 pde = pmap_pdpe_to_pde(pdp, va); 9355 db_printf(" pde %#016lx", *pde); 9356 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 9357 db_printf("\n"); 9358 return; 9359 } 9360 pte = pmap_pde_to_pte(pde, va); 9361 db_printf(" pte %#016lx\n", *pte); 9362 } 9363 9364 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 9365 { 9366 vm_paddr_t a; 9367 9368 if (have_addr) { 9369 a = (vm_paddr_t)addr; 9370 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 9371 } else { 9372 db_printf("show phys2dmap addr\n"); 9373 } 9374 } 9375 #endif 9376
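/*
 * Illustrative sketch of the protection-key interface above (not part of
 * the original source; "p", "sva", and "eva" are placeholder names).  A
 * caller would typically assign a key to a user address range and later
 * remove it:
 *
 *	error = pmap_pkru_set(vmspace_pmap(p->p_vmspace), sva, eva, 1,
 *	    AMD64_PKRU_PERSIST);
 *	...
 *	error = pmap_pkru_clear(vmspace_pmap(p->p_vmspace), sva, eva);
 */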