/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * Copyright (c) 2014-2019 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidations expensive,
 *	this module may delay invalidation or protection-reduction
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
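 *
 *	In this file, one form of such deferral is the delayed
 *	invalidation (DI) machinery implemented below; see
 *	pmap_delayed_invl_start(), pmap_delayed_invl_page(),
 *	pmap_delayed_invl_finish() and pmap_delayed_invl_wait(), which
 *	together let a thread drop PV list locks before the
 *	corresponding TLB flushes have completed.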
108 */ 109 110 #include "opt_ddb.h" 111 #include "opt_pmap.h" 112 #include "opt_vm.h" 113 114 #include <sys/param.h> 115 #include <sys/bitstring.h> 116 #include <sys/bus.h> 117 #include <sys/systm.h> 118 #include <sys/kernel.h> 119 #include <sys/ktr.h> 120 #include <sys/lock.h> 121 #include <sys/malloc.h> 122 #include <sys/mman.h> 123 #include <sys/mutex.h> 124 #include <sys/proc.h> 125 #include <sys/rangeset.h> 126 #include <sys/rwlock.h> 127 #include <sys/sbuf.h> 128 #include <sys/sx.h> 129 #include <sys/turnstile.h> 130 #include <sys/vmem.h> 131 #include <sys/vmmeter.h> 132 #include <sys/sched.h> 133 #include <sys/sysctl.h> 134 #include <sys/smp.h> 135 #ifdef DDB 136 #include <sys/kdb.h> 137 #include <ddb/ddb.h> 138 #endif 139 140 #include <vm/vm.h> 141 #include <vm/vm_param.h> 142 #include <vm/vm_kern.h> 143 #include <vm/vm_page.h> 144 #include <vm/vm_map.h> 145 #include <vm/vm_object.h> 146 #include <vm/vm_extern.h> 147 #include <vm/vm_pageout.h> 148 #include <vm/vm_pager.h> 149 #include <vm/vm_phys.h> 150 #include <vm/vm_radix.h> 151 #include <vm/vm_reserv.h> 152 #include <vm/uma.h> 153 154 #include <machine/intr_machdep.h> 155 #include <x86/apicvar.h> 156 #include <x86/ifunc.h> 157 #include <machine/cpu.h> 158 #include <machine/cputypes.h> 159 #include <machine/md_var.h> 160 #include <machine/pcb.h> 161 #include <machine/specialreg.h> 162 #ifdef SMP 163 #include <machine/smp.h> 164 #endif 165 #include <machine/sysarch.h> 166 #include <machine/tss.h> 167 168 static __inline boolean_t 169 pmap_type_guest(pmap_t pmap) 170 { 171 172 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 173 } 174 175 static __inline boolean_t 176 pmap_emulate_ad_bits(pmap_t pmap) 177 { 178 179 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 180 } 181 182 static __inline pt_entry_t 183 pmap_valid_bit(pmap_t pmap) 184 { 185 pt_entry_t mask; 186 187 switch (pmap->pm_type) { 188 case PT_X86: 189 case PT_RVI: 190 mask = X86_PG_V; 191 break; 192 case PT_EPT: 193 if (pmap_emulate_ad_bits(pmap)) 194 mask = EPT_PG_EMUL_V; 195 else 196 mask = EPT_PG_READ; 197 break; 198 default: 199 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 200 } 201 202 return (mask); 203 } 204 205 static __inline pt_entry_t 206 pmap_rw_bit(pmap_t pmap) 207 { 208 pt_entry_t mask; 209 210 switch (pmap->pm_type) { 211 case PT_X86: 212 case PT_RVI: 213 mask = X86_PG_RW; 214 break; 215 case PT_EPT: 216 if (pmap_emulate_ad_bits(pmap)) 217 mask = EPT_PG_EMUL_RW; 218 else 219 mask = EPT_PG_WRITE; 220 break; 221 default: 222 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 223 } 224 225 return (mask); 226 } 227 228 static pt_entry_t pg_g; 229 230 static __inline pt_entry_t 231 pmap_global_bit(pmap_t pmap) 232 { 233 pt_entry_t mask; 234 235 switch (pmap->pm_type) { 236 case PT_X86: 237 mask = pg_g; 238 break; 239 case PT_RVI: 240 case PT_EPT: 241 mask = 0; 242 break; 243 default: 244 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 245 } 246 247 return (mask); 248 } 249 250 static __inline pt_entry_t 251 pmap_accessed_bit(pmap_t pmap) 252 { 253 pt_entry_t mask; 254 255 switch (pmap->pm_type) { 256 case PT_X86: 257 case PT_RVI: 258 mask = X86_PG_A; 259 break; 260 case PT_EPT: 261 if (pmap_emulate_ad_bits(pmap)) 262 mask = EPT_PG_READ; 263 else 264 mask = EPT_PG_A; 265 break; 266 default: 267 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 268 } 269 270 return (mask); 271 } 272 273 static __inline pt_entry_t 274 pmap_modified_bit(pmap_t pmap) 275 { 276 pt_entry_t mask; 277 278 switch 
(pmap->pm_type) { 279 case PT_X86: 280 case PT_RVI: 281 mask = X86_PG_M; 282 break; 283 case PT_EPT: 284 if (pmap_emulate_ad_bits(pmap)) 285 mask = EPT_PG_WRITE; 286 else 287 mask = EPT_PG_M; 288 break; 289 default: 290 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 291 } 292 293 return (mask); 294 } 295 296 static __inline pt_entry_t 297 pmap_pku_mask_bit(pmap_t pmap) 298 { 299 300 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); 301 } 302 303 #if !defined(DIAGNOSTIC) 304 #ifdef __GNUC_GNU_INLINE__ 305 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 306 #else 307 #define PMAP_INLINE extern inline 308 #endif 309 #else 310 #define PMAP_INLINE 311 #endif 312 313 #ifdef PV_STATS 314 #define PV_STAT(x) do { x ; } while (0) 315 #else 316 #define PV_STAT(x) do { } while (0) 317 #endif 318 319 #define pa_index(pa) ((pa) >> PDRSHIFT) 320 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 321 322 #define NPV_LIST_LOCKS MAXCPU 323 324 #define PHYS_TO_PV_LIST_LOCK(pa) \ 325 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 326 327 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 328 struct rwlock **_lockp = (lockp); \ 329 struct rwlock *_new_lock; \ 330 \ 331 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 332 if (_new_lock != *_lockp) { \ 333 if (*_lockp != NULL) \ 334 rw_wunlock(*_lockp); \ 335 *_lockp = _new_lock; \ 336 rw_wlock(*_lockp); \ 337 } \ 338 } while (0) 339 340 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 341 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 342 343 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 344 struct rwlock **_lockp = (lockp); \ 345 \ 346 if (*_lockp != NULL) { \ 347 rw_wunlock(*_lockp); \ 348 *_lockp = NULL; \ 349 } \ 350 } while (0) 351 352 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 353 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 354 355 struct pmap kernel_pmap_store; 356 357 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 358 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 359 360 int nkpt; 361 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 362 "Number of kernel page table pages allocated on bootup"); 363 364 static int ndmpdp; 365 vm_paddr_t dmaplimit; 366 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 367 pt_entry_t pg_nx; 368 369 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 370 371 static int pg_ps_enabled = 1; 372 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 373 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 374 375 #define PAT_INDEX_SIZE 8 376 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 377 378 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 379 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 380 u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 381 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 382 383 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 384 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 385 static int ndmpdpphys; /* number of DMPDPphys pages */ 386 387 static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ 388 389 /* 390 * pmap_mapdev support pre initialization (i.e. 
console) 391 */ 392 #define PMAP_PREINIT_MAPPING_COUNT 8 393 static struct pmap_preinit_mapping { 394 vm_paddr_t pa; 395 vm_offset_t va; 396 vm_size_t sz; 397 int mode; 398 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 399 static int pmap_initialized; 400 401 /* 402 * Data for the pv entry allocation mechanism. 403 * Updates to pv_invl_gen are protected by the pv_list_locks[] 404 * elements, but reads are not. 405 */ 406 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 407 static struct mtx __exclusive_cache_line pv_chunks_mutex; 408 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 409 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 410 static struct md_page *pv_table; 411 static struct md_page pv_dummy; 412 413 /* 414 * All those kernel PT submaps that BSD is so fond of 415 */ 416 pt_entry_t *CMAP1 = NULL; 417 caddr_t CADDR1 = 0; 418 static vm_offset_t qframe = 0; 419 static struct mtx qframe_mtx; 420 421 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 422 423 static vmem_t *large_vmem; 424 static u_int lm_ents; 425 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 426 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 427 428 int pmap_pcid_enabled = 1; 429 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 430 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 431 int invpcid_works = 0; 432 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 433 "Is the invpcid instruction available ?"); 434 435 int __read_frequently pti = 0; 436 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 437 &pti, 0, 438 "Page Table Isolation enabled"); 439 static vm_object_t pti_obj; 440 static pml4_entry_t *pti_pml4; 441 static vm_pindex_t pti_pg_idx; 442 static bool pti_finalized; 443 444 struct pmap_pkru_range { 445 struct rs_el pkru_rs_el; 446 u_int pkru_keyidx; 447 int pkru_flags; 448 }; 449 450 static uma_zone_t pmap_pkru_ranges_zone; 451 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 452 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 453 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 454 static void *pkru_dup_range(void *ctx, void *data); 455 static void pkru_free_range(void *ctx, void *node); 456 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 457 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 458 static void pmap_pkru_deassign_all(pmap_t pmap); 459 460 static int 461 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) 462 { 463 int i; 464 uint64_t res; 465 466 res = 0; 467 CPU_FOREACH(i) { 468 res += cpuid_to_pcpu[i]->pc_pm_save_cnt; 469 } 470 return (sysctl_handle_64(oidp, &res, 0, req)); 471 } 472 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD | 473 CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", 474 "Count of saved TLB context on switch"); 475 476 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 477 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 478 static struct mtx invl_gen_mtx; 479 /* Fake lock object to satisfy turnstiles interface. 
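 * Threads waiting for a DI generation to pass block on a turnstile keyed
 * by this object (see pmap_delayed_invl_wait_block()) rather than
 * sleeping, since their callers may hold object or page locks.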
*/ 480 static struct lock_object invl_gen_ts = { 481 .lo_name = "invlts", 482 }; 483 static struct pmap_invl_gen pmap_invl_gen_head = { 484 .gen = 1, 485 .next = NULL, 486 }; 487 static u_long pmap_invl_gen = 1; 488 static int pmap_invl_waiters; 489 static struct callout pmap_invl_callout; 490 static bool pmap_invl_callout_inited; 491 492 #define PMAP_ASSERT_NOT_IN_DI() \ 493 KASSERT(pmap_not_in_di(), ("DI already started")) 494 495 static bool 496 pmap_di_locked(void) 497 { 498 int tun; 499 500 if ((cpu_feature2 & CPUID2_CX16) == 0) 501 return (true); 502 tun = 0; 503 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 504 return (tun != 0); 505 } 506 507 static int 508 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 509 { 510 int locked; 511 512 locked = pmap_di_locked(); 513 return (sysctl_handle_int(oidp, &locked, 0, req)); 514 } 515 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 516 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 517 "Locked delayed invalidation"); 518 519 static bool pmap_not_in_di_l(void); 520 static bool pmap_not_in_di_u(void); 521 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 522 { 523 524 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 525 } 526 527 static bool 528 pmap_not_in_di_l(void) 529 { 530 struct pmap_invl_gen *invl_gen; 531 532 invl_gen = &curthread->td_md.md_invl_gen; 533 return (invl_gen->gen == 0); 534 } 535 536 static void 537 pmap_thread_init_invl_gen_l(struct thread *td) 538 { 539 struct pmap_invl_gen *invl_gen; 540 541 invl_gen = &td->td_md.md_invl_gen; 542 invl_gen->gen = 0; 543 } 544 545 static void 546 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 547 { 548 struct turnstile *ts; 549 550 ts = turnstile_trywait(&invl_gen_ts); 551 if (*m_gen > atomic_load_long(invl_gen)) 552 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 553 else 554 turnstile_cancel(ts); 555 } 556 557 static void 558 pmap_delayed_invl_finish_unblock(u_long new_gen) 559 { 560 struct turnstile *ts; 561 562 turnstile_chain_lock(&invl_gen_ts); 563 ts = turnstile_lookup(&invl_gen_ts); 564 if (new_gen != 0) 565 pmap_invl_gen = new_gen; 566 if (ts != NULL) { 567 turnstile_broadcast(ts, TS_SHARED_QUEUE); 568 turnstile_unpend(ts); 569 } 570 turnstile_chain_unlock(&invl_gen_ts); 571 } 572 573 /* 574 * Start a new Delayed Invalidation (DI) block of code, executed by 575 * the current thread. Within a DI block, the current thread may 576 * destroy both the page table and PV list entries for a mapping and 577 * then release the corresponding PV list lock before ensuring that 578 * the mapping is flushed from the TLBs of any processors with the 579 * pmap active. 580 */ 581 static void 582 pmap_delayed_invl_start_l(void) 583 { 584 struct pmap_invl_gen *invl_gen; 585 u_long currgen; 586 587 invl_gen = &curthread->td_md.md_invl_gen; 588 PMAP_ASSERT_NOT_IN_DI(); 589 mtx_lock(&invl_gen_mtx); 590 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 591 currgen = pmap_invl_gen; 592 else 593 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 594 invl_gen->gen = currgen + 1; 595 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 596 mtx_unlock(&invl_gen_mtx); 597 } 598 599 /* 600 * Finish the DI block, previously started by the current thread. All 601 * required TLB flushes for the pages marked by 602 * pmap_delayed_invl_page() must be finished before this function is 603 * called. 
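 *
 * An illustrative caller sequence (a sketch only, not code taken from a
 * specific function in this file):
 *
 *	pmap_delayed_invl_start();
 *	... remove PTEs and PV entries, calling pmap_delayed_invl_page(m)
 *	    for each affected managed page ...
 *	pmap_invalidate_page(pmap, va);    (or a ranged/full invalidation)
 *	pmap_delayed_invl_finish();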
604 * 605 * This function works by bumping the global DI generation number to 606 * the generation number of the current thread's DI, unless there is a 607 * pending DI that started earlier. In the latter case, bumping the 608 * global DI generation number would incorrectly signal that the 609 * earlier DI had finished. Instead, this function bumps the earlier 610 * DI's generation number to match the generation number of the 611 * current thread's DI. 612 */ 613 static void 614 pmap_delayed_invl_finish_l(void) 615 { 616 struct pmap_invl_gen *invl_gen, *next; 617 618 invl_gen = &curthread->td_md.md_invl_gen; 619 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 620 mtx_lock(&invl_gen_mtx); 621 next = LIST_NEXT(invl_gen, link); 622 if (next == NULL) 623 pmap_delayed_invl_finish_unblock(invl_gen->gen); 624 else 625 next->gen = invl_gen->gen; 626 LIST_REMOVE(invl_gen, link); 627 mtx_unlock(&invl_gen_mtx); 628 invl_gen->gen = 0; 629 } 630 631 static bool 632 pmap_not_in_di_u(void) 633 { 634 struct pmap_invl_gen *invl_gen; 635 636 invl_gen = &curthread->td_md.md_invl_gen; 637 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 638 } 639 640 static void 641 pmap_thread_init_invl_gen_u(struct thread *td) 642 { 643 struct pmap_invl_gen *invl_gen; 644 645 invl_gen = &td->td_md.md_invl_gen; 646 invl_gen->gen = 0; 647 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 648 } 649 650 static bool 651 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 652 { 653 uint64_t new_high, new_low, old_high, old_low; 654 char res; 655 656 old_low = new_low = 0; 657 old_high = new_high = (uintptr_t)0; 658 659 __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" 660 : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 661 : "b"(new_low), "c" (new_high) 662 : "memory", "cc"); 663 if (res == 0) { 664 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 665 return (false); 666 out->gen = old_low; 667 out->next = (void *)old_high; 668 } else { 669 out->gen = new_low; 670 out->next = (void *)new_high; 671 } 672 return (true); 673 } 674 675 static bool 676 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 677 struct pmap_invl_gen *new_val) 678 { 679 uint64_t new_high, new_low, old_high, old_low; 680 char res; 681 682 new_low = new_val->gen; 683 new_high = (uintptr_t)new_val->next; 684 old_low = old_val->gen; 685 old_high = (uintptr_t)old_val->next; 686 687 __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" 688 : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 689 : "b"(new_low), "c" (new_high) 690 : "memory", "cc"); 691 return (res); 692 } 693 694 #ifdef PV_STATS 695 static long invl_start_restart; 696 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, 697 &invl_start_restart, 0, 698 ""); 699 static long invl_finish_restart; 700 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 701 &invl_finish_restart, 0, 702 ""); 703 static int invl_max_qlen; 704 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 705 &invl_max_qlen, 0, 706 ""); 707 #endif 708 709 static struct lock_delay_config __read_frequently di_delay; 710 LOCK_DELAY_SYSINIT_DEFAULT(di_delay); 711 712 static void 713 pmap_delayed_invl_start_u(void) 714 { 715 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 716 struct thread *td; 717 struct lock_delay_arg lda; 718 uintptr_t prevl; 719 u_char pri; 720 #ifdef PV_STATS 721 int i, ii; 722 #endif 723 724 td = curthread; 725 invl_gen = &td->td_md.md_invl_gen; 726 PMAP_ASSERT_NOT_IN_DI(); 727 
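
	/*
	 * If the thread's base priority is weaker than PVM, temporarily
	 * raise it to PVM; the previous priority is saved and restored by
	 * pmap_delayed_invl_finish_u().  Then walk the lockless DI queue
	 * to its tail and append this thread's entry with a 16-byte
	 * compare-and-swap, backing off with lock_delay() and retrying
	 * whenever a concurrent update is observed.
	 */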
lock_delay_arg_init(&lda, &di_delay); 728 invl_gen->saved_pri = 0; 729 pri = td->td_base_pri; 730 if (pri > PVM) { 731 thread_lock(td); 732 pri = td->td_base_pri; 733 if (pri > PVM) { 734 invl_gen->saved_pri = pri; 735 sched_prio(td, PVM); 736 } 737 thread_unlock(td); 738 } 739 again: 740 PV_STAT(i = 0); 741 for (p = &pmap_invl_gen_head;; p = prev.next) { 742 PV_STAT(i++); 743 prevl = atomic_load_ptr(&p->next); 744 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 745 PV_STAT(atomic_add_long(&invl_start_restart, 1)); 746 lock_delay(&lda); 747 goto again; 748 } 749 if (prevl == 0) 750 break; 751 prev.next = (void *)prevl; 752 } 753 #ifdef PV_STATS 754 if ((ii = invl_max_qlen) < i) 755 atomic_cmpset_int(&invl_max_qlen, ii, i); 756 #endif 757 758 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 759 PV_STAT(atomic_add_long(&invl_start_restart, 1)); 760 lock_delay(&lda); 761 goto again; 762 } 763 764 new_prev.gen = prev.gen; 765 new_prev.next = invl_gen; 766 invl_gen->gen = prev.gen + 1; 767 768 /* Formal fence between store to invl->gen and updating *p. */ 769 atomic_thread_fence_rel(); 770 771 /* 772 * After inserting an invl_gen element with invalid bit set, 773 * this thread blocks any other thread trying to enter the 774 * delayed invalidation block. Do not allow to remove us from 775 * the CPU, because it causes starvation for other threads. 776 */ 777 critical_enter(); 778 779 /* 780 * ABA for *p is not possible there, since p->gen can only 781 * increase. So if the *p thread finished its di, then 782 * started a new one and got inserted into the list at the 783 * same place, its gen will appear greater than the previously 784 * read gen. 785 */ 786 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 787 critical_exit(); 788 PV_STAT(atomic_add_long(&invl_start_restart, 1)); 789 lock_delay(&lda); 790 goto again; 791 } 792 793 /* 794 * There we clear PMAP_INVL_GEN_NEXT_INVALID in 795 * invl_gen->next, allowing other threads to iterate past us. 796 * pmap_di_store_invl() provides fence between the generation 797 * write and the update of next. 798 */ 799 invl_gen->next = NULL; 800 critical_exit(); 801 } 802 803 static bool 804 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 805 struct pmap_invl_gen *p) 806 { 807 struct pmap_invl_gen prev, new_prev; 808 u_long mygen; 809 810 /* 811 * Load invl_gen->gen after setting invl_gen->next 812 * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger 813 * generations to propagate to our invl_gen->gen. Lock prefix 814 * in atomic_set_ptr() worked as seq_cst fence. 815 */ 816 mygen = atomic_load_long(&invl_gen->gen); 817 818 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 819 return (false); 820 821 KASSERT(prev.gen < mygen, 822 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 823 new_prev.gen = mygen; 824 new_prev.next = (void *)((uintptr_t)invl_gen->next & 825 ~PMAP_INVL_GEN_NEXT_INVALID); 826 827 /* Formal fence between load of prev and storing update to it. 
*/ 828 atomic_thread_fence_rel(); 829 830 return (pmap_di_store_invl(p, &prev, &new_prev)); 831 } 832 833 static void 834 pmap_delayed_invl_finish_u(void) 835 { 836 struct pmap_invl_gen *invl_gen, *p; 837 struct thread *td; 838 struct lock_delay_arg lda; 839 uintptr_t prevl; 840 841 td = curthread; 842 invl_gen = &td->td_md.md_invl_gen; 843 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 844 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 845 ("missed invl_start: INVALID")); 846 lock_delay_arg_init(&lda, &di_delay); 847 848 again: 849 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 850 prevl = atomic_load_ptr(&p->next); 851 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 852 PV_STAT(atomic_add_long(&invl_finish_restart, 1)); 853 lock_delay(&lda); 854 goto again; 855 } 856 if ((void *)prevl == invl_gen) 857 break; 858 } 859 860 /* 861 * It is legitimate to not find ourself on the list if a 862 * thread before us finished its DI and started it again. 863 */ 864 if (__predict_false(p == NULL)) { 865 PV_STAT(atomic_add_long(&invl_finish_restart, 1)); 866 lock_delay(&lda); 867 goto again; 868 } 869 870 critical_enter(); 871 atomic_set_ptr((uintptr_t *)&invl_gen->next, 872 PMAP_INVL_GEN_NEXT_INVALID); 873 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 874 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 875 PMAP_INVL_GEN_NEXT_INVALID); 876 critical_exit(); 877 PV_STAT(atomic_add_long(&invl_finish_restart, 1)); 878 lock_delay(&lda); 879 goto again; 880 } 881 critical_exit(); 882 if (atomic_load_int(&pmap_invl_waiters) > 0) 883 pmap_delayed_invl_finish_unblock(0); 884 if (invl_gen->saved_pri != 0) { 885 thread_lock(td); 886 sched_prio(td, invl_gen->saved_pri); 887 thread_unlock(td); 888 } 889 } 890 891 #ifdef DDB 892 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 893 { 894 struct pmap_invl_gen *p, *pn; 895 struct thread *td; 896 uintptr_t nextl; 897 bool first; 898 899 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 900 first = false) { 901 nextl = atomic_load_ptr(&p->next); 902 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 903 td = first ? NULL : __containerof(p, struct thread, 904 td_md.md_invl_gen); 905 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 906 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 907 td != NULL ? 
td->td_tid : -1); 908 } 909 } 910 #endif 911 912 #ifdef PV_STATS 913 static long invl_wait; 914 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, 915 "Number of times DI invalidation blocked pmap_remove_all/write"); 916 static long invl_wait_slow; 917 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0, 918 "Number of slow invalidation waits for lockless DI"); 919 #endif 920 921 static u_long * 922 pmap_delayed_invl_genp(vm_page_t m) 923 { 924 925 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 926 } 927 928 static void 929 pmap_delayed_invl_callout_func(void *arg __unused) 930 { 931 932 if (atomic_load_int(&pmap_invl_waiters) == 0) 933 return; 934 pmap_delayed_invl_finish_unblock(0); 935 } 936 937 static void 938 pmap_delayed_invl_callout_init(void *arg __unused) 939 { 940 941 if (pmap_di_locked()) 942 return; 943 callout_init(&pmap_invl_callout, 1); 944 pmap_invl_callout_inited = true; 945 } 946 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 947 pmap_delayed_invl_callout_init, NULL); 948 949 /* 950 * Ensure that all currently executing DI blocks, that need to flush 951 * TLB for the given page m, actually flushed the TLB at the time the 952 * function returned. If the page m has an empty PV list and we call 953 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 954 * valid mapping for the page m in either its page table or TLB. 955 * 956 * This function works by blocking until the global DI generation 957 * number catches up with the generation number associated with the 958 * given page m and its PV list. Since this function's callers 959 * typically own an object lock and sometimes own a page lock, it 960 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 961 * processor. 962 */ 963 static void 964 pmap_delayed_invl_wait_l(vm_page_t m) 965 { 966 u_long *m_gen; 967 #ifdef PV_STATS 968 bool accounted = false; 969 #endif 970 971 m_gen = pmap_delayed_invl_genp(m); 972 while (*m_gen > pmap_invl_gen) { 973 #ifdef PV_STATS 974 if (!accounted) { 975 atomic_add_long(&invl_wait, 1); 976 accounted = true; 977 } 978 #endif 979 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 980 } 981 } 982 983 static void 984 pmap_delayed_invl_wait_u(vm_page_t m) 985 { 986 u_long *m_gen; 987 struct lock_delay_arg lda; 988 bool fast; 989 990 fast = true; 991 m_gen = pmap_delayed_invl_genp(m); 992 lock_delay_arg_init(&lda, &di_delay); 993 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 994 if (fast || !pmap_invl_callout_inited) { 995 PV_STAT(atomic_add_long(&invl_wait, 1)); 996 lock_delay(&lda); 997 fast = false; 998 } else { 999 /* 1000 * The page's invalidation generation number 1001 * is still below the current thread's number. 1002 * Prepare to block so that we do not waste 1003 * CPU cycles or worse, suffer livelock. 1004 * 1005 * Since it is impossible to block without 1006 * racing with pmap_delayed_invl_finish_u(), 1007 * prepare for the race by incrementing 1008 * pmap_invl_waiters and arming a 1-tick 1009 * callout which will unblock us if we lose 1010 * the race. 1011 */ 1012 atomic_add_int(&pmap_invl_waiters, 1); 1013 1014 /* 1015 * Re-check the current thread's invalidation 1016 * generation after incrementing 1017 * pmap_invl_waiters, so that there is no race 1018 * with pmap_delayed_invl_finish_u() setting 1019 * the page generation and checking 1020 * pmap_invl_waiters. The only race allowed 1021 * is for a missed unblock, which is handled 1022 * by the callout. 
1023 */ 1024 if (*m_gen > 1025 atomic_load_long(&pmap_invl_gen_head.gen)) { 1026 callout_reset(&pmap_invl_callout, 1, 1027 pmap_delayed_invl_callout_func, NULL); 1028 PV_STAT(atomic_add_long(&invl_wait_slow, 1)); 1029 pmap_delayed_invl_wait_block(m_gen, 1030 &pmap_invl_gen_head.gen); 1031 } 1032 atomic_add_int(&pmap_invl_waiters, -1); 1033 } 1034 } 1035 } 1036 1037 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1038 { 1039 1040 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1041 pmap_thread_init_invl_gen_u); 1042 } 1043 1044 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1045 { 1046 1047 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1048 pmap_delayed_invl_start_u); 1049 } 1050 1051 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1052 { 1053 1054 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1055 pmap_delayed_invl_finish_u); 1056 } 1057 1058 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1059 { 1060 1061 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1062 pmap_delayed_invl_wait_u); 1063 } 1064 1065 /* 1066 * Mark the page m's PV list as participating in the current thread's 1067 * DI block. Any threads concurrently using m's PV list to remove or 1068 * restrict all mappings to m will wait for the current thread's DI 1069 * block to complete before proceeding. 1070 * 1071 * The function works by setting the DI generation number for m's PV 1072 * list to at least the DI generation number of the current thread. 1073 * This forces a caller of pmap_delayed_invl_wait() to block until 1074 * current thread calls pmap_delayed_invl_finish(). 1075 */ 1076 static void 1077 pmap_delayed_invl_page(vm_page_t m) 1078 { 1079 u_long gen, *m_gen; 1080 1081 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1082 gen = curthread->td_md.md_invl_gen.gen; 1083 if (gen == 0) 1084 return; 1085 m_gen = pmap_delayed_invl_genp(m); 1086 if (*m_gen < gen) 1087 *m_gen = gen; 1088 } 1089 1090 /* 1091 * Crashdump maps. 1092 */ 1093 static caddr_t crashdumpmap; 1094 1095 /* 1096 * Internal flags for pmap_enter()'s helper functions. 1097 */ 1098 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1099 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1100 1101 /* 1102 * Internal flags for pmap_mapdev_internal() and 1103 * pmap_change_attr_locked(). 1104 */ 1105 #define MAPDEV_FLUSHCACHE 0x0000001 /* Flush cache after mapping. */ 1106 #define MAPDEV_SETATTR 0x0000002 /* Modify existing attrs. 
*/ 1107 1108 static void free_pv_chunk(struct pv_chunk *pc); 1109 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1110 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1111 static int popcnt_pc_map_pq(uint64_t *map); 1112 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1113 static void reserve_pv_entries(pmap_t pmap, int needed, 1114 struct rwlock **lockp); 1115 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1116 struct rwlock **lockp); 1117 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1118 u_int flags, struct rwlock **lockp); 1119 #if VM_NRESERVLEVEL > 0 1120 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1121 struct rwlock **lockp); 1122 #endif 1123 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1124 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1125 vm_offset_t va); 1126 1127 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, 1128 int flags); 1129 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1130 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1131 vm_offset_t va, struct rwlock **lockp); 1132 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1133 vm_offset_t va); 1134 static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1135 vm_prot_t prot, struct rwlock **lockp); 1136 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1137 u_int flags, vm_page_t m, struct rwlock **lockp); 1138 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1139 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1140 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1141 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); 1142 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1143 vm_offset_t eva); 1144 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1145 vm_offset_t eva); 1146 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1147 pd_entry_t pde); 1148 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1149 static vm_page_t pmap_large_map_getptp_unlocked(void); 1150 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1151 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); 1152 #if VM_NRESERVLEVEL > 0 1153 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1154 struct rwlock **lockp); 1155 #endif 1156 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1157 vm_prot_t prot); 1158 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); 1159 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1160 bool exec); 1161 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1162 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1163 static void pmap_pti_wire_pte(void *pte); 1164 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1165 struct spglist *free, struct rwlock **lockp); 1166 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1167 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1168 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 1169 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1170 struct spglist *free); 1171 static bool 
pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1172 pd_entry_t *pde, struct spglist *free, 1173 struct rwlock **lockp); 1174 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1175 vm_page_t m, struct rwlock **lockp); 1176 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1177 pd_entry_t newpde); 1178 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1179 1180 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 1181 struct rwlock **lockp); 1182 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, 1183 struct rwlock **lockp); 1184 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1185 struct rwlock **lockp); 1186 1187 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1188 struct spglist *free); 1189 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1190 1191 /********************/ 1192 /* Inline functions */ 1193 /********************/ 1194 1195 /* Return a non-clipped PD index for a given VA */ 1196 static __inline vm_pindex_t 1197 pmap_pde_pindex(vm_offset_t va) 1198 { 1199 return (va >> PDRSHIFT); 1200 } 1201 1202 1203 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1204 static __inline pml4_entry_t * 1205 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1206 { 1207 1208 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 1209 } 1210 1211 /* Return a pointer to the PDP slot that corresponds to a VA */ 1212 static __inline pdp_entry_t * 1213 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1214 { 1215 pdp_entry_t *pdpe; 1216 1217 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1218 return (&pdpe[pmap_pdpe_index(va)]); 1219 } 1220 1221 /* Return a pointer to the PDP slot that corresponds to a VA */ 1222 static __inline pdp_entry_t * 1223 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1224 { 1225 pml4_entry_t *pml4e; 1226 pt_entry_t PG_V; 1227 1228 PG_V = pmap_valid_bit(pmap); 1229 pml4e = pmap_pml4e(pmap, va); 1230 if ((*pml4e & PG_V) == 0) 1231 return (NULL); 1232 return (pmap_pml4e_to_pdpe(pml4e, va)); 1233 } 1234 1235 /* Return a pointer to the PD slot that corresponds to a VA */ 1236 static __inline pd_entry_t * 1237 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1238 { 1239 pd_entry_t *pde; 1240 1241 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1242 return (&pde[pmap_pde_index(va)]); 1243 } 1244 1245 /* Return a pointer to the PD slot that corresponds to a VA */ 1246 static __inline pd_entry_t * 1247 pmap_pde(pmap_t pmap, vm_offset_t va) 1248 { 1249 pdp_entry_t *pdpe; 1250 pt_entry_t PG_V; 1251 1252 PG_V = pmap_valid_bit(pmap); 1253 pdpe = pmap_pdpe(pmap, va); 1254 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1255 return (NULL); 1256 return (pmap_pdpe_to_pde(pdpe, va)); 1257 } 1258 1259 /* Return a pointer to the PT slot that corresponds to a VA */ 1260 static __inline pt_entry_t * 1261 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1262 { 1263 pt_entry_t *pte; 1264 1265 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1266 return (&pte[pmap_pte_index(va)]); 1267 } 1268 1269 /* Return a pointer to the PT slot that corresponds to a VA */ 1270 static __inline pt_entry_t * 1271 pmap_pte(pmap_t pmap, vm_offset_t va) 1272 { 1273 pd_entry_t *pde; 1274 pt_entry_t PG_V; 1275 1276 PG_V = pmap_valid_bit(pmap); 1277 pde = pmap_pde(pmap, va); 1278 if (pde == NULL || (*pde & PG_V) == 0) 1279 return (NULL); 1280 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1281 return ((pt_entry_t *)pde); 1282 
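	/* Not a 2MB superpage; descend to the 4KB PTE via the direct map. */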
return (pmap_pde_to_pte(pde, va)); 1283 } 1284 1285 static __inline void 1286 pmap_resident_count_inc(pmap_t pmap, int count) 1287 { 1288 1289 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1290 pmap->pm_stats.resident_count += count; 1291 } 1292 1293 static __inline void 1294 pmap_resident_count_dec(pmap_t pmap, int count) 1295 { 1296 1297 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1298 KASSERT(pmap->pm_stats.resident_count >= count, 1299 ("pmap %p resident count underflow %ld %d", pmap, 1300 pmap->pm_stats.resident_count, count)); 1301 pmap->pm_stats.resident_count -= count; 1302 } 1303 1304 PMAP_INLINE pt_entry_t * 1305 vtopte(vm_offset_t va) 1306 { 1307 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 1308 1309 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 1310 1311 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 1312 } 1313 1314 static __inline pd_entry_t * 1315 vtopde(vm_offset_t va) 1316 { 1317 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 1318 1319 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1320 1321 return (PDmap + ((va >> PDRSHIFT) & mask)); 1322 } 1323 1324 static u_int64_t 1325 allocpages(vm_paddr_t *firstaddr, int n) 1326 { 1327 u_int64_t ret; 1328 1329 ret = *firstaddr; 1330 bzero((void *)ret, n * PAGE_SIZE); 1331 *firstaddr += n * PAGE_SIZE; 1332 return (ret); 1333 } 1334 1335 CTASSERT(powerof2(NDMPML4E)); 1336 1337 /* number of kernel PDP slots */ 1338 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1339 1340 static void 1341 nkpt_init(vm_paddr_t addr) 1342 { 1343 int pt_pages; 1344 1345 #ifdef NKPT 1346 pt_pages = NKPT; 1347 #else 1348 pt_pages = howmany(addr, 1 << PDRSHIFT); 1349 pt_pages += NKPDPE(pt_pages); 1350 1351 /* 1352 * Add some slop beyond the bare minimum required for bootstrapping 1353 * the kernel. 1354 * 1355 * This is quite important when allocating KVA for kernel modules. 1356 * The modules are required to be linked in the negative 2GB of 1357 * the address space. If we run out of KVA in this region then 1358 * pmap_growkernel() will need to allocate page table pages to map 1359 * the entire 512GB of KVA space which is an unnecessary tax on 1360 * physical memory. 1361 * 1362 * Secondly, device memory mapped as part of setting up the low- 1363 * level console(s) is taken from KVA, starting at virtual_avail. 1364 * This is because cninit() is called after pmap_bootstrap() but 1365 * before vm_init() and pmap_init(). 20MB for a frame buffer is 1366 * not uncommon. 1367 */ 1368 pt_pages += 32; /* 64MB additional slop. */ 1369 #endif 1370 nkpt = pt_pages; 1371 } 1372 1373 /* 1374 * Returns the proper write/execute permission for a physical page that is 1375 * part of the initial boot allocations. 1376 * 1377 * If the page has kernel text, it is marked as read-only. If the page has 1378 * kernel read-only data, it is marked as read-only/not-executable. If the 1379 * page has only read-write data, it is marked as read-write/not-executable. 1380 * If the page is below/above the kernel range, it is marked as read-write. 1381 * 1382 * This function operates on 2M pages, since we map the kernel space that 1383 * way. 1384 * 1385 * Note that this doesn't currently provide any protection for modules. 1386 */ 1387 static inline pt_entry_t 1388 bootaddr_rwx(vm_paddr_t pa) 1389 { 1390 1391 /* 1392 * Everything in the same 2M page as the start of the kernel 1393 * should be static. 
On the other hand, things in the same 2M 1394 * page as the end of the kernel could be read-write/executable, 1395 * as the kernel image is not guaranteed to end on a 2M boundary. 1396 */ 1397 if (pa < trunc_2mpage(btext - KERNBASE) || 1398 pa >= trunc_2mpage(_end - KERNBASE)) 1399 return (X86_PG_RW); 1400 /* 1401 * The linker should ensure that the read-only and read-write 1402 * portions don't share the same 2M page, so this shouldn't 1403 * impact read-only data. However, in any case, any page with 1404 * read-write data needs to be read-write. 1405 */ 1406 if (pa >= trunc_2mpage(brwsection - KERNBASE)) 1407 return (X86_PG_RW | pg_nx); 1408 /* 1409 * Mark any 2M page containing kernel text as read-only. Mark 1410 * other pages with read-only data as read-only and not executable. 1411 * (It is likely a small portion of the read-only data section will 1412 * be marked as read-only, but executable. This should be acceptable 1413 * since the read-only protection will keep the data from changing.) 1414 * Note that fixups to the .text section will still work until we 1415 * set CR0.WP. 1416 */ 1417 if (pa < round_2mpage(etext - KERNBASE)) 1418 return (0); 1419 return (pg_nx); 1420 } 1421 1422 static void 1423 create_pagetables(vm_paddr_t *firstaddr) 1424 { 1425 int i, j, ndm1g, nkpdpe, nkdmpde; 1426 pd_entry_t *pd_p; 1427 pdp_entry_t *pdp_p; 1428 pml4_entry_t *p4_p; 1429 uint64_t DMPDkernphys; 1430 1431 /* Allocate page table pages for the direct map */ 1432 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1433 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1434 ndmpdp = 4; 1435 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1436 if (ndmpdpphys > NDMPML4E) { 1437 /* 1438 * Each NDMPML4E allows 512 GB, so limit to that, 1439 * and then readjust ndmpdp and ndmpdpphys. 1440 */ 1441 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1442 Maxmem = atop(NDMPML4E * NBPML4); 1443 ndmpdpphys = NDMPML4E; 1444 ndmpdp = NDMPML4E * NPDEPG; 1445 } 1446 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1447 ndm1g = 0; 1448 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1449 /* 1450 * Calculate the number of 1G pages that will fully fit in 1451 * Maxmem. 1452 */ 1453 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1454 1455 /* 1456 * Allocate 2M pages for the kernel. These will be used in 1457 * place of the first one or more 1G pages from ndm1g. 1458 */ 1459 nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); 1460 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1461 } 1462 if (ndm1g < ndmpdp) 1463 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1464 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1465 1466 /* Allocate pages */ 1467 KPML4phys = allocpages(firstaddr, 1); 1468 KPDPphys = allocpages(firstaddr, NKPML4E); 1469 1470 /* 1471 * Allocate the initial number of kernel page table pages required to 1472 * bootstrap. We defer this until after all memory-size dependent 1473 * allocations are done (e.g. direct map), so that we don't have to 1474 * build in too much slop in our estimate. 1475 * 1476 * Note that when NKPML4E > 1, we have an empty page underneath 1477 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1478 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 1479 */ 1480 nkpt_init(*firstaddr); 1481 nkpdpe = NKPDPE(nkpt); 1482 1483 KPTphys = allocpages(firstaddr, nkpt); 1484 KPDphys = allocpages(firstaddr, nkpdpe); 1485 1486 /* 1487 * Connect the zero-filled PT pages to their PD entries. This 1488 * implicitly maps the PT pages at their correct locations within 1489 * the PTmap. 
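 * Each PD entry written by the loop below points at the physical page
 * KPTphys + ptoa(i) and is marked valid and writable.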
1490 */ 1491 pd_p = (pd_entry_t *)KPDphys; 1492 for (i = 0; i < nkpt; i++) 1493 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1494 1495 /* 1496 * Map from physical address zero to the end of loader preallocated 1497 * memory using 2MB pages. This replaces some of the PD entries 1498 * created above. 1499 */ 1500 for (i = 0; (i << PDRSHIFT) < KERNend; i++) 1501 /* Preset PG_M and PG_A because demotion expects it. */ 1502 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | 1503 X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); 1504 1505 /* 1506 * Because we map the physical blocks in 2M pages, adjust firstaddr 1507 * to record the physical blocks we've actually mapped into kernel 1508 * virtual address space. 1509 */ 1510 if (*firstaddr < round_2mpage(KERNend)) 1511 *firstaddr = round_2mpage(KERNend); 1512 1513 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1514 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1515 for (i = 0; i < nkpdpe; i++) 1516 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1517 1518 /* 1519 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1520 * the end of physical memory is not aligned to a 1GB page boundary, 1521 * then the residual physical memory is mapped with 2MB pages. Later, 1522 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1523 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1524 * that are partially used. 1525 */ 1526 pd_p = (pd_entry_t *)DMPDphys; 1527 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1528 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1529 /* Preset PG_M and PG_A because demotion expects it. */ 1530 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1531 X86_PG_M | X86_PG_A | pg_nx; 1532 } 1533 pdp_p = (pdp_entry_t *)DMPDPphys; 1534 for (i = 0; i < ndm1g; i++) { 1535 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1536 /* Preset PG_M and PG_A because demotion expects it. */ 1537 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1538 X86_PG_M | X86_PG_A | pg_nx; 1539 } 1540 for (j = 0; i < ndmpdp; i++, j++) { 1541 pdp_p[i] = DMPDphys + ptoa(j); 1542 pdp_p[i] |= X86_PG_RW | X86_PG_V; 1543 } 1544 1545 /* 1546 * Instead of using a 1G page for the memory containing the kernel, 1547 * use 2M pages with appropriate permissions. (If using 1G pages, 1548 * this will partially overwrite the PDPEs above.) 1549 */ 1550 if (ndm1g) { 1551 pd_p = (pd_entry_t *)DMPDkernphys; 1552 for (i = 0; i < (NPDEPG * nkdmpde); i++) 1553 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | 1554 X86_PG_M | X86_PG_A | pg_nx | 1555 bootaddr_rwx(i << PDRSHIFT); 1556 for (i = 0; i < nkdmpde; i++) 1557 pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | 1558 X86_PG_V; 1559 } 1560 1561 /* And recursively map PML4 to itself in order to get PTmap */ 1562 p4_p = (pml4_entry_t *)KPML4phys; 1563 p4_p[PML4PML4I] = KPML4phys; 1564 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1565 1566 /* Connect the Direct Map slot(s) up to the PML4. */ 1567 for (i = 0; i < ndmpdpphys; i++) { 1568 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1569 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; 1570 } 1571 1572 /* Connect the KVA slots up to the PML4 */ 1573 for (i = 0; i < NKPML4E; i++) { 1574 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1575 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1576 } 1577 } 1578 1579 /* 1580 * Bootstrap the system enough to run with virtual memory. 
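 *
 * The major steps below are: build the bootstrap page tables
 * (create_pagetables()), switch CR3 to them and enable PG_G, SMEP and
 * SMAP as supported, initialize the statically allocated kernel_pmap,
 * reserve the crashdump/CMAP1 and per-CPU SYSMAP virtual addresses,
 * program the PAT MSR, and enable PCID when enabled and available.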
1581 * 1582 * On amd64 this is called after mapping has already been enabled 1583 * and just syncs the pmap module with what has already been done. 1584 * [We can't call it easily with mapping off since the kernel is not 1585 * mapped with PA == VA, hence we would have to relocate every address 1586 * from the linked base (virtual) address "KERNBASE" to the actual 1587 * (physical) address starting relative to 0] 1588 */ 1589 void 1590 pmap_bootstrap(vm_paddr_t *firstaddr) 1591 { 1592 vm_offset_t va; 1593 pt_entry_t *pte, *pcpu_pte; 1594 uint64_t cr4, pcpu_phys; 1595 u_long res; 1596 int i; 1597 1598 KERNend = *firstaddr; 1599 res = atop(KERNend - (vm_paddr_t)kernphys); 1600 1601 if (!pti) 1602 pg_g = X86_PG_G; 1603 1604 /* 1605 * Create an initial set of page tables to run the kernel in. 1606 */ 1607 create_pagetables(firstaddr); 1608 1609 pcpu_phys = allocpages(firstaddr, MAXCPU); 1610 1611 /* 1612 * Add a physical memory segment (vm_phys_seg) corresponding to the 1613 * preallocated kernel page table pages so that vm_page structures 1614 * representing these pages will be created. The vm_page structures 1615 * are required for promotion of the corresponding kernel virtual 1616 * addresses to superpage mappings. 1617 */ 1618 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1619 1620 /* 1621 * Account for the virtual addresses mapped by create_pagetables(). 1622 */ 1623 virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); 1624 virtual_end = VM_MAX_KERNEL_ADDRESS; 1625 1626 /* 1627 * Enable PG_G global pages, then switch to the kernel page 1628 * table from the bootstrap page table. After the switch, it 1629 * is possible to enable SMEP and SMAP since PG_U bits are 1630 * correct now. 1631 */ 1632 cr4 = rcr4(); 1633 cr4 |= CR4_PGE; 1634 load_cr4(cr4); 1635 load_cr3(KPML4phys); 1636 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1637 cr4 |= CR4_SMEP; 1638 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1639 cr4 |= CR4_SMAP; 1640 load_cr4(cr4); 1641 1642 /* 1643 * Initialize the kernel pmap (which is statically allocated). 1644 * Count bootstrap data as being resident in case any of this data is 1645 * later unmapped (using pmap_remove()) and freed. 1646 */ 1647 PMAP_LOCK_INIT(kernel_pmap); 1648 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); 1649 kernel_pmap->pm_cr3 = KPML4phys; 1650 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1651 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1652 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1653 kernel_pmap->pm_stats.resident_count = res; 1654 kernel_pmap->pm_flags = pmap_flags; 1655 1656 /* 1657 * Initialize the TLB invalidations generation number lock. 1658 */ 1659 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1660 1661 /* 1662 * Reserve some special page table entries/VA space for temporary 1663 * mapping of pages. 1664 */ 1665 #define SYSMAP(c, p, v, n) \ 1666 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1667 1668 va = virtual_avail; 1669 pte = vtopte(va); 1670 1671 /* 1672 * Crashdump maps. The first page is reused as CMAP1 for the 1673 * memory test. 
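 * For reference, SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) below
 * expands (per the macro defined above) to: record the current va in
 * crashdumpmap, advance va by MAXDUMPPGS pages, point CMAP1 at the
 * first of the corresponding PTEs, and advance pte past them.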
1674 */ 1675 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 1676 CADDR1 = crashdumpmap; 1677 1678 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 1679 virtual_avail = va; 1680 1681 for (i = 0; i < MAXCPU; i++) { 1682 pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | 1683 pg_g | pg_nx | X86_PG_M | X86_PG_A; 1684 } 1685 STAILQ_INIT(&cpuhead); 1686 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 1687 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 1688 amd64_bsp_pcpu_init1(&__pcpu[0]); 1689 amd64_bsp_ist_init(&__pcpu[0]); 1690 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 1691 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 1692 1693 /* 1694 * Initialize the PAT MSR. 1695 * pmap_init_pat() clears and sets CR4_PGE, which, as a 1696 * side-effect, invalidates stale PG_G TLB entries that might 1697 * have been created in our pre-boot environment. 1698 */ 1699 pmap_init_pat(); 1700 1701 /* Initialize TLB Context Id. */ 1702 if (pmap_pcid_enabled) { 1703 for (i = 0; i < MAXCPU; i++) { 1704 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 1705 kernel_pmap->pm_pcids[i].pm_gen = 1; 1706 } 1707 1708 /* 1709 * PMAP_PCID_KERN + 1 is used for initialization of 1710 * proc0 pmap. The pmap' pcid state might be used by 1711 * EFIRT entry before first context switch, so it 1712 * needs to be valid. 1713 */ 1714 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 1715 PCPU_SET(pcid_gen, 1); 1716 1717 /* 1718 * pcpu area for APs is zeroed during AP startup. 1719 * pc_pcid_next and pc_pcid_gen are initialized by AP 1720 * during pcpu setup. 1721 */ 1722 load_cr4(rcr4() | CR4_PCIDE); 1723 } 1724 } 1725 1726 /* 1727 * Setup the PAT MSR. 1728 */ 1729 void 1730 pmap_init_pat(void) 1731 { 1732 uint64_t pat_msr; 1733 u_long cr0, cr4; 1734 int i; 1735 1736 /* Bail if this CPU doesn't implement PAT. */ 1737 if ((cpu_feature & CPUID_PAT) == 0) 1738 panic("no PAT??"); 1739 1740 /* Set default PAT index table. */ 1741 for (i = 0; i < PAT_INDEX_SIZE; i++) 1742 pat_index[i] = -1; 1743 pat_index[PAT_WRITE_BACK] = 0; 1744 pat_index[PAT_WRITE_THROUGH] = 1; 1745 pat_index[PAT_UNCACHEABLE] = 3; 1746 pat_index[PAT_WRITE_COMBINING] = 6; 1747 pat_index[PAT_WRITE_PROTECTED] = 5; 1748 pat_index[PAT_UNCACHED] = 2; 1749 1750 /* 1751 * Initialize default PAT entries. 1752 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 1753 * Program 5 and 6 as WP and WC. 1754 * 1755 * Leave 4 and 7 as WB and UC. Note that a recursive page table 1756 * mapping for a 2M page uses a PAT value with the bit 3 set due 1757 * to its overload with PG_PS. 1758 */ 1759 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 1760 PAT_VALUE(1, PAT_WRITE_THROUGH) | 1761 PAT_VALUE(2, PAT_UNCACHED) | 1762 PAT_VALUE(3, PAT_UNCACHEABLE) | 1763 PAT_VALUE(4, PAT_WRITE_BACK) | 1764 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 1765 PAT_VALUE(6, PAT_WRITE_COMBINING) | 1766 PAT_VALUE(7, PAT_UNCACHEABLE); 1767 1768 /* Disable PGE. */ 1769 cr4 = rcr4(); 1770 load_cr4(cr4 & ~CR4_PGE); 1771 1772 /* Disable caches (CD = 1, NW = 0). */ 1773 cr0 = rcr0(); 1774 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1775 1776 /* Flushes caches and TLBs. */ 1777 wbinvd(); 1778 invltlb(); 1779 1780 /* Update PAT and index table. */ 1781 wrmsr(MSR_PAT, pat_msr); 1782 1783 /* Flush caches and TLBs again. */ 1784 wbinvd(); 1785 invltlb(); 1786 1787 /* Restore caches and PGE. */ 1788 load_cr0(cr0); 1789 load_cr4(cr4); 1790 } 1791 1792 /* 1793 * Initialize a vm_page's machine-dependent fields. 
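 * On amd64 this amounts to an empty PV list and the default write-back
 * (PAT_WRITE_BACK) cache attribute.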
1794 */ 1795 void 1796 pmap_page_init(vm_page_t m) 1797 { 1798 1799 TAILQ_INIT(&m->md.pv_list); 1800 m->md.pat_mode = PAT_WRITE_BACK; 1801 } 1802 1803 /* 1804 * Initialize the pmap module. 1805 * Called by vm_init, to initialize any structures that the pmap 1806 * system needs to map virtual memory. 1807 */ 1808 void 1809 pmap_init(void) 1810 { 1811 struct pmap_preinit_mapping *ppim; 1812 vm_page_t m, mpte; 1813 vm_size_t s; 1814 int error, i, pv_npg, ret, skz63; 1815 1816 /* L1TF, reserve page @0 unconditionally */ 1817 vm_page_blacklist_add(0, bootverbose); 1818 1819 /* Detect bare-metal Skylake Server and Skylake-X. */ 1820 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 1821 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 1822 /* 1823 * Skylake-X errata SKZ63. Processor May Hang When 1824 * Executing Code In an HLE Transaction Region between 1825 * 40000000H and 403FFFFFH. 1826 * 1827 * Mark the pages in the range as preallocated. It 1828 * seems to be impossible to distinguish between 1829 * Skylake Server and Skylake X. 1830 */ 1831 skz63 = 1; 1832 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 1833 if (skz63 != 0) { 1834 if (bootverbose) 1835 printf("SKZ63: skipping 4M RAM starting " 1836 "at physical 1G\n"); 1837 for (i = 0; i < atop(0x400000); i++) { 1838 ret = vm_page_blacklist_add(0x40000000 + 1839 ptoa(i), FALSE); 1840 if (!ret && bootverbose) 1841 printf("page at %#lx already used\n", 1842 0x40000000 + ptoa(i)); 1843 } 1844 } 1845 } 1846 1847 /* 1848 * Initialize the vm page array entries for the kernel pmap's 1849 * page table pages. 1850 */ 1851 PMAP_LOCK(kernel_pmap); 1852 for (i = 0; i < nkpt; i++) { 1853 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 1854 KASSERT(mpte >= vm_page_array && 1855 mpte < &vm_page_array[vm_page_array_size], 1856 ("pmap_init: page table page is out of range")); 1857 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 1858 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 1859 mpte->wire_count = 1; 1860 1861 /* 1862 * Collect the page table pages that were replaced by a 2MB 1863 * page in create_pagetables(). They are zero filled. 1864 */ 1865 if (i << PDRSHIFT < KERNend && 1866 pmap_insert_pt_page(kernel_pmap, mpte, false)) 1867 panic("pmap_init: pmap_insert_pt_page failed"); 1868 } 1869 PMAP_UNLOCK(kernel_pmap); 1870 vm_wire_add(nkpt); 1871 1872 /* 1873 * If the kernel is running on a virtual machine, then it must assume 1874 * that MCA is enabled by the hypervisor. Moreover, the kernel must 1875 * be prepared for the hypervisor changing the vendor and family that 1876 * are reported by CPUID. Consequently, the workaround for AMD Family 1877 * 10h Erratum 383 is enabled if the processor's feature set does not 1878 * include at least one feature that is only supported by older Intel 1879 * or newer AMD processors. 1880 */ 1881 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 1882 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 1883 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 1884 AMDID2_FMA4)) == 0) 1885 workaround_erratum383 = 1; 1886 1887 /* 1888 * Are large page mappings enabled? 1889 */ 1890 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 1891 if (pg_ps_enabled) { 1892 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1893 ("pmap_init: can't assign to pagesizes[1]")); 1894 pagesizes[1] = NBPDR; 1895 } 1896 1897 /* 1898 * Initialize the pv chunk list mutex. 
1899 */ 1900 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 1901 1902 /* 1903 * Initialize the pool of pv list locks. 1904 */ 1905 for (i = 0; i < NPV_LIST_LOCKS; i++) 1906 rw_init(&pv_list_locks[i], "pmap pv list"); 1907 1908 /* 1909 * Calculate the size of the pv head table for superpages. 1910 */ 1911 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 1912 1913 /* 1914 * Allocate memory for the pv head table for superpages. 1915 */ 1916 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1917 s = round_page(s); 1918 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 1919 for (i = 0; i < pv_npg; i++) 1920 TAILQ_INIT(&pv_table[i].pv_list); 1921 TAILQ_INIT(&pv_dummy.pv_list); 1922 1923 pmap_initialized = 1; 1924 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 1925 ppim = pmap_preinit_mapping + i; 1926 if (ppim->va == 0) 1927 continue; 1928 /* Make the direct map consistent */ 1929 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 1930 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 1931 ppim->sz, ppim->mode); 1932 } 1933 if (!bootverbose) 1934 continue; 1935 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 1936 ppim->pa, ppim->va, ppim->sz, ppim->mode); 1937 } 1938 1939 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 1940 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 1941 (vmem_addr_t *)&qframe); 1942 if (error != 0) 1943 panic("qframe allocation failed"); 1944 1945 lm_ents = 8; 1946 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 1947 if (lm_ents > LMEPML4I - LMSPML4I + 1) 1948 lm_ents = LMEPML4I - LMSPML4I + 1; 1949 if (bootverbose) 1950 printf("pmap: large map %u PML4 slots (%lu Gb)\n", 1951 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 1952 if (lm_ents != 0) { 1953 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 1954 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 1955 if (large_vmem == NULL) { 1956 printf("pmap: cannot create large map\n"); 1957 lm_ents = 0; 1958 } 1959 for (i = 0; i < lm_ents; i++) { 1960 m = pmap_large_map_getptp_unlocked(); 1961 kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | 1962 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 1963 VM_PAGE_TO_PHYS(m); 1964 } 1965 } 1966 } 1967 1968 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 1969 "2MB page mapping counters"); 1970 1971 static u_long pmap_pde_demotions; 1972 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 1973 &pmap_pde_demotions, 0, "2MB page demotions"); 1974 1975 static u_long pmap_pde_mappings; 1976 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 1977 &pmap_pde_mappings, 0, "2MB page mappings"); 1978 1979 static u_long pmap_pde_p_failures; 1980 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 1981 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 1982 1983 static u_long pmap_pde_promotions; 1984 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 1985 &pmap_pde_promotions, 0, "2MB page promotions"); 1986 1987 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, 1988 "1GB page mapping counters"); 1989 1990 static u_long pmap_pdpe_demotions; 1991 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 1992 &pmap_pdpe_demotions, 0, "1GB page demotions"); 1993 1994 /*************************************************** 1995 * Low level helper routines..... 
1996 ***************************************************/ 1997 1998 static pt_entry_t 1999 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2000 { 2001 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2002 2003 switch (pmap->pm_type) { 2004 case PT_X86: 2005 case PT_RVI: 2006 /* Verify that both PAT bits are not set at the same time */ 2007 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2008 ("Invalid PAT bits in entry %#lx", entry)); 2009 2010 /* Swap the PAT bits if one of them is set */ 2011 if ((entry & x86_pat_bits) != 0) 2012 entry ^= x86_pat_bits; 2013 break; 2014 case PT_EPT: 2015 /* 2016 * Nothing to do - the memory attributes are represented 2017 * the same way for regular pages and superpages. 2018 */ 2019 break; 2020 default: 2021 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2022 } 2023 2024 return (entry); 2025 } 2026 2027 boolean_t 2028 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2029 { 2030 2031 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2032 pat_index[(int)mode] >= 0); 2033 } 2034 2035 /* 2036 * Determine the appropriate bits to set in a PTE or PDE for a specified 2037 * caching mode. 2038 */ 2039 int 2040 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 2041 { 2042 int cache_bits, pat_flag, pat_idx; 2043 2044 if (!pmap_is_valid_memattr(pmap, mode)) 2045 panic("Unknown caching mode %d\n", mode); 2046 2047 switch (pmap->pm_type) { 2048 case PT_X86: 2049 case PT_RVI: 2050 /* The PAT bit is different for PTE's and PDE's. */ 2051 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2052 2053 /* Map the caching mode to a PAT index. */ 2054 pat_idx = pat_index[mode]; 2055 2056 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2057 cache_bits = 0; 2058 if (pat_idx & 0x4) 2059 cache_bits |= pat_flag; 2060 if (pat_idx & 0x2) 2061 cache_bits |= PG_NC_PCD; 2062 if (pat_idx & 0x1) 2063 cache_bits |= PG_NC_PWT; 2064 break; 2065 2066 case PT_EPT: 2067 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2068 break; 2069 2070 default: 2071 panic("unsupported pmap type %d", pmap->pm_type); 2072 } 2073 2074 return (cache_bits); 2075 } 2076 2077 static int 2078 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 2079 { 2080 int mask; 2081 2082 switch (pmap->pm_type) { 2083 case PT_X86: 2084 case PT_RVI: 2085 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2086 break; 2087 case PT_EPT: 2088 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2089 break; 2090 default: 2091 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2092 } 2093 2094 return (mask); 2095 } 2096 2097 static int 2098 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2099 { 2100 int pat_flag, pat_idx; 2101 2102 pat_idx = 0; 2103 switch (pmap->pm_type) { 2104 case PT_X86: 2105 case PT_RVI: 2106 /* The PAT bit is different for PTE's and PDE's. */ 2107 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2108 2109 if ((pte & pat_flag) != 0) 2110 pat_idx |= 0x4; 2111 if ((pte & PG_NC_PCD) != 0) 2112 pat_idx |= 0x2; 2113 if ((pte & PG_NC_PWT) != 0) 2114 pat_idx |= 0x1; 2115 break; 2116 case PT_EPT: 2117 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2118 panic("EPT PTE %#lx has no PAT memory type", pte); 2119 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2120 break; 2121 } 2122 2123 /* See pmap_init_pat(). 
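 * Entries 4 and 7 are programmed there with the same memory types as
 * entries 0 (WB) and 3 (UC), so fold them back onto the canonical
 * indices before returning.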
*/ 2124 if (pat_idx == 4) 2125 pat_idx = 0; 2126 if (pat_idx == 7) 2127 pat_idx = 3; 2128 2129 return (pat_idx); 2130 } 2131 2132 bool 2133 pmap_ps_enabled(pmap_t pmap) 2134 { 2135 2136 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2137 } 2138 2139 static void 2140 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2141 { 2142 2143 switch (pmap->pm_type) { 2144 case PT_X86: 2145 break; 2146 case PT_RVI: 2147 case PT_EPT: 2148 /* 2149 * XXX 2150 * This is a little bogus since the generation number is 2151 * supposed to be bumped up when a region of the address 2152 * space is invalidated in the page tables. 2153 * 2154 * In this case the old PDE entry is valid but yet we want 2155 * to make sure that any mappings using the old entry are 2156 * invalidated in the TLB. 2157 * 2158 * The reason this works as expected is because we rendezvous 2159 * "all" host cpus and force any vcpu context to exit as a 2160 * side-effect. 2161 */ 2162 atomic_add_acq_long(&pmap->pm_eptgen, 1); 2163 break; 2164 default: 2165 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2166 } 2167 pde_store(pde, newpde); 2168 } 2169 2170 /* 2171 * After changing the page size for the specified virtual address in the page 2172 * table, flush the corresponding entries from the processor's TLB. Only the 2173 * calling processor's TLB is affected. 2174 * 2175 * The calling thread must be pinned to a processor. 2176 */ 2177 static void 2178 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2179 { 2180 pt_entry_t PG_G; 2181 2182 if (pmap_type_guest(pmap)) 2183 return; 2184 2185 KASSERT(pmap->pm_type == PT_X86, 2186 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2187 2188 PG_G = pmap_global_bit(pmap); 2189 2190 if ((newpde & PG_PS) == 0) 2191 /* Demotion: flush a specific 2MB page mapping. */ 2192 invlpg(va); 2193 else if ((newpde & PG_G) == 0) 2194 /* 2195 * Promotion: flush every 4KB page mapping from the TLB 2196 * because there are too many to flush individually. 2197 */ 2198 invltlb(); 2199 else { 2200 /* 2201 * Promotion: flush every 4KB page mapping from the TLB, 2202 * including any global (PG_G) mappings. 2203 */ 2204 invltlb_glob(); 2205 } 2206 } 2207 #ifdef SMP 2208 2209 /* 2210 * For SMP, these functions have to use the IPI mechanism for coherence. 2211 * 2212 * N.B.: Before calling any of the following TLB invalidation functions, 2213 * the calling processor must ensure that all stores updating a non- 2214 * kernel page table are globally performed. Otherwise, another 2215 * processor could cache an old, pre-update entry without being 2216 * invalidated. This can happen one of two ways: (1) The pmap becomes 2217 * active on another processor after its pm_active field is checked by 2218 * one of the following functions but before a store updating the page 2219 * table is globally performed. (2) The pmap becomes active on another 2220 * processor before its pm_active field is checked but due to 2221 * speculative loads one of the following functions stills reads the 2222 * pmap as inactive on the other processor. 2223 * 2224 * The kernel page table is exempt because its pm_active field is 2225 * immutable. The kernel page table is always active on every 2226 * processor. 2227 */ 2228 2229 /* 2230 * Interrupt the cpus that are executing in the guest context. 2231 * This will force the vcpu to exit and the cached EPT mappings 2232 * will be invalidated by the host before the next vmresume. 
2233 */ 2234 static __inline void 2235 pmap_invalidate_ept(pmap_t pmap) 2236 { 2237 int ipinum; 2238 2239 sched_pin(); 2240 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 2241 ("pmap_invalidate_ept: absurd pm_active")); 2242 2243 /* 2244 * The TLB mappings associated with a vcpu context are not 2245 * flushed each time a different vcpu is chosen to execute. 2246 * 2247 * This is in contrast with a process's vtop mappings that 2248 * are flushed from the TLB on each context switch. 2249 * 2250 * Therefore we need to do more than just a TLB shootdown on 2251 * the active cpus in 'pmap->pm_active'. To do this we keep 2252 * track of the number of invalidations performed on this pmap. 2253 * 2254 * Each vcpu keeps a cache of this counter and compares it 2255 * just before a vmresume. If the counter is out-of-date an 2256 * invept will be done to flush stale mappings from the TLB. 2257 */ 2258 atomic_add_acq_long(&pmap->pm_eptgen, 1); 2259 2260 /* 2261 * Force the vcpu to exit and trap back into the hypervisor. 2262 */ 2263 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 2264 ipi_selected(pmap->pm_active, ipinum); 2265 sched_unpin(); 2266 } 2267 2268 static cpuset_t 2269 pmap_invalidate_cpu_mask(pmap_t pmap) 2270 { 2271 2272 return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); 2273 } 2274 2275 static inline void 2276 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, 2277 const bool invpcid_works1) 2278 { 2279 struct invpcid_descr d; 2280 uint64_t kcr3, ucr3; 2281 uint32_t pcid; 2282 u_int cpuid, i; 2283 2284 cpuid = PCPU_GET(cpuid); 2285 if (pmap == PCPU_GET(curpmap)) { 2286 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2287 /* 2288 * Because pm_pcid is recalculated on a 2289 * context switch, we must disable switching. 2290 * Otherwise, we might use a stale value 2291 * below. 2292 */ 2293 critical_enter(); 2294 pcid = pmap->pm_pcids[cpuid].pm_pcid; 2295 if (invpcid_works1) { 2296 d.pcid = pcid | PMAP_PCID_USER_PT; 2297 d.pad = 0; 2298 d.addr = va; 2299 invpcid(&d, INVPCID_ADDR); 2300 } else { 2301 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 2302 ucr3 = pmap->pm_ucr3 | pcid | 2303 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2304 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 2305 } 2306 critical_exit(); 2307 } 2308 } else 2309 pmap->pm_pcids[cpuid].pm_gen = 0; 2310 2311 CPU_FOREACH(i) { 2312 if (cpuid != i) 2313 pmap->pm_pcids[i].pm_gen = 0; 2314 } 2315 2316 /* 2317 * The fence is between stores to pm_gen and the read of the 2318 * pm_active mask. We need to ensure that it is impossible 2319 * for us to miss the bit update in pm_active and 2320 * simultaneously observe a non-zero pm_gen in 2321 * pmap_activate_sw(), otherwise TLB update is missed. 2322 * Without the fence, IA32 allows such an outcome. Note that 2323 * pm_active is updated by a locked operation, which provides 2324 * the reciprocal fence. 2325 */ 2326 atomic_thread_fence_seq_cst(); 2327 } 2328 2329 static void 2330 pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) 2331 { 2332 2333 pmap_invalidate_page_pcid(pmap, va, true); 2334 } 2335 2336 static void 2337 pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) 2338 { 2339 2340 pmap_invalidate_page_pcid(pmap, va, false); 2341 } 2342 2343 static void 2344 pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) 2345 { 2346 } 2347 2348 DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) 2349 { 2350 2351 if (pmap_pcid_enabled) 2352 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid : 2353 pmap_invalidate_page_pcid_noinvpcid); 2354 return (pmap_invalidate_page_nopcid); 2355 } 2356 2357 void 2358 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 2359 { 2360 2361 if (pmap_type_guest(pmap)) { 2362 pmap_invalidate_ept(pmap); 2363 return; 2364 } 2365 2366 KASSERT(pmap->pm_type == PT_X86, 2367 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 2368 2369 sched_pin(); 2370 if (pmap == kernel_pmap) { 2371 invlpg(va); 2372 } else { 2373 if (pmap == PCPU_GET(curpmap)) 2374 invlpg(va); 2375 pmap_invalidate_page_mode(pmap, va); 2376 } 2377 smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); 2378 sched_unpin(); 2379 } 2380 2381 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 2382 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 2383 2384 static void 2385 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2386 const bool invpcid_works1) 2387 { 2388 struct invpcid_descr d; 2389 uint64_t kcr3, ucr3; 2390 uint32_t pcid; 2391 u_int cpuid, i; 2392 2393 cpuid = PCPU_GET(cpuid); 2394 if (pmap == PCPU_GET(curpmap)) { 2395 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2396 critical_enter(); 2397 pcid = pmap->pm_pcids[cpuid].pm_pcid; 2398 if (invpcid_works1) { 2399 d.pcid = pcid | PMAP_PCID_USER_PT; 2400 d.pad = 0; 2401 d.addr = sva; 2402 for (; d.addr < eva; d.addr += PAGE_SIZE) 2403 invpcid(&d, INVPCID_ADDR); 2404 } else { 2405 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 2406 ucr3 = pmap->pm_ucr3 | pcid | 2407 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2408 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 2409 } 2410 critical_exit(); 2411 } 2412 } else 2413 pmap->pm_pcids[cpuid].pm_gen = 0; 2414 2415 CPU_FOREACH(i) { 2416 if (cpuid != i) 2417 pmap->pm_pcids[i].pm_gen = 0; 2418 } 2419 /* See the comment in pmap_invalidate_page_pcid(). */ 2420 atomic_thread_fence_seq_cst(); 2421 } 2422 2423 static void 2424 pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, 2425 vm_offset_t eva) 2426 { 2427 2428 pmap_invalidate_range_pcid(pmap, sva, eva, true); 2429 } 2430 2431 static void 2432 pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, 2433 vm_offset_t eva) 2434 { 2435 2436 pmap_invalidate_range_pcid(pmap, sva, eva, false); 2437 } 2438 2439 static void 2440 pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2441 { 2442 } 2443 2444 DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, 2445 vm_offset_t)) 2446 { 2447 2448 if (pmap_pcid_enabled) 2449 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid : 2450 pmap_invalidate_range_pcid_noinvpcid); 2451 return (pmap_invalidate_range_nopcid); 2452 } 2453 2454 void 2455 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2456 { 2457 vm_offset_t addr; 2458 2459 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 2460 pmap_invalidate_all(pmap); 2461 return; 2462 } 2463 2464 if (pmap_type_guest(pmap)) { 2465 pmap_invalidate_ept(pmap); 2466 return; 2467 } 2468 2469 KASSERT(pmap->pm_type == PT_X86, 2470 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 2471 2472 sched_pin(); 2473 if (pmap == kernel_pmap) { 2474 for (addr = sva; addr < eva; addr += PAGE_SIZE) 2475 invlpg(addr); 2476 } else { 2477 if (pmap == PCPU_GET(curpmap)) { 2478 for (addr = sva; addr < eva; addr += PAGE_SIZE) 2479 invlpg(addr); 2480 } 2481 pmap_invalidate_range_mode(pmap, sva, eva); 2482 } 2483 smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); 2484 sched_unpin(); 2485 } 2486 2487 static inline void 2488 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) 2489 { 2490 struct invpcid_descr d; 2491 uint64_t kcr3, ucr3; 2492 uint32_t pcid; 2493 u_int cpuid, i; 2494 2495 if (pmap == kernel_pmap) { 2496 if (invpcid_works1) { 2497 bzero(&d, sizeof(d)); 2498 invpcid(&d, INVPCID_CTXGLOB); 2499 } else { 2500 invltlb_glob(); 2501 } 2502 } else { 2503 cpuid = PCPU_GET(cpuid); 2504 if (pmap == PCPU_GET(curpmap)) { 2505 critical_enter(); 2506 pcid = pmap->pm_pcids[cpuid].pm_pcid; 2507 if (invpcid_works1) { 2508 d.pcid = pcid; 2509 d.pad = 0; 2510 d.addr = 0; 2511 invpcid(&d, INVPCID_CTX); 2512 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2513 d.pcid |= PMAP_PCID_USER_PT; 2514 invpcid(&d, INVPCID_CTX); 2515 } 2516 } else { 2517 kcr3 = pmap->pm_cr3 | pcid; 2518 ucr3 = pmap->pm_ucr3; 2519 if (ucr3 != PMAP_NO_CR3) { 2520 ucr3 |= pcid | PMAP_PCID_USER_PT; 2521 pmap_pti_pcid_invalidate(ucr3, kcr3); 2522 } else { 2523 load_cr3(kcr3); 2524 } 2525 } 2526 critical_exit(); 2527 } else 2528 pmap->pm_pcids[cpuid].pm_gen = 0; 2529 CPU_FOREACH(i) { 2530 if (cpuid != i) 2531 pmap->pm_pcids[i].pm_gen = 0; 2532 } 2533 } 2534 /* See the comment in pmap_invalidate_page_pcid(). */ 2535 atomic_thread_fence_seq_cst(); 2536 } 2537 2538 static void 2539 pmap_invalidate_all_pcid_invpcid(pmap_t pmap) 2540 { 2541 2542 pmap_invalidate_all_pcid(pmap, true); 2543 } 2544 2545 static void 2546 pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) 2547 { 2548 2549 pmap_invalidate_all_pcid(pmap, false); 2550 } 2551 2552 static void 2553 pmap_invalidate_all_nopcid(pmap_t pmap) 2554 { 2555 2556 if (pmap == kernel_pmap) 2557 invltlb_glob(); 2558 else if (pmap == PCPU_GET(curpmap)) 2559 invltlb(); 2560 } 2561 2562 DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) 2563 { 2564 2565 if (pmap_pcid_enabled) 2566 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid : 2567 pmap_invalidate_all_pcid_noinvpcid); 2568 return (pmap_invalidate_all_nopcid); 2569 } 2570 2571 void 2572 pmap_invalidate_all(pmap_t pmap) 2573 { 2574 2575 if (pmap_type_guest(pmap)) { 2576 pmap_invalidate_ept(pmap); 2577 return; 2578 } 2579 2580 KASSERT(pmap->pm_type == PT_X86, 2581 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 2582 2583 sched_pin(); 2584 pmap_invalidate_all_mode(pmap); 2585 smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); 2586 sched_unpin(); 2587 } 2588 2589 void 2590 pmap_invalidate_cache(void) 2591 { 2592 2593 sched_pin(); 2594 wbinvd(); 2595 smp_cache_flush(); 2596 sched_unpin(); 2597 } 2598 2599 struct pde_action { 2600 cpuset_t invalidate; /* processors that invalidate their TLB */ 2601 pmap_t pmap; 2602 vm_offset_t va; 2603 pd_entry_t *pde; 2604 pd_entry_t newpde; 2605 u_int store; /* processor that updates the PDE */ 2606 }; 2607 2608 static void 2609 pmap_update_pde_action(void *arg) 2610 { 2611 struct pde_action *act = arg; 2612 2613 if (act->store == PCPU_GET(cpuid)) 2614 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 2615 } 2616 2617 static void 2618 pmap_update_pde_teardown(void *arg) 2619 { 2620 struct pde_action *act = arg; 2621 2622 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 2623 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 2624 } 2625 2626 /* 2627 * Change the page size for the specified virtual address in a way that 2628 * prevents any possibility of the TLB ever having two entries that map the 2629 * same virtual address using different page sizes. This is the recommended 2630 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 2631 * machine check exception for a TLB state that is improperly diagnosed as a 2632 * hardware error. 2633 */ 2634 static void 2635 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 2636 { 2637 struct pde_action act; 2638 cpuset_t active, other_cpus; 2639 u_int cpuid; 2640 2641 sched_pin(); 2642 cpuid = PCPU_GET(cpuid); 2643 other_cpus = all_cpus; 2644 CPU_CLR(cpuid, &other_cpus); 2645 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 2646 active = all_cpus; 2647 else { 2648 active = pmap->pm_active; 2649 } 2650 if (CPU_OVERLAP(&active, &other_cpus)) { 2651 act.store = cpuid; 2652 act.invalidate = active; 2653 act.va = va; 2654 act.pmap = pmap; 2655 act.pde = pde; 2656 act.newpde = newpde; 2657 CPU_SET(cpuid, &active); 2658 smp_rendezvous_cpus(active, 2659 smp_no_rendezvous_barrier, pmap_update_pde_action, 2660 pmap_update_pde_teardown, &act); 2661 } else { 2662 pmap_update_pde_store(pmap, pde, newpde); 2663 if (CPU_ISSET(cpuid, &active)) 2664 pmap_update_pde_invalidate(pmap, va, newpde); 2665 } 2666 sched_unpin(); 2667 } 2668 #else /* !SMP */ 2669 /* 2670 * Normal, non-SMP, invalidation functions. 
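 *
 * These mirror the SMP variants above, but only the local TLB has to
 * be considered.  When PTI and PCIDs are in use, the user page-table
 * copy is still flushed, either with INVPCID or through the
 * pmap_pti_pcid_*() helpers.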
2671 */ 2672 void 2673 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 2674 { 2675 struct invpcid_descr d; 2676 uint64_t kcr3, ucr3; 2677 uint32_t pcid; 2678 2679 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2680 pmap->pm_eptgen++; 2681 return; 2682 } 2683 KASSERT(pmap->pm_type == PT_X86, 2684 ("pmap_invalidate_page: unknown type %d", pmap->pm_type)); 2685 2686 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 2687 invlpg(va); 2688 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 2689 pmap->pm_ucr3 != PMAP_NO_CR3) { 2690 critical_enter(); 2691 pcid = pmap->pm_pcids[0].pm_pcid; 2692 if (invpcid_works) { 2693 d.pcid = pcid | PMAP_PCID_USER_PT; 2694 d.pad = 0; 2695 d.addr = va; 2696 invpcid(&d, INVPCID_ADDR); 2697 } else { 2698 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 2699 ucr3 = pmap->pm_ucr3 | pcid | 2700 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2701 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 2702 } 2703 critical_exit(); 2704 } 2705 } else if (pmap_pcid_enabled) 2706 pmap->pm_pcids[0].pm_gen = 0; 2707 } 2708 2709 void 2710 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2711 { 2712 struct invpcid_descr d; 2713 vm_offset_t addr; 2714 uint64_t kcr3, ucr3; 2715 2716 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2717 pmap->pm_eptgen++; 2718 return; 2719 } 2720 KASSERT(pmap->pm_type == PT_X86, 2721 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 2722 2723 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 2724 for (addr = sva; addr < eva; addr += PAGE_SIZE) 2725 invlpg(addr); 2726 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 2727 pmap->pm_ucr3 != PMAP_NO_CR3) { 2728 critical_enter(); 2729 if (invpcid_works) { 2730 d.pcid = pmap->pm_pcids[0].pm_pcid | 2731 PMAP_PCID_USER_PT; 2732 d.pad = 0; 2733 d.addr = sva; 2734 for (; d.addr < eva; d.addr += PAGE_SIZE) 2735 invpcid(&d, INVPCID_ADDR); 2736 } else { 2737 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 2738 pm_pcid | CR3_PCID_SAVE; 2739 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
2740 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2741 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 2742 } 2743 critical_exit(); 2744 } 2745 } else if (pmap_pcid_enabled) { 2746 pmap->pm_pcids[0].pm_gen = 0; 2747 } 2748 } 2749 2750 void 2751 pmap_invalidate_all(pmap_t pmap) 2752 { 2753 struct invpcid_descr d; 2754 uint64_t kcr3, ucr3; 2755 2756 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2757 pmap->pm_eptgen++; 2758 return; 2759 } 2760 KASSERT(pmap->pm_type == PT_X86, 2761 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 2762 2763 if (pmap == kernel_pmap) { 2764 if (pmap_pcid_enabled && invpcid_works) { 2765 bzero(&d, sizeof(d)); 2766 invpcid(&d, INVPCID_CTXGLOB); 2767 } else { 2768 invltlb_glob(); 2769 } 2770 } else if (pmap == PCPU_GET(curpmap)) { 2771 if (pmap_pcid_enabled) { 2772 critical_enter(); 2773 if (invpcid_works) { 2774 d.pcid = pmap->pm_pcids[0].pm_pcid; 2775 d.pad = 0; 2776 d.addr = 0; 2777 invpcid(&d, INVPCID_CTX); 2778 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2779 d.pcid |= PMAP_PCID_USER_PT; 2780 invpcid(&d, INVPCID_CTX); 2781 } 2782 } else { 2783 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 2784 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2785 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 2786 0].pm_pcid | PMAP_PCID_USER_PT; 2787 pmap_pti_pcid_invalidate(ucr3, kcr3); 2788 } else 2789 load_cr3(kcr3); 2790 } 2791 critical_exit(); 2792 } else { 2793 invltlb(); 2794 } 2795 } else if (pmap_pcid_enabled) { 2796 pmap->pm_pcids[0].pm_gen = 0; 2797 } 2798 } 2799 2800 PMAP_INLINE void 2801 pmap_invalidate_cache(void) 2802 { 2803 2804 wbinvd(); 2805 } 2806 2807 static void 2808 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 2809 { 2810 2811 pmap_update_pde_store(pmap, pde, newpde); 2812 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 2813 pmap_update_pde_invalidate(pmap, va, newpde); 2814 else 2815 pmap->pm_pcids[0].pm_gen = 0; 2816 } 2817 #endif /* !SMP */ 2818 2819 static void 2820 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 2821 { 2822 2823 /* 2824 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 2825 * by a promotion that did not invalidate the 512 4KB page mappings 2826 * that might exist in the TLB. Consequently, at this point, the TLB 2827 * may hold both 4KB and 2MB page mappings for the address range [va, 2828 * va + NBPDR). Therefore, the entire range must be invalidated here. 2829 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 2830 * 4KB page mappings for the address range [va, va + NBPDR), and so a 2831 * single INVLPG suffices to invalidate the 2MB page mapping from the 2832 * TLB. 
2833 */ 2834 if ((pde & PG_PROMOTED) != 0) 2835 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 2836 else 2837 pmap_invalidate_page(pmap, va); 2838 } 2839 2840 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 2841 (vm_offset_t sva, vm_offset_t eva)) 2842 { 2843 2844 if ((cpu_feature & CPUID_SS) != 0) 2845 return (pmap_invalidate_cache_range_selfsnoop); 2846 if ((cpu_feature & CPUID_CLFSH) != 0) 2847 return (pmap_force_invalidate_cache_range); 2848 return (pmap_invalidate_cache_range_all); 2849 } 2850 2851 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 2852 2853 static void 2854 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 2855 { 2856 2857 KASSERT((sva & PAGE_MASK) == 0, 2858 ("pmap_invalidate_cache_range: sva not page-aligned")); 2859 KASSERT((eva & PAGE_MASK) == 0, 2860 ("pmap_invalidate_cache_range: eva not page-aligned")); 2861 } 2862 2863 static void 2864 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 2865 { 2866 2867 pmap_invalidate_cache_range_check_align(sva, eva); 2868 } 2869 2870 void 2871 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 2872 { 2873 2874 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 2875 2876 /* 2877 * XXX: Some CPUs fault, hang, or trash the local APIC 2878 * registers if we use CLFLUSH on the local APIC range. The 2879 * local APIC is always uncached, so we don't need to flush 2880 * for that range anyway. 2881 */ 2882 if (pmap_kextract(sva) == lapic_paddr) 2883 return; 2884 2885 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 2886 /* 2887 * Do per-cache line flush. Use the sfence 2888 * instruction to insure that previous stores are 2889 * included in the write-back. The processor 2890 * propagates flush to other processors in the cache 2891 * coherence domain. 2892 */ 2893 sfence(); 2894 for (; sva < eva; sva += cpu_clflush_line_size) 2895 clflushopt(sva); 2896 sfence(); 2897 } else { 2898 /* 2899 * Writes are ordered by CLFLUSH on Intel CPUs. 2900 */ 2901 if (cpu_vendor_id != CPU_VENDOR_INTEL) 2902 mfence(); 2903 for (; sva < eva; sva += cpu_clflush_line_size) 2904 clflush(sva); 2905 if (cpu_vendor_id != CPU_VENDOR_INTEL) 2906 mfence(); 2907 } 2908 } 2909 2910 static void 2911 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 2912 { 2913 2914 pmap_invalidate_cache_range_check_align(sva, eva); 2915 pmap_invalidate_cache(); 2916 } 2917 2918 /* 2919 * Remove the specified set of pages from the data and instruction caches. 2920 * 2921 * In contrast to pmap_invalidate_cache_range(), this function does not 2922 * rely on the CPU's self-snoop feature, because it is intended for use 2923 * when moving pages into a different cache domain. 
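 *
 * If the number of pages is large, or neither CLFLUSH nor CLFLUSHOPT is
 * available, the entire cache is flushed with pmap_invalidate_cache()
 * rather than iterating over individual cache lines.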
2924 */ 2925 void 2926 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 2927 { 2928 vm_offset_t daddr, eva; 2929 int i; 2930 bool useclflushopt; 2931 2932 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 2933 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 2934 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 2935 pmap_invalidate_cache(); 2936 else { 2937 if (useclflushopt) 2938 sfence(); 2939 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 2940 mfence(); 2941 for (i = 0; i < count; i++) { 2942 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 2943 eva = daddr + PAGE_SIZE; 2944 for (; daddr < eva; daddr += cpu_clflush_line_size) { 2945 if (useclflushopt) 2946 clflushopt(daddr); 2947 else 2948 clflush(daddr); 2949 } 2950 } 2951 if (useclflushopt) 2952 sfence(); 2953 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 2954 mfence(); 2955 } 2956 } 2957 2958 void 2959 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 2960 { 2961 2962 pmap_invalidate_cache_range_check_align(sva, eva); 2963 2964 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 2965 pmap_force_invalidate_cache_range(sva, eva); 2966 return; 2967 } 2968 2969 /* See comment in pmap_force_invalidate_cache_range(). */ 2970 if (pmap_kextract(sva) == lapic_paddr) 2971 return; 2972 2973 sfence(); 2974 for (; sva < eva; sva += cpu_clflush_line_size) 2975 clwb(sva); 2976 sfence(); 2977 } 2978 2979 void 2980 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 2981 { 2982 pt_entry_t *pte; 2983 vm_offset_t vaddr; 2984 int error, pte_bits; 2985 2986 KASSERT((spa & PAGE_MASK) == 0, 2987 ("pmap_flush_cache_phys_range: spa not page-aligned")); 2988 KASSERT((epa & PAGE_MASK) == 0, 2989 ("pmap_flush_cache_phys_range: epa not page-aligned")); 2990 2991 if (spa < dmaplimit) { 2992 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 2993 dmaplimit, epa))); 2994 if (dmaplimit >= epa) 2995 return; 2996 spa = dmaplimit; 2997 } 2998 2999 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | 3000 X86_PG_V; 3001 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3002 &vaddr); 3003 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3004 pte = vtopte(vaddr); 3005 for (; spa < epa; spa += PAGE_SIZE) { 3006 sched_pin(); 3007 pte_store(pte, spa | pte_bits); 3008 invlpg(vaddr); 3009 /* XXXKIB sfences inside flush_cache_range are excessive */ 3010 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3011 sched_unpin(); 3012 } 3013 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3014 } 3015 3016 /* 3017 * Routine: pmap_extract 3018 * Function: 3019 * Extract the physical page address associated 3020 * with the given map/virtual_address pair. 
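 *
 *	The lookup walks the PDPE, PDE, and PTE levels in turn, so
 *	1GB, 2MB, and 4KB mappings are all handled; for the large page
 *	cases the offset within the page is taken from PDPMASK or
 *	PDRMASK respectively.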
3021 */ 3022 vm_paddr_t 3023 pmap_extract(pmap_t pmap, vm_offset_t va) 3024 { 3025 pdp_entry_t *pdpe; 3026 pd_entry_t *pde; 3027 pt_entry_t *pte, PG_V; 3028 vm_paddr_t pa; 3029 3030 pa = 0; 3031 PG_V = pmap_valid_bit(pmap); 3032 PMAP_LOCK(pmap); 3033 pdpe = pmap_pdpe(pmap, va); 3034 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3035 if ((*pdpe & PG_PS) != 0) 3036 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3037 else { 3038 pde = pmap_pdpe_to_pde(pdpe, va); 3039 if ((*pde & PG_V) != 0) { 3040 if ((*pde & PG_PS) != 0) { 3041 pa = (*pde & PG_PS_FRAME) | 3042 (va & PDRMASK); 3043 } else { 3044 pte = pmap_pde_to_pte(pde, va); 3045 pa = (*pte & PG_FRAME) | 3046 (va & PAGE_MASK); 3047 } 3048 } 3049 } 3050 } 3051 PMAP_UNLOCK(pmap); 3052 return (pa); 3053 } 3054 3055 /* 3056 * Routine: pmap_extract_and_hold 3057 * Function: 3058 * Atomically extract and hold the physical page 3059 * with the given pmap and virtual address pair 3060 * if that mapping permits the given protection. 3061 */ 3062 vm_page_t 3063 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3064 { 3065 pd_entry_t pde, *pdep; 3066 pt_entry_t pte, PG_RW, PG_V; 3067 vm_paddr_t pa; 3068 vm_page_t m; 3069 3070 pa = 0; 3071 m = NULL; 3072 PG_RW = pmap_rw_bit(pmap); 3073 PG_V = pmap_valid_bit(pmap); 3074 3075 PMAP_LOCK(pmap); 3076 pdep = pmap_pde(pmap, va); 3077 if (pdep != NULL && (pde = *pdep)) { 3078 if (pde & PG_PS) { 3079 if ((pde & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) 3080 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 3081 (va & PDRMASK)); 3082 } else { 3083 pte = *pmap_pde_to_pte(pdep, va); 3084 if ((pte & PG_V) != 0 && 3085 ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0)) 3086 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3087 } 3088 if (m != NULL && !vm_page_wire_mapped(m)) 3089 m = NULL; 3090 } 3091 PMAP_UNLOCK(pmap); 3092 return (m); 3093 } 3094 3095 vm_paddr_t 3096 pmap_kextract(vm_offset_t va) 3097 { 3098 pd_entry_t pde; 3099 vm_paddr_t pa; 3100 3101 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3102 pa = DMAP_TO_PHYS(va); 3103 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3104 pa = pmap_large_map_kextract(va); 3105 } else { 3106 pde = *vtopde(va); 3107 if (pde & PG_PS) { 3108 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3109 } else { 3110 /* 3111 * Beware of a concurrent promotion that changes the 3112 * PDE at this point! For example, vtopte() must not 3113 * be used to access the PTE because it would use the 3114 * new PDE. It is, however, safe to use the old PDE 3115 * because the page table page is preserved by the 3116 * promotion. 3117 */ 3118 pa = *pmap_pde_to_pte(&pde, va); 3119 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3120 } 3121 } 3122 return (pa); 3123 } 3124 3125 /*************************************************** 3126 * Low level mapping routines..... 3127 ***************************************************/ 3128 3129 /* 3130 * Add a wired page to the kva. 3131 * Note: not SMP coherent. 3132 */ 3133 PMAP_INLINE void 3134 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3135 { 3136 pt_entry_t *pte; 3137 3138 pte = vtopte(va); 3139 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); 3140 } 3141 3142 static __inline void 3143 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3144 { 3145 pt_entry_t *pte; 3146 int cache_bits; 3147 3148 pte = vtopte(va); 3149 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 3150 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); 3151 } 3152 3153 /* 3154 * Remove a page from the kernel pagetables. 3155 * Note: not SMP coherent. 
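 * Callers are responsible for any TLB invalidation; for example,
 * pmap_qremove() below batches removals and then issues a single
 * pmap_invalidate_range() call for the whole range.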
3156 */ 3157 PMAP_INLINE void 3158 pmap_kremove(vm_offset_t va) 3159 { 3160 pt_entry_t *pte; 3161 3162 pte = vtopte(va); 3163 pte_clear(pte); 3164 } 3165 3166 /* 3167 * Used to map a range of physical addresses into kernel 3168 * virtual address space. 3169 * 3170 * The value passed in '*virt' is a suggested virtual address for 3171 * the mapping. Architectures which can support a direct-mapped 3172 * physical to virtual region can return the appropriate address 3173 * within that region, leaving '*virt' unchanged. Other 3174 * architectures should map the pages starting at '*virt' and 3175 * update '*virt' with the first usable address after the mapped 3176 * region. 3177 */ 3178 vm_offset_t 3179 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 3180 { 3181 return PHYS_TO_DMAP(start); 3182 } 3183 3184 3185 /* 3186 * Add a list of wired pages to the kva 3187 * this routine is only used for temporary 3188 * kernel mappings that do not need to have 3189 * page modification or references recorded. 3190 * Note that old mappings are simply written 3191 * over. The page *must* be wired. 3192 * Note: SMP coherent. Uses a ranged shootdown IPI. 3193 */ 3194 void 3195 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 3196 { 3197 pt_entry_t *endpte, oldpte, pa, *pte; 3198 vm_page_t m; 3199 int cache_bits; 3200 3201 oldpte = 0; 3202 pte = vtopte(sva); 3203 endpte = pte + count; 3204 while (pte < endpte) { 3205 m = *ma++; 3206 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 3207 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 3208 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 3209 oldpte |= *pte; 3210 pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); 3211 } 3212 pte++; 3213 } 3214 if (__predict_false((oldpte & X86_PG_V) != 0)) 3215 pmap_invalidate_range(kernel_pmap, sva, sva + count * 3216 PAGE_SIZE); 3217 } 3218 3219 /* 3220 * This routine tears out page mappings from the 3221 * kernel -- it is meant only for temporary mappings. 3222 * Note: SMP coherent. Uses a ranged shootdown IPI. 3223 */ 3224 void 3225 pmap_qremove(vm_offset_t sva, int count) 3226 { 3227 vm_offset_t va; 3228 3229 va = sva; 3230 while (count-- > 0) { 3231 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 3232 pmap_kremove(va); 3233 va += PAGE_SIZE; 3234 } 3235 pmap_invalidate_range(kernel_pmap, sva, va); 3236 } 3237 3238 /*************************************************** 3239 * Page table page management routines..... 3240 ***************************************************/ 3241 /* 3242 * Schedule the specified unused page table page to be freed. Specifically, 3243 * add the page to the specified list of pages that will be released to the 3244 * physical memory manager after the TLB has been updated. 3245 */ 3246 static __inline void 3247 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 3248 boolean_t set_PG_ZERO) 3249 { 3250 3251 if (set_PG_ZERO) 3252 m->flags |= PG_ZERO; 3253 else 3254 m->flags &= ~PG_ZERO; 3255 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 3256 } 3257 3258 /* 3259 * Inserts the specified page table page into the specified pmap's collection 3260 * of idle page table pages. Each of a pmap's page table pages is responsible 3261 * for mapping a distinct range of virtual addresses. The pmap's collection is 3262 * ordered by this virtual address range. 3263 * 3264 * If "promoted" is false, then the page table page "mpte" must be zero filled. 
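 * If "promoted" is true, the page's valid bits are set to
 * VM_PAGE_BITS_ALL, recording that the page still holds the 4KB page
 * table entries that were superseded by the promotion.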
3265 */ 3266 static __inline int 3267 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) 3268 { 3269 3270 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3271 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; 3272 return (vm_radix_insert(&pmap->pm_root, mpte)); 3273 } 3274 3275 /* 3276 * Removes the page table page mapping the specified virtual address from the 3277 * specified pmap's collection of idle page table pages, and returns it. 3278 * Otherwise, returns NULL if there is no page table page corresponding to the 3279 * specified virtual address. 3280 */ 3281 static __inline vm_page_t 3282 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 3283 { 3284 3285 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3286 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 3287 } 3288 3289 /* 3290 * Decrements a page table page's wire count, which is used to record the 3291 * number of valid page table entries within the page. If the wire count 3292 * drops to zero, then the page table page is unmapped. Returns TRUE if the 3293 * page table page was unmapped and FALSE otherwise. 3294 */ 3295 static inline boolean_t 3296 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 3297 { 3298 3299 --m->wire_count; 3300 if (m->wire_count == 0) { 3301 _pmap_unwire_ptp(pmap, va, m, free); 3302 return (TRUE); 3303 } else 3304 return (FALSE); 3305 } 3306 3307 static void 3308 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 3309 { 3310 3311 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3312 /* 3313 * unmap the page table page 3314 */ 3315 if (m->pindex >= (NUPDE + NUPDPE)) { 3316 /* PDP page */ 3317 pml4_entry_t *pml4; 3318 pml4 = pmap_pml4e(pmap, va); 3319 *pml4 = 0; 3320 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { 3321 pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; 3322 *pml4 = 0; 3323 } 3324 } else if (m->pindex >= NUPDE) { 3325 /* PD page */ 3326 pdp_entry_t *pdp; 3327 pdp = pmap_pdpe(pmap, va); 3328 *pdp = 0; 3329 } else { 3330 /* PTE page */ 3331 pd_entry_t *pd; 3332 pd = pmap_pde(pmap, va); 3333 *pd = 0; 3334 } 3335 pmap_resident_count_dec(pmap, 1); 3336 if (m->pindex < NUPDE) { 3337 /* We just released a PT, unhold the matching PD */ 3338 vm_page_t pdpg; 3339 3340 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 3341 pmap_unwire_ptp(pmap, va, pdpg, free); 3342 } 3343 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 3344 /* We just released a PD, unhold the matching PDP */ 3345 vm_page_t pdppg; 3346 3347 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 3348 pmap_unwire_ptp(pmap, va, pdppg, free); 3349 } 3350 3351 /* 3352 * Put page on a list so that it is released after 3353 * *ALL* TLB shootdown is done 3354 */ 3355 pmap_add_delayed_free_list(m, free, TRUE); 3356 } 3357 3358 /* 3359 * After removing a page table entry, this routine is used to 3360 * conditionally free the page, and manage the hold/wire counts. 
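 * Kernel addresses (va >= VM_MAXUSER_ADDRESS) are ignored here, since
 * kernel page table pages are never freed.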
3361 */ 3362 static int 3363 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 3364 struct spglist *free) 3365 { 3366 vm_page_t mpte; 3367 3368 if (va >= VM_MAXUSER_ADDRESS) 3369 return (0); 3370 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 3371 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 3372 return (pmap_unwire_ptp(pmap, va, mpte, free)); 3373 } 3374 3375 void 3376 pmap_pinit0(pmap_t pmap) 3377 { 3378 struct proc *p; 3379 struct thread *td; 3380 int i; 3381 3382 PMAP_LOCK_INIT(pmap); 3383 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 3384 pmap->pm_pml4u = NULL; 3385 pmap->pm_cr3 = KPML4phys; 3386 /* hack to keep pmap_pti_pcid_invalidate() alive */ 3387 pmap->pm_ucr3 = PMAP_NO_CR3; 3388 pmap->pm_root.rt_root = 0; 3389 CPU_ZERO(&pmap->pm_active); 3390 TAILQ_INIT(&pmap->pm_pvchunk); 3391 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 3392 pmap->pm_flags = pmap_flags; 3393 CPU_FOREACH(i) { 3394 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 3395 pmap->pm_pcids[i].pm_gen = 1; 3396 } 3397 pmap_activate_boot(pmap); 3398 td = curthread; 3399 if (pti) { 3400 p = td->td_proc; 3401 PROC_LOCK(p); 3402 p->p_md.md_flags |= P_MD_KPTI; 3403 PROC_UNLOCK(p); 3404 } 3405 pmap_thread_init_invl_gen(td); 3406 3407 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 3408 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 3409 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 3410 UMA_ALIGN_PTR, 0); 3411 } 3412 } 3413 3414 void 3415 pmap_pinit_pml4(vm_page_t pml4pg) 3416 { 3417 pml4_entry_t *pm_pml4; 3418 int i; 3419 3420 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 3421 3422 /* Wire in kernel global address entries. */ 3423 for (i = 0; i < NKPML4E; i++) { 3424 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 3425 X86_PG_V; 3426 } 3427 for (i = 0; i < ndmpdpphys; i++) { 3428 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 3429 X86_PG_V; 3430 } 3431 3432 /* install self-referential address mapping entry(s) */ 3433 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 3434 X86_PG_A | X86_PG_M; 3435 3436 /* install large map entries if configured */ 3437 for (i = 0; i < lm_ents; i++) 3438 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; 3439 } 3440 3441 static void 3442 pmap_pinit_pml4_pti(vm_page_t pml4pg) 3443 { 3444 pml4_entry_t *pm_pml4; 3445 int i; 3446 3447 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 3448 for (i = 0; i < NPML4EPG; i++) 3449 pm_pml4[i] = pti_pml4[i]; 3450 } 3451 3452 /* 3453 * Initialize a preallocated and zeroed pmap structure, 3454 * such as one in a vmspace structure. 3455 */ 3456 int 3457 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 3458 { 3459 vm_page_t pml4pg, pml4pgu; 3460 vm_paddr_t pml4phys; 3461 int i; 3462 3463 /* 3464 * allocate the page directory page 3465 */ 3466 pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3467 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); 3468 3469 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 3470 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 3471 CPU_FOREACH(i) { 3472 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 3473 pmap->pm_pcids[i].pm_gen = 0; 3474 } 3475 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 3476 pmap->pm_ucr3 = PMAP_NO_CR3; 3477 pmap->pm_pml4u = NULL; 3478 3479 pmap->pm_type = pm_type; 3480 if ((pml4pg->flags & PG_ZERO) == 0) 3481 pagezero(pmap->pm_pml4); 3482 3483 /* 3484 * Do not install the host kernel mappings in the nested page 3485 * tables. 
These mappings are meaningless in the guest physical 3486 * address space. 3487 * Install minimal kernel mappings in PTI case. 3488 */ 3489 if (pm_type == PT_X86) { 3490 pmap->pm_cr3 = pml4phys; 3491 pmap_pinit_pml4(pml4pg); 3492 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 3493 pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 3494 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 3495 pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( 3496 VM_PAGE_TO_PHYS(pml4pgu)); 3497 pmap_pinit_pml4_pti(pml4pgu); 3498 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); 3499 } 3500 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 3501 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 3502 pkru_free_range, pmap, M_NOWAIT); 3503 } 3504 } 3505 3506 pmap->pm_root.rt_root = 0; 3507 CPU_ZERO(&pmap->pm_active); 3508 TAILQ_INIT(&pmap->pm_pvchunk); 3509 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 3510 pmap->pm_flags = flags; 3511 pmap->pm_eptgen = 0; 3512 3513 return (1); 3514 } 3515 3516 int 3517 pmap_pinit(pmap_t pmap) 3518 { 3519 3520 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 3521 } 3522 3523 /* 3524 * This routine is called if the desired page table page does not exist. 3525 * 3526 * If page table page allocation fails, this routine may sleep before 3527 * returning NULL. It sleeps only if a lock pointer was given. 3528 * 3529 * Note: If a page allocation fails at page table level two or three, 3530 * one or two pages may be held during the wait, only to be released 3531 * afterwards. This conservative approach is easily argued to avoid 3532 * race conditions. 3533 */ 3534 static vm_page_t 3535 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 3536 { 3537 vm_page_t m, pdppg, pdpg; 3538 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 3539 3540 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3541 3542 PG_A = pmap_accessed_bit(pmap); 3543 PG_M = pmap_modified_bit(pmap); 3544 PG_V = pmap_valid_bit(pmap); 3545 PG_RW = pmap_rw_bit(pmap); 3546 3547 /* 3548 * Allocate a page table page. 3549 */ 3550 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 3551 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 3552 if (lockp != NULL) { 3553 RELEASE_PV_LIST_LOCK(lockp); 3554 PMAP_UNLOCK(pmap); 3555 PMAP_ASSERT_NOT_IN_DI(); 3556 vm_wait(NULL); 3557 PMAP_LOCK(pmap); 3558 } 3559 3560 /* 3561 * Indicate the need to retry. While waiting, the page table 3562 * page may have been allocated. 3563 */ 3564 return (NULL); 3565 } 3566 if ((m->flags & PG_ZERO) == 0) 3567 pmap_zero_page(m); 3568 3569 /* 3570 * Map the pagetable page into the process address space, if 3571 * it isn't already there. 3572 */ 3573 3574 if (ptepindex >= (NUPDE + NUPDPE)) { 3575 pml4_entry_t *pml4, *pml4u; 3576 vm_pindex_t pml4index; 3577 3578 /* Wire up a new PDPE page */ 3579 pml4index = ptepindex - (NUPDE + NUPDPE); 3580 pml4 = &pmap->pm_pml4[pml4index]; 3581 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 3582 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { 3583 /* 3584 * PTI: Make all user-space mappings in the 3585 * kernel-mode page table no-execute so that 3586 * we detect any programming errors that leave 3587 * the kernel-mode page table active on return 3588 * to user space. 
3589 */ 3590 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3591 *pml4 |= pg_nx; 3592 3593 pml4u = &pmap->pm_pml4u[pml4index]; 3594 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 3595 PG_A | PG_M; 3596 } 3597 3598 } else if (ptepindex >= NUPDE) { 3599 vm_pindex_t pml4index; 3600 vm_pindex_t pdpindex; 3601 pml4_entry_t *pml4; 3602 pdp_entry_t *pdp; 3603 3604 /* Wire up a new PDE page */ 3605 pdpindex = ptepindex - NUPDE; 3606 pml4index = pdpindex >> NPML4EPGSHIFT; 3607 3608 pml4 = &pmap->pm_pml4[pml4index]; 3609 if ((*pml4 & PG_V) == 0) { 3610 /* Have to allocate a new pdp, recurse */ 3611 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 3612 lockp) == NULL) { 3613 vm_page_unwire_noq(m); 3614 vm_page_free_zero(m); 3615 return (NULL); 3616 } 3617 } else { 3618 /* Add reference to pdp page */ 3619 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 3620 pdppg->wire_count++; 3621 } 3622 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 3623 3624 /* Now find the pdp page */ 3625 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 3626 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 3627 3628 } else { 3629 vm_pindex_t pml4index; 3630 vm_pindex_t pdpindex; 3631 pml4_entry_t *pml4; 3632 pdp_entry_t *pdp; 3633 pd_entry_t *pd; 3634 3635 /* Wire up a new PTE page */ 3636 pdpindex = ptepindex >> NPDPEPGSHIFT; 3637 pml4index = pdpindex >> NPML4EPGSHIFT; 3638 3639 /* First, find the pdp and check that its valid. */ 3640 pml4 = &pmap->pm_pml4[pml4index]; 3641 if ((*pml4 & PG_V) == 0) { 3642 /* Have to allocate a new pd, recurse */ 3643 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 3644 lockp) == NULL) { 3645 vm_page_unwire_noq(m); 3646 vm_page_free_zero(m); 3647 return (NULL); 3648 } 3649 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 3650 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 3651 } else { 3652 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 3653 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 3654 if ((*pdp & PG_V) == 0) { 3655 /* Have to allocate a new pd, recurse */ 3656 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 3657 lockp) == NULL) { 3658 vm_page_unwire_noq(m); 3659 vm_page_free_zero(m); 3660 return (NULL); 3661 } 3662 } else { 3663 /* Add reference to the pd page */ 3664 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 3665 pdpg->wire_count++; 3666 } 3667 } 3668 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 3669 3670 /* Now we know where the page directory page is */ 3671 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 3672 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 3673 } 3674 3675 pmap_resident_count_inc(pmap, 1); 3676 3677 return (m); 3678 } 3679 3680 static vm_page_t 3681 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 3682 { 3683 vm_pindex_t pdpindex, ptepindex; 3684 pdp_entry_t *pdpe, PG_V; 3685 vm_page_t pdpg; 3686 3687 PG_V = pmap_valid_bit(pmap); 3688 3689 retry: 3690 pdpe = pmap_pdpe(pmap, va); 3691 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3692 /* Add a reference to the pd page. */ 3693 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 3694 pdpg->wire_count++; 3695 } else { 3696 /* Allocate a pd page. 
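 * _pmap_allocpte() may sleep and drop the pmap lock when a lock
 * pointer is supplied; it then returns NULL and the lookup is
 * retried from the top.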
*/ 3697 ptepindex = pmap_pde_pindex(va); 3698 pdpindex = ptepindex >> NPDPEPGSHIFT; 3699 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 3700 if (pdpg == NULL && lockp != NULL) 3701 goto retry; 3702 } 3703 return (pdpg); 3704 } 3705 3706 static vm_page_t 3707 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 3708 { 3709 vm_pindex_t ptepindex; 3710 pd_entry_t *pd, PG_V; 3711 vm_page_t m; 3712 3713 PG_V = pmap_valid_bit(pmap); 3714 3715 /* 3716 * Calculate pagetable page index 3717 */ 3718 ptepindex = pmap_pde_pindex(va); 3719 retry: 3720 /* 3721 * Get the page directory entry 3722 */ 3723 pd = pmap_pde(pmap, va); 3724 3725 /* 3726 * This supports switching from a 2MB page to a 3727 * normal 4K page. 3728 */ 3729 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 3730 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 3731 /* 3732 * Invalidation of the 2MB page mapping may have caused 3733 * the deallocation of the underlying PD page. 3734 */ 3735 pd = NULL; 3736 } 3737 } 3738 3739 /* 3740 * If the page table page is mapped, we just increment the 3741 * hold count, and activate it. 3742 */ 3743 if (pd != NULL && (*pd & PG_V) != 0) { 3744 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 3745 m->wire_count++; 3746 } else { 3747 /* 3748 * Here if the pte page isn't mapped, or if it has been 3749 * deallocated. 3750 */ 3751 m = _pmap_allocpte(pmap, ptepindex, lockp); 3752 if (m == NULL && lockp != NULL) 3753 goto retry; 3754 } 3755 return (m); 3756 } 3757 3758 3759 /*************************************************** 3760 * Pmap allocation/deallocation routines. 3761 ***************************************************/ 3762 3763 /* 3764 * Release any resources held by the given physical map. 3765 * Called when a pmap initialized by pmap_pinit is being released. 3766 * Should only be called if the map contains no valid mappings. 
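 * At this point only the kernel-shared PML4 slots (KVA, direct map,
 * recursive mapping, and large map entries) remain; they are cleared
 * before the PML4 page itself is freed.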
3767 */ 3768 void 3769 pmap_release(pmap_t pmap) 3770 { 3771 vm_page_t m; 3772 int i; 3773 3774 KASSERT(pmap->pm_stats.resident_count == 0, 3775 ("pmap_release: pmap resident count %ld != 0", 3776 pmap->pm_stats.resident_count)); 3777 KASSERT(vm_radix_is_empty(&pmap->pm_root), 3778 ("pmap_release: pmap has reserved page table page(s)")); 3779 KASSERT(CPU_EMPTY(&pmap->pm_active), 3780 ("releasing active pmap %p", pmap)); 3781 3782 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 3783 3784 for (i = 0; i < NKPML4E; i++) /* KVA */ 3785 pmap->pm_pml4[KPML4BASE + i] = 0; 3786 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 3787 pmap->pm_pml4[DMPML4I + i] = 0; 3788 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 3789 for (i = 0; i < lm_ents; i++) /* Large Map */ 3790 pmap->pm_pml4[LMSPML4I + i] = 0; 3791 3792 vm_page_unwire_noq(m); 3793 vm_page_free_zero(m); 3794 3795 if (pmap->pm_pml4u != NULL) { 3796 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); 3797 vm_page_unwire_noq(m); 3798 vm_page_free(m); 3799 } 3800 if (pmap->pm_type == PT_X86 && 3801 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 3802 rangeset_fini(&pmap->pm_pkru); 3803 } 3804 3805 static int 3806 kvm_size(SYSCTL_HANDLER_ARGS) 3807 { 3808 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 3809 3810 return sysctl_handle_long(oidp, &ksize, 0, req); 3811 } 3812 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 3813 0, 0, kvm_size, "LU", "Size of KVM"); 3814 3815 static int 3816 kvm_free(SYSCTL_HANDLER_ARGS) 3817 { 3818 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 3819 3820 return sysctl_handle_long(oidp, &kfree, 0, req); 3821 } 3822 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 3823 0, 0, kvm_free, "LU", "Amount of KVM free"); 3824 3825 /* 3826 * Allocate physical memory for the vm_page array and map it into KVA, 3827 * attempting to back the vm_pages with domain-local memory. 
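 *
 * Sketch of the loop below: the array is mapped one 2MB page directory
 * entry (NBPDR bytes) at a time, and each chunk's backing memory is
 * taken from the NUMA domain owning the physical pages that this part
 * of the vm_page array will describe, as derived from "pfn".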
3828 */ 3829 void 3830 pmap_page_array_startup(long pages) 3831 { 3832 pdp_entry_t *pdpe; 3833 pd_entry_t *pde, newpdir; 3834 vm_offset_t va, start, end; 3835 vm_paddr_t pa; 3836 long pfn; 3837 int domain, i; 3838 3839 vm_page_array_size = pages; 3840 3841 start = va = VM_MIN_KERNEL_ADDRESS; 3842 end = va + pages * sizeof(struct vm_page); 3843 while (va < end) { 3844 pfn = first_page + (va - start) / sizeof(struct vm_page); 3845 domain = _vm_phys_domain(ptoa(pfn)); 3846 pdpe = pmap_pdpe(kernel_pmap, va); 3847 if ((*pdpe & X86_PG_V) == 0) { 3848 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 3849 dump_add_page(pa); 3850 pagezero((void *)PHYS_TO_DMAP(pa)); 3851 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 3852 X86_PG_A | X86_PG_M); 3853 } 3854 pde = pmap_pdpe_to_pde(pdpe, va); 3855 if ((*pde & X86_PG_V) != 0) 3856 panic("Unexpected pde"); 3857 pa = vm_phys_early_alloc(domain, NBPDR); 3858 for (i = 0; i < NPDEPG; i++) 3859 dump_add_page(pa + i * PAGE_SIZE); 3860 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 3861 X86_PG_M | PG_PS | pg_g | pg_nx); 3862 pde_store(pde, newpdir); 3863 va += NBPDR; 3864 } 3865 vm_page_array = (vm_page_t)start; 3866 } 3867 3868 /* 3869 * grow the number of kernel page table entries, if needed 3870 */ 3871 void 3872 pmap_growkernel(vm_offset_t addr) 3873 { 3874 vm_paddr_t paddr; 3875 vm_page_t nkpg; 3876 pd_entry_t *pde, newpdir; 3877 pdp_entry_t *pdpe; 3878 3879 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 3880 3881 /* 3882 * Return if "addr" is within the range of kernel page table pages 3883 * that were preallocated during pmap bootstrap. Moreover, leave 3884 * "kernel_vm_end" and the kernel page table as they were. 3885 * 3886 * The correctness of this action is based on the following 3887 * argument: vm_map_insert() allocates contiguous ranges of the 3888 * kernel virtual address space. It calls this function if a range 3889 * ends after "kernel_vm_end". If the kernel is mapped between 3890 * "kernel_vm_end" and "addr", then the range cannot begin at 3891 * "kernel_vm_end". In fact, its beginning address cannot be less 3892 * than the kernel. Thus, there is no immediate need to allocate 3893 * any new kernel page table pages between "kernel_vm_end" and 3894 * "KERNBASE". 
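 *
 * Note: "nkpt" counts the bootstrap kernel page table pages, so the
 * range checked below extends nkpt * NBPDR bytes beyond KERNBASE.
 * Each pass of the loop that follows extends "kernel_vm_end" by one
 * 2MB page directory entry, allocating a new PDP page first when the
 * covering entry is not yet valid.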
3895 */ 3896 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 3897 return; 3898 3899 addr = roundup2(addr, NBPDR); 3900 if (addr - 1 >= vm_map_max(kernel_map)) 3901 addr = vm_map_max(kernel_map); 3902 while (kernel_vm_end < addr) { 3903 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 3904 if ((*pdpe & X86_PG_V) == 0) { 3905 /* We need a new PDP entry */ 3906 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 3907 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3908 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3909 if (nkpg == NULL) 3910 panic("pmap_growkernel: no memory to grow kernel"); 3911 if ((nkpg->flags & PG_ZERO) == 0) 3912 pmap_zero_page(nkpg); 3913 paddr = VM_PAGE_TO_PHYS(nkpg); 3914 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 3915 X86_PG_A | X86_PG_M); 3916 continue; /* try again */ 3917 } 3918 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 3919 if ((*pde & X86_PG_V) != 0) { 3920 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 3921 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3922 kernel_vm_end = vm_map_max(kernel_map); 3923 break; 3924 } 3925 continue; 3926 } 3927 3928 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 3929 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3930 VM_ALLOC_ZERO); 3931 if (nkpg == NULL) 3932 panic("pmap_growkernel: no memory to grow kernel"); 3933 if ((nkpg->flags & PG_ZERO) == 0) 3934 pmap_zero_page(nkpg); 3935 paddr = VM_PAGE_TO_PHYS(nkpg); 3936 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 3937 pde_store(pde, newpdir); 3938 3939 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 3940 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3941 kernel_vm_end = vm_map_max(kernel_map); 3942 break; 3943 } 3944 } 3945 } 3946 3947 3948 /*************************************************** 3949 * page management routines. 
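 *
 * PV entries are carved out of page-sized chunks.  Rough arithmetic
 * behind the CTASSERTs below, assuming a 24-byte pv_entry and a
 * 64-byte chunk header on amd64: (4096 - 64) / 24 = 168 entries per
 * chunk (_NPCPV), tracked by a three-word free bitmap (_NPCM); the
 * last word needs only 168 - 128 = 40 bits, hence PC_FREE2.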
3950 ***************************************************/ 3951 3952 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 3953 CTASSERT(_NPCM == 3); 3954 CTASSERT(_NPCPV == 168); 3955 3956 static __inline struct pv_chunk * 3957 pv_to_chunk(pv_entry_t pv) 3958 { 3959 3960 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 3961 } 3962 3963 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 3964 3965 #define PC_FREE0 0xfffffffffffffffful 3966 #define PC_FREE1 0xfffffffffffffffful 3967 #define PC_FREE2 0x000000fffffffffful 3968 3969 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 3970 3971 #ifdef PV_STATS 3972 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 3973 3974 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 3975 "Current number of pv entry chunks"); 3976 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 3977 "Current number of pv entry chunks allocated"); 3978 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 3979 "Current number of pv entry chunks frees"); 3980 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 3981 "Number of times tried to get a chunk page but failed."); 3982 3983 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 3984 static int pv_entry_spare; 3985 3986 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 3987 "Current number of pv entry frees"); 3988 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 3989 "Current number of pv entry allocs"); 3990 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 3991 "Current number of pv entries"); 3992 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3993 "Current number of spare pv entries"); 3994 #endif 3995 3996 static void 3997 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 3998 { 3999 4000 if (pmap == NULL) 4001 return; 4002 pmap_invalidate_all(pmap); 4003 if (pmap != locked_pmap) 4004 PMAP_UNLOCK(pmap); 4005 if (start_di) 4006 pmap_delayed_invl_finish(); 4007 } 4008 4009 /* 4010 * We are in a serious low memory condition. Resort to 4011 * drastic measures to free some pages so we can allocate 4012 * another pv entry chunk. 4013 * 4014 * Returns NULL if PV entries were reclaimed from the specified pmap. 4015 * 4016 * We do not, however, unmap 2mpages because subsequent accesses will 4017 * allocate per-page pv entries until repromotion occurs, thereby 4018 * exacerbating the shortage of free pv entries. 
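 *
 * Implementation sketch: the traversal of the global pv_chunks LRU
 * list is bracketed by two marker entries so that pv_chunks_mutex can
 * be dropped while individual pmaps are locked.  To avoid deadlock, a
 * pmap that sorts before the locked pmap is only trylocked, and its
 * chunk is skipped if the trylock fails.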
4019 */ 4020 static vm_page_t 4021 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 4022 { 4023 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 4024 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 4025 struct md_page *pvh; 4026 pd_entry_t *pde; 4027 pmap_t next_pmap, pmap; 4028 pt_entry_t *pte, tpte; 4029 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 4030 pv_entry_t pv; 4031 vm_offset_t va; 4032 vm_page_t m, m_pc; 4033 struct spglist free; 4034 uint64_t inuse; 4035 int bit, field, freed; 4036 bool start_di; 4037 static int active_reclaims = 0; 4038 4039 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 4040 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 4041 pmap = NULL; 4042 m_pc = NULL; 4043 PG_G = PG_A = PG_M = PG_RW = 0; 4044 SLIST_INIT(&free); 4045 bzero(&pc_marker_b, sizeof(pc_marker_b)); 4046 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 4047 pc_marker = (struct pv_chunk *)&pc_marker_b; 4048 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 4049 4050 /* 4051 * A delayed invalidation block should already be active if 4052 * pmap_advise() or pmap_remove() called this function by way 4053 * of pmap_demote_pde_locked(). 4054 */ 4055 start_di = pmap_not_in_di(); 4056 4057 mtx_lock(&pv_chunks_mutex); 4058 active_reclaims++; 4059 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 4060 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 4061 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 4062 SLIST_EMPTY(&free)) { 4063 next_pmap = pc->pc_pmap; 4064 if (next_pmap == NULL) { 4065 /* 4066 * The next chunk is a marker. However, it is 4067 * not our marker, so active_reclaims must be 4068 * > 1. Consequently, the next_chunk code 4069 * will not rotate the pv_chunks list. 4070 */ 4071 goto next_chunk; 4072 } 4073 mtx_unlock(&pv_chunks_mutex); 4074 4075 /* 4076 * A pv_chunk can only be removed from the pc_lru list 4077 * when both pc_chunks_mutex is owned and the 4078 * corresponding pmap is locked. 4079 */ 4080 if (pmap != next_pmap) { 4081 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 4082 start_di); 4083 pmap = next_pmap; 4084 /* Avoid deadlock and lock recursion. */ 4085 if (pmap > locked_pmap) { 4086 RELEASE_PV_LIST_LOCK(lockp); 4087 PMAP_LOCK(pmap); 4088 if (start_di) 4089 pmap_delayed_invl_start(); 4090 mtx_lock(&pv_chunks_mutex); 4091 continue; 4092 } else if (pmap != locked_pmap) { 4093 if (PMAP_TRYLOCK(pmap)) { 4094 if (start_di) 4095 pmap_delayed_invl_start(); 4096 mtx_lock(&pv_chunks_mutex); 4097 continue; 4098 } else { 4099 pmap = NULL; /* pmap is not locked */ 4100 mtx_lock(&pv_chunks_mutex); 4101 pc = TAILQ_NEXT(pc_marker, pc_lru); 4102 if (pc == NULL || 4103 pc->pc_pmap != next_pmap) 4104 continue; 4105 goto next_chunk; 4106 } 4107 } else if (start_di) 4108 pmap_delayed_invl_start(); 4109 PG_G = pmap_global_bit(pmap); 4110 PG_A = pmap_accessed_bit(pmap); 4111 PG_M = pmap_modified_bit(pmap); 4112 PG_RW = pmap_rw_bit(pmap); 4113 } 4114 4115 /* 4116 * Destroy every non-wired, 4 KB page mapping in the chunk. 
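 * Allocated entries are found by inverting the free bitmap:
 * "inuse" = ~pc_map[field] & pc_freemask[field], bsfq() yields the
 * lowest set bit, and field * 64 + bit indexes pc_pventry[].  Wired
 * and 2MB mappings are skipped rather than destroyed.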
4117 */ 4118 freed = 0; 4119 for (field = 0; field < _NPCM; field++) { 4120 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 4121 inuse != 0; inuse &= ~(1UL << bit)) { 4122 bit = bsfq(inuse); 4123 pv = &pc->pc_pventry[field * 64 + bit]; 4124 va = pv->pv_va; 4125 pde = pmap_pde(pmap, va); 4126 if ((*pde & PG_PS) != 0) 4127 continue; 4128 pte = pmap_pde_to_pte(pde, va); 4129 if ((*pte & PG_W) != 0) 4130 continue; 4131 tpte = pte_load_clear(pte); 4132 if ((tpte & PG_G) != 0) 4133 pmap_invalidate_page(pmap, va); 4134 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4135 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4136 vm_page_dirty(m); 4137 if ((tpte & PG_A) != 0) 4138 vm_page_aflag_set(m, PGA_REFERENCED); 4139 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4140 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4141 m->md.pv_gen++; 4142 if (TAILQ_EMPTY(&m->md.pv_list) && 4143 (m->flags & PG_FICTITIOUS) == 0) { 4144 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4145 if (TAILQ_EMPTY(&pvh->pv_list)) { 4146 vm_page_aflag_clear(m, 4147 PGA_WRITEABLE); 4148 } 4149 } 4150 pmap_delayed_invl_page(m); 4151 pc->pc_map[field] |= 1UL << bit; 4152 pmap_unuse_pt(pmap, va, *pde, &free); 4153 freed++; 4154 } 4155 } 4156 if (freed == 0) { 4157 mtx_lock(&pv_chunks_mutex); 4158 goto next_chunk; 4159 } 4160 /* Every freed mapping is for a 4 KB page. */ 4161 pmap_resident_count_dec(pmap, freed); 4162 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4163 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4164 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4165 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4166 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 4167 pc->pc_map[2] == PC_FREE2) { 4168 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 4169 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 4170 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 4171 /* Entire chunk is free; return it. */ 4172 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 4173 dump_drop_page(m_pc->phys_addr); 4174 mtx_lock(&pv_chunks_mutex); 4175 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 4176 break; 4177 } 4178 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 4179 mtx_lock(&pv_chunks_mutex); 4180 /* One freed pv entry in locked_pmap is sufficient. */ 4181 if (pmap == locked_pmap) 4182 break; 4183 next_chunk: 4184 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 4185 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 4186 if (active_reclaims == 1 && pmap != NULL) { 4187 /* 4188 * Rotate the pv chunks list so that we do not 4189 * scan the same pv chunks that could not be 4190 * freed (because they contained a wired 4191 * and/or superpage mapping) on every 4192 * invocation of reclaim_pv_chunk(). 4193 */ 4194 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 4195 MPASS(pc->pc_pmap != NULL); 4196 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 4197 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 4198 } 4199 } 4200 } 4201 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 4202 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 4203 active_reclaims--; 4204 mtx_unlock(&pv_chunks_mutex); 4205 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 4206 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 4207 m_pc = SLIST_FIRST(&free); 4208 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 4209 /* Recycle a freed page table page. 
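 * Instead of releasing it with the rest of the free list, give it a
 * wire count of one and return it to the caller as the page backing a
 * new pv chunk.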
*/ 4210 m_pc->wire_count = 1; 4211 } 4212 vm_page_free_pages_toq(&free, true); 4213 return (m_pc); 4214 } 4215 4216 /* 4217 * free the pv_entry back to the free list 4218 */ 4219 static void 4220 free_pv_entry(pmap_t pmap, pv_entry_t pv) 4221 { 4222 struct pv_chunk *pc; 4223 int idx, field, bit; 4224 4225 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4226 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 4227 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 4228 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 4229 pc = pv_to_chunk(pv); 4230 idx = pv - &pc->pc_pventry[0]; 4231 field = idx / 64; 4232 bit = idx % 64; 4233 pc->pc_map[field] |= 1ul << bit; 4234 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 4235 pc->pc_map[2] != PC_FREE2) { 4236 /* 98% of the time, pc is already at the head of the list. */ 4237 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 4238 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4239 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 4240 } 4241 return; 4242 } 4243 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4244 free_pv_chunk(pc); 4245 } 4246 4247 static void 4248 free_pv_chunk(struct pv_chunk *pc) 4249 { 4250 vm_page_t m; 4251 4252 mtx_lock(&pv_chunks_mutex); 4253 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 4254 mtx_unlock(&pv_chunks_mutex); 4255 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 4256 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 4257 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 4258 /* entire chunk is free, return it */ 4259 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 4260 dump_drop_page(m->phys_addr); 4261 vm_page_unwire_noq(m); 4262 vm_page_free(m); 4263 } 4264 4265 /* 4266 * Returns a new PV entry, allocating a new PV chunk from the system when 4267 * needed. If this PV chunk allocation fails and a PV list lock pointer was 4268 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 4269 * returned. 4270 * 4271 * The given PV list lock may be released. 
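 *
 * Slow-path sketch: when no partially full chunk is found on
 * pm_pvchunk and vm_page_alloc() fails, reclaim_pv_chunk() is invoked
 * (only if a lock pointer was given) and the allocation is retried.
 * A freshly allocated chunk has bit 0 of pc_map[0] pre-consumed for
 * the entry being returned.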
4272 */ 4273 static pv_entry_t 4274 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 4275 { 4276 int bit, field; 4277 pv_entry_t pv; 4278 struct pv_chunk *pc; 4279 vm_page_t m; 4280 4281 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4282 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 4283 retry: 4284 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 4285 if (pc != NULL) { 4286 for (field = 0; field < _NPCM; field++) { 4287 if (pc->pc_map[field]) { 4288 bit = bsfq(pc->pc_map[field]); 4289 break; 4290 } 4291 } 4292 if (field < _NPCM) { 4293 pv = &pc->pc_pventry[field * 64 + bit]; 4294 pc->pc_map[field] &= ~(1ul << bit); 4295 /* If this was the last item, move it to tail */ 4296 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 4297 pc->pc_map[2] == 0) { 4298 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4299 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 4300 pc_list); 4301 } 4302 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 4303 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 4304 return (pv); 4305 } 4306 } 4307 /* No free items, allocate another chunk */ 4308 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 4309 VM_ALLOC_WIRED); 4310 if (m == NULL) { 4311 if (lockp == NULL) { 4312 PV_STAT(pc_chunk_tryfail++); 4313 return (NULL); 4314 } 4315 m = reclaim_pv_chunk(pmap, lockp); 4316 if (m == NULL) 4317 goto retry; 4318 } 4319 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 4320 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 4321 dump_add_page(m->phys_addr); 4322 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 4323 pc->pc_pmap = pmap; 4324 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 4325 pc->pc_map[1] = PC_FREE1; 4326 pc->pc_map[2] = PC_FREE2; 4327 mtx_lock(&pv_chunks_mutex); 4328 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 4329 mtx_unlock(&pv_chunks_mutex); 4330 pv = &pc->pc_pventry[0]; 4331 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 4332 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 4333 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 4334 return (pv); 4335 } 4336 4337 /* 4338 * Returns the number of one bits within the given PV chunk map. 4339 * 4340 * The erratas for Intel processors state that "POPCNT Instruction May 4341 * Take Longer to Execute Than Expected". It is believed that the 4342 * issue is the spurious dependency on the destination register. 4343 * Provide a hint to the register rename logic that the destination 4344 * value is overwritten, by clearing it, as suggested in the 4345 * optimization manual. It should be cheap for unaffected processors 4346 * as well. 4347 * 4348 * Reference numbers for erratas are 4349 * 4th Gen Core: HSD146 4350 * 5th Gen Core: BDM85 4351 * 6th Gen Core: SKL029 4352 */ 4353 static int 4354 popcnt_pc_map_pq(uint64_t *map) 4355 { 4356 u_long result, tmp; 4357 4358 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 4359 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 4360 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 4361 : "=&r" (result), "=&r" (tmp) 4362 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 4363 return (result); 4364 } 4365 4366 /* 4367 * Ensure that the number of spare PV entries in the specified pmap meets or 4368 * exceeds the given count, "needed". 4369 * 4370 * The given PV list lock may be released. 
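 *
 * Sketch: free entries in the pmap's existing chunks are counted first
 * (with POPCNT when available, else bit_count()), then whole chunks of
 * _NPCPV entries are allocated until "needed" is met.  If reclamation
 * was required, the count is redone, since reclaim_pv_chunk() may have
 * freed entries belonging to this pmap.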
4371 */ 4372 static void 4373 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 4374 { 4375 struct pch new_tail; 4376 struct pv_chunk *pc; 4377 vm_page_t m; 4378 int avail, free; 4379 bool reclaimed; 4380 4381 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4382 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 4383 4384 /* 4385 * Newly allocated PV chunks must be stored in a private list until 4386 * the required number of PV chunks have been allocated. Otherwise, 4387 * reclaim_pv_chunk() could recycle one of these chunks. In 4388 * contrast, these chunks must be added to the pmap upon allocation. 4389 */ 4390 TAILQ_INIT(&new_tail); 4391 retry: 4392 avail = 0; 4393 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 4394 #ifndef __POPCNT__ 4395 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 4396 bit_count((bitstr_t *)pc->pc_map, 0, 4397 sizeof(pc->pc_map) * NBBY, &free); 4398 else 4399 #endif 4400 free = popcnt_pc_map_pq(pc->pc_map); 4401 if (free == 0) 4402 break; 4403 avail += free; 4404 if (avail >= needed) 4405 break; 4406 } 4407 for (reclaimed = false; avail < needed; avail += _NPCPV) { 4408 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 4409 VM_ALLOC_WIRED); 4410 if (m == NULL) { 4411 m = reclaim_pv_chunk(pmap, lockp); 4412 if (m == NULL) 4413 goto retry; 4414 reclaimed = true; 4415 } 4416 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 4417 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 4418 dump_add_page(m->phys_addr); 4419 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 4420 pc->pc_pmap = pmap; 4421 pc->pc_map[0] = PC_FREE0; 4422 pc->pc_map[1] = PC_FREE1; 4423 pc->pc_map[2] = PC_FREE2; 4424 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 4425 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 4426 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 4427 4428 /* 4429 * The reclaim might have freed a chunk from the current pmap. 4430 * If that chunk contained available entries, we need to 4431 * re-count the number of available entries. 4432 */ 4433 if (reclaimed) 4434 goto retry; 4435 } 4436 if (!TAILQ_EMPTY(&new_tail)) { 4437 mtx_lock(&pv_chunks_mutex); 4438 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 4439 mtx_unlock(&pv_chunks_mutex); 4440 } 4441 } 4442 4443 /* 4444 * First find and then remove the pv entry for the specified pmap and virtual 4445 * address from the specified pv list. Returns the pv entry if found and NULL 4446 * otherwise. This operation can be performed on pv lists for either 4KB or 4447 * 2MB page mappings. 4448 */ 4449 static __inline pv_entry_t 4450 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 4451 { 4452 pv_entry_t pv; 4453 4454 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4455 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 4456 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4457 pvh->pv_gen++; 4458 break; 4459 } 4460 } 4461 return (pv); 4462 } 4463 4464 /* 4465 * After demotion from a 2MB page mapping to 512 4KB page mappings, 4466 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 4467 * entries for each of the 4KB page mappings. 
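 *
 * The 2MB mapping's existing pv entry is transferred to the first 4KB
 * page, so only NPTEPG - 1 (511, at 512 PTEs per page table page) new
 * entries are needed; they come from the spare chunks reserved by the
 * caller via reserve_pv_entries() before the PDE was changed.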
4468 */ 4469 static void 4470 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4471 struct rwlock **lockp) 4472 { 4473 struct md_page *pvh; 4474 struct pv_chunk *pc; 4475 pv_entry_t pv; 4476 vm_offset_t va_last; 4477 vm_page_t m; 4478 int bit, field; 4479 4480 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4481 KASSERT((pa & PDRMASK) == 0, 4482 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 4483 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4484 4485 /* 4486 * Transfer the 2mpage's pv entry for this mapping to the first 4487 * page's pv list. Once this transfer begins, the pv list lock 4488 * must not be released until the last pv entry is reinstantiated. 4489 */ 4490 pvh = pa_to_pvh(pa); 4491 va = trunc_2mpage(va); 4492 pv = pmap_pvh_remove(pvh, pmap, va); 4493 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 4494 m = PHYS_TO_VM_PAGE(pa); 4495 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4496 m->md.pv_gen++; 4497 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 4498 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 4499 va_last = va + NBPDR - PAGE_SIZE; 4500 for (;;) { 4501 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 4502 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 4503 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 4504 for (field = 0; field < _NPCM; field++) { 4505 while (pc->pc_map[field]) { 4506 bit = bsfq(pc->pc_map[field]); 4507 pc->pc_map[field] &= ~(1ul << bit); 4508 pv = &pc->pc_pventry[field * 64 + bit]; 4509 va += PAGE_SIZE; 4510 pv->pv_va = va; 4511 m++; 4512 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4513 ("pmap_pv_demote_pde: page %p is not managed", m)); 4514 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4515 m->md.pv_gen++; 4516 if (va == va_last) 4517 goto out; 4518 } 4519 } 4520 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4521 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 4522 } 4523 out: 4524 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 4525 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4526 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 4527 } 4528 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 4529 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 4530 } 4531 4532 #if VM_NRESERVLEVEL > 0 4533 /* 4534 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 4535 * replace the many pv entries for the 4KB page mappings by a single pv entry 4536 * for the 2MB page mapping. 4537 */ 4538 static void 4539 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4540 struct rwlock **lockp) 4541 { 4542 struct md_page *pvh; 4543 pv_entry_t pv; 4544 vm_offset_t va_last; 4545 vm_page_t m; 4546 4547 KASSERT((pa & PDRMASK) == 0, 4548 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 4549 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4550 4551 /* 4552 * Transfer the first page's pv entry for this mapping to the 2mpage's 4553 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 4554 * a transfer avoids the possibility that get_pv_entry() calls 4555 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 4556 * mappings that is being promoted. 4557 */ 4558 m = PHYS_TO_VM_PAGE(pa); 4559 va = trunc_2mpage(va); 4560 pv = pmap_pvh_remove(&m->md, pmap, va); 4561 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 4562 pvh = pa_to_pvh(pa); 4563 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4564 pvh->pv_gen++; 4565 /* Free the remaining NPTEPG - 1 pv entries. 
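 * The first 4KB page's entry was moved to the 2mpage's pv list above;
 * the loop below frees one entry per remaining 4KB page as "va"
 * advances to the last page of the 2MB region.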
*/ 4566 va_last = va + NBPDR - PAGE_SIZE; 4567 do { 4568 m++; 4569 va += PAGE_SIZE; 4570 pmap_pvh_free(&m->md, pmap, va); 4571 } while (va < va_last); 4572 } 4573 #endif /* VM_NRESERVLEVEL > 0 */ 4574 4575 /* 4576 * First find and then destroy the pv entry for the specified pmap and virtual 4577 * address. This operation can be performed on pv lists for either 4KB or 2MB 4578 * page mappings. 4579 */ 4580 static void 4581 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 4582 { 4583 pv_entry_t pv; 4584 4585 pv = pmap_pvh_remove(pvh, pmap, va); 4586 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 4587 free_pv_entry(pmap, pv); 4588 } 4589 4590 /* 4591 * Conditionally create the PV entry for a 4KB page mapping if the required 4592 * memory can be allocated without resorting to reclamation. 4593 */ 4594 static boolean_t 4595 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 4596 struct rwlock **lockp) 4597 { 4598 pv_entry_t pv; 4599 4600 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4601 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4602 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 4603 pv->pv_va = va; 4604 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4605 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4606 m->md.pv_gen++; 4607 return (TRUE); 4608 } else 4609 return (FALSE); 4610 } 4611 4612 /* 4613 * Create the PV entry for a 2MB page mapping. Always returns true unless the 4614 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 4615 * false if the PV entry cannot be allocated without resorting to reclamation. 4616 */ 4617 static bool 4618 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 4619 struct rwlock **lockp) 4620 { 4621 struct md_page *pvh; 4622 pv_entry_t pv; 4623 vm_paddr_t pa; 4624 4625 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4626 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4627 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 4628 NULL : lockp)) == NULL) 4629 return (false); 4630 pv->pv_va = va; 4631 pa = pde & PG_PS_FRAME; 4632 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4633 pvh = pa_to_pvh(pa); 4634 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4635 pvh->pv_gen++; 4636 return (true); 4637 } 4638 4639 /* 4640 * Fills a page table page with mappings to consecutive physical pages. 4641 */ 4642 static void 4643 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 4644 { 4645 pt_entry_t *pte; 4646 4647 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 4648 *pte = newpte; 4649 newpte += PAGE_SIZE; 4650 } 4651 } 4652 4653 /* 4654 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 4655 * mapping is invalidated. 
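 *
 * Rough outline of the locked variant below: recover the saved page
 * table page (or allocate one), fill it with 512 4KB PTEs derived
 * from the old PDE when necessary, reserve the pv entries that a
 * managed demotion will need, and only then replace the PDE with a
 * reference to that page table page.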
4656 */ 4657 static boolean_t 4658 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 4659 { 4660 struct rwlock *lock; 4661 boolean_t rv; 4662 4663 lock = NULL; 4664 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 4665 if (lock != NULL) 4666 rw_wunlock(lock); 4667 return (rv); 4668 } 4669 4670 static void 4671 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 4672 { 4673 #ifdef INVARIANTS 4674 #ifdef DIAGNOSTIC 4675 pt_entry_t *xpte, *ypte; 4676 4677 for (xpte = firstpte; xpte < firstpte + NPTEPG; 4678 xpte++, newpte += PAGE_SIZE) { 4679 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 4680 printf("pmap_demote_pde: xpte %zd and newpte map " 4681 "different pages: found %#lx, expected %#lx\n", 4682 xpte - firstpte, *xpte, newpte); 4683 printf("page table dump\n"); 4684 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 4685 printf("%zd %#lx\n", ypte - firstpte, *ypte); 4686 panic("firstpte"); 4687 } 4688 } 4689 #else 4690 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 4691 ("pmap_demote_pde: firstpte and newpte map different physical" 4692 " addresses")); 4693 #endif 4694 #endif 4695 } 4696 4697 static void 4698 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 4699 pd_entry_t oldpde, struct rwlock **lockp) 4700 { 4701 struct spglist free; 4702 vm_offset_t sva; 4703 4704 SLIST_INIT(&free); 4705 sva = trunc_2mpage(va); 4706 pmap_remove_pde(pmap, pde, sva, &free, lockp); 4707 if ((oldpde & pmap_global_bit(pmap)) == 0) 4708 pmap_invalidate_pde_page(pmap, sva, oldpde); 4709 vm_page_free_pages_toq(&free, true); 4710 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 4711 va, pmap); 4712 } 4713 4714 static boolean_t 4715 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4716 struct rwlock **lockp) 4717 { 4718 pd_entry_t newpde, oldpde; 4719 pt_entry_t *firstpte, newpte; 4720 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 4721 vm_paddr_t mptepa; 4722 vm_page_t mpte; 4723 int PG_PTE_CACHE; 4724 bool in_kernel; 4725 4726 PG_A = pmap_accessed_bit(pmap); 4727 PG_G = pmap_global_bit(pmap); 4728 PG_M = pmap_modified_bit(pmap); 4729 PG_RW = pmap_rw_bit(pmap); 4730 PG_V = pmap_valid_bit(pmap); 4731 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4732 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 4733 4734 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4735 in_kernel = va >= VM_MAXUSER_ADDRESS; 4736 oldpde = *pde; 4737 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 4738 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 4739 4740 /* 4741 * Invalidate the 2MB page mapping and return "failure" if the 4742 * mapping was never accessed. 4743 */ 4744 if ((oldpde & PG_A) == 0) { 4745 KASSERT((oldpde & PG_W) == 0, 4746 ("pmap_demote_pde: a wired mapping is missing PG_A")); 4747 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 4748 return (FALSE); 4749 } 4750 4751 mpte = pmap_remove_pt_page(pmap, va); 4752 if (mpte == NULL) { 4753 KASSERT((oldpde & PG_W) == 0, 4754 ("pmap_demote_pde: page table page for a wired mapping" 4755 " is missing")); 4756 4757 /* 4758 * If the page table page is missing and the mapping 4759 * is for a kernel address, the mapping must belong to 4760 * the direct map. Page table pages are preallocated 4761 * for every other part of the kernel address space, 4762 * so the direct map region is the only part of the 4763 * kernel address space that must be handled here. 
4764 */ 4765 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 4766 va < DMAP_MAX_ADDRESS), 4767 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 4768 4769 /* 4770 * If the 2MB page mapping belongs to the direct map 4771 * region of the kernel's address space, then the page 4772 * allocation request specifies the highest possible 4773 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 4774 * priority is normal. 4775 */ 4776 mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), 4777 (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 4778 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 4779 4780 /* 4781 * If the allocation of the new page table page fails, 4782 * invalidate the 2MB page mapping and return "failure". 4783 */ 4784 if (mpte == NULL) { 4785 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 4786 return (FALSE); 4787 } 4788 4789 if (!in_kernel) { 4790 mpte->wire_count = NPTEPG; 4791 pmap_resident_count_inc(pmap, 1); 4792 } 4793 } 4794 mptepa = VM_PAGE_TO_PHYS(mpte); 4795 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 4796 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 4797 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 4798 ("pmap_demote_pde: oldpde is missing PG_M")); 4799 newpte = oldpde & ~PG_PS; 4800 newpte = pmap_swap_pat(pmap, newpte); 4801 4802 /* 4803 * If the page table page is not leftover from an earlier promotion, 4804 * initialize it. 4805 */ 4806 if (mpte->valid == 0) 4807 pmap_fill_ptp(firstpte, newpte); 4808 4809 pmap_demote_pde_check(firstpte, newpte); 4810 4811 /* 4812 * If the mapping has changed attributes, update the page table 4813 * entries. 4814 */ 4815 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 4816 pmap_fill_ptp(firstpte, newpte); 4817 4818 /* 4819 * The spare PV entries must be reserved prior to demoting the 4820 * mapping, that is, prior to changing the PDE. Otherwise, the state 4821 * of the PDE and the PV lists will be inconsistent, which can result 4822 * in reclaim_pv_chunk() attempting to remove a PV entry from the 4823 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 4824 * PV entry for the 2MB page mapping that is being demoted. 4825 */ 4826 if ((oldpde & PG_MANAGED) != 0) 4827 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 4828 4829 /* 4830 * Demote the mapping. This pmap is locked. The old PDE has 4831 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 4832 * set. Thus, there is no danger of a race with another 4833 * processor changing the setting of PG_A and/or PG_M between 4834 * the read above and the store below. 4835 */ 4836 if (workaround_erratum383) 4837 pmap_update_pde(pmap, va, pde, newpde); 4838 else 4839 pde_store(pde, newpde); 4840 4841 /* 4842 * Invalidate a stale recursive mapping of the page table page. 4843 */ 4844 if (in_kernel) 4845 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 4846 4847 /* 4848 * Demote the PV entry. 4849 */ 4850 if ((oldpde & PG_MANAGED) != 0) 4851 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 4852 4853 atomic_add_long(&pmap_pde_demotions, 1); 4854 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 4855 va, pmap); 4856 return (TRUE); 4857 } 4858 4859 /* 4860 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
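 * The PDE is not simply cleared: it is repointed at the page table
 * page saved from the earlier promotion (zeroed here if it still held
 * mappings), so the kernel's page table structure for this range
 * remains populated after the superpage translation goes away.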
4861 */ 4862 static void 4863 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 4864 { 4865 pd_entry_t newpde; 4866 vm_paddr_t mptepa; 4867 vm_page_t mpte; 4868 4869 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4870 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4871 mpte = pmap_remove_pt_page(pmap, va); 4872 if (mpte == NULL) 4873 panic("pmap_remove_kernel_pde: Missing pt page."); 4874 4875 mptepa = VM_PAGE_TO_PHYS(mpte); 4876 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 4877 4878 /* 4879 * If this page table page was unmapped by a promotion, then it 4880 * contains valid mappings. Zero it to invalidate those mappings. 4881 */ 4882 if (mpte->valid != 0) 4883 pagezero((void *)PHYS_TO_DMAP(mptepa)); 4884 4885 /* 4886 * Demote the mapping. 4887 */ 4888 if (workaround_erratum383) 4889 pmap_update_pde(pmap, va, pde, newpde); 4890 else 4891 pde_store(pde, newpde); 4892 4893 /* 4894 * Invalidate a stale recursive mapping of the page table page. 4895 */ 4896 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 4897 } 4898 4899 /* 4900 * pmap_remove_pde: do the things to unmap a superpage in a process 4901 */ 4902 static int 4903 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 4904 struct spglist *free, struct rwlock **lockp) 4905 { 4906 struct md_page *pvh; 4907 pd_entry_t oldpde; 4908 vm_offset_t eva, va; 4909 vm_page_t m, mpte; 4910 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 4911 4912 PG_G = pmap_global_bit(pmap); 4913 PG_A = pmap_accessed_bit(pmap); 4914 PG_M = pmap_modified_bit(pmap); 4915 PG_RW = pmap_rw_bit(pmap); 4916 4917 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4918 KASSERT((sva & PDRMASK) == 0, 4919 ("pmap_remove_pde: sva is not 2mpage aligned")); 4920 oldpde = pte_load_clear(pdq); 4921 if (oldpde & PG_W) 4922 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 4923 if ((oldpde & PG_G) != 0) 4924 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4925 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 4926 if (oldpde & PG_MANAGED) { 4927 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 4928 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 4929 pmap_pvh_free(pvh, pmap, sva); 4930 eva = sva + NBPDR; 4931 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4932 va < eva; va += PAGE_SIZE, m++) { 4933 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4934 vm_page_dirty(m); 4935 if (oldpde & PG_A) 4936 vm_page_aflag_set(m, PGA_REFERENCED); 4937 if (TAILQ_EMPTY(&m->md.pv_list) && 4938 TAILQ_EMPTY(&pvh->pv_list)) 4939 vm_page_aflag_clear(m, PGA_WRITEABLE); 4940 pmap_delayed_invl_page(m); 4941 } 4942 } 4943 if (pmap == kernel_pmap) { 4944 pmap_remove_kernel_pde(pmap, pdq, sva); 4945 } else { 4946 mpte = pmap_remove_pt_page(pmap, sva); 4947 if (mpte != NULL) { 4948 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 4949 ("pmap_remove_pde: pte page not promoted")); 4950 pmap_resident_count_dec(pmap, 1); 4951 KASSERT(mpte->wire_count == NPTEPG, 4952 ("pmap_remove_pde: pte page wire count error")); 4953 mpte->wire_count = 0; 4954 pmap_add_delayed_free_list(mpte, free, FALSE); 4955 } 4956 } 4957 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 4958 } 4959 4960 /* 4961 * pmap_remove_pte: do the things to unmap a page in a process 4962 */ 4963 static int 4964 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 4965 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 4966 { 4967 struct md_page *pvh; 4968 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 4969 vm_page_t m; 4970 4971 PG_A = pmap_accessed_bit(pmap); 4972 PG_M = 
pmap_modified_bit(pmap); 4973 PG_RW = pmap_rw_bit(pmap); 4974 4975 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4976 oldpte = pte_load_clear(ptq); 4977 if (oldpte & PG_W) 4978 pmap->pm_stats.wired_count -= 1; 4979 pmap_resident_count_dec(pmap, 1); 4980 if (oldpte & PG_MANAGED) { 4981 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 4982 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4983 vm_page_dirty(m); 4984 if (oldpte & PG_A) 4985 vm_page_aflag_set(m, PGA_REFERENCED); 4986 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4987 pmap_pvh_free(&m->md, pmap, va); 4988 if (TAILQ_EMPTY(&m->md.pv_list) && 4989 (m->flags & PG_FICTITIOUS) == 0) { 4990 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4991 if (TAILQ_EMPTY(&pvh->pv_list)) 4992 vm_page_aflag_clear(m, PGA_WRITEABLE); 4993 } 4994 pmap_delayed_invl_page(m); 4995 } 4996 return (pmap_unuse_pt(pmap, va, ptepde, free)); 4997 } 4998 4999 /* 5000 * Remove a single page from a process address space 5001 */ 5002 static void 5003 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 5004 struct spglist *free) 5005 { 5006 struct rwlock *lock; 5007 pt_entry_t *pte, PG_V; 5008 5009 PG_V = pmap_valid_bit(pmap); 5010 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5011 if ((*pde & PG_V) == 0) 5012 return; 5013 pte = pmap_pde_to_pte(pde, va); 5014 if ((*pte & PG_V) == 0) 5015 return; 5016 lock = NULL; 5017 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 5018 if (lock != NULL) 5019 rw_wunlock(lock); 5020 pmap_invalidate_page(pmap, va); 5021 } 5022 5023 /* 5024 * Removes the specified range of addresses from the page table page. 5025 */ 5026 static bool 5027 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5028 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 5029 { 5030 pt_entry_t PG_G, *pte; 5031 vm_offset_t va; 5032 bool anyvalid; 5033 5034 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5035 PG_G = pmap_global_bit(pmap); 5036 anyvalid = false; 5037 va = eva; 5038 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 5039 sva += PAGE_SIZE) { 5040 if (*pte == 0) { 5041 if (va != eva) { 5042 pmap_invalidate_range(pmap, va, sva); 5043 va = eva; 5044 } 5045 continue; 5046 } 5047 if ((*pte & PG_G) == 0) 5048 anyvalid = true; 5049 else if (va == eva) 5050 va = sva; 5051 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 5052 sva += PAGE_SIZE; 5053 break; 5054 } 5055 } 5056 if (va != eva) 5057 pmap_invalidate_range(pmap, va, sva); 5058 return (anyvalid); 5059 } 5060 5061 /* 5062 * Remove the given range of addresses from the specified map. 5063 * 5064 * It is assumed that the start and end are properly 5065 * rounded to the page size. 5066 */ 5067 void 5068 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5069 { 5070 struct rwlock *lock; 5071 vm_offset_t va_next; 5072 pml4_entry_t *pml4e; 5073 pdp_entry_t *pdpe; 5074 pd_entry_t ptpaddr, *pde; 5075 pt_entry_t PG_G, PG_V; 5076 struct spglist free; 5077 int anyvalid; 5078 5079 PG_G = pmap_global_bit(pmap); 5080 PG_V = pmap_valid_bit(pmap); 5081 5082 /* 5083 * Perform an unsynchronized read. This is, however, safe. 5084 */ 5085 if (pmap->pm_stats.resident_count == 0) 5086 return; 5087 5088 anyvalid = 0; 5089 SLIST_INIT(&free); 5090 5091 pmap_delayed_invl_start(); 5092 PMAP_LOCK(pmap); 5093 pmap_pkru_on_remove(pmap, sva, eva); 5094 5095 /* 5096 * special handling of removing one page. a very 5097 * common operation and easy to short circuit some 5098 * code. 
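 * If the range is exactly one page and its PDE is not a superpage,
 * pmap_remove_page() handles it directly; otherwise execution falls
 * through to the general loop below.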
5099 */ 5100 if (sva + PAGE_SIZE == eva) { 5101 pde = pmap_pde(pmap, sva); 5102 if (pde && (*pde & PG_PS) == 0) { 5103 pmap_remove_page(pmap, sva, pde, &free); 5104 goto out; 5105 } 5106 } 5107 5108 lock = NULL; 5109 for (; sva < eva; sva = va_next) { 5110 5111 if (pmap->pm_stats.resident_count == 0) 5112 break; 5113 5114 pml4e = pmap_pml4e(pmap, sva); 5115 if ((*pml4e & PG_V) == 0) { 5116 va_next = (sva + NBPML4) & ~PML4MASK; 5117 if (va_next < sva) 5118 va_next = eva; 5119 continue; 5120 } 5121 5122 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 5123 if ((*pdpe & PG_V) == 0) { 5124 va_next = (sva + NBPDP) & ~PDPMASK; 5125 if (va_next < sva) 5126 va_next = eva; 5127 continue; 5128 } 5129 5130 /* 5131 * Calculate index for next page table. 5132 */ 5133 va_next = (sva + NBPDR) & ~PDRMASK; 5134 if (va_next < sva) 5135 va_next = eva; 5136 5137 pde = pmap_pdpe_to_pde(pdpe, sva); 5138 ptpaddr = *pde; 5139 5140 /* 5141 * Weed out invalid mappings. 5142 */ 5143 if (ptpaddr == 0) 5144 continue; 5145 5146 /* 5147 * Check for large page. 5148 */ 5149 if ((ptpaddr & PG_PS) != 0) { 5150 /* 5151 * Are we removing the entire large page? If not, 5152 * demote the mapping and fall through. 5153 */ 5154 if (sva + NBPDR == va_next && eva >= va_next) { 5155 /* 5156 * The TLB entry for a PG_G mapping is 5157 * invalidated by pmap_remove_pde(). 5158 */ 5159 if ((ptpaddr & PG_G) == 0) 5160 anyvalid = 1; 5161 pmap_remove_pde(pmap, pde, sva, &free, &lock); 5162 continue; 5163 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 5164 &lock)) { 5165 /* The large page mapping was destroyed. */ 5166 continue; 5167 } else 5168 ptpaddr = *pde; 5169 } 5170 5171 /* 5172 * Limit our scan to either the end of the va represented 5173 * by the current page table page, or to the end of the 5174 * range being removed. 5175 */ 5176 if (va_next > eva) 5177 va_next = eva; 5178 5179 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 5180 anyvalid = 1; 5181 } 5182 if (lock != NULL) 5183 rw_wunlock(lock); 5184 out: 5185 if (anyvalid) 5186 pmap_invalidate_all(pmap); 5187 PMAP_UNLOCK(pmap); 5188 pmap_delayed_invl_finish(); 5189 vm_page_free_pages_toq(&free, true); 5190 } 5191 5192 /* 5193 * Routine: pmap_remove_all 5194 * Function: 5195 * Removes this physical page from 5196 * all physical maps in which it resides. 5197 * Reflects back modify bits to the pager. 5198 * 5199 * Notes: 5200 * Original versions of this routine were very 5201 * inefficient because they iteratively called 5202 * pmap_remove (slow...) 5203 */ 5204 5205 void 5206 pmap_remove_all(vm_page_t m) 5207 { 5208 struct md_page *pvh; 5209 pv_entry_t pv; 5210 pmap_t pmap; 5211 struct rwlock *lock; 5212 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 5213 pd_entry_t *pde; 5214 vm_offset_t va; 5215 struct spglist free; 5216 int pvh_gen, md_gen; 5217 5218 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5219 ("pmap_remove_all: page %p is not managed", m)); 5220 SLIST_INIT(&free); 5221 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5222 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5223 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5224 retry: 5225 rw_wlock(lock); 5226 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 5227 pmap = PV_PMAP(pv); 5228 if (!PMAP_TRYLOCK(pmap)) { 5229 pvh_gen = pvh->pv_gen; 5230 rw_wunlock(lock); 5231 PMAP_LOCK(pmap); 5232 rw_wlock(lock); 5233 if (pvh_gen != pvh->pv_gen) { 5234 rw_wunlock(lock); 5235 PMAP_UNLOCK(pmap); 5236 goto retry; 5237 } 5238 } 5239 va = pv->pv_va; 5240 pde = pmap_pde(pmap, va); 5241 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 5242 PMAP_UNLOCK(pmap); 5243 } 5244 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 5245 pmap = PV_PMAP(pv); 5246 if (!PMAP_TRYLOCK(pmap)) { 5247 pvh_gen = pvh->pv_gen; 5248 md_gen = m->md.pv_gen; 5249 rw_wunlock(lock); 5250 PMAP_LOCK(pmap); 5251 rw_wlock(lock); 5252 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5253 rw_wunlock(lock); 5254 PMAP_UNLOCK(pmap); 5255 goto retry; 5256 } 5257 } 5258 PG_A = pmap_accessed_bit(pmap); 5259 PG_M = pmap_modified_bit(pmap); 5260 PG_RW = pmap_rw_bit(pmap); 5261 pmap_resident_count_dec(pmap, 1); 5262 pde = pmap_pde(pmap, pv->pv_va); 5263 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 5264 " a 2mpage in page %p's pv list", m)); 5265 pte = pmap_pde_to_pte(pde, pv->pv_va); 5266 tpte = pte_load_clear(pte); 5267 if (tpte & PG_W) 5268 pmap->pm_stats.wired_count--; 5269 if (tpte & PG_A) 5270 vm_page_aflag_set(m, PGA_REFERENCED); 5271 5272 /* 5273 * Update the vm_page_t clean and reference bits. 5274 */ 5275 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5276 vm_page_dirty(m); 5277 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 5278 pmap_invalidate_page(pmap, pv->pv_va); 5279 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5280 m->md.pv_gen++; 5281 free_pv_entry(pmap, pv); 5282 PMAP_UNLOCK(pmap); 5283 } 5284 vm_page_aflag_clear(m, PGA_WRITEABLE); 5285 rw_wunlock(lock); 5286 pmap_delayed_invl_wait(m); 5287 vm_page_free_pages_toq(&free, true); 5288 } 5289 5290 /* 5291 * pmap_protect_pde: do the things to protect a 2mpage in a process 5292 */ 5293 static boolean_t 5294 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 5295 { 5296 pd_entry_t newpde, oldpde; 5297 vm_page_t m, mt; 5298 boolean_t anychanged; 5299 pt_entry_t PG_G, PG_M, PG_RW; 5300 5301 PG_G = pmap_global_bit(pmap); 5302 PG_M = pmap_modified_bit(pmap); 5303 PG_RW = pmap_rw_bit(pmap); 5304 5305 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5306 KASSERT((sva & PDRMASK) == 0, 5307 ("pmap_protect_pde: sva is not 2mpage aligned")); 5308 anychanged = FALSE; 5309 retry: 5310 oldpde = newpde = *pde; 5311 if ((prot & VM_PROT_WRITE) == 0) { 5312 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 5313 (PG_MANAGED | PG_M | PG_RW)) { 5314 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 5315 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5316 vm_page_dirty(mt); 5317 } 5318 newpde &= ~(PG_RW | PG_M); 5319 } 5320 if ((prot & VM_PROT_EXECUTE) == 0) 5321 newpde |= pg_nx; 5322 if (newpde != oldpde) { 5323 /* 5324 * As an optimization to future operations on this PDE, clear 5325 * PG_PROMOTED. The impending invalidation will remove any 5326 * lingering 4KB page mappings from the TLB. 5327 */ 5328 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 5329 goto retry; 5330 if ((oldpde & PG_G) != 0) 5331 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 5332 else 5333 anychanged = TRUE; 5334 } 5335 return (anychanged); 5336 } 5337 5338 /* 5339 * Set the physical protection on the 5340 * specified range of this map as requested. 
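 *
 * Removing write permission clears PG_RW (and PG_M, after any modified
 * pages have been dirtied); removing execute permission sets the NX
 * bit.  Permissions are never added here, so a request that retains
 * both write and execute access returns immediately.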
5341 */ 5342 void 5343 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 5344 { 5345 vm_offset_t va_next; 5346 pml4_entry_t *pml4e; 5347 pdp_entry_t *pdpe; 5348 pd_entry_t ptpaddr, *pde; 5349 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 5350 boolean_t anychanged; 5351 5352 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 5353 if (prot == VM_PROT_NONE) { 5354 pmap_remove(pmap, sva, eva); 5355 return; 5356 } 5357 5358 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 5359 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 5360 return; 5361 5362 PG_G = pmap_global_bit(pmap); 5363 PG_M = pmap_modified_bit(pmap); 5364 PG_V = pmap_valid_bit(pmap); 5365 PG_RW = pmap_rw_bit(pmap); 5366 anychanged = FALSE; 5367 5368 /* 5369 * Although this function delays and batches the invalidation 5370 * of stale TLB entries, it does not need to call 5371 * pmap_delayed_invl_start() and 5372 * pmap_delayed_invl_finish(), because it does not 5373 * ordinarily destroy mappings. Stale TLB entries from 5374 * protection-only changes need only be invalidated before the 5375 * pmap lock is released, because protection-only changes do 5376 * not destroy PV entries. Even operations that iterate over 5377 * a physical page's PV list of mappings, like 5378 * pmap_remove_write(), acquire the pmap lock for each 5379 * mapping. Consequently, for protection-only changes, the 5380 * pmap lock suffices to synchronize both page table and TLB 5381 * updates. 5382 * 5383 * This function only destroys a mapping if pmap_demote_pde() 5384 * fails. In that case, stale TLB entries are immediately 5385 * invalidated. 5386 */ 5387 5388 PMAP_LOCK(pmap); 5389 for (; sva < eva; sva = va_next) { 5390 5391 pml4e = pmap_pml4e(pmap, sva); 5392 if ((*pml4e & PG_V) == 0) { 5393 va_next = (sva + NBPML4) & ~PML4MASK; 5394 if (va_next < sva) 5395 va_next = eva; 5396 continue; 5397 } 5398 5399 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 5400 if ((*pdpe & PG_V) == 0) { 5401 va_next = (sva + NBPDP) & ~PDPMASK; 5402 if (va_next < sva) 5403 va_next = eva; 5404 continue; 5405 } 5406 5407 va_next = (sva + NBPDR) & ~PDRMASK; 5408 if (va_next < sva) 5409 va_next = eva; 5410 5411 pde = pmap_pdpe_to_pde(pdpe, sva); 5412 ptpaddr = *pde; 5413 5414 /* 5415 * Weed out invalid mappings. 5416 */ 5417 if (ptpaddr == 0) 5418 continue; 5419 5420 /* 5421 * Check for large page. 5422 */ 5423 if ((ptpaddr & PG_PS) != 0) { 5424 /* 5425 * Are we protecting the entire large page? If not, 5426 * demote the mapping and fall through. 5427 */ 5428 if (sva + NBPDR == va_next && eva >= va_next) { 5429 /* 5430 * The TLB entry for a PG_G mapping is 5431 * invalidated by pmap_protect_pde(). 5432 */ 5433 if (pmap_protect_pde(pmap, pde, sva, prot)) 5434 anychanged = TRUE; 5435 continue; 5436 } else if (!pmap_demote_pde(pmap, pde, sva)) { 5437 /* 5438 * The large page mapping was destroyed. 
5439 */ 5440 continue; 5441 } 5442 } 5443 5444 if (va_next > eva) 5445 va_next = eva; 5446 5447 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 5448 sva += PAGE_SIZE) { 5449 pt_entry_t obits, pbits; 5450 vm_page_t m; 5451 5452 retry: 5453 obits = pbits = *pte; 5454 if ((pbits & PG_V) == 0) 5455 continue; 5456 5457 if ((prot & VM_PROT_WRITE) == 0) { 5458 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 5459 (PG_MANAGED | PG_M | PG_RW)) { 5460 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 5461 vm_page_dirty(m); 5462 } 5463 pbits &= ~(PG_RW | PG_M); 5464 } 5465 if ((prot & VM_PROT_EXECUTE) == 0) 5466 pbits |= pg_nx; 5467 5468 if (pbits != obits) { 5469 if (!atomic_cmpset_long(pte, obits, pbits)) 5470 goto retry; 5471 if (obits & PG_G) 5472 pmap_invalidate_page(pmap, sva); 5473 else 5474 anychanged = TRUE; 5475 } 5476 } 5477 } 5478 if (anychanged) 5479 pmap_invalidate_all(pmap); 5480 PMAP_UNLOCK(pmap); 5481 } 5482 5483 #if VM_NRESERVLEVEL > 0 5484 /* 5485 * Tries to promote the 512, contiguous 4KB page mappings that are within a 5486 * single page table page (PTP) to a single 2MB page mapping. For promotion 5487 * to occur, two conditions must be met: (1) the 4KB page mappings must map 5488 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 5489 * identical characteristics. 5490 */ 5491 static void 5492 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 5493 struct rwlock **lockp) 5494 { 5495 pd_entry_t newpde; 5496 pt_entry_t *firstpte, oldpte, pa, *pte; 5497 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; 5498 vm_page_t mpte; 5499 int PG_PTE_CACHE; 5500 5501 PG_A = pmap_accessed_bit(pmap); 5502 PG_G = pmap_global_bit(pmap); 5503 PG_M = pmap_modified_bit(pmap); 5504 PG_V = pmap_valid_bit(pmap); 5505 PG_RW = pmap_rw_bit(pmap); 5506 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 5507 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 5508 5509 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5510 5511 /* 5512 * Examine the first PTE in the specified PTP. Abort if this PTE is 5513 * either invalid, unused, or does not map the first 4KB physical page 5514 * within a 2MB page. 5515 */ 5516 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 5517 setpde: 5518 newpde = *firstpte; 5519 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 5520 atomic_add_long(&pmap_pde_p_failures, 1); 5521 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 5522 " in pmap %p", va, pmap); 5523 return; 5524 } 5525 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 5526 /* 5527 * When PG_M is already clear, PG_RW can be cleared without 5528 * a TLB invalidation. 5529 */ 5530 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 5531 goto setpde; 5532 newpde &= ~PG_RW; 5533 } 5534 5535 /* 5536 * Examine each of the other PTEs in the specified PTP. Abort if this 5537 * PTE maps an unexpected 4KB physical page or does not have identical 5538 * characteristics to the first PTE. 5539 */ 5540 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 5541 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 5542 setpte: 5543 oldpte = *pte; 5544 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 5545 atomic_add_long(&pmap_pde_p_failures, 1); 5546 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 5547 " in pmap %p", va, pmap); 5548 return; 5549 } 5550 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 5551 /* 5552 * When PG_M is already clear, PG_RW can be cleared 5553 * without a TLB invalidation. 
5554 */ 5555 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 5556 goto setpte; 5557 oldpte &= ~PG_RW; 5558 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 5559 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 5560 (va & ~PDRMASK), pmap); 5561 } 5562 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 5563 atomic_add_long(&pmap_pde_p_failures, 1); 5564 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 5565 " in pmap %p", va, pmap); 5566 return; 5567 } 5568 pa -= PAGE_SIZE; 5569 } 5570 5571 /* 5572 * Save the page table page in its current state until the PDE 5573 * mapping the superpage is demoted by pmap_demote_pde() or 5574 * destroyed by pmap_remove_pde(). 5575 */ 5576 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 5577 KASSERT(mpte >= vm_page_array && 5578 mpte < &vm_page_array[vm_page_array_size], 5579 ("pmap_promote_pde: page table page is out of range")); 5580 KASSERT(mpte->pindex == pmap_pde_pindex(va), 5581 ("pmap_promote_pde: page table page's pindex is wrong")); 5582 if (pmap_insert_pt_page(pmap, mpte, true)) { 5583 atomic_add_long(&pmap_pde_p_failures, 1); 5584 CTR2(KTR_PMAP, 5585 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 5586 pmap); 5587 return; 5588 } 5589 5590 /* 5591 * Promote the pv entries. 5592 */ 5593 if ((newpde & PG_MANAGED) != 0) 5594 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 5595 5596 /* 5597 * Propagate the PAT index to its proper position. 5598 */ 5599 newpde = pmap_swap_pat(pmap, newpde); 5600 5601 /* 5602 * Map the superpage. 5603 */ 5604 if (workaround_erratum383) 5605 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 5606 else 5607 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 5608 5609 atomic_add_long(&pmap_pde_promotions, 1); 5610 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 5611 " in pmap %p", va, pmap); 5612 } 5613 #endif /* VM_NRESERVLEVEL > 0 */ 5614 5615 /* 5616 * Insert the given physical page (p) at 5617 * the specified virtual address (v) in the 5618 * target physical map with the protection requested. 5619 * 5620 * If specified, the page will be wired down, meaning 5621 * that the related pte can not be reclaimed. 5622 * 5623 * NB: This is the only routine which MAY NOT lazy-evaluate 5624 * or lose information. That is, this routine must actually 5625 * insert this page into the given map NOW. 5626 * 5627 * When destroying both a page table and PV entry, this function 5628 * performs the TLB invalidation before releasing the PV list 5629 * lock, so we do not need pmap_delayed_invl_page() calls here. 
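 *
 * The "psind" argument selects the page size: psind == 1 requests a
 * 2MB mapping and is handed off to pmap_enter_pde() after the
 * alignment assertions; psind == 0 is an ordinary 4KB mapping handled
 * inline below.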
5630 */ 5631 int 5632 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5633 u_int flags, int8_t psind) 5634 { 5635 struct rwlock *lock; 5636 pd_entry_t *pde; 5637 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 5638 pt_entry_t newpte, origpte; 5639 pv_entry_t pv; 5640 vm_paddr_t opa, pa; 5641 vm_page_t mpte, om; 5642 int rv; 5643 boolean_t nosleep; 5644 5645 PG_A = pmap_accessed_bit(pmap); 5646 PG_G = pmap_global_bit(pmap); 5647 PG_M = pmap_modified_bit(pmap); 5648 PG_V = pmap_valid_bit(pmap); 5649 PG_RW = pmap_rw_bit(pmap); 5650 5651 va = trunc_page(va); 5652 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 5653 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 5654 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 5655 va)); 5656 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 5657 va >= kmi.clean_eva, 5658 ("pmap_enter: managed mapping within the clean submap")); 5659 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 5660 VM_OBJECT_ASSERT_LOCKED(m->object); 5661 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 5662 ("pmap_enter: flags %u has reserved bits set", flags)); 5663 pa = VM_PAGE_TO_PHYS(m); 5664 newpte = (pt_entry_t)(pa | PG_A | PG_V); 5665 if ((flags & VM_PROT_WRITE) != 0) 5666 newpte |= PG_M; 5667 if ((prot & VM_PROT_WRITE) != 0) 5668 newpte |= PG_RW; 5669 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 5670 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 5671 if ((prot & VM_PROT_EXECUTE) == 0) 5672 newpte |= pg_nx; 5673 if ((flags & PMAP_ENTER_WIRED) != 0) 5674 newpte |= PG_W; 5675 if (va < VM_MAXUSER_ADDRESS) 5676 newpte |= PG_U; 5677 if (pmap == kernel_pmap) 5678 newpte |= PG_G; 5679 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 5680 5681 /* 5682 * Set modified bit gratuitously for writeable mappings if 5683 * the page is unmanaged. We do not want to take a fault 5684 * to do the dirty bit accounting for these mappings. 5685 */ 5686 if ((m->oflags & VPO_UNMANAGED) != 0) { 5687 if ((newpte & PG_RW) != 0) 5688 newpte |= PG_M; 5689 } else 5690 newpte |= PG_MANAGED; 5691 5692 lock = NULL; 5693 PMAP_LOCK(pmap); 5694 if (psind == 1) { 5695 /* Assert the required virtual and physical alignment. */ 5696 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 5697 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 5698 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 5699 goto out; 5700 } 5701 mpte = NULL; 5702 5703 /* 5704 * In the case that a page table page is not 5705 * resident, we are creating it here. 5706 */ 5707 retry: 5708 pde = pmap_pde(pmap, va); 5709 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 5710 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 5711 pte = pmap_pde_to_pte(pde, va); 5712 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 5713 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 5714 mpte->wire_count++; 5715 } 5716 } else if (va < VM_MAXUSER_ADDRESS) { 5717 /* 5718 * Here if the pte page isn't mapped, or if it has been 5719 * deallocated. 5720 */ 5721 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 5722 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 5723 nosleep ? 
NULL : &lock); 5724 if (mpte == NULL && nosleep) { 5725 rv = KERN_RESOURCE_SHORTAGE; 5726 goto out; 5727 } 5728 goto retry; 5729 } else 5730 panic("pmap_enter: invalid page directory va=%#lx", va); 5731 5732 origpte = *pte; 5733 pv = NULL; 5734 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 5735 newpte |= pmap_pkru_get(pmap, va); 5736 5737 /* 5738 * Is the specified virtual address already mapped? 5739 */ 5740 if ((origpte & PG_V) != 0) { 5741 /* 5742 * Wiring change, just update stats. We don't worry about 5743 * wiring PT pages as they remain resident as long as there 5744 * are valid mappings in them. Hence, if a user page is wired, 5745 * the PT page will be also. 5746 */ 5747 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 5748 pmap->pm_stats.wired_count++; 5749 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 5750 pmap->pm_stats.wired_count--; 5751 5752 /* 5753 * Remove the extra PT page reference. 5754 */ 5755 if (mpte != NULL) { 5756 mpte->wire_count--; 5757 KASSERT(mpte->wire_count > 0, 5758 ("pmap_enter: missing reference to page table page," 5759 " va: 0x%lx", va)); 5760 } 5761 5762 /* 5763 * Has the physical page changed? 5764 */ 5765 opa = origpte & PG_FRAME; 5766 if (opa == pa) { 5767 /* 5768 * No, might be a protection or wiring change. 5769 */ 5770 if ((origpte & PG_MANAGED) != 0 && 5771 (newpte & PG_RW) != 0) 5772 vm_page_aflag_set(m, PGA_WRITEABLE); 5773 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 5774 goto unchanged; 5775 goto validate; 5776 } 5777 5778 /* 5779 * The physical page has changed. Temporarily invalidate 5780 * the mapping. This ensures that all threads sharing the 5781 * pmap keep a consistent view of the mapping, which is 5782 * necessary for the correct handling of COW faults. It 5783 * also permits reuse of the old mapping's PV entry, 5784 * avoiding an allocation. 5785 * 5786 * For consistency, handle unmanaged mappings the same way. 5787 */ 5788 origpte = pte_load_clear(pte); 5789 KASSERT((origpte & PG_FRAME) == opa, 5790 ("pmap_enter: unexpected pa update for %#lx", va)); 5791 if ((origpte & PG_MANAGED) != 0) { 5792 om = PHYS_TO_VM_PAGE(opa); 5793 5794 /* 5795 * The pmap lock is sufficient to synchronize with 5796 * concurrent calls to pmap_page_test_mappings() and 5797 * pmap_ts_referenced(). 5798 */ 5799 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5800 vm_page_dirty(om); 5801 if ((origpte & PG_A) != 0) 5802 vm_page_aflag_set(om, PGA_REFERENCED); 5803 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 5804 pv = pmap_pvh_remove(&om->md, pmap, va); 5805 KASSERT(pv != NULL, 5806 ("pmap_enter: no PV entry for %#lx", va)); 5807 if ((newpte & PG_MANAGED) == 0) 5808 free_pv_entry(pmap, pv); 5809 if ((om->aflags & PGA_WRITEABLE) != 0 && 5810 TAILQ_EMPTY(&om->md.pv_list) && 5811 ((om->flags & PG_FICTITIOUS) != 0 || 5812 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 5813 vm_page_aflag_clear(om, PGA_WRITEABLE); 5814 } 5815 if ((origpte & PG_A) != 0) 5816 pmap_invalidate_page(pmap, va); 5817 origpte = 0; 5818 } else { 5819 /* 5820 * Increment the counters. 5821 */ 5822 if ((newpte & PG_W) != 0) 5823 pmap->pm_stats.wired_count++; 5824 pmap_resident_count_inc(pmap, 1); 5825 } 5826 5827 /* 5828 * Enter on the PV list if part of our managed memory. 
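 * A pv entry links this (pmap, va) mapping onto the page's pv list so
 * that reverse-mapping operations such as pmap_remove_write() and
 * pmap_ts_referenced() can later find every mapping of the page.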
5829 */ 5830 if ((newpte & PG_MANAGED) != 0) { 5831 if (pv == NULL) { 5832 pv = get_pv_entry(pmap, &lock); 5833 pv->pv_va = va; 5834 } 5835 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 5836 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5837 m->md.pv_gen++; 5838 if ((newpte & PG_RW) != 0) 5839 vm_page_aflag_set(m, PGA_WRITEABLE); 5840 } 5841 5842 /* 5843 * Update the PTE. 5844 */ 5845 if ((origpte & PG_V) != 0) { 5846 validate: 5847 origpte = pte_load_store(pte, newpte); 5848 KASSERT((origpte & PG_FRAME) == pa, 5849 ("pmap_enter: unexpected pa update for %#lx", va)); 5850 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 5851 (PG_M | PG_RW)) { 5852 if ((origpte & PG_MANAGED) != 0) 5853 vm_page_dirty(m); 5854 5855 /* 5856 * Although the PTE may still have PG_RW set, TLB 5857 * invalidation may nonetheless be required because 5858 * the PTE no longer has PG_M set. 5859 */ 5860 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 5861 /* 5862 * This PTE change does not require TLB invalidation. 5863 */ 5864 goto unchanged; 5865 } 5866 if ((origpte & PG_A) != 0) 5867 pmap_invalidate_page(pmap, va); 5868 } else 5869 pte_store(pte, newpte); 5870 5871 unchanged: 5872 5873 #if VM_NRESERVLEVEL > 0 5874 /* 5875 * If both the page table page and the reservation are fully 5876 * populated, then attempt promotion. 5877 */ 5878 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 5879 pmap_ps_enabled(pmap) && 5880 (m->flags & PG_FICTITIOUS) == 0 && 5881 vm_reserv_level_iffullpop(m) == 0) 5882 pmap_promote_pde(pmap, pde, va, &lock); 5883 #endif 5884 5885 rv = KERN_SUCCESS; 5886 out: 5887 if (lock != NULL) 5888 rw_wunlock(lock); 5889 PMAP_UNLOCK(pmap); 5890 return (rv); 5891 } 5892 5893 /* 5894 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 5895 * if successful. Returns false if (1) a page table page cannot be allocated 5896 * without sleeping, (2) a mapping already exists at the specified virtual 5897 * address, or (3) a PV entry cannot be allocated without reclaiming another 5898 * PV entry. 5899 */ 5900 static bool 5901 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5902 struct rwlock **lockp) 5903 { 5904 pd_entry_t newpde; 5905 pt_entry_t PG_V; 5906 5907 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5908 PG_V = pmap_valid_bit(pmap); 5909 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 5910 PG_PS | PG_V; 5911 if ((m->oflags & VPO_UNMANAGED) == 0) 5912 newpde |= PG_MANAGED; 5913 if ((prot & VM_PROT_EXECUTE) == 0) 5914 newpde |= pg_nx; 5915 if (va < VM_MAXUSER_ADDRESS) 5916 newpde |= PG_U; 5917 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 5918 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 5919 KERN_SUCCESS); 5920 } 5921 5922 /* 5923 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 5924 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 5925 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 5926 * a mapping already exists at the specified virtual address. Returns 5927 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 5928 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 5929 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 5930 * 5931 * The parameter "m" is only used when creating a managed, writeable mapping. 
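 *
 * For example, pmap_enter_2mpage() above passes PMAP_ENTER_NOSLEEP |
 * PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM and collapses the
 * returned status into a boolean, whereas pmap_enter() propagates the
 * status to its caller for psind == 1 requests.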
5932 */ 5933 static int 5934 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 5935 vm_page_t m, struct rwlock **lockp) 5936 { 5937 struct spglist free; 5938 pd_entry_t oldpde, *pde; 5939 pt_entry_t PG_G, PG_RW, PG_V; 5940 vm_page_t mt, pdpg; 5941 5942 KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, 5943 ("pmap_enter_pde: cannot create wired user mapping")); 5944 PG_G = pmap_global_bit(pmap); 5945 PG_RW = pmap_rw_bit(pmap); 5946 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 5947 ("pmap_enter_pde: newpde is missing PG_M")); 5948 PG_V = pmap_valid_bit(pmap); 5949 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5950 5951 if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 5952 NULL : lockp)) == NULL) { 5953 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5954 " in pmap %p", va, pmap); 5955 return (KERN_RESOURCE_SHORTAGE); 5956 } 5957 5958 /* 5959 * If pkru is not same for the whole pde range, return failure 5960 * and let vm_fault() cope. Check after pde allocation, since 5961 * it could sleep. 5962 */ 5963 if (!pmap_pkru_same(pmap, va, va + NBPDR)) { 5964 SLIST_INIT(&free); 5965 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 5966 pmap_invalidate_page(pmap, va); 5967 vm_page_free_pages_toq(&free, true); 5968 } 5969 return (KERN_FAILURE); 5970 } 5971 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { 5972 newpde &= ~X86_PG_PKU_MASK; 5973 newpde |= pmap_pkru_get(pmap, va); 5974 } 5975 5976 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 5977 pde = &pde[pmap_pde_index(va)]; 5978 oldpde = *pde; 5979 if ((oldpde & PG_V) != 0) { 5980 KASSERT(pdpg->wire_count > 1, 5981 ("pmap_enter_pde: pdpg's wire count is too low")); 5982 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5983 pdpg->wire_count--; 5984 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5985 " in pmap %p", va, pmap); 5986 return (KERN_FAILURE); 5987 } 5988 /* Break the existing mapping(s). */ 5989 SLIST_INIT(&free); 5990 if ((oldpde & PG_PS) != 0) { 5991 /* 5992 * The reference to the PD page that was acquired by 5993 * pmap_allocpde() ensures that it won't be freed. 5994 * However, if the PDE resulted from a promotion, then 5995 * a reserved PT page could be freed. 5996 */ 5997 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 5998 if ((oldpde & PG_G) == 0) 5999 pmap_invalidate_pde_page(pmap, va, oldpde); 6000 } else { 6001 pmap_delayed_invl_start(); 6002 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 6003 lockp)) 6004 pmap_invalidate_all(pmap); 6005 pmap_delayed_invl_finish(); 6006 } 6007 vm_page_free_pages_toq(&free, true); 6008 if (va >= VM_MAXUSER_ADDRESS) { 6009 /* 6010 * Both pmap_remove_pde() and pmap_remove_ptes() will 6011 * leave the kernel page table page zero filled. 6012 */ 6013 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6014 if (pmap_insert_pt_page(pmap, mt, false)) 6015 panic("pmap_enter_pde: trie insert failed"); 6016 } else 6017 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 6018 pde)); 6019 } 6020 if ((newpde & PG_MANAGED) != 0) { 6021 /* 6022 * Abort this mapping if its PV entry could not be created. 6023 */ 6024 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 6025 SLIST_INIT(&free); 6026 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 6027 /* 6028 * Although "va" is not mapped, paging- 6029 * structure caches could nonetheless have 6030 * entries that refer to the freed page table 6031 * pages. Invalidate those entries. 
6032 */ 6033 pmap_invalidate_page(pmap, va); 6034 vm_page_free_pages_toq(&free, true); 6035 } 6036 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 6037 " in pmap %p", va, pmap); 6038 return (KERN_RESOURCE_SHORTAGE); 6039 } 6040 if ((newpde & PG_RW) != 0) { 6041 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6042 vm_page_aflag_set(mt, PGA_WRITEABLE); 6043 } 6044 } 6045 6046 /* 6047 * Increment counters. 6048 */ 6049 if ((newpde & PG_W) != 0) 6050 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 6051 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 6052 6053 /* 6054 * Map the superpage. (This is not a promoted mapping; there will not 6055 * be any lingering 4KB page mappings in the TLB.) 6056 */ 6057 pde_store(pde, newpde); 6058 6059 atomic_add_long(&pmap_pde_mappings, 1); 6060 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 6061 " in pmap %p", va, pmap); 6062 return (KERN_SUCCESS); 6063 } 6064 6065 /* 6066 * Maps a sequence of resident pages belonging to the same object. 6067 * The sequence begins with the given page m_start. This page is 6068 * mapped at the given virtual address start. Each subsequent page is 6069 * mapped at a virtual address that is offset from start by the same 6070 * amount as the page is offset from m_start within the object. The 6071 * last page in the sequence is the page with the largest offset from 6072 * m_start that can be mapped at a virtual address less than the given 6073 * virtual address end. Not every virtual page between start and end 6074 * is mapped; only those for which a resident page exists with the 6075 * corresponding offset from m_start are mapped. 6076 */ 6077 void 6078 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 6079 vm_page_t m_start, vm_prot_t prot) 6080 { 6081 struct rwlock *lock; 6082 vm_offset_t va; 6083 vm_page_t m, mpte; 6084 vm_pindex_t diff, psize; 6085 6086 VM_OBJECT_ASSERT_LOCKED(m_start->object); 6087 6088 psize = atop(end - start); 6089 mpte = NULL; 6090 m = m_start; 6091 lock = NULL; 6092 PMAP_LOCK(pmap); 6093 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 6094 va = start + ptoa(diff); 6095 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 6096 m->psind == 1 && pmap_ps_enabled(pmap) && 6097 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 6098 m = &m[NBPDR / PAGE_SIZE - 1]; 6099 else 6100 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 6101 mpte, &lock); 6102 m = TAILQ_NEXT(m, listq); 6103 } 6104 if (lock != NULL) 6105 rw_wunlock(lock); 6106 PMAP_UNLOCK(pmap); 6107 } 6108 6109 /* 6110 * This code makes some *MAJOR* assumptions: 6111 * 1. The current pmap and the given pmap exist. 6112 * 2. The mapping is not wired. 6113 * 3. Only read access is required. 6114 * 4. No page table pages. 6115 * In return, it is *MUCH* faster than pmap_enter...
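 *
 * A sketch of the intended use (hypothetical caller, not taken from
 * this file): while prefaulting a range, a caller can cheaply seed a
 * read-only mapping with
 *
 *	pmap_enter_quick(pm, va, m, VM_PROT_READ);
 *
 * accepting that the mapping is silently skipped if a page table page
 * cannot be allocated without sleeping.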
6116 */ 6117 6118 void 6119 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 6120 { 6121 struct rwlock *lock; 6122 6123 lock = NULL; 6124 PMAP_LOCK(pmap); 6125 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 6126 if (lock != NULL) 6127 rw_wunlock(lock); 6128 PMAP_UNLOCK(pmap); 6129 } 6130 6131 static vm_page_t 6132 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 6133 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 6134 { 6135 struct spglist free; 6136 pt_entry_t newpte, *pte, PG_V; 6137 6138 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 6139 (m->oflags & VPO_UNMANAGED) != 0, 6140 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 6141 PG_V = pmap_valid_bit(pmap); 6142 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6143 6144 /* 6145 * In the case that a page table page is not 6146 * resident, we are creating it here. 6147 */ 6148 if (va < VM_MAXUSER_ADDRESS) { 6149 vm_pindex_t ptepindex; 6150 pd_entry_t *ptepa; 6151 6152 /* 6153 * Calculate pagetable page index 6154 */ 6155 ptepindex = pmap_pde_pindex(va); 6156 if (mpte && (mpte->pindex == ptepindex)) { 6157 mpte->wire_count++; 6158 } else { 6159 /* 6160 * Get the page directory entry 6161 */ 6162 ptepa = pmap_pde(pmap, va); 6163 6164 /* 6165 * If the page table page is mapped, we just increment 6166 * the hold count, and activate it. Otherwise, we 6167 * attempt to allocate a page table page. If this 6168 * attempt fails, we don't retry. Instead, we give up. 6169 */ 6170 if (ptepa && (*ptepa & PG_V) != 0) { 6171 if (*ptepa & PG_PS) 6172 return (NULL); 6173 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 6174 mpte->wire_count++; 6175 } else { 6176 /* 6177 * Pass NULL instead of the PV list lock 6178 * pointer, because we don't intend to sleep. 6179 */ 6180 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 6181 if (mpte == NULL) 6182 return (mpte); 6183 } 6184 } 6185 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 6186 pte = &pte[pmap_pte_index(va)]; 6187 } else { 6188 mpte = NULL; 6189 pte = vtopte(va); 6190 } 6191 if (*pte) { 6192 if (mpte != NULL) { 6193 mpte->wire_count--; 6194 mpte = NULL; 6195 } 6196 return (mpte); 6197 } 6198 6199 /* 6200 * Enter on the PV list if part of our managed memory. 6201 */ 6202 if ((m->oflags & VPO_UNMANAGED) == 0 && 6203 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 6204 if (mpte != NULL) { 6205 SLIST_INIT(&free); 6206 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 6207 /* 6208 * Although "va" is not mapped, paging- 6209 * structure caches could nonetheless have 6210 * entries that refer to the freed page table 6211 * pages. Invalidate those entries. 6212 */ 6213 pmap_invalidate_page(pmap, va); 6214 vm_page_free_pages_toq(&free, true); 6215 } 6216 mpte = NULL; 6217 } 6218 return (mpte); 6219 } 6220 6221 /* 6222 * Increment counters 6223 */ 6224 pmap_resident_count_inc(pmap, 1); 6225 6226 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 6227 pmap_cache_bits(pmap, m->md.pat_mode, 0); 6228 if ((m->oflags & VPO_UNMANAGED) == 0) 6229 newpte |= PG_MANAGED; 6230 if ((prot & VM_PROT_EXECUTE) == 0) 6231 newpte |= pg_nx; 6232 if (va < VM_MAXUSER_ADDRESS) 6233 newpte |= PG_U | pmap_pkru_get(pmap, va); 6234 pte_store(pte, newpte); 6235 return (mpte); 6236 } 6237 6238 /* 6239 * Make a temporary mapping for a physical address. This is only intended 6240 * to be used for panic dumps. 
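 *
 * Note that the return value is always the base of crashdumpmap; the
 * page is mapped "i" pages into that window, so a caller mapping more
 * than one page must add the offset itself.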
6241 */ 6242 void * 6243 pmap_kenter_temporary(vm_paddr_t pa, int i) 6244 { 6245 vm_offset_t va; 6246 6247 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 6248 pmap_kenter(va, pa); 6249 invlpg(va); 6250 return ((void *)crashdumpmap); 6251 } 6252 6253 /* 6254 * This code maps large physical mmap regions into the 6255 * processor address space. Note that some shortcuts 6256 * are taken, but the code works. 6257 */ 6258 void 6259 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 6260 vm_pindex_t pindex, vm_size_t size) 6261 { 6262 pd_entry_t *pde; 6263 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6264 vm_paddr_t pa, ptepa; 6265 vm_page_t p, pdpg; 6266 int pat_mode; 6267 6268 PG_A = pmap_accessed_bit(pmap); 6269 PG_M = pmap_modified_bit(pmap); 6270 PG_V = pmap_valid_bit(pmap); 6271 PG_RW = pmap_rw_bit(pmap); 6272 6273 VM_OBJECT_ASSERT_WLOCKED(object); 6274 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 6275 ("pmap_object_init_pt: non-device object")); 6276 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 6277 if (!pmap_ps_enabled(pmap)) 6278 return; 6279 if (!vm_object_populate(object, pindex, pindex + atop(size))) 6280 return; 6281 p = vm_page_lookup(object, pindex); 6282 KASSERT(p->valid == VM_PAGE_BITS_ALL, 6283 ("pmap_object_init_pt: invalid page %p", p)); 6284 pat_mode = p->md.pat_mode; 6285 6286 /* 6287 * Abort the mapping if the first page is not physically 6288 * aligned to a 2MB page boundary. 6289 */ 6290 ptepa = VM_PAGE_TO_PHYS(p); 6291 if (ptepa & (NBPDR - 1)) 6292 return; 6293 6294 /* 6295 * Skip the first page. Abort the mapping if the rest of 6296 * the pages are not physically contiguous or have differing 6297 * memory attributes. 6298 */ 6299 p = TAILQ_NEXT(p, listq); 6300 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 6301 pa += PAGE_SIZE) { 6302 KASSERT(p->valid == VM_PAGE_BITS_ALL, 6303 ("pmap_object_init_pt: invalid page %p", p)); 6304 if (pa != VM_PAGE_TO_PHYS(p) || 6305 pat_mode != p->md.pat_mode) 6306 return; 6307 p = TAILQ_NEXT(p, listq); 6308 } 6309 6310 /* 6311 * Map using 2MB pages. Since "ptepa" is 2M aligned and 6312 * "size" is a multiple of 2M, adding the PAT setting to "pa" 6313 * will not affect the termination of this loop. 6314 */ 6315 PMAP_LOCK(pmap); 6316 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 6317 pa < ptepa + size; pa += NBPDR) { 6318 pdpg = pmap_allocpde(pmap, addr, NULL); 6319 if (pdpg == NULL) { 6320 /* 6321 * The creation of mappings below is only an 6322 * optimization. If a page directory page 6323 * cannot be allocated without blocking, 6324 * continue on to the next mapping rather than 6325 * blocking. 6326 */ 6327 addr += NBPDR; 6328 continue; 6329 } 6330 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 6331 pde = &pde[pmap_pde_index(addr)]; 6332 if ((*pde & PG_V) == 0) { 6333 pde_store(pde, pa | PG_PS | PG_M | PG_A | 6334 PG_U | PG_RW | PG_V); 6335 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 6336 atomic_add_long(&pmap_pde_mappings, 1); 6337 } else { 6338 /* Continue on if the PDE is already valid. */ 6339 pdpg->wire_count--; 6340 KASSERT(pdpg->wire_count > 0, 6341 ("pmap_object_init_pt: missing reference " 6342 "to page directory page, va: 0x%lx", addr)); 6343 } 6344 addr += NBPDR; 6345 } 6346 PMAP_UNLOCK(pmap); 6347 } 6348 } 6349 6350 /* 6351 * Clear the wired attribute from the mappings for the specified range of 6352 * addresses in the given pmap. Every valid mapping within that range 6353 * must have the wired attribute set. 
In contrast, invalid mappings 6354 * cannot have the wired attribute set, so they are ignored. 6355 * 6356 * The wired attribute of the page table entry is not a hardware 6357 * feature, so there is no need to invalidate any TLB entries. 6358 * Since pmap_demote_pde() for the wired entry must never fail, 6359 * pmap_delayed_invl_start()/finish() calls around the 6360 * function are not needed. 6361 */ 6362 void 6363 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6364 { 6365 vm_offset_t va_next; 6366 pml4_entry_t *pml4e; 6367 pdp_entry_t *pdpe; 6368 pd_entry_t *pde; 6369 pt_entry_t *pte, PG_V; 6370 6371 PG_V = pmap_valid_bit(pmap); 6372 PMAP_LOCK(pmap); 6373 for (; sva < eva; sva = va_next) { 6374 pml4e = pmap_pml4e(pmap, sva); 6375 if ((*pml4e & PG_V) == 0) { 6376 va_next = (sva + NBPML4) & ~PML4MASK; 6377 if (va_next < sva) 6378 va_next = eva; 6379 continue; 6380 } 6381 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6382 if ((*pdpe & PG_V) == 0) { 6383 va_next = (sva + NBPDP) & ~PDPMASK; 6384 if (va_next < sva) 6385 va_next = eva; 6386 continue; 6387 } 6388 va_next = (sva + NBPDR) & ~PDRMASK; 6389 if (va_next < sva) 6390 va_next = eva; 6391 pde = pmap_pdpe_to_pde(pdpe, sva); 6392 if ((*pde & PG_V) == 0) 6393 continue; 6394 if ((*pde & PG_PS) != 0) { 6395 if ((*pde & PG_W) == 0) 6396 panic("pmap_unwire: pde %#jx is missing PG_W", 6397 (uintmax_t)*pde); 6398 6399 /* 6400 * Are we unwiring the entire large page? If not, 6401 * demote the mapping and fall through. 6402 */ 6403 if (sva + NBPDR == va_next && eva >= va_next) { 6404 atomic_clear_long(pde, PG_W); 6405 pmap->pm_stats.wired_count -= NBPDR / 6406 PAGE_SIZE; 6407 continue; 6408 } else if (!pmap_demote_pde(pmap, pde, sva)) 6409 panic("pmap_unwire: demotion failed"); 6410 } 6411 if (va_next > eva) 6412 va_next = eva; 6413 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6414 sva += PAGE_SIZE) { 6415 if ((*pte & PG_V) == 0) 6416 continue; 6417 if ((*pte & PG_W) == 0) 6418 panic("pmap_unwire: pte %#jx is missing PG_W", 6419 (uintmax_t)*pte); 6420 6421 /* 6422 * PG_W must be cleared atomically. Although the pmap 6423 * lock synchronizes access to PG_W, another processor 6424 * could be setting PG_M and/or PG_A concurrently. 6425 */ 6426 atomic_clear_long(pte, PG_W); 6427 pmap->pm_stats.wired_count--; 6428 } 6429 } 6430 PMAP_UNLOCK(pmap); 6431 } 6432 6433 /* 6434 * Copy the range specified by src_addr/len 6435 * from the source map to the range dst_addr/len 6436 * in the destination map. 6437 * 6438 * This routine is only advisory and need not do anything. 6439 */ 6440 void 6441 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6442 vm_offset_t src_addr) 6443 { 6444 struct rwlock *lock; 6445 struct spglist free; 6446 pml4_entry_t *pml4e; 6447 pdp_entry_t *pdpe; 6448 pd_entry_t *pde, srcptepaddr; 6449 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 6450 vm_offset_t addr, end_addr, va_next; 6451 vm_page_t dst_pdpg, dstmpte, srcmpte; 6452 6453 if (dst_addr != src_addr) 6454 return; 6455 6456 if (dst_pmap->pm_type != src_pmap->pm_type) 6457 return; 6458 6459 /* 6460 * EPT page table entries that require emulation of A/D bits are 6461 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 6462 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 6463 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 6464 * implementations flag an EPT misconfiguration for exec-only 6465 * mappings we skip this function entirely for emulated pmaps. 
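 * Because this routine is advisory (see above), skipping it in that
 * case costs the destination pmap only soft faults; it does not affect
 * correctness.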
6466 */ 6467 if (pmap_emulate_ad_bits(dst_pmap)) 6468 return; 6469 6470 end_addr = src_addr + len; 6471 lock = NULL; 6472 if (dst_pmap < src_pmap) { 6473 PMAP_LOCK(dst_pmap); 6474 PMAP_LOCK(src_pmap); 6475 } else { 6476 PMAP_LOCK(src_pmap); 6477 PMAP_LOCK(dst_pmap); 6478 } 6479 6480 PG_A = pmap_accessed_bit(dst_pmap); 6481 PG_M = pmap_modified_bit(dst_pmap); 6482 PG_V = pmap_valid_bit(dst_pmap); 6483 6484 for (addr = src_addr; addr < end_addr; addr = va_next) { 6485 KASSERT(addr < UPT_MIN_ADDRESS, 6486 ("pmap_copy: invalid to pmap_copy page tables")); 6487 6488 pml4e = pmap_pml4e(src_pmap, addr); 6489 if ((*pml4e & PG_V) == 0) { 6490 va_next = (addr + NBPML4) & ~PML4MASK; 6491 if (va_next < addr) 6492 va_next = end_addr; 6493 continue; 6494 } 6495 6496 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 6497 if ((*pdpe & PG_V) == 0) { 6498 va_next = (addr + NBPDP) & ~PDPMASK; 6499 if (va_next < addr) 6500 va_next = end_addr; 6501 continue; 6502 } 6503 6504 va_next = (addr + NBPDR) & ~PDRMASK; 6505 if (va_next < addr) 6506 va_next = end_addr; 6507 6508 pde = pmap_pdpe_to_pde(pdpe, addr); 6509 srcptepaddr = *pde; 6510 if (srcptepaddr == 0) 6511 continue; 6512 6513 if (srcptepaddr & PG_PS) { 6514 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 6515 continue; 6516 dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); 6517 if (dst_pdpg == NULL) 6518 break; 6519 pde = (pd_entry_t *) 6520 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 6521 pde = &pde[pmap_pde_index(addr)]; 6522 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 6523 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 6524 PMAP_ENTER_NORECLAIM, &lock))) { 6525 *pde = srcptepaddr & ~PG_W; 6526 pmap_resident_count_inc(dst_pmap, NBPDR / 6527 PAGE_SIZE); 6528 atomic_add_long(&pmap_pde_mappings, 1); 6529 } else 6530 dst_pdpg->wire_count--; 6531 continue; 6532 } 6533 6534 srcptepaddr &= PG_FRAME; 6535 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 6536 KASSERT(srcmpte->wire_count > 0, 6537 ("pmap_copy: source page table page is unused")); 6538 6539 if (va_next > end_addr) 6540 va_next = end_addr; 6541 6542 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 6543 src_pte = &src_pte[pmap_pte_index(addr)]; 6544 dstmpte = NULL; 6545 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 6546 ptetemp = *src_pte; 6547 6548 /* 6549 * We only virtual copy managed pages. 6550 */ 6551 if ((ptetemp & PG_MANAGED) == 0) 6552 continue; 6553 6554 if (dstmpte != NULL) { 6555 KASSERT(dstmpte->pindex == 6556 pmap_pde_pindex(addr), 6557 ("dstmpte pindex/addr mismatch")); 6558 dstmpte->wire_count++; 6559 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 6560 NULL)) == NULL) 6561 goto out; 6562 dst_pte = (pt_entry_t *) 6563 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 6564 dst_pte = &dst_pte[pmap_pte_index(addr)]; 6565 if (*dst_pte == 0 && 6566 pmap_try_insert_pv_entry(dst_pmap, addr, 6567 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 6568 /* 6569 * Clear the wired, modified, and accessed 6570 * (referenced) bits during the copy. 6571 */ 6572 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 6573 pmap_resident_count_inc(dst_pmap, 1); 6574 } else { 6575 SLIST_INIT(&free); 6576 if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, 6577 &free)) { 6578 /* 6579 * Although "addr" is not mapped, 6580 * paging-structure caches could 6581 * nonetheless have entries that refer 6582 * to the freed page table pages. 6583 * Invalidate those entries. 
6584 */ 6585 pmap_invalidate_page(dst_pmap, addr); 6586 vm_page_free_pages_toq(&free, true); 6587 } 6588 goto out; 6589 } 6590 /* Have we copied all of the valid mappings? */ 6591 if (dstmpte->wire_count >= srcmpte->wire_count) 6592 break; 6593 } 6594 } 6595 out: 6596 if (lock != NULL) 6597 rw_wunlock(lock); 6598 PMAP_UNLOCK(src_pmap); 6599 PMAP_UNLOCK(dst_pmap); 6600 } 6601 6602 int 6603 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 6604 { 6605 int error; 6606 6607 if (dst_pmap->pm_type != src_pmap->pm_type || 6608 dst_pmap->pm_type != PT_X86 || 6609 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 6610 return (0); 6611 for (;;) { 6612 if (dst_pmap < src_pmap) { 6613 PMAP_LOCK(dst_pmap); 6614 PMAP_LOCK(src_pmap); 6615 } else { 6616 PMAP_LOCK(src_pmap); 6617 PMAP_LOCK(dst_pmap); 6618 } 6619 error = pmap_pkru_copy(dst_pmap, src_pmap); 6620 /* Clean up partial copy on failure due to no memory. */ 6621 if (error == ENOMEM) 6622 pmap_pkru_deassign_all(dst_pmap); 6623 PMAP_UNLOCK(src_pmap); 6624 PMAP_UNLOCK(dst_pmap); 6625 if (error != ENOMEM) 6626 break; 6627 vm_wait(NULL); 6628 } 6629 return (error); 6630 } 6631 6632 /* 6633 * Zero the specified hardware page. 6634 */ 6635 void 6636 pmap_zero_page(vm_page_t m) 6637 { 6638 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6639 6640 pagezero((void *)va); 6641 } 6642 6643 /* 6644 * Zero an an area within a single hardware page. off and size must not 6645 * cover an area beyond a single hardware page. 6646 */ 6647 void 6648 pmap_zero_page_area(vm_page_t m, int off, int size) 6649 { 6650 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6651 6652 if (off == 0 && size == PAGE_SIZE) 6653 pagezero((void *)va); 6654 else 6655 bzero((char *)va + off, size); 6656 } 6657 6658 /* 6659 * Copy 1 specified hardware page to another. 6660 */ 6661 void 6662 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 6663 { 6664 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 6665 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 6666 6667 pagecopy((void *)src, (void *)dst); 6668 } 6669 6670 int unmapped_buf_allowed = 1; 6671 6672 void 6673 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 6674 vm_offset_t b_offset, int xfersize) 6675 { 6676 void *a_cp, *b_cp; 6677 vm_page_t pages[2]; 6678 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 6679 int cnt; 6680 boolean_t mapped; 6681 6682 while (xfersize > 0) { 6683 a_pg_offset = a_offset & PAGE_MASK; 6684 pages[0] = ma[a_offset >> PAGE_SHIFT]; 6685 b_pg_offset = b_offset & PAGE_MASK; 6686 pages[1] = mb[b_offset >> PAGE_SHIFT]; 6687 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 6688 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 6689 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 6690 a_cp = (char *)vaddr[0] + a_pg_offset; 6691 b_cp = (char *)vaddr[1] + b_pg_offset; 6692 bcopy(a_cp, b_cp, cnt); 6693 if (__predict_false(mapped)) 6694 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 6695 a_offset += cnt; 6696 b_offset += cnt; 6697 xfersize -= cnt; 6698 } 6699 } 6700 6701 /* 6702 * Returns true if the pmap's pv is one of the first 6703 * 16 pvs linked to from this page. This count may 6704 * be changed upwards or downwards in the future; it 6705 * is only necessary that true be returned for a small 6706 * subset of pmaps for proper page aging. 
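 *
 * Both the page's own pv list and, unless the page is fictitious, the
 * pv list of its containing 2MB page are scanned, and the 16-entry
 * budget is shared between the two lists.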
6707 */ 6708 boolean_t 6709 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 6710 { 6711 struct md_page *pvh; 6712 struct rwlock *lock; 6713 pv_entry_t pv; 6714 int loops = 0; 6715 boolean_t rv; 6716 6717 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6718 ("pmap_page_exists_quick: page %p is not managed", m)); 6719 rv = FALSE; 6720 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6721 rw_rlock(lock); 6722 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6723 if (PV_PMAP(pv) == pmap) { 6724 rv = TRUE; 6725 break; 6726 } 6727 loops++; 6728 if (loops >= 16) 6729 break; 6730 } 6731 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 6732 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6733 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6734 if (PV_PMAP(pv) == pmap) { 6735 rv = TRUE; 6736 break; 6737 } 6738 loops++; 6739 if (loops >= 16) 6740 break; 6741 } 6742 } 6743 rw_runlock(lock); 6744 return (rv); 6745 } 6746 6747 /* 6748 * pmap_page_wired_mappings: 6749 * 6750 * Return the number of managed mappings to the given physical page 6751 * that are wired. 6752 */ 6753 int 6754 pmap_page_wired_mappings(vm_page_t m) 6755 { 6756 struct rwlock *lock; 6757 struct md_page *pvh; 6758 pmap_t pmap; 6759 pt_entry_t *pte; 6760 pv_entry_t pv; 6761 int count, md_gen, pvh_gen; 6762 6763 if ((m->oflags & VPO_UNMANAGED) != 0) 6764 return (0); 6765 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6766 rw_rlock(lock); 6767 restart: 6768 count = 0; 6769 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6770 pmap = PV_PMAP(pv); 6771 if (!PMAP_TRYLOCK(pmap)) { 6772 md_gen = m->md.pv_gen; 6773 rw_runlock(lock); 6774 PMAP_LOCK(pmap); 6775 rw_rlock(lock); 6776 if (md_gen != m->md.pv_gen) { 6777 PMAP_UNLOCK(pmap); 6778 goto restart; 6779 } 6780 } 6781 pte = pmap_pte(pmap, pv->pv_va); 6782 if ((*pte & PG_W) != 0) 6783 count++; 6784 PMAP_UNLOCK(pmap); 6785 } 6786 if ((m->flags & PG_FICTITIOUS) == 0) { 6787 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6788 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6789 pmap = PV_PMAP(pv); 6790 if (!PMAP_TRYLOCK(pmap)) { 6791 md_gen = m->md.pv_gen; 6792 pvh_gen = pvh->pv_gen; 6793 rw_runlock(lock); 6794 PMAP_LOCK(pmap); 6795 rw_rlock(lock); 6796 if (md_gen != m->md.pv_gen || 6797 pvh_gen != pvh->pv_gen) { 6798 PMAP_UNLOCK(pmap); 6799 goto restart; 6800 } 6801 } 6802 pte = pmap_pde(pmap, pv->pv_va); 6803 if ((*pte & PG_W) != 0) 6804 count++; 6805 PMAP_UNLOCK(pmap); 6806 } 6807 } 6808 rw_runlock(lock); 6809 return (count); 6810 } 6811 6812 /* 6813 * Returns TRUE if the given page is mapped individually or as part of 6814 * a 2mpage. Otherwise, returns FALSE. 6815 */ 6816 boolean_t 6817 pmap_page_is_mapped(vm_page_t m) 6818 { 6819 struct rwlock *lock; 6820 boolean_t rv; 6821 6822 if ((m->oflags & VPO_UNMANAGED) != 0) 6823 return (FALSE); 6824 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6825 rw_rlock(lock); 6826 rv = !TAILQ_EMPTY(&m->md.pv_list) || 6827 ((m->flags & PG_FICTITIOUS) == 0 && 6828 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 6829 rw_runlock(lock); 6830 return (rv); 6831 } 6832 6833 /* 6834 * Destroy all managed, non-wired mappings in the given user-space 6835 * pmap. This pmap cannot be active on any processor besides the 6836 * caller. 6837 * 6838 * This function cannot be applied to the kernel pmap. Moreover, it 6839 * is not intended for general use. It is only to be used during 6840 * process termination. Consequently, it can be implemented in ways 6841 * that make it faster than pmap_remove(). 
First, it can more quickly 6842 * destroy mappings by iterating over the pmap's collection of PV 6843 * entries, rather than searching the page table. Second, it doesn't 6844 * have to test and clear the page table entries atomically, because 6845 * no processor is currently accessing the user address space. In 6846 * particular, a page table entry's dirty bit won't change state once 6847 * this function starts. 6848 * 6849 * Although this function destroys all of the pmap's managed, 6850 * non-wired mappings, it can delay and batch the invalidation of TLB 6851 * entries without calling pmap_delayed_invl_start() and 6852 * pmap_delayed_invl_finish(). Because the pmap is not active on 6853 * any other processor, none of these TLB entries will ever be used 6854 * before their eventual invalidation. Consequently, there is no need 6855 * for either pmap_remove_all() or pmap_remove_write() to wait for 6856 * that eventual TLB invalidation. 6857 */ 6858 void 6859 pmap_remove_pages(pmap_t pmap) 6860 { 6861 pd_entry_t ptepde; 6862 pt_entry_t *pte, tpte; 6863 pt_entry_t PG_M, PG_RW, PG_V; 6864 struct spglist free; 6865 vm_page_t m, mpte, mt; 6866 pv_entry_t pv; 6867 struct md_page *pvh; 6868 struct pv_chunk *pc, *npc; 6869 struct rwlock *lock; 6870 int64_t bit; 6871 uint64_t inuse, bitmask; 6872 int allfree, field, freed, idx; 6873 boolean_t superpage; 6874 vm_paddr_t pa; 6875 6876 /* 6877 * Assert that the given pmap is only active on the current 6878 * CPU. Unfortunately, we cannot block another CPU from 6879 * activating the pmap while this function is executing. 6880 */ 6881 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 6882 #ifdef INVARIANTS 6883 { 6884 cpuset_t other_cpus; 6885 6886 other_cpus = all_cpus; 6887 critical_enter(); 6888 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 6889 CPU_AND(&other_cpus, &pmap->pm_active); 6890 critical_exit(); 6891 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 6892 } 6893 #endif 6894 6895 lock = NULL; 6896 PG_M = pmap_modified_bit(pmap); 6897 PG_V = pmap_valid_bit(pmap); 6898 PG_RW = pmap_rw_bit(pmap); 6899 6900 SLIST_INIT(&free); 6901 PMAP_LOCK(pmap); 6902 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 6903 allfree = 1; 6904 freed = 0; 6905 for (field = 0; field < _NPCM; field++) { 6906 inuse = ~pc->pc_map[field] & pc_freemask[field]; 6907 while (inuse != 0) { 6908 bit = bsfq(inuse); 6909 bitmask = 1UL << bit; 6910 idx = field * 64 + bit; 6911 pv = &pc->pc_pventry[idx]; 6912 inuse &= ~bitmask; 6913 6914 pte = pmap_pdpe(pmap, pv->pv_va); 6915 ptepde = *pte; 6916 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 6917 tpte = *pte; 6918 if ((tpte & (PG_PS | PG_V)) == PG_V) { 6919 superpage = FALSE; 6920 ptepde = tpte; 6921 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 6922 PG_FRAME); 6923 pte = &pte[pmap_pte_index(pv->pv_va)]; 6924 tpte = *pte; 6925 } else { 6926 /* 6927 * Keep track whether 'tpte' is a 6928 * superpage explicitly instead of 6929 * relying on PG_PS being set. 6930 * 6931 * This is because PG_PS is numerically 6932 * identical to PG_PTE_PAT and thus a 6933 * regular page could be mistaken for 6934 * a superpage. 
6935 */ 6936 superpage = TRUE; 6937 } 6938 6939 if ((tpte & PG_V) == 0) { 6940 panic("bad pte va %lx pte %lx", 6941 pv->pv_va, tpte); 6942 } 6943 6944 /* 6945 * We cannot remove wired pages from a process' mapping at this time 6946 */ 6947 if (tpte & PG_W) { 6948 allfree = 0; 6949 continue; 6950 } 6951 6952 if (superpage) 6953 pa = tpte & PG_PS_FRAME; 6954 else 6955 pa = tpte & PG_FRAME; 6956 6957 m = PHYS_TO_VM_PAGE(pa); 6958 KASSERT(m->phys_addr == pa, 6959 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 6960 m, (uintmax_t)m->phys_addr, 6961 (uintmax_t)tpte)); 6962 6963 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 6964 m < &vm_page_array[vm_page_array_size], 6965 ("pmap_remove_pages: bad tpte %#jx", 6966 (uintmax_t)tpte)); 6967 6968 pte_clear(pte); 6969 6970 /* 6971 * Update the vm_page_t clean/reference bits. 6972 */ 6973 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6974 if (superpage) { 6975 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6976 vm_page_dirty(mt); 6977 } else 6978 vm_page_dirty(m); 6979 } 6980 6981 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 6982 6983 /* Mark free */ 6984 pc->pc_map[field] |= bitmask; 6985 if (superpage) { 6986 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 6987 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 6988 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6989 pvh->pv_gen++; 6990 if (TAILQ_EMPTY(&pvh->pv_list)) { 6991 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6992 if ((mt->aflags & PGA_WRITEABLE) != 0 && 6993 TAILQ_EMPTY(&mt->md.pv_list)) 6994 vm_page_aflag_clear(mt, PGA_WRITEABLE); 6995 } 6996 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 6997 if (mpte != NULL) { 6998 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 6999 ("pmap_remove_pages: pte page not promoted")); 7000 pmap_resident_count_dec(pmap, 1); 7001 KASSERT(mpte->wire_count == NPTEPG, 7002 ("pmap_remove_pages: pte page wire count error")); 7003 mpte->wire_count = 0; 7004 pmap_add_delayed_free_list(mpte, &free, FALSE); 7005 } 7006 } else { 7007 pmap_resident_count_dec(pmap, 1); 7008 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 7009 m->md.pv_gen++; 7010 if ((m->aflags & PGA_WRITEABLE) != 0 && 7011 TAILQ_EMPTY(&m->md.pv_list) && 7012 (m->flags & PG_FICTITIOUS) == 0) { 7013 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 7014 if (TAILQ_EMPTY(&pvh->pv_list)) 7015 vm_page_aflag_clear(m, PGA_WRITEABLE); 7016 } 7017 } 7018 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 7019 freed++; 7020 } 7021 } 7022 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 7023 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 7024 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 7025 if (allfree) { 7026 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 7027 free_pv_chunk(pc); 7028 } 7029 } 7030 if (lock != NULL) 7031 rw_wunlock(lock); 7032 pmap_invalidate_all(pmap); 7033 pmap_pkru_deassign_all(pmap); 7034 PMAP_UNLOCK(pmap); 7035 vm_page_free_pages_toq(&free, true); 7036 } 7037 7038 static boolean_t 7039 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 7040 { 7041 struct rwlock *lock; 7042 pv_entry_t pv; 7043 struct md_page *pvh; 7044 pt_entry_t *pte, mask; 7045 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7046 pmap_t pmap; 7047 int md_gen, pvh_gen; 7048 boolean_t rv; 7049 7050 rv = FALSE; 7051 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7052 rw_rlock(lock); 7053 restart: 7054 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7055 pmap = PV_PMAP(pv); 7056 if (!PMAP_TRYLOCK(pmap)) { 7057 md_gen = m->md.pv_gen; 7058 rw_runlock(lock); 7059 PMAP_LOCK(pmap); 7060 rw_rlock(lock); 7061 if (md_gen != m->md.pv_gen) { 7062 PMAP_UNLOCK(pmap); 7063 
goto restart; 7064 } 7065 } 7066 pte = pmap_pte(pmap, pv->pv_va); 7067 mask = 0; 7068 if (modified) { 7069 PG_M = pmap_modified_bit(pmap); 7070 PG_RW = pmap_rw_bit(pmap); 7071 mask |= PG_RW | PG_M; 7072 } 7073 if (accessed) { 7074 PG_A = pmap_accessed_bit(pmap); 7075 PG_V = pmap_valid_bit(pmap); 7076 mask |= PG_V | PG_A; 7077 } 7078 rv = (*pte & mask) == mask; 7079 PMAP_UNLOCK(pmap); 7080 if (rv) 7081 goto out; 7082 } 7083 if ((m->flags & PG_FICTITIOUS) == 0) { 7084 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 7085 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 7086 pmap = PV_PMAP(pv); 7087 if (!PMAP_TRYLOCK(pmap)) { 7088 md_gen = m->md.pv_gen; 7089 pvh_gen = pvh->pv_gen; 7090 rw_runlock(lock); 7091 PMAP_LOCK(pmap); 7092 rw_rlock(lock); 7093 if (md_gen != m->md.pv_gen || 7094 pvh_gen != pvh->pv_gen) { 7095 PMAP_UNLOCK(pmap); 7096 goto restart; 7097 } 7098 } 7099 pte = pmap_pde(pmap, pv->pv_va); 7100 mask = 0; 7101 if (modified) { 7102 PG_M = pmap_modified_bit(pmap); 7103 PG_RW = pmap_rw_bit(pmap); 7104 mask |= PG_RW | PG_M; 7105 } 7106 if (accessed) { 7107 PG_A = pmap_accessed_bit(pmap); 7108 PG_V = pmap_valid_bit(pmap); 7109 mask |= PG_V | PG_A; 7110 } 7111 rv = (*pte & mask) == mask; 7112 PMAP_UNLOCK(pmap); 7113 if (rv) 7114 goto out; 7115 } 7116 } 7117 out: 7118 rw_runlock(lock); 7119 return (rv); 7120 } 7121 7122 /* 7123 * pmap_is_modified: 7124 * 7125 * Return whether or not the specified physical page was modified 7126 * in any physical maps. 7127 */ 7128 boolean_t 7129 pmap_is_modified(vm_page_t m) 7130 { 7131 7132 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7133 ("pmap_is_modified: page %p is not managed", m)); 7134 7135 /* 7136 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 7137 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 7138 * is clear, no PTEs can have PG_M set. 7139 */ 7140 VM_OBJECT_ASSERT_WLOCKED(m->object); 7141 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 7142 return (FALSE); 7143 return (pmap_page_test_mappings(m, FALSE, TRUE)); 7144 } 7145 7146 /* 7147 * pmap_is_prefaultable: 7148 * 7149 * Return whether or not the specified virtual address is eligible 7150 * for prefault. 7151 */ 7152 boolean_t 7153 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 7154 { 7155 pd_entry_t *pde; 7156 pt_entry_t *pte, PG_V; 7157 boolean_t rv; 7158 7159 PG_V = pmap_valid_bit(pmap); 7160 rv = FALSE; 7161 PMAP_LOCK(pmap); 7162 pde = pmap_pde(pmap, addr); 7163 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 7164 pte = pmap_pde_to_pte(pde, addr); 7165 rv = (*pte & PG_V) == 0; 7166 } 7167 PMAP_UNLOCK(pmap); 7168 return (rv); 7169 } 7170 7171 /* 7172 * pmap_is_referenced: 7173 * 7174 * Return whether or not the specified physical page was referenced 7175 * in any physical maps. 7176 */ 7177 boolean_t 7178 pmap_is_referenced(vm_page_t m) 7179 { 7180 7181 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7182 ("pmap_is_referenced: page %p is not managed", m)); 7183 return (pmap_page_test_mappings(m, TRUE, FALSE)); 7184 } 7185 7186 /* 7187 * Clear the write and modified bits in each of the given page's mappings. 
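 *
 * Writeable 2MB mappings found on the page's pv-head list are first
 * demoted; the remaining 4KB mappings are then write-protected, with
 * PG_M transferred to the page's dirty state before each TLB
 * invalidation.  Finally, PGA_WRITEABLE is cleared and any delayed
 * invalidation affecting the page is awaited.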
7188 */ 7189 void 7190 pmap_remove_write(vm_page_t m) 7191 { 7192 struct md_page *pvh; 7193 pmap_t pmap; 7194 struct rwlock *lock; 7195 pv_entry_t next_pv, pv; 7196 pd_entry_t *pde; 7197 pt_entry_t oldpte, *pte, PG_M, PG_RW; 7198 vm_offset_t va; 7199 int pvh_gen, md_gen; 7200 7201 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7202 ("pmap_remove_write: page %p is not managed", m)); 7203 7204 /* 7205 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 7206 * set by another thread while the object is locked. Thus, 7207 * if PGA_WRITEABLE is clear, no page table entries need updating. 7208 */ 7209 VM_OBJECT_ASSERT_WLOCKED(m->object); 7210 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 7211 return; 7212 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7213 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 7214 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 7215 retry_pv_loop: 7216 rw_wlock(lock); 7217 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7218 pmap = PV_PMAP(pv); 7219 if (!PMAP_TRYLOCK(pmap)) { 7220 pvh_gen = pvh->pv_gen; 7221 rw_wunlock(lock); 7222 PMAP_LOCK(pmap); 7223 rw_wlock(lock); 7224 if (pvh_gen != pvh->pv_gen) { 7225 PMAP_UNLOCK(pmap); 7226 rw_wunlock(lock); 7227 goto retry_pv_loop; 7228 } 7229 } 7230 PG_RW = pmap_rw_bit(pmap); 7231 va = pv->pv_va; 7232 pde = pmap_pde(pmap, va); 7233 if ((*pde & PG_RW) != 0) 7234 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 7235 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 7236 ("inconsistent pv lock %p %p for page %p", 7237 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 7238 PMAP_UNLOCK(pmap); 7239 } 7240 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7241 pmap = PV_PMAP(pv); 7242 if (!PMAP_TRYLOCK(pmap)) { 7243 pvh_gen = pvh->pv_gen; 7244 md_gen = m->md.pv_gen; 7245 rw_wunlock(lock); 7246 PMAP_LOCK(pmap); 7247 rw_wlock(lock); 7248 if (pvh_gen != pvh->pv_gen || 7249 md_gen != m->md.pv_gen) { 7250 PMAP_UNLOCK(pmap); 7251 rw_wunlock(lock); 7252 goto retry_pv_loop; 7253 } 7254 } 7255 PG_M = pmap_modified_bit(pmap); 7256 PG_RW = pmap_rw_bit(pmap); 7257 pde = pmap_pde(pmap, pv->pv_va); 7258 KASSERT((*pde & PG_PS) == 0, 7259 ("pmap_remove_write: found a 2mpage in page %p's pv list", 7260 m)); 7261 pte = pmap_pde_to_pte(pde, pv->pv_va); 7262 retry: 7263 oldpte = *pte; 7264 if (oldpte & PG_RW) { 7265 if (!atomic_cmpset_long(pte, oldpte, oldpte & 7266 ~(PG_RW | PG_M))) 7267 goto retry; 7268 if ((oldpte & PG_M) != 0) 7269 vm_page_dirty(m); 7270 pmap_invalidate_page(pmap, pv->pv_va); 7271 } 7272 PMAP_UNLOCK(pmap); 7273 } 7274 rw_wunlock(lock); 7275 vm_page_aflag_clear(m, PGA_WRITEABLE); 7276 pmap_delayed_invl_wait(m); 7277 } 7278 7279 static __inline boolean_t 7280 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 7281 { 7282 7283 if (!pmap_emulate_ad_bits(pmap)) 7284 return (TRUE); 7285 7286 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 7287 7288 /* 7289 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration 7290 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 7291 * if the EPT_PG_WRITE bit is set. 7292 */ 7293 if ((pte & EPT_PG_WRITE) != 0) 7294 return (FALSE); 7295 7296 /* 7297 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 7298 */ 7299 if ((pte & EPT_PG_EXECUTE) == 0 || 7300 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 7301 return (TRUE); 7302 else 7303 return (FALSE); 7304 } 7305 7306 /* 7307 * pmap_ts_referenced: 7308 * 7309 * Return a count of reference bits for a page, clearing those bits. 
7310 * It is not necessary for every reference bit to be cleared, but it 7311 * is necessary that 0 only be returned when there are truly no 7312 * reference bits set. 7313 * 7314 * As an optimization, update the page's dirty field if a modified bit is 7315 * found while counting reference bits. This opportunistic update can be 7316 * performed at low cost and can eliminate the need for some future calls 7317 * to pmap_is_modified(). However, since this function stops after 7318 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 7319 * dirty pages. Those dirty pages will only be detected by a future call 7320 * to pmap_is_modified(). 7321 * 7322 * A DI block is not needed within this function, because 7323 * invalidations are performed before the PV list lock is 7324 * released. 7325 */ 7326 int 7327 pmap_ts_referenced(vm_page_t m) 7328 { 7329 struct md_page *pvh; 7330 pv_entry_t pv, pvf; 7331 pmap_t pmap; 7332 struct rwlock *lock; 7333 pd_entry_t oldpde, *pde; 7334 pt_entry_t *pte, PG_A, PG_M, PG_RW; 7335 vm_offset_t va; 7336 vm_paddr_t pa; 7337 int cleared, md_gen, not_cleared, pvh_gen; 7338 struct spglist free; 7339 boolean_t demoted; 7340 7341 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7342 ("pmap_ts_referenced: page %p is not managed", m)); 7343 SLIST_INIT(&free); 7344 cleared = 0; 7345 pa = VM_PAGE_TO_PHYS(m); 7346 lock = PHYS_TO_PV_LIST_LOCK(pa); 7347 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 7348 rw_wlock(lock); 7349 retry: 7350 not_cleared = 0; 7351 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 7352 goto small_mappings; 7353 pv = pvf; 7354 do { 7355 if (pvf == NULL) 7356 pvf = pv; 7357 pmap = PV_PMAP(pv); 7358 if (!PMAP_TRYLOCK(pmap)) { 7359 pvh_gen = pvh->pv_gen; 7360 rw_wunlock(lock); 7361 PMAP_LOCK(pmap); 7362 rw_wlock(lock); 7363 if (pvh_gen != pvh->pv_gen) { 7364 PMAP_UNLOCK(pmap); 7365 goto retry; 7366 } 7367 } 7368 PG_A = pmap_accessed_bit(pmap); 7369 PG_M = pmap_modified_bit(pmap); 7370 PG_RW = pmap_rw_bit(pmap); 7371 va = pv->pv_va; 7372 pde = pmap_pde(pmap, pv->pv_va); 7373 oldpde = *pde; 7374 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 7375 /* 7376 * Although "oldpde" is mapping a 2MB page, because 7377 * this function is called at a 4KB page granularity, 7378 * we only update the 4KB page under test. 7379 */ 7380 vm_page_dirty(m); 7381 } 7382 if ((oldpde & PG_A) != 0) { 7383 /* 7384 * Since this reference bit is shared by 512 4KB 7385 * pages, it should not be cleared every time it is 7386 * tested. Apply a simple "hash" function on the 7387 * physical page number, the virtual superpage number, 7388 * and the pmap address to select one 4KB page out of 7389 * the 512 on which testing the reference bit will 7390 * result in clearing that reference bit. This 7391 * function is designed to avoid the selection of the 7392 * same 4KB page for every 2MB page mapping. 7393 * 7394 * On demotion, a mapping that hasn't been referenced 7395 * is simply destroyed. To avoid the possibility of a 7396 * subsequent page fault on a demoted wired mapping, 7397 * always leave its reference bit set. Moreover, 7398 * since the superpage is wired, the current state of 7399 * its reference bit won't affect page replacement. 
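 *
 * In other words, for a given 2MB mapping exactly one of its 512 4KB
 * pages satisfies the test below, and only a call for that page (and
 * only if the mapping is not wired) clears the PDE's PG_A or demotes
 * the mapping.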
7400 */ 7401 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 7402 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 7403 (oldpde & PG_W) == 0) { 7404 if (safe_to_clear_referenced(pmap, oldpde)) { 7405 atomic_clear_long(pde, PG_A); 7406 pmap_invalidate_page(pmap, pv->pv_va); 7407 demoted = FALSE; 7408 } else if (pmap_demote_pde_locked(pmap, pde, 7409 pv->pv_va, &lock)) { 7410 /* 7411 * Remove the mapping to a single page 7412 * so that a subsequent access may 7413 * repromote. Since the underlying 7414 * page table page is fully populated, 7415 * this removal never frees a page 7416 * table page. 7417 */ 7418 demoted = TRUE; 7419 va += VM_PAGE_TO_PHYS(m) - (oldpde & 7420 PG_PS_FRAME); 7421 pte = pmap_pde_to_pte(pde, va); 7422 pmap_remove_pte(pmap, pte, va, *pde, 7423 NULL, &lock); 7424 pmap_invalidate_page(pmap, va); 7425 } else 7426 demoted = TRUE; 7427 7428 if (demoted) { 7429 /* 7430 * The superpage mapping was removed 7431 * entirely and therefore 'pv' is no 7432 * longer valid. 7433 */ 7434 if (pvf == pv) 7435 pvf = NULL; 7436 pv = NULL; 7437 } 7438 cleared++; 7439 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 7440 ("inconsistent pv lock %p %p for page %p", 7441 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 7442 } else 7443 not_cleared++; 7444 } 7445 PMAP_UNLOCK(pmap); 7446 /* Rotate the PV list if it has more than one entry. */ 7447 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 7448 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 7449 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 7450 pvh->pv_gen++; 7451 } 7452 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 7453 goto out; 7454 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 7455 small_mappings: 7456 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 7457 goto out; 7458 pv = pvf; 7459 do { 7460 if (pvf == NULL) 7461 pvf = pv; 7462 pmap = PV_PMAP(pv); 7463 if (!PMAP_TRYLOCK(pmap)) { 7464 pvh_gen = pvh->pv_gen; 7465 md_gen = m->md.pv_gen; 7466 rw_wunlock(lock); 7467 PMAP_LOCK(pmap); 7468 rw_wlock(lock); 7469 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7470 PMAP_UNLOCK(pmap); 7471 goto retry; 7472 } 7473 } 7474 PG_A = pmap_accessed_bit(pmap); 7475 PG_M = pmap_modified_bit(pmap); 7476 PG_RW = pmap_rw_bit(pmap); 7477 pde = pmap_pde(pmap, pv->pv_va); 7478 KASSERT((*pde & PG_PS) == 0, 7479 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 7480 m)); 7481 pte = pmap_pde_to_pte(pde, pv->pv_va); 7482 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7483 vm_page_dirty(m); 7484 if ((*pte & PG_A) != 0) { 7485 if (safe_to_clear_referenced(pmap, *pte)) { 7486 atomic_clear_long(pte, PG_A); 7487 pmap_invalidate_page(pmap, pv->pv_va); 7488 cleared++; 7489 } else if ((*pte & PG_W) == 0) { 7490 /* 7491 * Wired pages cannot be paged out so 7492 * doing accessed bit emulation for 7493 * them is wasted effort. We do the 7494 * hard work for unwired pages only. 7495 */ 7496 pmap_remove_pte(pmap, pte, pv->pv_va, 7497 *pde, &free, &lock); 7498 pmap_invalidate_page(pmap, pv->pv_va); 7499 cleared++; 7500 if (pvf == pv) 7501 pvf = NULL; 7502 pv = NULL; 7503 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 7504 ("inconsistent pv lock %p %p for page %p", 7505 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 7506 } else 7507 not_cleared++; 7508 } 7509 PMAP_UNLOCK(pmap); 7510 /* Rotate the PV list if it has more than one entry. 
*/ 7511 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 7512 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 7513 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7514 m->md.pv_gen++; 7515 } 7516 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 7517 not_cleared < PMAP_TS_REFERENCED_MAX); 7518 out: 7519 rw_wunlock(lock); 7520 vm_page_free_pages_toq(&free, true); 7521 return (cleared + not_cleared); 7522 } 7523 7524 /* 7525 * Apply the given advice to the specified range of addresses within the 7526 * given pmap. Depending on the advice, clear the referenced and/or 7527 * modified flags in each mapping and set the mapped page's dirty field. 7528 */ 7529 void 7530 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 7531 { 7532 struct rwlock *lock; 7533 pml4_entry_t *pml4e; 7534 pdp_entry_t *pdpe; 7535 pd_entry_t oldpde, *pde; 7536 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 7537 vm_offset_t va, va_next; 7538 vm_page_t m; 7539 bool anychanged; 7540 7541 if (advice != MADV_DONTNEED && advice != MADV_FREE) 7542 return; 7543 7544 /* 7545 * A/D bit emulation requires an alternate code path when clearing 7546 * the modified and accessed bits below. Since this function is 7547 * advisory in nature we skip it entirely for pmaps that require 7548 * A/D bit emulation. 7549 */ 7550 if (pmap_emulate_ad_bits(pmap)) 7551 return; 7552 7553 PG_A = pmap_accessed_bit(pmap); 7554 PG_G = pmap_global_bit(pmap); 7555 PG_M = pmap_modified_bit(pmap); 7556 PG_V = pmap_valid_bit(pmap); 7557 PG_RW = pmap_rw_bit(pmap); 7558 anychanged = false; 7559 pmap_delayed_invl_start(); 7560 PMAP_LOCK(pmap); 7561 for (; sva < eva; sva = va_next) { 7562 pml4e = pmap_pml4e(pmap, sva); 7563 if ((*pml4e & PG_V) == 0) { 7564 va_next = (sva + NBPML4) & ~PML4MASK; 7565 if (va_next < sva) 7566 va_next = eva; 7567 continue; 7568 } 7569 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 7570 if ((*pdpe & PG_V) == 0) { 7571 va_next = (sva + NBPDP) & ~PDPMASK; 7572 if (va_next < sva) 7573 va_next = eva; 7574 continue; 7575 } 7576 va_next = (sva + NBPDR) & ~PDRMASK; 7577 if (va_next < sva) 7578 va_next = eva; 7579 pde = pmap_pdpe_to_pde(pdpe, sva); 7580 oldpde = *pde; 7581 if ((oldpde & PG_V) == 0) 7582 continue; 7583 else if ((oldpde & PG_PS) != 0) { 7584 if ((oldpde & PG_MANAGED) == 0) 7585 continue; 7586 lock = NULL; 7587 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 7588 if (lock != NULL) 7589 rw_wunlock(lock); 7590 7591 /* 7592 * The large page mapping was destroyed. 7593 */ 7594 continue; 7595 } 7596 7597 /* 7598 * Unless the page mappings are wired, remove the 7599 * mapping to a single page so that a subsequent 7600 * access may repromote. Choosing the last page 7601 * within the address range [sva, min(va_next, eva)) 7602 * generally results in more repromotions. Since the 7603 * underlying page table page is fully populated, this 7604 * removal never frees a page table page. 
7605 */ 7606 if ((oldpde & PG_W) == 0) { 7607 va = eva; 7608 if (va > va_next) 7609 va = va_next; 7610 va -= PAGE_SIZE; 7611 KASSERT(va >= sva, 7612 ("pmap_advise: no address gap")); 7613 pte = pmap_pde_to_pte(pde, va); 7614 KASSERT((*pte & PG_V) != 0, 7615 ("pmap_advise: invalid PTE")); 7616 pmap_remove_pte(pmap, pte, va, *pde, NULL, 7617 &lock); 7618 anychanged = true; 7619 } 7620 if (lock != NULL) 7621 rw_wunlock(lock); 7622 } 7623 if (va_next > eva) 7624 va_next = eva; 7625 va = va_next; 7626 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 7627 sva += PAGE_SIZE) { 7628 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 7629 goto maybe_invlrng; 7630 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 7631 if (advice == MADV_DONTNEED) { 7632 /* 7633 * Future calls to pmap_is_modified() 7634 * can be avoided by making the page 7635 * dirty now. 7636 */ 7637 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 7638 vm_page_dirty(m); 7639 } 7640 atomic_clear_long(pte, PG_M | PG_A); 7641 } else if ((*pte & PG_A) != 0) 7642 atomic_clear_long(pte, PG_A); 7643 else 7644 goto maybe_invlrng; 7645 7646 if ((*pte & PG_G) != 0) { 7647 if (va == va_next) 7648 va = sva; 7649 } else 7650 anychanged = true; 7651 continue; 7652 maybe_invlrng: 7653 if (va != va_next) { 7654 pmap_invalidate_range(pmap, va, sva); 7655 va = va_next; 7656 } 7657 } 7658 if (va != va_next) 7659 pmap_invalidate_range(pmap, va, sva); 7660 } 7661 if (anychanged) 7662 pmap_invalidate_all(pmap); 7663 PMAP_UNLOCK(pmap); 7664 pmap_delayed_invl_finish(); 7665 } 7666 7667 /* 7668 * Clear the modify bits on the specified physical page. 7669 */ 7670 void 7671 pmap_clear_modify(vm_page_t m) 7672 { 7673 struct md_page *pvh; 7674 pmap_t pmap; 7675 pv_entry_t next_pv, pv; 7676 pd_entry_t oldpde, *pde; 7677 pt_entry_t *pte, PG_M, PG_RW; 7678 struct rwlock *lock; 7679 vm_offset_t va; 7680 int md_gen, pvh_gen; 7681 7682 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7683 ("pmap_clear_modify: page %p is not managed", m)); 7684 VM_OBJECT_ASSERT_WLOCKED(m->object); 7685 KASSERT(!vm_page_xbusied(m), 7686 ("pmap_clear_modify: page %p is exclusive busied", m)); 7687 7688 /* 7689 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 7690 * If the object containing the page is locked and the page is not 7691 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 7692 */ 7693 if ((m->aflags & PGA_WRITEABLE) == 0) 7694 return; 7695 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 7696 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 7697 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7698 rw_wlock(lock); 7699 restart: 7700 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7701 pmap = PV_PMAP(pv); 7702 if (!PMAP_TRYLOCK(pmap)) { 7703 pvh_gen = pvh->pv_gen; 7704 rw_wunlock(lock); 7705 PMAP_LOCK(pmap); 7706 rw_wlock(lock); 7707 if (pvh_gen != pvh->pv_gen) { 7708 PMAP_UNLOCK(pmap); 7709 goto restart; 7710 } 7711 } 7712 PG_M = pmap_modified_bit(pmap); 7713 PG_RW = pmap_rw_bit(pmap); 7714 va = pv->pv_va; 7715 pde = pmap_pde(pmap, va); 7716 oldpde = *pde; 7717 /* If oldpde has PG_RW set, then it also has PG_M set. */ 7718 if ((oldpde & PG_RW) != 0 && 7719 pmap_demote_pde_locked(pmap, pde, va, &lock) && 7720 (oldpde & PG_W) == 0) { 7721 /* 7722 * Write protect the mapping to a single page so that 7723 * a subsequent write access may repromote. 
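* Only the 4KB mapping of the page 'm' is handled here; the other mappings created by the demotion keep the PG_M and PG_RW settings inherited from the old 2MB mapping.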
7724 */ 7725 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 7726 pte = pmap_pde_to_pte(pde, va); 7727 atomic_clear_long(pte, PG_M | PG_RW); 7728 vm_page_dirty(m); 7729 pmap_invalidate_page(pmap, va); 7730 } 7731 PMAP_UNLOCK(pmap); 7732 } 7733 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7734 pmap = PV_PMAP(pv); 7735 if (!PMAP_TRYLOCK(pmap)) { 7736 md_gen = m->md.pv_gen; 7737 pvh_gen = pvh->pv_gen; 7738 rw_wunlock(lock); 7739 PMAP_LOCK(pmap); 7740 rw_wlock(lock); 7741 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7742 PMAP_UNLOCK(pmap); 7743 goto restart; 7744 } 7745 } 7746 PG_M = pmap_modified_bit(pmap); 7747 PG_RW = pmap_rw_bit(pmap); 7748 pde = pmap_pde(pmap, pv->pv_va); 7749 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 7750 " a 2mpage in page %p's pv list", m)); 7751 pte = pmap_pde_to_pte(pde, pv->pv_va); 7752 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 7753 atomic_clear_long(pte, PG_M); 7754 pmap_invalidate_page(pmap, pv->pv_va); 7755 } 7756 PMAP_UNLOCK(pmap); 7757 } 7758 rw_wunlock(lock); 7759 } 7760 7761 /* 7762 * Miscellaneous support routines follow 7763 */ 7764 7765 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 7766 static __inline void 7767 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 7768 { 7769 u_int opte, npte; 7770 7771 /* 7772 * The cache mode bits are all in the low 32-bits of the 7773 * PTE, so we can just spin on updating the low 32-bits. 7774 */ 7775 do { 7776 opte = *(u_int *)pte; 7777 npte = opte & ~mask; 7778 npte |= cache_bits; 7779 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 7780 } 7781 7782 /* Adjust the cache mode for a 2MB page mapped via a PDE. */ 7783 static __inline void 7784 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 7785 { 7786 u_int opde, npde; 7787 7788 /* 7789 * The cache mode bits are all in the low 32-bits of the 7790 * PDE, so we can just spin on updating the low 32-bits. 7791 */ 7792 do { 7793 opde = *(u_int *)pde; 7794 npde = opde & ~mask; 7795 npde |= cache_bits; 7796 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 7797 } 7798 7799 /* 7800 * Map a set of physical memory pages into the kernel virtual 7801 * address space. Return a pointer to where it is mapped. This 7802 * routine is intended to be used for mapping device memory, 7803 * NOT real memory. 7804 */ 7805 static void * 7806 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 7807 { 7808 struct pmap_preinit_mapping *ppim; 7809 vm_offset_t va, offset; 7810 vm_size_t tmpsize; 7811 int i; 7812 7813 offset = pa & PAGE_MASK; 7814 size = round_page(offset + size); 7815 pa = trunc_page(pa); 7816 7817 if (!pmap_initialized) { 7818 va = 0; 7819 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7820 ppim = pmap_preinit_mapping + i; 7821 if (ppim->va == 0) { 7822 ppim->pa = pa; 7823 ppim->sz = size; 7824 ppim->mode = mode; 7825 ppim->va = virtual_avail; 7826 virtual_avail += size; 7827 va = ppim->va; 7828 break; 7829 } 7830 } 7831 if (va == 0) 7832 panic("%s: too many preinit mappings", __func__); 7833 } else { 7834 /* 7835 * If we have a preinit mapping, re-use it. 7836 */ 7837 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7838 ppim = pmap_preinit_mapping + i; 7839 if (ppim->pa == pa && ppim->sz == size && 7840 (ppim->mode == mode || 7841 (flags & MAPDEV_SETATTR) == 0)) 7842 return ((void *)(ppim->va + offset)); 7843 } 7844 /* 7845 * If the specified range of physical addresses fits within 7846 * the direct map window, use the direct map. 
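* Reusing the direct map avoids allocating kernel virtual address space and installing per-page mappings; at most the memory attributes of the existing mapping need to be changed.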
7847 */ 7848 if (pa < dmaplimit && pa + size <= dmaplimit) { 7849 va = PHYS_TO_DMAP(pa); 7850 if ((flags & MAPDEV_SETATTR) != 0) { 7851 PMAP_LOCK(kernel_pmap); 7852 i = pmap_change_attr_locked(va, size, mode, flags); 7853 PMAP_UNLOCK(kernel_pmap); 7854 } else 7855 i = 0; 7856 if (!i) 7857 return ((void *)(va + offset)); 7858 } 7859 va = kva_alloc(size); 7860 if (va == 0) 7861 panic("%s: Couldn't allocate KVA", __func__); 7862 } 7863 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 7864 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 7865 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 7866 if ((flags & MAPDEV_FLUSHCACHE) != 0) 7867 pmap_invalidate_cache_range(va, va + tmpsize); 7868 return ((void *)(va + offset)); 7869 } 7870 7871 void * 7872 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 7873 { 7874 7875 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 7876 MAPDEV_SETATTR)); 7877 } 7878 7879 void * 7880 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 7881 { 7882 7883 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 7884 } 7885 7886 void * 7887 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 7888 { 7889 7890 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 7891 MAPDEV_SETATTR)); 7892 } 7893 7894 void * 7895 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 7896 { 7897 7898 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 7899 MAPDEV_FLUSHCACHE)); 7900 } 7901 7902 void 7903 pmap_unmapdev(vm_offset_t va, vm_size_t size) 7904 { 7905 struct pmap_preinit_mapping *ppim; 7906 vm_offset_t offset; 7907 int i; 7908 7909 /* If we gave a direct map region in pmap_mapdev, do nothing */ 7910 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 7911 return; 7912 offset = va & PAGE_MASK; 7913 size = round_page(offset + size); 7914 va = trunc_page(va); 7915 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7916 ppim = pmap_preinit_mapping + i; 7917 if (ppim->va == va && ppim->sz == size) { 7918 if (pmap_initialized) 7919 return; 7920 ppim->pa = 0; 7921 ppim->va = 0; 7922 ppim->sz = 0; 7923 ppim->mode = 0; 7924 if (va + size == virtual_avail) 7925 virtual_avail = va; 7926 return; 7927 } 7928 } 7929 if (pmap_initialized) 7930 kva_free(va, size); 7931 } 7932 7933 /* 7934 * Tries to demote a 1GB page mapping. 
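* The 1GB mapping is replaced by 512 2MB mappings stored in a newly allocated page directory page.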
7935 */ 7936 static boolean_t 7937 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 7938 { 7939 pdp_entry_t newpdpe, oldpdpe; 7940 pd_entry_t *firstpde, newpde, *pde; 7941 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7942 vm_paddr_t pdpgpa; 7943 vm_page_t pdpg; 7944 7945 PG_A = pmap_accessed_bit(pmap); 7946 PG_M = pmap_modified_bit(pmap); 7947 PG_V = pmap_valid_bit(pmap); 7948 PG_RW = pmap_rw_bit(pmap); 7949 7950 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7951 oldpdpe = *pdpe; 7952 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 7953 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 7954 if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 7955 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 7956 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 7957 " in pmap %p", va, pmap); 7958 return (FALSE); 7959 } 7960 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 7961 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 7962 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 7963 KASSERT((oldpdpe & PG_A) != 0, 7964 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 7965 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 7966 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 7967 newpde = oldpdpe; 7968 7969 /* 7970 * Initialize the page directory page. 7971 */ 7972 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 7973 *pde = newpde; 7974 newpde += NBPDR; 7975 } 7976 7977 /* 7978 * Demote the mapping. 7979 */ 7980 *pdpe = newpdpe; 7981 7982 /* 7983 * Invalidate a stale recursive mapping of the page directory page. 7984 */ 7985 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 7986 7987 pmap_pdpe_demotions++; 7988 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 7989 " in pmap %p", va, pmap); 7990 return (TRUE); 7991 } 7992 7993 /* 7994 * Sets the memory attribute for the specified page. 7995 */ 7996 void 7997 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 7998 { 7999 8000 m->md.pat_mode = ma; 8001 8002 /* 8003 * If "m" is a normal page, update its direct mapping. This update 8004 * can be relied upon to perform any cache operations that are 8005 * required for data coherence. 8006 */ 8007 if ((m->flags & PG_FICTITIOUS) == 0 && 8008 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 8009 m->md.pat_mode)) 8010 panic("memory attribute change on the direct map failed"); 8011 } 8012 8013 /* 8014 * Changes the specified virtual address range's memory type to that given by 8015 * the parameter "mode". The specified virtual address range must be 8016 * completely contained within either the direct map or the kernel map. If 8017 * the virtual address range is contained within the kernel map, then the 8018 * memory type for each of the corresponding ranges of the direct map is also 8019 * changed. (The corresponding ranges of the direct map are those ranges that 8020 * map the same physical pages as the specified virtual address range.) These 8021 * changes to the direct map are necessary because Intel describes the 8022 * behavior of their processors as "undefined" if two or more mappings to the 8023 * same physical page have different memory types. 8024 * 8025 * Returns zero if the change completed successfully, and either EINVAL or 8026 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 8027 * of the virtual address range was not mapped, and ENOMEM is returned if 8028 * there was insufficient memory available to complete the change. 
In the 8029 * latter case, the memory type may have been changed on some part of the 8030 * virtual address range or the direct map. 8031 */ 8032 int 8033 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 8034 { 8035 int error; 8036 8037 PMAP_LOCK(kernel_pmap); 8038 error = pmap_change_attr_locked(va, size, mode, MAPDEV_FLUSHCACHE); 8039 PMAP_UNLOCK(kernel_pmap); 8040 return (error); 8041 } 8042 8043 static int 8044 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags) 8045 { 8046 vm_offset_t base, offset, tmpva; 8047 vm_paddr_t pa_start, pa_end, pa_end1; 8048 pdp_entry_t *pdpe; 8049 pd_entry_t *pde; 8050 pt_entry_t *pte; 8051 int cache_bits_pte, cache_bits_pde, error; 8052 boolean_t changed; 8053 8054 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 8055 base = trunc_page(va); 8056 offset = va & PAGE_MASK; 8057 size = round_page(offset + size); 8058 8059 /* 8060 * Only supported on kernel virtual addresses, including the direct 8061 * map but excluding the recursive map. 8062 */ 8063 if (base < DMAP_MIN_ADDRESS) 8064 return (EINVAL); 8065 8066 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 8067 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 8068 changed = FALSE; 8069 8070 /* 8071 * Pages that aren't mapped aren't supported. Also break down 2MB pages 8072 * into 4KB pages if required. 8073 */ 8074 for (tmpva = base; tmpva < base + size; ) { 8075 pdpe = pmap_pdpe(kernel_pmap, tmpva); 8076 if (pdpe == NULL || *pdpe == 0) 8077 return (EINVAL); 8078 if (*pdpe & PG_PS) { 8079 /* 8080 * If the current 1GB page already has the required 8081 * memory type, then we need not demote this page. Just 8082 * increment tmpva to the next 1GB page frame. 8083 */ 8084 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 8085 tmpva = trunc_1gpage(tmpva) + NBPDP; 8086 continue; 8087 } 8088 8089 /* 8090 * If the current offset aligns with a 1GB page frame 8091 * and there is at least 1GB left within the range, then 8092 * we need not break down this page into 2MB pages. 8093 */ 8094 if ((tmpva & PDPMASK) == 0 && 8095 tmpva + PDPMASK < base + size) { 8096 tmpva += NBPDP; 8097 continue; 8098 } 8099 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 8100 return (ENOMEM); 8101 } 8102 pde = pmap_pdpe_to_pde(pdpe, tmpva); 8103 if (*pde == 0) 8104 return (EINVAL); 8105 if (*pde & PG_PS) { 8106 /* 8107 * If the current 2MB page already has the required 8108 * memory type, then we need not demote this page. Just 8109 * increment tmpva to the next 2MB page frame. 8110 */ 8111 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 8112 tmpva = trunc_2mpage(tmpva) + NBPDR; 8113 continue; 8114 } 8115 8116 /* 8117 * If the current offset aligns with a 2MB page frame 8118 * and there is at least 2MB left within the range, then 8119 * we need not break down this page into 4KB pages. 8120 */ 8121 if ((tmpva & PDRMASK) == 0 && 8122 tmpva + PDRMASK < base + size) { 8123 tmpva += NBPDR; 8124 continue; 8125 } 8126 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 8127 return (ENOMEM); 8128 } 8129 pte = pmap_pde_to_pte(pde, tmpva); 8130 if (*pte == 0) 8131 return (EINVAL); 8132 tmpva += PAGE_SIZE; 8133 } 8134 error = 0; 8135 8136 /* 8137 * Ok, all the pages exist, so run through them updating their 8138 * cache mode if required. 
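* For addresses in the kernel map, physically contiguous runs below dmaplimit are accumulated in [pa_start, pa_end) so that the corresponding direct map aliases can be updated with one recursive call per run.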
8139 */ 8140 pa_start = pa_end = 0; 8141 for (tmpva = base; tmpva < base + size; ) { 8142 pdpe = pmap_pdpe(kernel_pmap, tmpva); 8143 if (*pdpe & PG_PS) { 8144 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 8145 pmap_pde_attr(pdpe, cache_bits_pde, 8146 X86_PG_PDE_CACHE); 8147 changed = TRUE; 8148 } 8149 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 8150 (*pdpe & PG_PS_FRAME) < dmaplimit) { 8151 if (pa_start == pa_end) { 8152 /* Start physical address run. */ 8153 pa_start = *pdpe & PG_PS_FRAME; 8154 pa_end = pa_start + NBPDP; 8155 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 8156 pa_end += NBPDP; 8157 else { 8158 /* Run ended, update direct map. */ 8159 error = pmap_change_attr_locked( 8160 PHYS_TO_DMAP(pa_start), 8161 pa_end - pa_start, mode, flags); 8162 if (error != 0) 8163 break; 8164 /* Start physical address run. */ 8165 pa_start = *pdpe & PG_PS_FRAME; 8166 pa_end = pa_start + NBPDP; 8167 } 8168 } 8169 tmpva = trunc_1gpage(tmpva) + NBPDP; 8170 continue; 8171 } 8172 pde = pmap_pdpe_to_pde(pdpe, tmpva); 8173 if (*pde & PG_PS) { 8174 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 8175 pmap_pde_attr(pde, cache_bits_pde, 8176 X86_PG_PDE_CACHE); 8177 changed = TRUE; 8178 } 8179 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 8180 (*pde & PG_PS_FRAME) < dmaplimit) { 8181 if (pa_start == pa_end) { 8182 /* Start physical address run. */ 8183 pa_start = *pde & PG_PS_FRAME; 8184 pa_end = pa_start + NBPDR; 8185 } else if (pa_end == (*pde & PG_PS_FRAME)) 8186 pa_end += NBPDR; 8187 else { 8188 /* Run ended, update direct map. */ 8189 error = pmap_change_attr_locked( 8190 PHYS_TO_DMAP(pa_start), 8191 pa_end - pa_start, mode, flags); 8192 if (error != 0) 8193 break; 8194 /* Start physical address run. */ 8195 pa_start = *pde & PG_PS_FRAME; 8196 pa_end = pa_start + NBPDR; 8197 } 8198 } 8199 tmpva = trunc_2mpage(tmpva) + NBPDR; 8200 } else { 8201 pte = pmap_pde_to_pte(pde, tmpva); 8202 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 8203 pmap_pte_attr(pte, cache_bits_pte, 8204 X86_PG_PTE_CACHE); 8205 changed = TRUE; 8206 } 8207 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 8208 (*pte & PG_FRAME) < dmaplimit) { 8209 if (pa_start == pa_end) { 8210 /* Start physical address run. */ 8211 pa_start = *pte & PG_FRAME; 8212 pa_end = pa_start + PAGE_SIZE; 8213 } else if (pa_end == (*pte & PG_FRAME)) 8214 pa_end += PAGE_SIZE; 8215 else { 8216 /* Run ended, update direct map. */ 8217 error = pmap_change_attr_locked( 8218 PHYS_TO_DMAP(pa_start), 8219 pa_end - pa_start, mode, flags); 8220 if (error != 0) 8221 break; 8222 /* Start physical address run. */ 8223 pa_start = *pte & PG_FRAME; 8224 pa_end = pa_start + PAGE_SIZE; 8225 } 8226 } 8227 tmpva += PAGE_SIZE; 8228 } 8229 } 8230 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 8231 pa_end1 = MIN(pa_end, dmaplimit); 8232 if (pa_start != pa_end1) 8233 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 8234 pa_end1 - pa_start, mode, flags); 8235 } 8236 8237 /* 8238 * Flush CPU caches if required to make sure any data isn't cached that 8239 * shouldn't be, etc. 8240 */ 8241 if (changed) { 8242 pmap_invalidate_range(kernel_pmap, base, tmpva); 8243 if ((flags & MAPDEV_FLUSHCACHE) != 0) 8244 pmap_invalidate_cache_range(base, tmpva); 8245 } 8246 return (error); 8247 } 8248 8249 /* 8250 * Demotes any mapping within the direct map region that covers more than the 8251 * specified range of physical addresses. This range's size must be a power 8252 * of two and its starting address must be a multiple of its size. 
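* With this alignment, a range smaller than 1GB lies within a single 1GB mapping and a range smaller than 2MB lies within a single 2MB mapping, so at most one PDPE and one PDE are demoted.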
Since the 8253 * demotion does not change any attributes of the mapping, a TLB invalidation 8254 * is not mandatory. The caller may, however, request a TLB invalidation. 8255 */ 8256 void 8257 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 8258 { 8259 pdp_entry_t *pdpe; 8260 pd_entry_t *pde; 8261 vm_offset_t va; 8262 boolean_t changed; 8263 8264 if (len == 0) 8265 return; 8266 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 8267 KASSERT((base & (len - 1)) == 0, 8268 ("pmap_demote_DMAP: base is not a multiple of len")); 8269 if (len < NBPDP && base < dmaplimit) { 8270 va = PHYS_TO_DMAP(base); 8271 changed = FALSE; 8272 PMAP_LOCK(kernel_pmap); 8273 pdpe = pmap_pdpe(kernel_pmap, va); 8274 if ((*pdpe & X86_PG_V) == 0) 8275 panic("pmap_demote_DMAP: invalid PDPE"); 8276 if ((*pdpe & PG_PS) != 0) { 8277 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 8278 panic("pmap_demote_DMAP: PDPE failed"); 8279 changed = TRUE; 8280 } 8281 if (len < NBPDR) { 8282 pde = pmap_pdpe_to_pde(pdpe, va); 8283 if ((*pde & X86_PG_V) == 0) 8284 panic("pmap_demote_DMAP: invalid PDE"); 8285 if ((*pde & PG_PS) != 0) { 8286 if (!pmap_demote_pde(kernel_pmap, pde, va)) 8287 panic("pmap_demote_DMAP: PDE failed"); 8288 changed = TRUE; 8289 } 8290 } 8291 if (changed && invalidate) 8292 pmap_invalidate_page(kernel_pmap, va); 8293 PMAP_UNLOCK(kernel_pmap); 8294 } 8295 } 8296 8297 /* 8298 * perform the pmap work for mincore 8299 */ 8300 int 8301 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 8302 { 8303 pd_entry_t *pdep; 8304 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 8305 vm_paddr_t pa; 8306 int val; 8307 8308 PG_A = pmap_accessed_bit(pmap); 8309 PG_M = pmap_modified_bit(pmap); 8310 PG_V = pmap_valid_bit(pmap); 8311 PG_RW = pmap_rw_bit(pmap); 8312 8313 PMAP_LOCK(pmap); 8314 retry: 8315 pdep = pmap_pde(pmap, addr); 8316 if (pdep != NULL && (*pdep & PG_V)) { 8317 if (*pdep & PG_PS) { 8318 pte = *pdep; 8319 /* Compute the physical address of the 4KB page. */ 8320 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 8321 PG_FRAME; 8322 val = MINCORE_SUPER; 8323 } else { 8324 pte = *pmap_pde_to_pte(pdep, addr); 8325 pa = pte & PG_FRAME; 8326 val = 0; 8327 } 8328 } else { 8329 pte = 0; 8330 pa = 0; 8331 val = 0; 8332 } 8333 if ((pte & PG_V) != 0) { 8334 val |= MINCORE_INCORE; 8335 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 8336 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 8337 if ((pte & PG_A) != 0) 8338 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 8339 } 8340 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 8341 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 8342 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 8343 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 8344 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 8345 goto retry; 8346 } else 8347 PA_UNLOCK_COND(*locked_pa); 8348 PMAP_UNLOCK(pmap); 8349 return (val); 8350 } 8351 8352 static uint64_t 8353 pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 8354 { 8355 uint32_t gen, new_gen, pcid_next; 8356 8357 CRITICAL_ASSERT(curthread); 8358 gen = PCPU_GET(pcid_gen); 8359 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) 8360 return (pti ? 
0 : CR3_PCID_SAVE); 8361 if (pmap->pm_pcids[cpuid].pm_gen == gen) 8362 return (CR3_PCID_SAVE); 8363 pcid_next = PCPU_GET(pcid_next); 8364 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 8365 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 8366 ("cpu %d pcid_next %#x", cpuid, pcid_next)); 8367 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 8368 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 8369 new_gen = gen + 1; 8370 if (new_gen == 0) 8371 new_gen = 1; 8372 PCPU_SET(pcid_gen, new_gen); 8373 pcid_next = PMAP_PCID_KERN + 1; 8374 } else { 8375 new_gen = gen; 8376 } 8377 pmap->pm_pcids[cpuid].pm_pcid = pcid_next; 8378 pmap->pm_pcids[cpuid].pm_gen = new_gen; 8379 PCPU_SET(pcid_next, pcid_next + 1); 8380 return (0); 8381 } 8382 8383 static uint64_t 8384 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) 8385 { 8386 uint64_t cached; 8387 8388 cached = pmap_pcid_alloc(pmap, cpuid); 8389 KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, 8390 ("pmap %p cpu %d pcid %#x", pmap, cpuid, 8391 pmap->pm_pcids[cpuid].pm_pcid)); 8392 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || 8393 pmap == kernel_pmap, 8394 ("non-kernel pmap pmap %p cpu %d pcid %#x", 8395 pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); 8396 return (cached); 8397 } 8398 8399 static void 8400 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 8401 { 8402 8403 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 8404 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_pcb; 8405 } 8406 8407 static void inline 8408 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) 8409 { 8410 struct invpcid_descr d; 8411 uint64_t cached, cr3, kcr3, ucr3; 8412 8413 cached = pmap_pcid_alloc_checked(pmap, cpuid); 8414 cr3 = rcr3(); 8415 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 8416 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); 8417 PCPU_SET(curpmap, pmap); 8418 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; 8419 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | 8420 PMAP_PCID_USER_PT; 8421 8422 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { 8423 /* 8424 * Explicitly invalidate translations cached from the 8425 * user page table. They are not automatically 8426 * flushed by reload of cr3 with the kernel page table 8427 * pointer above. 8428 * 8429 * Note that the if() condition is resolved statically 8430 * by using the function argument instead of 8431 * runtime-evaluated invpcid_works value. 8432 */ 8433 if (invpcid_works1) { 8434 d.pcid = PMAP_PCID_USER_PT | 8435 pmap->pm_pcids[cpuid].pm_pcid; 8436 d.pad = 0; 8437 d.addr = 0; 8438 invpcid(&d, INVPCID_CTX); 8439 } else { 8440 pmap_pti_pcid_invalidate(ucr3, kcr3); 8441 } 8442 } 8443 8444 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 8445 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 8446 if (cached) 8447 PCPU_INC(pm_save_cnt); 8448 } 8449 8450 static void 8451 pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 8452 { 8453 8454 pmap_activate_sw_pcid_pti(pmap, cpuid, true); 8455 pmap_activate_sw_pti_post(td, pmap); 8456 } 8457 8458 static void 8459 pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap, 8460 u_int cpuid) 8461 { 8462 register_t rflags; 8463 8464 /* 8465 * If the INVPCID instruction is not available, 8466 * invltlb_pcid_handler() is used to handle an invalidate_all 8467 * IPI, which checks for curpmap == smp_tlb_pmap. The below 8468 * sequence of operations has a window where %CR3 is loaded 8469 * with the new pmap's PML4 address, but the curpmap value has 8470 * not yet been updated. 
This causes the invltlb IPI handler, 8471 * which is called between the updates, to execute as a NOP, 8472 * which leaves stale TLB entries. 8473 * 8474 * Note that the most typical use of pmap_activate_sw(), from 8475 * the context switch, is immune to this race, because 8476 * interrupts are disabled (while the thread lock is owned), 8477 * and the IPI happens after curpmap is updated. Protect 8478 * other callers in a similar way, by disabling interrupts 8479 * around the %cr3 register reload and curpmap assignment. 8480 */ 8481 rflags = intr_disable(); 8482 pmap_activate_sw_pcid_pti(pmap, cpuid, false); 8483 intr_restore(rflags); 8484 pmap_activate_sw_pti_post(td, pmap); 8485 } 8486 8487 static void 8488 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 8489 u_int cpuid) 8490 { 8491 uint64_t cached, cr3; 8492 8493 cached = pmap_pcid_alloc_checked(pmap, cpuid); 8494 cr3 = rcr3(); 8495 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 8496 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 8497 cached); 8498 PCPU_SET(curpmap, pmap); 8499 if (cached) 8500 PCPU_INC(pm_save_cnt); 8501 } 8502 8503 static void 8504 pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap, 8505 u_int cpuid) 8506 { 8507 register_t rflags; 8508 8509 rflags = intr_disable(); 8510 pmap_activate_sw_pcid_nopti(td, pmap, cpuid); 8511 intr_restore(rflags); 8512 } 8513 8514 static void 8515 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 8516 u_int cpuid __unused) 8517 { 8518 8519 load_cr3(pmap->pm_cr3); 8520 PCPU_SET(curpmap, pmap); 8521 } 8522 8523 static void 8524 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 8525 u_int cpuid __unused) 8526 { 8527 8528 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 8529 PCPU_SET(kcr3, pmap->pm_cr3); 8530 PCPU_SET(ucr3, pmap->pm_ucr3); 8531 pmap_activate_sw_pti_post(td, pmap); 8532 } 8533 8534 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 8535 u_int)) 8536 { 8537 8538 if (pmap_pcid_enabled && pti && invpcid_works) 8539 return (pmap_activate_sw_pcid_invpcid_pti); 8540 else if (pmap_pcid_enabled && pti && !invpcid_works) 8541 return (pmap_activate_sw_pcid_noinvpcid_pti); 8542 else if (pmap_pcid_enabled && !pti && invpcid_works) 8543 return (pmap_activate_sw_pcid_nopti); 8544 else if (pmap_pcid_enabled && !pti && !invpcid_works) 8545 return (pmap_activate_sw_pcid_noinvpcid_nopti); 8546 else if (!pmap_pcid_enabled && pti) 8547 return (pmap_activate_sw_nopcid_pti); 8548 else /* if (!pmap_pcid_enabled && !pti) */ 8549 return (pmap_activate_sw_nopcid_nopti); 8550 } 8551 8552 void 8553 pmap_activate_sw(struct thread *td) 8554 { 8555 pmap_t oldpmap, pmap; 8556 u_int cpuid; 8557 8558 oldpmap = PCPU_GET(curpmap); 8559 pmap = vmspace_pmap(td->td_proc->p_vmspace); 8560 if (oldpmap == pmap) 8561 return; 8562 cpuid = PCPU_GET(cpuid); 8563 #ifdef SMP 8564 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 8565 #else 8566 CPU_SET(cpuid, &pmap->pm_active); 8567 #endif 8568 pmap_activate_sw_mode(td, pmap, cpuid); 8569 #ifdef SMP 8570 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 8571 #else 8572 CPU_CLR(cpuid, &oldpmap->pm_active); 8573 #endif 8574 } 8575 8576 void 8577 pmap_activate(struct thread *td) 8578 { 8579 8580 critical_enter(); 8581 pmap_activate_sw(td); 8582 critical_exit(); 8583 } 8584 8585 void 8586 pmap_activate_boot(pmap_t pmap) 8587 { 8588 uint64_t kcr3; 8589 u_int cpuid; 8590 8591 /* 8592 * kernel_pmap must be never deactivated, and we ensure that 8593 * by never activating it at 
all. 8594 */ 8595 MPASS(pmap != kernel_pmap); 8596 8597 cpuid = PCPU_GET(cpuid); 8598 #ifdef SMP 8599 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 8600 #else 8601 CPU_SET(cpuid, &pmap->pm_active); 8602 #endif 8603 PCPU_SET(curpmap, pmap); 8604 if (pti) { 8605 kcr3 = pmap->pm_cr3; 8606 if (pmap_pcid_enabled) 8607 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; 8608 } else { 8609 kcr3 = PMAP_NO_CR3; 8610 } 8611 PCPU_SET(kcr3, kcr3); 8612 PCPU_SET(ucr3, PMAP_NO_CR3); 8613 } 8614 8615 void 8616 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 8617 { 8618 } 8619 8620 /* 8621 * Increase the starting virtual address of the given mapping if a 8622 * different alignment might result in more superpage mappings. 8623 */ 8624 void 8625 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 8626 vm_offset_t *addr, vm_size_t size) 8627 { 8628 vm_offset_t superpage_offset; 8629 8630 if (size < NBPDR) 8631 return; 8632 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 8633 offset += ptoa(object->pg_color); 8634 superpage_offset = offset & PDRMASK; 8635 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 8636 (*addr & PDRMASK) == superpage_offset) 8637 return; 8638 if ((*addr & PDRMASK) < superpage_offset) 8639 *addr = (*addr & ~PDRMASK) + superpage_offset; 8640 else 8641 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 8642 } 8643 8644 #ifdef INVARIANTS 8645 static unsigned long num_dirty_emulations; 8646 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 8647 &num_dirty_emulations, 0, NULL); 8648 8649 static unsigned long num_accessed_emulations; 8650 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 8651 &num_accessed_emulations, 0, NULL); 8652 8653 static unsigned long num_superpage_accessed_emulations; 8654 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 8655 &num_superpage_accessed_emulations, 0, NULL); 8656 8657 static unsigned long ad_emulation_superpage_promotions; 8658 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 8659 &ad_emulation_superpage_promotions, 0, NULL); 8660 #endif /* INVARIANTS */ 8661 8662 int 8663 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 8664 { 8665 int rv; 8666 struct rwlock *lock; 8667 #if VM_NRESERVLEVEL > 0 8668 vm_page_t m, mpte; 8669 #endif 8670 pd_entry_t *pde; 8671 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 8672 8673 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 8674 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 8675 8676 if (!pmap_emulate_ad_bits(pmap)) 8677 return (-1); 8678 8679 PG_A = pmap_accessed_bit(pmap); 8680 PG_M = pmap_modified_bit(pmap); 8681 PG_V = pmap_valid_bit(pmap); 8682 PG_RW = pmap_rw_bit(pmap); 8683 8684 rv = -1; 8685 lock = NULL; 8686 PMAP_LOCK(pmap); 8687 8688 pde = pmap_pde(pmap, va); 8689 if (pde == NULL || (*pde & PG_V) == 0) 8690 goto done; 8691 8692 if ((*pde & PG_PS) != 0) { 8693 if (ftype == VM_PROT_READ) { 8694 #ifdef INVARIANTS 8695 atomic_add_long(&num_superpage_accessed_emulations, 1); 8696 #endif 8697 *pde |= PG_A; 8698 rv = 0; 8699 } 8700 goto done; 8701 } 8702 8703 pte = pmap_pde_to_pte(pde, va); 8704 if ((*pte & PG_V) == 0) 8705 goto done; 8706 8707 if (ftype == VM_PROT_WRITE) { 8708 if ((*pte & PG_RW) == 0) 8709 goto done; 8710 /* 8711 * Set the modified and accessed bits simultaneously. 8712 * 8713 * Intel EPT PTEs that do software emulation of A/D bits map 8714 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 
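* Setting only PG_M would therefore produce an entry that is writable but not readable.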
8715 * An EPT misconfiguration is triggered if the PTE is writable 8716 * but not readable (WR=10). This is avoided by setting PG_A 8717 * and PG_M simultaneously. 8718 */ 8719 *pte |= PG_M | PG_A; 8720 } else { 8721 *pte |= PG_A; 8722 } 8723 8724 #if VM_NRESERVLEVEL > 0 8725 /* try to promote the mapping */ 8726 if (va < VM_MAXUSER_ADDRESS) 8727 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 8728 else 8729 mpte = NULL; 8730 8731 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 8732 8733 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 8734 pmap_ps_enabled(pmap) && 8735 (m->flags & PG_FICTITIOUS) == 0 && 8736 vm_reserv_level_iffullpop(m) == 0) { 8737 pmap_promote_pde(pmap, pde, va, &lock); 8738 #ifdef INVARIANTS 8739 atomic_add_long(&ad_emulation_superpage_promotions, 1); 8740 #endif 8741 } 8742 #endif 8743 8744 #ifdef INVARIANTS 8745 if (ftype == VM_PROT_WRITE) 8746 atomic_add_long(&num_dirty_emulations, 1); 8747 else 8748 atomic_add_long(&num_accessed_emulations, 1); 8749 #endif 8750 rv = 0; /* success */ 8751 done: 8752 if (lock != NULL) 8753 rw_wunlock(lock); 8754 PMAP_UNLOCK(pmap); 8755 return (rv); 8756 } 8757 8758 void 8759 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 8760 { 8761 pml4_entry_t *pml4; 8762 pdp_entry_t *pdp; 8763 pd_entry_t *pde; 8764 pt_entry_t *pte, PG_V; 8765 int idx; 8766 8767 idx = 0; 8768 PG_V = pmap_valid_bit(pmap); 8769 PMAP_LOCK(pmap); 8770 8771 pml4 = pmap_pml4e(pmap, va); 8772 ptr[idx++] = *pml4; 8773 if ((*pml4 & PG_V) == 0) 8774 goto done; 8775 8776 pdp = pmap_pml4e_to_pdpe(pml4, va); 8777 ptr[idx++] = *pdp; 8778 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 8779 goto done; 8780 8781 pde = pmap_pdpe_to_pde(pdp, va); 8782 ptr[idx++] = *pde; 8783 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 8784 goto done; 8785 8786 pte = pmap_pde_to_pte(pde, va); 8787 ptr[idx++] = *pte; 8788 8789 done: 8790 PMAP_UNLOCK(pmap); 8791 *num = idx; 8792 } 8793 8794 /** 8795 * Get the kernel virtual address of a set of physical pages. If there are 8796 * physical addresses not covered by the DMAP perform a transient mapping 8797 * that will be removed when calling pmap_unmap_io_transient. 8798 * 8799 * \param page The pages the caller wishes to obtain the virtual 8800 * address on the kernel memory map. 8801 * \param vaddr On return contains the kernel virtual memory address 8802 * of the pages passed in the page parameter. 8803 * \param count Number of pages passed in. 8804 * \param can_fault TRUE if the thread using the mapped pages can take 8805 * page faults, FALSE otherwise. 8806 * 8807 * \returns TRUE if the caller must call pmap_unmap_io_transient when 8808 * finished or FALSE otherwise. 8809 * 8810 */ 8811 boolean_t 8812 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 8813 boolean_t can_fault) 8814 { 8815 vm_paddr_t paddr; 8816 boolean_t needs_mapping; 8817 pt_entry_t *pte; 8818 int cache_bits, error __unused, i; 8819 8820 /* 8821 * Allocate any KVA space that we need, this is done in a separate 8822 * loop to prevent calling vmem_alloc while pinned. 
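* The allocations use M_WAITOK and may sleep, so they are performed before the thread is pinned by sched_pin() below.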
8823 */ 8824 needs_mapping = FALSE; 8825 for (i = 0; i < count; i++) { 8826 paddr = VM_PAGE_TO_PHYS(page[i]); 8827 if (__predict_false(paddr >= dmaplimit)) { 8828 error = vmem_alloc(kernel_arena, PAGE_SIZE, 8829 M_BESTFIT | M_WAITOK, &vaddr[i]); 8830 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 8831 needs_mapping = TRUE; 8832 } else { 8833 vaddr[i] = PHYS_TO_DMAP(paddr); 8834 } 8835 } 8836 8837 /* Exit early if everything is covered by the DMAP */ 8838 if (!needs_mapping) 8839 return (FALSE); 8840 8841 /* 8842 * NB: The sequence of updating a page table followed by accesses 8843 * to the corresponding pages used in the !DMAP case is subject to 8844 * the situation described in the "AMD64 Architecture Programmer's 8845 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 8846 * Coherency Considerations". Therefore, issuing the INVLPG right 8847 * after modifying the PTE bits is crucial. 8848 */ 8849 if (!can_fault) 8850 sched_pin(); 8851 for (i = 0; i < count; i++) { 8852 paddr = VM_PAGE_TO_PHYS(page[i]); 8853 if (paddr >= dmaplimit) { 8854 if (can_fault) { 8855 /* 8856 * Slow path, since we can get page faults 8857 * while mappings are active don't pin the 8858 * thread to the CPU and instead add a global 8859 * mapping visible to all CPUs. 8860 */ 8861 pmap_qenter(vaddr[i], &page[i], 1); 8862 } else { 8863 pte = vtopte(vaddr[i]); 8864 cache_bits = pmap_cache_bits(kernel_pmap, 8865 page[i]->md.pat_mode, 0); 8866 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 8867 cache_bits); 8868 invlpg(vaddr[i]); 8869 } 8870 } 8871 } 8872 8873 return (needs_mapping); 8874 } 8875 8876 void 8877 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 8878 boolean_t can_fault) 8879 { 8880 vm_paddr_t paddr; 8881 int i; 8882 8883 if (!can_fault) 8884 sched_unpin(); 8885 for (i = 0; i < count; i++) { 8886 paddr = VM_PAGE_TO_PHYS(page[i]); 8887 if (paddr >= dmaplimit) { 8888 if (can_fault) 8889 pmap_qremove(vaddr[i], 1); 8890 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 8891 } 8892 } 8893 } 8894 8895 vm_offset_t 8896 pmap_quick_enter_page(vm_page_t m) 8897 { 8898 vm_paddr_t paddr; 8899 8900 paddr = VM_PAGE_TO_PHYS(m); 8901 if (paddr < dmaplimit) 8902 return (PHYS_TO_DMAP(paddr)); 8903 mtx_lock_spin(&qframe_mtx); 8904 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 8905 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 8906 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 8907 return (qframe); 8908 } 8909 8910 void 8911 pmap_quick_remove_page(vm_offset_t addr) 8912 { 8913 8914 if (addr != qframe) 8915 return; 8916 pte_store(vtopte(qframe), 0); 8917 invlpg(qframe); 8918 mtx_unlock_spin(&qframe_mtx); 8919 } 8920 8921 /* 8922 * Pdp pages from the large map are managed differently from either 8923 * kernel or user page table pages. They are permanently allocated at 8924 * initialization time, and their wire count is permanently set to 8925 * zero. The pml4 entries pointing to those pages are copied into 8926 * each allocated pmap. 8927 * 8928 * In contrast, pd and pt pages are managed like user page table 8929 * pages. They are dynamically allocated, and their wire count 8930 * represents the number of valid entries within the page. 
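* When a pd or pt page's wire count drops to zero in pmap_large_unmap(), the page is freed and the entry referencing it in the level above is cleared.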
8931 */ 8932 static vm_page_t 8933 pmap_large_map_getptp_unlocked(void) 8934 { 8935 vm_page_t m; 8936 8937 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 8938 VM_ALLOC_ZERO); 8939 if (m != NULL && (m->flags & PG_ZERO) == 0) 8940 pmap_zero_page(m); 8941 return (m); 8942 } 8943 8944 static vm_page_t 8945 pmap_large_map_getptp(void) 8946 { 8947 vm_page_t m; 8948 8949 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 8950 m = pmap_large_map_getptp_unlocked(); 8951 if (m == NULL) { 8952 PMAP_UNLOCK(kernel_pmap); 8953 vm_wait(NULL); 8954 PMAP_LOCK(kernel_pmap); 8955 /* Callers retry. */ 8956 } 8957 return (m); 8958 } 8959 8960 static pdp_entry_t * 8961 pmap_large_map_pdpe(vm_offset_t va) 8962 { 8963 vm_pindex_t pml4_idx; 8964 vm_paddr_t mphys; 8965 8966 pml4_idx = pmap_pml4e_index(va); 8967 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 8968 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 8969 "%#jx lm_ents %d", 8970 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 8971 KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, 8972 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 8973 "LMSPML4I %#jx lm_ents %d", 8974 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 8975 mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; 8976 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 8977 } 8978 8979 static pd_entry_t * 8980 pmap_large_map_pde(vm_offset_t va) 8981 { 8982 pdp_entry_t *pdpe; 8983 vm_page_t m; 8984 vm_paddr_t mphys; 8985 8986 retry: 8987 pdpe = pmap_large_map_pdpe(va); 8988 if (*pdpe == 0) { 8989 m = pmap_large_map_getptp(); 8990 if (m == NULL) 8991 goto retry; 8992 mphys = VM_PAGE_TO_PHYS(m); 8993 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 8994 } else { 8995 MPASS((*pdpe & X86_PG_PS) == 0); 8996 mphys = *pdpe & PG_FRAME; 8997 } 8998 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 8999 } 9000 9001 static pt_entry_t * 9002 pmap_large_map_pte(vm_offset_t va) 9003 { 9004 pd_entry_t *pde; 9005 vm_page_t m; 9006 vm_paddr_t mphys; 9007 9008 retry: 9009 pde = pmap_large_map_pde(va); 9010 if (*pde == 0) { 9011 m = pmap_large_map_getptp(); 9012 if (m == NULL) 9013 goto retry; 9014 mphys = VM_PAGE_TO_PHYS(m); 9015 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 9016 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; 9017 } else { 9018 MPASS((*pde & X86_PG_PS) == 0); 9019 mphys = *pde & PG_FRAME; 9020 } 9021 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 9022 } 9023 9024 static vm_paddr_t 9025 pmap_large_map_kextract(vm_offset_t va) 9026 { 9027 pdp_entry_t *pdpe, pdp; 9028 pd_entry_t *pde, pd; 9029 pt_entry_t *pte, pt; 9030 9031 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 9032 ("not largemap range %#lx", (u_long)va)); 9033 pdpe = pmap_large_map_pdpe(va); 9034 pdp = *pdpe; 9035 KASSERT((pdp & X86_PG_V) != 0, 9036 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 9037 (u_long)pdpe, pdp)); 9038 if ((pdp & X86_PG_PS) != 0) { 9039 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 9040 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 9041 (u_long)pdpe, pdp)); 9042 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 9043 } 9044 pde = pmap_pdpe_to_pde(pdpe, va); 9045 pd = *pde; 9046 KASSERT((pd & X86_PG_V) != 0, 9047 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 9048 if ((pd & X86_PG_PS) != 0) 9049 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 9050 pte = pmap_pde_to_pte(pde, va); 9051 pt = *pte; 9052 KASSERT((pt & X86_PG_V) != 0, 9053 ("invalid pte va %#lx pte %#lx 
pt %#lx", va, (u_long)pte, pt)); 9054 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 9055 } 9056 9057 static int 9058 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 9059 vmem_addr_t *vmem_res) 9060 { 9061 9062 /* 9063 * Large mappings are all but static. Consequently, there 9064 * is no point in waiting for an earlier allocation to be 9065 * freed. 9066 */ 9067 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 9068 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 9069 } 9070 9071 int 9072 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 9073 vm_memattr_t mattr) 9074 { 9075 pdp_entry_t *pdpe; 9076 pd_entry_t *pde; 9077 pt_entry_t *pte; 9078 vm_offset_t va, inc; 9079 vmem_addr_t vmem_res; 9080 vm_paddr_t pa; 9081 int error; 9082 9083 if (len == 0 || spa + len < spa) 9084 return (EINVAL); 9085 9086 /* See if DMAP can serve. */ 9087 if (spa + len <= dmaplimit) { 9088 va = PHYS_TO_DMAP(spa); 9089 *addr = (void *)va; 9090 return (pmap_change_attr(va, len, mattr)); 9091 } 9092 9093 /* 9094 * No, allocate KVA. Fit the address with the best possible 9095 * alignment for superpages. Fall back to a worse alignment 9096 * if that fails. 9097 */ 9098 error = ENOMEM; 9099 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 9100 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 9101 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 9102 &vmem_res); 9103 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 9104 NBPDR) + NBPDR) 9105 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 9106 &vmem_res); 9107 if (error != 0) 9108 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 9109 if (error != 0) 9110 return (error); 9111 9112 /* 9113 * Fill the page table. PG_M is not pre-set; we scan the modified 9114 * bits in the page table to minimize flushing. No need to 9115 * invalidate the TLB, since we only update invalid entries.
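* The hardware does not cache translations for entries whose valid bit is clear, so writing a previously invalid entry cannot leave a stale translation behind.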
9116 */ 9117 PMAP_LOCK(kernel_pmap); 9118 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 9119 len -= inc) { 9120 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 9121 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 9122 pdpe = pmap_large_map_pdpe(va); 9123 MPASS(*pdpe == 0); 9124 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 9125 X86_PG_V | X86_PG_A | pg_nx | 9126 pmap_cache_bits(kernel_pmap, mattr, TRUE); 9127 inc = NBPDP; 9128 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 9129 (va & PDRMASK) == 0) { 9130 pde = pmap_large_map_pde(va); 9131 MPASS(*pde == 0); 9132 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 9133 X86_PG_V | X86_PG_A | pg_nx | 9134 pmap_cache_bits(kernel_pmap, mattr, TRUE); 9135 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 9136 wire_count++; 9137 inc = NBPDR; 9138 } else { 9139 pte = pmap_large_map_pte(va); 9140 MPASS(*pte == 0); 9141 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 9142 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 9143 mattr, FALSE); 9144 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 9145 wire_count++; 9146 inc = PAGE_SIZE; 9147 } 9148 } 9149 PMAP_UNLOCK(kernel_pmap); 9150 MPASS(len == 0); 9151 9152 *addr = (void *)vmem_res; 9153 return (0); 9154 } 9155 9156 void 9157 pmap_large_unmap(void *svaa, vm_size_t len) 9158 { 9159 vm_offset_t sva, va; 9160 vm_size_t inc; 9161 pdp_entry_t *pdpe, pdp; 9162 pd_entry_t *pde, pd; 9163 pt_entry_t *pte; 9164 vm_page_t m; 9165 struct spglist spgf; 9166 9167 sva = (vm_offset_t)svaa; 9168 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 9169 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 9170 return; 9171 9172 SLIST_INIT(&spgf); 9173 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 9174 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 9175 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 9176 PMAP_LOCK(kernel_pmap); 9177 for (va = sva; va < sva + len; va += inc) { 9178 pdpe = pmap_large_map_pdpe(va); 9179 pdp = *pdpe; 9180 KASSERT((pdp & X86_PG_V) != 0, 9181 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 9182 (u_long)pdpe, pdp)); 9183 if ((pdp & X86_PG_PS) != 0) { 9184 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 9185 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 9186 (u_long)pdpe, pdp)); 9187 KASSERT((va & PDPMASK) == 0, 9188 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 9189 (u_long)pdpe, pdp)); 9190 KASSERT(va + NBPDP <= sva + len, 9191 ("unmap covers partial 1GB page, sva %#lx va %#lx " 9192 "pdpe %#lx pdp %#lx len %#lx", sva, va, 9193 (u_long)pdpe, pdp, len)); 9194 *pdpe = 0; 9195 inc = NBPDP; 9196 continue; 9197 } 9198 pde = pmap_pdpe_to_pde(pdpe, va); 9199 pd = *pde; 9200 KASSERT((pd & X86_PG_V) != 0, 9201 ("invalid pd va %#lx pde %#lx pd %#lx", va, 9202 (u_long)pde, pd)); 9203 if ((pd & X86_PG_PS) != 0) { 9204 KASSERT((va & PDRMASK) == 0, 9205 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 9206 (u_long)pde, pd)); 9207 KASSERT(va + NBPDR <= sva + len, 9208 ("unmap covers partial 2MB page, sva %#lx va %#lx " 9209 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 9210 pd, len)); 9211 pde_store(pde, 0); 9212 inc = NBPDR; 9213 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 9214 m->wire_count--; 9215 if (m->wire_count == 0) { 9216 *pdpe = 0; 9217 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 9218 } 9219 continue; 9220 } 9221 pte = pmap_pde_to_pte(pde, va); 9222 KASSERT((*pte & X86_PG_V) != 0, 9223 ("invalid pte va %#lx pte %#lx pt %#lx", va, 9224 (u_long)pte, *pte)); 9225 pte_clear(pte); 9226 inc = PAGE_SIZE; 9227 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 9228 
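/* Unwire the page table page holding this PTE; if it becomes empty, clear the PDE and unwire the page directory page as well. */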
m->wire_count--; 9229 if (m->wire_count == 0) { 9230 *pde = 0; 9231 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 9232 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 9233 m->wire_count--; 9234 if (m->wire_count == 0) { 9235 *pdpe = 0; 9236 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 9237 } 9238 } 9239 } 9240 pmap_invalidate_range(kernel_pmap, sva, sva + len); 9241 PMAP_UNLOCK(kernel_pmap); 9242 vm_page_free_pages_toq(&spgf, false); 9243 vmem_free(large_vmem, sva, len); 9244 } 9245 9246 static void 9247 pmap_large_map_wb_fence_mfence(void) 9248 { 9249 9250 mfence(); 9251 } 9252 9253 static void 9254 pmap_large_map_wb_fence_sfence(void) 9255 { 9256 9257 sfence(); 9258 } 9259 9260 static void 9261 pmap_large_map_wb_fence_nop(void) 9262 { 9263 } 9264 9265 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 9266 { 9267 9268 if (cpu_vendor_id != CPU_VENDOR_INTEL) 9269 return (pmap_large_map_wb_fence_mfence); 9270 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 9271 CPUID_STDEXT_CLFLUSHOPT)) == 0) 9272 return (pmap_large_map_wb_fence_sfence); 9273 else 9274 /* clflush is strongly enough ordered */ 9275 return (pmap_large_map_wb_fence_nop); 9276 } 9277 9278 static void 9279 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 9280 { 9281 9282 for (; len > 0; len -= cpu_clflush_line_size, 9283 va += cpu_clflush_line_size) 9284 clwb(va); 9285 } 9286 9287 static void 9288 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 9289 { 9290 9291 for (; len > 0; len -= cpu_clflush_line_size, 9292 va += cpu_clflush_line_size) 9293 clflushopt(va); 9294 } 9295 9296 static void 9297 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 9298 { 9299 9300 for (; len > 0; len -= cpu_clflush_line_size, 9301 va += cpu_clflush_line_size) 9302 clflush(va); 9303 } 9304 9305 static void 9306 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 9307 { 9308 } 9309 9310 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 9311 { 9312 9313 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 9314 return (pmap_large_map_flush_range_clwb); 9315 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 9316 return (pmap_large_map_flush_range_clflushopt); 9317 else if ((cpu_feature & CPUID_CLFSH) != 0) 9318 return (pmap_large_map_flush_range_clflush); 9319 else 9320 return (pmap_large_map_flush_range_nop); 9321 } 9322 9323 static void 9324 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 9325 { 9326 volatile u_long *pe; 9327 u_long p; 9328 vm_offset_t va; 9329 vm_size_t inc; 9330 bool seen_other; 9331 9332 for (va = sva; va < eva; va += inc) { 9333 inc = 0; 9334 if ((amd_feature & AMDID_PAGE1GB) != 0) { 9335 pe = (volatile u_long *)pmap_large_map_pdpe(va); 9336 p = *pe; 9337 if ((p & X86_PG_PS) != 0) 9338 inc = NBPDP; 9339 } 9340 if (inc == 0) { 9341 pe = (volatile u_long *)pmap_large_map_pde(va); 9342 p = *pe; 9343 if ((p & X86_PG_PS) != 0) 9344 inc = NBPDR; 9345 } 9346 if (inc == 0) { 9347 pe = (volatile u_long *)pmap_large_map_pte(va); 9348 p = *pe; 9349 inc = PAGE_SIZE; 9350 } 9351 seen_other = false; 9352 for (;;) { 9353 if ((p & X86_PG_AVAIL1) != 0) { 9354 /* 9355 * Spin-wait for the end of a parallel 9356 * write-back. 9357 */ 9358 cpu_spinwait(); 9359 p = *pe; 9360 9361 /* 9362 * If we saw other write-back 9363 * occurring, we cannot rely on PG_M to 9364 * indicate state of the cache.
The 9365 * PG_M bit is cleared before the 9366 * flush to avoid ignoring new writes, 9367 * and writes which are relevant for 9368 * us might happen after. 9369 */ 9370 seen_other = true; 9371 continue; 9372 } 9373 9374 if ((p & X86_PG_M) != 0 || seen_other) { 9375 if (!atomic_fcmpset_long(pe, &p, 9376 (p & ~X86_PG_M) | X86_PG_AVAIL1)) 9377 /* 9378 * If we saw PG_M without 9379 * PG_AVAIL1, and then on the 9380 * next attempt we do not 9381 * observe either PG_M or 9382 * PG_AVAIL1, the other 9383 * write-back started after us 9384 * and finished before us. We 9385 * can rely on it doing our 9386 * work. 9387 */ 9388 continue; 9389 pmap_large_map_flush_range(va, inc); 9390 atomic_clear_long(pe, X86_PG_AVAIL1); 9391 } 9392 break; 9393 } 9394 maybe_yield(); 9395 } 9396 } 9397 9398 /* 9399 * Write-back cache lines for the given address range. 9400 * 9401 * Must be called only on the range or sub-range returned from 9402 * pmap_large_map(). Must not be called on the coalesced ranges. 9403 * 9404 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH 9405 * instructions support. 9406 */ 9407 void 9408 pmap_large_map_wb(void *svap, vm_size_t len) 9409 { 9410 vm_offset_t eva, sva; 9411 9412 sva = (vm_offset_t)svap; 9413 eva = sva + len; 9414 pmap_large_map_wb_fence(); 9415 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { 9416 pmap_large_map_flush_range(sva, len); 9417 } else { 9418 KASSERT(sva >= LARGEMAP_MIN_ADDRESS && 9419 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, 9420 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); 9421 pmap_large_map_wb_large(sva, eva); 9422 } 9423 pmap_large_map_wb_fence(); 9424 } 9425 9426 static vm_page_t 9427 pmap_pti_alloc_page(void) 9428 { 9429 vm_page_t m; 9430 9431 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9432 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | 9433 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 9434 return (m); 9435 } 9436 9437 static bool 9438 pmap_pti_free_page(vm_page_t m) 9439 { 9440 9441 KASSERT(m->wire_count > 0, ("page %p not wired", m)); 9442 if (!vm_page_unwire_noq(m)) 9443 return (false); 9444 vm_page_free_zero(m); 9445 return (true); 9446 } 9447 9448 static void 9449 pmap_pti_init(void) 9450 { 9451 vm_page_t pml4_pg; 9452 pdp_entry_t *pdpe; 9453 vm_offset_t va; 9454 int i; 9455 9456 if (!pti) 9457 return; 9458 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 9459 VM_OBJECT_WLOCK(pti_obj); 9460 pml4_pg = pmap_pti_alloc_page(); 9461 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 9462 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 9463 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 9464 pdpe = pmap_pti_pdpe(va); 9465 pmap_pti_wire_pte(pdpe); 9466 } 9467 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 9468 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 9469 pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + 9470 sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); 9471 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 9472 sizeof(struct gate_descriptor) * NIDT, false); 9473 pmap_pti_add_kva_locked((vm_offset_t)common_tss, 9474 (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); 9475 CPU_FOREACH(i) { 9476 /* Doublefault stack IST 1 */ 9477 va = common_tss[i].tss_ist1; 9478 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 9479 /* NMI stack IST 2 */ 9480 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); 9481 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 9482 /* MC# stack 
IST 3 */ 9483 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); 9484 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 9485 /* DB# stack IST 4 */ 9486 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); 9487 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 9488 } 9489 pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, 9490 (vm_offset_t)etext, true); 9491 pti_finalized = true; 9492 VM_OBJECT_WUNLOCK(pti_obj); 9493 } 9494 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); 9495 9496 static pdp_entry_t * 9497 pmap_pti_pdpe(vm_offset_t va) 9498 { 9499 pml4_entry_t *pml4e; 9500 pdp_entry_t *pdpe; 9501 vm_page_t m; 9502 vm_pindex_t pml4_idx; 9503 vm_paddr_t mphys; 9504 9505 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9506 9507 pml4_idx = pmap_pml4e_index(va); 9508 pml4e = &pti_pml4[pml4_idx]; 9509 m = NULL; 9510 if (*pml4e == 0) { 9511 if (pti_finalized) 9512 panic("pml4 alloc after finalization\n"); 9513 m = pmap_pti_alloc_page(); 9514 if (*pml4e != 0) { 9515 pmap_pti_free_page(m); 9516 mphys = *pml4e & ~PAGE_MASK; 9517 } else { 9518 mphys = VM_PAGE_TO_PHYS(m); 9519 *pml4e = mphys | X86_PG_RW | X86_PG_V; 9520 } 9521 } else { 9522 mphys = *pml4e & ~PAGE_MASK; 9523 } 9524 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 9525 return (pdpe); 9526 } 9527 9528 static void 9529 pmap_pti_wire_pte(void *pte) 9530 { 9531 vm_page_t m; 9532 9533 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9534 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 9535 m->wire_count++; 9536 } 9537 9538 static void 9539 pmap_pti_unwire_pde(void *pde, bool only_ref) 9540 { 9541 vm_page_t m; 9542 9543 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9544 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 9545 MPASS(m->wire_count > 0); 9546 MPASS(only_ref || m->wire_count > 1); 9547 pmap_pti_free_page(m); 9548 } 9549 9550 static void 9551 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 9552 { 9553 vm_page_t m; 9554 pd_entry_t *pde; 9555 9556 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9557 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 9558 MPASS(m->wire_count > 0); 9559 if (pmap_pti_free_page(m)) { 9560 pde = pmap_pti_pde(va); 9561 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 9562 *pde = 0; 9563 pmap_pti_unwire_pde(pde, false); 9564 } 9565 } 9566 9567 static pd_entry_t * 9568 pmap_pti_pde(vm_offset_t va) 9569 { 9570 pdp_entry_t *pdpe; 9571 pd_entry_t *pde; 9572 vm_page_t m; 9573 vm_pindex_t pd_idx; 9574 vm_paddr_t mphys; 9575 9576 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9577 9578 pdpe = pmap_pti_pdpe(va); 9579 if (*pdpe == 0) { 9580 m = pmap_pti_alloc_page(); 9581 if (*pdpe != 0) { 9582 pmap_pti_free_page(m); 9583 MPASS((*pdpe & X86_PG_PS) == 0); 9584 mphys = *pdpe & ~PAGE_MASK; 9585 } else { 9586 mphys = VM_PAGE_TO_PHYS(m); 9587 *pdpe = mphys | X86_PG_RW | X86_PG_V; 9588 } 9589 } else { 9590 MPASS((*pdpe & X86_PG_PS) == 0); 9591 mphys = *pdpe & ~PAGE_MASK; 9592 } 9593 9594 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 9595 pd_idx = pmap_pde_index(va); 9596 pde += pd_idx; 9597 return (pde); 9598 } 9599 9600 static pt_entry_t * 9601 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 9602 { 9603 pd_entry_t *pde; 9604 pt_entry_t *pte; 9605 vm_page_t m; 9606 vm_paddr_t mphys; 9607 9608 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 9609 9610 pde = pmap_pti_pde(va); 9611 if (unwire_pde != NULL) { 9612 *unwire_pde = true; 9613 pmap_pti_wire_pte(pde); 9614 } 9615 if (*pde == 0) { 9616 m = pmap_pti_alloc_page(); 9617 if (*pde != 0) { 9618 pmap_pti_free_page(m); 9619 MPASS((*pde & X86_PG_PS) == 0); 9620 mphys = *pde & ~(PAGE_MASK | pg_nx); 
static pt_entry_t *
pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_page_t m;
	vm_paddr_t mphys;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);

	pde = pmap_pti_pde(va);
	if (unwire_pde != NULL) {
		*unwire_pde = true;
		pmap_pti_wire_pte(pde);
	}
	if (*pde == 0) {
		m = pmap_pti_alloc_page();
		if (*pde != 0) {
			pmap_pti_free_page(m);
			MPASS((*pde & X86_PG_PS) == 0);
			mphys = *pde & ~(PAGE_MASK | pg_nx);
		} else {
			mphys = VM_PAGE_TO_PHYS(m);
			*pde = mphys | X86_PG_RW | X86_PG_V;
			if (unwire_pde != NULL)
				*unwire_pde = false;
		}
	} else {
		MPASS((*pde & X86_PG_PS) == 0);
		mphys = *pde & ~(PAGE_MASK | pg_nx);
	}

	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
	pte += pmap_pte_index(va);

	return (pte);
}

static void
pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
{
	vm_paddr_t pa;
	pd_entry_t *pde;
	pt_entry_t *pte, ptev;
	bool unwire_pde;

	VM_OBJECT_ASSERT_WLOCKED(pti_obj);

	sva = trunc_page(sva);
	MPASS(sva > VM_MAXUSER_ADDRESS);
	eva = round_page(eva);
	MPASS(sva < eva);
	for (; sva < eva; sva += PAGE_SIZE) {
		pte = pmap_pti_pte(sva, &unwire_pde);
		pa = pmap_kextract(sva);
		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
		    VM_MEMATTR_DEFAULT, FALSE);
		if (*pte == 0) {
			pte_store(pte, ptev);
			pmap_pti_wire_pte(pte);
		} else {
			KASSERT(!pti_finalized,
			    ("pti overlap after fin %#lx %#lx %#lx",
			    sva, *pte, ptev));
			KASSERT(*pte == ptev,
			    ("pti non-identical pte after fin %#lx %#lx %#lx",
			    sva, *pte, ptev));
		}
		if (unwire_pde) {
			pde = pmap_pti_pde(sva);
			pmap_pti_unwire_pde(pde, true);
		}
	}
}

void
pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
{

	if (!pti)
		return;
	VM_OBJECT_WLOCK(pti_obj);
	pmap_pti_add_kva_locked(sva, eva, exec);
	VM_OBJECT_WUNLOCK(pti_obj);
}

void
pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
{
	pt_entry_t *pte;
	vm_offset_t va;

	if (!pti)
		return;
	sva = rounddown2(sva, PAGE_SIZE);
	MPASS(sva > VM_MAXUSER_ADDRESS);
	eva = roundup2(eva, PAGE_SIZE);
	MPASS(sva < eva);
	VM_OBJECT_WLOCK(pti_obj);
	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = pmap_pti_pte(va, NULL);
		KASSERT((*pte & X86_PG_V) != 0,
		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
		    (u_long)pte, *pte));
		pte_clear(pte);
		pmap_pti_unwire_pte(pte, va);
	}
	pmap_invalidate_range(kernel_pmap, sva, eva);
	VM_OBJECT_WUNLOCK(pti_obj);
}
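/*
 * Example (illustrative sketch, compiled out; not an existing
 * consumer): a hypothetical subsystem that must keep a kernel page
 * visible in the user-mode (PTI) page tables, much as pmap_pti_init()
 * exposes the GDT, IDT, TSS and exception stack pages.  Non-executable
 * mappings pass exec == false.
 */
#if 0
static char example_uvisible_page[PAGE_SIZE] __aligned(PAGE_SIZE);

static void
example_pti_expose(void)
{

	pmap_pti_add_kva((vm_offset_t)example_uvisible_page,
	    (vm_offset_t)example_uvisible_page + PAGE_SIZE, false);
}

static void
example_pti_hide(void)
{

	pmap_pti_remove_kva((vm_offset_t)example_uvisible_page,
	    (vm_offset_t)example_uvisible_page + PAGE_SIZE);
}
#endif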
static void *
pkru_dup_range(void *ctx __unused, void *data)
{
	struct pmap_pkru_range *node, *new_node;

	new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
	if (new_node == NULL)
		return (NULL);
	node = data;
	memcpy(new_node, node, sizeof(*node));
	return (new_node);
}

static void
pkru_free_range(void *ctx __unused, void *node)
{

	uma_zfree(pmap_pkru_ranges_zone, node);
}

static int
pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
    int flags)
{
	struct pmap_pkru_range *ppr;
	int error;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
	if ((flags & AMD64_PKRU_EXCL) != 0 &&
	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
		return (EBUSY);
	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
	if (ppr == NULL)
		return (ENOMEM);
	ppr->pkru_keyidx = keyidx;
	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
	if (error != 0)
		uma_zfree(pmap_pkru_ranges_zone, ppr);
	return (error);
}

static int
pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
}

static void
pmap_pkru_deassign_all(pmap_t pmap)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type == PT_X86 &&
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
		rangeset_remove_all(&pmap->pm_pkru);
}

static bool
pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pkru_range *ppr, *prev_ppr;
	vm_offset_t va;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type != PT_X86 ||
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
	    sva >= VM_MAXUSER_ADDRESS)
		return (true);
	MPASS(eva <= VM_MAXUSER_ADDRESS);
	for (va = sva, prev_ppr = NULL; va < eva;) {
		ppr = rangeset_lookup(&pmap->pm_pkru, va);
		if ((ppr == NULL) ^ (prev_ppr == NULL))
			return (false);
		if (ppr == NULL) {
			va += PAGE_SIZE;
			continue;
		}
		if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
			return (false);
		va = ppr->pkru_rs_el.re_end;
	}
	return (true);
}

static pt_entry_t
pmap_pkru_get(pmap_t pmap, vm_offset_t va)
{
	struct pmap_pkru_range *ppr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type != PT_X86 ||
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
	    va >= VM_MAXUSER_ADDRESS)
		return (0);
	ppr = rangeset_lookup(&pmap->pm_pkru, va);
	if (ppr != NULL)
		return (X86_PG_PKU(ppr->pkru_keyidx));
	return (0);
}

static bool
pred_pkru_on_remove(void *ctx __unused, void *r)
{
	struct pmap_pkru_range *ppr;

	ppr = r;
	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
}

static void
pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_type == PT_X86 &&
	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
		    pred_pkru_on_remove);
	}
}

static int
pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
{

	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
	MPASS(dst_pmap->pm_type == PT_X86);
	MPASS(src_pmap->pm_type == PT_X86);
	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
		return (0);
	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
}

static void
pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    u_int keyidx)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t newpde, ptpaddr, *pde;
	pt_entry_t newpte, *ptep, pte;
	vm_offset_t va, va_next;
	bool changed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(pmap->pm_type == PT_X86);
	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);

	for (changed = false, va = sva; va < eva; va = va_next) {
		pml4e = pmap_pml4e(pmap, va);
		if ((*pml4e & X86_PG_V) == 0) {
			va_next = (va + NBPML4) & ~PML4MASK;
			if (va_next < va)
				va_next = eva;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
		if ((*pdpe & X86_PG_V) == 0) {
			va_next = (va + NBPDP) & ~PDPMASK;
			if (va_next < va)
				va_next = eva;
			continue;
		}

		va_next = (va + NBPDR) & ~PDRMASK;
		if (va_next < va)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, va);
		ptpaddr = *pde;
		if (ptpaddr == 0)
			continue;

		MPASS((ptpaddr & X86_PG_V) != 0);
		if ((ptpaddr & PG_PS) != 0) {
			if (va + NBPDR == va_next && eva >= va_next) {
				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
				    X86_PG_PKU(keyidx);
				if (newpde != ptpaddr) {
					*pde = newpde;
					changed = true;
				}
				continue;
			} else if (!pmap_demote_pde(pmap, pde, va)) {
				continue;
			}
		}

		if (va_next > eva)
			va_next = eva;

		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
		    ptep++, va += PAGE_SIZE) {
			pte = *ptep;
			if ((pte & X86_PG_V) == 0)
				continue;
			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
			if (newpte != pte) {
				*ptep = newpte;
				changed = true;
			}
		}
	}
	if (changed)
		pmap_invalidate_range(pmap, sva, eva);
}

static int
pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    u_int keyidx, int flags)
{

	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
		return (EINVAL);
	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
		return (EFAULT);
	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
		return (ENOTSUP);
	return (0);
}

int
pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
    int flags)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, keyidx);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

int
pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	sva = trunc_page(sva);
	eva = round_page(eva);
	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
	if (error != 0)
		return (error);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_pkru_deassign(pmap, sva, eva);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, 0);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}
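/*
 * Example (illustrative sketch, compiled out): tagging a page-aligned
 * user address range of a process with protection key 1 and then
 * clearing the assignment again.  These functions are normally reached
 * through the machine-dependent sysarch(2) protection-key requests;
 * the direct calls below only illustrate that the caller does not hold
 * the pmap lock and that the range is rounded to page boundaries
 * internally.
 */
#if 0
static int
example_pkru_tag(struct proc *p, vm_offset_t start, vm_size_t len)
{
	pmap_t pm;
	int error;

	pm = vmspace_pmap(p->p_vmspace);
	error = pmap_pkru_set(pm, start, start + len, 1, AMD64_PKRU_PERSIST);
	if (error != 0)
		return (error);
	return (pmap_pkru_clear(pm, start, start + len));
}
#endif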
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int ptes;
	int pdes;
	int pdpes;
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int i, pat_idx;

	if (eva <= range->sva)
		return;

	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		if (pat_index[i] == pat_idx)
			break;

	switch (i) {
	case PAT_WRITE_BACK:
		mode = "WB";
		break;
	case PAT_WRITE_THROUGH:
		mode = "WT";
		break;
	case PAT_UNCACHEABLE:
		mode = "UC";
		break;
	case PAT_WRITE_PROTECTED:
		mode = "WP";
		break;
	case PAT_WRITE_COMBINING:
		mode = "WC";
		break;
	default:
		printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n",
		    __func__, i, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
	    mode, range->pdpes, range->pdes, range->ptes);

	/* Reset to sentinel value. */
	range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.  This is not quite as simple as a direct
 * flag comparison since some PAT modes have multiple representations.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{
	pt_entry_t diff, mask;

	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
	diff = (range->attrs ^ attrs) & mask;
	if (diff == 0)
		return (true);
	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
	    pmap_pat_index(kernel_pmap, attrs, true))
		return (true);
	return (false);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}
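/*
 * The cache mode of a mapping is selected by its PAT, PCD and PWT bits,
 * but the PAT bit sits at bit 7 in 4KB PTEs and at bit 12 in large-page
 * PDEs and PDPEs.  The same mode can therefore have two encodings,
 * which is why sysctl_kmaps_match() above falls back to comparing
 * pmap_pat_index() results and why sysctl_kmaps_check() below
 * canonicalizes leaf PTE attributes to the PDE encoding.
 */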
/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
    pt_entry_t pte)
{
	pt_entry_t attrs;

	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);

	attrs |= pdpe & pg_nx;
	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
	if ((pdpe & PG_PS) != 0) {
		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pde != 0) {
		attrs |= pde & pg_nx;
		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
	}
	if ((pde & PG_PS) != 0) {
		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit. */
		if ((attrs & X86_PG_PTE_PAT) != 0)
			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
	}

	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t pml4e;
	pdp_entry_t *pdp, pdpe;
	pd_entry_t *pd, pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Outside of the large map, kernel page table pages are never
	 * freed, so at worst we will observe inconsistencies in the output.
	 * Within the large map, ensure that PDP and PD page addresses are
	 * valid before descending.
	 */
	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

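		/*
		 * PML4 slots 256 and above map addresses with bit 47 set;
		 * such addresses are canonical only when bits 48-63
		 * replicate bit 47, so sign-extend sva when the walk
		 * crosses into the upper half of the address space.
		 */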
		/* Convert to canonical form. */
		if (sva == 1ul << 47)
			sva |= -1ul << 48;

restart:
		pml4e = kernel_pmap->pm_pml4[i];
		if ((pml4e & X86_PG_V) == 0) {
			sva = rounddown2(sva, NBPML4);
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPML4;
			continue;
		}
		pa = pml4e & PG_FRAME;
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
			pdpe = pdp[j];
			if ((pdpe & X86_PG_V) == 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_dump(sb, &range, sva);
				sva += NBPDP;
				continue;
			}
			pa = pdpe & PG_FRAME;
			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
			    vm_phys_paddr_to_vm_page(pa) == NULL)
				goto restart;
			if ((pdpe & PG_PS) != 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
				    0, 0);
				range.pdpes++;
				sva += NBPDP;
				continue;
			}
			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
				pde = pd[k];
				if ((pde & X86_PG_V) == 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_dump(sb, &range, sva);
					sva += NBPDR;
					continue;
				}
				pa = pde & PG_FRAME;
				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
				    vm_phys_paddr_to_vm_page(pa) == NULL)
					goto restart;
				if ((pde & PG_PS) != 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, 0);
					range.pdes++;
					sva += NBPDR;
					continue;
				}
				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
				    sva += PAGE_SIZE) {
					pte = pt[l];
					if ((pte & X86_PG_V) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, pte);
					range.ptes++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
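/*
 * The resulting layout can be inspected from userland with
 * "sysctl vm.pmap.kernel_maps".  Each output line describes one run of
 * mappings with identical attributes, for example (values illustrative
 * only):
 *
 *	0xffffffff81000000-0xffffffff81a00000 r-xsg WB 0 5 0
 *
 * i.e. the start and end addresses, the write/execute/user/global
 * flags, the cache mode, and the number of 1GB, 2MB and 4KB mappings
 * backing the run.
 */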

#ifdef DDB
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	pmap_t pmap;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_offset_t va;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);
	pml4 = pmap_pml4e(pmap, va);
	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
	if ((*pml4 & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	pdp = pmap_pml4e_to_pdpe(pml4, va);
	db_printf(" pdpe %#016lx", *pdp);
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pde = pmap_pdpe_to_pde(pdp, va);
	db_printf(" pde %#016lx", *pde);
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_pde_to_pte(pde, va);
	db_printf(" pte %#016lx\n", *pte);
}

DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
{
	vm_paddr_t a;

	if (have_addr) {
		a = (vm_paddr_t)addr;
		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
	} else {
		db_printf("show phys2dmap addr\n");
	}
}
#endif
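/*
 * From the in-kernel debugger the commands above are invoked as, e.g.,
 * "show pte 0xffffffff81234000" to print the PML4E/PDPE/PDE/PTE chain
 * for an address in the pmap of the thread being debugged, and
 * "show phys2dmap 0x200000" to translate a physical address into its
 * direct map address.  (The addresses are illustrative.)
 */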