/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * Copyright (c) 2014-2018 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time. However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary. This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
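 *
 * One instance of such deferral in this file is the Delayed
 * Invalidation (DI) machinery (pmap_delayed_invl_started(),
 * pmap_delayed_invl_finished(), pmap_delayed_invl_wait(), and
 * pmap_delayed_invl_page() below), which lets a thread tear down the
 * page table and PV entries for a mapping first and complete the
 * corresponding TLB shootdowns afterwards.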
108 */ 109 110 #include "opt_pmap.h" 111 #include "opt_vm.h" 112 113 #include <sys/param.h> 114 #include <sys/bitstring.h> 115 #include <sys/bus.h> 116 #include <sys/systm.h> 117 #include <sys/kernel.h> 118 #include <sys/ktr.h> 119 #include <sys/lock.h> 120 #include <sys/malloc.h> 121 #include <sys/mman.h> 122 #include <sys/mutex.h> 123 #include <sys/proc.h> 124 #include <sys/rwlock.h> 125 #include <sys/sx.h> 126 #include <sys/turnstile.h> 127 #include <sys/vmem.h> 128 #include <sys/vmmeter.h> 129 #include <sys/sched.h> 130 #include <sys/sysctl.h> 131 #include <sys/smp.h> 132 133 #include <vm/vm.h> 134 #include <vm/vm_param.h> 135 #include <vm/vm_kern.h> 136 #include <vm/vm_page.h> 137 #include <vm/vm_map.h> 138 #include <vm/vm_object.h> 139 #include <vm/vm_extern.h> 140 #include <vm/vm_pageout.h> 141 #include <vm/vm_pager.h> 142 #include <vm/vm_phys.h> 143 #include <vm/vm_radix.h> 144 #include <vm/vm_reserv.h> 145 #include <vm/uma.h> 146 147 #include <machine/intr_machdep.h> 148 #include <x86/apicvar.h> 149 #include <machine/cpu.h> 150 #include <machine/cputypes.h> 151 #include <machine/md_var.h> 152 #include <machine/pcb.h> 153 #include <machine/specialreg.h> 154 #ifdef SMP 155 #include <machine/smp.h> 156 #endif 157 #include <machine/tss.h> 158 159 static __inline boolean_t 160 pmap_type_guest(pmap_t pmap) 161 { 162 163 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 164 } 165 166 static __inline boolean_t 167 pmap_emulate_ad_bits(pmap_t pmap) 168 { 169 170 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 171 } 172 173 static __inline pt_entry_t 174 pmap_valid_bit(pmap_t pmap) 175 { 176 pt_entry_t mask; 177 178 switch (pmap->pm_type) { 179 case PT_X86: 180 case PT_RVI: 181 mask = X86_PG_V; 182 break; 183 case PT_EPT: 184 if (pmap_emulate_ad_bits(pmap)) 185 mask = EPT_PG_EMUL_V; 186 else 187 mask = EPT_PG_READ; 188 break; 189 default: 190 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 191 } 192 193 return (mask); 194 } 195 196 static __inline pt_entry_t 197 pmap_rw_bit(pmap_t pmap) 198 { 199 pt_entry_t mask; 200 201 switch (pmap->pm_type) { 202 case PT_X86: 203 case PT_RVI: 204 mask = X86_PG_RW; 205 break; 206 case PT_EPT: 207 if (pmap_emulate_ad_bits(pmap)) 208 mask = EPT_PG_EMUL_RW; 209 else 210 mask = EPT_PG_WRITE; 211 break; 212 default: 213 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 214 } 215 216 return (mask); 217 } 218 219 static pt_entry_t pg_g; 220 221 static __inline pt_entry_t 222 pmap_global_bit(pmap_t pmap) 223 { 224 pt_entry_t mask; 225 226 switch (pmap->pm_type) { 227 case PT_X86: 228 mask = pg_g; 229 break; 230 case PT_RVI: 231 case PT_EPT: 232 mask = 0; 233 break; 234 default: 235 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 236 } 237 238 return (mask); 239 } 240 241 static __inline pt_entry_t 242 pmap_accessed_bit(pmap_t pmap) 243 { 244 pt_entry_t mask; 245 246 switch (pmap->pm_type) { 247 case PT_X86: 248 case PT_RVI: 249 mask = X86_PG_A; 250 break; 251 case PT_EPT: 252 if (pmap_emulate_ad_bits(pmap)) 253 mask = EPT_PG_READ; 254 else 255 mask = EPT_PG_A; 256 break; 257 default: 258 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 259 } 260 261 return (mask); 262 } 263 264 static __inline pt_entry_t 265 pmap_modified_bit(pmap_t pmap) 266 { 267 pt_entry_t mask; 268 269 switch (pmap->pm_type) { 270 case PT_X86: 271 case PT_RVI: 272 mask = X86_PG_M; 273 break; 274 case PT_EPT: 275 if (pmap_emulate_ad_bits(pmap)) 276 mask = EPT_PG_WRITE; 277 else 278 mask = EPT_PG_M; 279 break; 280 default: 281 
panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 282 } 283 284 return (mask); 285 } 286 287 #if !defined(DIAGNOSTIC) 288 #ifdef __GNUC_GNU_INLINE__ 289 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 290 #else 291 #define PMAP_INLINE extern inline 292 #endif 293 #else 294 #define PMAP_INLINE 295 #endif 296 297 #ifdef PV_STATS 298 #define PV_STAT(x) do { x ; } while (0) 299 #else 300 #define PV_STAT(x) do { } while (0) 301 #endif 302 303 #define pa_index(pa) ((pa) >> PDRSHIFT) 304 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 305 306 #define NPV_LIST_LOCKS MAXCPU 307 308 #define PHYS_TO_PV_LIST_LOCK(pa) \ 309 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 310 311 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 312 struct rwlock **_lockp = (lockp); \ 313 struct rwlock *_new_lock; \ 314 \ 315 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 316 if (_new_lock != *_lockp) { \ 317 if (*_lockp != NULL) \ 318 rw_wunlock(*_lockp); \ 319 *_lockp = _new_lock; \ 320 rw_wlock(*_lockp); \ 321 } \ 322 } while (0) 323 324 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 325 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 326 327 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 328 struct rwlock **_lockp = (lockp); \ 329 \ 330 if (*_lockp != NULL) { \ 331 rw_wunlock(*_lockp); \ 332 *_lockp = NULL; \ 333 } \ 334 } while (0) 335 336 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 337 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 338 339 struct pmap kernel_pmap_store; 340 341 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 342 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 343 344 int nkpt; 345 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 346 "Number of kernel page table pages allocated on bootup"); 347 348 static int ndmpdp; 349 vm_paddr_t dmaplimit; 350 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 351 pt_entry_t pg_nx; 352 353 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 354 355 static int pat_works = 1; 356 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 357 "Is page attribute table fully functional?"); 358 359 static int pg_ps_enabled = 1; 360 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 361 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 362 363 #define PAT_INDEX_SIZE 8 364 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 365 366 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 367 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 368 u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 369 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 370 371 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 372 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 373 static int ndmpdpphys; /* number of DMPDPphys pages */ 374 375 static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ 376 377 /* 378 * pmap_mapdev support pre initialization (i.e. console) 379 */ 380 #define PMAP_PREINIT_MAPPING_COUNT 8 381 static struct pmap_preinit_mapping { 382 vm_paddr_t pa; 383 vm_offset_t va; 384 vm_size_t sz; 385 int mode; 386 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 387 static int pmap_initialized; 388 389 /* 390 * Data for the pv entry allocation mechanism. 391 * Updates to pv_invl_gen are protected by the pv_list_locks[] 392 * elements, but reads are not. 
393 */ 394 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 395 static struct mtx __exclusive_cache_line pv_chunks_mutex; 396 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 397 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 398 static struct md_page *pv_table; 399 static struct md_page pv_dummy; 400 401 /* 402 * All those kernel PT submaps that BSD is so fond of 403 */ 404 pt_entry_t *CMAP1 = NULL; 405 caddr_t CADDR1 = 0; 406 static vm_offset_t qframe = 0; 407 static struct mtx qframe_mtx; 408 409 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 410 411 int pmap_pcid_enabled = 1; 412 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 413 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 414 int invpcid_works = 0; 415 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 416 "Is the invpcid instruction available ?"); 417 418 int __read_frequently pti = 0; 419 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 420 &pti, 0, 421 "Page Table Isolation enabled"); 422 static vm_object_t pti_obj; 423 static pml4_entry_t *pti_pml4; 424 static vm_pindex_t pti_pg_idx; 425 static bool pti_finalized; 426 427 static int 428 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) 429 { 430 int i; 431 uint64_t res; 432 433 res = 0; 434 CPU_FOREACH(i) { 435 res += cpuid_to_pcpu[i]->pc_pm_save_cnt; 436 } 437 return (sysctl_handle_64(oidp, &res, 0, req)); 438 } 439 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | 440 CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", 441 "Count of saved TLB context on switch"); 442 443 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 444 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 445 static struct mtx invl_gen_mtx; 446 static u_long pmap_invl_gen = 0; 447 /* Fake lock object to satisfy turnstiles interface. */ 448 static struct lock_object invl_gen_ts = { 449 .lo_name = "invlts", 450 }; 451 452 static bool 453 pmap_not_in_di(void) 454 { 455 456 return (curthread->td_md.md_invl_gen.gen == 0); 457 } 458 459 #define PMAP_ASSERT_NOT_IN_DI() \ 460 KASSERT(pmap_not_in_di(), ("DI already started")) 461 462 /* 463 * Start a new Delayed Invalidation (DI) block of code, executed by 464 * the current thread. Within a DI block, the current thread may 465 * destroy both the page table and PV list entries for a mapping and 466 * then release the corresponding PV list lock before ensuring that 467 * the mapping is flushed from the TLBs of any processors with the 468 * pmap active. 469 */ 470 static void 471 pmap_delayed_invl_started(void) 472 { 473 struct pmap_invl_gen *invl_gen; 474 u_long currgen; 475 476 invl_gen = &curthread->td_md.md_invl_gen; 477 PMAP_ASSERT_NOT_IN_DI(); 478 mtx_lock(&invl_gen_mtx); 479 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 480 currgen = pmap_invl_gen; 481 else 482 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 483 invl_gen->gen = currgen + 1; 484 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 485 mtx_unlock(&invl_gen_mtx); 486 } 487 488 /* 489 * Finish the DI block, previously started by the current thread. All 490 * required TLB flushes for the pages marked by 491 * pmap_delayed_invl_page() must be finished before this function is 492 * called. 493 * 494 * This function works by bumping the global DI generation number to 495 * the generation number of the current thread's DI, unless there is a 496 * pending DI that started earlier. 
In the latter case, bumping the 497 * global DI generation number would incorrectly signal that the 498 * earlier DI had finished. Instead, this function bumps the earlier 499 * DI's generation number to match the generation number of the 500 * current thread's DI. 501 */ 502 static void 503 pmap_delayed_invl_finished(void) 504 { 505 struct pmap_invl_gen *invl_gen, *next; 506 struct turnstile *ts; 507 508 invl_gen = &curthread->td_md.md_invl_gen; 509 KASSERT(invl_gen->gen != 0, ("missed invl_started")); 510 mtx_lock(&invl_gen_mtx); 511 next = LIST_NEXT(invl_gen, link); 512 if (next == NULL) { 513 turnstile_chain_lock(&invl_gen_ts); 514 ts = turnstile_lookup(&invl_gen_ts); 515 pmap_invl_gen = invl_gen->gen; 516 if (ts != NULL) { 517 turnstile_broadcast(ts, TS_SHARED_QUEUE); 518 turnstile_unpend(ts); 519 } 520 turnstile_chain_unlock(&invl_gen_ts); 521 } else { 522 next->gen = invl_gen->gen; 523 } 524 LIST_REMOVE(invl_gen, link); 525 mtx_unlock(&invl_gen_mtx); 526 invl_gen->gen = 0; 527 } 528 529 #ifdef PV_STATS 530 static long invl_wait; 531 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, 532 "Number of times DI invalidation blocked pmap_remove_all/write"); 533 #endif 534 535 static u_long * 536 pmap_delayed_invl_genp(vm_page_t m) 537 { 538 539 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 540 } 541 542 /* 543 * Ensure that all currently executing DI blocks, that need to flush 544 * TLB for the given page m, actually flushed the TLB at the time the 545 * function returned. If the page m has an empty PV list and we call 546 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 547 * valid mapping for the page m in either its page table or TLB. 548 * 549 * This function works by blocking until the global DI generation 550 * number catches up with the generation number associated with the 551 * given page m and its PV list. Since this function's callers 552 * typically own an object lock and sometimes own a page lock, it 553 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 554 * processor. 555 */ 556 static void 557 pmap_delayed_invl_wait(vm_page_t m) 558 { 559 struct turnstile *ts; 560 u_long *m_gen; 561 #ifdef PV_STATS 562 bool accounted = false; 563 #endif 564 565 m_gen = pmap_delayed_invl_genp(m); 566 while (*m_gen > pmap_invl_gen) { 567 #ifdef PV_STATS 568 if (!accounted) { 569 atomic_add_long(&invl_wait, 1); 570 accounted = true; 571 } 572 #endif 573 ts = turnstile_trywait(&invl_gen_ts); 574 if (*m_gen > pmap_invl_gen) 575 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 576 else 577 turnstile_cancel(ts); 578 } 579 } 580 581 /* 582 * Mark the page m's PV list as participating in the current thread's 583 * DI block. Any threads concurrently using m's PV list to remove or 584 * restrict all mappings to m will wait for the current thread's DI 585 * block to complete before proceeding. 586 * 587 * The function works by setting the DI generation number for m's PV 588 * list to at least the DI generation number of the current thread. 589 * This forces a caller of pmap_delayed_invl_wait() to block until 590 * current thread calls pmap_delayed_invl_finished(). 591 */ 592 static void 593 pmap_delayed_invl_page(vm_page_t m) 594 { 595 u_long gen, *m_gen; 596 597 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 598 gen = curthread->td_md.md_invl_gen.gen; 599 if (gen == 0) 600 return; 601 m_gen = pmap_delayed_invl_genp(m); 602 if (*m_gen < gen) 603 *m_gen = gen; 604 } 605 606 /* 607 * Crashdump maps. 
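 * crashdumpmap is MAXDUMPPGS pages of KVA set up in pmap_bootstrap()
 * via SYSMAP(); the dump code uses it to temporarily map physical
 * pages while a crash dump is written, and its first page is reused
 * as CMAP1/CADDR1 for the early memory test.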
608 */ 609 static caddr_t crashdumpmap; 610 611 /* 612 * Internal flags for pmap_enter()'s helper functions. 613 */ 614 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 615 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 616 617 static void free_pv_chunk(struct pv_chunk *pc); 618 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 619 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 620 static int popcnt_pc_map_pq(uint64_t *map); 621 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 622 static void reserve_pv_entries(pmap_t pmap, int needed, 623 struct rwlock **lockp); 624 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 625 struct rwlock **lockp); 626 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 627 u_int flags, struct rwlock **lockp); 628 #if VM_NRESERVLEVEL > 0 629 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 630 struct rwlock **lockp); 631 #endif 632 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 633 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 634 vm_offset_t va); 635 636 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 637 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 638 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 639 vm_offset_t va, struct rwlock **lockp); 640 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 641 vm_offset_t va); 642 static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 643 vm_prot_t prot, struct rwlock **lockp); 644 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 645 u_int flags, vm_page_t m, struct rwlock **lockp); 646 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 647 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 648 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 649 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 650 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 651 pd_entry_t pde); 652 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 653 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); 654 #if VM_NRESERVLEVEL > 0 655 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 656 struct rwlock **lockp); 657 #endif 658 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 659 vm_prot_t prot); 660 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); 661 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 662 bool exec); 663 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 664 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 665 static void pmap_pti_wire_pte(void *pte); 666 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 667 struct spglist *free, struct rwlock **lockp); 668 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 669 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 670 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 671 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 672 struct spglist *free); 673 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 674 pd_entry_t *pde, struct spglist *free, 675 struct rwlock **lockp); 676 
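
/*
 * Illustrative sketch (not compiled in): the usage pattern that the
 * PV list lock macros defined above are designed for. A caller keeps
 * at most one pv list lock write-held and switches it as the physical
 * page of interest changes; the helper name and the single vm_page_t
 * argument are for exposition only.
 */
#if 0
static void
pmap_pv_lock_usage_sketch(vm_page_t m)
{
	struct rwlock *lock;

	lock = NULL;
	/* Acquire (or switch to) the pv list lock covering m. */
	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
	/* ... manipulate m's PV list while the lock is write-held ... */
	/* Drop whatever pv list lock is currently held, if any. */
	RELEASE_PV_LIST_LOCK(&lock);
}
#endif
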
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 677 vm_page_t m, struct rwlock **lockp); 678 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 679 pd_entry_t newpde); 680 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 681 682 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 683 struct rwlock **lockp); 684 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, 685 struct rwlock **lockp); 686 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 687 struct rwlock **lockp); 688 689 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 690 struct spglist *free); 691 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 692 693 /********************/ 694 /* Inline functions */ 695 /********************/ 696 697 /* Return a non-clipped PD index for a given VA */ 698 static __inline vm_pindex_t 699 pmap_pde_pindex(vm_offset_t va) 700 { 701 return (va >> PDRSHIFT); 702 } 703 704 705 /* Return a pointer to the PML4 slot that corresponds to a VA */ 706 static __inline pml4_entry_t * 707 pmap_pml4e(pmap_t pmap, vm_offset_t va) 708 { 709 710 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 711 } 712 713 /* Return a pointer to the PDP slot that corresponds to a VA */ 714 static __inline pdp_entry_t * 715 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 716 { 717 pdp_entry_t *pdpe; 718 719 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 720 return (&pdpe[pmap_pdpe_index(va)]); 721 } 722 723 /* Return a pointer to the PDP slot that corresponds to a VA */ 724 static __inline pdp_entry_t * 725 pmap_pdpe(pmap_t pmap, vm_offset_t va) 726 { 727 pml4_entry_t *pml4e; 728 pt_entry_t PG_V; 729 730 PG_V = pmap_valid_bit(pmap); 731 pml4e = pmap_pml4e(pmap, va); 732 if ((*pml4e & PG_V) == 0) 733 return (NULL); 734 return (pmap_pml4e_to_pdpe(pml4e, va)); 735 } 736 737 /* Return a pointer to the PD slot that corresponds to a VA */ 738 static __inline pd_entry_t * 739 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 740 { 741 pd_entry_t *pde; 742 743 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 744 return (&pde[pmap_pde_index(va)]); 745 } 746 747 /* Return a pointer to the PD slot that corresponds to a VA */ 748 static __inline pd_entry_t * 749 pmap_pde(pmap_t pmap, vm_offset_t va) 750 { 751 pdp_entry_t *pdpe; 752 pt_entry_t PG_V; 753 754 PG_V = pmap_valid_bit(pmap); 755 pdpe = pmap_pdpe(pmap, va); 756 if (pdpe == NULL || (*pdpe & PG_V) == 0) 757 return (NULL); 758 return (pmap_pdpe_to_pde(pdpe, va)); 759 } 760 761 /* Return a pointer to the PT slot that corresponds to a VA */ 762 static __inline pt_entry_t * 763 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 764 { 765 pt_entry_t *pte; 766 767 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 768 return (&pte[pmap_pte_index(va)]); 769 } 770 771 /* Return a pointer to the PT slot that corresponds to a VA */ 772 static __inline pt_entry_t * 773 pmap_pte(pmap_t pmap, vm_offset_t va) 774 { 775 pd_entry_t *pde; 776 pt_entry_t PG_V; 777 778 PG_V = pmap_valid_bit(pmap); 779 pde = pmap_pde(pmap, va); 780 if (pde == NULL || (*pde & PG_V) == 0) 781 return (NULL); 782 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 783 return ((pt_entry_t *)pde); 784 return (pmap_pde_to_pte(pde, va)); 785 } 786 787 static __inline void 788 pmap_resident_count_inc(pmap_t pmap, int count) 789 { 790 791 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 792 pmap->pm_stats.resident_count += count; 793 } 794 795 static __inline void 
796 pmap_resident_count_dec(pmap_t pmap, int count) 797 { 798 799 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 800 KASSERT(pmap->pm_stats.resident_count >= count, 801 ("pmap %p resident count underflow %ld %d", pmap, 802 pmap->pm_stats.resident_count, count)); 803 pmap->pm_stats.resident_count -= count; 804 } 805 806 PMAP_INLINE pt_entry_t * 807 vtopte(vm_offset_t va) 808 { 809 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 810 811 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 812 813 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 814 } 815 816 static __inline pd_entry_t * 817 vtopde(vm_offset_t va) 818 { 819 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 820 821 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 822 823 return (PDmap + ((va >> PDRSHIFT) & mask)); 824 } 825 826 static u_int64_t 827 allocpages(vm_paddr_t *firstaddr, int n) 828 { 829 u_int64_t ret; 830 831 ret = *firstaddr; 832 bzero((void *)ret, n * PAGE_SIZE); 833 *firstaddr += n * PAGE_SIZE; 834 return (ret); 835 } 836 837 CTASSERT(powerof2(NDMPML4E)); 838 839 /* number of kernel PDP slots */ 840 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 841 842 static void 843 nkpt_init(vm_paddr_t addr) 844 { 845 int pt_pages; 846 847 #ifdef NKPT 848 pt_pages = NKPT; 849 #else 850 pt_pages = howmany(addr, 1 << PDRSHIFT); 851 pt_pages += NKPDPE(pt_pages); 852 853 /* 854 * Add some slop beyond the bare minimum required for bootstrapping 855 * the kernel. 856 * 857 * This is quite important when allocating KVA for kernel modules. 858 * The modules are required to be linked in the negative 2GB of 859 * the address space. If we run out of KVA in this region then 860 * pmap_growkernel() will need to allocate page table pages to map 861 * the entire 512GB of KVA space which is an unnecessary tax on 862 * physical memory. 863 * 864 * Secondly, device memory mapped as part of setting up the low- 865 * level console(s) is taken from KVA, starting at virtual_avail. 866 * This is because cninit() is called after pmap_bootstrap() but 867 * before vm_init() and pmap_init(). 20MB for a frame buffer is 868 * not uncommon. 869 */ 870 pt_pages += 32; /* 64MB additional slop. */ 871 #endif 872 nkpt = pt_pages; 873 } 874 875 /* 876 * Returns the proper write/execute permission for a physical page that is 877 * part of the initial boot allocations. 878 * 879 * If the page has kernel text, it is marked as read-only. If the page has 880 * kernel read-only data, it is marked as read-only/not-executable. If the 881 * page has only read-write data, it is marked as read-write/not-executable. 882 * If the page is below/above the kernel range, it is marked as read-write. 883 * 884 * This function operates on 2M pages, since we map the kernel space that 885 * way. 886 * 887 * Note that this doesn't currently provide any protection for modules. 888 */ 889 static inline pt_entry_t 890 bootaddr_rwx(vm_paddr_t pa) 891 { 892 893 /* 894 * Everything in the same 2M page as the start of the kernel 895 * should be static. On the other hand, things in the same 2M 896 * page as the end of the kernel could be read-write/executable, 897 * as the kernel image is not guaranteed to end on a 2M boundary. 
898 */ 899 if (pa < trunc_2mpage(btext - KERNBASE) || 900 pa >= trunc_2mpage(_end - KERNBASE)) 901 return (X86_PG_RW); 902 /* 903 * The linker should ensure that the read-only and read-write 904 * portions don't share the same 2M page, so this shouldn't 905 * impact read-only data. However, in any case, any page with 906 * read-write data needs to be read-write. 907 */ 908 if (pa >= trunc_2mpage(brwsection - KERNBASE)) 909 return (X86_PG_RW | pg_nx); 910 /* 911 * Mark any 2M page containing kernel text as read-only. Mark 912 * other pages with read-only data as read-only and not executable. 913 * (It is likely a small portion of the read-only data section will 914 * be marked as read-only, but executable. This should be acceptable 915 * since the read-only protection will keep the data from changing.) 916 * Note that fixups to the .text section will still work until we 917 * set CR0.WP. 918 */ 919 if (pa < round_2mpage(etext - KERNBASE)) 920 return (0); 921 return (pg_nx); 922 } 923 924 static void 925 create_pagetables(vm_paddr_t *firstaddr) 926 { 927 int i, j, ndm1g, nkpdpe, nkdmpde; 928 pt_entry_t *pt_p; 929 pd_entry_t *pd_p; 930 pdp_entry_t *pdp_p; 931 pml4_entry_t *p4_p; 932 uint64_t DMPDkernphys; 933 934 /* Allocate page table pages for the direct map */ 935 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 936 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 937 ndmpdp = 4; 938 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 939 if (ndmpdpphys > NDMPML4E) { 940 /* 941 * Each NDMPML4E allows 512 GB, so limit to that, 942 * and then readjust ndmpdp and ndmpdpphys. 943 */ 944 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 945 Maxmem = atop(NDMPML4E * NBPML4); 946 ndmpdpphys = NDMPML4E; 947 ndmpdp = NDMPML4E * NPDEPG; 948 } 949 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 950 ndm1g = 0; 951 if ((amd_feature & AMDID_PAGE1GB) != 0) { 952 /* 953 * Calculate the number of 1G pages that will fully fit in 954 * Maxmem. 955 */ 956 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 957 958 /* 959 * Allocate 2M pages for the kernel. These will be used in 960 * place of the first one or more 1G pages from ndm1g. 961 */ 962 nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); 963 DMPDkernphys = allocpages(firstaddr, nkdmpde); 964 } 965 if (ndm1g < ndmpdp) 966 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 967 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 968 969 /* Allocate pages */ 970 KPML4phys = allocpages(firstaddr, 1); 971 KPDPphys = allocpages(firstaddr, NKPML4E); 972 973 /* 974 * Allocate the initial number of kernel page table pages required to 975 * bootstrap. We defer this until after all memory-size dependent 976 * allocations are done (e.g. direct map), so that we don't have to 977 * build in too much slop in our estimate. 978 * 979 * Note that when NKPML4E > 1, we have an empty page underneath 980 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 981 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
982 */ 983 nkpt_init(*firstaddr); 984 nkpdpe = NKPDPE(nkpt); 985 986 KPTphys = allocpages(firstaddr, nkpt); 987 KPDphys = allocpages(firstaddr, nkpdpe); 988 989 /* Fill in the underlying page table pages */ 990 /* XXX not fully used, underneath 2M pages */ 991 pt_p = (pt_entry_t *)KPTphys; 992 for (i = 0; ptoa(i) < *firstaddr; i++) 993 pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i)); 994 995 /* Now map the page tables at their location within PTmap */ 996 pd_p = (pd_entry_t *)KPDphys; 997 for (i = 0; i < nkpt; i++) 998 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 999 1000 /* Map from zero to end of allocations under 2M pages */ 1001 /* This replaces some of the KPTphys entries above */ 1002 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) 1003 /* Preset PG_M and PG_A because demotion expects it. */ 1004 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | 1005 X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); 1006 1007 /* 1008 * Because we map the physical blocks in 2M pages, adjust firstaddr 1009 * to record the physical blocks we've actually mapped into kernel 1010 * virtual address space. 1011 */ 1012 *firstaddr = round_2mpage(*firstaddr); 1013 1014 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1015 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1016 for (i = 0; i < nkpdpe; i++) 1017 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1018 1019 /* 1020 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1021 * the end of physical memory is not aligned to a 1GB page boundary, 1022 * then the residual physical memory is mapped with 2MB pages. Later, 1023 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1024 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1025 * that are partially used. 1026 */ 1027 pd_p = (pd_entry_t *)DMPDphys; 1028 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1029 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1030 /* Preset PG_M and PG_A because demotion expects it. */ 1031 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1032 X86_PG_M | X86_PG_A | pg_nx; 1033 } 1034 pdp_p = (pdp_entry_t *)DMPDPphys; 1035 for (i = 0; i < ndm1g; i++) { 1036 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1037 /* Preset PG_M and PG_A because demotion expects it. */ 1038 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1039 X86_PG_M | X86_PG_A | pg_nx; 1040 } 1041 for (j = 0; i < ndmpdp; i++, j++) { 1042 pdp_p[i] = DMPDphys + ptoa(j); 1043 pdp_p[i] |= X86_PG_RW | X86_PG_V; 1044 } 1045 1046 /* 1047 * Instead of using a 1G page for the memory containing the kernel, 1048 * use 2M pages with appropriate permissions. (If using 1G pages, 1049 * this will partially overwrite the PDPEs above.) 1050 */ 1051 if (ndm1g) { 1052 pd_p = (pd_entry_t *)DMPDkernphys; 1053 for (i = 0; i < (NPDEPG * nkdmpde); i++) 1054 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | 1055 X86_PG_M | X86_PG_A | pg_nx | 1056 bootaddr_rwx(i << PDRSHIFT); 1057 for (i = 0; i < nkdmpde; i++) 1058 pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | 1059 X86_PG_V; 1060 } 1061 1062 /* And recursively map PML4 to itself in order to get PTmap */ 1063 p4_p = (pml4_entry_t *)KPML4phys; 1064 p4_p[PML4PML4I] = KPML4phys; 1065 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1066 1067 /* Connect the Direct Map slot(s) up to the PML4. 
*/ 1068 for (i = 0; i < ndmpdpphys; i++) { 1069 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1070 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; 1071 } 1072 1073 /* Connect the KVA slots up to the PML4 */ 1074 for (i = 0; i < NKPML4E; i++) { 1075 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1076 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1077 } 1078 } 1079 1080 /* 1081 * Bootstrap the system enough to run with virtual memory. 1082 * 1083 * On amd64 this is called after mapping has already been enabled 1084 * and just syncs the pmap module with what has already been done. 1085 * [We can't call it easily with mapping off since the kernel is not 1086 * mapped with PA == VA, hence we would have to relocate every address 1087 * from the linked base (virtual) address "KERNBASE" to the actual 1088 * (physical) address starting relative to 0] 1089 */ 1090 void 1091 pmap_bootstrap(vm_paddr_t *firstaddr) 1092 { 1093 vm_offset_t va; 1094 pt_entry_t *pte; 1095 uint64_t cr4; 1096 int i; 1097 1098 KERNend = *firstaddr; 1099 1100 if (!pti) 1101 pg_g = X86_PG_G; 1102 1103 /* 1104 * Create an initial set of page tables to run the kernel in. 1105 */ 1106 create_pagetables(firstaddr); 1107 1108 /* 1109 * Add a physical memory segment (vm_phys_seg) corresponding to the 1110 * preallocated kernel page table pages so that vm_page structures 1111 * representing these pages will be created. The vm_page structures 1112 * are required for promotion of the corresponding kernel virtual 1113 * addresses to superpage mappings. 1114 */ 1115 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1116 1117 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; 1118 1119 virtual_end = VM_MAX_KERNEL_ADDRESS; 1120 1121 1122 /* 1123 * Enable PG_G global pages, then switch to the kernel page 1124 * table from the bootstrap page table. After the switch, it 1125 * is possible to enable SMEP and SMAP since PG_U bits are 1126 * correct now. 1127 */ 1128 cr4 = rcr4(); 1129 cr4 |= CR4_PGE; 1130 load_cr4(cr4); 1131 load_cr3(KPML4phys); 1132 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1133 cr4 |= CR4_SMEP; 1134 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1135 cr4 |= CR4_SMAP; 1136 load_cr4(cr4); 1137 1138 /* 1139 * Initialize the kernel pmap (which is statically allocated). 1140 */ 1141 PMAP_LOCK_INIT(kernel_pmap); 1142 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); 1143 kernel_pmap->pm_cr3 = KPML4phys; 1144 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1145 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1146 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1147 kernel_pmap->pm_flags = pmap_flags; 1148 1149 /* 1150 * Initialize the TLB invalidations generation number lock. 1151 */ 1152 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 1153 1154 /* 1155 * Reserve some special page table entries/VA space for temporary 1156 * mapping of pages. 1157 */ 1158 #define SYSMAP(c, p, v, n) \ 1159 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1160 1161 va = virtual_avail; 1162 pte = vtopte(va); 1163 1164 /* 1165 * Crashdump maps. The first page is reused as CMAP1 for the 1166 * memory test. 1167 */ 1168 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 1169 CADDR1 = crashdumpmap; 1170 1171 virtual_avail = va; 1172 1173 /* 1174 * Initialize the PAT MSR. 1175 * pmap_init_pat() clears and sets CR4_PGE, which, as a 1176 * side-effect, invalidates stale PG_G TLB entries that might 1177 * have been created in our pre-boot environment. 1178 */ 1179 pmap_init_pat(); 1180 1181 /* Initialize TLB Context Id. 
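 * PCID use requires both the CPUID2_PCID feature bit and the
 * vm.pmap.pcid_enabled tunable. When enabled, INVPCID availability is
 * recorded in invpcid_works, the kernel pmap's per-CPU slots are
 * seeded with PMAP_PCID_KERN, PMAP_PCID_KERN + 1 is reserved for
 * proc0's pmap, and CR4.PCIDE is turned on; otherwise
 * pmap_pcid_enabled is cleared.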
*/ 1182 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); 1183 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { 1184 /* Check for INVPCID support */ 1185 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) 1186 != 0; 1187 for (i = 0; i < MAXCPU; i++) { 1188 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; 1189 kernel_pmap->pm_pcids[i].pm_gen = 1; 1190 } 1191 1192 /* 1193 * PMAP_PCID_KERN + 1 is used for initialization of 1194 * proc0 pmap. The pmap' pcid state might be used by 1195 * EFIRT entry before first context switch, so it 1196 * needs to be valid. 1197 */ 1198 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 1199 PCPU_SET(pcid_gen, 1); 1200 1201 /* 1202 * pcpu area for APs is zeroed during AP startup. 1203 * pc_pcid_next and pc_pcid_gen are initialized by AP 1204 * during pcpu setup. 1205 */ 1206 load_cr4(rcr4() | CR4_PCIDE); 1207 } else { 1208 pmap_pcid_enabled = 0; 1209 } 1210 } 1211 1212 /* 1213 * Setup the PAT MSR. 1214 */ 1215 void 1216 pmap_init_pat(void) 1217 { 1218 int pat_table[PAT_INDEX_SIZE]; 1219 uint64_t pat_msr; 1220 u_long cr0, cr4; 1221 int i; 1222 1223 /* Bail if this CPU doesn't implement PAT. */ 1224 if ((cpu_feature & CPUID_PAT) == 0) 1225 panic("no PAT??"); 1226 1227 /* Set default PAT index table. */ 1228 for (i = 0; i < PAT_INDEX_SIZE; i++) 1229 pat_table[i] = -1; 1230 pat_table[PAT_WRITE_BACK] = 0; 1231 pat_table[PAT_WRITE_THROUGH] = 1; 1232 pat_table[PAT_UNCACHEABLE] = 3; 1233 pat_table[PAT_WRITE_COMBINING] = 3; 1234 pat_table[PAT_WRITE_PROTECTED] = 3; 1235 pat_table[PAT_UNCACHED] = 3; 1236 1237 /* Initialize default PAT entries. */ 1238 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 1239 PAT_VALUE(1, PAT_WRITE_THROUGH) | 1240 PAT_VALUE(2, PAT_UNCACHED) | 1241 PAT_VALUE(3, PAT_UNCACHEABLE) | 1242 PAT_VALUE(4, PAT_WRITE_BACK) | 1243 PAT_VALUE(5, PAT_WRITE_THROUGH) | 1244 PAT_VALUE(6, PAT_UNCACHED) | 1245 PAT_VALUE(7, PAT_UNCACHEABLE); 1246 1247 if (pat_works) { 1248 /* 1249 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 1250 * Program 5 and 6 as WP and WC. 1251 * Leave 4 and 7 as WB and UC. 1252 */ 1253 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 1254 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 1255 PAT_VALUE(6, PAT_WRITE_COMBINING); 1256 pat_table[PAT_UNCACHED] = 2; 1257 pat_table[PAT_WRITE_PROTECTED] = 5; 1258 pat_table[PAT_WRITE_COMBINING] = 6; 1259 } else { 1260 /* 1261 * Just replace PAT Index 2 with WC instead of UC-. 1262 */ 1263 pat_msr &= ~PAT_MASK(2); 1264 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 1265 pat_table[PAT_WRITE_COMBINING] = 2; 1266 } 1267 1268 /* Disable PGE. */ 1269 cr4 = rcr4(); 1270 load_cr4(cr4 & ~CR4_PGE); 1271 1272 /* Disable caches (CD = 1, NW = 0). */ 1273 cr0 = rcr0(); 1274 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1275 1276 /* Flushes caches and TLBs. */ 1277 wbinvd(); 1278 invltlb(); 1279 1280 /* Update PAT and index table. */ 1281 wrmsr(MSR_PAT, pat_msr); 1282 for (i = 0; i < PAT_INDEX_SIZE; i++) 1283 pat_index[i] = pat_table[i]; 1284 1285 /* Flush caches and TLBs again. */ 1286 wbinvd(); 1287 invltlb(); 1288 1289 /* Restore caches and PGE. */ 1290 load_cr0(cr0); 1291 load_cr4(cr4); 1292 } 1293 1294 /* 1295 * Initialize a vm_page's machine-dependent fields. 1296 */ 1297 void 1298 pmap_page_init(vm_page_t m) 1299 { 1300 1301 TAILQ_INIT(&m->md.pv_list); 1302 m->md.pat_mode = PAT_WRITE_BACK; 1303 } 1304 1305 /* 1306 * Initialize the pmap module. 1307 * Called by vm_init, to initialize any structures that the pmap 1308 * system needs to map virtual memory. 
1309 */ 1310 void 1311 pmap_init(void) 1312 { 1313 struct pmap_preinit_mapping *ppim; 1314 vm_page_t mpte; 1315 vm_size_t s; 1316 int error, i, pv_npg, ret, skz63; 1317 1318 /* L1TF, reserve page @0 unconditionally */ 1319 vm_page_blacklist_add(0, bootverbose); 1320 1321 /* Detect bare-metal Skylake Server and Skylake-X. */ 1322 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 1323 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 1324 /* 1325 * Skylake-X errata SKZ63. Processor May Hang When 1326 * Executing Code In an HLE Transaction Region between 1327 * 40000000H and 403FFFFFH. 1328 * 1329 * Mark the pages in the range as preallocated. It 1330 * seems to be impossible to distinguish between 1331 * Skylake Server and Skylake X. 1332 */ 1333 skz63 = 1; 1334 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 1335 if (skz63 != 0) { 1336 if (bootverbose) 1337 printf("SKZ63: skipping 4M RAM starting " 1338 "at physical 1G\n"); 1339 for (i = 0; i < atop(0x400000); i++) { 1340 ret = vm_page_blacklist_add(0x40000000 + 1341 ptoa(i), FALSE); 1342 if (!ret && bootverbose) 1343 printf("page at %#lx already used\n", 1344 0x40000000 + ptoa(i)); 1345 } 1346 } 1347 } 1348 1349 /* 1350 * Initialize the vm page array entries for the kernel pmap's 1351 * page table pages. 1352 */ 1353 PMAP_LOCK(kernel_pmap); 1354 for (i = 0; i < nkpt; i++) { 1355 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 1356 KASSERT(mpte >= vm_page_array && 1357 mpte < &vm_page_array[vm_page_array_size], 1358 ("pmap_init: page table page is out of range")); 1359 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 1360 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 1361 mpte->wire_count = 1; 1362 if (i << PDRSHIFT < KERNend && 1363 pmap_insert_pt_page(kernel_pmap, mpte)) 1364 panic("pmap_init: pmap_insert_pt_page failed"); 1365 } 1366 PMAP_UNLOCK(kernel_pmap); 1367 vm_wire_add(nkpt); 1368 1369 /* 1370 * If the kernel is running on a virtual machine, then it must assume 1371 * that MCA is enabled by the hypervisor. Moreover, the kernel must 1372 * be prepared for the hypervisor changing the vendor and family that 1373 * are reported by CPUID. Consequently, the workaround for AMD Family 1374 * 10h Erratum 383 is enabled if the processor's feature set does not 1375 * include at least one feature that is only supported by older Intel 1376 * or newer AMD processors. 1377 */ 1378 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 1379 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 1380 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 1381 AMDID2_FMA4)) == 0) 1382 workaround_erratum383 = 1; 1383 1384 /* 1385 * Are large page mappings enabled? 1386 */ 1387 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 1388 if (pg_ps_enabled) { 1389 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1390 ("pmap_init: can't assign to pagesizes[1]")); 1391 pagesizes[1] = NBPDR; 1392 } 1393 1394 /* 1395 * Initialize the pv chunk list mutex. 1396 */ 1397 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 1398 1399 /* 1400 * Initialize the pool of pv list locks. 1401 */ 1402 for (i = 0; i < NPV_LIST_LOCKS; i++) 1403 rw_init(&pv_list_locks[i], "pmap pv list"); 1404 1405 /* 1406 * Calculate the size of the pv head table for superpages. 1407 */ 1408 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 1409 1410 /* 1411 * Allocate memory for the pv head table for superpages. 
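 * Rough sizing example: with the usual 2MB NBPDR, a machine whose
 * highest physical segment ends at 16GB yields pv_npg = 8192 entries,
 * which is then multiplied by sizeof(struct md_page) and rounded up
 * to whole pages below.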
1412 */ 1413 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1414 s = round_page(s); 1415 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 1416 for (i = 0; i < pv_npg; i++) 1417 TAILQ_INIT(&pv_table[i].pv_list); 1418 TAILQ_INIT(&pv_dummy.pv_list); 1419 1420 pmap_initialized = 1; 1421 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 1422 ppim = pmap_preinit_mapping + i; 1423 if (ppim->va == 0) 1424 continue; 1425 /* Make the direct map consistent */ 1426 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { 1427 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 1428 ppim->sz, ppim->mode); 1429 } 1430 if (!bootverbose) 1431 continue; 1432 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 1433 ppim->pa, ppim->va, ppim->sz, ppim->mode); 1434 } 1435 1436 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 1437 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 1438 (vmem_addr_t *)&qframe); 1439 if (error != 0) 1440 panic("qframe allocation failed"); 1441 } 1442 1443 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 1444 "2MB page mapping counters"); 1445 1446 static u_long pmap_pde_demotions; 1447 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 1448 &pmap_pde_demotions, 0, "2MB page demotions"); 1449 1450 static u_long pmap_pde_mappings; 1451 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 1452 &pmap_pde_mappings, 0, "2MB page mappings"); 1453 1454 static u_long pmap_pde_p_failures; 1455 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 1456 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 1457 1458 static u_long pmap_pde_promotions; 1459 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 1460 &pmap_pde_promotions, 0, "2MB page promotions"); 1461 1462 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, 1463 "1GB page mapping counters"); 1464 1465 static u_long pmap_pdpe_demotions; 1466 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 1467 &pmap_pdpe_demotions, 0, "1GB page demotions"); 1468 1469 /*************************************************** 1470 * Low level helper routines..... 1471 ***************************************************/ 1472 1473 static pt_entry_t 1474 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 1475 { 1476 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 1477 1478 switch (pmap->pm_type) { 1479 case PT_X86: 1480 case PT_RVI: 1481 /* Verify that both PAT bits are not set at the same time */ 1482 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 1483 ("Invalid PAT bits in entry %#lx", entry)); 1484 1485 /* Swap the PAT bits if one of them is set */ 1486 if ((entry & x86_pat_bits) != 0) 1487 entry ^= x86_pat_bits; 1488 break; 1489 case PT_EPT: 1490 /* 1491 * Nothing to do - the memory attributes are represented 1492 * the same way for regular pages and superpages. 1493 */ 1494 break; 1495 default: 1496 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 1497 } 1498 1499 return (entry); 1500 } 1501 1502 boolean_t 1503 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 1504 { 1505 1506 return (mode >= 0 && mode < PAT_INDEX_SIZE && 1507 pat_index[(int)mode] >= 0); 1508 } 1509 1510 /* 1511 * Determine the appropriate bits to set in a PTE or PDE for a specified 1512 * caching mode. 
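 * For example, with the PAT layout programmed by pmap_init_pat() when
 * pat_works is set, PAT_WRITE_COMBINING resolves to PAT index 6, so a
 * PT_X86 mapping gets X86_PG_PTE_PAT | PG_NC_PCD for a PTE and
 * X86_PG_PDE_PAT | PG_NC_PCD for a PDE.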
1513 */ 1514 int 1515 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 1516 { 1517 int cache_bits, pat_flag, pat_idx; 1518 1519 if (!pmap_is_valid_memattr(pmap, mode)) 1520 panic("Unknown caching mode %d\n", mode); 1521 1522 switch (pmap->pm_type) { 1523 case PT_X86: 1524 case PT_RVI: 1525 /* The PAT bit is different for PTE's and PDE's. */ 1526 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 1527 1528 /* Map the caching mode to a PAT index. */ 1529 pat_idx = pat_index[mode]; 1530 1531 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 1532 cache_bits = 0; 1533 if (pat_idx & 0x4) 1534 cache_bits |= pat_flag; 1535 if (pat_idx & 0x2) 1536 cache_bits |= PG_NC_PCD; 1537 if (pat_idx & 0x1) 1538 cache_bits |= PG_NC_PWT; 1539 break; 1540 1541 case PT_EPT: 1542 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 1543 break; 1544 1545 default: 1546 panic("unsupported pmap type %d", pmap->pm_type); 1547 } 1548 1549 return (cache_bits); 1550 } 1551 1552 static int 1553 pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 1554 { 1555 int mask; 1556 1557 switch (pmap->pm_type) { 1558 case PT_X86: 1559 case PT_RVI: 1560 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 1561 break; 1562 case PT_EPT: 1563 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 1564 break; 1565 default: 1566 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 1567 } 1568 1569 return (mask); 1570 } 1571 1572 bool 1573 pmap_ps_enabled(pmap_t pmap) 1574 { 1575 1576 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 1577 } 1578 1579 static void 1580 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 1581 { 1582 1583 switch (pmap->pm_type) { 1584 case PT_X86: 1585 break; 1586 case PT_RVI: 1587 case PT_EPT: 1588 /* 1589 * XXX 1590 * This is a little bogus since the generation number is 1591 * supposed to be bumped up when a region of the address 1592 * space is invalidated in the page tables. 1593 * 1594 * In this case the old PDE entry is valid but yet we want 1595 * to make sure that any mappings using the old entry are 1596 * invalidated in the TLB. 1597 * 1598 * The reason this works as expected is because we rendezvous 1599 * "all" host cpus and force any vcpu context to exit as a 1600 * side-effect. 1601 */ 1602 atomic_add_acq_long(&pmap->pm_eptgen, 1); 1603 break; 1604 default: 1605 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 1606 } 1607 pde_store(pde, newpde); 1608 } 1609 1610 /* 1611 * After changing the page size for the specified virtual address in the page 1612 * table, flush the corresponding entries from the processor's TLB. Only the 1613 * calling processor's TLB is affected. 1614 * 1615 * The calling thread must be pinned to a processor. 1616 */ 1617 static void 1618 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 1619 { 1620 pt_entry_t PG_G; 1621 1622 if (pmap_type_guest(pmap)) 1623 return; 1624 1625 KASSERT(pmap->pm_type == PT_X86, 1626 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 1627 1628 PG_G = pmap_global_bit(pmap); 1629 1630 if ((newpde & PG_PS) == 0) 1631 /* Demotion: flush a specific 2MB page mapping. */ 1632 invlpg(va); 1633 else if ((newpde & PG_G) == 0) 1634 /* 1635 * Promotion: flush every 4KB page mapping from the TLB 1636 * because there are too many to flush individually. 1637 */ 1638 invltlb(); 1639 else { 1640 /* 1641 * Promotion: flush every 4KB page mapping from the TLB, 1642 * including any global (PG_G) mappings. 
1643 */ 1644 invltlb_glob(); 1645 } 1646 } 1647 #ifdef SMP 1648 1649 /* 1650 * For SMP, these functions have to use the IPI mechanism for coherence. 1651 * 1652 * N.B.: Before calling any of the following TLB invalidation functions, 1653 * the calling processor must ensure that all stores updating a non- 1654 * kernel page table are globally performed. Otherwise, another 1655 * processor could cache an old, pre-update entry without being 1656 * invalidated. This can happen one of two ways: (1) The pmap becomes 1657 * active on another processor after its pm_active field is checked by 1658 * one of the following functions but before a store updating the page 1659 * table is globally performed. (2) The pmap becomes active on another 1660 * processor before its pm_active field is checked but due to 1661 * speculative loads one of the following functions stills reads the 1662 * pmap as inactive on the other processor. 1663 * 1664 * The kernel page table is exempt because its pm_active field is 1665 * immutable. The kernel page table is always active on every 1666 * processor. 1667 */ 1668 1669 /* 1670 * Interrupt the cpus that are executing in the guest context. 1671 * This will force the vcpu to exit and the cached EPT mappings 1672 * will be invalidated by the host before the next vmresume. 1673 */ 1674 static __inline void 1675 pmap_invalidate_ept(pmap_t pmap) 1676 { 1677 int ipinum; 1678 1679 sched_pin(); 1680 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1681 ("pmap_invalidate_ept: absurd pm_active")); 1682 1683 /* 1684 * The TLB mappings associated with a vcpu context are not 1685 * flushed each time a different vcpu is chosen to execute. 1686 * 1687 * This is in contrast with a process's vtop mappings that 1688 * are flushed from the TLB on each context switch. 1689 * 1690 * Therefore we need to do more than just a TLB shootdown on 1691 * the active cpus in 'pmap->pm_active'. To do this we keep 1692 * track of the number of invalidations performed on this pmap. 1693 * 1694 * Each vcpu keeps a cache of this counter and compares it 1695 * just before a vmresume. If the counter is out-of-date an 1696 * invept will be done to flush stale mappings from the TLB. 1697 */ 1698 atomic_add_acq_long(&pmap->pm_eptgen, 1); 1699 1700 /* 1701 * Force the vcpu to exit and trap back into the hypervisor. 1702 */ 1703 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 1704 ipi_selected(pmap->pm_active, ipinum); 1705 sched_unpin(); 1706 } 1707 1708 void 1709 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1710 { 1711 cpuset_t *mask; 1712 struct invpcid_descr d; 1713 uint64_t kcr3, ucr3; 1714 uint32_t pcid; 1715 u_int cpuid, i; 1716 1717 if (pmap_type_guest(pmap)) { 1718 pmap_invalidate_ept(pmap); 1719 return; 1720 } 1721 1722 KASSERT(pmap->pm_type == PT_X86, 1723 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 1724 1725 sched_pin(); 1726 if (pmap == kernel_pmap) { 1727 invlpg(va); 1728 mask = &all_cpus; 1729 } else { 1730 cpuid = PCPU_GET(cpuid); 1731 if (pmap == PCPU_GET(curpmap)) { 1732 invlpg(va); 1733 if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { 1734 /* 1735 * Disable context switching. pm_pcid 1736 * is recalculated on switch, which 1737 * might make us use wrong pcid below. 
1738 */ 1739 critical_enter(); 1740 pcid = pmap->pm_pcids[cpuid].pm_pcid; 1741 1742 if (invpcid_works) { 1743 d.pcid = pcid | PMAP_PCID_USER_PT; 1744 d.pad = 0; 1745 d.addr = va; 1746 invpcid(&d, INVPCID_ADDR); 1747 } else { 1748 kcr3 = pmap->pm_cr3 | pcid | 1749 CR3_PCID_SAVE; 1750 ucr3 = pmap->pm_ucr3 | pcid | 1751 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 1752 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 1753 } 1754 critical_exit(); 1755 } 1756 } else if (pmap_pcid_enabled) 1757 pmap->pm_pcids[cpuid].pm_gen = 0; 1758 if (pmap_pcid_enabled) { 1759 CPU_FOREACH(i) { 1760 if (cpuid != i) 1761 pmap->pm_pcids[i].pm_gen = 0; 1762 } 1763 1764 /* 1765 * The fence is between stores to pm_gen and the read of 1766 * the pm_active mask. We need to ensure that it is 1767 * impossible for us to miss the bit update in pm_active 1768 * and simultaneously observe a non-zero pm_gen in 1769 * pmap_activate_sw(), otherwise TLB update is missed. 1770 * Without the fence, IA32 allows such an outcome. 1771 * Note that pm_active is updated by a locked operation, 1772 * which provides the reciprocal fence. 1773 */ 1774 atomic_thread_fence_seq_cst(); 1775 } 1776 mask = &pmap->pm_active; 1777 } 1778 smp_masked_invlpg(*mask, va, pmap); 1779 sched_unpin(); 1780 } 1781 1782 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 1783 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 1784 1785 void 1786 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1787 { 1788 cpuset_t *mask; 1789 struct invpcid_descr d; 1790 vm_offset_t addr; 1791 uint64_t kcr3, ucr3; 1792 uint32_t pcid; 1793 u_int cpuid, i; 1794 1795 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 1796 pmap_invalidate_all(pmap); 1797 return; 1798 } 1799 1800 if (pmap_type_guest(pmap)) { 1801 pmap_invalidate_ept(pmap); 1802 return; 1803 } 1804 1805 KASSERT(pmap->pm_type == PT_X86, 1806 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 1807 1808 sched_pin(); 1809 cpuid = PCPU_GET(cpuid); 1810 if (pmap == kernel_pmap) { 1811 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1812 invlpg(addr); 1813 mask = &all_cpus; 1814 } else { 1815 if (pmap == PCPU_GET(curpmap)) { 1816 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1817 invlpg(addr); 1818 if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { 1819 critical_enter(); 1820 pcid = pmap->pm_pcids[cpuid].pm_pcid; 1821 if (invpcid_works) { 1822 d.pcid = pcid | PMAP_PCID_USER_PT; 1823 d.pad = 0; 1824 d.addr = sva; 1825 for (; d.addr < eva; d.addr += 1826 PAGE_SIZE) 1827 invpcid(&d, INVPCID_ADDR); 1828 } else { 1829 kcr3 = pmap->pm_cr3 | pcid | 1830 CR3_PCID_SAVE; 1831 ucr3 = pmap->pm_ucr3 | pcid | 1832 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 1833 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, 1834 eva); 1835 } 1836 critical_exit(); 1837 } 1838 } else if (pmap_pcid_enabled) { 1839 pmap->pm_pcids[cpuid].pm_gen = 0; 1840 } 1841 if (pmap_pcid_enabled) { 1842 CPU_FOREACH(i) { 1843 if (cpuid != i) 1844 pmap->pm_pcids[i].pm_gen = 0; 1845 } 1846 /* See the comment in pmap_invalidate_page(). 
*/ 1847 atomic_thread_fence_seq_cst(); 1848 } 1849 mask = &pmap->pm_active; 1850 } 1851 smp_masked_invlpg_range(*mask, sva, eva, pmap); 1852 sched_unpin(); 1853 } 1854 1855 void 1856 pmap_invalidate_all(pmap_t pmap) 1857 { 1858 cpuset_t *mask; 1859 struct invpcid_descr d; 1860 uint64_t kcr3, ucr3; 1861 uint32_t pcid; 1862 u_int cpuid, i; 1863 1864 if (pmap_type_guest(pmap)) { 1865 pmap_invalidate_ept(pmap); 1866 return; 1867 } 1868 1869 KASSERT(pmap->pm_type == PT_X86, 1870 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 1871 1872 sched_pin(); 1873 if (pmap == kernel_pmap) { 1874 if (pmap_pcid_enabled && invpcid_works) { 1875 bzero(&d, sizeof(d)); 1876 invpcid(&d, INVPCID_CTXGLOB); 1877 } else { 1878 invltlb_glob(); 1879 } 1880 mask = &all_cpus; 1881 } else { 1882 cpuid = PCPU_GET(cpuid); 1883 if (pmap == PCPU_GET(curpmap)) { 1884 if (pmap_pcid_enabled) { 1885 critical_enter(); 1886 pcid = pmap->pm_pcids[cpuid].pm_pcid; 1887 if (invpcid_works) { 1888 d.pcid = pcid; 1889 d.pad = 0; 1890 d.addr = 0; 1891 invpcid(&d, INVPCID_CTX); 1892 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 1893 d.pcid |= PMAP_PCID_USER_PT; 1894 invpcid(&d, INVPCID_CTX); 1895 } 1896 } else { 1897 kcr3 = pmap->pm_cr3 | pcid; 1898 ucr3 = pmap->pm_ucr3; 1899 if (ucr3 != PMAP_NO_CR3) { 1900 ucr3 |= pcid | PMAP_PCID_USER_PT; 1901 pmap_pti_pcid_invalidate(ucr3, 1902 kcr3); 1903 } else { 1904 load_cr3(kcr3); 1905 } 1906 } 1907 critical_exit(); 1908 } else { 1909 invltlb(); 1910 } 1911 } else if (pmap_pcid_enabled) { 1912 pmap->pm_pcids[cpuid].pm_gen = 0; 1913 } 1914 if (pmap_pcid_enabled) { 1915 CPU_FOREACH(i) { 1916 if (cpuid != i) 1917 pmap->pm_pcids[i].pm_gen = 0; 1918 } 1919 /* See the comment in pmap_invalidate_page(). */ 1920 atomic_thread_fence_seq_cst(); 1921 } 1922 mask = &pmap->pm_active; 1923 } 1924 smp_masked_invltlb(*mask, pmap); 1925 sched_unpin(); 1926 } 1927 1928 void 1929 pmap_invalidate_cache(void) 1930 { 1931 1932 sched_pin(); 1933 wbinvd(); 1934 smp_cache_flush(); 1935 sched_unpin(); 1936 } 1937 1938 struct pde_action { 1939 cpuset_t invalidate; /* processors that invalidate their TLB */ 1940 pmap_t pmap; 1941 vm_offset_t va; 1942 pd_entry_t *pde; 1943 pd_entry_t newpde; 1944 u_int store; /* processor that updates the PDE */ 1945 }; 1946 1947 static void 1948 pmap_update_pde_action(void *arg) 1949 { 1950 struct pde_action *act = arg; 1951 1952 if (act->store == PCPU_GET(cpuid)) 1953 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 1954 } 1955 1956 static void 1957 pmap_update_pde_teardown(void *arg) 1958 { 1959 struct pde_action *act = arg; 1960 1961 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1962 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 1963 } 1964 1965 /* 1966 * Change the page size for the specified virtual address in a way that 1967 * prevents any possibility of the TLB ever having two entries that map the 1968 * same virtual address using different page sizes. This is the recommended 1969 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1970 * machine check exception for a TLB state that is improperly diagnosed as a 1971 * hardware error. 
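 *
 * The function below implements this with smp_rendezvous_cpus():
 * the single CPU named in act->store writes the new PDE in the
 * action step, and every CPU recorded in act->invalidate flushes
 * the old translation in the teardown step, so no TLB can end up
 * holding both 4KB and 2MB entries for the same address.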
1972 */ 1973 static void 1974 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1975 { 1976 struct pde_action act; 1977 cpuset_t active, other_cpus; 1978 u_int cpuid; 1979 1980 sched_pin(); 1981 cpuid = PCPU_GET(cpuid); 1982 other_cpus = all_cpus; 1983 CPU_CLR(cpuid, &other_cpus); 1984 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 1985 active = all_cpus; 1986 else { 1987 active = pmap->pm_active; 1988 } 1989 if (CPU_OVERLAP(&active, &other_cpus)) { 1990 act.store = cpuid; 1991 act.invalidate = active; 1992 act.va = va; 1993 act.pmap = pmap; 1994 act.pde = pde; 1995 act.newpde = newpde; 1996 CPU_SET(cpuid, &active); 1997 smp_rendezvous_cpus(active, 1998 smp_no_rendezvous_barrier, pmap_update_pde_action, 1999 pmap_update_pde_teardown, &act); 2000 } else { 2001 pmap_update_pde_store(pmap, pde, newpde); 2002 if (CPU_ISSET(cpuid, &active)) 2003 pmap_update_pde_invalidate(pmap, va, newpde); 2004 } 2005 sched_unpin(); 2006 } 2007 #else /* !SMP */ 2008 /* 2009 * Normal, non-SMP, invalidation functions. 2010 */ 2011 void 2012 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 2013 { 2014 struct invpcid_descr d; 2015 uint64_t kcr3, ucr3; 2016 uint32_t pcid; 2017 2018 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2019 pmap->pm_eptgen++; 2020 return; 2021 } 2022 KASSERT(pmap->pm_type == PT_X86, 2023 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 2024 2025 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 2026 invlpg(va); 2027 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 2028 pmap->pm_ucr3 != PMAP_NO_CR3) { 2029 critical_enter(); 2030 pcid = pmap->pm_pcids[0].pm_pcid; 2031 if (invpcid_works) { 2032 d.pcid = pcid | PMAP_PCID_USER_PT; 2033 d.pad = 0; 2034 d.addr = va; 2035 invpcid(&d, INVPCID_ADDR); 2036 } else { 2037 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 2038 ucr3 = pmap->pm_ucr3 | pcid | 2039 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2040 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 2041 } 2042 critical_exit(); 2043 } 2044 } else if (pmap_pcid_enabled) 2045 pmap->pm_pcids[0].pm_gen = 0; 2046 } 2047 2048 void 2049 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2050 { 2051 struct invpcid_descr d; 2052 vm_offset_t addr; 2053 uint64_t kcr3, ucr3; 2054 2055 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2056 pmap->pm_eptgen++; 2057 return; 2058 } 2059 KASSERT(pmap->pm_type == PT_X86, 2060 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 2061 2062 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 2063 for (addr = sva; addr < eva; addr += PAGE_SIZE) 2064 invlpg(addr); 2065 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 2066 pmap->pm_ucr3 != PMAP_NO_CR3) { 2067 critical_enter(); 2068 if (invpcid_works) { 2069 d.pcid = pmap->pm_pcids[0].pm_pcid | 2070 PMAP_PCID_USER_PT; 2071 d.pad = 0; 2072 d.addr = sva; 2073 for (; d.addr < eva; d.addr += PAGE_SIZE) 2074 invpcid(&d, INVPCID_ADDR); 2075 } else { 2076 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. 2077 pm_pcid | CR3_PCID_SAVE; 2078 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
2079 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 2080 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 2081 } 2082 critical_exit(); 2083 } 2084 } else if (pmap_pcid_enabled) { 2085 pmap->pm_pcids[0].pm_gen = 0; 2086 } 2087 } 2088 2089 void 2090 pmap_invalidate_all(pmap_t pmap) 2091 { 2092 struct invpcid_descr d; 2093 uint64_t kcr3, ucr3; 2094 2095 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 2096 pmap->pm_eptgen++; 2097 return; 2098 } 2099 KASSERT(pmap->pm_type == PT_X86, 2100 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 2101 2102 if (pmap == kernel_pmap) { 2103 if (pmap_pcid_enabled && invpcid_works) { 2104 bzero(&d, sizeof(d)); 2105 invpcid(&d, INVPCID_CTXGLOB); 2106 } else { 2107 invltlb_glob(); 2108 } 2109 } else if (pmap == PCPU_GET(curpmap)) { 2110 if (pmap_pcid_enabled) { 2111 critical_enter(); 2112 if (invpcid_works) { 2113 d.pcid = pmap->pm_pcids[0].pm_pcid; 2114 d.pad = 0; 2115 d.addr = 0; 2116 invpcid(&d, INVPCID_CTX); 2117 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2118 d.pcid |= PMAP_PCID_USER_PT; 2119 invpcid(&d, INVPCID_CTX); 2120 } 2121 } else { 2122 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; 2123 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 2124 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 2125 0].pm_pcid | PMAP_PCID_USER_PT; 2126 pmap_pti_pcid_invalidate(ucr3, kcr3); 2127 } else 2128 load_cr3(kcr3); 2129 } 2130 critical_exit(); 2131 } else { 2132 invltlb(); 2133 } 2134 } else if (pmap_pcid_enabled) { 2135 pmap->pm_pcids[0].pm_gen = 0; 2136 } 2137 } 2138 2139 PMAP_INLINE void 2140 pmap_invalidate_cache(void) 2141 { 2142 2143 wbinvd(); 2144 } 2145 2146 static void 2147 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 2148 { 2149 2150 pmap_update_pde_store(pmap, pde, newpde); 2151 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 2152 pmap_update_pde_invalidate(pmap, va, newpde); 2153 else 2154 pmap->pm_pcids[0].pm_gen = 0; 2155 } 2156 #endif /* !SMP */ 2157 2158 static void 2159 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 2160 { 2161 2162 /* 2163 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 2164 * by a promotion that did not invalidate the 512 4KB page mappings 2165 * that might exist in the TLB. Consequently, at this point, the TLB 2166 * may hold both 4KB and 2MB page mappings for the address range [va, 2167 * va + NBPDR). Therefore, the entire range must be invalidated here. 2168 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 2169 * 4KB page mappings for the address range [va, va + NBPDR), and so a 2170 * single INVLPG suffices to invalidate the 2MB page mapping from the 2171 * TLB. 2172 */ 2173 if ((pde & PG_PROMOTED) != 0) 2174 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 2175 else 2176 pmap_invalidate_page(pmap, va); 2177 } 2178 2179 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 2180 2181 void 2182 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 2183 { 2184 2185 if (force) { 2186 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 2187 } else { 2188 KASSERT((sva & PAGE_MASK) == 0, 2189 ("pmap_invalidate_cache_range: sva not page-aligned")); 2190 KASSERT((eva & PAGE_MASK) == 0, 2191 ("pmap_invalidate_cache_range: eva not page-aligned")); 2192 } 2193 2194 if ((cpu_feature & CPUID_SS) != 0 && !force) 2195 ; /* If "Self Snoop" is supported and allowed, do nothing. 
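 *
 * Otherwise the fallbacks below are tried in order: CLFLUSHOPT
 * bracketed by SFENCE, then legacy CLFLUSH (bracketed by MFENCE
 * on non-Intel CPUs, where CLFLUSH alone is not ordered against
 * stores), and finally a machine-wide WBINVD through
 * pmap_invalidate_cache() when no targeted flush instruction is
 * available or the range is PMAP_CLFLUSH_THRESHOLD or larger.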
*/ 2196 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && 2197 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 2198 /* 2199 * XXX: Some CPUs fault, hang, or trash the local APIC 2200 * registers if we use CLFLUSH on the local APIC 2201 * range. The local APIC is always uncached, so we 2202 * don't need to flush for that range anyway. 2203 */ 2204 if (pmap_kextract(sva) == lapic_paddr) 2205 return; 2206 2207 /* 2208 * Otherwise, do per-cache line flush. Use the sfence 2209 * instruction to insure that previous stores are 2210 * included in the write-back. The processor 2211 * propagates flush to other processors in the cache 2212 * coherence domain. 2213 */ 2214 sfence(); 2215 for (; sva < eva; sva += cpu_clflush_line_size) 2216 clflushopt(sva); 2217 sfence(); 2218 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 2219 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 2220 if (pmap_kextract(sva) == lapic_paddr) 2221 return; 2222 /* 2223 * Writes are ordered by CLFLUSH on Intel CPUs. 2224 */ 2225 if (cpu_vendor_id != CPU_VENDOR_INTEL) 2226 mfence(); 2227 for (; sva < eva; sva += cpu_clflush_line_size) 2228 clflush(sva); 2229 if (cpu_vendor_id != CPU_VENDOR_INTEL) 2230 mfence(); 2231 } else { 2232 2233 /* 2234 * No targeted cache flush methods are supported by CPU, 2235 * or the supplied range is bigger than 2MB. 2236 * Globally invalidate cache. 2237 */ 2238 pmap_invalidate_cache(); 2239 } 2240 } 2241 2242 /* 2243 * Remove the specified set of pages from the data and instruction caches. 2244 * 2245 * In contrast to pmap_invalidate_cache_range(), this function does not 2246 * rely on the CPU's self-snoop feature, because it is intended for use 2247 * when moving pages into a different cache domain. 2248 */ 2249 void 2250 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 2251 { 2252 vm_offset_t daddr, eva; 2253 int i; 2254 bool useclflushopt; 2255 2256 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 2257 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 2258 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 2259 pmap_invalidate_cache(); 2260 else { 2261 if (useclflushopt) 2262 sfence(); 2263 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 2264 mfence(); 2265 for (i = 0; i < count; i++) { 2266 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 2267 eva = daddr + PAGE_SIZE; 2268 for (; daddr < eva; daddr += cpu_clflush_line_size) { 2269 if (useclflushopt) 2270 clflushopt(daddr); 2271 else 2272 clflush(daddr); 2273 } 2274 } 2275 if (useclflushopt) 2276 sfence(); 2277 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 2278 mfence(); 2279 } 2280 } 2281 2282 /* 2283 * Routine: pmap_extract 2284 * Function: 2285 * Extract the physical page address associated 2286 * with the given map/virtual_address pair. 
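 *
 *	Returns 0 when no valid mapping exists.  1GB and 2MB
 *	superpages are handled by masking the offset with PDPMASK
 *	or PDRMASK, respectively.  A minimal, purely illustrative
 *	caller (not code from this file) might look like:
 *
 *		vm_paddr_t pa;
 *
 *		pa = pmap_extract(kernel_pmap, va);
 *		if (pa == 0)
 *			panic("no mapping for va %#lx", va);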
2287 */ 2288 vm_paddr_t 2289 pmap_extract(pmap_t pmap, vm_offset_t va) 2290 { 2291 pdp_entry_t *pdpe; 2292 pd_entry_t *pde; 2293 pt_entry_t *pte, PG_V; 2294 vm_paddr_t pa; 2295 2296 pa = 0; 2297 PG_V = pmap_valid_bit(pmap); 2298 PMAP_LOCK(pmap); 2299 pdpe = pmap_pdpe(pmap, va); 2300 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2301 if ((*pdpe & PG_PS) != 0) 2302 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 2303 else { 2304 pde = pmap_pdpe_to_pde(pdpe, va); 2305 if ((*pde & PG_V) != 0) { 2306 if ((*pde & PG_PS) != 0) { 2307 pa = (*pde & PG_PS_FRAME) | 2308 (va & PDRMASK); 2309 } else { 2310 pte = pmap_pde_to_pte(pde, va); 2311 pa = (*pte & PG_FRAME) | 2312 (va & PAGE_MASK); 2313 } 2314 } 2315 } 2316 } 2317 PMAP_UNLOCK(pmap); 2318 return (pa); 2319 } 2320 2321 /* 2322 * Routine: pmap_extract_and_hold 2323 * Function: 2324 * Atomically extract and hold the physical page 2325 * with the given pmap and virtual address pair 2326 * if that mapping permits the given protection. 2327 */ 2328 vm_page_t 2329 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2330 { 2331 pd_entry_t pde, *pdep; 2332 pt_entry_t pte, PG_RW, PG_V; 2333 vm_paddr_t pa; 2334 vm_page_t m; 2335 2336 pa = 0; 2337 m = NULL; 2338 PG_RW = pmap_rw_bit(pmap); 2339 PG_V = pmap_valid_bit(pmap); 2340 PMAP_LOCK(pmap); 2341 retry: 2342 pdep = pmap_pde(pmap, va); 2343 if (pdep != NULL && (pde = *pdep)) { 2344 if (pde & PG_PS) { 2345 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 2346 if (vm_page_pa_tryrelock(pmap, (pde & 2347 PG_PS_FRAME) | (va & PDRMASK), &pa)) 2348 goto retry; 2349 m = PHYS_TO_VM_PAGE(pa); 2350 } 2351 } else { 2352 pte = *pmap_pde_to_pte(pdep, va); 2353 if ((pte & PG_V) && 2354 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 2355 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 2356 &pa)) 2357 goto retry; 2358 m = PHYS_TO_VM_PAGE(pa); 2359 } 2360 } 2361 if (m != NULL) 2362 vm_page_hold(m); 2363 } 2364 PA_UNLOCK_COND(pa); 2365 PMAP_UNLOCK(pmap); 2366 return (m); 2367 } 2368 2369 vm_paddr_t 2370 pmap_kextract(vm_offset_t va) 2371 { 2372 pd_entry_t pde; 2373 vm_paddr_t pa; 2374 2375 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 2376 pa = DMAP_TO_PHYS(va); 2377 } else { 2378 pde = *vtopde(va); 2379 if (pde & PG_PS) { 2380 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 2381 } else { 2382 /* 2383 * Beware of a concurrent promotion that changes the 2384 * PDE at this point! For example, vtopte() must not 2385 * be used to access the PTE because it would use the 2386 * new PDE. It is, however, safe to use the old PDE 2387 * because the page table page is preserved by the 2388 * promotion. 2389 */ 2390 pa = *pmap_pde_to_pte(&pde, va); 2391 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 2392 } 2393 } 2394 return (pa); 2395 } 2396 2397 /*************************************************** 2398 * Low level mapping routines..... 2399 ***************************************************/ 2400 2401 /* 2402 * Add a wired page to the kva. 2403 * Note: not SMP coherent. 2404 */ 2405 PMAP_INLINE void 2406 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 2407 { 2408 pt_entry_t *pte; 2409 2410 pte = vtopte(va); 2411 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); 2412 } 2413 2414 static __inline void 2415 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 2416 { 2417 pt_entry_t *pte; 2418 int cache_bits; 2419 2420 pte = vtopte(va); 2421 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 2422 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); 2423 } 2424 2425 /* 2426 * Remove a page from the kernel pagetables. 
2427 * Note: not SMP coherent. 2428 */ 2429 PMAP_INLINE void 2430 pmap_kremove(vm_offset_t va) 2431 { 2432 pt_entry_t *pte; 2433 2434 pte = vtopte(va); 2435 pte_clear(pte); 2436 } 2437 2438 /* 2439 * Used to map a range of physical addresses into kernel 2440 * virtual address space. 2441 * 2442 * The value passed in '*virt' is a suggested virtual address for 2443 * the mapping. Architectures which can support a direct-mapped 2444 * physical to virtual region can return the appropriate address 2445 * within that region, leaving '*virt' unchanged. Other 2446 * architectures should map the pages starting at '*virt' and 2447 * update '*virt' with the first usable address after the mapped 2448 * region. 2449 */ 2450 vm_offset_t 2451 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2452 { 2453 return PHYS_TO_DMAP(start); 2454 } 2455 2456 2457 /* 2458 * Add a list of wired pages to the kva 2459 * this routine is only used for temporary 2460 * kernel mappings that do not need to have 2461 * page modification or references recorded. 2462 * Note that old mappings are simply written 2463 * over. The page *must* be wired. 2464 * Note: SMP coherent. Uses a ranged shootdown IPI. 2465 */ 2466 void 2467 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2468 { 2469 pt_entry_t *endpte, oldpte, pa, *pte; 2470 vm_page_t m; 2471 int cache_bits; 2472 2473 oldpte = 0; 2474 pte = vtopte(sva); 2475 endpte = pte + count; 2476 while (pte < endpte) { 2477 m = *ma++; 2478 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 2479 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 2480 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 2481 oldpte |= *pte; 2482 pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); 2483 } 2484 pte++; 2485 } 2486 if (__predict_false((oldpte & X86_PG_V) != 0)) 2487 pmap_invalidate_range(kernel_pmap, sva, sva + count * 2488 PAGE_SIZE); 2489 } 2490 2491 /* 2492 * This routine tears out page mappings from the 2493 * kernel -- it is meant only for temporary mappings. 2494 * Note: SMP coherent. Uses a ranged shootdown IPI. 2495 */ 2496 void 2497 pmap_qremove(vm_offset_t sva, int count) 2498 { 2499 vm_offset_t va; 2500 2501 va = sva; 2502 while (count-- > 0) { 2503 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2504 pmap_kremove(va); 2505 va += PAGE_SIZE; 2506 } 2507 pmap_invalidate_range(kernel_pmap, sva, va); 2508 } 2509 2510 /*************************************************** 2511 * Page table page management routines..... 2512 ***************************************************/ 2513 /* 2514 * Schedule the specified unused page table page to be freed. Specifically, 2515 * add the page to the specified list of pages that will be released to the 2516 * physical memory manager after the TLB has been updated. 2517 */ 2518 static __inline void 2519 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2520 boolean_t set_PG_ZERO) 2521 { 2522 2523 if (set_PG_ZERO) 2524 m->flags |= PG_ZERO; 2525 else 2526 m->flags &= ~PG_ZERO; 2527 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2528 } 2529 2530 /* 2531 * Inserts the specified page table page into the specified pmap's collection 2532 * of idle page table pages. Each of a pmap's page table pages is responsible 2533 * for mapping a distinct range of virtual addresses. The pmap's collection is 2534 * ordered by this virtual address range. 
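 *
 * The collection is the pm_root radix trie, keyed by the page
 * table page's pindex (pmap_pde_pindex() of the mapped address);
 * pmap_remove_pt_page() below performs the matching lookup, and
 * pmap_demote_pde_locked() uses it to recover the page table page
 * that a promotion left behind.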
2535 */ 2536 static __inline int 2537 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2538 { 2539 2540 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2541 return (vm_radix_insert(&pmap->pm_root, mpte)); 2542 } 2543 2544 /* 2545 * Removes the page table page mapping the specified virtual address from the 2546 * specified pmap's collection of idle page table pages, and returns it. 2547 * Otherwise, returns NULL if there is no page table page corresponding to the 2548 * specified virtual address. 2549 */ 2550 static __inline vm_page_t 2551 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 2552 { 2553 2554 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2555 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 2556 } 2557 2558 /* 2559 * Decrements a page table page's wire count, which is used to record the 2560 * number of valid page table entries within the page. If the wire count 2561 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2562 * page table page was unmapped and FALSE otherwise. 2563 */ 2564 static inline boolean_t 2565 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2566 { 2567 2568 --m->wire_count; 2569 if (m->wire_count == 0) { 2570 _pmap_unwire_ptp(pmap, va, m, free); 2571 return (TRUE); 2572 } else 2573 return (FALSE); 2574 } 2575 2576 static void 2577 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2578 { 2579 2580 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2581 /* 2582 * unmap the page table page 2583 */ 2584 if (m->pindex >= (NUPDE + NUPDPE)) { 2585 /* PDP page */ 2586 pml4_entry_t *pml4; 2587 pml4 = pmap_pml4e(pmap, va); 2588 *pml4 = 0; 2589 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { 2590 pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; 2591 *pml4 = 0; 2592 } 2593 } else if (m->pindex >= NUPDE) { 2594 /* PD page */ 2595 pdp_entry_t *pdp; 2596 pdp = pmap_pdpe(pmap, va); 2597 *pdp = 0; 2598 } else { 2599 /* PTE page */ 2600 pd_entry_t *pd; 2601 pd = pmap_pde(pmap, va); 2602 *pd = 0; 2603 } 2604 pmap_resident_count_dec(pmap, 1); 2605 if (m->pindex < NUPDE) { 2606 /* We just released a PT, unhold the matching PD */ 2607 vm_page_t pdpg; 2608 2609 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2610 pmap_unwire_ptp(pmap, va, pdpg, free); 2611 } 2612 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2613 /* We just released a PD, unhold the matching PDP */ 2614 vm_page_t pdppg; 2615 2616 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2617 pmap_unwire_ptp(pmap, va, pdppg, free); 2618 } 2619 2620 /* 2621 * Put page on a list so that it is released after 2622 * *ALL* TLB shootdown is done 2623 */ 2624 pmap_add_delayed_free_list(m, free, TRUE); 2625 } 2626 2627 /* 2628 * After removing a page table entry, this routine is used to 2629 * conditionally free the page, and manage the hold/wire counts. 
2630 */ 2631 static int 2632 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2633 struct spglist *free) 2634 { 2635 vm_page_t mpte; 2636 2637 if (va >= VM_MAXUSER_ADDRESS) 2638 return (0); 2639 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2640 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2641 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2642 } 2643 2644 void 2645 pmap_pinit0(pmap_t pmap) 2646 { 2647 int i; 2648 2649 PMAP_LOCK_INIT(pmap); 2650 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2651 pmap->pm_pml4u = NULL; 2652 pmap->pm_cr3 = KPML4phys; 2653 /* hack to keep pmap_pti_pcid_invalidate() alive */ 2654 pmap->pm_ucr3 = PMAP_NO_CR3; 2655 pmap->pm_root.rt_root = 0; 2656 CPU_ZERO(&pmap->pm_active); 2657 TAILQ_INIT(&pmap->pm_pvchunk); 2658 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2659 pmap->pm_flags = pmap_flags; 2660 CPU_FOREACH(i) { 2661 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; 2662 pmap->pm_pcids[i].pm_gen = 1; 2663 } 2664 pmap_activate_boot(pmap); 2665 } 2666 2667 void 2668 pmap_pinit_pml4(vm_page_t pml4pg) 2669 { 2670 pml4_entry_t *pm_pml4; 2671 int i; 2672 2673 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2674 2675 /* Wire in kernel global address entries. */ 2676 for (i = 0; i < NKPML4E; i++) { 2677 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 2678 X86_PG_V; 2679 } 2680 for (i = 0; i < ndmpdpphys; i++) { 2681 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 2682 X86_PG_V; 2683 } 2684 2685 /* install self-referential address mapping entry(s) */ 2686 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 2687 X86_PG_A | X86_PG_M; 2688 } 2689 2690 static void 2691 pmap_pinit_pml4_pti(vm_page_t pml4pg) 2692 { 2693 pml4_entry_t *pm_pml4; 2694 int i; 2695 2696 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2697 for (i = 0; i < NPML4EPG; i++) 2698 pm_pml4[i] = pti_pml4[i]; 2699 } 2700 2701 /* 2702 * Initialize a preallocated and zeroed pmap structure, 2703 * such as one in a vmspace structure. 2704 */ 2705 int 2706 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2707 { 2708 vm_page_t pml4pg, pml4pgu; 2709 vm_paddr_t pml4phys; 2710 int i; 2711 2712 /* 2713 * allocate the page directory page 2714 */ 2715 pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2716 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); 2717 2718 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2719 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2720 CPU_FOREACH(i) { 2721 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2722 pmap->pm_pcids[i].pm_gen = 0; 2723 } 2724 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 2725 pmap->pm_ucr3 = PMAP_NO_CR3; 2726 pmap->pm_pml4u = NULL; 2727 2728 pmap->pm_type = pm_type; 2729 if ((pml4pg->flags & PG_ZERO) == 0) 2730 pagezero(pmap->pm_pml4); 2731 2732 /* 2733 * Do not install the host kernel mappings in the nested page 2734 * tables. These mappings are meaningless in the guest physical 2735 * address space. 2736 * Install minimal kernel mappings in PTI case. 
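 *
 * For the nested page table types the pmap is not meant to be
 * loaded through %cr3, so pm_cr3 is left at the invalid value
 * PMAP_NO_CR3 set above.  For PT_X86 with PTI enabled, a second
 * PML4 page (pm_pml4u, reached through pm_ucr3) is allocated
 * below and seeded from pti_pml4 by pmap_pinit_pml4_pti().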
2737 */ 2738 if (pm_type == PT_X86) { 2739 pmap->pm_cr3 = pml4phys; 2740 pmap_pinit_pml4(pml4pg); 2741 if (pti) { 2742 pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2743 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 2744 pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( 2745 VM_PAGE_TO_PHYS(pml4pgu)); 2746 pmap_pinit_pml4_pti(pml4pgu); 2747 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); 2748 } 2749 } 2750 2751 pmap->pm_root.rt_root = 0; 2752 CPU_ZERO(&pmap->pm_active); 2753 TAILQ_INIT(&pmap->pm_pvchunk); 2754 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2755 pmap->pm_flags = flags; 2756 pmap->pm_eptgen = 0; 2757 2758 return (1); 2759 } 2760 2761 int 2762 pmap_pinit(pmap_t pmap) 2763 { 2764 2765 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2766 } 2767 2768 /* 2769 * This routine is called if the desired page table page does not exist. 2770 * 2771 * If page table page allocation fails, this routine may sleep before 2772 * returning NULL. It sleeps only if a lock pointer was given. 2773 * 2774 * Note: If a page allocation fails at page table level two or three, 2775 * one or two pages may be held during the wait, only to be released 2776 * afterwards. This conservative approach is easily argued to avoid 2777 * race conditions. 2778 */ 2779 static vm_page_t 2780 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2781 { 2782 vm_page_t m, pdppg, pdpg; 2783 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 2784 2785 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2786 2787 PG_A = pmap_accessed_bit(pmap); 2788 PG_M = pmap_modified_bit(pmap); 2789 PG_V = pmap_valid_bit(pmap); 2790 PG_RW = pmap_rw_bit(pmap); 2791 2792 /* 2793 * Allocate a page table page. 2794 */ 2795 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2796 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2797 if (lockp != NULL) { 2798 RELEASE_PV_LIST_LOCK(lockp); 2799 PMAP_UNLOCK(pmap); 2800 PMAP_ASSERT_NOT_IN_DI(); 2801 vm_wait(NULL); 2802 PMAP_LOCK(pmap); 2803 } 2804 2805 /* 2806 * Indicate the need to retry. While waiting, the page table 2807 * page may have been allocated. 2808 */ 2809 return (NULL); 2810 } 2811 if ((m->flags & PG_ZERO) == 0) 2812 pmap_zero_page(m); 2813 2814 /* 2815 * Map the pagetable page into the process address space, if 2816 * it isn't already there. 2817 */ 2818 2819 if (ptepindex >= (NUPDE + NUPDPE)) { 2820 pml4_entry_t *pml4, *pml4u; 2821 vm_pindex_t pml4index; 2822 2823 /* Wire up a new PDPE page */ 2824 pml4index = ptepindex - (NUPDE + NUPDPE); 2825 pml4 = &pmap->pm_pml4[pml4index]; 2826 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2827 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { 2828 /* 2829 * PTI: Make all user-space mappings in the 2830 * kernel-mode page table no-execute so that 2831 * we detect any programming errors that leave 2832 * the kernel-mode page table active on return 2833 * to user space. 
2834 */ 2835 if (pmap->pm_ucr3 != PMAP_NO_CR3) 2836 *pml4 |= pg_nx; 2837 2838 pml4u = &pmap->pm_pml4u[pml4index]; 2839 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 2840 PG_A | PG_M; 2841 } 2842 2843 } else if (ptepindex >= NUPDE) { 2844 vm_pindex_t pml4index; 2845 vm_pindex_t pdpindex; 2846 pml4_entry_t *pml4; 2847 pdp_entry_t *pdp; 2848 2849 /* Wire up a new PDE page */ 2850 pdpindex = ptepindex - NUPDE; 2851 pml4index = pdpindex >> NPML4EPGSHIFT; 2852 2853 pml4 = &pmap->pm_pml4[pml4index]; 2854 if ((*pml4 & PG_V) == 0) { 2855 /* Have to allocate a new pdp, recurse */ 2856 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 2857 lockp) == NULL) { 2858 vm_page_unwire_noq(m); 2859 vm_page_free_zero(m); 2860 return (NULL); 2861 } 2862 } else { 2863 /* Add reference to pdp page */ 2864 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 2865 pdppg->wire_count++; 2866 } 2867 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2868 2869 /* Now find the pdp page */ 2870 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2871 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2872 2873 } else { 2874 vm_pindex_t pml4index; 2875 vm_pindex_t pdpindex; 2876 pml4_entry_t *pml4; 2877 pdp_entry_t *pdp; 2878 pd_entry_t *pd; 2879 2880 /* Wire up a new PTE page */ 2881 pdpindex = ptepindex >> NPDPEPGSHIFT; 2882 pml4index = pdpindex >> NPML4EPGSHIFT; 2883 2884 /* First, find the pdp and check that its valid. */ 2885 pml4 = &pmap->pm_pml4[pml4index]; 2886 if ((*pml4 & PG_V) == 0) { 2887 /* Have to allocate a new pd, recurse */ 2888 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2889 lockp) == NULL) { 2890 vm_page_unwire_noq(m); 2891 vm_page_free_zero(m); 2892 return (NULL); 2893 } 2894 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2895 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2896 } else { 2897 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2898 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2899 if ((*pdp & PG_V) == 0) { 2900 /* Have to allocate a new pd, recurse */ 2901 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2902 lockp) == NULL) { 2903 vm_page_unwire_noq(m); 2904 vm_page_free_zero(m); 2905 return (NULL); 2906 } 2907 } else { 2908 /* Add reference to the pd page */ 2909 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2910 pdpg->wire_count++; 2911 } 2912 } 2913 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 2914 2915 /* Now we know where the page directory page is */ 2916 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 2917 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2918 } 2919 2920 pmap_resident_count_inc(pmap, 1); 2921 2922 return (m); 2923 } 2924 2925 static vm_page_t 2926 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2927 { 2928 vm_pindex_t pdpindex, ptepindex; 2929 pdp_entry_t *pdpe, PG_V; 2930 vm_page_t pdpg; 2931 2932 PG_V = pmap_valid_bit(pmap); 2933 2934 retry: 2935 pdpe = pmap_pdpe(pmap, va); 2936 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2937 /* Add a reference to the pd page. */ 2938 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 2939 pdpg->wire_count++; 2940 } else { 2941 /* Allocate a pd page. 
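 *
 * Page table page indices are partitioned by level: indices in
 * [0, NUPDE) name PTE pages, [NUPDE, NUPDE + NUPDPE) name PD
 * pages, and larger indices name PDP pages.  The same encoding
 * drives the level checks in _pmap_allocpte() and
 * _pmap_unwire_ptp() above.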
*/ 2942 ptepindex = pmap_pde_pindex(va); 2943 pdpindex = ptepindex >> NPDPEPGSHIFT; 2944 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 2945 if (pdpg == NULL && lockp != NULL) 2946 goto retry; 2947 } 2948 return (pdpg); 2949 } 2950 2951 static vm_page_t 2952 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2953 { 2954 vm_pindex_t ptepindex; 2955 pd_entry_t *pd, PG_V; 2956 vm_page_t m; 2957 2958 PG_V = pmap_valid_bit(pmap); 2959 2960 /* 2961 * Calculate pagetable page index 2962 */ 2963 ptepindex = pmap_pde_pindex(va); 2964 retry: 2965 /* 2966 * Get the page directory entry 2967 */ 2968 pd = pmap_pde(pmap, va); 2969 2970 /* 2971 * This supports switching from a 2MB page to a 2972 * normal 4K page. 2973 */ 2974 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 2975 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 2976 /* 2977 * Invalidation of the 2MB page mapping may have caused 2978 * the deallocation of the underlying PD page. 2979 */ 2980 pd = NULL; 2981 } 2982 } 2983 2984 /* 2985 * If the page table page is mapped, we just increment the 2986 * hold count, and activate it. 2987 */ 2988 if (pd != NULL && (*pd & PG_V) != 0) { 2989 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2990 m->wire_count++; 2991 } else { 2992 /* 2993 * Here if the pte page isn't mapped, or if it has been 2994 * deallocated. 2995 */ 2996 m = _pmap_allocpte(pmap, ptepindex, lockp); 2997 if (m == NULL && lockp != NULL) 2998 goto retry; 2999 } 3000 return (m); 3001 } 3002 3003 3004 /*************************************************** 3005 * Pmap allocation/deallocation routines. 3006 ***************************************************/ 3007 3008 /* 3009 * Release any resources held by the given physical map. 3010 * Called when a pmap initialized by pmap_pinit is being released. 3011 * Should only be called if the map contains no valid mappings. 
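 *
 * Only the top-level PML4 page (and the PTI user-mode PML4, if
 * one was allocated) is freed here; the assertions below ensure
 * that every lower-level page table page has already been
 * released and that the pmap is no longer active on any CPU.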
3012 */ 3013 void 3014 pmap_release(pmap_t pmap) 3015 { 3016 vm_page_t m; 3017 int i; 3018 3019 KASSERT(pmap->pm_stats.resident_count == 0, 3020 ("pmap_release: pmap resident count %ld != 0", 3021 pmap->pm_stats.resident_count)); 3022 KASSERT(vm_radix_is_empty(&pmap->pm_root), 3023 ("pmap_release: pmap has reserved page table page(s)")); 3024 KASSERT(CPU_EMPTY(&pmap->pm_active), 3025 ("releasing active pmap %p", pmap)); 3026 3027 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 3028 3029 for (i = 0; i < NKPML4E; i++) /* KVA */ 3030 pmap->pm_pml4[KPML4BASE + i] = 0; 3031 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 3032 pmap->pm_pml4[DMPML4I + i] = 0; 3033 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 3034 3035 vm_page_unwire_noq(m); 3036 vm_page_free_zero(m); 3037 3038 if (pmap->pm_pml4u != NULL) { 3039 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); 3040 vm_page_unwire_noq(m); 3041 vm_page_free(m); 3042 } 3043 } 3044 3045 static int 3046 kvm_size(SYSCTL_HANDLER_ARGS) 3047 { 3048 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 3049 3050 return sysctl_handle_long(oidp, &ksize, 0, req); 3051 } 3052 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 3053 0, 0, kvm_size, "LU", "Size of KVM"); 3054 3055 static int 3056 kvm_free(SYSCTL_HANDLER_ARGS) 3057 { 3058 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 3059 3060 return sysctl_handle_long(oidp, &kfree, 0, req); 3061 } 3062 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 3063 0, 0, kvm_free, "LU", "Amount of KVM free"); 3064 3065 /* 3066 * grow the number of kernel page table entries, if needed 3067 */ 3068 void 3069 pmap_growkernel(vm_offset_t addr) 3070 { 3071 vm_paddr_t paddr; 3072 vm_page_t nkpg; 3073 pd_entry_t *pde, newpdir; 3074 pdp_entry_t *pdpe; 3075 3076 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 3077 3078 /* 3079 * Return if "addr" is within the range of kernel page table pages 3080 * that were preallocated during pmap bootstrap. Moreover, leave 3081 * "kernel_vm_end" and the kernel page table as they were. 3082 * 3083 * The correctness of this action is based on the following 3084 * argument: vm_map_insert() allocates contiguous ranges of the 3085 * kernel virtual address space. It calls this function if a range 3086 * ends after "kernel_vm_end". If the kernel is mapped between 3087 * "kernel_vm_end" and "addr", then the range cannot begin at 3088 * "kernel_vm_end". In fact, its beginning address cannot be less 3089 * than the kernel. Thus, there is no immediate need to allocate 3090 * any new kernel page table pages between "kernel_vm_end" and 3091 * "KERNBASE". 
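 *
 * Past that preallocated region, the loop below extends the
 * kernel page tables in NBPDR (2MB) steps: a missing PDP entry is
 * filled with a freshly allocated page directory page first, and
 * a missing PD entry is then filled with a new page table page,
 * with kernel_vm_end advancing after each step and clamped to
 * vm_map_max(kernel_map).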
3092 */ 3093 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 3094 return; 3095 3096 addr = roundup2(addr, NBPDR); 3097 if (addr - 1 >= vm_map_max(kernel_map)) 3098 addr = vm_map_max(kernel_map); 3099 while (kernel_vm_end < addr) { 3100 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 3101 if ((*pdpe & X86_PG_V) == 0) { 3102 /* We need a new PDP entry */ 3103 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 3104 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3105 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3106 if (nkpg == NULL) 3107 panic("pmap_growkernel: no memory to grow kernel"); 3108 if ((nkpg->flags & PG_ZERO) == 0) 3109 pmap_zero_page(nkpg); 3110 paddr = VM_PAGE_TO_PHYS(nkpg); 3111 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 3112 X86_PG_A | X86_PG_M); 3113 continue; /* try again */ 3114 } 3115 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 3116 if ((*pde & X86_PG_V) != 0) { 3117 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 3118 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3119 kernel_vm_end = vm_map_max(kernel_map); 3120 break; 3121 } 3122 continue; 3123 } 3124 3125 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 3126 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3127 VM_ALLOC_ZERO); 3128 if (nkpg == NULL) 3129 panic("pmap_growkernel: no memory to grow kernel"); 3130 if ((nkpg->flags & PG_ZERO) == 0) 3131 pmap_zero_page(nkpg); 3132 paddr = VM_PAGE_TO_PHYS(nkpg); 3133 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 3134 pde_store(pde, newpdir); 3135 3136 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 3137 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3138 kernel_vm_end = vm_map_max(kernel_map); 3139 break; 3140 } 3141 } 3142 } 3143 3144 3145 /*************************************************** 3146 * page management routines. 
3147 ***************************************************/ 3148 3149 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 3150 CTASSERT(_NPCM == 3); 3151 CTASSERT(_NPCPV == 168); 3152 3153 static __inline struct pv_chunk * 3154 pv_to_chunk(pv_entry_t pv) 3155 { 3156 3157 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 3158 } 3159 3160 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 3161 3162 #define PC_FREE0 0xfffffffffffffffful 3163 #define PC_FREE1 0xfffffffffffffffful 3164 #define PC_FREE2 0x000000fffffffffful 3165 3166 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 3167 3168 #ifdef PV_STATS 3169 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 3170 3171 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 3172 "Current number of pv entry chunks"); 3173 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 3174 "Current number of pv entry chunks allocated"); 3175 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 3176 "Current number of pv entry chunks frees"); 3177 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 3178 "Number of times tried to get a chunk page but failed."); 3179 3180 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 3181 static int pv_entry_spare; 3182 3183 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 3184 "Current number of pv entry frees"); 3185 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 3186 "Current number of pv entry allocs"); 3187 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 3188 "Current number of pv entries"); 3189 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3190 "Current number of spare pv entries"); 3191 #endif 3192 3193 static void 3194 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 3195 { 3196 3197 if (pmap == NULL) 3198 return; 3199 pmap_invalidate_all(pmap); 3200 if (pmap != locked_pmap) 3201 PMAP_UNLOCK(pmap); 3202 if (start_di) 3203 pmap_delayed_invl_finished(); 3204 } 3205 3206 /* 3207 * We are in a serious low memory condition. Resort to 3208 * drastic measures to free some pages so we can allocate 3209 * another pv entry chunk. 3210 * 3211 * Returns NULL if PV entries were reclaimed from the specified pmap. 3212 * 3213 * We do not, however, unmap 2mpages because subsequent accesses will 3214 * allocate per-page pv entries until repromotion occurs, thereby 3215 * exacerbating the shortage of free pv entries. 
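 *
 * The scan below brackets its position in the global pc_lru list
 * with two marker chunks (pc_marker and pc_marker_end) so that
 * pv_chunks_mutex can be dropped while an individual pmap is
 * locked and its chunk is picked over.  The scan stops as soon as
 * an entire chunk can be recycled, a page table page has been
 * freed, or at least one pv entry has been freed from
 * locked_pmap.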
3216 */ 3217 static vm_page_t 3218 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3219 { 3220 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 3221 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 3222 struct md_page *pvh; 3223 pd_entry_t *pde; 3224 pmap_t next_pmap, pmap; 3225 pt_entry_t *pte, tpte; 3226 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3227 pv_entry_t pv; 3228 vm_offset_t va; 3229 vm_page_t m, m_pc; 3230 struct spglist free; 3231 uint64_t inuse; 3232 int bit, field, freed; 3233 bool start_di; 3234 static int active_reclaims = 0; 3235 3236 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 3237 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 3238 pmap = NULL; 3239 m_pc = NULL; 3240 PG_G = PG_A = PG_M = PG_RW = 0; 3241 SLIST_INIT(&free); 3242 bzero(&pc_marker_b, sizeof(pc_marker_b)); 3243 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 3244 pc_marker = (struct pv_chunk *)&pc_marker_b; 3245 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 3246 3247 /* 3248 * A delayed invalidation block should already be active if 3249 * pmap_advise() or pmap_remove() called this function by way 3250 * of pmap_demote_pde_locked(). 3251 */ 3252 start_di = pmap_not_in_di(); 3253 3254 mtx_lock(&pv_chunks_mutex); 3255 active_reclaims++; 3256 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 3257 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 3258 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 3259 SLIST_EMPTY(&free)) { 3260 next_pmap = pc->pc_pmap; 3261 if (next_pmap == NULL) { 3262 /* 3263 * The next chunk is a marker. However, it is 3264 * not our marker, so active_reclaims must be 3265 * > 1. Consequently, the next_chunk code 3266 * will not rotate the pv_chunks list. 3267 */ 3268 goto next_chunk; 3269 } 3270 mtx_unlock(&pv_chunks_mutex); 3271 3272 /* 3273 * A pv_chunk can only be removed from the pc_lru list 3274 * when both pv_chunks_mutex is owned and the 3275 * corresponding pmap is locked. 3276 */ 3277 if (pmap != next_pmap) { 3278 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 3279 start_di); 3280 pmap = next_pmap; 3281 /* Avoid deadlock and lock recursion. */ 3282 if (pmap > locked_pmap) { 3283 RELEASE_PV_LIST_LOCK(lockp); 3284 PMAP_LOCK(pmap); 3285 if (start_di) 3286 pmap_delayed_invl_started(); 3287 mtx_lock(&pv_chunks_mutex); 3288 continue; 3289 } else if (pmap != locked_pmap) { 3290 if (PMAP_TRYLOCK(pmap)) { 3291 if (start_di) 3292 pmap_delayed_invl_started(); 3293 mtx_lock(&pv_chunks_mutex); 3294 continue; 3295 } else { 3296 pmap = NULL; /* pmap is not locked */ 3297 mtx_lock(&pv_chunks_mutex); 3298 pc = TAILQ_NEXT(pc_marker, pc_lru); 3299 if (pc == NULL || 3300 pc->pc_pmap != next_pmap) 3301 continue; 3302 goto next_chunk; 3303 } 3304 } else if (start_di) 3305 pmap_delayed_invl_started(); 3306 PG_G = pmap_global_bit(pmap); 3307 PG_A = pmap_accessed_bit(pmap); 3308 PG_M = pmap_modified_bit(pmap); 3309 PG_RW = pmap_rw_bit(pmap); 3310 } 3311 3312 /* 3313 * Destroy every non-wired, 4 KB page mapping in the chunk.
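 *
 * pc_map has a bit set for every free slot, so the expression
 * "~pc->pc_map[field] & pc_freemask[field]" below enumerates the
 * slots currently in use; bsfq() peels them off one bit at a
 * time, and each destroyed mapping sets its bit back in pc_map,
 * turning that pv entry into a spare.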
3314 */ 3315 freed = 0; 3316 for (field = 0; field < _NPCM; field++) { 3317 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 3318 inuse != 0; inuse &= ~(1UL << bit)) { 3319 bit = bsfq(inuse); 3320 pv = &pc->pc_pventry[field * 64 + bit]; 3321 va = pv->pv_va; 3322 pde = pmap_pde(pmap, va); 3323 if ((*pde & PG_PS) != 0) 3324 continue; 3325 pte = pmap_pde_to_pte(pde, va); 3326 if ((*pte & PG_W) != 0) 3327 continue; 3328 tpte = pte_load_clear(pte); 3329 if ((tpte & PG_G) != 0) 3330 pmap_invalidate_page(pmap, va); 3331 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3332 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3333 vm_page_dirty(m); 3334 if ((tpte & PG_A) != 0) 3335 vm_page_aflag_set(m, PGA_REFERENCED); 3336 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3337 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3338 m->md.pv_gen++; 3339 if (TAILQ_EMPTY(&m->md.pv_list) && 3340 (m->flags & PG_FICTITIOUS) == 0) { 3341 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3342 if (TAILQ_EMPTY(&pvh->pv_list)) { 3343 vm_page_aflag_clear(m, 3344 PGA_WRITEABLE); 3345 } 3346 } 3347 pmap_delayed_invl_page(m); 3348 pc->pc_map[field] |= 1UL << bit; 3349 pmap_unuse_pt(pmap, va, *pde, &free); 3350 freed++; 3351 } 3352 } 3353 if (freed == 0) { 3354 mtx_lock(&pv_chunks_mutex); 3355 goto next_chunk; 3356 } 3357 /* Every freed mapping is for a 4 KB page. */ 3358 pmap_resident_count_dec(pmap, freed); 3359 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3360 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3361 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3362 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3363 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 3364 pc->pc_map[2] == PC_FREE2) { 3365 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3366 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3367 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3368 /* Entire chunk is free; return it. */ 3369 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3370 dump_drop_page(m_pc->phys_addr); 3371 mtx_lock(&pv_chunks_mutex); 3372 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3373 break; 3374 } 3375 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3376 mtx_lock(&pv_chunks_mutex); 3377 /* One freed pv entry in locked_pmap is sufficient. */ 3378 if (pmap == locked_pmap) 3379 break; 3380 next_chunk: 3381 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 3382 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 3383 if (active_reclaims == 1 && pmap != NULL) { 3384 /* 3385 * Rotate the pv chunks list so that we do not 3386 * scan the same pv chunks that could not be 3387 * freed (because they contained a wired 3388 * and/or superpage mapping) on every 3389 * invocation of reclaim_pv_chunk(). 3390 */ 3391 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 3392 MPASS(pc->pc_pmap != NULL); 3393 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3394 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3395 } 3396 } 3397 } 3398 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 3399 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 3400 active_reclaims--; 3401 mtx_unlock(&pv_chunks_mutex); 3402 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 3403 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3404 m_pc = SLIST_FIRST(&free); 3405 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3406 /* Recycle a freed page table page. 
*/ 3407 m_pc->wire_count = 1; 3408 } 3409 vm_page_free_pages_toq(&free, true); 3410 return (m_pc); 3411 } 3412 3413 /* 3414 * free the pv_entry back to the free list 3415 */ 3416 static void 3417 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3418 { 3419 struct pv_chunk *pc; 3420 int idx, field, bit; 3421 3422 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3423 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3424 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3425 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3426 pc = pv_to_chunk(pv); 3427 idx = pv - &pc->pc_pventry[0]; 3428 field = idx / 64; 3429 bit = idx % 64; 3430 pc->pc_map[field] |= 1ul << bit; 3431 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 3432 pc->pc_map[2] != PC_FREE2) { 3433 /* 98% of the time, pc is already at the head of the list. */ 3434 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3435 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3436 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3437 } 3438 return; 3439 } 3440 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3441 free_pv_chunk(pc); 3442 } 3443 3444 static void 3445 free_pv_chunk(struct pv_chunk *pc) 3446 { 3447 vm_page_t m; 3448 3449 mtx_lock(&pv_chunks_mutex); 3450 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3451 mtx_unlock(&pv_chunks_mutex); 3452 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3453 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3454 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3455 /* entire chunk is free, return it */ 3456 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3457 dump_drop_page(m->phys_addr); 3458 vm_page_unwire(m, PQ_NONE); 3459 vm_page_free(m); 3460 } 3461 3462 /* 3463 * Returns a new PV entry, allocating a new PV chunk from the system when 3464 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3465 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3466 * returned. 3467 * 3468 * The given PV list lock may be released. 
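 *
 * Each pv_chunk is a single page holding _NPCPV (168) pv entries,
 * tracked by the three 64-bit words of pc_map, with
 * PC_FREE0..PC_FREE2 as the all-free pattern.  The slot
 * arithmetic, as in free_pv_entry() above, is simply:
 *
 *	idx = pv - &pc->pc_pventry[0];
 *	field = idx / 64;
 *	bit = idx % 64;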
3469 */ 3470 static pv_entry_t 3471 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3472 { 3473 int bit, field; 3474 pv_entry_t pv; 3475 struct pv_chunk *pc; 3476 vm_page_t m; 3477 3478 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3479 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3480 retry: 3481 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3482 if (pc != NULL) { 3483 for (field = 0; field < _NPCM; field++) { 3484 if (pc->pc_map[field]) { 3485 bit = bsfq(pc->pc_map[field]); 3486 break; 3487 } 3488 } 3489 if (field < _NPCM) { 3490 pv = &pc->pc_pventry[field * 64 + bit]; 3491 pc->pc_map[field] &= ~(1ul << bit); 3492 /* If this was the last item, move it to tail */ 3493 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 3494 pc->pc_map[2] == 0) { 3495 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3496 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3497 pc_list); 3498 } 3499 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3500 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3501 return (pv); 3502 } 3503 } 3504 /* No free items, allocate another chunk */ 3505 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3506 VM_ALLOC_WIRED); 3507 if (m == NULL) { 3508 if (lockp == NULL) { 3509 PV_STAT(pc_chunk_tryfail++); 3510 return (NULL); 3511 } 3512 m = reclaim_pv_chunk(pmap, lockp); 3513 if (m == NULL) 3514 goto retry; 3515 } 3516 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3517 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3518 dump_add_page(m->phys_addr); 3519 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3520 pc->pc_pmap = pmap; 3521 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 3522 pc->pc_map[1] = PC_FREE1; 3523 pc->pc_map[2] = PC_FREE2; 3524 mtx_lock(&pv_chunks_mutex); 3525 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3526 mtx_unlock(&pv_chunks_mutex); 3527 pv = &pc->pc_pventry[0]; 3528 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3529 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3530 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3531 return (pv); 3532 } 3533 3534 /* 3535 * Returns the number of one bits within the given PV chunk map. 3536 * 3537 * The erratas for Intel processors state that "POPCNT Instruction May 3538 * Take Longer to Execute Than Expected". It is believed that the 3539 * issue is the spurious dependency on the destination register. 3540 * Provide a hint to the register rename logic that the destination 3541 * value is overwritten, by clearing it, as suggested in the 3542 * optimization manual. It should be cheap for unaffected processors 3543 * as well. 3544 * 3545 * Reference numbers for erratas are 3546 * 4th Gen Core: HSD146 3547 * 5th Gen Core: BDM85 3548 * 6th Gen Core: SKL029 3549 */ 3550 static int 3551 popcnt_pc_map_pq(uint64_t *map) 3552 { 3553 u_long result, tmp; 3554 3555 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 3556 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 3557 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 3558 : "=&r" (result), "=&r" (tmp) 3559 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 3560 return (result); 3561 } 3562 3563 /* 3564 * Ensure that the number of spare PV entries in the specified pmap meets or 3565 * exceeds the given count, "needed". 3566 * 3567 * The given PV list lock may be released. 
3568 */ 3569 static void 3570 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3571 { 3572 struct pch new_tail; 3573 struct pv_chunk *pc; 3574 vm_page_t m; 3575 int avail, free; 3576 bool reclaimed; 3577 3578 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3579 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3580 3581 /* 3582 * Newly allocated PV chunks must be stored in a private list until 3583 * the required number of PV chunks have been allocated. Otherwise, 3584 * reclaim_pv_chunk() could recycle one of these chunks. In 3585 * contrast, these chunks must be added to the pmap upon allocation. 3586 */ 3587 TAILQ_INIT(&new_tail); 3588 retry: 3589 avail = 0; 3590 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3591 #ifndef __POPCNT__ 3592 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 3593 bit_count((bitstr_t *)pc->pc_map, 0, 3594 sizeof(pc->pc_map) * NBBY, &free); 3595 else 3596 #endif 3597 free = popcnt_pc_map_pq(pc->pc_map); 3598 if (free == 0) 3599 break; 3600 avail += free; 3601 if (avail >= needed) 3602 break; 3603 } 3604 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3605 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3606 VM_ALLOC_WIRED); 3607 if (m == NULL) { 3608 m = reclaim_pv_chunk(pmap, lockp); 3609 if (m == NULL) 3610 goto retry; 3611 reclaimed = true; 3612 } 3613 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3614 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3615 dump_add_page(m->phys_addr); 3616 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3617 pc->pc_pmap = pmap; 3618 pc->pc_map[0] = PC_FREE0; 3619 pc->pc_map[1] = PC_FREE1; 3620 pc->pc_map[2] = PC_FREE2; 3621 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3622 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3623 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3624 3625 /* 3626 * The reclaim might have freed a chunk from the current pmap. 3627 * If that chunk contained available entries, we need to 3628 * re-count the number of available entries. 3629 */ 3630 if (reclaimed) 3631 goto retry; 3632 } 3633 if (!TAILQ_EMPTY(&new_tail)) { 3634 mtx_lock(&pv_chunks_mutex); 3635 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3636 mtx_unlock(&pv_chunks_mutex); 3637 } 3638 } 3639 3640 /* 3641 * First find and then remove the pv entry for the specified pmap and virtual 3642 * address from the specified pv list. Returns the pv entry if found and NULL 3643 * otherwise. This operation can be performed on pv lists for either 4KB or 3644 * 2MB page mappings. 3645 */ 3646 static __inline pv_entry_t 3647 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3648 { 3649 pv_entry_t pv; 3650 3651 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3652 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3653 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3654 pvh->pv_gen++; 3655 break; 3656 } 3657 } 3658 return (pv); 3659 } 3660 3661 /* 3662 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3663 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3664 * entries for each of the 4KB page mappings. 
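 *
 * The NPTEPG - 1 additional pv entries must already be available
 * as spares in the pmap's chunk list; the loop below never calls
 * get_pv_entry() and asserts on a missing spare rather than
 * allocating a new chunk.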
3665 */ 3666 static void 3667 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3668 struct rwlock **lockp) 3669 { 3670 struct md_page *pvh; 3671 struct pv_chunk *pc; 3672 pv_entry_t pv; 3673 vm_offset_t va_last; 3674 vm_page_t m; 3675 int bit, field; 3676 3677 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3678 KASSERT((pa & PDRMASK) == 0, 3679 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3680 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3681 3682 /* 3683 * Transfer the 2mpage's pv entry for this mapping to the first 3684 * page's pv list. Once this transfer begins, the pv list lock 3685 * must not be released until the last pv entry is reinstantiated. 3686 */ 3687 pvh = pa_to_pvh(pa); 3688 va = trunc_2mpage(va); 3689 pv = pmap_pvh_remove(pvh, pmap, va); 3690 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3691 m = PHYS_TO_VM_PAGE(pa); 3692 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3693 m->md.pv_gen++; 3694 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 3695 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3696 va_last = va + NBPDR - PAGE_SIZE; 3697 for (;;) { 3698 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3699 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3700 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3701 for (field = 0; field < _NPCM; field++) { 3702 while (pc->pc_map[field]) { 3703 bit = bsfq(pc->pc_map[field]); 3704 pc->pc_map[field] &= ~(1ul << bit); 3705 pv = &pc->pc_pventry[field * 64 + bit]; 3706 va += PAGE_SIZE; 3707 pv->pv_va = va; 3708 m++; 3709 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3710 ("pmap_pv_demote_pde: page %p is not managed", m)); 3711 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3712 m->md.pv_gen++; 3713 if (va == va_last) 3714 goto out; 3715 } 3716 } 3717 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3718 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3719 } 3720 out: 3721 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3722 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3723 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3724 } 3725 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3726 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3727 } 3728 3729 #if VM_NRESERVLEVEL > 0 3730 /* 3731 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3732 * replace the many pv entries for the 4KB page mappings by a single pv entry 3733 * for the 2MB page mapping. 3734 */ 3735 static void 3736 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3737 struct rwlock **lockp) 3738 { 3739 struct md_page *pvh; 3740 pv_entry_t pv; 3741 vm_offset_t va_last; 3742 vm_page_t m; 3743 3744 KASSERT((pa & PDRMASK) == 0, 3745 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3746 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3747 3748 /* 3749 * Transfer the first page's pv entry for this mapping to the 2mpage's 3750 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3751 * a transfer avoids the possibility that get_pv_entry() calls 3752 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3753 * mappings that is being promoted. 3754 */ 3755 m = PHYS_TO_VM_PAGE(pa); 3756 va = trunc_2mpage(va); 3757 pv = pmap_pvh_remove(&m->md, pmap, va); 3758 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3759 pvh = pa_to_pvh(pa); 3760 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3761 pvh->pv_gen++; 3762 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 3763 va_last = va + NBPDR - PAGE_SIZE; 3764 do { 3765 m++; 3766 va += PAGE_SIZE; 3767 pmap_pvh_free(&m->md, pmap, va); 3768 } while (va < va_last); 3769 } 3770 #endif /* VM_NRESERVLEVEL > 0 */ 3771 3772 /* 3773 * First find and then destroy the pv entry for the specified pmap and virtual 3774 * address. This operation can be performed on pv lists for either 4KB or 2MB 3775 * page mappings. 3776 */ 3777 static void 3778 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3779 { 3780 pv_entry_t pv; 3781 3782 pv = pmap_pvh_remove(pvh, pmap, va); 3783 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3784 free_pv_entry(pmap, pv); 3785 } 3786 3787 /* 3788 * Conditionally create the PV entry for a 4KB page mapping if the required 3789 * memory can be allocated without resorting to reclamation. 3790 */ 3791 static boolean_t 3792 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3793 struct rwlock **lockp) 3794 { 3795 pv_entry_t pv; 3796 3797 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3798 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3799 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3800 pv->pv_va = va; 3801 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3802 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3803 m->md.pv_gen++; 3804 return (TRUE); 3805 } else 3806 return (FALSE); 3807 } 3808 3809 /* 3810 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3811 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3812 * false if the PV entry cannot be allocated without resorting to reclamation. 3813 */ 3814 static bool 3815 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 3816 struct rwlock **lockp) 3817 { 3818 struct md_page *pvh; 3819 pv_entry_t pv; 3820 vm_paddr_t pa; 3821 3822 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3823 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3824 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 3825 NULL : lockp)) == NULL) 3826 return (false); 3827 pv->pv_va = va; 3828 pa = pde & PG_PS_FRAME; 3829 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3830 pvh = pa_to_pvh(pa); 3831 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3832 pvh->pv_gen++; 3833 return (true); 3834 } 3835 3836 /* 3837 * Fills a page table page with mappings to consecutive physical pages. 3838 */ 3839 static void 3840 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 3841 { 3842 pt_entry_t *pte; 3843 3844 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 3845 *pte = newpte; 3846 newpte += PAGE_SIZE; 3847 } 3848 } 3849 3850 /* 3851 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 3852 * mapping is invalidated. 
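 *
 * As an illustrative sketch (mirroring the use in pmap_protect()), a caller
 * that encounters a superpage typically does:
 *
 *	if ((*pde & PG_PS) != 0 && !pmap_demote_pde(pmap, pde, sva))
 *		continue;
 *
 * where the "continue" covers the case in which the 2MB mapping was
 * destroyed, and the caller otherwise falls through to the 4KB PTEs.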
3853 */ 3854 static boolean_t 3855 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3856 { 3857 struct rwlock *lock; 3858 boolean_t rv; 3859 3860 lock = NULL; 3861 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 3862 if (lock != NULL) 3863 rw_wunlock(lock); 3864 return (rv); 3865 } 3866 3867 static boolean_t 3868 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3869 struct rwlock **lockp) 3870 { 3871 pd_entry_t newpde, oldpde; 3872 pt_entry_t *firstpte, newpte; 3873 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; 3874 vm_paddr_t mptepa; 3875 vm_page_t mpte; 3876 struct spglist free; 3877 vm_offset_t sva; 3878 int PG_PTE_CACHE; 3879 3880 PG_G = pmap_global_bit(pmap); 3881 PG_A = pmap_accessed_bit(pmap); 3882 PG_M = pmap_modified_bit(pmap); 3883 PG_RW = pmap_rw_bit(pmap); 3884 PG_V = pmap_valid_bit(pmap); 3885 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 3886 3887 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3888 oldpde = *pde; 3889 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 3890 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 3891 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 3892 NULL) { 3893 KASSERT((oldpde & PG_W) == 0, 3894 ("pmap_demote_pde: page table page for a wired mapping" 3895 " is missing")); 3896 3897 /* 3898 * Invalidate the 2MB page mapping and return "failure" if the 3899 * mapping was never accessed or the allocation of the new 3900 * page table page fails. If the 2MB page mapping belongs to 3901 * the direct map region of the kernel's address space, then 3902 * the page allocation request specifies the highest possible 3903 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 3904 * normal. Page table pages are preallocated for every other 3905 * part of the kernel address space, so the direct map region 3906 * is the only part of the kernel address space that must be 3907 * handled here. 3908 */ 3909 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 3910 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 3911 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 3912 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3913 SLIST_INIT(&free); 3914 sva = trunc_2mpage(va); 3915 pmap_remove_pde(pmap, pde, sva, &free, lockp); 3916 if ((oldpde & PG_G) == 0) 3917 pmap_invalidate_pde_page(pmap, sva, oldpde); 3918 vm_page_free_pages_toq(&free, true); 3919 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 3920 " in pmap %p", va, pmap); 3921 return (FALSE); 3922 } 3923 if (va < VM_MAXUSER_ADDRESS) 3924 pmap_resident_count_inc(pmap, 1); 3925 } 3926 mptepa = VM_PAGE_TO_PHYS(mpte); 3927 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 3928 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 3929 KASSERT((oldpde & PG_A) != 0, 3930 ("pmap_demote_pde: oldpde is missing PG_A")); 3931 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 3932 ("pmap_demote_pde: oldpde is missing PG_M")); 3933 newpte = oldpde & ~PG_PS; 3934 newpte = pmap_swap_pat(pmap, newpte); 3935 3936 /* 3937 * If the page table page is new, initialize it. 3938 */ 3939 if (mpte->wire_count == 1) { 3940 mpte->wire_count = NPTEPG; 3941 pmap_fill_ptp(firstpte, newpte); 3942 } 3943 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 3944 ("pmap_demote_pde: firstpte and newpte map different physical" 3945 " addresses")); 3946 3947 /* 3948 * If the mapping has changed attributes, update the page table 3949 * entries. 
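 *
 * (PG_PTE_PROMOTE is the set of attribute bits, such as protection and
 * cache mode, that must be identical in every PTE for promotion; a
 * mismatch here means that the attributes changed while the page table
 * page was preserved, so the page table page is refilled from "newpte".)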
3950 */ 3951 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 3952 pmap_fill_ptp(firstpte, newpte); 3953 3954 /* 3955 * The spare PV entries must be reserved prior to demoting the 3956 * mapping, that is, prior to changing the PDE. Otherwise, the state 3957 * of the PDE and the PV lists will be inconsistent, which can result 3958 * in reclaim_pv_chunk() attempting to remove a PV entry from the 3959 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 3960 * PV entry for the 2MB page mapping that is being demoted. 3961 */ 3962 if ((oldpde & PG_MANAGED) != 0) 3963 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 3964 3965 /* 3966 * Demote the mapping. This pmap is locked. The old PDE has 3967 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 3968 * set. Thus, there is no danger of a race with another 3969 * processor changing the setting of PG_A and/or PG_M between 3970 * the read above and the store below. 3971 */ 3972 if (workaround_erratum383) 3973 pmap_update_pde(pmap, va, pde, newpde); 3974 else 3975 pde_store(pde, newpde); 3976 3977 /* 3978 * Invalidate a stale recursive mapping of the page table page. 3979 */ 3980 if (va >= VM_MAXUSER_ADDRESS) 3981 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3982 3983 /* 3984 * Demote the PV entry. 3985 */ 3986 if ((oldpde & PG_MANAGED) != 0) 3987 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 3988 3989 atomic_add_long(&pmap_pde_demotions, 1); 3990 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 3991 " in pmap %p", va, pmap); 3992 return (TRUE); 3993 } 3994 3995 /* 3996 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 3997 */ 3998 static void 3999 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 4000 { 4001 pd_entry_t newpde; 4002 vm_paddr_t mptepa; 4003 vm_page_t mpte; 4004 4005 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4006 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4007 mpte = pmap_remove_pt_page(pmap, va); 4008 if (mpte == NULL) 4009 panic("pmap_remove_kernel_pde: Missing pt page."); 4010 4011 mptepa = VM_PAGE_TO_PHYS(mpte); 4012 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 4013 4014 /* 4015 * Initialize the page table page. 4016 */ 4017 pagezero((void *)PHYS_TO_DMAP(mptepa)); 4018 4019 /* 4020 * Demote the mapping. 4021 */ 4022 if (workaround_erratum383) 4023 pmap_update_pde(pmap, va, pde, newpde); 4024 else 4025 pde_store(pde, newpde); 4026 4027 /* 4028 * Invalidate a stale recursive mapping of the page table page. 
4029 */ 4030 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 4031 } 4032 4033 /* 4034 * pmap_remove_pde: do the things to unmap a superpage in a process 4035 */ 4036 static int 4037 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 4038 struct spglist *free, struct rwlock **lockp) 4039 { 4040 struct md_page *pvh; 4041 pd_entry_t oldpde; 4042 vm_offset_t eva, va; 4043 vm_page_t m, mpte; 4044 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 4045 4046 PG_G = pmap_global_bit(pmap); 4047 PG_A = pmap_accessed_bit(pmap); 4048 PG_M = pmap_modified_bit(pmap); 4049 PG_RW = pmap_rw_bit(pmap); 4050 4051 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4052 KASSERT((sva & PDRMASK) == 0, 4053 ("pmap_remove_pde: sva is not 2mpage aligned")); 4054 oldpde = pte_load_clear(pdq); 4055 if (oldpde & PG_W) 4056 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 4057 if ((oldpde & PG_G) != 0) 4058 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4059 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 4060 if (oldpde & PG_MANAGED) { 4061 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 4062 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 4063 pmap_pvh_free(pvh, pmap, sva); 4064 eva = sva + NBPDR; 4065 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4066 va < eva; va += PAGE_SIZE, m++) { 4067 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4068 vm_page_dirty(m); 4069 if (oldpde & PG_A) 4070 vm_page_aflag_set(m, PGA_REFERENCED); 4071 if (TAILQ_EMPTY(&m->md.pv_list) && 4072 TAILQ_EMPTY(&pvh->pv_list)) 4073 vm_page_aflag_clear(m, PGA_WRITEABLE); 4074 pmap_delayed_invl_page(m); 4075 } 4076 } 4077 if (pmap == kernel_pmap) { 4078 pmap_remove_kernel_pde(pmap, pdq, sva); 4079 } else { 4080 mpte = pmap_remove_pt_page(pmap, sva); 4081 if (mpte != NULL) { 4082 pmap_resident_count_dec(pmap, 1); 4083 KASSERT(mpte->wire_count == NPTEPG, 4084 ("pmap_remove_pde: pte page wire count error")); 4085 mpte->wire_count = 0; 4086 pmap_add_delayed_free_list(mpte, free, FALSE); 4087 } 4088 } 4089 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 4090 } 4091 4092 /* 4093 * pmap_remove_pte: do the things to unmap a page in a process 4094 */ 4095 static int 4096 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 4097 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 4098 { 4099 struct md_page *pvh; 4100 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 4101 vm_page_t m; 4102 4103 PG_A = pmap_accessed_bit(pmap); 4104 PG_M = pmap_modified_bit(pmap); 4105 PG_RW = pmap_rw_bit(pmap); 4106 4107 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4108 oldpte = pte_load_clear(ptq); 4109 if (oldpte & PG_W) 4110 pmap->pm_stats.wired_count -= 1; 4111 pmap_resident_count_dec(pmap, 1); 4112 if (oldpte & PG_MANAGED) { 4113 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 4114 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4115 vm_page_dirty(m); 4116 if (oldpte & PG_A) 4117 vm_page_aflag_set(m, PGA_REFERENCED); 4118 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4119 pmap_pvh_free(&m->md, pmap, va); 4120 if (TAILQ_EMPTY(&m->md.pv_list) && 4121 (m->flags & PG_FICTITIOUS) == 0) { 4122 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4123 if (TAILQ_EMPTY(&pvh->pv_list)) 4124 vm_page_aflag_clear(m, PGA_WRITEABLE); 4125 } 4126 pmap_delayed_invl_page(m); 4127 } 4128 return (pmap_unuse_pt(pmap, va, ptepde, free)); 4129 } 4130 4131 /* 4132 * Remove a single page from a process address space 4133 */ 4134 static void 4135 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 4136 struct spglist *free) 4137 { 4138 struct rwlock *lock; 4139 pt_entry_t *pte, PG_V; 4140 4141 
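	/*
	 * The caller holds the pmap lock and has already verified that the
	 * PDE does not map a 2MB page (see pmap_remove()).  If either the
	 * PDE or the PTE is invalid, there is nothing to remove; otherwise
	 * the 4KB mapping is torn down and the TLB entry for "va" is
	 * invalidated.
	 */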
PG_V = pmap_valid_bit(pmap); 4142 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4143 if ((*pde & PG_V) == 0) 4144 return; 4145 pte = pmap_pde_to_pte(pde, va); 4146 if ((*pte & PG_V) == 0) 4147 return; 4148 lock = NULL; 4149 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 4150 if (lock != NULL) 4151 rw_wunlock(lock); 4152 pmap_invalidate_page(pmap, va); 4153 } 4154 4155 /* 4156 * Removes the specified range of addresses from the page table page. 4157 */ 4158 static bool 4159 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 4160 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 4161 { 4162 pt_entry_t PG_G, *pte; 4163 vm_offset_t va; 4164 bool anyvalid; 4165 4166 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4167 PG_G = pmap_global_bit(pmap); 4168 anyvalid = false; 4169 va = eva; 4170 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 4171 sva += PAGE_SIZE) { 4172 if (*pte == 0) { 4173 if (va != eva) { 4174 pmap_invalidate_range(pmap, va, sva); 4175 va = eva; 4176 } 4177 continue; 4178 } 4179 if ((*pte & PG_G) == 0) 4180 anyvalid = true; 4181 else if (va == eva) 4182 va = sva; 4183 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 4184 sva += PAGE_SIZE; 4185 break; 4186 } 4187 } 4188 if (va != eva) 4189 pmap_invalidate_range(pmap, va, sva); 4190 return (anyvalid); 4191 } 4192 4193 /* 4194 * Remove the given range of addresses from the specified map. 4195 * 4196 * It is assumed that the start and end are properly 4197 * rounded to the page size. 4198 */ 4199 void 4200 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4201 { 4202 struct rwlock *lock; 4203 vm_offset_t va_next; 4204 pml4_entry_t *pml4e; 4205 pdp_entry_t *pdpe; 4206 pd_entry_t ptpaddr, *pde; 4207 pt_entry_t PG_G, PG_V; 4208 struct spglist free; 4209 int anyvalid; 4210 4211 PG_G = pmap_global_bit(pmap); 4212 PG_V = pmap_valid_bit(pmap); 4213 4214 /* 4215 * Perform an unsynchronized read. This is, however, safe. 4216 */ 4217 if (pmap->pm_stats.resident_count == 0) 4218 return; 4219 4220 anyvalid = 0; 4221 SLIST_INIT(&free); 4222 4223 pmap_delayed_invl_started(); 4224 PMAP_LOCK(pmap); 4225 4226 /* 4227 * special handling of removing one page. a very 4228 * common operation and easy to short circuit some 4229 * code. 4230 */ 4231 if (sva + PAGE_SIZE == eva) { 4232 pde = pmap_pde(pmap, sva); 4233 if (pde && (*pde & PG_PS) == 0) { 4234 pmap_remove_page(pmap, sva, pde, &free); 4235 goto out; 4236 } 4237 } 4238 4239 lock = NULL; 4240 for (; sva < eva; sva = va_next) { 4241 4242 if (pmap->pm_stats.resident_count == 0) 4243 break; 4244 4245 pml4e = pmap_pml4e(pmap, sva); 4246 if ((*pml4e & PG_V) == 0) { 4247 va_next = (sva + NBPML4) & ~PML4MASK; 4248 if (va_next < sva) 4249 va_next = eva; 4250 continue; 4251 } 4252 4253 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4254 if ((*pdpe & PG_V) == 0) { 4255 va_next = (sva + NBPDP) & ~PDPMASK; 4256 if (va_next < sva) 4257 va_next = eva; 4258 continue; 4259 } 4260 4261 /* 4262 * Calculate index for next page table. 4263 */ 4264 va_next = (sva + NBPDR) & ~PDRMASK; 4265 if (va_next < sva) 4266 va_next = eva; 4267 4268 pde = pmap_pdpe_to_pde(pdpe, sva); 4269 ptpaddr = *pde; 4270 4271 /* 4272 * Weed out invalid mappings. 4273 */ 4274 if (ptpaddr == 0) 4275 continue; 4276 4277 /* 4278 * Check for large page. 4279 */ 4280 if ((ptpaddr & PG_PS) != 0) { 4281 /* 4282 * Are we removing the entire large page? If not, 4283 * demote the mapping and fall through. 
4284 */ 4285 if (sva + NBPDR == va_next && eva >= va_next) { 4286 /* 4287 * The TLB entry for a PG_G mapping is 4288 * invalidated by pmap_remove_pde(). 4289 */ 4290 if ((ptpaddr & PG_G) == 0) 4291 anyvalid = 1; 4292 pmap_remove_pde(pmap, pde, sva, &free, &lock); 4293 continue; 4294 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 4295 &lock)) { 4296 /* The large page mapping was destroyed. */ 4297 continue; 4298 } else 4299 ptpaddr = *pde; 4300 } 4301 4302 /* 4303 * Limit our scan to either the end of the va represented 4304 * by the current page table page, or to the end of the 4305 * range being removed. 4306 */ 4307 if (va_next > eva) 4308 va_next = eva; 4309 4310 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 4311 anyvalid = 1; 4312 } 4313 if (lock != NULL) 4314 rw_wunlock(lock); 4315 out: 4316 if (anyvalid) 4317 pmap_invalidate_all(pmap); 4318 PMAP_UNLOCK(pmap); 4319 pmap_delayed_invl_finished(); 4320 vm_page_free_pages_toq(&free, true); 4321 } 4322 4323 /* 4324 * Routine: pmap_remove_all 4325 * Function: 4326 * Removes this physical page from 4327 * all physical maps in which it resides. 4328 * Reflects back modify bits to the pager. 4329 * 4330 * Notes: 4331 * Original versions of this routine were very 4332 * inefficient because they iteratively called 4333 * pmap_remove (slow...) 4334 */ 4335 4336 void 4337 pmap_remove_all(vm_page_t m) 4338 { 4339 struct md_page *pvh; 4340 pv_entry_t pv; 4341 pmap_t pmap; 4342 struct rwlock *lock; 4343 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 4344 pd_entry_t *pde; 4345 vm_offset_t va; 4346 struct spglist free; 4347 int pvh_gen, md_gen; 4348 4349 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4350 ("pmap_remove_all: page %p is not managed", m)); 4351 SLIST_INIT(&free); 4352 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4353 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4354 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4355 retry: 4356 rw_wlock(lock); 4357 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4358 pmap = PV_PMAP(pv); 4359 if (!PMAP_TRYLOCK(pmap)) { 4360 pvh_gen = pvh->pv_gen; 4361 rw_wunlock(lock); 4362 PMAP_LOCK(pmap); 4363 rw_wlock(lock); 4364 if (pvh_gen != pvh->pv_gen) { 4365 rw_wunlock(lock); 4366 PMAP_UNLOCK(pmap); 4367 goto retry; 4368 } 4369 } 4370 va = pv->pv_va; 4371 pde = pmap_pde(pmap, va); 4372 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 4373 PMAP_UNLOCK(pmap); 4374 } 4375 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4376 pmap = PV_PMAP(pv); 4377 if (!PMAP_TRYLOCK(pmap)) { 4378 pvh_gen = pvh->pv_gen; 4379 md_gen = m->md.pv_gen; 4380 rw_wunlock(lock); 4381 PMAP_LOCK(pmap); 4382 rw_wlock(lock); 4383 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4384 rw_wunlock(lock); 4385 PMAP_UNLOCK(pmap); 4386 goto retry; 4387 } 4388 } 4389 PG_A = pmap_accessed_bit(pmap); 4390 PG_M = pmap_modified_bit(pmap); 4391 PG_RW = pmap_rw_bit(pmap); 4392 pmap_resident_count_dec(pmap, 1); 4393 pde = pmap_pde(pmap, pv->pv_va); 4394 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 4395 " a 2mpage in page %p's pv list", m)); 4396 pte = pmap_pde_to_pte(pde, pv->pv_va); 4397 tpte = pte_load_clear(pte); 4398 if (tpte & PG_W) 4399 pmap->pm_stats.wired_count--; 4400 if (tpte & PG_A) 4401 vm_page_aflag_set(m, PGA_REFERENCED); 4402 4403 /* 4404 * Update the vm_page_t clean and reference bits. 
4405 */ 4406 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4407 vm_page_dirty(m); 4408 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 4409 pmap_invalidate_page(pmap, pv->pv_va); 4410 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4411 m->md.pv_gen++; 4412 free_pv_entry(pmap, pv); 4413 PMAP_UNLOCK(pmap); 4414 } 4415 vm_page_aflag_clear(m, PGA_WRITEABLE); 4416 rw_wunlock(lock); 4417 pmap_delayed_invl_wait(m); 4418 vm_page_free_pages_toq(&free, true); 4419 } 4420 4421 /* 4422 * pmap_protect_pde: do the things to protect a 2mpage in a process 4423 */ 4424 static boolean_t 4425 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 4426 { 4427 pd_entry_t newpde, oldpde; 4428 vm_offset_t eva, va; 4429 vm_page_t m; 4430 boolean_t anychanged; 4431 pt_entry_t PG_G, PG_M, PG_RW; 4432 4433 PG_G = pmap_global_bit(pmap); 4434 PG_M = pmap_modified_bit(pmap); 4435 PG_RW = pmap_rw_bit(pmap); 4436 4437 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4438 KASSERT((sva & PDRMASK) == 0, 4439 ("pmap_protect_pde: sva is not 2mpage aligned")); 4440 anychanged = FALSE; 4441 retry: 4442 oldpde = newpde = *pde; 4443 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4444 (PG_MANAGED | PG_M | PG_RW)) { 4445 eva = sva + NBPDR; 4446 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4447 va < eva; va += PAGE_SIZE, m++) 4448 vm_page_dirty(m); 4449 } 4450 if ((prot & VM_PROT_WRITE) == 0) 4451 newpde &= ~(PG_RW | PG_M); 4452 if ((prot & VM_PROT_EXECUTE) == 0) 4453 newpde |= pg_nx; 4454 if (newpde != oldpde) { 4455 /* 4456 * As an optimization to future operations on this PDE, clear 4457 * PG_PROMOTED. The impending invalidation will remove any 4458 * lingering 4KB page mappings from the TLB. 4459 */ 4460 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 4461 goto retry; 4462 if ((oldpde & PG_G) != 0) 4463 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4464 else 4465 anychanged = TRUE; 4466 } 4467 return (anychanged); 4468 } 4469 4470 /* 4471 * Set the physical protection on the 4472 * specified range of this map as requested. 4473 */ 4474 void 4475 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4476 { 4477 vm_offset_t va_next; 4478 pml4_entry_t *pml4e; 4479 pdp_entry_t *pdpe; 4480 pd_entry_t ptpaddr, *pde; 4481 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 4482 boolean_t anychanged; 4483 4484 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4485 if (prot == VM_PROT_NONE) { 4486 pmap_remove(pmap, sva, eva); 4487 return; 4488 } 4489 4490 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4491 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4492 return; 4493 4494 PG_G = pmap_global_bit(pmap); 4495 PG_M = pmap_modified_bit(pmap); 4496 PG_V = pmap_valid_bit(pmap); 4497 PG_RW = pmap_rw_bit(pmap); 4498 anychanged = FALSE; 4499 4500 /* 4501 * Although this function delays and batches the invalidation 4502 * of stale TLB entries, it does not need to call 4503 * pmap_delayed_invl_started() and 4504 * pmap_delayed_invl_finished(), because it does not 4505 * ordinarily destroy mappings. Stale TLB entries from 4506 * protection-only changes need only be invalidated before the 4507 * pmap lock is released, because protection-only changes do 4508 * not destroy PV entries. Even operations that iterate over 4509 * a physical page's PV list of mappings, like 4510 * pmap_remove_write(), acquire the pmap lock for each 4511 * mapping. Consequently, for protection-only changes, the 4512 * pmap lock suffices to synchronize both page table and TLB 4513 * updates. 
4514 * 4515 * This function only destroys a mapping if pmap_demote_pde() 4516 * fails. In that case, stale TLB entries are immediately 4517 * invalidated. 4518 */ 4519 4520 PMAP_LOCK(pmap); 4521 for (; sva < eva; sva = va_next) { 4522 4523 pml4e = pmap_pml4e(pmap, sva); 4524 if ((*pml4e & PG_V) == 0) { 4525 va_next = (sva + NBPML4) & ~PML4MASK; 4526 if (va_next < sva) 4527 va_next = eva; 4528 continue; 4529 } 4530 4531 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4532 if ((*pdpe & PG_V) == 0) { 4533 va_next = (sva + NBPDP) & ~PDPMASK; 4534 if (va_next < sva) 4535 va_next = eva; 4536 continue; 4537 } 4538 4539 va_next = (sva + NBPDR) & ~PDRMASK; 4540 if (va_next < sva) 4541 va_next = eva; 4542 4543 pde = pmap_pdpe_to_pde(pdpe, sva); 4544 ptpaddr = *pde; 4545 4546 /* 4547 * Weed out invalid mappings. 4548 */ 4549 if (ptpaddr == 0) 4550 continue; 4551 4552 /* 4553 * Check for large page. 4554 */ 4555 if ((ptpaddr & PG_PS) != 0) { 4556 /* 4557 * Are we protecting the entire large page? If not, 4558 * demote the mapping and fall through. 4559 */ 4560 if (sva + NBPDR == va_next && eva >= va_next) { 4561 /* 4562 * The TLB entry for a PG_G mapping is 4563 * invalidated by pmap_protect_pde(). 4564 */ 4565 if (pmap_protect_pde(pmap, pde, sva, prot)) 4566 anychanged = TRUE; 4567 continue; 4568 } else if (!pmap_demote_pde(pmap, pde, sva)) { 4569 /* 4570 * The large page mapping was destroyed. 4571 */ 4572 continue; 4573 } 4574 } 4575 4576 if (va_next > eva) 4577 va_next = eva; 4578 4579 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4580 sva += PAGE_SIZE) { 4581 pt_entry_t obits, pbits; 4582 vm_page_t m; 4583 4584 retry: 4585 obits = pbits = *pte; 4586 if ((pbits & PG_V) == 0) 4587 continue; 4588 4589 if ((prot & VM_PROT_WRITE) == 0) { 4590 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4591 (PG_MANAGED | PG_M | PG_RW)) { 4592 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4593 vm_page_dirty(m); 4594 } 4595 pbits &= ~(PG_RW | PG_M); 4596 } 4597 if ((prot & VM_PROT_EXECUTE) == 0) 4598 pbits |= pg_nx; 4599 4600 if (pbits != obits) { 4601 if (!atomic_cmpset_long(pte, obits, pbits)) 4602 goto retry; 4603 if (obits & PG_G) 4604 pmap_invalidate_page(pmap, sva); 4605 else 4606 anychanged = TRUE; 4607 } 4608 } 4609 } 4610 if (anychanged) 4611 pmap_invalidate_all(pmap); 4612 PMAP_UNLOCK(pmap); 4613 } 4614 4615 #if VM_NRESERVLEVEL > 0 4616 /* 4617 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4618 * single page table page (PTP) to a single 2MB page mapping. For promotion 4619 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4620 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4621 * identical characteristics. 4622 */ 4623 static void 4624 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4625 struct rwlock **lockp) 4626 { 4627 pd_entry_t newpde; 4628 pt_entry_t *firstpte, oldpte, pa, *pte; 4629 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; 4630 vm_page_t mpte; 4631 int PG_PTE_CACHE; 4632 4633 PG_A = pmap_accessed_bit(pmap); 4634 PG_G = pmap_global_bit(pmap); 4635 PG_M = pmap_modified_bit(pmap); 4636 PG_V = pmap_valid_bit(pmap); 4637 PG_RW = pmap_rw_bit(pmap); 4638 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4639 4640 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4641 4642 /* 4643 * Examine the first PTE in the specified PTP. Abort if this PTE is 4644 * either invalid, unused, or does not map the first 4KB physical page 4645 * within a 2MB page. 
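 *
 * The test below encodes all three conditions in one comparison:
 * (PG_FRAME & PDRMASK) selects the physical-address bits that give the
 * 4KB frame's offset within a 2MB frame, which must all be zero, and
 * PG_A and PG_V must both be set.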
4646 */ 4647 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4648 setpde: 4649 newpde = *firstpte; 4650 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4651 atomic_add_long(&pmap_pde_p_failures, 1); 4652 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4653 " in pmap %p", va, pmap); 4654 return; 4655 } 4656 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4657 /* 4658 * When PG_M is already clear, PG_RW can be cleared without 4659 * a TLB invalidation. 4660 */ 4661 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4662 goto setpde; 4663 newpde &= ~PG_RW; 4664 } 4665 4666 /* 4667 * Examine each of the other PTEs in the specified PTP. Abort if this 4668 * PTE maps an unexpected 4KB physical page or does not have identical 4669 * characteristics to the first PTE. 4670 */ 4671 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4672 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4673 setpte: 4674 oldpte = *pte; 4675 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4676 atomic_add_long(&pmap_pde_p_failures, 1); 4677 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4678 " in pmap %p", va, pmap); 4679 return; 4680 } 4681 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4682 /* 4683 * When PG_M is already clear, PG_RW can be cleared 4684 * without a TLB invalidation. 4685 */ 4686 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4687 goto setpte; 4688 oldpte &= ~PG_RW; 4689 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4690 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 4691 (va & ~PDRMASK), pmap); 4692 } 4693 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4694 atomic_add_long(&pmap_pde_p_failures, 1); 4695 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4696 " in pmap %p", va, pmap); 4697 return; 4698 } 4699 pa -= PAGE_SIZE; 4700 } 4701 4702 /* 4703 * Save the page table page in its current state until the PDE 4704 * mapping the superpage is demoted by pmap_demote_pde() or 4705 * destroyed by pmap_remove_pde(). 4706 */ 4707 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4708 KASSERT(mpte >= vm_page_array && 4709 mpte < &vm_page_array[vm_page_array_size], 4710 ("pmap_promote_pde: page table page is out of range")); 4711 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4712 ("pmap_promote_pde: page table page's pindex is wrong")); 4713 if (pmap_insert_pt_page(pmap, mpte)) { 4714 atomic_add_long(&pmap_pde_p_failures, 1); 4715 CTR2(KTR_PMAP, 4716 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4717 pmap); 4718 return; 4719 } 4720 4721 /* 4722 * Promote the pv entries. 4723 */ 4724 if ((newpde & PG_MANAGED) != 0) 4725 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4726 4727 /* 4728 * Propagate the PAT index to its proper position. 4729 */ 4730 newpde = pmap_swap_pat(pmap, newpde); 4731 4732 /* 4733 * Map the superpage. 4734 */ 4735 if (workaround_erratum383) 4736 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4737 else 4738 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 4739 4740 atomic_add_long(&pmap_pde_promotions, 1); 4741 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4742 " in pmap %p", va, pmap); 4743 } 4744 #endif /* VM_NRESERVLEVEL > 0 */ 4745 4746 /* 4747 * Insert the given physical page (p) at 4748 * the specified virtual address (v) in the 4749 * target physical map with the protection requested. 4750 * 4751 * If specified, the page will be wired down, meaning 4752 * that the related pte can not be reclaimed. 
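 *
 * For illustration only (a sketch, not taken from any particular caller),
 * a wired, writeable 4KB mapping might be entered as:
 *
 *	rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_ENTER_WIRED, 0);
 *
 * where including VM_PROT_WRITE in "flags" marks the new mapping modified
 * (PG_M) from the start, PMAP_ENTER_WIRED sets PG_W, and psind 0 requests
 * a 4KB rather than a 2MB (psind == 1) mapping.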
4753 * 4754 * NB: This is the only routine which MAY NOT lazy-evaluate 4755 * or lose information. That is, this routine must actually 4756 * insert this page into the given map NOW. 4757 * 4758 * When destroying both a page table and PV entry, this function 4759 * performs the TLB invalidation before releasing the PV list 4760 * lock, so we do not need pmap_delayed_invl_page() calls here. 4761 */ 4762 int 4763 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4764 u_int flags, int8_t psind) 4765 { 4766 struct rwlock *lock; 4767 pd_entry_t *pde; 4768 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4769 pt_entry_t newpte, origpte; 4770 pv_entry_t pv; 4771 vm_paddr_t opa, pa; 4772 vm_page_t mpte, om; 4773 int rv; 4774 boolean_t nosleep; 4775 4776 PG_A = pmap_accessed_bit(pmap); 4777 PG_G = pmap_global_bit(pmap); 4778 PG_M = pmap_modified_bit(pmap); 4779 PG_V = pmap_valid_bit(pmap); 4780 PG_RW = pmap_rw_bit(pmap); 4781 4782 va = trunc_page(va); 4783 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4784 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4785 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4786 va)); 4787 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4788 va >= kmi.clean_eva, 4789 ("pmap_enter: managed mapping within the clean submap")); 4790 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4791 VM_OBJECT_ASSERT_LOCKED(m->object); 4792 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 4793 ("pmap_enter: flags %u has reserved bits set", flags)); 4794 pa = VM_PAGE_TO_PHYS(m); 4795 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4796 if ((flags & VM_PROT_WRITE) != 0) 4797 newpte |= PG_M; 4798 if ((prot & VM_PROT_WRITE) != 0) 4799 newpte |= PG_RW; 4800 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4801 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 4802 if ((prot & VM_PROT_EXECUTE) == 0) 4803 newpte |= pg_nx; 4804 if ((flags & PMAP_ENTER_WIRED) != 0) 4805 newpte |= PG_W; 4806 if (va < VM_MAXUSER_ADDRESS) 4807 newpte |= PG_U; 4808 if (pmap == kernel_pmap) 4809 newpte |= PG_G; 4810 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 4811 4812 /* 4813 * Set modified bit gratuitously for writeable mappings if 4814 * the page is unmanaged. We do not want to take a fault 4815 * to do the dirty bit accounting for these mappings. 4816 */ 4817 if ((m->oflags & VPO_UNMANAGED) != 0) { 4818 if ((newpte & PG_RW) != 0) 4819 newpte |= PG_M; 4820 } else 4821 newpte |= PG_MANAGED; 4822 4823 lock = NULL; 4824 PMAP_LOCK(pmap); 4825 if (psind == 1) { 4826 /* Assert the required virtual and physical alignment. */ 4827 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 4828 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 4829 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 4830 goto out; 4831 } 4832 mpte = NULL; 4833 4834 /* 4835 * In the case that a page table page is not 4836 * resident, we are creating it here. 4837 */ 4838 retry: 4839 pde = pmap_pde(pmap, va); 4840 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4841 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4842 pte = pmap_pde_to_pte(pde, va); 4843 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4844 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4845 mpte->wire_count++; 4846 } 4847 } else if (va < VM_MAXUSER_ADDRESS) { 4848 /* 4849 * Here if the pte page isn't mapped, or if it has been 4850 * deallocated. 
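 *
 * When sleeping is permitted, _pmap_allocpte() may drop the pmap and
 * PV list locks while it waits for a free page, so the page directory
 * must be re-examined afterwards; hence the unconditional retry below
 * whenever allocation was allowed to sleep.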
4851 */ 4852 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4853 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 4854 nosleep ? NULL : &lock); 4855 if (mpte == NULL && nosleep) { 4856 rv = KERN_RESOURCE_SHORTAGE; 4857 goto out; 4858 } 4859 goto retry; 4860 } else 4861 panic("pmap_enter: invalid page directory va=%#lx", va); 4862 4863 origpte = *pte; 4864 pv = NULL; 4865 4866 /* 4867 * Is the specified virtual address already mapped? 4868 */ 4869 if ((origpte & PG_V) != 0) { 4870 /* 4871 * Wiring change, just update stats. We don't worry about 4872 * wiring PT pages as they remain resident as long as there 4873 * are valid mappings in them. Hence, if a user page is wired, 4874 * the PT page will be also. 4875 */ 4876 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4877 pmap->pm_stats.wired_count++; 4878 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4879 pmap->pm_stats.wired_count--; 4880 4881 /* 4882 * Remove the extra PT page reference. 4883 */ 4884 if (mpte != NULL) { 4885 mpte->wire_count--; 4886 KASSERT(mpte->wire_count > 0, 4887 ("pmap_enter: missing reference to page table page," 4888 " va: 0x%lx", va)); 4889 } 4890 4891 /* 4892 * Has the physical page changed? 4893 */ 4894 opa = origpte & PG_FRAME; 4895 if (opa == pa) { 4896 /* 4897 * No, might be a protection or wiring change. 4898 */ 4899 if ((origpte & PG_MANAGED) != 0 && 4900 (newpte & PG_RW) != 0) 4901 vm_page_aflag_set(m, PGA_WRITEABLE); 4902 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4903 goto unchanged; 4904 goto validate; 4905 } 4906 4907 /* 4908 * The physical page has changed. Temporarily invalidate 4909 * the mapping. This ensures that all threads sharing the 4910 * pmap keep a consistent view of the mapping, which is 4911 * necessary for the correct handling of COW faults. It 4912 * also permits reuse of the old mapping's PV entry, 4913 * avoiding an allocation. 4914 * 4915 * For consistency, handle unmanaged mappings the same way. 4916 */ 4917 origpte = pte_load_clear(pte); 4918 KASSERT((origpte & PG_FRAME) == opa, 4919 ("pmap_enter: unexpected pa update for %#lx", va)); 4920 if ((origpte & PG_MANAGED) != 0) { 4921 om = PHYS_TO_VM_PAGE(opa); 4922 4923 /* 4924 * The pmap lock is sufficient to synchronize with 4925 * concurrent calls to pmap_page_test_mappings() and 4926 * pmap_ts_referenced(). 4927 */ 4928 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4929 vm_page_dirty(om); 4930 if ((origpte & PG_A) != 0) 4931 vm_page_aflag_set(om, PGA_REFERENCED); 4932 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4933 pv = pmap_pvh_remove(&om->md, pmap, va); 4934 if ((newpte & PG_MANAGED) == 0) 4935 free_pv_entry(pmap, pv); 4936 if ((om->aflags & PGA_WRITEABLE) != 0 && 4937 TAILQ_EMPTY(&om->md.pv_list) && 4938 ((om->flags & PG_FICTITIOUS) != 0 || 4939 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4940 vm_page_aflag_clear(om, PGA_WRITEABLE); 4941 } 4942 if ((origpte & PG_A) != 0) 4943 pmap_invalidate_page(pmap, va); 4944 origpte = 0; 4945 } else { 4946 /* 4947 * Increment the counters. 4948 */ 4949 if ((newpte & PG_W) != 0) 4950 pmap->pm_stats.wired_count++; 4951 pmap_resident_count_inc(pmap, 1); 4952 } 4953 4954 /* 4955 * Enter on the PV list if part of our managed memory. 
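 *
 * If the old mapping's pv entry survived the removal above, it is reused
 * here; otherwise a new entry is allocated with get_pv_entry().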
4956 */ 4957 if ((newpte & PG_MANAGED) != 0) { 4958 if (pv == NULL) { 4959 pv = get_pv_entry(pmap, &lock); 4960 pv->pv_va = va; 4961 } 4962 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4963 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4964 m->md.pv_gen++; 4965 if ((newpte & PG_RW) != 0) 4966 vm_page_aflag_set(m, PGA_WRITEABLE); 4967 } 4968 4969 /* 4970 * Update the PTE. 4971 */ 4972 if ((origpte & PG_V) != 0) { 4973 validate: 4974 origpte = pte_load_store(pte, newpte); 4975 KASSERT((origpte & PG_FRAME) == pa, 4976 ("pmap_enter: unexpected pa update for %#lx", va)); 4977 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 4978 (PG_M | PG_RW)) { 4979 if ((origpte & PG_MANAGED) != 0) 4980 vm_page_dirty(m); 4981 4982 /* 4983 * Although the PTE may still have PG_RW set, TLB 4984 * invalidation may nonetheless be required because 4985 * the PTE no longer has PG_M set. 4986 */ 4987 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4988 /* 4989 * This PTE change does not require TLB invalidation. 4990 */ 4991 goto unchanged; 4992 } 4993 if ((origpte & PG_A) != 0) 4994 pmap_invalidate_page(pmap, va); 4995 } else 4996 pte_store(pte, newpte); 4997 4998 unchanged: 4999 5000 #if VM_NRESERVLEVEL > 0 5001 /* 5002 * If both the page table page and the reservation are fully 5003 * populated, then attempt promotion. 5004 */ 5005 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 5006 pmap_ps_enabled(pmap) && 5007 (m->flags & PG_FICTITIOUS) == 0 && 5008 vm_reserv_level_iffullpop(m) == 0) 5009 pmap_promote_pde(pmap, pde, va, &lock); 5010 #endif 5011 5012 rv = KERN_SUCCESS; 5013 out: 5014 if (lock != NULL) 5015 rw_wunlock(lock); 5016 PMAP_UNLOCK(pmap); 5017 return (rv); 5018 } 5019 5020 /* 5021 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 5022 * if successful. Returns false if (1) a page table page cannot be allocated 5023 * without sleeping, (2) a mapping already exists at the specified virtual 5024 * address, or (3) a PV entry cannot be allocated without reclaiming another 5025 * PV entry. 5026 */ 5027 static bool 5028 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5029 struct rwlock **lockp) 5030 { 5031 pd_entry_t newpde; 5032 pt_entry_t PG_V; 5033 5034 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5035 PG_V = pmap_valid_bit(pmap); 5036 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 5037 PG_PS | PG_V; 5038 if ((m->oflags & VPO_UNMANAGED) == 0) 5039 newpde |= PG_MANAGED; 5040 if ((prot & VM_PROT_EXECUTE) == 0) 5041 newpde |= pg_nx; 5042 if (va < VM_MAXUSER_ADDRESS) 5043 newpde |= PG_U; 5044 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 5045 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 5046 KERN_SUCCESS); 5047 } 5048 5049 /* 5050 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 5051 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 5052 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 5053 * a mapping already exists at the specified virtual address. Returns 5054 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 5055 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 5056 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 5057 * 5058 * The parameter "m" is only used when creating a managed, writeable mapping. 
5059 */ 5060 static int 5061 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 5062 vm_page_t m, struct rwlock **lockp) 5063 { 5064 struct spglist free; 5065 pd_entry_t oldpde, *pde; 5066 pt_entry_t PG_G, PG_RW, PG_V; 5067 vm_page_t mt, pdpg; 5068 5069 PG_G = pmap_global_bit(pmap); 5070 PG_RW = pmap_rw_bit(pmap); 5071 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 5072 ("pmap_enter_pde: newpde is missing PG_M")); 5073 PG_V = pmap_valid_bit(pmap); 5074 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5075 5076 if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 5077 NULL : lockp)) == NULL) { 5078 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5079 " in pmap %p", va, pmap); 5080 return (KERN_RESOURCE_SHORTAGE); 5081 } 5082 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 5083 pde = &pde[pmap_pde_index(va)]; 5084 oldpde = *pde; 5085 if ((oldpde & PG_V) != 0) { 5086 KASSERT(pdpg->wire_count > 1, 5087 ("pmap_enter_pde: pdpg's wire count is too low")); 5088 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5089 pdpg->wire_count--; 5090 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5091 " in pmap %p", va, pmap); 5092 return (KERN_FAILURE); 5093 } 5094 /* Break the existing mapping(s). */ 5095 SLIST_INIT(&free); 5096 if ((oldpde & PG_PS) != 0) { 5097 /* 5098 * The reference to the PD page that was acquired by 5099 * pmap_allocpde() ensures that it won't be freed. 5100 * However, if the PDE resulted from a promotion, then 5101 * a reserved PT page could be freed. 5102 */ 5103 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 5104 if ((oldpde & PG_G) == 0) 5105 pmap_invalidate_pde_page(pmap, va, oldpde); 5106 } else { 5107 pmap_delayed_invl_started(); 5108 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 5109 lockp)) 5110 pmap_invalidate_all(pmap); 5111 pmap_delayed_invl_finished(); 5112 } 5113 vm_page_free_pages_toq(&free, true); 5114 if (va >= VM_MAXUSER_ADDRESS) { 5115 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 5116 if (pmap_insert_pt_page(pmap, mt)) { 5117 /* 5118 * XXX Currently, this can't happen because 5119 * we do not perform pmap_enter(psind == 1) 5120 * on the kernel pmap. 5121 */ 5122 panic("pmap_enter_pde: trie insert failed"); 5123 } 5124 } else 5125 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 5126 pde)); 5127 } 5128 if ((newpde & PG_MANAGED) != 0) { 5129 /* 5130 * Abort this mapping if its PV entry could not be created. 5131 */ 5132 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 5133 SLIST_INIT(&free); 5134 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 5135 /* 5136 * Although "va" is not mapped, paging- 5137 * structure caches could nonetheless have 5138 * entries that refer to the freed page table 5139 * pages. Invalidate those entries. 5140 */ 5141 pmap_invalidate_page(pmap, va); 5142 vm_page_free_pages_toq(&free, true); 5143 } 5144 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 5145 " in pmap %p", va, pmap); 5146 return (KERN_RESOURCE_SHORTAGE); 5147 } 5148 if ((newpde & PG_RW) != 0) { 5149 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5150 vm_page_aflag_set(mt, PGA_WRITEABLE); 5151 } 5152 } 5153 5154 /* 5155 * Increment counters. 5156 */ 5157 if ((newpde & PG_W) != 0) 5158 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 5159 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 5160 5161 /* 5162 * Map the superpage. (This is not a promoted mapping; there will not 5163 * be any lingering 4KB page mappings in the TLB.) 
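 * Consequently, unlike pmap_promote_pde(), PG_PROMOTED is not set here;
 * that flag records that 4KB TLB entries from before a promotion may
 * still exist and need to be flushed.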
5164 */ 5165 pde_store(pde, newpde); 5166 5167 atomic_add_long(&pmap_pde_mappings, 1); 5168 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 5169 " in pmap %p", va, pmap); 5170 return (KERN_SUCCESS); 5171 } 5172 5173 /* 5174 * Maps a sequence of resident pages belonging to the same object. 5175 * The sequence begins with the given page m_start. This page is 5176 * mapped at the given virtual address start. Each subsequent page is 5177 * mapped at a virtual address that is offset from start by the same 5178 * amount as the page is offset from m_start within the object. The 5179 * last page in the sequence is the page with the largest offset from 5180 * m_start that can be mapped at a virtual address less than the given 5181 * virtual address end. Not every virtual page between start and end 5182 * is mapped; only those for which a resident page exists with the 5183 * corresponding offset from m_start are mapped. 5184 */ 5185 void 5186 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 5187 vm_page_t m_start, vm_prot_t prot) 5188 { 5189 struct rwlock *lock; 5190 vm_offset_t va; 5191 vm_page_t m, mpte; 5192 vm_pindex_t diff, psize; 5193 5194 VM_OBJECT_ASSERT_LOCKED(m_start->object); 5195 5196 psize = atop(end - start); 5197 mpte = NULL; 5198 m = m_start; 5199 lock = NULL; 5200 PMAP_LOCK(pmap); 5201 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 5202 va = start + ptoa(diff); 5203 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 5204 m->psind == 1 && pmap_ps_enabled(pmap) && 5205 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 5206 m = &m[NBPDR / PAGE_SIZE - 1]; 5207 else 5208 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 5209 mpte, &lock); 5210 m = TAILQ_NEXT(m, listq); 5211 } 5212 if (lock != NULL) 5213 rw_wunlock(lock); 5214 PMAP_UNLOCK(pmap); 5215 } 5216 5217 /* 5218 * this code makes some *MAJOR* assumptions: 5219 * 1. Current pmap & pmap exists. 5220 * 2. Not wired. 5221 * 3. Read access. 5222 * 4. No page table pages. 5223 * but is *MUCH* faster than pmap_enter... 5224 */ 5225 5226 void 5227 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 5228 { 5229 struct rwlock *lock; 5230 5231 lock = NULL; 5232 PMAP_LOCK(pmap); 5233 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 5234 if (lock != NULL) 5235 rw_wunlock(lock); 5236 PMAP_UNLOCK(pmap); 5237 } 5238 5239 static vm_page_t 5240 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 5241 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 5242 { 5243 struct spglist free; 5244 pt_entry_t *pte, PG_V; 5245 vm_paddr_t pa; 5246 5247 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 5248 (m->oflags & VPO_UNMANAGED) != 0, 5249 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 5250 PG_V = pmap_valid_bit(pmap); 5251 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5252 5253 /* 5254 * In the case that a page table page is not 5255 * resident, we are creating it here. 5256 */ 5257 if (va < VM_MAXUSER_ADDRESS) { 5258 vm_pindex_t ptepindex; 5259 pd_entry_t *ptepa; 5260 5261 /* 5262 * Calculate pagetable page index 5263 */ 5264 ptepindex = pmap_pde_pindex(va); 5265 if (mpte && (mpte->pindex == ptepindex)) { 5266 mpte->wire_count++; 5267 } else { 5268 /* 5269 * Get the page directory entry 5270 */ 5271 ptepa = pmap_pde(pmap, va); 5272 5273 /* 5274 * If the page table page is mapped, we just increment 5275 * the hold count, and activate it. Otherwise, we 5276 * attempt to allocate a page table page. If this 5277 * attempt fails, we don't retry. 
Instead, we give up. 5278 */ 5279 if (ptepa && (*ptepa & PG_V) != 0) { 5280 if (*ptepa & PG_PS) 5281 return (NULL); 5282 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 5283 mpte->wire_count++; 5284 } else { 5285 /* 5286 * Pass NULL instead of the PV list lock 5287 * pointer, because we don't intend to sleep. 5288 */ 5289 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 5290 if (mpte == NULL) 5291 return (mpte); 5292 } 5293 } 5294 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 5295 pte = &pte[pmap_pte_index(va)]; 5296 } else { 5297 mpte = NULL; 5298 pte = vtopte(va); 5299 } 5300 if (*pte) { 5301 if (mpte != NULL) { 5302 mpte->wire_count--; 5303 mpte = NULL; 5304 } 5305 return (mpte); 5306 } 5307 5308 /* 5309 * Enter on the PV list if part of our managed memory. 5310 */ 5311 if ((m->oflags & VPO_UNMANAGED) == 0 && 5312 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 5313 if (mpte != NULL) { 5314 SLIST_INIT(&free); 5315 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 5316 /* 5317 * Although "va" is not mapped, paging- 5318 * structure caches could nonetheless have 5319 * entries that refer to the freed page table 5320 * pages. Invalidate those entries. 5321 */ 5322 pmap_invalidate_page(pmap, va); 5323 vm_page_free_pages_toq(&free, true); 5324 } 5325 mpte = NULL; 5326 } 5327 return (mpte); 5328 } 5329 5330 /* 5331 * Increment counters 5332 */ 5333 pmap_resident_count_inc(pmap, 1); 5334 5335 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); 5336 if ((prot & VM_PROT_EXECUTE) == 0) 5337 pa |= pg_nx; 5338 5339 /* 5340 * Now validate mapping with RO protection 5341 */ 5342 if ((m->oflags & VPO_UNMANAGED) != 0) 5343 pte_store(pte, pa | PG_V | PG_U); 5344 else 5345 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 5346 return (mpte); 5347 } 5348 5349 /* 5350 * Make a temporary mapping for a physical address. This is only intended 5351 * to be used for panic dumps. 5352 */ 5353 void * 5354 pmap_kenter_temporary(vm_paddr_t pa, int i) 5355 { 5356 vm_offset_t va; 5357 5358 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 5359 pmap_kenter(va, pa); 5360 invlpg(va); 5361 return ((void *)crashdumpmap); 5362 } 5363 5364 /* 5365 * This code maps large physical mmap regions into the 5366 * processor address space. Note that some shortcuts 5367 * are taken, but the code works. 5368 */ 5369 void 5370 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 5371 vm_pindex_t pindex, vm_size_t size) 5372 { 5373 pd_entry_t *pde; 5374 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5375 vm_paddr_t pa, ptepa; 5376 vm_page_t p, pdpg; 5377 int pat_mode; 5378 5379 PG_A = pmap_accessed_bit(pmap); 5380 PG_M = pmap_modified_bit(pmap); 5381 PG_V = pmap_valid_bit(pmap); 5382 PG_RW = pmap_rw_bit(pmap); 5383 5384 VM_OBJECT_ASSERT_WLOCKED(object); 5385 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 5386 ("pmap_object_init_pt: non-device object")); 5387 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 5388 if (!pmap_ps_enabled(pmap)) 5389 return; 5390 if (!vm_object_populate(object, pindex, pindex + atop(size))) 5391 return; 5392 p = vm_page_lookup(object, pindex); 5393 KASSERT(p->valid == VM_PAGE_BITS_ALL, 5394 ("pmap_object_init_pt: invalid page %p", p)); 5395 pat_mode = p->md.pat_mode; 5396 5397 /* 5398 * Abort the mapping if the first page is not physically 5399 * aligned to a 2MB page boundary. 5400 */ 5401 ptepa = VM_PAGE_TO_PHYS(p); 5402 if (ptepa & (NBPDR - 1)) 5403 return; 5404 5405 /* 5406 * Skip the first page. 
Abort the mapping if the rest of 5407 * the pages are not physically contiguous or have differing 5408 * memory attributes. 5409 */ 5410 p = TAILQ_NEXT(p, listq); 5411 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 5412 pa += PAGE_SIZE) { 5413 KASSERT(p->valid == VM_PAGE_BITS_ALL, 5414 ("pmap_object_init_pt: invalid page %p", p)); 5415 if (pa != VM_PAGE_TO_PHYS(p) || 5416 pat_mode != p->md.pat_mode) 5417 return; 5418 p = TAILQ_NEXT(p, listq); 5419 } 5420 5421 /* 5422 * Map using 2MB pages. Since "ptepa" is 2M aligned and 5423 * "size" is a multiple of 2M, adding the PAT setting to "pa" 5424 * will not affect the termination of this loop. 5425 */ 5426 PMAP_LOCK(pmap); 5427 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 5428 pa < ptepa + size; pa += NBPDR) { 5429 pdpg = pmap_allocpde(pmap, addr, NULL); 5430 if (pdpg == NULL) { 5431 /* 5432 * The creation of mappings below is only an 5433 * optimization. If a page directory page 5434 * cannot be allocated without blocking, 5435 * continue on to the next mapping rather than 5436 * blocking. 5437 */ 5438 addr += NBPDR; 5439 continue; 5440 } 5441 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 5442 pde = &pde[pmap_pde_index(addr)]; 5443 if ((*pde & PG_V) == 0) { 5444 pde_store(pde, pa | PG_PS | PG_M | PG_A | 5445 PG_U | PG_RW | PG_V); 5446 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 5447 atomic_add_long(&pmap_pde_mappings, 1); 5448 } else { 5449 /* Continue on if the PDE is already valid. */ 5450 pdpg->wire_count--; 5451 KASSERT(pdpg->wire_count > 0, 5452 ("pmap_object_init_pt: missing reference " 5453 "to page directory page, va: 0x%lx", addr)); 5454 } 5455 addr += NBPDR; 5456 } 5457 PMAP_UNLOCK(pmap); 5458 } 5459 } 5460 5461 /* 5462 * Clear the wired attribute from the mappings for the specified range of 5463 * addresses in the given pmap. Every valid mapping within that range 5464 * must have the wired attribute set. In contrast, invalid mappings 5465 * cannot have the wired attribute set, so they are ignored. 5466 * 5467 * The wired attribute of the page table entry is not a hardware 5468 * feature, so there is no need to invalidate any TLB entries. 5469 * Since pmap_demote_pde() for the wired entry must never fail, 5470 * pmap_delayed_invl_started()/finished() calls around the 5471 * function are not needed. 5472 */ 5473 void 5474 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5475 { 5476 vm_offset_t va_next; 5477 pml4_entry_t *pml4e; 5478 pdp_entry_t *pdpe; 5479 pd_entry_t *pde; 5480 pt_entry_t *pte, PG_V; 5481 5482 PG_V = pmap_valid_bit(pmap); 5483 PMAP_LOCK(pmap); 5484 for (; sva < eva; sva = va_next) { 5485 pml4e = pmap_pml4e(pmap, sva); 5486 if ((*pml4e & PG_V) == 0) { 5487 va_next = (sva + NBPML4) & ~PML4MASK; 5488 if (va_next < sva) 5489 va_next = eva; 5490 continue; 5491 } 5492 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 5493 if ((*pdpe & PG_V) == 0) { 5494 va_next = (sva + NBPDP) & ~PDPMASK; 5495 if (va_next < sva) 5496 va_next = eva; 5497 continue; 5498 } 5499 va_next = (sva + NBPDR) & ~PDRMASK; 5500 if (va_next < sva) 5501 va_next = eva; 5502 pde = pmap_pdpe_to_pde(pdpe, sva); 5503 if ((*pde & PG_V) == 0) 5504 continue; 5505 if ((*pde & PG_PS) != 0) { 5506 if ((*pde & PG_W) == 0) 5507 panic("pmap_unwire: pde %#jx is missing PG_W", 5508 (uintmax_t)*pde); 5509 5510 /* 5511 * Are we unwiring the entire large page? If not, 5512 * demote the mapping and fall through. 
5513 */ 5514 if (sva + NBPDR == va_next && eva >= va_next) { 5515 atomic_clear_long(pde, PG_W); 5516 pmap->pm_stats.wired_count -= NBPDR / 5517 PAGE_SIZE; 5518 continue; 5519 } else if (!pmap_demote_pde(pmap, pde, sva)) 5520 panic("pmap_unwire: demotion failed"); 5521 } 5522 if (va_next > eva) 5523 va_next = eva; 5524 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 5525 sva += PAGE_SIZE) { 5526 if ((*pte & PG_V) == 0) 5527 continue; 5528 if ((*pte & PG_W) == 0) 5529 panic("pmap_unwire: pte %#jx is missing PG_W", 5530 (uintmax_t)*pte); 5531 5532 /* 5533 * PG_W must be cleared atomically. Although the pmap 5534 * lock synchronizes access to PG_W, another processor 5535 * could be setting PG_M and/or PG_A concurrently. 5536 */ 5537 atomic_clear_long(pte, PG_W); 5538 pmap->pm_stats.wired_count--; 5539 } 5540 } 5541 PMAP_UNLOCK(pmap); 5542 } 5543 5544 /* 5545 * Copy the range specified by src_addr/len 5546 * from the source map to the range dst_addr/len 5547 * in the destination map. 5548 * 5549 * This routine is only advisory and need not do anything. 5550 */ 5551 5552 void 5553 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5554 vm_offset_t src_addr) 5555 { 5556 struct rwlock *lock; 5557 struct spglist free; 5558 vm_offset_t addr; 5559 vm_offset_t end_addr = src_addr + len; 5560 vm_offset_t va_next; 5561 vm_page_t dst_pdpg, dstmpte, srcmpte; 5562 pt_entry_t PG_A, PG_M, PG_V; 5563 5564 if (dst_addr != src_addr) 5565 return; 5566 5567 if (dst_pmap->pm_type != src_pmap->pm_type) 5568 return; 5569 5570 /* 5571 * EPT page table entries that require emulation of A/D bits are 5572 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 5573 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 5574 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 5575 * implementations flag an EPT misconfiguration for exec-only 5576 * mappings we skip this function entirely for emulated pmaps. 
5577 */ 5578 if (pmap_emulate_ad_bits(dst_pmap)) 5579 return; 5580 5581 lock = NULL; 5582 if (dst_pmap < src_pmap) { 5583 PMAP_LOCK(dst_pmap); 5584 PMAP_LOCK(src_pmap); 5585 } else { 5586 PMAP_LOCK(src_pmap); 5587 PMAP_LOCK(dst_pmap); 5588 } 5589 5590 PG_A = pmap_accessed_bit(dst_pmap); 5591 PG_M = pmap_modified_bit(dst_pmap); 5592 PG_V = pmap_valid_bit(dst_pmap); 5593 5594 for (addr = src_addr; addr < end_addr; addr = va_next) { 5595 pt_entry_t *src_pte, *dst_pte; 5596 pml4_entry_t *pml4e; 5597 pdp_entry_t *pdpe; 5598 pd_entry_t srcptepaddr, *pde; 5599 5600 KASSERT(addr < UPT_MIN_ADDRESS, 5601 ("pmap_copy: invalid to pmap_copy page tables")); 5602 5603 pml4e = pmap_pml4e(src_pmap, addr); 5604 if ((*pml4e & PG_V) == 0) { 5605 va_next = (addr + NBPML4) & ~PML4MASK; 5606 if (va_next < addr) 5607 va_next = end_addr; 5608 continue; 5609 } 5610 5611 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 5612 if ((*pdpe & PG_V) == 0) { 5613 va_next = (addr + NBPDP) & ~PDPMASK; 5614 if (va_next < addr) 5615 va_next = end_addr; 5616 continue; 5617 } 5618 5619 va_next = (addr + NBPDR) & ~PDRMASK; 5620 if (va_next < addr) 5621 va_next = end_addr; 5622 5623 pde = pmap_pdpe_to_pde(pdpe, addr); 5624 srcptepaddr = *pde; 5625 if (srcptepaddr == 0) 5626 continue; 5627 5628 if (srcptepaddr & PG_PS) { 5629 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 5630 continue; 5631 dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); 5632 if (dst_pdpg == NULL) 5633 break; 5634 pde = (pd_entry_t *) 5635 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 5636 pde = &pde[pmap_pde_index(addr)]; 5637 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 5638 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 5639 PMAP_ENTER_NORECLAIM, &lock))) { 5640 *pde = srcptepaddr & ~PG_W; 5641 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 5642 atomic_add_long(&pmap_pde_mappings, 1); 5643 } else 5644 dst_pdpg->wire_count--; 5645 continue; 5646 } 5647 5648 srcptepaddr &= PG_FRAME; 5649 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 5650 KASSERT(srcmpte->wire_count > 0, 5651 ("pmap_copy: source page table page is unused")); 5652 5653 if (va_next > end_addr) 5654 va_next = end_addr; 5655 5656 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 5657 src_pte = &src_pte[pmap_pte_index(addr)]; 5658 dstmpte = NULL; 5659 while (addr < va_next) { 5660 pt_entry_t ptetemp; 5661 ptetemp = *src_pte; 5662 /* 5663 * we only virtual copy managed pages 5664 */ 5665 if ((ptetemp & PG_MANAGED) != 0) { 5666 if (dstmpte != NULL && 5667 dstmpte->pindex == pmap_pde_pindex(addr)) 5668 dstmpte->wire_count++; 5669 else if ((dstmpte = pmap_allocpte(dst_pmap, 5670 addr, NULL)) == NULL) 5671 goto out; 5672 dst_pte = (pt_entry_t *) 5673 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5674 dst_pte = &dst_pte[pmap_pte_index(addr)]; 5675 if (*dst_pte == 0 && 5676 pmap_try_insert_pv_entry(dst_pmap, addr, 5677 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 5678 &lock)) { 5679 /* 5680 * Clear the wired, modified, and 5681 * accessed (referenced) bits 5682 * during the copy. 5683 */ 5684 *dst_pte = ptetemp & ~(PG_W | PG_M | 5685 PG_A); 5686 pmap_resident_count_inc(dst_pmap, 1); 5687 } else { 5688 SLIST_INIT(&free); 5689 if (pmap_unwire_ptp(dst_pmap, addr, 5690 dstmpte, &free)) { 5691 /* 5692 * Although "addr" is not 5693 * mapped, paging-structure 5694 * caches could nonetheless 5695 * have entries that refer to 5696 * the freed page table pages. 5697 * Invalidate those entries. 
5698 */
5699 pmap_invalidate_page(dst_pmap,
5700 addr);
5701 vm_page_free_pages_toq(&free,
5702 true);
5703 }
5704 goto out;
5705 }
5706 if (dstmpte->wire_count >= srcmpte->wire_count)
5707 break;
5708 }
5709 addr += PAGE_SIZE;
5710 src_pte++;
5711 }
5712 }
5713 out:
5714 if (lock != NULL)
5715 rw_wunlock(lock);
5716 PMAP_UNLOCK(src_pmap);
5717 PMAP_UNLOCK(dst_pmap);
5718 }
5719
5720 /*
5721 * Zero the specified hardware page.
5722 */
5723 void
5724 pmap_zero_page(vm_page_t m)
5725 {
5726 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5727
5728 pagezero((void *)va);
5729 }
5730
5731 /*
5732 * Zero an area within a single hardware page. off and size must not
5733 * cover an area beyond a single hardware page.
5734 */
5735 void
5736 pmap_zero_page_area(vm_page_t m, int off, int size)
5737 {
5738 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5739
5740 if (off == 0 && size == PAGE_SIZE)
5741 pagezero((void *)va);
5742 else
5743 bzero((char *)va + off, size);
5744 }
5745
5746 /*
5747 * Copy 1 specified hardware page to another.
5748 */
5749 void
5750 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5751 {
5752 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5753 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5754
5755 pagecopy((void *)src, (void *)dst);
5756 }
5757
5758 int unmapped_buf_allowed = 1;
5759
5760 void
5761 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5762 vm_offset_t b_offset, int xfersize)
5763 {
5764 void *a_cp, *b_cp;
5765 vm_page_t pages[2];
5766 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
5767 int cnt;
5768 boolean_t mapped;
5769
5770 while (xfersize > 0) {
5771 a_pg_offset = a_offset & PAGE_MASK;
5772 pages[0] = ma[a_offset >> PAGE_SHIFT];
5773 b_pg_offset = b_offset & PAGE_MASK;
5774 pages[1] = mb[b_offset >> PAGE_SHIFT];
5775 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5776 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5777 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
5778 a_cp = (char *)vaddr[0] + a_pg_offset;
5779 b_cp = (char *)vaddr[1] + b_pg_offset;
5780 bcopy(a_cp, b_cp, cnt);
5781 if (__predict_false(mapped))
5782 pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
5783 a_offset += cnt;
5784 b_offset += cnt;
5785 xfersize -= cnt;
5786 }
5787 }
5788
5789 /*
5790 * Returns true if the pmap's pv is one of the first
5791 * 16 pvs linked to from this page. This count may
5792 * be changed upwards or downwards in the future; it
5793 * is only necessary that true be returned for a small
5794 * subset of pmaps for proper page aging.
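*
* (Capping the scan at 16 entries keeps this check cheap for
* heavily shared pages; page aging only needs a hint that the
* pmap is among the page's mappers, not an exhaustive census.)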
5795 */ 5796 boolean_t 5797 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5798 { 5799 struct md_page *pvh; 5800 struct rwlock *lock; 5801 pv_entry_t pv; 5802 int loops = 0; 5803 boolean_t rv; 5804 5805 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5806 ("pmap_page_exists_quick: page %p is not managed", m)); 5807 rv = FALSE; 5808 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5809 rw_rlock(lock); 5810 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5811 if (PV_PMAP(pv) == pmap) { 5812 rv = TRUE; 5813 break; 5814 } 5815 loops++; 5816 if (loops >= 16) 5817 break; 5818 } 5819 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5820 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5821 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5822 if (PV_PMAP(pv) == pmap) { 5823 rv = TRUE; 5824 break; 5825 } 5826 loops++; 5827 if (loops >= 16) 5828 break; 5829 } 5830 } 5831 rw_runlock(lock); 5832 return (rv); 5833 } 5834 5835 /* 5836 * pmap_page_wired_mappings: 5837 * 5838 * Return the number of managed mappings to the given physical page 5839 * that are wired. 5840 */ 5841 int 5842 pmap_page_wired_mappings(vm_page_t m) 5843 { 5844 struct rwlock *lock; 5845 struct md_page *pvh; 5846 pmap_t pmap; 5847 pt_entry_t *pte; 5848 pv_entry_t pv; 5849 int count, md_gen, pvh_gen; 5850 5851 if ((m->oflags & VPO_UNMANAGED) != 0) 5852 return (0); 5853 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5854 rw_rlock(lock); 5855 restart: 5856 count = 0; 5857 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5858 pmap = PV_PMAP(pv); 5859 if (!PMAP_TRYLOCK(pmap)) { 5860 md_gen = m->md.pv_gen; 5861 rw_runlock(lock); 5862 PMAP_LOCK(pmap); 5863 rw_rlock(lock); 5864 if (md_gen != m->md.pv_gen) { 5865 PMAP_UNLOCK(pmap); 5866 goto restart; 5867 } 5868 } 5869 pte = pmap_pte(pmap, pv->pv_va); 5870 if ((*pte & PG_W) != 0) 5871 count++; 5872 PMAP_UNLOCK(pmap); 5873 } 5874 if ((m->flags & PG_FICTITIOUS) == 0) { 5875 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5876 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5877 pmap = PV_PMAP(pv); 5878 if (!PMAP_TRYLOCK(pmap)) { 5879 md_gen = m->md.pv_gen; 5880 pvh_gen = pvh->pv_gen; 5881 rw_runlock(lock); 5882 PMAP_LOCK(pmap); 5883 rw_rlock(lock); 5884 if (md_gen != m->md.pv_gen || 5885 pvh_gen != pvh->pv_gen) { 5886 PMAP_UNLOCK(pmap); 5887 goto restart; 5888 } 5889 } 5890 pte = pmap_pde(pmap, pv->pv_va); 5891 if ((*pte & PG_W) != 0) 5892 count++; 5893 PMAP_UNLOCK(pmap); 5894 } 5895 } 5896 rw_runlock(lock); 5897 return (count); 5898 } 5899 5900 /* 5901 * Returns TRUE if the given page is mapped individually or as part of 5902 * a 2mpage. Otherwise, returns FALSE. 5903 */ 5904 boolean_t 5905 pmap_page_is_mapped(vm_page_t m) 5906 { 5907 struct rwlock *lock; 5908 boolean_t rv; 5909 5910 if ((m->oflags & VPO_UNMANAGED) != 0) 5911 return (FALSE); 5912 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5913 rw_rlock(lock); 5914 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5915 ((m->flags & PG_FICTITIOUS) == 0 && 5916 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5917 rw_runlock(lock); 5918 return (rv); 5919 } 5920 5921 /* 5922 * Destroy all managed, non-wired mappings in the given user-space 5923 * pmap. This pmap cannot be active on any processor besides the 5924 * caller. 5925 * 5926 * This function cannot be applied to the kernel pmap. Moreover, it 5927 * is not intended for general use. It is only to be used during 5928 * process termination. Consequently, it can be implemented in ways 5929 * that make it faster than pmap_remove(). 
First, it can more quickly 5930 * destroy mappings by iterating over the pmap's collection of PV 5931 * entries, rather than searching the page table. Second, it doesn't 5932 * have to test and clear the page table entries atomically, because 5933 * no processor is currently accessing the user address space. In 5934 * particular, a page table entry's dirty bit won't change state once 5935 * this function starts. 5936 * 5937 * Although this function destroys all of the pmap's managed, 5938 * non-wired mappings, it can delay and batch the invalidation of TLB 5939 * entries without calling pmap_delayed_invl_started() and 5940 * pmap_delayed_invl_finished(). Because the pmap is not active on 5941 * any other processor, none of these TLB entries will ever be used 5942 * before their eventual invalidation. Consequently, there is no need 5943 * for either pmap_remove_all() or pmap_remove_write() to wait for 5944 * that eventual TLB invalidation. 5945 */ 5946 void 5947 pmap_remove_pages(pmap_t pmap) 5948 { 5949 pd_entry_t ptepde; 5950 pt_entry_t *pte, tpte; 5951 pt_entry_t PG_M, PG_RW, PG_V; 5952 struct spglist free; 5953 vm_page_t m, mpte, mt; 5954 pv_entry_t pv; 5955 struct md_page *pvh; 5956 struct pv_chunk *pc, *npc; 5957 struct rwlock *lock; 5958 int64_t bit; 5959 uint64_t inuse, bitmask; 5960 int allfree, field, freed, idx; 5961 boolean_t superpage; 5962 vm_paddr_t pa; 5963 5964 /* 5965 * Assert that the given pmap is only active on the current 5966 * CPU. Unfortunately, we cannot block another CPU from 5967 * activating the pmap while this function is executing. 5968 */ 5969 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 5970 #ifdef INVARIANTS 5971 { 5972 cpuset_t other_cpus; 5973 5974 other_cpus = all_cpus; 5975 critical_enter(); 5976 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 5977 CPU_AND(&other_cpus, &pmap->pm_active); 5978 critical_exit(); 5979 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 5980 } 5981 #endif 5982 5983 lock = NULL; 5984 PG_M = pmap_modified_bit(pmap); 5985 PG_V = pmap_valid_bit(pmap); 5986 PG_RW = pmap_rw_bit(pmap); 5987 5988 SLIST_INIT(&free); 5989 PMAP_LOCK(pmap); 5990 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5991 allfree = 1; 5992 freed = 0; 5993 for (field = 0; field < _NPCM; field++) { 5994 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5995 while (inuse != 0) { 5996 bit = bsfq(inuse); 5997 bitmask = 1UL << bit; 5998 idx = field * 64 + bit; 5999 pv = &pc->pc_pventry[idx]; 6000 inuse &= ~bitmask; 6001 6002 pte = pmap_pdpe(pmap, pv->pv_va); 6003 ptepde = *pte; 6004 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 6005 tpte = *pte; 6006 if ((tpte & (PG_PS | PG_V)) == PG_V) { 6007 superpage = FALSE; 6008 ptepde = tpte; 6009 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 6010 PG_FRAME); 6011 pte = &pte[pmap_pte_index(pv->pv_va)]; 6012 tpte = *pte; 6013 } else { 6014 /* 6015 * Keep track whether 'tpte' is a 6016 * superpage explicitly instead of 6017 * relying on PG_PS being set. 6018 * 6019 * This is because PG_PS is numerically 6020 * identical to PG_PTE_PAT and thus a 6021 * regular page could be mistaken for 6022 * a superpage. 
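*
* (Both names denote bit 7 of the entry: X86_PG_PS and
* X86_PG_PTE_PAT are the same 0x080 value, so only the level
* at which the entry was found can disambiguate them.)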
6023 */ 6024 superpage = TRUE; 6025 } 6026 6027 if ((tpte & PG_V) == 0) { 6028 panic("bad pte va %lx pte %lx", 6029 pv->pv_va, tpte); 6030 } 6031 6032 /* 6033 * We cannot remove wired pages from a process' mapping at this time 6034 */ 6035 if (tpte & PG_W) { 6036 allfree = 0; 6037 continue; 6038 } 6039 6040 if (superpage) 6041 pa = tpte & PG_PS_FRAME; 6042 else 6043 pa = tpte & PG_FRAME; 6044 6045 m = PHYS_TO_VM_PAGE(pa); 6046 KASSERT(m->phys_addr == pa, 6047 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 6048 m, (uintmax_t)m->phys_addr, 6049 (uintmax_t)tpte)); 6050 6051 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 6052 m < &vm_page_array[vm_page_array_size], 6053 ("pmap_remove_pages: bad tpte %#jx", 6054 (uintmax_t)tpte)); 6055 6056 pte_clear(pte); 6057 6058 /* 6059 * Update the vm_page_t clean/reference bits. 6060 */ 6061 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6062 if (superpage) { 6063 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6064 vm_page_dirty(mt); 6065 } else 6066 vm_page_dirty(m); 6067 } 6068 6069 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 6070 6071 /* Mark free */ 6072 pc->pc_map[field] |= bitmask; 6073 if (superpage) { 6074 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 6075 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 6076 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6077 pvh->pv_gen++; 6078 if (TAILQ_EMPTY(&pvh->pv_list)) { 6079 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6080 if ((mt->aflags & PGA_WRITEABLE) != 0 && 6081 TAILQ_EMPTY(&mt->md.pv_list)) 6082 vm_page_aflag_clear(mt, PGA_WRITEABLE); 6083 } 6084 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 6085 if (mpte != NULL) { 6086 pmap_resident_count_dec(pmap, 1); 6087 KASSERT(mpte->wire_count == NPTEPG, 6088 ("pmap_remove_pages: pte page wire count error")); 6089 mpte->wire_count = 0; 6090 pmap_add_delayed_free_list(mpte, &free, FALSE); 6091 } 6092 } else { 6093 pmap_resident_count_dec(pmap, 1); 6094 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6095 m->md.pv_gen++; 6096 if ((m->aflags & PGA_WRITEABLE) != 0 && 6097 TAILQ_EMPTY(&m->md.pv_list) && 6098 (m->flags & PG_FICTITIOUS) == 0) { 6099 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6100 if (TAILQ_EMPTY(&pvh->pv_list)) 6101 vm_page_aflag_clear(m, PGA_WRITEABLE); 6102 } 6103 } 6104 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 6105 freed++; 6106 } 6107 } 6108 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 6109 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 6110 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 6111 if (allfree) { 6112 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 6113 free_pv_chunk(pc); 6114 } 6115 } 6116 if (lock != NULL) 6117 rw_wunlock(lock); 6118 pmap_invalidate_all(pmap); 6119 PMAP_UNLOCK(pmap); 6120 vm_page_free_pages_toq(&free, true); 6121 } 6122 6123 static boolean_t 6124 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 6125 { 6126 struct rwlock *lock; 6127 pv_entry_t pv; 6128 struct md_page *pvh; 6129 pt_entry_t *pte, mask; 6130 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6131 pmap_t pmap; 6132 int md_gen, pvh_gen; 6133 boolean_t rv; 6134 6135 rv = FALSE; 6136 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6137 rw_rlock(lock); 6138 restart: 6139 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6140 pmap = PV_PMAP(pv); 6141 if (!PMAP_TRYLOCK(pmap)) { 6142 md_gen = m->md.pv_gen; 6143 rw_runlock(lock); 6144 PMAP_LOCK(pmap); 6145 rw_rlock(lock); 6146 if (md_gen != m->md.pv_gen) { 6147 PMAP_UNLOCK(pmap); 6148 goto restart; 6149 } 6150 } 6151 pte = pmap_pte(pmap, pv->pv_va); 6152 mask = 0; 6153 if (modified) { 6154 PG_M = 
pmap_modified_bit(pmap); 6155 PG_RW = pmap_rw_bit(pmap); 6156 mask |= PG_RW | PG_M; 6157 } 6158 if (accessed) { 6159 PG_A = pmap_accessed_bit(pmap); 6160 PG_V = pmap_valid_bit(pmap); 6161 mask |= PG_V | PG_A; 6162 } 6163 rv = (*pte & mask) == mask; 6164 PMAP_UNLOCK(pmap); 6165 if (rv) 6166 goto out; 6167 } 6168 if ((m->flags & PG_FICTITIOUS) == 0) { 6169 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6170 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6171 pmap = PV_PMAP(pv); 6172 if (!PMAP_TRYLOCK(pmap)) { 6173 md_gen = m->md.pv_gen; 6174 pvh_gen = pvh->pv_gen; 6175 rw_runlock(lock); 6176 PMAP_LOCK(pmap); 6177 rw_rlock(lock); 6178 if (md_gen != m->md.pv_gen || 6179 pvh_gen != pvh->pv_gen) { 6180 PMAP_UNLOCK(pmap); 6181 goto restart; 6182 } 6183 } 6184 pte = pmap_pde(pmap, pv->pv_va); 6185 mask = 0; 6186 if (modified) { 6187 PG_M = pmap_modified_bit(pmap); 6188 PG_RW = pmap_rw_bit(pmap); 6189 mask |= PG_RW | PG_M; 6190 } 6191 if (accessed) { 6192 PG_A = pmap_accessed_bit(pmap); 6193 PG_V = pmap_valid_bit(pmap); 6194 mask |= PG_V | PG_A; 6195 } 6196 rv = (*pte & mask) == mask; 6197 PMAP_UNLOCK(pmap); 6198 if (rv) 6199 goto out; 6200 } 6201 } 6202 out: 6203 rw_runlock(lock); 6204 return (rv); 6205 } 6206 6207 /* 6208 * pmap_is_modified: 6209 * 6210 * Return whether or not the specified physical page was modified 6211 * in any physical maps. 6212 */ 6213 boolean_t 6214 pmap_is_modified(vm_page_t m) 6215 { 6216 6217 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6218 ("pmap_is_modified: page %p is not managed", m)); 6219 6220 /* 6221 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 6222 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 6223 * is clear, no PTEs can have PG_M set. 6224 */ 6225 VM_OBJECT_ASSERT_WLOCKED(m->object); 6226 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 6227 return (FALSE); 6228 return (pmap_page_test_mappings(m, FALSE, TRUE)); 6229 } 6230 6231 /* 6232 * pmap_is_prefaultable: 6233 * 6234 * Return whether or not the specified virtual address is eligible 6235 * for prefault. 6236 */ 6237 boolean_t 6238 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 6239 { 6240 pd_entry_t *pde; 6241 pt_entry_t *pte, PG_V; 6242 boolean_t rv; 6243 6244 PG_V = pmap_valid_bit(pmap); 6245 rv = FALSE; 6246 PMAP_LOCK(pmap); 6247 pde = pmap_pde(pmap, addr); 6248 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 6249 pte = pmap_pde_to_pte(pde, addr); 6250 rv = (*pte & PG_V) == 0; 6251 } 6252 PMAP_UNLOCK(pmap); 6253 return (rv); 6254 } 6255 6256 /* 6257 * pmap_is_referenced: 6258 * 6259 * Return whether or not the specified physical page was referenced 6260 * in any physical maps. 6261 */ 6262 boolean_t 6263 pmap_is_referenced(vm_page_t m) 6264 { 6265 6266 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6267 ("pmap_is_referenced: page %p is not managed", m)); 6268 return (pmap_page_test_mappings(m, TRUE, FALSE)); 6269 } 6270 6271 /* 6272 * Clear the write and modified bits in each of the given page's mappings. 6273 */ 6274 void 6275 pmap_remove_write(vm_page_t m) 6276 { 6277 struct md_page *pvh; 6278 pmap_t pmap; 6279 struct rwlock *lock; 6280 pv_entry_t next_pv, pv; 6281 pd_entry_t *pde; 6282 pt_entry_t oldpte, *pte, PG_M, PG_RW; 6283 vm_offset_t va; 6284 int pvh_gen, md_gen; 6285 6286 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6287 ("pmap_remove_write: page %p is not managed", m)); 6288 6289 /* 6290 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 6291 * set by another thread while the object is locked. 
Thus,
6292 * if PGA_WRITEABLE is clear, no page table entries need updating.
6293 */
6294 VM_OBJECT_ASSERT_WLOCKED(m->object);
6295 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6296 return;
6297 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6298 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6299 pa_to_pvh(VM_PAGE_TO_PHYS(m));
6300 retry_pv_loop:
6301 rw_wlock(lock);
6302 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6303 pmap = PV_PMAP(pv);
6304 if (!PMAP_TRYLOCK(pmap)) {
6305 pvh_gen = pvh->pv_gen;
6306 rw_wunlock(lock);
6307 PMAP_LOCK(pmap);
6308 rw_wlock(lock);
6309 if (pvh_gen != pvh->pv_gen) {
6310 PMAP_UNLOCK(pmap);
6311 rw_wunlock(lock);
6312 goto retry_pv_loop;
6313 }
6314 }
6315 PG_RW = pmap_rw_bit(pmap);
6316 va = pv->pv_va;
6317 pde = pmap_pde(pmap, va);
6318 if ((*pde & PG_RW) != 0)
6319 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
6320 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6321 ("inconsistent pv lock %p %p for page %p",
6322 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6323 PMAP_UNLOCK(pmap);
6324 }
6325 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6326 pmap = PV_PMAP(pv);
6327 if (!PMAP_TRYLOCK(pmap)) {
6328 pvh_gen = pvh->pv_gen;
6329 md_gen = m->md.pv_gen;
6330 rw_wunlock(lock);
6331 PMAP_LOCK(pmap);
6332 rw_wlock(lock);
6333 if (pvh_gen != pvh->pv_gen ||
6334 md_gen != m->md.pv_gen) {
6335 PMAP_UNLOCK(pmap);
6336 rw_wunlock(lock);
6337 goto retry_pv_loop;
6338 }
6339 }
6340 PG_M = pmap_modified_bit(pmap);
6341 PG_RW = pmap_rw_bit(pmap);
6342 pde = pmap_pde(pmap, pv->pv_va);
6343 KASSERT((*pde & PG_PS) == 0,
6344 ("pmap_remove_write: found a 2mpage in page %p's pv list",
6345 m));
6346 pte = pmap_pde_to_pte(pde, pv->pv_va);
6347 retry:
6348 oldpte = *pte;
6349 if (oldpte & PG_RW) {
6350 if (!atomic_cmpset_long(pte, oldpte, oldpte &
6351 ~(PG_RW | PG_M)))
6352 goto retry;
6353 if ((oldpte & PG_M) != 0)
6354 vm_page_dirty(m);
6355 pmap_invalidate_page(pmap, pv->pv_va);
6356 }
6357 PMAP_UNLOCK(pmap);
6358 }
6359 rw_wunlock(lock);
6360 vm_page_aflag_clear(m, PGA_WRITEABLE);
6361 pmap_delayed_invl_wait(m);
6362 }
6363
6364 static __inline boolean_t
6365 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
6366 {
6367
6368 if (!pmap_emulate_ad_bits(pmap))
6369 return (TRUE);
6370
6371 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
6372
6373 /*
6374 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration,
6375 * so we do not allow the referenced (aka EPT_PG_READ) bit to be
6376 * cleared if the EPT_PG_WRITE bit is set.
6377 */
6378 if ((pte & EPT_PG_WRITE) != 0)
6379 return (FALSE);
6380
6381 /*
6382 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY flag is set.
6383 */
6384 if ((pte & EPT_PG_EXECUTE) == 0 ||
6385 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
6386 return (TRUE);
6387 else
6388 return (FALSE);
6389 }
6390
6391 /*
6392 * pmap_ts_referenced:
6393 *
6394 * Return a count of reference bits for a page, clearing those bits.
6395 * It is not necessary for every reference bit to be cleared, but it
6396 * is necessary that 0 only be returned when there are truly no
6397 * reference bits set.
6398 *
6399 * As an optimization, update the page's dirty field if a modified bit is
6400 * found while counting reference bits. This opportunistic update can be
6401 * performed at low cost and can eliminate the need for some future calls
6402 * to pmap_is_modified(). However, since this function stops after
6403 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6404 * dirty pages.
Those dirty pages will only be detected by a future call 6405 * to pmap_is_modified(). 6406 * 6407 * A DI block is not needed within this function, because 6408 * invalidations are performed before the PV list lock is 6409 * released. 6410 */ 6411 int 6412 pmap_ts_referenced(vm_page_t m) 6413 { 6414 struct md_page *pvh; 6415 pv_entry_t pv, pvf; 6416 pmap_t pmap; 6417 struct rwlock *lock; 6418 pd_entry_t oldpde, *pde; 6419 pt_entry_t *pte, PG_A, PG_M, PG_RW; 6420 vm_offset_t va; 6421 vm_paddr_t pa; 6422 int cleared, md_gen, not_cleared, pvh_gen; 6423 struct spglist free; 6424 boolean_t demoted; 6425 6426 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6427 ("pmap_ts_referenced: page %p is not managed", m)); 6428 SLIST_INIT(&free); 6429 cleared = 0; 6430 pa = VM_PAGE_TO_PHYS(m); 6431 lock = PHYS_TO_PV_LIST_LOCK(pa); 6432 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 6433 rw_wlock(lock); 6434 retry: 6435 not_cleared = 0; 6436 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 6437 goto small_mappings; 6438 pv = pvf; 6439 do { 6440 if (pvf == NULL) 6441 pvf = pv; 6442 pmap = PV_PMAP(pv); 6443 if (!PMAP_TRYLOCK(pmap)) { 6444 pvh_gen = pvh->pv_gen; 6445 rw_wunlock(lock); 6446 PMAP_LOCK(pmap); 6447 rw_wlock(lock); 6448 if (pvh_gen != pvh->pv_gen) { 6449 PMAP_UNLOCK(pmap); 6450 goto retry; 6451 } 6452 } 6453 PG_A = pmap_accessed_bit(pmap); 6454 PG_M = pmap_modified_bit(pmap); 6455 PG_RW = pmap_rw_bit(pmap); 6456 va = pv->pv_va; 6457 pde = pmap_pde(pmap, pv->pv_va); 6458 oldpde = *pde; 6459 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6460 /* 6461 * Although "oldpde" is mapping a 2MB page, because 6462 * this function is called at a 4KB page granularity, 6463 * we only update the 4KB page under test. 6464 */ 6465 vm_page_dirty(m); 6466 } 6467 if ((oldpde & PG_A) != 0) { 6468 /* 6469 * Since this reference bit is shared by 512 4KB 6470 * pages, it should not be cleared every time it is 6471 * tested. Apply a simple "hash" function on the 6472 * physical page number, the virtual superpage number, 6473 * and the pmap address to select one 4KB page out of 6474 * the 512 on which testing the reference bit will 6475 * result in clearing that reference bit. This 6476 * function is designed to avoid the selection of the 6477 * same 4KB page for every 2MB page mapping. 6478 * 6479 * On demotion, a mapping that hasn't been referenced 6480 * is simply destroyed. To avoid the possibility of a 6481 * subsequent page fault on a demoted wired mapping, 6482 * always leave its reference bit set. Moreover, 6483 * since the superpage is wired, the current state of 6484 * its reference bit won't affect page replacement. 6485 */ 6486 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 6487 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 6488 (oldpde & PG_W) == 0) { 6489 if (safe_to_clear_referenced(pmap, oldpde)) { 6490 atomic_clear_long(pde, PG_A); 6491 pmap_invalidate_page(pmap, pv->pv_va); 6492 demoted = FALSE; 6493 } else if (pmap_demote_pde_locked(pmap, pde, 6494 pv->pv_va, &lock)) { 6495 /* 6496 * Remove the mapping to a single page 6497 * so that a subsequent access may 6498 * repromote. Since the underlying 6499 * page table page is fully populated, 6500 * this removal never frees a page 6501 * table page. 
6502 */ 6503 demoted = TRUE; 6504 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6505 PG_PS_FRAME); 6506 pte = pmap_pde_to_pte(pde, va); 6507 pmap_remove_pte(pmap, pte, va, *pde, 6508 NULL, &lock); 6509 pmap_invalidate_page(pmap, va); 6510 } else 6511 demoted = TRUE; 6512 6513 if (demoted) { 6514 /* 6515 * The superpage mapping was removed 6516 * entirely and therefore 'pv' is no 6517 * longer valid. 6518 */ 6519 if (pvf == pv) 6520 pvf = NULL; 6521 pv = NULL; 6522 } 6523 cleared++; 6524 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6525 ("inconsistent pv lock %p %p for page %p", 6526 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6527 } else 6528 not_cleared++; 6529 } 6530 PMAP_UNLOCK(pmap); 6531 /* Rotate the PV list if it has more than one entry. */ 6532 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6533 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 6534 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 6535 pvh->pv_gen++; 6536 } 6537 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 6538 goto out; 6539 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 6540 small_mappings: 6541 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 6542 goto out; 6543 pv = pvf; 6544 do { 6545 if (pvf == NULL) 6546 pvf = pv; 6547 pmap = PV_PMAP(pv); 6548 if (!PMAP_TRYLOCK(pmap)) { 6549 pvh_gen = pvh->pv_gen; 6550 md_gen = m->md.pv_gen; 6551 rw_wunlock(lock); 6552 PMAP_LOCK(pmap); 6553 rw_wlock(lock); 6554 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6555 PMAP_UNLOCK(pmap); 6556 goto retry; 6557 } 6558 } 6559 PG_A = pmap_accessed_bit(pmap); 6560 PG_M = pmap_modified_bit(pmap); 6561 PG_RW = pmap_rw_bit(pmap); 6562 pde = pmap_pde(pmap, pv->pv_va); 6563 KASSERT((*pde & PG_PS) == 0, 6564 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 6565 m)); 6566 pte = pmap_pde_to_pte(pde, pv->pv_va); 6567 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6568 vm_page_dirty(m); 6569 if ((*pte & PG_A) != 0) { 6570 if (safe_to_clear_referenced(pmap, *pte)) { 6571 atomic_clear_long(pte, PG_A); 6572 pmap_invalidate_page(pmap, pv->pv_va); 6573 cleared++; 6574 } else if ((*pte & PG_W) == 0) { 6575 /* 6576 * Wired pages cannot be paged out so 6577 * doing accessed bit emulation for 6578 * them is wasted effort. We do the 6579 * hard work for unwired pages only. 6580 */ 6581 pmap_remove_pte(pmap, pte, pv->pv_va, 6582 *pde, &free, &lock); 6583 pmap_invalidate_page(pmap, pv->pv_va); 6584 cleared++; 6585 if (pvf == pv) 6586 pvf = NULL; 6587 pv = NULL; 6588 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6589 ("inconsistent pv lock %p %p for page %p", 6590 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6591 } else 6592 not_cleared++; 6593 } 6594 PMAP_UNLOCK(pmap); 6595 /* Rotate the PV list if it has more than one entry. */ 6596 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6597 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6598 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 6599 m->md.pv_gen++; 6600 } 6601 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 6602 not_cleared < PMAP_TS_REFERENCED_MAX); 6603 out: 6604 rw_wunlock(lock); 6605 vm_page_free_pages_toq(&free, true); 6606 return (cleared + not_cleared); 6607 } 6608 6609 /* 6610 * Apply the given advice to the specified range of addresses within the 6611 * given pmap. Depending on the advice, clear the referenced and/or 6612 * modified flags in each mapping and set the mapped page's dirty field. 
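*
* This is the machine-dependent backend of madvise(2); assuming
* the usual VM layering, a sketch of the call path is
*
*	madvise(addr, len, MADV_FREE)
*	  -> vm_map_madvise()
*	       -> pmap_advise(pmap, start, end, MADV_FREE)
*
* so the loop below runs over page-aligned bounds chosen by the
* machine-independent layer.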
6613 */ 6614 void 6615 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 6616 { 6617 struct rwlock *lock; 6618 pml4_entry_t *pml4e; 6619 pdp_entry_t *pdpe; 6620 pd_entry_t oldpde, *pde; 6621 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 6622 vm_offset_t va, va_next; 6623 vm_page_t m; 6624 boolean_t anychanged; 6625 6626 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6627 return; 6628 6629 /* 6630 * A/D bit emulation requires an alternate code path when clearing 6631 * the modified and accessed bits below. Since this function is 6632 * advisory in nature we skip it entirely for pmaps that require 6633 * A/D bit emulation. 6634 */ 6635 if (pmap_emulate_ad_bits(pmap)) 6636 return; 6637 6638 PG_A = pmap_accessed_bit(pmap); 6639 PG_G = pmap_global_bit(pmap); 6640 PG_M = pmap_modified_bit(pmap); 6641 PG_V = pmap_valid_bit(pmap); 6642 PG_RW = pmap_rw_bit(pmap); 6643 anychanged = FALSE; 6644 pmap_delayed_invl_started(); 6645 PMAP_LOCK(pmap); 6646 for (; sva < eva; sva = va_next) { 6647 pml4e = pmap_pml4e(pmap, sva); 6648 if ((*pml4e & PG_V) == 0) { 6649 va_next = (sva + NBPML4) & ~PML4MASK; 6650 if (va_next < sva) 6651 va_next = eva; 6652 continue; 6653 } 6654 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6655 if ((*pdpe & PG_V) == 0) { 6656 va_next = (sva + NBPDP) & ~PDPMASK; 6657 if (va_next < sva) 6658 va_next = eva; 6659 continue; 6660 } 6661 va_next = (sva + NBPDR) & ~PDRMASK; 6662 if (va_next < sva) 6663 va_next = eva; 6664 pde = pmap_pdpe_to_pde(pdpe, sva); 6665 oldpde = *pde; 6666 if ((oldpde & PG_V) == 0) 6667 continue; 6668 else if ((oldpde & PG_PS) != 0) { 6669 if ((oldpde & PG_MANAGED) == 0) 6670 continue; 6671 lock = NULL; 6672 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 6673 if (lock != NULL) 6674 rw_wunlock(lock); 6675 6676 /* 6677 * The large page mapping was destroyed. 6678 */ 6679 continue; 6680 } 6681 6682 /* 6683 * Unless the page mappings are wired, remove the 6684 * mapping to a single page so that a subsequent 6685 * access may repromote. Since the underlying page 6686 * table page is fully populated, this removal never 6687 * frees a page table page. 6688 */ 6689 if ((oldpde & PG_W) == 0) { 6690 pte = pmap_pde_to_pte(pde, sva); 6691 KASSERT((*pte & PG_V) != 0, 6692 ("pmap_advise: invalid PTE")); 6693 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 6694 &lock); 6695 anychanged = TRUE; 6696 } 6697 if (lock != NULL) 6698 rw_wunlock(lock); 6699 } 6700 if (va_next > eva) 6701 va_next = eva; 6702 va = va_next; 6703 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6704 sva += PAGE_SIZE) { 6705 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 6706 goto maybe_invlrng; 6707 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6708 if (advice == MADV_DONTNEED) { 6709 /* 6710 * Future calls to pmap_is_modified() 6711 * can be avoided by making the page 6712 * dirty now. 
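*
* (MADV_FREE deliberately skips this step: its pages may be
* discarded outright, so preserving a record of the lost
* modifications would only force needless laundering.)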
6713 */ 6714 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6715 vm_page_dirty(m); 6716 } 6717 atomic_clear_long(pte, PG_M | PG_A); 6718 } else if ((*pte & PG_A) != 0) 6719 atomic_clear_long(pte, PG_A); 6720 else 6721 goto maybe_invlrng; 6722 6723 if ((*pte & PG_G) != 0) { 6724 if (va == va_next) 6725 va = sva; 6726 } else 6727 anychanged = TRUE; 6728 continue; 6729 maybe_invlrng: 6730 if (va != va_next) { 6731 pmap_invalidate_range(pmap, va, sva); 6732 va = va_next; 6733 } 6734 } 6735 if (va != va_next) 6736 pmap_invalidate_range(pmap, va, sva); 6737 } 6738 if (anychanged) 6739 pmap_invalidate_all(pmap); 6740 PMAP_UNLOCK(pmap); 6741 pmap_delayed_invl_finished(); 6742 } 6743 6744 /* 6745 * Clear the modify bits on the specified physical page. 6746 */ 6747 void 6748 pmap_clear_modify(vm_page_t m) 6749 { 6750 struct md_page *pvh; 6751 pmap_t pmap; 6752 pv_entry_t next_pv, pv; 6753 pd_entry_t oldpde, *pde; 6754 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 6755 struct rwlock *lock; 6756 vm_offset_t va; 6757 int md_gen, pvh_gen; 6758 6759 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6760 ("pmap_clear_modify: page %p is not managed", m)); 6761 VM_OBJECT_ASSERT_WLOCKED(m->object); 6762 KASSERT(!vm_page_xbusied(m), 6763 ("pmap_clear_modify: page %p is exclusive busied", m)); 6764 6765 /* 6766 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 6767 * If the object containing the page is locked and the page is not 6768 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 6769 */ 6770 if ((m->aflags & PGA_WRITEABLE) == 0) 6771 return; 6772 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6773 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6774 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6775 rw_wlock(lock); 6776 restart: 6777 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6778 pmap = PV_PMAP(pv); 6779 if (!PMAP_TRYLOCK(pmap)) { 6780 pvh_gen = pvh->pv_gen; 6781 rw_wunlock(lock); 6782 PMAP_LOCK(pmap); 6783 rw_wlock(lock); 6784 if (pvh_gen != pvh->pv_gen) { 6785 PMAP_UNLOCK(pmap); 6786 goto restart; 6787 } 6788 } 6789 PG_M = pmap_modified_bit(pmap); 6790 PG_V = pmap_valid_bit(pmap); 6791 PG_RW = pmap_rw_bit(pmap); 6792 va = pv->pv_va; 6793 pde = pmap_pde(pmap, va); 6794 oldpde = *pde; 6795 if ((oldpde & PG_RW) != 0) { 6796 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 6797 if ((oldpde & PG_W) == 0) { 6798 /* 6799 * Write protect the mapping to a 6800 * single page so that a subsequent 6801 * write access may repromote. 
6802 */ 6803 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6804 PG_PS_FRAME); 6805 pte = pmap_pde_to_pte(pde, va); 6806 oldpte = *pte; 6807 if ((oldpte & PG_V) != 0) { 6808 while (!atomic_cmpset_long(pte, 6809 oldpte, 6810 oldpte & ~(PG_M | PG_RW))) 6811 oldpte = *pte; 6812 vm_page_dirty(m); 6813 pmap_invalidate_page(pmap, va); 6814 } 6815 } 6816 } 6817 } 6818 PMAP_UNLOCK(pmap); 6819 } 6820 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6821 pmap = PV_PMAP(pv); 6822 if (!PMAP_TRYLOCK(pmap)) { 6823 md_gen = m->md.pv_gen; 6824 pvh_gen = pvh->pv_gen; 6825 rw_wunlock(lock); 6826 PMAP_LOCK(pmap); 6827 rw_wlock(lock); 6828 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6829 PMAP_UNLOCK(pmap); 6830 goto restart; 6831 } 6832 } 6833 PG_M = pmap_modified_bit(pmap); 6834 PG_RW = pmap_rw_bit(pmap); 6835 pde = pmap_pde(pmap, pv->pv_va); 6836 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 6837 " a 2mpage in page %p's pv list", m)); 6838 pte = pmap_pde_to_pte(pde, pv->pv_va); 6839 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6840 atomic_clear_long(pte, PG_M); 6841 pmap_invalidate_page(pmap, pv->pv_va); 6842 } 6843 PMAP_UNLOCK(pmap); 6844 } 6845 rw_wunlock(lock); 6846 } 6847 6848 /* 6849 * Miscellaneous support routines follow 6850 */ 6851 6852 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 6853 static __inline void 6854 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 6855 { 6856 u_int opte, npte; 6857 6858 /* 6859 * The cache mode bits are all in the low 32-bits of the 6860 * PTE, so we can just spin on updating the low 32-bits. 6861 */ 6862 do { 6863 opte = *(u_int *)pte; 6864 npte = opte & ~mask; 6865 npte |= cache_bits; 6866 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 6867 } 6868 6869 /* Adjust the cache mode for a 2MB page mapped via a PDE. */ 6870 static __inline void 6871 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 6872 { 6873 u_int opde, npde; 6874 6875 /* 6876 * The cache mode bits are all in the low 32-bits of the 6877 * PDE, so we can just spin on updating the low 32-bits. 6878 */ 6879 do { 6880 opde = *(u_int *)pde; 6881 npde = opde & ~mask; 6882 npde |= cache_bits; 6883 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 6884 } 6885 6886 /* 6887 * Map a set of physical memory pages into the kernel virtual 6888 * address space. Return a pointer to where it is mapped. This 6889 * routine is intended to be used for mapping device memory, 6890 * NOT real memory. 6891 */ 6892 void * 6893 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 6894 { 6895 struct pmap_preinit_mapping *ppim; 6896 vm_offset_t va, offset; 6897 vm_size_t tmpsize; 6898 int i; 6899 6900 offset = pa & PAGE_MASK; 6901 size = round_page(offset + size); 6902 pa = trunc_page(pa); 6903 6904 if (!pmap_initialized) { 6905 va = 0; 6906 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6907 ppim = pmap_preinit_mapping + i; 6908 if (ppim->va == 0) { 6909 ppim->pa = pa; 6910 ppim->sz = size; 6911 ppim->mode = mode; 6912 ppim->va = virtual_avail; 6913 virtual_avail += size; 6914 va = ppim->va; 6915 break; 6916 } 6917 } 6918 if (va == 0) 6919 panic("%s: too many preinit mappings", __func__); 6920 } else { 6921 /* 6922 * If we have a preinit mapping, re-use it. 
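*
* (Preinit mappings are handed out by the !pmap_initialized
* branch above, before the KVA allocator is available; matching
* on pa/size/mode lets a later identical request share the same
* early KVA.)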
6923 */ 6924 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6925 ppim = pmap_preinit_mapping + i; 6926 if (ppim->pa == pa && ppim->sz == size && 6927 ppim->mode == mode) 6928 return ((void *)(ppim->va + offset)); 6929 } 6930 /* 6931 * If the specified range of physical addresses fits within 6932 * the direct map window, use the direct map. 6933 */ 6934 if (pa < dmaplimit && pa + size < dmaplimit) { 6935 va = PHYS_TO_DMAP(pa); 6936 if (!pmap_change_attr(va, size, mode)) 6937 return ((void *)(va + offset)); 6938 } 6939 va = kva_alloc(size); 6940 if (va == 0) 6941 panic("%s: Couldn't allocate KVA", __func__); 6942 } 6943 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6944 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 6945 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 6946 pmap_invalidate_cache_range(va, va + tmpsize, FALSE); 6947 return ((void *)(va + offset)); 6948 } 6949 6950 void * 6951 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 6952 { 6953 6954 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 6955 } 6956 6957 void * 6958 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6959 { 6960 6961 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 6962 } 6963 6964 void 6965 pmap_unmapdev(vm_offset_t va, vm_size_t size) 6966 { 6967 struct pmap_preinit_mapping *ppim; 6968 vm_offset_t offset; 6969 int i; 6970 6971 /* If we gave a direct map region in pmap_mapdev, do nothing */ 6972 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 6973 return; 6974 offset = va & PAGE_MASK; 6975 size = round_page(offset + size); 6976 va = trunc_page(va); 6977 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6978 ppim = pmap_preinit_mapping + i; 6979 if (ppim->va == va && ppim->sz == size) { 6980 if (pmap_initialized) 6981 return; 6982 ppim->pa = 0; 6983 ppim->va = 0; 6984 ppim->sz = 0; 6985 ppim->mode = 0; 6986 if (va + size == virtual_avail) 6987 virtual_avail = va; 6988 return; 6989 } 6990 } 6991 if (pmap_initialized) 6992 kva_free(va, size); 6993 } 6994 6995 /* 6996 * Tries to demote a 1GB page mapping. 6997 */ 6998 static boolean_t 6999 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 7000 { 7001 pdp_entry_t newpdpe, oldpdpe; 7002 pd_entry_t *firstpde, newpde, *pde; 7003 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7004 vm_paddr_t pdpgpa; 7005 vm_page_t pdpg; 7006 7007 PG_A = pmap_accessed_bit(pmap); 7008 PG_M = pmap_modified_bit(pmap); 7009 PG_V = pmap_valid_bit(pmap); 7010 PG_RW = pmap_rw_bit(pmap); 7011 7012 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7013 oldpdpe = *pdpe; 7014 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 7015 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 7016 if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 7017 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 7018 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 7019 " in pmap %p", va, pmap); 7020 return (FALSE); 7021 } 7022 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 7023 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 7024 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 7025 KASSERT((oldpdpe & PG_A) != 0, 7026 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 7027 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 7028 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 7029 newpde = oldpdpe; 7030 7031 /* 7032 * Initialize the page directory page. 7033 */ 7034 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 7035 *pde = newpde; 7036 newpde += NBPDR; 7037 } 7038 7039 /* 7040 * Demote the mapping. 
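*
* (Every PDE above was seeded from "oldpdpe" and advanced by
* NBPDR, so the new page directory maps the same 1GB of physical
* memory in 2MB slices; installing "newpdpe" is the only change
* visible to other CPUs.)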
7041 */ 7042 *pdpe = newpdpe; 7043 7044 /* 7045 * Invalidate a stale recursive mapping of the page directory page. 7046 */ 7047 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 7048 7049 pmap_pdpe_demotions++; 7050 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 7051 " in pmap %p", va, pmap); 7052 return (TRUE); 7053 } 7054 7055 /* 7056 * Sets the memory attribute for the specified page. 7057 */ 7058 void 7059 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 7060 { 7061 7062 m->md.pat_mode = ma; 7063 7064 /* 7065 * If "m" is a normal page, update its direct mapping. This update 7066 * can be relied upon to perform any cache operations that are 7067 * required for data coherence. 7068 */ 7069 if ((m->flags & PG_FICTITIOUS) == 0 && 7070 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 7071 m->md.pat_mode)) 7072 panic("memory attribute change on the direct map failed"); 7073 } 7074 7075 /* 7076 * Changes the specified virtual address range's memory type to that given by 7077 * the parameter "mode". The specified virtual address range must be 7078 * completely contained within either the direct map or the kernel map. If 7079 * the virtual address range is contained within the kernel map, then the 7080 * memory type for each of the corresponding ranges of the direct map is also 7081 * changed. (The corresponding ranges of the direct map are those ranges that 7082 * map the same physical pages as the specified virtual address range.) These 7083 * changes to the direct map are necessary because Intel describes the 7084 * behavior of their processors as "undefined" if two or more mappings to the 7085 * same physical page have different memory types. 7086 * 7087 * Returns zero if the change completed successfully, and either EINVAL or 7088 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 7089 * of the virtual address range was not mapped, and ENOMEM is returned if 7090 * there was insufficient memory available to complete the change. In the 7091 * latter case, the memory type may have been changed on some part of the 7092 * virtual address range or the direct map. 7093 */ 7094 int 7095 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 7096 { 7097 int error; 7098 7099 PMAP_LOCK(kernel_pmap); 7100 error = pmap_change_attr_locked(va, size, mode); 7101 PMAP_UNLOCK(kernel_pmap); 7102 return (error); 7103 } 7104 7105 static int 7106 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 7107 { 7108 vm_offset_t base, offset, tmpva; 7109 vm_paddr_t pa_start, pa_end, pa_end1; 7110 pdp_entry_t *pdpe; 7111 pd_entry_t *pde; 7112 pt_entry_t *pte; 7113 int cache_bits_pte, cache_bits_pde, error; 7114 boolean_t changed; 7115 7116 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 7117 base = trunc_page(va); 7118 offset = va & PAGE_MASK; 7119 size = round_page(offset + size); 7120 7121 /* 7122 * Only supported on kernel virtual addresses, including the direct 7123 * map but excluding the recursive map. 7124 */ 7125 if (base < DMAP_MIN_ADDRESS) 7126 return (EINVAL); 7127 7128 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 7129 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 7130 changed = FALSE; 7131 7132 /* 7133 * Pages that aren't mapped aren't supported. Also break down 2MB pages 7134 * into 4KB pages if required. 
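*
* This loop is the first of two passes over the range: it only
* validates that every page is mapped, demoting 1GB and 2MB
* mappings that straddle the request's boundaries, so that the
* second pass can apply the new cache bits without failing
* partway through.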
7135 */ 7136 for (tmpva = base; tmpva < base + size; ) { 7137 pdpe = pmap_pdpe(kernel_pmap, tmpva); 7138 if (pdpe == NULL || *pdpe == 0) 7139 return (EINVAL); 7140 if (*pdpe & PG_PS) { 7141 /* 7142 * If the current 1GB page already has the required 7143 * memory type, then we need not demote this page. Just 7144 * increment tmpva to the next 1GB page frame. 7145 */ 7146 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 7147 tmpva = trunc_1gpage(tmpva) + NBPDP; 7148 continue; 7149 } 7150 7151 /* 7152 * If the current offset aligns with a 1GB page frame 7153 * and there is at least 1GB left within the range, then 7154 * we need not break down this page into 2MB pages. 7155 */ 7156 if ((tmpva & PDPMASK) == 0 && 7157 tmpva + PDPMASK < base + size) { 7158 tmpva += NBPDP; 7159 continue; 7160 } 7161 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 7162 return (ENOMEM); 7163 } 7164 pde = pmap_pdpe_to_pde(pdpe, tmpva); 7165 if (*pde == 0) 7166 return (EINVAL); 7167 if (*pde & PG_PS) { 7168 /* 7169 * If the current 2MB page already has the required 7170 * memory type, then we need not demote this page. Just 7171 * increment tmpva to the next 2MB page frame. 7172 */ 7173 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 7174 tmpva = trunc_2mpage(tmpva) + NBPDR; 7175 continue; 7176 } 7177 7178 /* 7179 * If the current offset aligns with a 2MB page frame 7180 * and there is at least 2MB left within the range, then 7181 * we need not break down this page into 4KB pages. 7182 */ 7183 if ((tmpva & PDRMASK) == 0 && 7184 tmpva + PDRMASK < base + size) { 7185 tmpva += NBPDR; 7186 continue; 7187 } 7188 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 7189 return (ENOMEM); 7190 } 7191 pte = pmap_pde_to_pte(pde, tmpva); 7192 if (*pte == 0) 7193 return (EINVAL); 7194 tmpva += PAGE_SIZE; 7195 } 7196 error = 0; 7197 7198 /* 7199 * Ok, all the pages exist, so run through them updating their 7200 * cache mode if required. 7201 */ 7202 pa_start = pa_end = 0; 7203 for (tmpva = base; tmpva < base + size; ) { 7204 pdpe = pmap_pdpe(kernel_pmap, tmpva); 7205 if (*pdpe & PG_PS) { 7206 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 7207 pmap_pde_attr(pdpe, cache_bits_pde, 7208 X86_PG_PDE_CACHE); 7209 changed = TRUE; 7210 } 7211 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 7212 (*pdpe & PG_PS_FRAME) < dmaplimit) { 7213 if (pa_start == pa_end) { 7214 /* Start physical address run. */ 7215 pa_start = *pdpe & PG_PS_FRAME; 7216 pa_end = pa_start + NBPDP; 7217 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 7218 pa_end += NBPDP; 7219 else { 7220 /* Run ended, update direct map. */ 7221 error = pmap_change_attr_locked( 7222 PHYS_TO_DMAP(pa_start), 7223 pa_end - pa_start, mode); 7224 if (error != 0) 7225 break; 7226 /* Start physical address run. */ 7227 pa_start = *pdpe & PG_PS_FRAME; 7228 pa_end = pa_start + NBPDP; 7229 } 7230 } 7231 tmpva = trunc_1gpage(tmpva) + NBPDP; 7232 continue; 7233 } 7234 pde = pmap_pdpe_to_pde(pdpe, tmpva); 7235 if (*pde & PG_PS) { 7236 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 7237 pmap_pde_attr(pde, cache_bits_pde, 7238 X86_PG_PDE_CACHE); 7239 changed = TRUE; 7240 } 7241 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 7242 (*pde & PG_PS_FRAME) < dmaplimit) { 7243 if (pa_start == pa_end) { 7244 /* Start physical address run. */ 7245 pa_start = *pde & PG_PS_FRAME; 7246 pa_end = pa_start + NBPDR; 7247 } else if (pa_end == (*pde & PG_PS_FRAME)) 7248 pa_end += NBPDR; 7249 else { 7250 /* Run ended, update direct map. 
*/ 7251 error = pmap_change_attr_locked( 7252 PHYS_TO_DMAP(pa_start), 7253 pa_end - pa_start, mode); 7254 if (error != 0) 7255 break; 7256 /* Start physical address run. */ 7257 pa_start = *pde & PG_PS_FRAME; 7258 pa_end = pa_start + NBPDR; 7259 } 7260 } 7261 tmpva = trunc_2mpage(tmpva) + NBPDR; 7262 } else { 7263 pte = pmap_pde_to_pte(pde, tmpva); 7264 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 7265 pmap_pte_attr(pte, cache_bits_pte, 7266 X86_PG_PTE_CACHE); 7267 changed = TRUE; 7268 } 7269 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 7270 (*pte & PG_FRAME) < dmaplimit) { 7271 if (pa_start == pa_end) { 7272 /* Start physical address run. */ 7273 pa_start = *pte & PG_FRAME; 7274 pa_end = pa_start + PAGE_SIZE; 7275 } else if (pa_end == (*pte & PG_FRAME)) 7276 pa_end += PAGE_SIZE; 7277 else { 7278 /* Run ended, update direct map. */ 7279 error = pmap_change_attr_locked( 7280 PHYS_TO_DMAP(pa_start), 7281 pa_end - pa_start, mode); 7282 if (error != 0) 7283 break; 7284 /* Start physical address run. */ 7285 pa_start = *pte & PG_FRAME; 7286 pa_end = pa_start + PAGE_SIZE; 7287 } 7288 } 7289 tmpva += PAGE_SIZE; 7290 } 7291 } 7292 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 7293 pa_end1 = MIN(pa_end, dmaplimit); 7294 if (pa_start != pa_end1) 7295 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 7296 pa_end1 - pa_start, mode); 7297 } 7298 7299 /* 7300 * Flush CPU caches if required to make sure any data isn't cached that 7301 * shouldn't be, etc. 7302 */ 7303 if (changed) { 7304 pmap_invalidate_range(kernel_pmap, base, tmpva); 7305 pmap_invalidate_cache_range(base, tmpva, FALSE); 7306 } 7307 return (error); 7308 } 7309 7310 /* 7311 * Demotes any mapping within the direct map region that covers more than the 7312 * specified range of physical addresses. This range's size must be a power 7313 * of two and its starting address must be a multiple of its size. Since the 7314 * demotion does not change any attributes of the mapping, a TLB invalidation 7315 * is not mandatory. The caller may, however, request a TLB invalidation. 
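*
* For example, a caller passing len == NBPDR must supply a
* 2MB-aligned base; a 1GB mapping covering that range is then
* broken into 2MB pages, but no 2MB page is demoted, because the
* request covers an entire one.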
7316 */ 7317 void 7318 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 7319 { 7320 pdp_entry_t *pdpe; 7321 pd_entry_t *pde; 7322 vm_offset_t va; 7323 boolean_t changed; 7324 7325 if (len == 0) 7326 return; 7327 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 7328 KASSERT((base & (len - 1)) == 0, 7329 ("pmap_demote_DMAP: base is not a multiple of len")); 7330 if (len < NBPDP && base < dmaplimit) { 7331 va = PHYS_TO_DMAP(base); 7332 changed = FALSE; 7333 PMAP_LOCK(kernel_pmap); 7334 pdpe = pmap_pdpe(kernel_pmap, va); 7335 if ((*pdpe & X86_PG_V) == 0) 7336 panic("pmap_demote_DMAP: invalid PDPE"); 7337 if ((*pdpe & PG_PS) != 0) { 7338 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 7339 panic("pmap_demote_DMAP: PDPE failed"); 7340 changed = TRUE; 7341 } 7342 if (len < NBPDR) { 7343 pde = pmap_pdpe_to_pde(pdpe, va); 7344 if ((*pde & X86_PG_V) == 0) 7345 panic("pmap_demote_DMAP: invalid PDE"); 7346 if ((*pde & PG_PS) != 0) { 7347 if (!pmap_demote_pde(kernel_pmap, pde, va)) 7348 panic("pmap_demote_DMAP: PDE failed"); 7349 changed = TRUE; 7350 } 7351 } 7352 if (changed && invalidate) 7353 pmap_invalidate_page(kernel_pmap, va); 7354 PMAP_UNLOCK(kernel_pmap); 7355 } 7356 } 7357 7358 /* 7359 * perform the pmap work for mincore 7360 */ 7361 int 7362 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 7363 { 7364 pd_entry_t *pdep; 7365 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 7366 vm_paddr_t pa; 7367 int val; 7368 7369 PG_A = pmap_accessed_bit(pmap); 7370 PG_M = pmap_modified_bit(pmap); 7371 PG_V = pmap_valid_bit(pmap); 7372 PG_RW = pmap_rw_bit(pmap); 7373 7374 PMAP_LOCK(pmap); 7375 retry: 7376 pdep = pmap_pde(pmap, addr); 7377 if (pdep != NULL && (*pdep & PG_V)) { 7378 if (*pdep & PG_PS) { 7379 pte = *pdep; 7380 /* Compute the physical address of the 4KB page. */ 7381 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 7382 PG_FRAME; 7383 val = MINCORE_SUPER; 7384 } else { 7385 pte = *pmap_pde_to_pte(pdep, addr); 7386 pa = pte & PG_FRAME; 7387 val = 0; 7388 } 7389 } else { 7390 pte = 0; 7391 pa = 0; 7392 val = 0; 7393 } 7394 if ((pte & PG_V) != 0) { 7395 val |= MINCORE_INCORE; 7396 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7397 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 7398 if ((pte & PG_A) != 0) 7399 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 7400 } 7401 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 7402 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 7403 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 7404 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 7405 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 7406 goto retry; 7407 } else 7408 PA_UNLOCK_COND(*locked_pa); 7409 PMAP_UNLOCK(pmap); 7410 return (val); 7411 } 7412 7413 static uint64_t 7414 pmap_pcid_alloc(pmap_t pmap, u_int cpuid) 7415 { 7416 uint32_t gen, new_gen, pcid_next; 7417 7418 CRITICAL_ASSERT(curthread); 7419 gen = PCPU_GET(pcid_gen); 7420 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) 7421 return (pti ? 
0 : CR3_PCID_SAVE);
7422 if (pmap->pm_pcids[cpuid].pm_gen == gen)
7423 return (CR3_PCID_SAVE);
7424 pcid_next = PCPU_GET(pcid_next);
7425 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
7426 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
7427 ("cpu %d pcid_next %#x", cpuid, pcid_next));
7428 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
7429 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
7430 new_gen = gen + 1;
7431 if (new_gen == 0)
7432 new_gen = 1;
7433 PCPU_SET(pcid_gen, new_gen);
7434 pcid_next = PMAP_PCID_KERN + 1;
7435 } else {
7436 new_gen = gen;
7437 }
7438 pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
7439 pmap->pm_pcids[cpuid].pm_gen = new_gen;
7440 PCPU_SET(pcid_next, pcid_next + 1);
7441 return (0);
7442 }
7443
7444 void
7445 pmap_activate_sw(struct thread *td)
7446 {
7447 pmap_t oldpmap, pmap;
7448 struct invpcid_descr d;
7449 uint64_t cached, cr3, kcr3, kern_pti_cached, rsp0, ucr3;
7450 register_t rflags;
7451 u_int cpuid;
7452 struct amd64tss *tssp;
7453
7454 rflags = 0;
7455 oldpmap = PCPU_GET(curpmap);
7456 pmap = vmspace_pmap(td->td_proc->p_vmspace);
7457 if (oldpmap == pmap)
7458 return;
7459 cpuid = PCPU_GET(cpuid);
7460 #ifdef SMP
7461 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7462 #else
7463 CPU_SET(cpuid, &pmap->pm_active);
7464 #endif
7465 cr3 = rcr3();
7466 if (pmap_pcid_enabled) {
7467 cached = pmap_pcid_alloc(pmap, cpuid);
7468 KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
7469 pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
7470 ("pmap %p cpu %d pcid %#x", pmap, cpuid,
7471 pmap->pm_pcids[cpuid].pm_pcid));
7472 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
7473 pmap == kernel_pmap,
7474 ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
7475 td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
7476
7477 /*
7478 * If the INVPCID instruction is not available,
7479 * invltlb_pcid_handler() is used to handle the
7480 * invalidate_all IPI, which checks for curpmap ==
7481 * smp_tlb_pmap. The sequence of operations below has a
7482 * window where %CR3 is loaded with the new pmap's
7483 * PML4 address, but the curpmap value is not yet updated.
7484 * This causes the invltlb IPI handler, called between the
7485 * updates, to execute as a NOP, which leaves stale TLB
7486 * entries.
7487 *
7488 * Note that the most typical use of
7489 * pmap_activate_sw(), from the context switch, is
7490 * immune to this race, because interrupts are
7491 * disabled (while the thread lock is owned), and the IPI
7492 * happens after curpmap is updated. Protect other
7493 * callers in a similar way, by disabling interrupts
7494 * around the %cr3 register reload and the curpmap
7495 * assignment.
7496 */
7497 if (!invpcid_works)
7498 rflags = intr_disable();
7499
7500 kern_pti_cached = pti ? 0 : cached;
7501 if (!kern_pti_cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) {
7502 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
7503 kern_pti_cached);
7504 }
7505 PCPU_SET(curpmap, pmap);
7506 if (pti) {
7507 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
7508 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
7509 PMAP_PCID_USER_PT;
7510
7511 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
7512 /*
7513 * Manually invalidate translations cached
7514 * from the user page table. They are not
7515 * flushed by reload of cr3 with the kernel
7516 * page table pointer above.
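*
* (With INVPCID this is the single INVPCID_CTX issued below on
* the user page table's PCID; without it,
* pmap_pti_pcid_invalidate() instead reloads %cr3 with ucr3 and
* then kcr3, with the no-flush bit clear, for the same effect.)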
7517 */
7518 if (invpcid_works) {
7519 d.pcid = PMAP_PCID_USER_PT |
7520 pmap->pm_pcids[cpuid].pm_pcid;
7521 d.pad = 0;
7522 d.addr = 0;
7523 invpcid(&d, INVPCID_CTX);
7524 } else {
7525 pmap_pti_pcid_invalidate(ucr3, kcr3);
7526 }
7527 }
7528
7529 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
7530 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
7531 }
7532 if (!invpcid_works)
7533 intr_restore(rflags);
7534 if (cached)
7535 PCPU_INC(pm_save_cnt);
7536 } else {
7537 load_cr3(pmap->pm_cr3);
7538 PCPU_SET(curpmap, pmap);
7539 if (pti) {
7540 PCPU_SET(kcr3, pmap->pm_cr3);
7541 PCPU_SET(ucr3, pmap->pm_ucr3);
7542 }
7543 }
7544 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
7545 rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) +
7546 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful;
7547 tssp = PCPU_GET(tssp);
7548 tssp->tss_rsp0 = rsp0;
7549 }
7550 #ifdef SMP
7551 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
7552 #else
7553 CPU_CLR(cpuid, &oldpmap->pm_active);
7554 #endif
7555 }
7556
7557 void
7558 pmap_activate(struct thread *td)
7559 {
7560
7561 critical_enter();
7562 pmap_activate_sw(td);
7563 critical_exit();
7564 }
7565
7566 void
7567 pmap_activate_boot(pmap_t pmap)
7568 {
7569 uint64_t kcr3;
7570 u_int cpuid;
7571
7572 /*
7573 * kernel_pmap must never be deactivated, and we ensure that
7574 * by never activating it at all.
7575 */
7576 MPASS(pmap != kernel_pmap);
7577
7578 cpuid = PCPU_GET(cpuid);
7579 #ifdef SMP
7580 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7581 #else
7582 CPU_SET(cpuid, &pmap->pm_active);
7583 #endif
7584 PCPU_SET(curpmap, pmap);
7585 if (pti) {
7586 kcr3 = pmap->pm_cr3;
7587 if (pmap_pcid_enabled)
7588 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
7589 } else {
7590 kcr3 = PMAP_NO_CR3;
7591 }
7592 PCPU_SET(kcr3, kcr3);
7593 PCPU_SET(ucr3, PMAP_NO_CR3);
7594 }
7595
7596 void
7597 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
7598 {
7599 }
7600
7601 /*
7602 * Increase the starting virtual address of the given mapping if a
7603 * different alignment might result in more superpage mappings.
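*
* For example, if coloring places an object's backing pages
* 0x1000 bytes past a 2MB boundary, then choosing a start address
* that is itself 0x1000 bytes past a 2MB boundary lets the bulk
* of the mapping be promoted to 2MB pages.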
7604 */ 7605 void 7606 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 7607 vm_offset_t *addr, vm_size_t size) 7608 { 7609 vm_offset_t superpage_offset; 7610 7611 if (size < NBPDR) 7612 return; 7613 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 7614 offset += ptoa(object->pg_color); 7615 superpage_offset = offset & PDRMASK; 7616 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 7617 (*addr & PDRMASK) == superpage_offset) 7618 return; 7619 if ((*addr & PDRMASK) < superpage_offset) 7620 *addr = (*addr & ~PDRMASK) + superpage_offset; 7621 else 7622 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 7623 } 7624 7625 #ifdef INVARIANTS 7626 static unsigned long num_dirty_emulations; 7627 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 7628 &num_dirty_emulations, 0, NULL); 7629 7630 static unsigned long num_accessed_emulations; 7631 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 7632 &num_accessed_emulations, 0, NULL); 7633 7634 static unsigned long num_superpage_accessed_emulations; 7635 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 7636 &num_superpage_accessed_emulations, 0, NULL); 7637 7638 static unsigned long ad_emulation_superpage_promotions; 7639 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 7640 &ad_emulation_superpage_promotions, 0, NULL); 7641 #endif /* INVARIANTS */ 7642 7643 int 7644 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 7645 { 7646 int rv; 7647 struct rwlock *lock; 7648 #if VM_NRESERVLEVEL > 0 7649 vm_page_t m, mpte; 7650 #endif 7651 pd_entry_t *pde; 7652 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 7653 7654 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 7655 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 7656 7657 if (!pmap_emulate_ad_bits(pmap)) 7658 return (-1); 7659 7660 PG_A = pmap_accessed_bit(pmap); 7661 PG_M = pmap_modified_bit(pmap); 7662 PG_V = pmap_valid_bit(pmap); 7663 PG_RW = pmap_rw_bit(pmap); 7664 7665 rv = -1; 7666 lock = NULL; 7667 PMAP_LOCK(pmap); 7668 7669 pde = pmap_pde(pmap, va); 7670 if (pde == NULL || (*pde & PG_V) == 0) 7671 goto done; 7672 7673 if ((*pde & PG_PS) != 0) { 7674 if (ftype == VM_PROT_READ) { 7675 #ifdef INVARIANTS 7676 atomic_add_long(&num_superpage_accessed_emulations, 1); 7677 #endif 7678 *pde |= PG_A; 7679 rv = 0; 7680 } 7681 goto done; 7682 } 7683 7684 pte = pmap_pde_to_pte(pde, va); 7685 if ((*pte & PG_V) == 0) 7686 goto done; 7687 7688 if (ftype == VM_PROT_WRITE) { 7689 if ((*pte & PG_RW) == 0) 7690 goto done; 7691 /* 7692 * Set the modified and accessed bits simultaneously. 7693 * 7694 * Intel EPT PTEs that do software emulation of A/D bits map 7695 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 7696 * An EPT misconfiguration is triggered if the PTE is writable 7697 * but not readable (WR=10). This is avoided by setting PG_A 7698 * and PG_M simultaneously. 
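*
* (Setting PG_M alone would momentarily leave the entry
* EPT-writable but not EPT-readable, i.e. WR == 10, which is
* exactly the misconfiguration described above.)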
7699 */ 7700 *pte |= PG_M | PG_A; 7701 } else { 7702 *pte |= PG_A; 7703 } 7704 7705 #if VM_NRESERVLEVEL > 0 7706 /* try to promote the mapping */ 7707 if (va < VM_MAXUSER_ADDRESS) 7708 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7709 else 7710 mpte = NULL; 7711 7712 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 7713 7714 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 7715 pmap_ps_enabled(pmap) && 7716 (m->flags & PG_FICTITIOUS) == 0 && 7717 vm_reserv_level_iffullpop(m) == 0) { 7718 pmap_promote_pde(pmap, pde, va, &lock); 7719 #ifdef INVARIANTS 7720 atomic_add_long(&ad_emulation_superpage_promotions, 1); 7721 #endif 7722 } 7723 #endif 7724 7725 #ifdef INVARIANTS 7726 if (ftype == VM_PROT_WRITE) 7727 atomic_add_long(&num_dirty_emulations, 1); 7728 else 7729 atomic_add_long(&num_accessed_emulations, 1); 7730 #endif 7731 rv = 0; /* success */ 7732 done: 7733 if (lock != NULL) 7734 rw_wunlock(lock); 7735 PMAP_UNLOCK(pmap); 7736 return (rv); 7737 } 7738 7739 void 7740 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 7741 { 7742 pml4_entry_t *pml4; 7743 pdp_entry_t *pdp; 7744 pd_entry_t *pde; 7745 pt_entry_t *pte, PG_V; 7746 int idx; 7747 7748 idx = 0; 7749 PG_V = pmap_valid_bit(pmap); 7750 PMAP_LOCK(pmap); 7751 7752 pml4 = pmap_pml4e(pmap, va); 7753 ptr[idx++] = *pml4; 7754 if ((*pml4 & PG_V) == 0) 7755 goto done; 7756 7757 pdp = pmap_pml4e_to_pdpe(pml4, va); 7758 ptr[idx++] = *pdp; 7759 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 7760 goto done; 7761 7762 pde = pmap_pdpe_to_pde(pdp, va); 7763 ptr[idx++] = *pde; 7764 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 7765 goto done; 7766 7767 pte = pmap_pde_to_pte(pde, va); 7768 ptr[idx++] = *pte; 7769 7770 done: 7771 PMAP_UNLOCK(pmap); 7772 *num = idx; 7773 } 7774 7775 /** 7776 * Get the kernel virtual address of a set of physical pages. If there are 7777 * physical addresses not covered by the DMAP perform a transient mapping 7778 * that will be removed when calling pmap_unmap_io_transient. 7779 * 7780 * \param page The pages the caller wishes to obtain the virtual 7781 * address on the kernel memory map. 7782 * \param vaddr On return contains the kernel virtual memory address 7783 * of the pages passed in the page parameter. 7784 * \param count Number of pages passed in. 7785 * \param can_fault TRUE if the thread using the mapped pages can take 7786 * page faults, FALSE otherwise. 7787 * 7788 * \returns TRUE if the caller must call pmap_unmap_io_transient when 7789 * finished or FALSE otherwise. 7790 * 7791 */ 7792 boolean_t 7793 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7794 boolean_t can_fault) 7795 { 7796 vm_paddr_t paddr; 7797 boolean_t needs_mapping; 7798 pt_entry_t *pte; 7799 int cache_bits, error __unused, i; 7800 7801 /* 7802 * Allocate any KVA space that we need, this is done in a separate 7803 * loop to prevent calling vmem_alloc while pinned. 
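 *
 * (With M_WAITOK, vmem_alloc() may sleep waiting for KVA; doing the
 * allocations up front keeps any such sleep outside of the
 * sched_pin()'ed mapping loop below.)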
7804 */ 7805 needs_mapping = FALSE; 7806 for (i = 0; i < count; i++) { 7807 paddr = VM_PAGE_TO_PHYS(page[i]); 7808 if (__predict_false(paddr >= dmaplimit)) { 7809 error = vmem_alloc(kernel_arena, PAGE_SIZE, 7810 M_BESTFIT | M_WAITOK, &vaddr[i]); 7811 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 7812 needs_mapping = TRUE; 7813 } else { 7814 vaddr[i] = PHYS_TO_DMAP(paddr); 7815 } 7816 } 7817 7818 /* Exit early if everything is covered by the DMAP */ 7819 if (!needs_mapping) 7820 return (FALSE); 7821 7822 /* 7823 * NB: The sequence of updating a page table followed by accesses 7824 * to the corresponding pages used in the !DMAP case is subject to 7825 * the situation described in the "AMD64 Architecture Programmer's 7826 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 7827 * Coherency Considerations". Therefore, issuing the INVLPG right 7828 * after modifying the PTE bits is crucial. 7829 */ 7830 if (!can_fault) 7831 sched_pin(); 7832 for (i = 0; i < count; i++) { 7833 paddr = VM_PAGE_TO_PHYS(page[i]); 7834 if (paddr >= dmaplimit) { 7835 if (can_fault) { 7836 /* 7837 * Slow path, since we can get page faults 7838 * while mappings are active don't pin the 7839 * thread to the CPU and instead add a global 7840 * mapping visible to all CPUs. 7841 */ 7842 pmap_qenter(vaddr[i], &page[i], 1); 7843 } else { 7844 pte = vtopte(vaddr[i]); 7845 cache_bits = pmap_cache_bits(kernel_pmap, 7846 page[i]->md.pat_mode, 0); 7847 pte_store(pte, paddr | X86_PG_RW | X86_PG_V | 7848 cache_bits); 7849 invlpg(vaddr[i]); 7850 } 7851 } 7852 } 7853 7854 return (needs_mapping); 7855 } 7856 7857 void 7858 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 7859 boolean_t can_fault) 7860 { 7861 vm_paddr_t paddr; 7862 int i; 7863 7864 if (!can_fault) 7865 sched_unpin(); 7866 for (i = 0; i < count; i++) { 7867 paddr = VM_PAGE_TO_PHYS(page[i]); 7868 if (paddr >= dmaplimit) { 7869 if (can_fault) 7870 pmap_qremove(vaddr[i], 1); 7871 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 7872 } 7873 } 7874 } 7875 7876 vm_offset_t 7877 pmap_quick_enter_page(vm_page_t m) 7878 { 7879 vm_paddr_t paddr; 7880 7881 paddr = VM_PAGE_TO_PHYS(m); 7882 if (paddr < dmaplimit) 7883 return (PHYS_TO_DMAP(paddr)); 7884 mtx_lock_spin(&qframe_mtx); 7885 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 7886 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 7887 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); 7888 return (qframe); 7889 } 7890 7891 void 7892 pmap_quick_remove_page(vm_offset_t addr) 7893 { 7894 7895 if (addr != qframe) 7896 return; 7897 pte_store(vtopte(qframe), 0); 7898 invlpg(qframe); 7899 mtx_unlock_spin(&qframe_mtx); 7900 } 7901 7902 static vm_page_t 7903 pmap_pti_alloc_page(void) 7904 { 7905 vm_page_t m; 7906 7907 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 7908 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | 7909 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 7910 return (m); 7911 } 7912 7913 static bool 7914 pmap_pti_free_page(vm_page_t m) 7915 { 7916 7917 KASSERT(m->wire_count > 0, ("page %p not wired", m)); 7918 if (!vm_page_unwire_noq(m)) 7919 return (false); 7920 vm_page_free_zero(m); 7921 return (true); 7922 } 7923 7924 static void 7925 pmap_pti_init(void) 7926 { 7927 vm_page_t pml4_pg; 7928 pdp_entry_t *pdpe; 7929 vm_offset_t va; 7930 int i; 7931 7932 if (!pti) 7933 return; 7934 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 7935 VM_OBJECT_WLOCK(pti_obj); 7936 pml4_pg = pmap_pti_alloc_page(); 7937 pti_pml4 = (pml4_entry_t 
*)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 7938 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 7939 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 7940 pdpe = pmap_pti_pdpe(va); 7941 pmap_pti_wire_pte(pdpe); 7942 } 7943 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 7944 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 7945 pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + 7946 sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); 7947 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 7948 sizeof(struct gate_descriptor) * NIDT, false); 7949 pmap_pti_add_kva_locked((vm_offset_t)common_tss, 7950 (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); 7951 CPU_FOREACH(i) { 7952 /* Doublefault stack IST 1 */ 7953 va = common_tss[i].tss_ist1; 7954 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 7955 /* NMI stack IST 2 */ 7956 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); 7957 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 7958 /* MC# stack IST 3 */ 7959 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); 7960 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 7961 /* DB# stack IST 4 */ 7962 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); 7963 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); 7964 } 7965 pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, 7966 (vm_offset_t)etext, true); 7967 pti_finalized = true; 7968 VM_OBJECT_WUNLOCK(pti_obj); 7969 } 7970 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); 7971 7972 static pdp_entry_t * 7973 pmap_pti_pdpe(vm_offset_t va) 7974 { 7975 pml4_entry_t *pml4e; 7976 pdp_entry_t *pdpe; 7977 vm_page_t m; 7978 vm_pindex_t pml4_idx; 7979 vm_paddr_t mphys; 7980 7981 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 7982 7983 pml4_idx = pmap_pml4e_index(va); 7984 pml4e = &pti_pml4[pml4_idx]; 7985 m = NULL; 7986 if (*pml4e == 0) { 7987 if (pti_finalized) 7988 panic("pml4 alloc after finalization\n"); 7989 m = pmap_pti_alloc_page(); 7990 if (*pml4e != 0) { 7991 pmap_pti_free_page(m); 7992 mphys = *pml4e & ~PAGE_MASK; 7993 } else { 7994 mphys = VM_PAGE_TO_PHYS(m); 7995 *pml4e = mphys | X86_PG_RW | X86_PG_V; 7996 } 7997 } else { 7998 mphys = *pml4e & ~PAGE_MASK; 7999 } 8000 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 8001 return (pdpe); 8002 } 8003 8004 static void 8005 pmap_pti_wire_pte(void *pte) 8006 { 8007 vm_page_t m; 8008 8009 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8010 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 8011 m->wire_count++; 8012 } 8013 8014 static void 8015 pmap_pti_unwire_pde(void *pde, bool only_ref) 8016 { 8017 vm_page_t m; 8018 8019 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8020 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 8021 MPASS(m->wire_count > 0); 8022 MPASS(only_ref || m->wire_count > 1); 8023 pmap_pti_free_page(m); 8024 } 8025 8026 static void 8027 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 8028 { 8029 vm_page_t m; 8030 pd_entry_t *pde; 8031 8032 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8033 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 8034 MPASS(m->wire_count > 0); 8035 if (pmap_pti_free_page(m)) { 8036 pde = pmap_pti_pde(va); 8037 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 8038 *pde = 0; 8039 pmap_pti_unwire_pde(pde, false); 8040 } 8041 } 8042 8043 static pd_entry_t * 8044 pmap_pti_pde(vm_offset_t va) 8045 { 8046 pdp_entry_t *pdpe; 8047 pd_entry_t *pde; 8048 vm_page_t m; 8049 vm_pindex_t pd_idx; 8050 vm_paddr_t mphys; 8051 8052 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8053 
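	/*
	 * The check/allocate/re-check pattern below (also used by
	 * pmap_pti_pdpe() and pmap_pti_pte()) covers the case where
	 * pmap_pti_alloc_page() had to sleep, dropping the pti_obj lock,
	 * to obtain a free page; another thread may have installed the
	 * missing page table page in the meantime, in which case the
	 * freshly allocated page is freed and the existing one is reused.
	 */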
8054 pdpe = pmap_pti_pdpe(va); 8055 if (*pdpe == 0) { 8056 m = pmap_pti_alloc_page(); 8057 if (*pdpe != 0) { 8058 pmap_pti_free_page(m); 8059 MPASS((*pdpe & X86_PG_PS) == 0); 8060 mphys = *pdpe & ~PAGE_MASK; 8061 } else { 8062 mphys = VM_PAGE_TO_PHYS(m); 8063 *pdpe = mphys | X86_PG_RW | X86_PG_V; 8064 } 8065 } else { 8066 MPASS((*pdpe & X86_PG_PS) == 0); 8067 mphys = *pdpe & ~PAGE_MASK; 8068 } 8069 8070 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 8071 pd_idx = pmap_pde_index(va); 8072 pde += pd_idx; 8073 return (pde); 8074 } 8075 8076 static pt_entry_t * 8077 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 8078 { 8079 pd_entry_t *pde; 8080 pt_entry_t *pte; 8081 vm_page_t m; 8082 vm_paddr_t mphys; 8083 8084 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8085 8086 pde = pmap_pti_pde(va); 8087 if (unwire_pde != NULL) { 8088 *unwire_pde = true; 8089 pmap_pti_wire_pte(pde); 8090 } 8091 if (*pde == 0) { 8092 m = pmap_pti_alloc_page(); 8093 if (*pde != 0) { 8094 pmap_pti_free_page(m); 8095 MPASS((*pde & X86_PG_PS) == 0); 8096 mphys = *pde & ~(PAGE_MASK | pg_nx); 8097 } else { 8098 mphys = VM_PAGE_TO_PHYS(m); 8099 *pde = mphys | X86_PG_RW | X86_PG_V; 8100 if (unwire_pde != NULL) 8101 *unwire_pde = false; 8102 } 8103 } else { 8104 MPASS((*pde & X86_PG_PS) == 0); 8105 mphys = *pde & ~(PAGE_MASK | pg_nx); 8106 } 8107 8108 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 8109 pte += pmap_pte_index(va); 8110 8111 return (pte); 8112 } 8113 8114 static void 8115 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 8116 { 8117 vm_paddr_t pa; 8118 pd_entry_t *pde; 8119 pt_entry_t *pte, ptev; 8120 bool unwire_pde; 8121 8122 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 8123 8124 sva = trunc_page(sva); 8125 MPASS(sva > VM_MAXUSER_ADDRESS); 8126 eva = round_page(eva); 8127 MPASS(sva < eva); 8128 for (; sva < eva; sva += PAGE_SIZE) { 8129 pte = pmap_pti_pte(sva, &unwire_pde); 8130 pa = pmap_kextract(sva); 8131 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 8132 (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, 8133 VM_MEMATTR_DEFAULT, FALSE); 8134 if (*pte == 0) { 8135 pte_store(pte, ptev); 8136 pmap_pti_wire_pte(pte); 8137 } else { 8138 KASSERT(!pti_finalized, 8139 ("pti overlap after fin %#lx %#lx %#lx", 8140 sva, *pte, ptev)); 8141 KASSERT(*pte == ptev, 8142 ("pti non-identical pte after fin %#lx %#lx %#lx", 8143 sva, *pte, ptev)); 8144 } 8145 if (unwire_pde) { 8146 pde = pmap_pti_pde(sva); 8147 pmap_pti_unwire_pde(pde, true); 8148 } 8149 } 8150 } 8151 8152 void 8153 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 8154 { 8155 8156 if (!pti) 8157 return; 8158 VM_OBJECT_WLOCK(pti_obj); 8159 pmap_pti_add_kva_locked(sva, eva, exec); 8160 VM_OBJECT_WUNLOCK(pti_obj); 8161 } 8162 8163 void 8164 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 8165 { 8166 pt_entry_t *pte; 8167 vm_offset_t va; 8168 8169 if (!pti) 8170 return; 8171 sva = rounddown2(sva, PAGE_SIZE); 8172 MPASS(sva > VM_MAXUSER_ADDRESS); 8173 eva = roundup2(eva, PAGE_SIZE); 8174 MPASS(sva < eva); 8175 VM_OBJECT_WLOCK(pti_obj); 8176 for (va = sva; va < eva; va += PAGE_SIZE) { 8177 pte = pmap_pti_pte(va, NULL); 8178 KASSERT((*pte & X86_PG_V) != 0, 8179 ("invalid pte va %#lx pte %#lx pt %#lx", va, 8180 (u_long)pte, *pte)); 8181 pte_clear(pte); 8182 pmap_pti_unwire_pte(pte, va); 8183 } 8184 pmap_invalidate_range(kernel_pmap, sva, eva); 8185 VM_OBJECT_WUNLOCK(pti_obj); 8186 } 8187 8188 #include "opt_ddb.h" 8189 #ifdef DDB 8190 #include <sys/kdb.h> 8191 #include <ddb/ddb.h> 8192 8193 DB_SHOW_COMMAND(pte, pmap_print_pte) 8194 { 8195 pmap_t pmap; 8196 pml4_entry_t *pml4; 8197 pdp_entry_t *pdp; 8198 pd_entry_t *pde; 8199 pt_entry_t *pte, PG_V; 8200 vm_offset_t va; 8201 8202 if (!have_addr) { 8203 db_printf("show pte addr\n"); 8204 return; 8205 } 8206 va = (vm_offset_t)addr; 8207 8208 if (kdb_thread != NULL) 8209 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 8210 else 8211 pmap = PCPU_GET(curpmap); 8212 8213 PG_V = pmap_valid_bit(pmap); 8214 pml4 = pmap_pml4e(pmap, va); 8215 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 8216 if ((*pml4 & PG_V) == 0) { 8217 db_printf("\n"); 8218 return; 8219 } 8220 pdp = pmap_pml4e_to_pdpe(pml4, va); 8221 db_printf(" pdpe %#016lx", *pdp); 8222 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 8223 db_printf("\n"); 8224 return; 8225 } 8226 pde = pmap_pdpe_to_pde(pdp, va); 8227 db_printf(" pde %#016lx", *pde); 8228 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 8229 db_printf("\n"); 8230 return; 8231 } 8232 pte = pmap_pde_to_pte(pde, va); 8233 db_printf(" pte %#016lx\n", *pte); 8234 } 8235 8236 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 8237 { 8238 vm_paddr_t a; 8239 8240 if (have_addr) { 8241 a = (vm_paddr_t)addr; 8242 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 8243 } else { 8244 db_printf("show phys2dmap addr\n"); 8245 } 8246 } 8247 #endif 8248
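/*
 * Usage sketch (illustrative only; the helper below is hypothetical and is
 * compiled out): a typical consumer of pmap_map_io_transient() and
 * pmap_unmap_io_transient() maps a small batch of pages, copies data through
 * the returned kernel virtual addresses, and then tears down whatever
 * transient (non-DMAP) mappings were created.
 */
#if 0
static void
pmap_io_transient_copy_example(vm_page_t m[], int count, char *dst)
{
	vm_offset_t va[8];
	boolean_t mapped;
	int i;

	KASSERT(count <= 8, ("example batch too large"));

	/*
	 * can_fault == FALSE: the thread stays sched_pin()'ed for the
	 * duration, so any non-DMAP mappings created here are only
	 * guaranteed to be visible on this CPU.
	 */
	mapped = pmap_map_io_transient(m, va, count, FALSE);
	for (i = 0; i < count; i++)
		memcpy(dst + ptoa(i), (void *)va[i], PAGE_SIZE);
	if (mapped)
		pmap_unmap_io_transient(m, va, count, FALSE);
}
#endif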