/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	AMD64_NPT_AWARE

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidations expensive,
 *	this module may delay invalidation or reduced-protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{

	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
}

static __inline boolean_t
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_G;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}
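
/*
 * Summary of the bit encodings returned by the helpers above, as read
 * off their switch statements.  The third column applies when
 * pmap_emulate_ad_bits() is true for a PT_EPT pmap.  Note that PT_RVI
 * uses the x86 encodings except for the global bit, which is 0 for
 * both guest pmap types.
 *
 *		PT_X86/PT_RVI	PT_EPT		PT_EPT (A/D emulated)
 *	valid	X86_PG_V	EPT_PG_READ	EPT_PG_EMUL_V
 *	rw	X86_PG_RW	EPT_PG_WRITE	EPT_PG_EMUL_RW
 *	acc.	X86_PG_A	EPT_PG_A	EPT_PG_READ
 *	mod.	X86_PG_M	EPT_PG_M	EPT_PG_WRITE
 */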

extern struct pcpu __pcpu[];

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
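
/*
 * Illustrative sketch (hypothetical values, not compiled): pa_index()
 * works at 2MB (PDRSHIFT) granularity, so every 4KB page within one
 * 2MB frame hashes to the same PV list lock, and frames 2MB apart
 * share a lock only modulo NPV_LIST_LOCKS.
 */
#if 0
	/* pa = 0x40200000: pa_index(pa) = 0x40200000 >> 21 = 0x201 */
	struct rwlock *lock = PHYS_TO_PV_LIST_LOCK(0x40200000UL);
	/* the same lock serves 0x402ff000, in the same 2MB frame */
#endif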

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
static int		ndmpdpphys;	/* number of DMPDPphys pages */

/*
 * pmap_mapdev support pre initialization (i.e. console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv_list_locks[]
 * elements, but reads are not.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */

int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
	int i;
	uint64_t res;

	res = 0;
	CPU_FOREACH(i) {
		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
	}
	return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
};

#define	PMAP_ASSERT_NOT_IN_DI() \
    KASSERT(curthread->td_md.md_invl_gen.gen == 0, ("DI already started"))

/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 */
static void
pmap_delayed_invl_started(void)
{
	struct pmap_invl_gen *invl_gen;
	u_long currgen;

	invl_gen = &curthread->td_md.md_invl_gen;
	PMAP_ASSERT_NOT_IN_DI();
	mtx_lock(&invl_gen_mtx);
	if (LIST_EMPTY(&pmap_invl_gen_tracker))
		currgen = pmap_invl_gen;
	else
		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
	invl_gen->gen = currgen + 1;
	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
}

/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finished(void)
{
	struct pmap_invl_gen *invl_gen, *next;
	struct turnstile *ts;

	invl_gen = &curthread->td_md.md_invl_gen;
	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
	mtx_lock(&invl_gen_mtx);
	next = LIST_NEXT(invl_gen, link);
	if (next == NULL) {
		turnstile_chain_lock(&invl_gen_ts);
		ts = turnstile_lookup(&invl_gen_ts);
		pmap_invl_gen = invl_gen->gen;
		if (ts != NULL) {
			turnstile_broadcast(ts, TS_SHARED_QUEUE);
			turnstile_unpend(ts, TS_SHARED_LOCK);
		}
		turnstile_chain_unlock(&invl_gen_ts);
	} else {
		next->gen = invl_gen->gen;
	}
	LIST_REMOVE(invl_gen, link);
	mtx_unlock(&invl_gen_mtx);
	invl_gen->gen = 0;
}

#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
#endif

static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{

	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}
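
/*
 * Illustrative sketch (not compiled) of the shape of a DI block in a
 * hypothetical caller; the real users are the remove/protect paths
 * later in this file.  pmap_delayed_invl_page() must be called with
 * the page's PV list lock write-locked.
 */
#if 0
	pmap_delayed_invl_started();
	PMAP_LOCK(pmap);
	/* ... clear PTEs and free the PV entries for page m ... */
	pmap_delayed_invl_page(m);	/* record m's PV list in this DI */
	PMAP_UNLOCK(pmap);
	pmap_invalidate_page(pmap, va);	/* TLB shootdown */
	pmap_delayed_invl_finished();
#endif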

/*
 * Ensure that all currently executing DI blocks, that need to flush
 * TLB for the given page m, actually flushed the TLB at the time the
 * function returned.  If the page m has an empty PV list and we call
 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 * valid mapping for the page m in either its page table or TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait(vm_page_t m)
{
	struct thread *td;
	struct turnstile *ts;
	u_long *m_gen;
#ifdef PV_STATS
	bool accounted = false;
#endif

	td = curthread;
	m_gen = pmap_delayed_invl_genp(m);
	while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
		if (!accounted) {
			atomic_add_long(&invl_wait, 1);
			accounted = true;
		}
#endif
		ts = turnstile_trywait(&invl_gen_ts);
		if (*m_gen > pmap_invl_gen)
			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
		else
			turnstile_cancel(ts);
	}
}

/*
 * Mark the page m's PV list as participating in the current thread's
 * DI block.  Any threads concurrently using m's PV list to remove or
 * restrict all mappings to m will wait for the current thread's DI
 * block to complete before proceeding.
 *
 * The function works by setting the DI generation number for m's PV
 * list to at least the DI generation number of the current thread.
 * This forces a caller of pmap_delayed_invl_wait() to block until
 * current thread calls pmap_delayed_invl_finished().
 */
static void
pmap_delayed_invl_page(vm_page_t m)
{
	u_long gen, *m_gen;

	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
	gen = curthread->td_md.md_invl_gen.gen;
	if (gen == 0)
		return;
	m_gen = pmap_delayed_invl_genp(m);
	if (*m_gen < gen)
		*m_gen = gen;
}

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int	popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	reserve_pv_entries(pmap_t pmap, int needed,
		    struct rwlock **lockp);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
		    struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
    pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t sva, vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
    struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
    struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
    struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t PG_V;

	PG_V = pmap_valid_bit(pmap);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}
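
/*
 * Illustrative sketch (not compiled; standard 4-level 4KB paging
 * constants assumed): the walkers above peel off one 9-bit index per
 * level from the virtual address, plus a 12-bit page offset.
 */
#if 0
	pml4_idx = (va >> PML4SHIFT) & 0x1ff;	/* pmap_pml4e_index(va) */
	pdp_idx  = (va >> PDPSHIFT) & 0x1ff;	/* pmap_pdpe_index(va) */
	pd_idx   = (va >> PDRSHIFT) & 0x1ff;	/* pmap_pde_index(va) */
	pt_idx   = (va >> PAGE_SHIFT) & 0x1ff;	/* pmap_pte_index(va) */
	off      = va & PAGE_MASK;		/* low 12 bits */
#endif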

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

/* number of kernel PDP slots */
#define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)

static void
nkpt_init(vm_paddr_t addr)
{
	int pt_pages;

#ifdef NKPT
	pt_pages = NKPT;
#else
	pt_pages = howmany(addr, 1 << PDRSHIFT);
	pt_pages += NKPDPE(pt_pages);

	/*
	 * Add some slop beyond the bare minimum required for bootstrapping
	 * the kernel.
	 *
	 * This is quite important when allocating KVA for kernel modules.
	 * The modules are required to be linked in the negative 2GB of
	 * the address space.  If we run out of KVA in this region then
	 * pmap_growkernel() will need to allocate page table pages to map
	 * the entire 512GB of KVA space which is an unnecessary tax on
	 * physical memory.
	 *
	 * Secondly, device memory mapped as part of setting up the low-
	 * level console(s) is taken from KVA, starting at virtual_avail.
	 * This is because cninit() is called after pmap_bootstrap() but
	 * before vm_init() and pmap_init().  20MB for a frame buffer is
	 * not uncommon.
	 */
	pt_pages += 32;		/* 64MB additional slop. */
#endif
	nkpt = pt_pages;
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g, nkpdpe;
	pt_entry_t *pt_p;
	pd_entry_t *pd_p;
	pdp_entry_t *pdp_p;
	pml4_entry_t *p4_p;

	/* Allocate page table pages for the direct map */
	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
	if (ndmpdpphys > NDMPML4E) {
		/*
		 * Each NDMPML4E allows 512 GB, so limit to that,
		 * and then readjust ndmpdp and ndmpdpphys.
		 */
		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
		Maxmem = atop(NDMPML4E * NBPML4);
		ndmpdpphys = NDMPML4E;
		ndmpdp = NDMPML4E * NPDEPG;
	}
	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0)
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Allocate pages */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Allocate the initial number of kernel page table pages required to
	 * bootstrap.  We defer this until after all memory-size dependent
	 * allocations are done (e.g. direct map), so that we don't have to
	 * build in too much slop in our estimate.
	 *
	 * Note that when NKPML4E > 1, we have an empty page underneath
	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
	 */
	nkpt_init(*firstaddr);
	nkpdpe = NKPDPE(nkpt);

	KPTphys = allocpages(firstaddr, nkpt);
	KPDphys = allocpages(firstaddr, nkpdpe);

	/* Fill in the underlying page table pages */
	/* Nominally read-only (but really R/W) from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	pt_p = (pt_entry_t *)KPTphys;
	for (i = 0; ptoa(i) < *firstaddr; i++)
		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;

	/* Now map the page tables at their location within PTmap */
	pd_p = (pd_entry_t *)KPDphys;
	for (i = 0; i < nkpt; i++)
		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
		    X86_PG_G;

	/* And connect up the PD to the PDP (leaving room for L4 pages) */
	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
	for (i = 0; i < nkpdpe; i++)
		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
		    PG_U;

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	pd_p = (pd_entry_t *)DMPDphys;
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	pdp_p = (pdp_entry_t *)DMPDPphys;
	for (i = 0; i < ndm1g; i++) {
		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
		    X86_PG_M | X86_PG_A;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		pdp_p[i] = DMPDphys + ptoa(j);
		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	p4_p = (pml4_entry_t *)KPML4phys;
	p4_p[PML4PML4I] = KPML4phys;
	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < ndmpdpphys; i++) {
		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}

	/* Connect the KVA slots up to the PML4 */
	for (i = 0; i < NKPML4E; i++) {
		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
	}
}
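
/*
 * Sketch of the kernel PML4 as assembled above (slot numbers are the
 * symbolic constants; exact values depend on configuration):
 *
 *	p4_p[PML4PML4I]			-> KPML4phys (recursive; provides
 *					   the PTmap/PDmap windows)
 *	p4_p[DMPML4I .. DMPML4I + ndmpdpphys - 1]
 *					-> DMPDPphys pages (direct map)
 *	p4_p[KPML4BASE .. KPML4BASE + NKPML4E - 1]
 *					-> KPDPphys pages (kernel VA)
 */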

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On amd64 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	int i;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE);
	load_cr3(KPML4phys);
	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
		load_cr4(rcr4() | CR4_SMEP);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_cr3 = KPML4phys;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_flags = pmap_flags;

	/*
	 * Initialize the TLB invalidations generation number lock.
	 */
	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * Crashdump maps.  The first page is reused as CMAP1 for the
	 * memory test.
	 */
	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	CADDR1 = crashdumpmap;

	virtual_avail = va;

	/*
	 * Initialize the PAT MSR.
	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
	 * side-effect, invalidates stale PG_G TLB entries that might
	 * have been created in our pre-boot environment.
	 */
	pmap_init_pat();

	/* Initialize TLB Context Id. */
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		/* Check for INVPCID support */
		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
		    != 0;
		for (i = 0; i < MAXCPU; i++) {
			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
			kernel_pmap->pm_pcids[i].pm_gen = 1;
		}
		__pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1;
		__pcpu[0].pc_pcid_gen = 1;
		/*
		 * pcpu area for APs is zeroed during AP startup.
		 * pc_pcid_next and pc_pcid_gen are initialized by AP
		 * during pcpu setup.
		 */
		load_cr4(rcr4() | CR4_PCIDE);
	} else {
		pmap_pcid_enabled = 0;
	}
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0)
		panic("no PAT??");

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}
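
/*
 * Worked example (assuming the pat_works path above): a request for
 * PAT_WRITE_COMBINING resolves to PAT index 6, which pmap_cache_bits()
 * below encodes in a PTE as PAT=1, PCD=1, PWT=0 (binary 110).
 */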

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int error, i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);

	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);

	pmap_initialized = 1;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		/* Make the direct map consistent */
		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
			    ppim->sz, ppim->mode);
		}
		if (!bootverbose)
			continue;
		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}

	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
	    (vmem_addr_t *)&qframe);
	if (error != 0)
		panic("qframe allocation failed");
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

static pt_entry_t
pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
{
	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* Verify that both PAT bits are not set at the same time */
		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
		    ("Invalid PAT bits in entry %#lx", entry));

		/* Swap the PAT bits if one of them is set */
		if ((entry & x86_pat_bits) != 0)
			entry ^= x86_pat_bits;
		break;
	case PT_EPT:
		/*
		 * Nothing to do - the memory attributes are represented
		 * the same way for regular pages and superpages.
		 */
		break;
	default:
		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
	}

	return (entry);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

		/* Map the caching mode to a PAT index. */
		pat_idx = pat_index[mode];

		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
		cache_bits = 0;
		if (pat_idx & 0x4)
			cache_bits |= pat_flag;
		if (pat_idx & 0x2)
			cache_bits |= PG_NC_PCD;
		if (pat_idx & 0x1)
			cache_bits |= PG_NC_PWT;
		break;

	case PT_EPT:
		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
		break;

	default:
		panic("unsupported pmap type %d", pmap->pm_type);
	}

	return (cache_bits);
}

static int
pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
{
	int mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
		break;
	case PT_EPT:
		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
		break;
	default:
		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline boolean_t
pmap_ps_enabled(pmap_t pmap)
{

	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

static void
pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
{

	switch (pmap->pm_type) {
	case PT_X86:
		break;
	case PT_RVI:
	case PT_EPT:
		/*
		 * XXX
		 * This is a little bogus since the generation number is
		 * supposed to be bumped up when a region of the address
		 * space is invalidated in the page tables.
		 *
		 * In this case the old PDE entry is valid but yet we want
		 * to make sure that any mappings using the old entry are
		 * invalidated in the TLB.
		 *
		 * The reason this works as expected is because we rendezvous
		 * "all" host cpus and force any vcpu context to exit as a
		 * side-effect.
		 */
		atomic_add_acq_long(&pmap->pm_eptgen, 1);
		break;
	default:
		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
	}
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
	pt_entry_t PG_G;

	if (pmap_type_guest(pmap))
		return;

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

	PG_G = pmap_global_bit(pmap);

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		invltlb_glob();
	}
}
#ifdef SMP

/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */

/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
 */
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
	int ipinum;

	sched_pin();
	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("pmap_invalidate_ept: absurd pm_active"));

	/*
	 * The TLB mappings associated with a vcpu context are not
	 * flushed each time a different vcpu is chosen to execute.
	 *
	 * This is in contrast with a process's vtop mappings that
	 * are flushed from the TLB on each context switch.
	 *
	 * Therefore we need to do more than just a TLB shootdown on
	 * the active cpus in 'pmap->pm_active'.  To do this we keep
	 * track of the number of invalidations performed on this pmap.
	 *
	 * Each vcpu keeps a cache of this counter and compares it
	 * just before a vmresume.  If the counter is out-of-date an
	 * invept will be done to flush stale mappings from the TLB.
	 */
	atomic_add_acq_long(&pmap->pm_eptgen, 1);

	/*
	 * Force the vcpu to exit and trap back into the hypervisor.
	 */
	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
	ipi_selected(pmap->pm_active, ipinum);
	sched_unpin();
}

void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t *mask;
	u_int cpuid, i;

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap) {
		invlpg(va);
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		if (pmap == PCPU_GET(curpmap))
			invlpg(va);
		else if (pmap_pcid_enabled)
			pmap->pm_pcids[cpuid].pm_gen = 0;
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invlpg(*mask, va);
	sched_unpin();
}

/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t *mask;
	vm_offset_t addr;
	u_int cpuid, i;

	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
		pmap_invalidate_all(pmap);
		return;
	}

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	if (pmap == kernel_pmap) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		mask = &all_cpus;
	} else {
		if (pmap == PCPU_GET(curpmap)) {
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		} else if (pmap_pcid_enabled) {
			pmap->pm_pcids[cpuid].pm_gen = 0;
		}
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invlpg_range(*mask, sva, eva);
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t *mask;
	struct invpcid_descr d;
	u_int cpuid, i;

	if (pmap_type_guest(pmap)) {
		pmap_invalidate_ept(pmap);
		return;
	}

	KASSERT(pmap->pm_type == PT_X86,
	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));

	sched_pin();
	if (pmap == kernel_pmap) {
		if (pmap_pcid_enabled && invpcid_works) {
			bzero(&d, sizeof(d));
			invpcid(&d, INVPCID_CTXGLOB);
		} else {
			invltlb_glob();
		}
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		if (pmap == PCPU_GET(curpmap)) {
			if (pmap_pcid_enabled) {
				if (invpcid_works) {
					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
					d.pad = 0;
					d.addr = 0;
					invpcid(&d, INVPCID_CTX);
				} else {
					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
					    [PCPU_GET(cpuid)].pm_pcid);
				}
			} else {
				invltlb();
			}
		} else if (pmap_pcid_enabled) {
			pmap->pm_pcids[cpuid].pm_gen = 0;
		}
		if (pmap_pcid_enabled) {
			CPU_FOREACH(i) {
				if (cpuid != i)
					pmap->pm_pcids[i].pm_gen = 0;
			}
		}
		mask = &pmap->pm_active;
	}
	smp_masked_invltlb(*mask, pmap);
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}
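
/*
 * Note on the PCID handling above: clearing pm_gen to 0 for remote
 * CPUs does not flush anything immediately.  It marks the pmap's PCID
 * as stale so that, the next time the pmap is activated on that CPU,
 * the context-switch code observes the stale generation and performs
 * the deferred invalidation instead of trusting cached translations.
 */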

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	pmap_t pmap;
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_action(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap || pmap_type_guest(pmap))
		active = all_cpus;
	else {
		active = pmap->pm_active;
	}
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pmap = pmap;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap_update_pde_action,
		    pmap_update_pde_teardown, &act);
	} else {
		pmap_update_pde_store(pmap, pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(pmap, va, newpde);
	}
	sched_unpin();
}
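
/*
 * Illustrative summary of the rendezvous above: every CPU in
 * act->invalidate participates; the single CPU named by act->store
 * writes the new PDE in the action phase, and smp_rendezvous_cpus()
 * guarantees that the teardown phase, in which each participant
 * flushes its own TLB, begins only after that store has been
 * performed.  No participant can therefore retain a stale TLB entry
 * for the old page size once the rendezvous completes.
 */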
1748 */ 1749 void 1750 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1751 { 1752 1753 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 1754 pmap->pm_eptgen++; 1755 return; 1756 } 1757 KASSERT(pmap->pm_type == PT_X86, 1758 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 1759 1760 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 1761 invlpg(va); 1762 else if (pmap_pcid_enabled) 1763 pmap->pm_pcids[0].pm_gen = 0; 1764 } 1765 1766 void 1767 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1768 { 1769 vm_offset_t addr; 1770 1771 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 1772 pmap->pm_eptgen++; 1773 return; 1774 } 1775 KASSERT(pmap->pm_type == PT_X86, 1776 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 1777 1778 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 1779 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1780 invlpg(addr); 1781 } else if (pmap_pcid_enabled) { 1782 pmap->pm_pcids[0].pm_gen = 0; 1783 } 1784 } 1785 1786 void 1787 pmap_invalidate_all(pmap_t pmap) 1788 { 1789 struct invpcid_descr d; 1790 1791 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 1792 pmap->pm_eptgen++; 1793 return; 1794 } 1795 KASSERT(pmap->pm_type == PT_X86, 1796 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 1797 1798 if (pmap == kernel_pmap) { 1799 if (pmap_pcid_enabled && invpcid_works) { 1800 bzero(&d, sizeof(d)); 1801 invpcid(&d, INVPCID_CTXGLOB); 1802 } else { 1803 invltlb_glob(); 1804 } 1805 } else if (pmap == PCPU_GET(curpmap)) { 1806 if (pmap_pcid_enabled) { 1807 if (invpcid_works) { 1808 d.pcid = pmap->pm_pcids[0].pm_pcid; 1809 d.pad = 0; 1810 d.addr = 0; 1811 invpcid(&d, INVPCID_CTX); 1812 } else { 1813 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0]. 1814 pm_pcid); 1815 } 1816 } else { 1817 invltlb(); 1818 } 1819 } else if (pmap_pcid_enabled) { 1820 pmap->pm_pcids[0].pm_gen = 0; 1821 } 1822 } 1823 1824 PMAP_INLINE void 1825 pmap_invalidate_cache(void) 1826 { 1827 1828 wbinvd(); 1829 } 1830 1831 static void 1832 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1833 { 1834 1835 pmap_update_pde_store(pmap, pde, newpde); 1836 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 1837 pmap_update_pde_invalidate(pmap, va, newpde); 1838 else 1839 pmap->pm_pcids[0].pm_gen = 0; 1840 } 1841 #endif /* !SMP */ 1842 1843 static void 1844 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1845 { 1846 1847 /* 1848 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 1849 * by a promotion that did not invalidate the 512 4KB page mappings 1850 * that might exist in the TLB. Consequently, at this point, the TLB 1851 * may hold both 4KB and 2MB page mappings for the address range [va, 1852 * va + NBPDR). Therefore, the entire range must be invalidated here. 1853 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 1854 * 4KB page mappings for the address range [va, va + NBPDR), and so a 1855 * single INVLPG suffices to invalidate the 2MB page mapping from the 1856 * TLB. 
1857 */ 1858 if ((pde & PG_PROMOTED) != 0) 1859 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 1860 else 1861 pmap_invalidate_page(pmap, va); 1862 } 1863 1864 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1865 1866 void 1867 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1868 { 1869 1870 if (force) { 1871 sva &= ~(vm_offset_t)cpu_clflush_line_size; 1872 } else { 1873 KASSERT((sva & PAGE_MASK) == 0, 1874 ("pmap_invalidate_cache_range: sva not page-aligned")); 1875 KASSERT((eva & PAGE_MASK) == 0, 1876 ("pmap_invalidate_cache_range: eva not page-aligned")); 1877 } 1878 1879 if ((cpu_feature & CPUID_SS) != 0 && !force) 1880 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1881 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && 1882 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1883 /* 1884 * XXX: Some CPUs fault, hang, or trash the local APIC 1885 * registers if we use CLFLUSH on the local APIC 1886 * range. The local APIC is always uncached, so we 1887 * don't need to flush for that range anyway. 1888 */ 1889 if (pmap_kextract(sva) == lapic_paddr) 1890 return; 1891 1892 /* 1893 * Otherwise, do per-cache line flush. Use the sfence 1894 * instruction to insure that previous stores are 1895 * included in the write-back. The processor 1896 * propagates flush to other processors in the cache 1897 * coherence domain. 1898 */ 1899 sfence(); 1900 for (; sva < eva; sva += cpu_clflush_line_size) 1901 clflushopt(sva); 1902 sfence(); 1903 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 1904 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1905 if (pmap_kextract(sva) == lapic_paddr) 1906 return; 1907 /* 1908 * Writes are ordered by CLFLUSH on Intel CPUs. 1909 */ 1910 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1911 mfence(); 1912 for (; sva < eva; sva += cpu_clflush_line_size) 1913 clflush(sva); 1914 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1915 mfence(); 1916 } else { 1917 1918 /* 1919 * No targeted cache flush methods are supported by CPU, 1920 * or the supplied range is bigger than 2MB. 1921 * Globally invalidate cache. 1922 */ 1923 pmap_invalidate_cache(); 1924 } 1925 } 1926 1927 /* 1928 * Remove the specified set of pages from the data and instruction caches. 1929 * 1930 * In contrast to pmap_invalidate_cache_range(), this function does not 1931 * rely on the CPU's self-snoop feature, because it is intended for use 1932 * when moving pages into a different cache domain. 1933 */ 1934 void 1935 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1936 { 1937 vm_offset_t daddr, eva; 1938 int i; 1939 bool useclflushopt; 1940 1941 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 1942 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1943 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 1944 pmap_invalidate_cache(); 1945 else { 1946 if (useclflushopt) 1947 sfence(); 1948 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 1949 mfence(); 1950 for (i = 0; i < count; i++) { 1951 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1952 eva = daddr + PAGE_SIZE; 1953 for (; daddr < eva; daddr += cpu_clflush_line_size) { 1954 if (useclflushopt) 1955 clflushopt(daddr); 1956 else 1957 clflush(daddr); 1958 } 1959 } 1960 if (useclflushopt) 1961 sfence(); 1962 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 1963 mfence(); 1964 } 1965 } 1966 1967 /* 1968 * Routine: pmap_extract 1969 * Function: 1970 * Extract the physical page address associated 1971 * with the given map/virtual_address pair. 
1972 */ 1973 vm_paddr_t 1974 pmap_extract(pmap_t pmap, vm_offset_t va) 1975 { 1976 pdp_entry_t *pdpe; 1977 pd_entry_t *pde; 1978 pt_entry_t *pte, PG_V; 1979 vm_paddr_t pa; 1980 1981 pa = 0; 1982 PG_V = pmap_valid_bit(pmap); 1983 PMAP_LOCK(pmap); 1984 pdpe = pmap_pdpe(pmap, va); 1985 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1986 if ((*pdpe & PG_PS) != 0) 1987 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 1988 else { 1989 pde = pmap_pdpe_to_pde(pdpe, va); 1990 if ((*pde & PG_V) != 0) { 1991 if ((*pde & PG_PS) != 0) { 1992 pa = (*pde & PG_PS_FRAME) | 1993 (va & PDRMASK); 1994 } else { 1995 pte = pmap_pde_to_pte(pde, va); 1996 pa = (*pte & PG_FRAME) | 1997 (va & PAGE_MASK); 1998 } 1999 } 2000 } 2001 } 2002 PMAP_UNLOCK(pmap); 2003 return (pa); 2004 } 2005 2006 /* 2007 * Routine: pmap_extract_and_hold 2008 * Function: 2009 * Atomically extract and hold the physical page 2010 * with the given pmap and virtual address pair 2011 * if that mapping permits the given protection. 2012 */ 2013 vm_page_t 2014 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2015 { 2016 pd_entry_t pde, *pdep; 2017 pt_entry_t pte, PG_RW, PG_V; 2018 vm_paddr_t pa; 2019 vm_page_t m; 2020 2021 pa = 0; 2022 m = NULL; 2023 PG_RW = pmap_rw_bit(pmap); 2024 PG_V = pmap_valid_bit(pmap); 2025 PMAP_LOCK(pmap); 2026 retry: 2027 pdep = pmap_pde(pmap, va); 2028 if (pdep != NULL && (pde = *pdep)) { 2029 if (pde & PG_PS) { 2030 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 2031 if (vm_page_pa_tryrelock(pmap, (pde & 2032 PG_PS_FRAME) | (va & PDRMASK), &pa)) 2033 goto retry; 2034 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 2035 (va & PDRMASK)); 2036 vm_page_hold(m); 2037 } 2038 } else { 2039 pte = *pmap_pde_to_pte(pdep, va); 2040 if ((pte & PG_V) && 2041 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 2042 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 2043 &pa)) 2044 goto retry; 2045 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2046 vm_page_hold(m); 2047 } 2048 } 2049 } 2050 PA_UNLOCK_COND(pa); 2051 PMAP_UNLOCK(pmap); 2052 return (m); 2053 } 2054 2055 vm_paddr_t 2056 pmap_kextract(vm_offset_t va) 2057 { 2058 pd_entry_t pde; 2059 vm_paddr_t pa; 2060 2061 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 2062 pa = DMAP_TO_PHYS(va); 2063 } else { 2064 pde = *vtopde(va); 2065 if (pde & PG_PS) { 2066 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 2067 } else { 2068 /* 2069 * Beware of a concurrent promotion that changes the 2070 * PDE at this point! For example, vtopte() must not 2071 * be used to access the PTE because it would use the 2072 * new PDE. It is, however, safe to use the old PDE 2073 * because the page table page is preserved by the 2074 * promotion. 2075 */ 2076 pa = *pmap_pde_to_pte(&pde, va); 2077 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 2078 } 2079 } 2080 return (pa); 2081 } 2082 2083 /*************************************************** 2084 * Low level mapping routines..... 2085 ***************************************************/ 2086 2087 /* 2088 * Add a wired page to the kva. 2089 * Note: not SMP coherent. 
2090 */ 2091 PMAP_INLINE void 2092 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 2093 { 2094 pt_entry_t *pte; 2095 2096 pte = vtopte(va); 2097 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); 2098 } 2099 2100 static __inline void 2101 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 2102 { 2103 pt_entry_t *pte; 2104 int cache_bits; 2105 2106 pte = vtopte(va); 2107 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 2108 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); 2109 } 2110 2111 /* 2112 * Remove a page from the kernel pagetables. 2113 * Note: not SMP coherent. 2114 */ 2115 PMAP_INLINE void 2116 pmap_kremove(vm_offset_t va) 2117 { 2118 pt_entry_t *pte; 2119 2120 pte = vtopte(va); 2121 pte_clear(pte); 2122 } 2123 2124 /* 2125 * Used to map a range of physical addresses into kernel 2126 * virtual address space. 2127 * 2128 * The value passed in '*virt' is a suggested virtual address for 2129 * the mapping. Architectures which can support a direct-mapped 2130 * physical to virtual region can return the appropriate address 2131 * within that region, leaving '*virt' unchanged. Other 2132 * architectures should map the pages starting at '*virt' and 2133 * update '*virt' with the first usable address after the mapped 2134 * region. 2135 */ 2136 vm_offset_t 2137 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2138 { 2139 return PHYS_TO_DMAP(start); 2140 } 2141 2142 2143 /* 2144 * Add a list of wired pages to the kva. This 2145 * routine is only used for temporary 2146 * kernel mappings that do not need to have 2147 * page modification or references recorded. 2148 * Note that old mappings are simply written 2149 * over. The page *must* be wired. 2150 * Note: SMP coherent. Uses a ranged shootdown IPI. 2151 */ 2152 void 2153 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2154 { 2155 pt_entry_t *endpte, oldpte, pa, *pte; 2156 vm_page_t m; 2157 int cache_bits; 2158 2159 oldpte = 0; 2160 pte = vtopte(sva); 2161 endpte = pte + count; 2162 while (pte < endpte) { 2163 m = *ma++; 2164 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 2165 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 2166 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 2167 oldpte |= *pte; 2168 pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); 2169 } 2170 pte++; 2171 } 2172 if (__predict_false((oldpte & X86_PG_V) != 0)) 2173 pmap_invalidate_range(kernel_pmap, sva, sva + count * 2174 PAGE_SIZE); 2175 } 2176 2177 /* 2178 * This routine tears out page mappings from the 2179 * kernel -- it is meant only for temporary mappings. 2180 * Note: SMP coherent. Uses a ranged shootdown IPI. 2181 */ 2182 void 2183 pmap_qremove(vm_offset_t sva, int count) 2184 { 2185 vm_offset_t va; 2186 2187 va = sva; 2188 while (count-- > 0) { 2189 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2190 pmap_kremove(va); 2191 va += PAGE_SIZE; 2192 } 2193 pmap_invalidate_range(kernel_pmap, sva, va); 2194 } 2195 2196 /*************************************************** 2197 * Page table page management routines..... 2198 ***************************************************/ 2199 static __inline void 2200 pmap_free_zero_pages(struct spglist *free) 2201 { 2202 vm_page_t m; 2203 2204 while ((m = SLIST_FIRST(free)) != NULL) { 2205 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2206 /* Preserve the page's PG_ZERO setting. */ 2207 vm_page_free_toq(m); 2208 } 2209 } 2210 2211 /* 2212 * Schedule the specified unused page table page to be freed.
Specifically, 2213 * add the page to the specified list of pages that will be released to the 2214 * physical memory manager after the TLB has been updated. 2215 */ 2216 static __inline void 2217 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2218 boolean_t set_PG_ZERO) 2219 { 2220 2221 if (set_PG_ZERO) 2222 m->flags |= PG_ZERO; 2223 else 2224 m->flags &= ~PG_ZERO; 2225 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2226 } 2227 2228 /* 2229 * Inserts the specified page table page into the specified pmap's collection 2230 * of idle page table pages. Each of a pmap's page table pages is responsible 2231 * for mapping a distinct range of virtual addresses. The pmap's collection is 2232 * ordered by this virtual address range. 2233 */ 2234 static __inline int 2235 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2236 { 2237 2238 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2239 return (vm_radix_insert(&pmap->pm_root, mpte)); 2240 } 2241 2242 /* 2243 * Removes the page table page mapping the specified virtual address from the 2244 * specified pmap's collection of idle page table pages, and returns it. 2245 * Otherwise, returns NULL if there is no page table page corresponding to the 2246 * specified virtual address. 2247 */ 2248 static __inline vm_page_t 2249 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 2250 { 2251 2252 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2253 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 2254 } 2255 2256 /* 2257 * Decrements a page table page's wire count, which is used to record the 2258 * number of valid page table entries within the page. If the wire count 2259 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2260 * page table page was unmapped and FALSE otherwise. 2261 */ 2262 static inline boolean_t 2263 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2264 { 2265 2266 --m->wire_count; 2267 if (m->wire_count == 0) { 2268 _pmap_unwire_ptp(pmap, va, m, free); 2269 return (TRUE); 2270 } else 2271 return (FALSE); 2272 } 2273 2274 static void 2275 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2276 { 2277 2278 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2279 /* 2280 * unmap the page table page 2281 */ 2282 if (m->pindex >= (NUPDE + NUPDPE)) { 2283 /* PDP page */ 2284 pml4_entry_t *pml4; 2285 pml4 = pmap_pml4e(pmap, va); 2286 *pml4 = 0; 2287 } else if (m->pindex >= NUPDE) { 2288 /* PD page */ 2289 pdp_entry_t *pdp; 2290 pdp = pmap_pdpe(pmap, va); 2291 *pdp = 0; 2292 } else { 2293 /* PTE page */ 2294 pd_entry_t *pd; 2295 pd = pmap_pde(pmap, va); 2296 *pd = 0; 2297 } 2298 pmap_resident_count_dec(pmap, 1); 2299 if (m->pindex < NUPDE) { 2300 /* We just released a PT, unhold the matching PD */ 2301 vm_page_t pdpg; 2302 2303 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2304 pmap_unwire_ptp(pmap, va, pdpg, free); 2305 } 2306 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2307 /* We just released a PD, unhold the matching PDP */ 2308 vm_page_t pdppg; 2309 2310 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2311 pmap_unwire_ptp(pmap, va, pdppg, free); 2312 } 2313 2314 /* 2315 * This is a release store so that the ordinary store unmapping 2316 * the page table page is globally performed before TLB shoot- 2317 * down is begun. 
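 *
 * For illustration, the ordering obtained here is:
 *
 *	1. *pml4e/pdpe/pde = 0;			ordinary store, unmaps page
 *	2. atomic_subtract_rel_int(...);	release barrier
 *	3. TLB shootdown			issued after step 2
 *
 * so any CPU that observes the decremented wire count also observes
 * the cleared page table entry from step 1.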
2318 */ 2319 atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); 2320 2321 /* 2322 * Put page on a list so that it is released after 2323 * *ALL* TLB shootdown is done 2324 */ 2325 pmap_add_delayed_free_list(m, free, TRUE); 2326 } 2327 2328 /* 2329 * After removing a page table entry, this routine is used to 2330 * conditionally free the page, and manage the hold/wire counts. 2331 */ 2332 static int 2333 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2334 struct spglist *free) 2335 { 2336 vm_page_t mpte; 2337 2338 if (va >= VM_MAXUSER_ADDRESS) 2339 return (0); 2340 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2341 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2342 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2343 } 2344 2345 void 2346 pmap_pinit0(pmap_t pmap) 2347 { 2348 int i; 2349 2350 PMAP_LOCK_INIT(pmap); 2351 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2352 pmap->pm_cr3 = KPML4phys; 2353 pmap->pm_root.rt_root = 0; 2354 CPU_ZERO(&pmap->pm_active); 2355 TAILQ_INIT(&pmap->pm_pvchunk); 2356 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2357 pmap->pm_flags = pmap_flags; 2358 CPU_FOREACH(i) { 2359 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2360 pmap->pm_pcids[i].pm_gen = 0; 2361 } 2362 PCPU_SET(curpmap, kernel_pmap); 2363 pmap_activate(curthread); 2364 CPU_FILL(&kernel_pmap->pm_active); 2365 } 2366 2367 void 2368 pmap_pinit_pml4(vm_page_t pml4pg) 2369 { 2370 pml4_entry_t *pm_pml4; 2371 int i; 2372 2373 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 2374 2375 /* Wire in kernel global address entries. */ 2376 for (i = 0; i < NKPML4E; i++) { 2377 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 2378 X86_PG_V | PG_U; 2379 } 2380 for (i = 0; i < ndmpdpphys; i++) { 2381 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 2382 X86_PG_V | PG_U; 2383 } 2384 2385 /* install self-referential address mapping entry(s) */ 2386 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 2387 X86_PG_A | X86_PG_M; 2388 } 2389 2390 /* 2391 * Initialize a preallocated and zeroed pmap structure, 2392 * such as one in a vmspace structure. 2393 */ 2394 int 2395 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2396 { 2397 vm_page_t pml4pg; 2398 vm_paddr_t pml4phys; 2399 int i; 2400 2401 /* 2402 * allocate the page directory page 2403 */ 2404 while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2405 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 2406 VM_WAIT; 2407 2408 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2409 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2410 CPU_FOREACH(i) { 2411 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; 2412 pmap->pm_pcids[i].pm_gen = 0; 2413 } 2414 pmap->pm_cr3 = ~0; /* initialize to an invalid value */ 2415 2416 if ((pml4pg->flags & PG_ZERO) == 0) 2417 pagezero(pmap->pm_pml4); 2418 2419 /* 2420 * Do not install the host kernel mappings in the nested page 2421 * tables. These mappings are meaningless in the guest physical 2422 * address space. 
2423 */ 2424 if ((pmap->pm_type = pm_type) == PT_X86) { 2425 pmap->pm_cr3 = pml4phys; 2426 pmap_pinit_pml4(pml4pg); 2427 } 2428 2429 pmap->pm_root.rt_root = 0; 2430 CPU_ZERO(&pmap->pm_active); 2431 TAILQ_INIT(&pmap->pm_pvchunk); 2432 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2433 pmap->pm_flags = flags; 2434 pmap->pm_eptgen = 0; 2435 2436 return (1); 2437 } 2438 2439 int 2440 pmap_pinit(pmap_t pmap) 2441 { 2442 2443 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2444 } 2445 2446 /* 2447 * This routine is called if the desired page table page does not exist. 2448 * 2449 * If page table page allocation fails, this routine may sleep before 2450 * returning NULL. It sleeps only if a lock pointer was given. 2451 * 2452 * Note: If a page allocation fails at page table level two or three, 2453 * one or two pages may be held during the wait, only to be released 2454 * afterwards. This conservative approach is easily argued to avoid 2455 * race conditions. 2456 */ 2457 static vm_page_t 2458 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2459 { 2460 vm_page_t m, pdppg, pdpg; 2461 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 2462 2463 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2464 2465 PG_A = pmap_accessed_bit(pmap); 2466 PG_M = pmap_modified_bit(pmap); 2467 PG_V = pmap_valid_bit(pmap); 2468 PG_RW = pmap_rw_bit(pmap); 2469 2470 /* 2471 * Allocate a page table page. 2472 */ 2473 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2474 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2475 if (lockp != NULL) { 2476 RELEASE_PV_LIST_LOCK(lockp); 2477 PMAP_UNLOCK(pmap); 2478 PMAP_ASSERT_NOT_IN_DI(); 2479 VM_WAIT; 2480 PMAP_LOCK(pmap); 2481 } 2482 2483 /* 2484 * Indicate the need to retry. While waiting, the page table 2485 * page may have been allocated. 2486 */ 2487 return (NULL); 2488 } 2489 if ((m->flags & PG_ZERO) == 0) 2490 pmap_zero_page(m); 2491 2492 /* 2493 * Map the pagetable page into the process address space, if 2494 * it isn't already there. 
2495 */ 2496 2497 if (ptepindex >= (NUPDE + NUPDPE)) { 2498 pml4_entry_t *pml4; 2499 vm_pindex_t pml4index; 2500 2501 /* Wire up a new PDPE page */ 2502 pml4index = ptepindex - (NUPDE + NUPDPE); 2503 pml4 = &pmap->pm_pml4[pml4index]; 2504 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2505 2506 } else if (ptepindex >= NUPDE) { 2507 vm_pindex_t pml4index; 2508 vm_pindex_t pdpindex; 2509 pml4_entry_t *pml4; 2510 pdp_entry_t *pdp; 2511 2512 /* Wire up a new PDE page */ 2513 pdpindex = ptepindex - NUPDE; 2514 pml4index = pdpindex >> NPML4EPGSHIFT; 2515 2516 pml4 = &pmap->pm_pml4[pml4index]; 2517 if ((*pml4 & PG_V) == 0) { 2518 /* Have to allocate a new pdp, recurse */ 2519 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 2520 lockp) == NULL) { 2521 --m->wire_count; 2522 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2523 vm_page_free_zero(m); 2524 return (NULL); 2525 } 2526 } else { 2527 /* Add reference to pdp page */ 2528 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 2529 pdppg->wire_count++; 2530 } 2531 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2532 2533 /* Now find the pdp page */ 2534 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2535 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2536 2537 } else { 2538 vm_pindex_t pml4index; 2539 vm_pindex_t pdpindex; 2540 pml4_entry_t *pml4; 2541 pdp_entry_t *pdp; 2542 pd_entry_t *pd; 2543 2544 /* Wire up a new PTE page */ 2545 pdpindex = ptepindex >> NPDPEPGSHIFT; 2546 pml4index = pdpindex >> NPML4EPGSHIFT; 2547 2548 /* First, find the pdp and check that its valid. */ 2549 pml4 = &pmap->pm_pml4[pml4index]; 2550 if ((*pml4 & PG_V) == 0) { 2551 /* Have to allocate a new pd, recurse */ 2552 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2553 lockp) == NULL) { 2554 --m->wire_count; 2555 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2556 vm_page_free_zero(m); 2557 return (NULL); 2558 } 2559 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2560 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2561 } else { 2562 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2563 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2564 if ((*pdp & PG_V) == 0) { 2565 /* Have to allocate a new pd, recurse */ 2566 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2567 lockp) == NULL) { 2568 --m->wire_count; 2569 atomic_subtract_int(&vm_cnt.v_wire_count, 2570 1); 2571 vm_page_free_zero(m); 2572 return (NULL); 2573 } 2574 } else { 2575 /* Add reference to the pd page */ 2576 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2577 pdpg->wire_count++; 2578 } 2579 } 2580 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 2581 2582 /* Now we know where the page directory page is */ 2583 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 2584 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2585 } 2586 2587 pmap_resident_count_inc(pmap, 1); 2588 2589 return (m); 2590 } 2591 2592 static vm_page_t 2593 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2594 { 2595 vm_pindex_t pdpindex, ptepindex; 2596 pdp_entry_t *pdpe, PG_V; 2597 vm_page_t pdpg; 2598 2599 PG_V = pmap_valid_bit(pmap); 2600 2601 retry: 2602 pdpe = pmap_pdpe(pmap, va); 2603 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2604 /* Add a reference to the pd page. */ 2605 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 2606 pdpg->wire_count++; 2607 } else { 2608 /* Allocate a pd page. 
*/ 2609 ptepindex = pmap_pde_pindex(va); 2610 pdpindex = ptepindex >> NPDPEPGSHIFT; 2611 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 2612 if (pdpg == NULL && lockp != NULL) 2613 goto retry; 2614 } 2615 return (pdpg); 2616 } 2617 2618 static vm_page_t 2619 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2620 { 2621 vm_pindex_t ptepindex; 2622 pd_entry_t *pd, PG_V; 2623 vm_page_t m; 2624 2625 PG_V = pmap_valid_bit(pmap); 2626 2627 /* 2628 * Calculate pagetable page index 2629 */ 2630 ptepindex = pmap_pde_pindex(va); 2631 retry: 2632 /* 2633 * Get the page directory entry 2634 */ 2635 pd = pmap_pde(pmap, va); 2636 2637 /* 2638 * This supports switching from a 2MB page to a 2639 * normal 4K page. 2640 */ 2641 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 2642 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 2643 /* 2644 * Invalidation of the 2MB page mapping may have caused 2645 * the deallocation of the underlying PD page. 2646 */ 2647 pd = NULL; 2648 } 2649 } 2650 2651 /* 2652 * If the page table page is mapped, we just increment the 2653 * hold count, and activate it. 2654 */ 2655 if (pd != NULL && (*pd & PG_V) != 0) { 2656 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2657 m->wire_count++; 2658 } else { 2659 /* 2660 * Here if the pte page isn't mapped, or if it has been 2661 * deallocated. 2662 */ 2663 m = _pmap_allocpte(pmap, ptepindex, lockp); 2664 if (m == NULL && lockp != NULL) 2665 goto retry; 2666 } 2667 return (m); 2668 } 2669 2670 2671 /*************************************************** 2672 * Pmap allocation/deallocation routines. 2673 ***************************************************/ 2674 2675 /* 2676 * Release any resources held by the given physical map. 2677 * Called when a pmap initialized by pmap_pinit is being released. 2678 * Should only be called if the map contains no valid mappings. 
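 *
 * At this point only the PML4 page itself should remain; the
 * assertions below check that no resident pages, reserved page table
 * pages, or active CPUs still reference the pmap before that page is
 * freed.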
2679 */ 2680 void 2681 pmap_release(pmap_t pmap) 2682 { 2683 vm_page_t m; 2684 int i; 2685 2686 KASSERT(pmap->pm_stats.resident_count == 0, 2687 ("pmap_release: pmap resident count %ld != 0", 2688 pmap->pm_stats.resident_count)); 2689 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2690 ("pmap_release: pmap has reserved page table page(s)")); 2691 KASSERT(CPU_EMPTY(&pmap->pm_active), 2692 ("releasing active pmap %p", pmap)); 2693 2694 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 2695 2696 for (i = 0; i < NKPML4E; i++) /* KVA */ 2697 pmap->pm_pml4[KPML4BASE + i] = 0; 2698 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 2699 pmap->pm_pml4[DMPML4I + i] = 0; 2700 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 2701 2702 m->wire_count--; 2703 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2704 vm_page_free_zero(m); 2705 } 2706 2707 static int 2708 kvm_size(SYSCTL_HANDLER_ARGS) 2709 { 2710 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2711 2712 return sysctl_handle_long(oidp, &ksize, 0, req); 2713 } 2714 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2715 0, 0, kvm_size, "LU", "Size of KVM"); 2716 2717 static int 2718 kvm_free(SYSCTL_HANDLER_ARGS) 2719 { 2720 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2721 2722 return sysctl_handle_long(oidp, &kfree, 0, req); 2723 } 2724 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2725 0, 0, kvm_free, "LU", "Amount of KVM free"); 2726 2727 /* 2728 * grow the number of kernel page table entries, if needed 2729 */ 2730 void 2731 pmap_growkernel(vm_offset_t addr) 2732 { 2733 vm_paddr_t paddr; 2734 vm_page_t nkpg; 2735 pd_entry_t *pde, newpdir; 2736 pdp_entry_t *pdpe; 2737 2738 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2739 2740 /* 2741 * Return if "addr" is within the range of kernel page table pages 2742 * that were preallocated during pmap bootstrap. Moreover, leave 2743 * "kernel_vm_end" and the kernel page table as they were. 2744 * 2745 * The correctness of this action is based on the following 2746 * argument: vm_map_insert() allocates contiguous ranges of the 2747 * kernel virtual address space. It calls this function if a range 2748 * ends after "kernel_vm_end". If the kernel is mapped between 2749 * "kernel_vm_end" and "addr", then the range cannot begin at 2750 * "kernel_vm_end". In fact, its beginning address cannot be less 2751 * than the kernel. Thus, there is no immediate need to allocate 2752 * any new kernel page table pages between "kernel_vm_end" and 2753 * "KERNBASE". 
2754 */ 2755 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 2756 return; 2757 2758 addr = roundup2(addr, NBPDR); 2759 if (addr - 1 >= kernel_map->max_offset) 2760 addr = kernel_map->max_offset; 2761 while (kernel_vm_end < addr) { 2762 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 2763 if ((*pdpe & X86_PG_V) == 0) { 2764 /* We need a new PDP entry */ 2765 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 2766 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2767 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2768 if (nkpg == NULL) 2769 panic("pmap_growkernel: no memory to grow kernel"); 2770 if ((nkpg->flags & PG_ZERO) == 0) 2771 pmap_zero_page(nkpg); 2772 paddr = VM_PAGE_TO_PHYS(nkpg); 2773 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 2774 X86_PG_A | X86_PG_M); 2775 continue; /* try again */ 2776 } 2777 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 2778 if ((*pde & X86_PG_V) != 0) { 2779 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2780 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2781 kernel_vm_end = kernel_map->max_offset; 2782 break; 2783 } 2784 continue; 2785 } 2786 2787 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 2788 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2789 VM_ALLOC_ZERO); 2790 if (nkpg == NULL) 2791 panic("pmap_growkernel: no memory to grow kernel"); 2792 if ((nkpg->flags & PG_ZERO) == 0) 2793 pmap_zero_page(nkpg); 2794 paddr = VM_PAGE_TO_PHYS(nkpg); 2795 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 2796 pde_store(pde, newpdir); 2797 2798 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2799 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2800 kernel_vm_end = kernel_map->max_offset; 2801 break; 2802 } 2803 } 2804 } 2805 2806 2807 /*************************************************** 2808 * page management routines. 
2809 ***************************************************/ 2810 2811 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2812 CTASSERT(_NPCM == 3); 2813 CTASSERT(_NPCPV == 168); 2814 2815 static __inline struct pv_chunk * 2816 pv_to_chunk(pv_entry_t pv) 2817 { 2818 2819 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2820 } 2821 2822 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2823 2824 #define PC_FREE0 0xfffffffffffffffful 2825 #define PC_FREE1 0xfffffffffffffffful 2826 #define PC_FREE2 0x000000fffffffffful 2827 2828 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2829 2830 #ifdef PV_STATS 2831 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2832 2833 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2834 "Current number of pv entry chunks"); 2835 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2836 "Current number of pv entry chunks allocated"); 2837 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2838 "Current number of pv entry chunks frees"); 2839 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2840 "Number of times tried to get a chunk page but failed."); 2841 2842 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2843 static int pv_entry_spare; 2844 2845 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2846 "Current number of pv entry frees"); 2847 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2848 "Current number of pv entry allocs"); 2849 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2850 "Current number of pv entries"); 2851 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2852 "Current number of spare pv entries"); 2853 #endif 2854 2855 /* 2856 * We are in a serious low memory condition. Resort to 2857 * drastic measures to free some pages so we can allocate 2858 * another pv entry chunk. 2859 * 2860 * Returns NULL if PV entries were reclaimed from the specified pmap. 2861 * 2862 * We do not, however, unmap 2mpages because subsequent accesses will 2863 * allocate per-page pv entries until repromotion occurs, thereby 2864 * exacerbating the shortage of free pv entries. 2865 */ 2866 static vm_page_t 2867 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2868 { 2869 struct pch new_tail; 2870 struct pv_chunk *pc; 2871 struct md_page *pvh; 2872 pd_entry_t *pde; 2873 pmap_t pmap; 2874 pt_entry_t *pte, tpte; 2875 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 2876 pv_entry_t pv; 2877 vm_offset_t va; 2878 vm_page_t m, m_pc; 2879 struct spglist free; 2880 uint64_t inuse; 2881 int bit, field, freed; 2882 2883 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2884 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2885 pmap = NULL; 2886 m_pc = NULL; 2887 PG_G = PG_A = PG_M = PG_RW = 0; 2888 SLIST_INIT(&free); 2889 TAILQ_INIT(&new_tail); 2890 pmap_delayed_invl_started(); 2891 mtx_lock(&pv_chunks_mutex); 2892 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { 2893 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2894 mtx_unlock(&pv_chunks_mutex); 2895 if (pmap != pc->pc_pmap) { 2896 if (pmap != NULL) { 2897 pmap_invalidate_all(pmap); 2898 if (pmap != locked_pmap) 2899 PMAP_UNLOCK(pmap); 2900 } 2901 pmap_delayed_invl_finished(); 2902 pmap_delayed_invl_started(); 2903 pmap = pc->pc_pmap; 2904 /* Avoid deadlock and lock recursion. 
*/ 2905 if (pmap > locked_pmap) { 2906 RELEASE_PV_LIST_LOCK(lockp); 2907 PMAP_LOCK(pmap); 2908 } else if (pmap != locked_pmap && 2909 !PMAP_TRYLOCK(pmap)) { 2910 pmap = NULL; 2911 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2912 mtx_lock(&pv_chunks_mutex); 2913 continue; 2914 } 2915 PG_G = pmap_global_bit(pmap); 2916 PG_A = pmap_accessed_bit(pmap); 2917 PG_M = pmap_modified_bit(pmap); 2918 PG_RW = pmap_rw_bit(pmap); 2919 } 2920 2921 /* 2922 * Destroy every non-wired, 4 KB page mapping in the chunk. 2923 */ 2924 freed = 0; 2925 for (field = 0; field < _NPCM; field++) { 2926 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2927 inuse != 0; inuse &= ~(1UL << bit)) { 2928 bit = bsfq(inuse); 2929 pv = &pc->pc_pventry[field * 64 + bit]; 2930 va = pv->pv_va; 2931 pde = pmap_pde(pmap, va); 2932 if ((*pde & PG_PS) != 0) 2933 continue; 2934 pte = pmap_pde_to_pte(pde, va); 2935 if ((*pte & PG_W) != 0) 2936 continue; 2937 tpte = pte_load_clear(pte); 2938 if ((tpte & PG_G) != 0) 2939 pmap_invalidate_page(pmap, va); 2940 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2941 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2942 vm_page_dirty(m); 2943 if ((tpte & PG_A) != 0) 2944 vm_page_aflag_set(m, PGA_REFERENCED); 2945 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2946 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2947 m->md.pv_gen++; 2948 if (TAILQ_EMPTY(&m->md.pv_list) && 2949 (m->flags & PG_FICTITIOUS) == 0) { 2950 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2951 if (TAILQ_EMPTY(&pvh->pv_list)) { 2952 vm_page_aflag_clear(m, 2953 PGA_WRITEABLE); 2954 } 2955 } 2956 pmap_delayed_invl_page(m); 2957 pc->pc_map[field] |= 1UL << bit; 2958 pmap_unuse_pt(pmap, va, *pde, &free); 2959 freed++; 2960 } 2961 } 2962 if (freed == 0) { 2963 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2964 mtx_lock(&pv_chunks_mutex); 2965 continue; 2966 } 2967 /* Every freed mapping is for a 4 KB page. */ 2968 pmap_resident_count_dec(pmap, freed); 2969 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2970 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2971 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2972 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2973 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2974 pc->pc_map[2] == PC_FREE2) { 2975 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2976 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2977 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2978 /* Entire chunk is free; return it. */ 2979 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2980 dump_drop_page(m_pc->phys_addr); 2981 mtx_lock(&pv_chunks_mutex); 2982 break; 2983 } 2984 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2985 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2986 mtx_lock(&pv_chunks_mutex); 2987 /* One freed pv entry in locked_pmap is sufficient. */ 2988 if (pmap == locked_pmap) 2989 break; 2990 } 2991 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2992 mtx_unlock(&pv_chunks_mutex); 2993 if (pmap != NULL) { 2994 pmap_invalidate_all(pmap); 2995 if (pmap != locked_pmap) 2996 PMAP_UNLOCK(pmap); 2997 } 2998 pmap_delayed_invl_finished(); 2999 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3000 m_pc = SLIST_FIRST(&free); 3001 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3002 /* Recycle a freed page table page. 
*/ 3003 m_pc->wire_count = 1; 3004 atomic_add_int(&vm_cnt.v_wire_count, 1); 3005 } 3006 pmap_free_zero_pages(&free); 3007 return (m_pc); 3008 } 3009 3010 /* 3011 * free the pv_entry back to the free list 3012 */ 3013 static void 3014 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3015 { 3016 struct pv_chunk *pc; 3017 int idx, field, bit; 3018 3019 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3020 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3021 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3022 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3023 pc = pv_to_chunk(pv); 3024 idx = pv - &pc->pc_pventry[0]; 3025 field = idx / 64; 3026 bit = idx % 64; 3027 pc->pc_map[field] |= 1ul << bit; 3028 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 3029 pc->pc_map[2] != PC_FREE2) { 3030 /* 98% of the time, pc is already at the head of the list. */ 3031 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3032 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3033 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3034 } 3035 return; 3036 } 3037 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3038 free_pv_chunk(pc); 3039 } 3040 3041 static void 3042 free_pv_chunk(struct pv_chunk *pc) 3043 { 3044 vm_page_t m; 3045 3046 mtx_lock(&pv_chunks_mutex); 3047 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 3048 mtx_unlock(&pv_chunks_mutex); 3049 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3050 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3051 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3052 /* entire chunk is free, return it */ 3053 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3054 dump_drop_page(m->phys_addr); 3055 vm_page_unwire(m, PQ_NONE); 3056 vm_page_free(m); 3057 } 3058 3059 /* 3060 * Returns a new PV entry, allocating a new PV chunk from the system when 3061 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3062 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3063 * returned. 3064 * 3065 * The given PV list lock may be released. 
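 *
 * Callers that pass a lock pointer must therefore revalidate any
 * state guarded by the PV list lock afterwards; reclaim_pv_chunk()
 * may have dropped and reacquired it, and may have locked and
 * unlocked other pmaps along the way.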
3066 */ 3067 static pv_entry_t 3068 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3069 { 3070 int bit, field; 3071 pv_entry_t pv; 3072 struct pv_chunk *pc; 3073 vm_page_t m; 3074 3075 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3076 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3077 retry: 3078 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3079 if (pc != NULL) { 3080 for (field = 0; field < _NPCM; field++) { 3081 if (pc->pc_map[field]) { 3082 bit = bsfq(pc->pc_map[field]); 3083 break; 3084 } 3085 } 3086 if (field < _NPCM) { 3087 pv = &pc->pc_pventry[field * 64 + bit]; 3088 pc->pc_map[field] &= ~(1ul << bit); 3089 /* If this was the last item, move it to tail */ 3090 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 3091 pc->pc_map[2] == 0) { 3092 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3093 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3094 pc_list); 3095 } 3096 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3097 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3098 return (pv); 3099 } 3100 } 3101 /* No free items, allocate another chunk */ 3102 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3103 VM_ALLOC_WIRED); 3104 if (m == NULL) { 3105 if (lockp == NULL) { 3106 PV_STAT(pc_chunk_tryfail++); 3107 return (NULL); 3108 } 3109 m = reclaim_pv_chunk(pmap, lockp); 3110 if (m == NULL) 3111 goto retry; 3112 } 3113 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3114 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3115 dump_add_page(m->phys_addr); 3116 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3117 pc->pc_pmap = pmap; 3118 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 3119 pc->pc_map[1] = PC_FREE1; 3120 pc->pc_map[2] = PC_FREE2; 3121 mtx_lock(&pv_chunks_mutex); 3122 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3123 mtx_unlock(&pv_chunks_mutex); 3124 pv = &pc->pc_pventry[0]; 3125 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3126 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3127 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3128 return (pv); 3129 } 3130 3131 /* 3132 * Returns the number of one bits within the given PV chunk map. 3133 * 3134 * The errata for Intel processors state that "POPCNT Instruction May 3135 * Take Longer to Execute Than Expected". It is believed that the 3136 * issue is the spurious dependency on the destination register. 3137 * Provide a hint to the register rename logic that the destination 3138 * value is overwritten, by clearing it, as suggested in the 3139 * optimization manual. It should be cheap for unaffected processors 3140 * as well. 3141 * 3142 * Reference numbers for the errata are 3143 * 4th Gen Core: HSD146 3144 * 5th Gen Core: BDM85 3145 * 6th Gen Core: SKL029 3146 */ 3147 static int 3148 popcnt_pc_map_pq(uint64_t *map) 3149 { 3150 u_long result, tmp; 3151 3152 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 3153 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 3154 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 3155 : "=&r" (result), "=&r" (tmp) 3156 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 3157 return (result); 3158 } 3159 3160 /* 3161 * Ensure that the number of spare PV entries in the specified pmap meets or 3162 * exceeds the given count, "needed". 3163 * 3164 * The given PV list lock may be released.
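 *
 * A worked example of the accounting: each chunk holds _NPCPV == 168
 * entries tracked by three 64-bit maps, since 168 == 64 + 64 + 40;
 * PC_FREE2 therefore has only its low 40 bits set, and the loop
 * below popcounts all three words of each chunk to total the spares.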
3165 */ 3166 static void 3167 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3168 { 3169 struct pch new_tail; 3170 struct pv_chunk *pc; 3171 int avail, free; 3172 vm_page_t m; 3173 3174 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3175 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3176 3177 /* 3178 * Newly allocated PV chunks must be stored in a private list until 3179 * the required number of PV chunks have been allocated. Otherwise, 3180 * reclaim_pv_chunk() could recycle one of these chunks. In 3181 * contrast, these chunks must be added to the pmap upon allocation. 3182 */ 3183 TAILQ_INIT(&new_tail); 3184 retry: 3185 avail = 0; 3186 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3187 #ifndef __POPCNT__ 3188 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 3189 bit_count((bitstr_t *)pc->pc_map, 0, 3190 sizeof(pc->pc_map) * NBBY, &free); 3191 else 3192 #endif 3193 free = popcnt_pc_map_pq(pc->pc_map); 3194 if (free == 0) 3195 break; 3196 avail += free; 3197 if (avail >= needed) 3198 break; 3199 } 3200 for (; avail < needed; avail += _NPCPV) { 3201 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3202 VM_ALLOC_WIRED); 3203 if (m == NULL) { 3204 m = reclaim_pv_chunk(pmap, lockp); 3205 if (m == NULL) 3206 goto retry; 3207 } 3208 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3209 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3210 dump_add_page(m->phys_addr); 3211 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3212 pc->pc_pmap = pmap; 3213 pc->pc_map[0] = PC_FREE0; 3214 pc->pc_map[1] = PC_FREE1; 3215 pc->pc_map[2] = PC_FREE2; 3216 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3217 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3218 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3219 } 3220 if (!TAILQ_EMPTY(&new_tail)) { 3221 mtx_lock(&pv_chunks_mutex); 3222 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3223 mtx_unlock(&pv_chunks_mutex); 3224 } 3225 } 3226 3227 /* 3228 * First find and then remove the pv entry for the specified pmap and virtual 3229 * address from the specified pv list. Returns the pv entry if found and NULL 3230 * otherwise. This operation can be performed on pv lists for either 4KB or 3231 * 2MB page mappings. 3232 */ 3233 static __inline pv_entry_t 3234 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3235 { 3236 pv_entry_t pv; 3237 3238 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3239 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3240 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3241 pvh->pv_gen++; 3242 break; 3243 } 3244 } 3245 return (pv); 3246 } 3247 3248 /* 3249 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3250 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3251 * entries for each of the 4KB page mappings. 3252 */ 3253 static void 3254 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3255 struct rwlock **lockp) 3256 { 3257 struct md_page *pvh; 3258 struct pv_chunk *pc; 3259 pv_entry_t pv; 3260 vm_offset_t va_last; 3261 vm_page_t m; 3262 int bit, field; 3263 3264 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3265 KASSERT((pa & PDRMASK) == 0, 3266 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3267 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3268 3269 /* 3270 * Transfer the 2mpage's pv entry for this mapping to the first 3271 * page's pv list. Once this transfer begins, the pv list lock 3272 * must not be released until the last pv entry is reinstantiated. 
3273 */ 3274 pvh = pa_to_pvh(pa); 3275 va = trunc_2mpage(va); 3276 pv = pmap_pvh_remove(pvh, pmap, va); 3277 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3278 m = PHYS_TO_VM_PAGE(pa); 3279 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3280 m->md.pv_gen++; 3281 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 3282 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3283 va_last = va + NBPDR - PAGE_SIZE; 3284 for (;;) { 3285 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3286 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3287 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3288 for (field = 0; field < _NPCM; field++) { 3289 while (pc->pc_map[field]) { 3290 bit = bsfq(pc->pc_map[field]); 3291 pc->pc_map[field] &= ~(1ul << bit); 3292 pv = &pc->pc_pventry[field * 64 + bit]; 3293 va += PAGE_SIZE; 3294 pv->pv_va = va; 3295 m++; 3296 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3297 ("pmap_pv_demote_pde: page %p is not managed", m)); 3298 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3299 m->md.pv_gen++; 3300 if (va == va_last) 3301 goto out; 3302 } 3303 } 3304 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3305 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3306 } 3307 out: 3308 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3309 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3310 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3311 } 3312 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3313 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3314 } 3315 3316 /* 3317 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3318 * replace the many pv entries for the 4KB page mappings by a single pv entry 3319 * for the 2MB page mapping. 3320 */ 3321 static void 3322 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3323 struct rwlock **lockp) 3324 { 3325 struct md_page *pvh; 3326 pv_entry_t pv; 3327 vm_offset_t va_last; 3328 vm_page_t m; 3329 3330 KASSERT((pa & PDRMASK) == 0, 3331 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3332 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3333 3334 /* 3335 * Transfer the first page's pv entry for this mapping to the 2mpage's 3336 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3337 * a transfer avoids the possibility that get_pv_entry() calls 3338 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3339 * mappings that is being promoted. 3340 */ 3341 m = PHYS_TO_VM_PAGE(pa); 3342 va = trunc_2mpage(va); 3343 pv = pmap_pvh_remove(&m->md, pmap, va); 3344 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3345 pvh = pa_to_pvh(pa); 3346 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3347 pvh->pv_gen++; 3348 /* Free the remaining NPTEPG - 1 pv entries. */ 3349 va_last = va + NBPDR - PAGE_SIZE; 3350 do { 3351 m++; 3352 va += PAGE_SIZE; 3353 pmap_pvh_free(&m->md, pmap, va); 3354 } while (va < va_last); 3355 } 3356 3357 /* 3358 * First find and then destroy the pv entry for the specified pmap and virtual 3359 * address. This operation can be performed on pv lists for either 4KB or 2MB 3360 * page mappings. 
3361 */ 3362 static void 3363 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3364 { 3365 pv_entry_t pv; 3366 3367 pv = pmap_pvh_remove(pvh, pmap, va); 3368 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3369 free_pv_entry(pmap, pv); 3370 } 3371 3372 /* 3373 * Conditionally create the PV entry for a 4KB page mapping if the required 3374 * memory can be allocated without resorting to reclamation. 3375 */ 3376 static boolean_t 3377 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3378 struct rwlock **lockp) 3379 { 3380 pv_entry_t pv; 3381 3382 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3383 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3384 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3385 pv->pv_va = va; 3386 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3387 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3388 m->md.pv_gen++; 3389 return (TRUE); 3390 } else 3391 return (FALSE); 3392 } 3393 3394 /* 3395 * Conditionally create the PV entry for a 2MB page mapping if the required 3396 * memory can be allocated without resorting to reclamation. 3397 */ 3398 static boolean_t 3399 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3400 struct rwlock **lockp) 3401 { 3402 struct md_page *pvh; 3403 pv_entry_t pv; 3404 3405 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3406 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3407 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3408 pv->pv_va = va; 3409 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3410 pvh = pa_to_pvh(pa); 3411 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3412 pvh->pv_gen++; 3413 return (TRUE); 3414 } else 3415 return (FALSE); 3416 } 3417 3418 /* 3419 * Fills a page table page with mappings to consecutive physical pages. 3420 */ 3421 static void 3422 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 3423 { 3424 pt_entry_t *pte; 3425 3426 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 3427 *pte = newpte; 3428 newpte += PAGE_SIZE; 3429 } 3430 } 3431 3432 /* 3433 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 3434 * mapping is invalidated. 
3435 */ 3436 static boolean_t 3437 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3438 { 3439 struct rwlock *lock; 3440 boolean_t rv; 3441 3442 lock = NULL; 3443 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 3444 if (lock != NULL) 3445 rw_wunlock(lock); 3446 return (rv); 3447 } 3448 3449 static boolean_t 3450 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3451 struct rwlock **lockp) 3452 { 3453 pd_entry_t newpde, oldpde; 3454 pt_entry_t *firstpte, newpte; 3455 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; 3456 vm_paddr_t mptepa; 3457 vm_page_t mpte; 3458 struct spglist free; 3459 vm_offset_t sva; 3460 int PG_PTE_CACHE; 3461 3462 PG_G = pmap_global_bit(pmap); 3463 PG_A = pmap_accessed_bit(pmap); 3464 PG_M = pmap_modified_bit(pmap); 3465 PG_RW = pmap_rw_bit(pmap); 3466 PG_V = pmap_valid_bit(pmap); 3467 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 3468 3469 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3470 oldpde = *pde; 3471 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 3472 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 3473 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 3474 NULL) { 3475 KASSERT((oldpde & PG_W) == 0, 3476 ("pmap_demote_pde: page table page for a wired mapping" 3477 " is missing")); 3478 3479 /* 3480 * Invalidate the 2MB page mapping and return "failure" if the 3481 * mapping was never accessed or the allocation of the new 3482 * page table page fails. If the 2MB page mapping belongs to 3483 * the direct map region of the kernel's address space, then 3484 * the page allocation request specifies the highest possible 3485 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 3486 * normal. Page table pages are preallocated for every other 3487 * part of the kernel address space, so the direct map region 3488 * is the only part of the kernel address space that must be 3489 * handled here. 3490 */ 3491 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 3492 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 3493 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 3494 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3495 SLIST_INIT(&free); 3496 sva = trunc_2mpage(va); 3497 pmap_remove_pde(pmap, pde, sva, &free, lockp); 3498 if ((oldpde & PG_G) == 0) 3499 pmap_invalidate_pde_page(pmap, sva, oldpde); 3500 pmap_free_zero_pages(&free); 3501 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 3502 " in pmap %p", va, pmap); 3503 return (FALSE); 3504 } 3505 if (va < VM_MAXUSER_ADDRESS) 3506 pmap_resident_count_inc(pmap, 1); 3507 } 3508 mptepa = VM_PAGE_TO_PHYS(mpte); 3509 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 3510 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 3511 KASSERT((oldpde & PG_A) != 0, 3512 ("pmap_demote_pde: oldpde is missing PG_A")); 3513 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 3514 ("pmap_demote_pde: oldpde is missing PG_M")); 3515 newpte = oldpde & ~PG_PS; 3516 newpte = pmap_swap_pat(pmap, newpte); 3517 3518 /* 3519 * If the page table page is new, initialize it. 3520 */ 3521 if (mpte->wire_count == 1) { 3522 mpte->wire_count = NPTEPG; 3523 pmap_fill_ptp(firstpte, newpte); 3524 } 3525 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 3526 ("pmap_demote_pde: firstpte and newpte map different physical" 3527 " addresses")); 3528 3529 /* 3530 * If the mapping has changed attributes, update the page table 3531 * entries. 
3532 */ 3533 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 3534 pmap_fill_ptp(firstpte, newpte); 3535 3536 /* 3537 * The spare PV entries must be reserved prior to demoting the 3538 * mapping, that is, prior to changing the PDE. Otherwise, the state 3539 * of the PDE and the PV lists will be inconsistent, which can result 3540 * in reclaim_pv_chunk() attempting to remove a PV entry from the 3541 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 3542 * PV entry for the 2MB page mapping that is being demoted. 3543 */ 3544 if ((oldpde & PG_MANAGED) != 0) 3545 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 3546 3547 /* 3548 * Demote the mapping. This pmap is locked. The old PDE has 3549 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 3550 * set. Thus, there is no danger of a race with another 3551 * processor changing the setting of PG_A and/or PG_M between 3552 * the read above and the store below. 3553 */ 3554 if (workaround_erratum383) 3555 pmap_update_pde(pmap, va, pde, newpde); 3556 else 3557 pde_store(pde, newpde); 3558 3559 /* 3560 * Invalidate a stale recursive mapping of the page table page. 3561 */ 3562 if (va >= VM_MAXUSER_ADDRESS) 3563 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3564 3565 /* 3566 * Demote the PV entry. 3567 */ 3568 if ((oldpde & PG_MANAGED) != 0) 3569 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 3570 3571 atomic_add_long(&pmap_pde_demotions, 1); 3572 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 3573 " in pmap %p", va, pmap); 3574 return (TRUE); 3575 } 3576 3577 /* 3578 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 3579 */ 3580 static void 3581 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3582 { 3583 pd_entry_t newpde; 3584 vm_paddr_t mptepa; 3585 vm_page_t mpte; 3586 3587 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3588 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3589 mpte = pmap_remove_pt_page(pmap, va); 3590 if (mpte == NULL) 3591 panic("pmap_remove_kernel_pde: Missing pt page."); 3592 3593 mptepa = VM_PAGE_TO_PHYS(mpte); 3594 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 3595 3596 /* 3597 * Initialize the page table page. 3598 */ 3599 pagezero((void *)PHYS_TO_DMAP(mptepa)); 3600 3601 /* 3602 * Demote the mapping. 3603 */ 3604 if (workaround_erratum383) 3605 pmap_update_pde(pmap, va, pde, newpde); 3606 else 3607 pde_store(pde, newpde); 3608 3609 /* 3610 * Invalidate a stale recursive mapping of the page table page. 
3611 */ 3612 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3613 } 3614 3615 /* 3616 * pmap_remove_pde: do the things to unmap a superpage in a process 3617 */ 3618 static int 3619 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 3620 struct spglist *free, struct rwlock **lockp) 3621 { 3622 struct md_page *pvh; 3623 pd_entry_t oldpde; 3624 vm_offset_t eva, va; 3625 vm_page_t m, mpte; 3626 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3627 3628 PG_G = pmap_global_bit(pmap); 3629 PG_A = pmap_accessed_bit(pmap); 3630 PG_M = pmap_modified_bit(pmap); 3631 PG_RW = pmap_rw_bit(pmap); 3632 3633 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3634 KASSERT((sva & PDRMASK) == 0, 3635 ("pmap_remove_pde: sva is not 2mpage aligned")); 3636 oldpde = pte_load_clear(pdq); 3637 if (oldpde & PG_W) 3638 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 3639 if ((oldpde & PG_G) != 0) 3640 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 3641 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 3642 if (oldpde & PG_MANAGED) { 3643 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 3644 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 3645 pmap_pvh_free(pvh, pmap, sva); 3646 eva = sva + NBPDR; 3647 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3648 va < eva; va += PAGE_SIZE, m++) { 3649 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3650 vm_page_dirty(m); 3651 if (oldpde & PG_A) 3652 vm_page_aflag_set(m, PGA_REFERENCED); 3653 if (TAILQ_EMPTY(&m->md.pv_list) && 3654 TAILQ_EMPTY(&pvh->pv_list)) 3655 vm_page_aflag_clear(m, PGA_WRITEABLE); 3656 pmap_delayed_invl_page(m); 3657 } 3658 } 3659 if (pmap == kernel_pmap) { 3660 pmap_remove_kernel_pde(pmap, pdq, sva); 3661 } else { 3662 mpte = pmap_remove_pt_page(pmap, sva); 3663 if (mpte != NULL) { 3664 pmap_resident_count_dec(pmap, 1); 3665 KASSERT(mpte->wire_count == NPTEPG, 3666 ("pmap_remove_pde: pte page wire count error")); 3667 mpte->wire_count = 0; 3668 pmap_add_delayed_free_list(mpte, free, FALSE); 3669 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 3670 } 3671 } 3672 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 3673 } 3674 3675 /* 3676 * pmap_remove_pte: do the things to unmap a page in a process 3677 */ 3678 static int 3679 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3680 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 3681 { 3682 struct md_page *pvh; 3683 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 3684 vm_page_t m; 3685 3686 PG_A = pmap_accessed_bit(pmap); 3687 PG_M = pmap_modified_bit(pmap); 3688 PG_RW = pmap_rw_bit(pmap); 3689 3690 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3691 oldpte = pte_load_clear(ptq); 3692 if (oldpte & PG_W) 3693 pmap->pm_stats.wired_count -= 1; 3694 pmap_resident_count_dec(pmap, 1); 3695 if (oldpte & PG_MANAGED) { 3696 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3697 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3698 vm_page_dirty(m); 3699 if (oldpte & PG_A) 3700 vm_page_aflag_set(m, PGA_REFERENCED); 3701 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3702 pmap_pvh_free(&m->md, pmap, va); 3703 if (TAILQ_EMPTY(&m->md.pv_list) && 3704 (m->flags & PG_FICTITIOUS) == 0) { 3705 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3706 if (TAILQ_EMPTY(&pvh->pv_list)) 3707 vm_page_aflag_clear(m, PGA_WRITEABLE); 3708 } 3709 pmap_delayed_invl_page(m); 3710 } 3711 return (pmap_unuse_pt(pmap, va, ptepde, free)); 3712 } 3713 3714 /* 3715 * Remove a single page from a process address space 3716 */ 3717 static void 3718 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 3719 struct spglist *free) 3720 { 3721 struct 
rwlock *lock; 3722 pt_entry_t *pte, PG_V; 3723 3724 PG_V = pmap_valid_bit(pmap); 3725 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3726 if ((*pde & PG_V) == 0) 3727 return; 3728 pte = pmap_pde_to_pte(pde, va); 3729 if ((*pte & PG_V) == 0) 3730 return; 3731 lock = NULL; 3732 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 3733 if (lock != NULL) 3734 rw_wunlock(lock); 3735 pmap_invalidate_page(pmap, va); 3736 } 3737 3738 /* 3739 * Remove the given range of addresses from the specified map. 3740 * 3741 * It is assumed that the start and end are properly 3742 * rounded to the page size. 3743 */ 3744 void 3745 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3746 { 3747 struct rwlock *lock; 3748 vm_offset_t va, va_next; 3749 pml4_entry_t *pml4e; 3750 pdp_entry_t *pdpe; 3751 pd_entry_t ptpaddr, *pde; 3752 pt_entry_t *pte, PG_G, PG_V; 3753 struct spglist free; 3754 int anyvalid; 3755 3756 PG_G = pmap_global_bit(pmap); 3757 PG_V = pmap_valid_bit(pmap); 3758 3759 /* 3760 * Perform an unsynchronized read. This is, however, safe. 3761 */ 3762 if (pmap->pm_stats.resident_count == 0) 3763 return; 3764 3765 anyvalid = 0; 3766 SLIST_INIT(&free); 3767 3768 pmap_delayed_invl_started(); 3769 PMAP_LOCK(pmap); 3770 3771 /* 3772 * Special handling for removing a single page: this is a very 3773 * common operation and some of the code below can be 3774 * short-circuited. 3775 */ 3776 if (sva + PAGE_SIZE == eva) { 3777 pde = pmap_pde(pmap, sva); 3778 if (pde && (*pde & PG_PS) == 0) { 3779 pmap_remove_page(pmap, sva, pde, &free); 3780 goto out; 3781 } 3782 } 3783 3784 lock = NULL; 3785 for (; sva < eva; sva = va_next) { 3786 3787 if (pmap->pm_stats.resident_count == 0) 3788 break; 3789 3790 pml4e = pmap_pml4e(pmap, sva); 3791 if ((*pml4e & PG_V) == 0) { 3792 va_next = (sva + NBPML4) & ~PML4MASK; 3793 if (va_next < sva) 3794 va_next = eva; 3795 continue; 3796 } 3797 3798 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3799 if ((*pdpe & PG_V) == 0) { 3800 va_next = (sva + NBPDP) & ~PDPMASK; 3801 if (va_next < sva) 3802 va_next = eva; 3803 continue; 3804 } 3805 3806 /* 3807 * Calculate the virtual address at which the next page table begins. 3808 */ 3809 va_next = (sva + NBPDR) & ~PDRMASK; 3810 if (va_next < sva) 3811 va_next = eva; 3812 3813 pde = pmap_pdpe_to_pde(pdpe, sva); 3814 ptpaddr = *pde; 3815 3816 /* 3817 * Weed out invalid mappings. 3818 */ 3819 if (ptpaddr == 0) 3820 continue; 3821 3822 /* 3823 * Check for large page. 3824 */ 3825 if ((ptpaddr & PG_PS) != 0) { 3826 /* 3827 * Are we removing the entire large page? If not, 3828 * demote the mapping and fall through. 3829 */ 3830 if (sva + NBPDR == va_next && eva >= va_next) { 3831 /* 3832 * The TLB entry for a PG_G mapping is 3833 * invalidated by pmap_remove_pde(). 3834 */ 3835 if ((ptpaddr & PG_G) == 0) 3836 anyvalid = 1; 3837 pmap_remove_pde(pmap, pde, sva, &free, &lock); 3838 continue; 3839 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 3840 &lock)) { 3841 /* The large page mapping was destroyed. */ 3842 continue; 3843 } else 3844 ptpaddr = *pde; 3845 } 3846 3847 /* 3848 * Limit our scan to either the end of the va represented 3849 * by the current page table page, or to the end of the 3850 * range being removed.
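 *
 * An illustrative arithmetic sketch (the addresses are assumptions,
 * not from the original sources): with NBPDR == 2MB, sva ==
 * 0x7fffffe01000 yields va_next == 0x800000000000, that is,
 * (sva + NBPDR) & ~PDRMASK. If eva == 0x7fffffe05000, the clamp
 * below lowers va_next to eva, so the 4KB loop visits exactly the
 * four PTEs for 0x7fffffe01000 through 0x7fffffe04000 and, for
 * global mappings, pmap_invalidate_range() covers just that span.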
3851 */ 3852 if (va_next > eva) 3853 va_next = eva; 3854 3855 va = va_next; 3856 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3857 sva += PAGE_SIZE) { 3858 if (*pte == 0) { 3859 if (va != va_next) { 3860 pmap_invalidate_range(pmap, va, sva); 3861 va = va_next; 3862 } 3863 continue; 3864 } 3865 if ((*pte & PG_G) == 0) 3866 anyvalid = 1; 3867 else if (va == va_next) 3868 va = sva; 3869 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, 3870 &lock)) { 3871 sva += PAGE_SIZE; 3872 break; 3873 } 3874 } 3875 if (va != va_next) 3876 pmap_invalidate_range(pmap, va, sva); 3877 } 3878 if (lock != NULL) 3879 rw_wunlock(lock); 3880 out: 3881 if (anyvalid) 3882 pmap_invalidate_all(pmap); 3883 PMAP_UNLOCK(pmap); 3884 pmap_delayed_invl_finished(); 3885 pmap_free_zero_pages(&free); 3886 } 3887 3888 /* 3889 * Routine: pmap_remove_all 3890 * Function: 3891 * Removes this physical page from 3892 * all physical maps in which it resides. 3893 * Reflects back modify bits to the pager. 3894 * 3895 * Notes: 3896 * Original versions of this routine were very 3897 * inefficient because they iteratively called 3898 * pmap_remove (slow...) 3899 */ 3900 3901 void 3902 pmap_remove_all(vm_page_t m) 3903 { 3904 struct md_page *pvh; 3905 pv_entry_t pv; 3906 pmap_t pmap; 3907 struct rwlock *lock; 3908 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 3909 pd_entry_t *pde; 3910 vm_offset_t va; 3911 struct spglist free; 3912 int pvh_gen, md_gen; 3913 3914 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3915 ("pmap_remove_all: page %p is not managed", m)); 3916 SLIST_INIT(&free); 3917 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3918 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 3919 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3920 retry: 3921 rw_wlock(lock); 3922 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3923 pmap = PV_PMAP(pv); 3924 if (!PMAP_TRYLOCK(pmap)) { 3925 pvh_gen = pvh->pv_gen; 3926 rw_wunlock(lock); 3927 PMAP_LOCK(pmap); 3928 rw_wlock(lock); 3929 if (pvh_gen != pvh->pv_gen) { 3930 rw_wunlock(lock); 3931 PMAP_UNLOCK(pmap); 3932 goto retry; 3933 } 3934 } 3935 va = pv->pv_va; 3936 pde = pmap_pde(pmap, va); 3937 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 3938 PMAP_UNLOCK(pmap); 3939 } 3940 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3941 pmap = PV_PMAP(pv); 3942 if (!PMAP_TRYLOCK(pmap)) { 3943 pvh_gen = pvh->pv_gen; 3944 md_gen = m->md.pv_gen; 3945 rw_wunlock(lock); 3946 PMAP_LOCK(pmap); 3947 rw_wlock(lock); 3948 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3949 rw_wunlock(lock); 3950 PMAP_UNLOCK(pmap); 3951 goto retry; 3952 } 3953 } 3954 PG_A = pmap_accessed_bit(pmap); 3955 PG_M = pmap_modified_bit(pmap); 3956 PG_RW = pmap_rw_bit(pmap); 3957 pmap_resident_count_dec(pmap, 1); 3958 pde = pmap_pde(pmap, pv->pv_va); 3959 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3960 " a 2mpage in page %p's pv list", m)); 3961 pte = pmap_pde_to_pte(pde, pv->pv_va); 3962 tpte = pte_load_clear(pte); 3963 if (tpte & PG_W) 3964 pmap->pm_stats.wired_count--; 3965 if (tpte & PG_A) 3966 vm_page_aflag_set(m, PGA_REFERENCED); 3967 3968 /* 3969 * Update the vm_page_t clean and reference bits. 
3970 */ 3971 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3972 vm_page_dirty(m); 3973 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 3974 pmap_invalidate_page(pmap, pv->pv_va); 3975 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3976 m->md.pv_gen++; 3977 free_pv_entry(pmap, pv); 3978 PMAP_UNLOCK(pmap); 3979 } 3980 vm_page_aflag_clear(m, PGA_WRITEABLE); 3981 rw_wunlock(lock); 3982 pmap_delayed_invl_wait(m); 3983 pmap_free_zero_pages(&free); 3984 } 3985 3986 /* 3987 * pmap_protect_pde: apply the requested protection to a 2mpage in a process 3988 */ 3989 static boolean_t 3990 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3991 { 3992 pd_entry_t newpde, oldpde; 3993 vm_offset_t eva, va; 3994 vm_page_t m; 3995 boolean_t anychanged; 3996 pt_entry_t PG_G, PG_M, PG_RW; 3997 3998 PG_G = pmap_global_bit(pmap); 3999 PG_M = pmap_modified_bit(pmap); 4000 PG_RW = pmap_rw_bit(pmap); 4001 4002 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4003 KASSERT((sva & PDRMASK) == 0, 4004 ("pmap_protect_pde: sva is not 2mpage aligned")); 4005 anychanged = FALSE; 4006 retry: 4007 oldpde = newpde = *pde; 4008 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4009 (PG_MANAGED | PG_M | PG_RW)) { 4010 eva = sva + NBPDR; 4011 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4012 va < eva; va += PAGE_SIZE, m++) 4013 vm_page_dirty(m); 4014 } 4015 if ((prot & VM_PROT_WRITE) == 0) 4016 newpde &= ~(PG_RW | PG_M); 4017 if ((prot & VM_PROT_EXECUTE) == 0) 4018 newpde |= pg_nx; 4019 if (newpde != oldpde) { 4020 /* 4021 * As an optimization to future operations on this PDE, clear 4022 * PG_PROMOTED. The impending invalidation will remove any 4023 * lingering 4KB page mappings from the TLB. 4024 */ 4025 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 4026 goto retry; 4027 if ((oldpde & PG_G) != 0) 4028 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 4029 else 4030 anychanged = TRUE; 4031 } 4032 return (anychanged); 4033 } 4034 4035 /* 4036 * Set the physical protection on the 4037 * specified range of this map as requested. 4038 */ 4039 void 4040 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4041 { 4042 vm_offset_t va_next; 4043 pml4_entry_t *pml4e; 4044 pdp_entry_t *pdpe; 4045 pd_entry_t ptpaddr, *pde; 4046 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 4047 boolean_t anychanged; 4048 4049 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4050 if (prot == VM_PROT_NONE) { 4051 pmap_remove(pmap, sva, eva); 4052 return; 4053 } 4054 4055 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4056 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4057 return; 4058 4059 PG_G = pmap_global_bit(pmap); 4060 PG_M = pmap_modified_bit(pmap); 4061 PG_V = pmap_valid_bit(pmap); 4062 PG_RW = pmap_rw_bit(pmap); 4063 anychanged = FALSE; 4064 4065 PMAP_LOCK(pmap); 4066 for (; sva < eva; sva = va_next) { 4067 4068 pml4e = pmap_pml4e(pmap, sva); 4069 if ((*pml4e & PG_V) == 0) { 4070 va_next = (sva + NBPML4) & ~PML4MASK; 4071 if (va_next < sva) 4072 va_next = eva; 4073 continue; 4074 } 4075 4076 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4077 if ((*pdpe & PG_V) == 0) { 4078 va_next = (sva + NBPDP) & ~PDPMASK; 4079 if (va_next < sva) 4080 va_next = eva; 4081 continue; 4082 } 4083 4084 va_next = (sva + NBPDR) & ~PDRMASK; 4085 if (va_next < sva) 4086 va_next = eva; 4087 4088 pde = pmap_pdpe_to_pde(pdpe, sva); 4089 ptpaddr = *pde; 4090 4091 /* 4092 * Weed out invalid mappings. 4093 */ 4094 if (ptpaddr == 0) 4095 continue; 4096 4097 /* 4098 * Check for large page.
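 *
 * The full-coverage test below is purely arithmetic: va_next is the
 * first 2MB boundary above sva, so "sva + NBPDR == va_next" holds
 * only when sva itself is 2MB aligned, and "eva >= va_next" means
 * the range reaches that boundary. As an illustrative example (the
 * addresses are assumptions), sva == 0x600000 with eva == 0x900000
 * protects the 2MB page at 0x600000 in place, whereas sva ==
 * 0x601000 forces a demotion to 4KB mappings first.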
4099 */ 4100 if ((ptpaddr & PG_PS) != 0) { 4101 /* 4102 * Are we protecting the entire large page? If not, 4103 * demote the mapping and fall through. 4104 */ 4105 if (sva + NBPDR == va_next && eva >= va_next) { 4106 /* 4107 * The TLB entry for a PG_G mapping is 4108 * invalidated by pmap_protect_pde(). 4109 */ 4110 if (pmap_protect_pde(pmap, pde, sva, prot)) 4111 anychanged = TRUE; 4112 continue; 4113 } else if (!pmap_demote_pde(pmap, pde, sva)) { 4114 /* 4115 * The large page mapping was destroyed. 4116 */ 4117 continue; 4118 } 4119 } 4120 4121 if (va_next > eva) 4122 va_next = eva; 4123 4124 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4125 sva += PAGE_SIZE) { 4126 pt_entry_t obits, pbits; 4127 vm_page_t m; 4128 4129 retry: 4130 obits = pbits = *pte; 4131 if ((pbits & PG_V) == 0) 4132 continue; 4133 4134 if ((prot & VM_PROT_WRITE) == 0) { 4135 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4136 (PG_MANAGED | PG_M | PG_RW)) { 4137 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4138 vm_page_dirty(m); 4139 } 4140 pbits &= ~(PG_RW | PG_M); 4141 } 4142 if ((prot & VM_PROT_EXECUTE) == 0) 4143 pbits |= pg_nx; 4144 4145 if (pbits != obits) { 4146 if (!atomic_cmpset_long(pte, obits, pbits)) 4147 goto retry; 4148 if (obits & PG_G) 4149 pmap_invalidate_page(pmap, sva); 4150 else 4151 anychanged = TRUE; 4152 } 4153 } 4154 } 4155 if (anychanged) 4156 pmap_invalidate_all(pmap); 4157 PMAP_UNLOCK(pmap); 4158 } 4159 4160 /* 4161 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4162 * single page table page (PTP) to a single 2MB page mapping. For promotion 4163 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4164 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4165 * identical characteristics. 4166 */ 4167 static void 4168 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 4169 struct rwlock **lockp) 4170 { 4171 pd_entry_t newpde; 4172 pt_entry_t *firstpte, oldpte, pa, *pte; 4173 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; 4174 vm_page_t mpte; 4175 int PG_PTE_CACHE; 4176 4177 PG_A = pmap_accessed_bit(pmap); 4178 PG_G = pmap_global_bit(pmap); 4179 PG_M = pmap_modified_bit(pmap); 4180 PG_V = pmap_valid_bit(pmap); 4181 PG_RW = pmap_rw_bit(pmap); 4182 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4183 4184 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4185 4186 /* 4187 * Examine the first PTE in the specified PTP. Abort if this PTE is 4188 * either invalid, unused, or does not map the first 4KB physical page 4189 * within a 2MB page. 4190 */ 4191 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4192 setpde: 4193 newpde = *firstpte; 4194 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4195 atomic_add_long(&pmap_pde_p_failures, 1); 4196 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4197 " in pmap %p", va, pmap); 4198 return; 4199 } 4200 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4201 /* 4202 * When PG_M is already clear, PG_RW can be cleared without 4203 * a TLB invalidation. 4204 */ 4205 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4206 goto setpde; 4207 newpde &= ~PG_RW; 4208 } 4209 4210 /* 4211 * Examine each of the other PTEs in the specified PTP. Abort if this 4212 * PTE maps an unexpected 4KB physical page or does not have identical 4213 * characteristics to the first PTE. 
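 *
 * A sketch of the expected values, assuming the first PTE maps
 * 2MB-aligned physical address P with PG_A and PG_V set: the PTE at
 * index i must match (P + i * PAGE_SIZE) | PG_A | PG_V in its
 * (PG_FRAME | PG_A | PG_V) bits. Hence "pa" below starts at
 * P + NBPDR - PAGE_SIZE for the last index (NPTEPG - 1 == 511) and
 * is decremented by PAGE_SIZE as the loop walks backwards to the
 * second PTE.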
4214 */ 4215 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4216 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4217 setpte: 4218 oldpte = *pte; 4219 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4220 atomic_add_long(&pmap_pde_p_failures, 1); 4221 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4222 " in pmap %p", va, pmap); 4223 return; 4224 } 4225 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4226 /* 4227 * When PG_M is already clear, PG_RW can be cleared 4228 * without a TLB invalidation. 4229 */ 4230 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4231 goto setpte; 4232 oldpte &= ~PG_RW; 4233 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4234 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 4235 (va & ~PDRMASK), pmap); 4236 } 4237 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4238 atomic_add_long(&pmap_pde_p_failures, 1); 4239 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4240 " in pmap %p", va, pmap); 4241 return; 4242 } 4243 pa -= PAGE_SIZE; 4244 } 4245 4246 /* 4247 * Save the page table page in its current state until the PDE 4248 * mapping the superpage is demoted by pmap_demote_pde() or 4249 * destroyed by pmap_remove_pde(). 4250 */ 4251 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4252 KASSERT(mpte >= vm_page_array && 4253 mpte < &vm_page_array[vm_page_array_size], 4254 ("pmap_promote_pde: page table page is out of range")); 4255 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4256 ("pmap_promote_pde: page table page's pindex is wrong")); 4257 if (pmap_insert_pt_page(pmap, mpte)) { 4258 atomic_add_long(&pmap_pde_p_failures, 1); 4259 CTR2(KTR_PMAP, 4260 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4261 pmap); 4262 return; 4263 } 4264 4265 /* 4266 * Promote the pv entries. 4267 */ 4268 if ((newpde & PG_MANAGED) != 0) 4269 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4270 4271 /* 4272 * Propagate the PAT index to its proper position. 4273 */ 4274 newpde = pmap_swap_pat(pmap, newpde); 4275 4276 /* 4277 * Map the superpage. 4278 */ 4279 if (workaround_erratum383) 4280 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4281 else 4282 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 4283 4284 atomic_add_long(&pmap_pde_promotions, 1); 4285 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4286 " in pmap %p", va, pmap); 4287 } 4288 4289 /* 4290 * Insert the given physical page (p) at 4291 * the specified virtual address (v) in the 4292 * target physical map with the protection requested. 4293 * 4294 * If specified, the page will be wired down, meaning 4295 * that the related pte can not be reclaimed. 4296 * 4297 * NB: This is the only routine which MAY NOT lazy-evaluate 4298 * or lose information. That is, this routine must actually 4299 * insert this page into the given map NOW. 4300 * 4301 * When destroying both a page table and PV entry, this function 4302 * performs the TLB invalidation before releasing the PV list 4303 * lock, so we do not need pmap_delayed_invl_page() calls here. 
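 *
 * A minimal usage sketch (illustrative only; this particular
 * combination of protection and flags is an assumption, not a
 * requirement of the interface):
 *
 *	rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | PMAP_ENTER_NOSLEEP, 0);
 *	if (rv == KERN_RESOURCE_SHORTAGE)
 *		rv = pmap_enter(pmap, va, m,
 *		    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0);
 *
 * Here the first call fails rather than sleeping for a page table
 * page, while the second is allowed to sleep.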
4304 */ 4305 int 4306 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4307 u_int flags, int8_t psind __unused) 4308 { 4309 struct rwlock *lock; 4310 pd_entry_t *pde; 4311 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4312 pt_entry_t newpte, origpte; 4313 pv_entry_t pv; 4314 vm_paddr_t opa, pa; 4315 vm_page_t mpte, om; 4316 boolean_t nosleep; 4317 4318 PG_A = pmap_accessed_bit(pmap); 4319 PG_G = pmap_global_bit(pmap); 4320 PG_M = pmap_modified_bit(pmap); 4321 PG_V = pmap_valid_bit(pmap); 4322 PG_RW = pmap_rw_bit(pmap); 4323 4324 va = trunc_page(va); 4325 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4326 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4327 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4328 va)); 4329 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4330 va >= kmi.clean_eva, 4331 ("pmap_enter: managed mapping within the clean submap")); 4332 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4333 VM_OBJECT_ASSERT_LOCKED(m->object); 4334 pa = VM_PAGE_TO_PHYS(m); 4335 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4336 if ((flags & VM_PROT_WRITE) != 0) 4337 newpte |= PG_M; 4338 if ((prot & VM_PROT_WRITE) != 0) 4339 newpte |= PG_RW; 4340 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4341 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 4342 if ((prot & VM_PROT_EXECUTE) == 0) 4343 newpte |= pg_nx; 4344 if ((flags & PMAP_ENTER_WIRED) != 0) 4345 newpte |= PG_W; 4346 if (va < VM_MAXUSER_ADDRESS) 4347 newpte |= PG_U; 4348 if (pmap == kernel_pmap) 4349 newpte |= PG_G; 4350 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); 4351 4352 /* 4353 * Set modified bit gratuitously for writeable mappings if 4354 * the page is unmanaged. We do not want to take a fault 4355 * to do the dirty bit accounting for these mappings. 4356 */ 4357 if ((m->oflags & VPO_UNMANAGED) != 0) { 4358 if ((newpte & PG_RW) != 0) 4359 newpte |= PG_M; 4360 } else 4361 newpte |= PG_MANAGED; 4362 4363 mpte = NULL; 4364 4365 lock = NULL; 4366 PMAP_LOCK(pmap); 4367 4368 /* 4369 * In the case that a page table page is not 4370 * resident, we are creating it here. 4371 */ 4372 retry: 4373 pde = pmap_pde(pmap, va); 4374 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4375 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4376 pte = pmap_pde_to_pte(pde, va); 4377 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4378 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4379 mpte->wire_count++; 4380 } 4381 } else if (va < VM_MAXUSER_ADDRESS) { 4382 /* 4383 * Here if the pte page isn't mapped, or if it has been 4384 * deallocated. 4385 */ 4386 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 4387 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), 4388 nosleep ? NULL : &lock); 4389 if (mpte == NULL && nosleep) { 4390 if (lock != NULL) 4391 rw_wunlock(lock); 4392 PMAP_UNLOCK(pmap); 4393 return (KERN_RESOURCE_SHORTAGE); 4394 } 4395 goto retry; 4396 } else 4397 panic("pmap_enter: invalid page directory va=%#lx", va); 4398 4399 origpte = *pte; 4400 4401 /* 4402 * Is the specified virtual address already mapped? 4403 */ 4404 if ((origpte & PG_V) != 0) { 4405 /* 4406 * Wiring change, just update stats. We don't worry about 4407 * wiring PT pages as they remain resident as long as there 4408 * are valid mappings in them. Hence, if a user page is wired, 4409 * the PT page will be also. 
4410 */ 4411 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4412 pmap->pm_stats.wired_count++; 4413 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4414 pmap->pm_stats.wired_count--; 4415 4416 /* 4417 * Remove the extra PT page reference. 4418 */ 4419 if (mpte != NULL) { 4420 mpte->wire_count--; 4421 KASSERT(mpte->wire_count > 0, 4422 ("pmap_enter: missing reference to page table page," 4423 " va: 0x%lx", va)); 4424 } 4425 4426 /* 4427 * Has the physical page changed? 4428 */ 4429 opa = origpte & PG_FRAME; 4430 if (opa == pa) { 4431 /* 4432 * No, might be a protection or wiring change. 4433 */ 4434 if ((origpte & PG_MANAGED) != 0 && 4435 (newpte & PG_RW) != 0) 4436 vm_page_aflag_set(m, PGA_WRITEABLE); 4437 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4438 goto unchanged; 4439 goto validate; 4440 } 4441 } else { 4442 /* 4443 * Increment the counters. 4444 */ 4445 if ((newpte & PG_W) != 0) 4446 pmap->pm_stats.wired_count++; 4447 pmap_resident_count_inc(pmap, 1); 4448 } 4449 4450 /* 4451 * Enter on the PV list if part of our managed memory. 4452 */ 4453 if ((newpte & PG_MANAGED) != 0) { 4454 pv = get_pv_entry(pmap, &lock); 4455 pv->pv_va = va; 4456 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4457 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4458 m->md.pv_gen++; 4459 if ((newpte & PG_RW) != 0) 4460 vm_page_aflag_set(m, PGA_WRITEABLE); 4461 } 4462 4463 /* 4464 * Update the PTE. 4465 */ 4466 if ((origpte & PG_V) != 0) { 4467 validate: 4468 origpte = pte_load_store(pte, newpte); 4469 opa = origpte & PG_FRAME; 4470 if (opa != pa) { 4471 if ((origpte & PG_MANAGED) != 0) { 4472 om = PHYS_TO_VM_PAGE(opa); 4473 if ((origpte & (PG_M | PG_RW)) == (PG_M | 4474 PG_RW)) 4475 vm_page_dirty(om); 4476 if ((origpte & PG_A) != 0) 4477 vm_page_aflag_set(om, PGA_REFERENCED); 4478 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4479 pmap_pvh_free(&om->md, pmap, va); 4480 if ((om->aflags & PGA_WRITEABLE) != 0 && 4481 TAILQ_EMPTY(&om->md.pv_list) && 4482 ((om->flags & PG_FICTITIOUS) != 0 || 4483 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4484 vm_page_aflag_clear(om, PGA_WRITEABLE); 4485 } 4486 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 4487 PG_RW)) == (PG_M | PG_RW)) { 4488 if ((origpte & PG_MANAGED) != 0) 4489 vm_page_dirty(m); 4490 4491 /* 4492 * Although the PTE may still have PG_RW set, TLB 4493 * invalidation may nonetheless be required because 4494 * the PTE no longer has PG_M set. 4495 */ 4496 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4497 /* 4498 * This PTE change does not require TLB invalidation. 4499 */ 4500 goto unchanged; 4501 } 4502 if ((origpte & PG_A) != 0) 4503 pmap_invalidate_page(pmap, va); 4504 } else 4505 pte_store(pte, newpte); 4506 4507 unchanged: 4508 4509 /* 4510 * If both the page table page and the reservation are fully 4511 * populated, then attempt promotion. 4512 */ 4513 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 4514 pmap_ps_enabled(pmap) && 4515 (m->flags & PG_FICTITIOUS) == 0 && 4516 vm_reserv_level_iffullpop(m) == 0) 4517 pmap_promote_pde(pmap, pde, va, &lock); 4518 4519 if (lock != NULL) 4520 rw_wunlock(lock); 4521 PMAP_UNLOCK(pmap); 4522 return (KERN_SUCCESS); 4523 } 4524 4525 /* 4526 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 4527 * otherwise. Fails if (1) a page table page cannot be allocated without 4528 * blocking, (2) a mapping already exists at the specified virtual address, or 4529 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
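 *
 * The caller is expected to pass a 2MB-aligned "va" and a page "m"
 * that begins a fully populated 2MB run (m->psind == 1);
 * pmap_enter_object() below checks exactly that before trying this
 * path:
 *
 *	if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 *	    m->psind == 1 && pmap_ps_enabled(pmap) &&
 *	    pmap_enter_pde(pmap, va, m, prot, &lock))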
4530 */ 4531 static boolean_t 4532 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4533 struct rwlock **lockp) 4534 { 4535 pd_entry_t *pde, newpde; 4536 pt_entry_t PG_V; 4537 vm_page_t mpde; 4538 struct spglist free; 4539 4540 PG_V = pmap_valid_bit(pmap); 4541 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4542 4543 if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { 4544 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4545 " in pmap %p", va, pmap); 4546 return (FALSE); 4547 } 4548 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 4549 pde = &pde[pmap_pde_index(va)]; 4550 if ((*pde & PG_V) != 0) { 4551 KASSERT(mpde->wire_count > 1, 4552 ("pmap_enter_pde: mpde's wire count is too low")); 4553 mpde->wire_count--; 4554 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4555 " in pmap %p", va, pmap); 4556 return (FALSE); 4557 } 4558 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 4559 PG_PS | PG_V; 4560 if ((m->oflags & VPO_UNMANAGED) == 0) { 4561 newpde |= PG_MANAGED; 4562 4563 /* 4564 * Abort this mapping if its PV entry could not be created. 4565 */ 4566 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), 4567 lockp)) { 4568 SLIST_INIT(&free); 4569 if (pmap_unwire_ptp(pmap, va, mpde, &free)) { 4570 /* 4571 * Although "va" is not mapped, paging- 4572 * structure caches could nonetheless have 4573 * entries that refer to the freed page table 4574 * pages. Invalidate those entries. 4575 */ 4576 pmap_invalidate_page(pmap, va); 4577 pmap_free_zero_pages(&free); 4578 } 4579 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4580 " in pmap %p", va, pmap); 4581 return (FALSE); 4582 } 4583 } 4584 if ((prot & VM_PROT_EXECUTE) == 0) 4585 newpde |= pg_nx; 4586 if (va < VM_MAXUSER_ADDRESS) 4587 newpde |= PG_U; 4588 4589 /* 4590 * Increment counters. 4591 */ 4592 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4593 4594 /* 4595 * Map the superpage. (This is not a promoted mapping; there will not 4596 * be any lingering 4KB page mappings in the TLB.) 4597 */ 4598 pde_store(pde, newpde); 4599 4600 atomic_add_long(&pmap_pde_mappings, 1); 4601 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 4602 " in pmap %p", va, pmap); 4603 return (TRUE); 4604 } 4605 4606 /* 4607 * Maps a sequence of resident pages belonging to the same object. 4608 * The sequence begins with the given page m_start. This page is 4609 * mapped at the given virtual address start. Each subsequent page is 4610 * mapped at a virtual address that is offset from start by the same 4611 * amount as the page is offset from m_start within the object. The 4612 * last page in the sequence is the page with the largest offset from 4613 * m_start that can be mapped at a virtual address less than the given 4614 * virtual address end. Not every virtual page between start and end 4615 * is mapped; only those for which a resident page exists with the 4616 * corresponding offset from m_start are mapped. 
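 *
 * Concretely, each resident page m in the run is mapped at
 *
 *	va = start + ptoa(m->pindex - m_start->pindex)
 *
 * so, as an illustrative example (the numbers are assumptions), with
 * start == 0x400000 and m_start->pindex == 10, the page with pindex
 * 13 (if resident) lands at 0x403000, and any page whose offset
 * would place it at or beyond "end" is skipped.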
4617 */ 4618 void 4619 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4620 vm_page_t m_start, vm_prot_t prot) 4621 { 4622 struct rwlock *lock; 4623 vm_offset_t va; 4624 vm_page_t m, mpte; 4625 vm_pindex_t diff, psize; 4626 4627 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4628 4629 psize = atop(end - start); 4630 mpte = NULL; 4631 m = m_start; 4632 lock = NULL; 4633 PMAP_LOCK(pmap); 4634 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4635 va = start + ptoa(diff); 4636 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4637 m->psind == 1 && pmap_ps_enabled(pmap) && 4638 pmap_enter_pde(pmap, va, m, prot, &lock)) 4639 m = &m[NBPDR / PAGE_SIZE - 1]; 4640 else 4641 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4642 mpte, &lock); 4643 m = TAILQ_NEXT(m, listq); 4644 } 4645 if (lock != NULL) 4646 rw_wunlock(lock); 4647 PMAP_UNLOCK(pmap); 4648 } 4649 4650 /* 4651 * This code makes some *MAJOR* assumptions: 4652 * 1. The current pmap and the given pmap exist. 4653 * 2. The mapping is not wired. 4654 * 3. Only read access is required. 4655 * 4. No page table pages. 4656 * In exchange, it is *MUCH* faster than pmap_enter... 4657 */ 4658 4659 void 4660 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4661 { 4662 struct rwlock *lock; 4663 4664 lock = NULL; 4665 PMAP_LOCK(pmap); 4666 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4667 if (lock != NULL) 4668 rw_wunlock(lock); 4669 PMAP_UNLOCK(pmap); 4670 } 4671 4672 static vm_page_t 4673 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4674 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4675 { 4676 struct spglist free; 4677 pt_entry_t *pte, PG_V; 4678 vm_paddr_t pa; 4679 4680 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4681 (m->oflags & VPO_UNMANAGED) != 0, 4682 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4683 PG_V = pmap_valid_bit(pmap); 4684 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4685 4686 /* 4687 * In the case that a page table page is not 4688 * resident, we are creating it here. 4689 */ 4690 if (va < VM_MAXUSER_ADDRESS) { 4691 vm_pindex_t ptepindex; 4692 pd_entry_t *ptepa; 4693 4694 /* 4695 * Calculate the page table page index. 4696 */ 4697 ptepindex = pmap_pde_pindex(va); 4698 if (mpte && (mpte->pindex == ptepindex)) { 4699 mpte->wire_count++; 4700 } else { 4701 /* 4702 * Get the page directory entry 4703 */ 4704 ptepa = pmap_pde(pmap, va); 4705 4706 /* 4707 * If the page table page is mapped, we just increment 4708 * the hold count, and activate it. Otherwise, we 4709 * attempt to allocate a page table page. If this 4710 * attempt fails, we don't retry. Instead, we give up. 4711 */ 4712 if (ptepa && (*ptepa & PG_V) != 0) { 4713 if (*ptepa & PG_PS) 4714 return (NULL); 4715 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 4716 mpte->wire_count++; 4717 } else { 4718 /* 4719 * Pass NULL instead of the PV list lock 4720 * pointer, because we don't intend to sleep. 4721 */ 4722 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 4723 if (mpte == NULL) 4724 return (mpte); 4725 } 4726 } 4727 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4728 pte = &pte[pmap_pte_index(va)]; 4729 } else { 4730 mpte = NULL; 4731 pte = vtopte(va); 4732 } 4733 if (*pte) { 4734 if (mpte != NULL) { 4735 mpte->wire_count--; 4736 mpte = NULL; 4737 } 4738 return (mpte); 4739 } 4740 4741 /* 4742 * Enter on the PV list if part of our managed memory.
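 *
 * Unlike pmap_enter(), this "quick" path uses
 * pmap_try_insert_pv_entry(), which fails rather than reclaiming a
 * pv entry from another mapping; on failure the code below drops
 * the page table page reference, if any, and simply declines to
 * create the mapping.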
4743 */ 4744 if ((m->oflags & VPO_UNMANAGED) == 0 && 4745 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4746 if (mpte != NULL) { 4747 SLIST_INIT(&free); 4748 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4749 /* 4750 * Although "va" is not mapped, paging- 4751 * structure caches could nonetheless have 4752 * entries that refer to the freed page table 4753 * pages. Invalidate those entries. 4754 */ 4755 pmap_invalidate_page(pmap, va); 4756 pmap_free_zero_pages(&free); 4757 } 4758 mpte = NULL; 4759 } 4760 return (mpte); 4761 } 4762 4763 /* 4764 * Increment counters 4765 */ 4766 pmap_resident_count_inc(pmap, 1); 4767 4768 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); 4769 if ((prot & VM_PROT_EXECUTE) == 0) 4770 pa |= pg_nx; 4771 4772 /* 4773 * Now validate mapping with RO protection 4774 */ 4775 if ((m->oflags & VPO_UNMANAGED) != 0) 4776 pte_store(pte, pa | PG_V | PG_U); 4777 else 4778 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 4779 return (mpte); 4780 } 4781 4782 /* 4783 * Make a temporary mapping for a physical address. This is only intended 4784 * to be used for panic dumps. 4785 */ 4786 void * 4787 pmap_kenter_temporary(vm_paddr_t pa, int i) 4788 { 4789 vm_offset_t va; 4790 4791 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4792 pmap_kenter(va, pa); 4793 invlpg(va); 4794 return ((void *)crashdumpmap); 4795 } 4796 4797 /* 4798 * This code maps large physical mmap regions into the 4799 * processor address space. Note that some shortcuts 4800 * are taken, but the code works. 4801 */ 4802 void 4803 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4804 vm_pindex_t pindex, vm_size_t size) 4805 { 4806 pd_entry_t *pde; 4807 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4808 vm_paddr_t pa, ptepa; 4809 vm_page_t p, pdpg; 4810 int pat_mode; 4811 4812 PG_A = pmap_accessed_bit(pmap); 4813 PG_M = pmap_modified_bit(pmap); 4814 PG_V = pmap_valid_bit(pmap); 4815 PG_RW = pmap_rw_bit(pmap); 4816 4817 VM_OBJECT_ASSERT_WLOCKED(object); 4818 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4819 ("pmap_object_init_pt: non-device object")); 4820 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4821 if (!pmap_ps_enabled(pmap)) 4822 return; 4823 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4824 return; 4825 p = vm_page_lookup(object, pindex); 4826 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4827 ("pmap_object_init_pt: invalid page %p", p)); 4828 pat_mode = p->md.pat_mode; 4829 4830 /* 4831 * Abort the mapping if the first page is not physically 4832 * aligned to a 2MB page boundary. 4833 */ 4834 ptepa = VM_PAGE_TO_PHYS(p); 4835 if (ptepa & (NBPDR - 1)) 4836 return; 4837 4838 /* 4839 * Skip the first page. Abort the mapping if the rest of 4840 * the pages are not physically contiguous or have differing 4841 * memory attributes. 4842 */ 4843 p = TAILQ_NEXT(p, listq); 4844 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4845 pa += PAGE_SIZE) { 4846 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4847 ("pmap_object_init_pt: invalid page %p", p)); 4848 if (pa != VM_PAGE_TO_PHYS(p) || 4849 pat_mode != p->md.pat_mode) 4850 return; 4851 p = TAILQ_NEXT(p, listq); 4852 } 4853 4854 /* 4855 * Map using 2MB pages. Since "ptepa" is 2M aligned and 4856 * "size" is a multiple of 2M, adding the PAT setting to "pa" 4857 * will not affect the termination of this loop. 
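 *
 * To see why, note that pmap_cache_bits(..., 1) only sets bits far
 * below NBPDR (PG_NC_PWT, PG_NC_PCD, and the 2MB-page PAT bit,
 * assuming the usual amd64 encodings), so every value taken by "pa"
 * differs from a 2MB multiple by the same small constant, and the
 * comparison against "ptepa + size" still yields exactly
 * size / NBPDR iterations.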
4858 */ 4859 PMAP_LOCK(pmap); 4860 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 4861 pa < ptepa + size; pa += NBPDR) { 4862 pdpg = pmap_allocpde(pmap, addr, NULL); 4863 if (pdpg == NULL) { 4864 /* 4865 * The creation of mappings below is only an 4866 * optimization. If a page directory page 4867 * cannot be allocated without blocking, 4868 * continue on to the next mapping rather than 4869 * blocking. 4870 */ 4871 addr += NBPDR; 4872 continue; 4873 } 4874 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4875 pde = &pde[pmap_pde_index(addr)]; 4876 if ((*pde & PG_V) == 0) { 4877 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4878 PG_U | PG_RW | PG_V); 4879 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4880 atomic_add_long(&pmap_pde_mappings, 1); 4881 } else { 4882 /* Continue on if the PDE is already valid. */ 4883 pdpg->wire_count--; 4884 KASSERT(pdpg->wire_count > 0, 4885 ("pmap_object_init_pt: missing reference " 4886 "to page directory page, va: 0x%lx", addr)); 4887 } 4888 addr += NBPDR; 4889 } 4890 PMAP_UNLOCK(pmap); 4891 } 4892 } 4893 4894 /* 4895 * Clear the wired attribute from the mappings for the specified range of 4896 * addresses in the given pmap. Every valid mapping within that range 4897 * must have the wired attribute set. In contrast, invalid mappings 4898 * cannot have the wired attribute set, so they are ignored. 4899 * 4900 * The wired attribute of the page table entry is not a hardware 4901 * feature, so there is no need to invalidate any TLB entries. 4902 * Since pmap_demote_pde() for the wired entry must never fail, 4903 * pmap_delayed_invl_started()/finished() calls around the 4904 * function are not needed. 4905 */ 4906 void 4907 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4908 { 4909 vm_offset_t va_next; 4910 pml4_entry_t *pml4e; 4911 pdp_entry_t *pdpe; 4912 pd_entry_t *pde; 4913 pt_entry_t *pte, PG_V; 4914 4915 PG_V = pmap_valid_bit(pmap); 4916 PMAP_LOCK(pmap); 4917 for (; sva < eva; sva = va_next) { 4918 pml4e = pmap_pml4e(pmap, sva); 4919 if ((*pml4e & PG_V) == 0) { 4920 va_next = (sva + NBPML4) & ~PML4MASK; 4921 if (va_next < sva) 4922 va_next = eva; 4923 continue; 4924 } 4925 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 4926 if ((*pdpe & PG_V) == 0) { 4927 va_next = (sva + NBPDP) & ~PDPMASK; 4928 if (va_next < sva) 4929 va_next = eva; 4930 continue; 4931 } 4932 va_next = (sva + NBPDR) & ~PDRMASK; 4933 if (va_next < sva) 4934 va_next = eva; 4935 pde = pmap_pdpe_to_pde(pdpe, sva); 4936 if ((*pde & PG_V) == 0) 4937 continue; 4938 if ((*pde & PG_PS) != 0) { 4939 if ((*pde & PG_W) == 0) 4940 panic("pmap_unwire: pde %#jx is missing PG_W", 4941 (uintmax_t)*pde); 4942 4943 /* 4944 * Are we unwiring the entire large page? If not, 4945 * demote the mapping and fall through. 4946 */ 4947 if (sva + NBPDR == va_next && eva >= va_next) { 4948 atomic_clear_long(pde, PG_W); 4949 pmap->pm_stats.wired_count -= NBPDR / 4950 PAGE_SIZE; 4951 continue; 4952 } else if (!pmap_demote_pde(pmap, pde, sva)) 4953 panic("pmap_unwire: demotion failed"); 4954 } 4955 if (va_next > eva) 4956 va_next = eva; 4957 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 4958 sva += PAGE_SIZE) { 4959 if ((*pte & PG_V) == 0) 4960 continue; 4961 if ((*pte & PG_W) == 0) 4962 panic("pmap_unwire: pte %#jx is missing PG_W", 4963 (uintmax_t)*pte); 4964 4965 /* 4966 * PG_W must be cleared atomically. Although the pmap 4967 * lock synchronizes access to PG_W, another processor 4968 * could be setting PG_M and/or PG_A concurrently. 
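 *
 * An illustrative lost-update interleaving: this CPU loads the PTE,
 * another CPU's MMU then sets PG_M on a store through the mapping,
 * and this CPU writes back the stale value with only PG_W cleared,
 * silently discarding the dirty bit. atomic_clear_long() confines
 * the update to the PG_W bit and avoids that window.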
4969 */ 4970 atomic_clear_long(pte, PG_W); 4971 pmap->pm_stats.wired_count--; 4972 } 4973 } 4974 PMAP_UNLOCK(pmap); 4975 } 4976 4977 /* 4978 * Copy the range specified by src_addr/len 4979 * from the source map to the range dst_addr/len 4980 * in the destination map. 4981 * 4982 * This routine is only advisory and need not do anything. 4983 */ 4984 4985 void 4986 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4987 vm_offset_t src_addr) 4988 { 4989 struct rwlock *lock; 4990 struct spglist free; 4991 vm_offset_t addr; 4992 vm_offset_t end_addr = src_addr + len; 4993 vm_offset_t va_next; 4994 pt_entry_t PG_A, PG_M, PG_V; 4995 4996 if (dst_addr != src_addr) 4997 return; 4998 4999 if (dst_pmap->pm_type != src_pmap->pm_type) 5000 return; 5001 5002 /* 5003 * EPT page table entries that require emulation of A/D bits are 5004 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 5005 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 5006 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 5007 * implementations flag an EPT misconfiguration for exec-only 5008 * mappings we skip this function entirely for emulated pmaps. 5009 */ 5010 if (pmap_emulate_ad_bits(dst_pmap)) 5011 return; 5012 5013 lock = NULL; 5014 if (dst_pmap < src_pmap) { 5015 PMAP_LOCK(dst_pmap); 5016 PMAP_LOCK(src_pmap); 5017 } else { 5018 PMAP_LOCK(src_pmap); 5019 PMAP_LOCK(dst_pmap); 5020 } 5021 5022 PG_A = pmap_accessed_bit(dst_pmap); 5023 PG_M = pmap_modified_bit(dst_pmap); 5024 PG_V = pmap_valid_bit(dst_pmap); 5025 5026 for (addr = src_addr; addr < end_addr; addr = va_next) { 5027 pt_entry_t *src_pte, *dst_pte; 5028 vm_page_t dstmpde, dstmpte, srcmpte; 5029 pml4_entry_t *pml4e; 5030 pdp_entry_t *pdpe; 5031 pd_entry_t srcptepaddr, *pde; 5032 5033 KASSERT(addr < UPT_MIN_ADDRESS, 5034 ("pmap_copy: invalid to pmap_copy page tables")); 5035 5036 pml4e = pmap_pml4e(src_pmap, addr); 5037 if ((*pml4e & PG_V) == 0) { 5038 va_next = (addr + NBPML4) & ~PML4MASK; 5039 if (va_next < addr) 5040 va_next = end_addr; 5041 continue; 5042 } 5043 5044 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 5045 if ((*pdpe & PG_V) == 0) { 5046 va_next = (addr + NBPDP) & ~PDPMASK; 5047 if (va_next < addr) 5048 va_next = end_addr; 5049 continue; 5050 } 5051 5052 va_next = (addr + NBPDR) & ~PDRMASK; 5053 if (va_next < addr) 5054 va_next = end_addr; 5055 5056 pde = pmap_pdpe_to_pde(pdpe, addr); 5057 srcptepaddr = *pde; 5058 if (srcptepaddr == 0) 5059 continue; 5060 5061 if (srcptepaddr & PG_PS) { 5062 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 5063 continue; 5064 dstmpde = pmap_allocpde(dst_pmap, addr, NULL); 5065 if (dstmpde == NULL) 5066 break; 5067 pde = (pd_entry_t *) 5068 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 5069 pde = &pde[pmap_pde_index(addr)]; 5070 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 5071 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 5072 PG_PS_FRAME, &lock))) { 5073 *pde = srcptepaddr & ~PG_W; 5074 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 5075 atomic_add_long(&pmap_pde_mappings, 1); 5076 } else 5077 dstmpde->wire_count--; 5078 continue; 5079 } 5080 5081 srcptepaddr &= PG_FRAME; 5082 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 5083 KASSERT(srcmpte->wire_count > 0, 5084 ("pmap_copy: source page table page is unused")); 5085 5086 if (va_next > end_addr) 5087 va_next = end_addr; 5088 5089 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 5090 src_pte = &src_pte[pmap_pte_index(addr)]; 5091 dstmpte = NULL; 5092 while (addr < va_next) { 5093 pt_entry_t ptetemp; 
5094 ptetemp = *src_pte; 5095 /* 5096 * We only virtually copy managed pages. 5097 */ 5098 if ((ptetemp & PG_MANAGED) != 0) { 5099 if (dstmpte != NULL && 5100 dstmpte->pindex == pmap_pde_pindex(addr)) 5101 dstmpte->wire_count++; 5102 else if ((dstmpte = pmap_allocpte(dst_pmap, 5103 addr, NULL)) == NULL) 5104 goto out; 5105 dst_pte = (pt_entry_t *) 5106 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 5107 dst_pte = &dst_pte[pmap_pte_index(addr)]; 5108 if (*dst_pte == 0 && 5109 pmap_try_insert_pv_entry(dst_pmap, addr, 5110 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 5111 &lock)) { 5112 /* 5113 * Clear the wired, modified, and 5114 * accessed (referenced) bits 5115 * during the copy. 5116 */ 5117 *dst_pte = ptetemp & ~(PG_W | PG_M | 5118 PG_A); 5119 pmap_resident_count_inc(dst_pmap, 1); 5120 } else { 5121 SLIST_INIT(&free); 5122 if (pmap_unwire_ptp(dst_pmap, addr, 5123 dstmpte, &free)) { 5124 /* 5125 * Although "addr" is not 5126 * mapped, paging-structure 5127 * caches could nonetheless 5128 * have entries that refer to 5129 * the freed page table pages. 5130 * Invalidate those entries. 5131 */ 5132 pmap_invalidate_page(dst_pmap, 5133 addr); 5134 pmap_free_zero_pages(&free); 5135 } 5136 goto out; 5137 } 5138 if (dstmpte->wire_count >= srcmpte->wire_count) 5139 break; 5140 } 5141 addr += PAGE_SIZE; 5142 src_pte++; 5143 } 5144 } 5145 out: 5146 if (lock != NULL) 5147 rw_wunlock(lock); 5148 PMAP_UNLOCK(src_pmap); 5149 PMAP_UNLOCK(dst_pmap); 5150 } 5151 5152 /* 5153 * Zero the specified hardware page. 5154 */ 5155 void 5156 pmap_zero_page(vm_page_t m) 5157 { 5158 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5159 5160 pagezero((void *)va); 5161 } 5162 5163 /* 5164 * Zero an area within a single hardware page. off and size must not 5165 * cover an area beyond a single hardware page. 5166 */ 5167 void 5168 pmap_zero_page_area(vm_page_t m, int off, int size) 5169 { 5170 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5171 5172 if (off == 0 && size == PAGE_SIZE) 5173 pagezero((void *)va); 5174 else 5175 bzero((char *)va + off, size); 5176 } 5177 5178 /* 5179 * Copy one specified hardware page to another. 5180 */ 5181 void 5182 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 5183 { 5184 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 5185 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 5186 5187 pagecopy((void *)src, (void *)dst); 5188 } 5189 5190 int unmapped_buf_allowed = 1; 5191 5192 void 5193 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5194 vm_offset_t b_offset, int xfersize) 5195 { 5196 void *a_cp, *b_cp; 5197 vm_page_t pages[2]; 5198 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 5199 int cnt; 5200 boolean_t mapped; 5201 5202 while (xfersize > 0) { 5203 a_pg_offset = a_offset & PAGE_MASK; 5204 pages[0] = ma[a_offset >> PAGE_SHIFT]; 5205 b_pg_offset = b_offset & PAGE_MASK; 5206 pages[1] = mb[b_offset >> PAGE_SHIFT]; 5207 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5208 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5209 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); 5210 a_cp = (char *)vaddr[0] + a_pg_offset; 5211 b_cp = (char *)vaddr[1] + b_pg_offset; 5212 bcopy(a_cp, b_cp, cnt); 5213 if (__predict_false(mapped)) 5214 pmap_unmap_io_transient(pages, vaddr, 2, FALSE); 5215 a_offset += cnt; 5216 b_offset += cnt; 5217 xfersize -= cnt; 5218 } 5219 } 5220 5221 /* 5222 * Returns true if the pmap's pv is one of the first
This count may 5224 * be changed upwards or downwards in the future; it 5225 * is only necessary that true be returned for a small 5226 * subset of pmaps for proper page aging. 5227 */ 5228 boolean_t 5229 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5230 { 5231 struct md_page *pvh; 5232 struct rwlock *lock; 5233 pv_entry_t pv; 5234 int loops = 0; 5235 boolean_t rv; 5236 5237 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5238 ("pmap_page_exists_quick: page %p is not managed", m)); 5239 rv = FALSE; 5240 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5241 rw_rlock(lock); 5242 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5243 if (PV_PMAP(pv) == pmap) { 5244 rv = TRUE; 5245 break; 5246 } 5247 loops++; 5248 if (loops >= 16) 5249 break; 5250 } 5251 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5252 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5253 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5254 if (PV_PMAP(pv) == pmap) { 5255 rv = TRUE; 5256 break; 5257 } 5258 loops++; 5259 if (loops >= 16) 5260 break; 5261 } 5262 } 5263 rw_runlock(lock); 5264 return (rv); 5265 } 5266 5267 /* 5268 * pmap_page_wired_mappings: 5269 * 5270 * Return the number of managed mappings to the given physical page 5271 * that are wired. 5272 */ 5273 int 5274 pmap_page_wired_mappings(vm_page_t m) 5275 { 5276 struct rwlock *lock; 5277 struct md_page *pvh; 5278 pmap_t pmap; 5279 pt_entry_t *pte; 5280 pv_entry_t pv; 5281 int count, md_gen, pvh_gen; 5282 5283 if ((m->oflags & VPO_UNMANAGED) != 0) 5284 return (0); 5285 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5286 rw_rlock(lock); 5287 restart: 5288 count = 0; 5289 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5290 pmap = PV_PMAP(pv); 5291 if (!PMAP_TRYLOCK(pmap)) { 5292 md_gen = m->md.pv_gen; 5293 rw_runlock(lock); 5294 PMAP_LOCK(pmap); 5295 rw_rlock(lock); 5296 if (md_gen != m->md.pv_gen) { 5297 PMAP_UNLOCK(pmap); 5298 goto restart; 5299 } 5300 } 5301 pte = pmap_pte(pmap, pv->pv_va); 5302 if ((*pte & PG_W) != 0) 5303 count++; 5304 PMAP_UNLOCK(pmap); 5305 } 5306 if ((m->flags & PG_FICTITIOUS) == 0) { 5307 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5308 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5309 pmap = PV_PMAP(pv); 5310 if (!PMAP_TRYLOCK(pmap)) { 5311 md_gen = m->md.pv_gen; 5312 pvh_gen = pvh->pv_gen; 5313 rw_runlock(lock); 5314 PMAP_LOCK(pmap); 5315 rw_rlock(lock); 5316 if (md_gen != m->md.pv_gen || 5317 pvh_gen != pvh->pv_gen) { 5318 PMAP_UNLOCK(pmap); 5319 goto restart; 5320 } 5321 } 5322 pte = pmap_pde(pmap, pv->pv_va); 5323 if ((*pte & PG_W) != 0) 5324 count++; 5325 PMAP_UNLOCK(pmap); 5326 } 5327 } 5328 rw_runlock(lock); 5329 return (count); 5330 } 5331 5332 /* 5333 * Returns TRUE if the given page is mapped individually or as part of 5334 * a 2mpage. Otherwise, returns FALSE. 5335 */ 5336 boolean_t 5337 pmap_page_is_mapped(vm_page_t m) 5338 { 5339 struct rwlock *lock; 5340 boolean_t rv; 5341 5342 if ((m->oflags & VPO_UNMANAGED) != 0) 5343 return (FALSE); 5344 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5345 rw_rlock(lock); 5346 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5347 ((m->flags & PG_FICTITIOUS) == 0 && 5348 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5349 rw_runlock(lock); 5350 return (rv); 5351 } 5352 5353 /* 5354 * Destroy all managed, non-wired mappings in the given user-space 5355 * pmap. This pmap cannot be active on any processor besides the 5356 * caller. 5357 * 5358 * This function cannot be applied to the kernel pmap. Moreover, it 5359 * is not intended for general use. It is only to be used during 5360 * process termination. 
Consequently, it can be implemented in ways 5361 * that make it faster than pmap_remove(). First, it can more quickly 5362 * destroy mappings by iterating over the pmap's collection of PV 5363 * entries, rather than searching the page table. Second, it doesn't 5364 * have to test and clear the page table entries atomically, because 5365 * no processor is currently accessing the user address space. In 5366 * particular, a page table entry's dirty bit won't change state once 5367 * this function starts. 5368 */ 5369 void 5370 pmap_remove_pages(pmap_t pmap) 5371 { 5372 pd_entry_t ptepde; 5373 pt_entry_t *pte, tpte; 5374 pt_entry_t PG_M, PG_RW, PG_V; 5375 struct spglist free; 5376 vm_page_t m, mpte, mt; 5377 pv_entry_t pv; 5378 struct md_page *pvh; 5379 struct pv_chunk *pc, *npc; 5380 struct rwlock *lock; 5381 int64_t bit; 5382 uint64_t inuse, bitmask; 5383 int allfree, field, freed, idx; 5384 boolean_t superpage; 5385 vm_paddr_t pa; 5386 5387 /* 5388 * Assert that the given pmap is only active on the current 5389 * CPU. Unfortunately, we cannot block another CPU from 5390 * activating the pmap while this function is executing. 5391 */ 5392 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 5393 #ifdef INVARIANTS 5394 { 5395 cpuset_t other_cpus; 5396 5397 other_cpus = all_cpus; 5398 critical_enter(); 5399 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 5400 CPU_AND(&other_cpus, &pmap->pm_active); 5401 critical_exit(); 5402 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 5403 } 5404 #endif 5405 5406 lock = NULL; 5407 PG_M = pmap_modified_bit(pmap); 5408 PG_V = pmap_valid_bit(pmap); 5409 PG_RW = pmap_rw_bit(pmap); 5410 5411 SLIST_INIT(&free); 5412 PMAP_LOCK(pmap); 5413 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5414 allfree = 1; 5415 freed = 0; 5416 for (field = 0; field < _NPCM; field++) { 5417 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5418 while (inuse != 0) { 5419 bit = bsfq(inuse); 5420 bitmask = 1UL << bit; 5421 idx = field * 64 + bit; 5422 pv = &pc->pc_pventry[idx]; 5423 inuse &= ~bitmask; 5424 5425 pte = pmap_pdpe(pmap, pv->pv_va); 5426 ptepde = *pte; 5427 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 5428 tpte = *pte; 5429 if ((tpte & (PG_PS | PG_V)) == PG_V) { 5430 superpage = FALSE; 5431 ptepde = tpte; 5432 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5433 PG_FRAME); 5434 pte = &pte[pmap_pte_index(pv->pv_va)]; 5435 tpte = *pte; 5436 } else { 5437 /* 5438 * Keep track whether 'tpte' is a 5439 * superpage explicitly instead of 5440 * relying on PG_PS being set. 5441 * 5442 * This is because PG_PS is numerically 5443 * identical to PG_PTE_PAT and thus a 5444 * regular page could be mistaken for 5445 * a superpage. 5446 */ 5447 superpage = TRUE; 5448 } 5449 5450 if ((tpte & PG_V) == 0) { 5451 panic("bad pte va %lx pte %lx", 5452 pv->pv_va, tpte); 5453 } 5454 5455 /* 5456 * We cannot remove wired pages from a process' mapping at this time 5457 */ 5458 if (tpte & PG_W) { 5459 allfree = 0; 5460 continue; 5461 } 5462 5463 if (superpage) 5464 pa = tpte & PG_PS_FRAME; 5465 else 5466 pa = tpte & PG_FRAME; 5467 5468 m = PHYS_TO_VM_PAGE(pa); 5469 KASSERT(m->phys_addr == pa, 5470 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5471 m, (uintmax_t)m->phys_addr, 5472 (uintmax_t)tpte)); 5473 5474 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5475 m < &vm_page_array[vm_page_array_size], 5476 ("pmap_remove_pages: bad tpte %#jx", 5477 (uintmax_t)tpte)); 5478 5479 pte_clear(pte); 5480 5481 /* 5482 * Update the vm_page_t clean/reference bits. 
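 *
 * For a superpage, the PDE-level dirty bit speaks for all 512
 * constituent 4KB pages, so the code below calls vm_page_dirty()
 * on every vm_page_t in the 2MB run rather than on "m" alone.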
5483 */ 5484 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5485 if (superpage) { 5486 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5487 vm_page_dirty(mt); 5488 } else 5489 vm_page_dirty(m); 5490 } 5491 5492 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5493 5494 /* Mark free */ 5495 pc->pc_map[field] |= bitmask; 5496 if (superpage) { 5497 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 5498 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5499 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5500 pvh->pv_gen++; 5501 if (TAILQ_EMPTY(&pvh->pv_list)) { 5502 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5503 if ((mt->aflags & PGA_WRITEABLE) != 0 && 5504 TAILQ_EMPTY(&mt->md.pv_list)) 5505 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5506 } 5507 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 5508 if (mpte != NULL) { 5509 pmap_resident_count_dec(pmap, 1); 5510 KASSERT(mpte->wire_count == NPTEPG, 5511 ("pmap_remove_pages: pte page wire count error")); 5512 mpte->wire_count = 0; 5513 pmap_add_delayed_free_list(mpte, &free, FALSE); 5514 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 5515 } 5516 } else { 5517 pmap_resident_count_dec(pmap, 1); 5518 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5519 m->md.pv_gen++; 5520 if ((m->aflags & PGA_WRITEABLE) != 0 && 5521 TAILQ_EMPTY(&m->md.pv_list) && 5522 (m->flags & PG_FICTITIOUS) == 0) { 5523 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5524 if (TAILQ_EMPTY(&pvh->pv_list)) 5525 vm_page_aflag_clear(m, PGA_WRITEABLE); 5526 } 5527 } 5528 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 5529 freed++; 5530 } 5531 } 5532 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5533 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5534 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5535 if (allfree) { 5536 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5537 free_pv_chunk(pc); 5538 } 5539 } 5540 if (lock != NULL) 5541 rw_wunlock(lock); 5542 pmap_invalidate_all(pmap); 5543 PMAP_UNLOCK(pmap); 5544 pmap_free_zero_pages(&free); 5545 } 5546 5547 static boolean_t 5548 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5549 { 5550 struct rwlock *lock; 5551 pv_entry_t pv; 5552 struct md_page *pvh; 5553 pt_entry_t *pte, mask; 5554 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5555 pmap_t pmap; 5556 int md_gen, pvh_gen; 5557 boolean_t rv; 5558 5559 rv = FALSE; 5560 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5561 rw_rlock(lock); 5562 restart: 5563 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5564 pmap = PV_PMAP(pv); 5565 if (!PMAP_TRYLOCK(pmap)) { 5566 md_gen = m->md.pv_gen; 5567 rw_runlock(lock); 5568 PMAP_LOCK(pmap); 5569 rw_rlock(lock); 5570 if (md_gen != m->md.pv_gen) { 5571 PMAP_UNLOCK(pmap); 5572 goto restart; 5573 } 5574 } 5575 pte = pmap_pte(pmap, pv->pv_va); 5576 mask = 0; 5577 if (modified) { 5578 PG_M = pmap_modified_bit(pmap); 5579 PG_RW = pmap_rw_bit(pmap); 5580 mask |= PG_RW | PG_M; 5581 } 5582 if (accessed) { 5583 PG_A = pmap_accessed_bit(pmap); 5584 PG_V = pmap_valid_bit(pmap); 5585 mask |= PG_V | PG_A; 5586 } 5587 rv = (*pte & mask) == mask; 5588 PMAP_UNLOCK(pmap); 5589 if (rv) 5590 goto out; 5591 } 5592 if ((m->flags & PG_FICTITIOUS) == 0) { 5593 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5594 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5595 pmap = PV_PMAP(pv); 5596 if (!PMAP_TRYLOCK(pmap)) { 5597 md_gen = m->md.pv_gen; 5598 pvh_gen = pvh->pv_gen; 5599 rw_runlock(lock); 5600 PMAP_LOCK(pmap); 5601 rw_rlock(lock); 5602 if (md_gen != m->md.pv_gen || 5603 pvh_gen != pvh->pv_gen) { 5604 PMAP_UNLOCK(pmap); 5605 goto restart; 5606 } 5607 } 5608 pte = pmap_pde(pmap, pv->pv_va); 5609 mask = 0; 5610 
if (modified) { 5611 PG_M = pmap_modified_bit(pmap); 5612 PG_RW = pmap_rw_bit(pmap); 5613 mask |= PG_RW | PG_M; 5614 } 5615 if (accessed) { 5616 PG_A = pmap_accessed_bit(pmap); 5617 PG_V = pmap_valid_bit(pmap); 5618 mask |= PG_V | PG_A; 5619 } 5620 rv = (*pte & mask) == mask; 5621 PMAP_UNLOCK(pmap); 5622 if (rv) 5623 goto out; 5624 } 5625 } 5626 out: 5627 rw_runlock(lock); 5628 return (rv); 5629 } 5630 5631 /* 5632 * pmap_is_modified: 5633 * 5634 * Return whether or not the specified physical page was modified 5635 * in any physical maps. 5636 */ 5637 boolean_t 5638 pmap_is_modified(vm_page_t m) 5639 { 5640 5641 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5642 ("pmap_is_modified: page %p is not managed", m)); 5643 5644 /* 5645 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5646 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5647 * is clear, no PTEs can have PG_M set. 5648 */ 5649 VM_OBJECT_ASSERT_WLOCKED(m->object); 5650 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5651 return (FALSE); 5652 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5653 } 5654 5655 /* 5656 * pmap_is_prefaultable: 5657 * 5658 * Return whether or not the specified virtual address is eligible 5659 * for prefault. 5660 */ 5661 boolean_t 5662 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5663 { 5664 pd_entry_t *pde; 5665 pt_entry_t *pte, PG_V; 5666 boolean_t rv; 5667 5668 PG_V = pmap_valid_bit(pmap); 5669 rv = FALSE; 5670 PMAP_LOCK(pmap); 5671 pde = pmap_pde(pmap, addr); 5672 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 5673 pte = pmap_pde_to_pte(pde, addr); 5674 rv = (*pte & PG_V) == 0; 5675 } 5676 PMAP_UNLOCK(pmap); 5677 return (rv); 5678 } 5679 5680 /* 5681 * pmap_is_referenced: 5682 * 5683 * Return whether or not the specified physical page was referenced 5684 * in any physical maps. 5685 */ 5686 boolean_t 5687 pmap_is_referenced(vm_page_t m) 5688 { 5689 5690 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5691 ("pmap_is_referenced: page %p is not managed", m)); 5692 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5693 } 5694 5695 /* 5696 * Clear the write and modified bits in each of the given page's mappings. 5697 */ 5698 void 5699 pmap_remove_write(vm_page_t m) 5700 { 5701 struct md_page *pvh; 5702 pmap_t pmap; 5703 struct rwlock *lock; 5704 pv_entry_t next_pv, pv; 5705 pd_entry_t *pde; 5706 pt_entry_t oldpte, *pte, PG_M, PG_RW; 5707 vm_offset_t va; 5708 int pvh_gen, md_gen; 5709 5710 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5711 ("pmap_remove_write: page %p is not managed", m)); 5712 5713 /* 5714 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5715 * set by another thread while the object is locked. Thus, 5716 * if PGA_WRITEABLE is clear, no page table entries need updating. 5717 */ 5718 VM_OBJECT_ASSERT_WLOCKED(m->object); 5719 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5720 return; 5721 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5722 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5723 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5724 retry_pv_loop: 5725 rw_wlock(lock); 5726 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5727 pmap = PV_PMAP(pv); 5728 if (!PMAP_TRYLOCK(pmap)) { 5729 pvh_gen = pvh->pv_gen; 5730 rw_wunlock(lock); 5731 PMAP_LOCK(pmap); 5732 rw_wlock(lock); 5733 if (pvh_gen != pvh->pv_gen) { 5734 PMAP_UNLOCK(pmap); 5735 rw_wunlock(lock); 5736 goto retry_pv_loop; 5737 } 5738 } 5739 PG_RW = pmap_rw_bit(pmap); 5740 va = pv->pv_va; 5741 pde = pmap_pde(pmap, va); 5742 if ((*pde & PG_RW) != 0) 5743 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 5744 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5745 ("inconsistent pv lock %p %p for page %p", 5746 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5747 PMAP_UNLOCK(pmap); 5748 } 5749 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5750 pmap = PV_PMAP(pv); 5751 if (!PMAP_TRYLOCK(pmap)) { 5752 pvh_gen = pvh->pv_gen; 5753 md_gen = m->md.pv_gen; 5754 rw_wunlock(lock); 5755 PMAP_LOCK(pmap); 5756 rw_wlock(lock); 5757 if (pvh_gen != pvh->pv_gen || 5758 md_gen != m->md.pv_gen) { 5759 PMAP_UNLOCK(pmap); 5760 rw_wunlock(lock); 5761 goto retry_pv_loop; 5762 } 5763 } 5764 PG_M = pmap_modified_bit(pmap); 5765 PG_RW = pmap_rw_bit(pmap); 5766 pde = pmap_pde(pmap, pv->pv_va); 5767 KASSERT((*pde & PG_PS) == 0, 5768 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5769 m)); 5770 pte = pmap_pde_to_pte(pde, pv->pv_va); 5771 retry: 5772 oldpte = *pte; 5773 if (oldpte & PG_RW) { 5774 if (!atomic_cmpset_long(pte, oldpte, oldpte & 5775 ~(PG_RW | PG_M))) 5776 goto retry; 5777 if ((oldpte & PG_M) != 0) 5778 vm_page_dirty(m); 5779 pmap_invalidate_page(pmap, pv->pv_va); 5780 } 5781 PMAP_UNLOCK(pmap); 5782 } 5783 rw_wunlock(lock); 5784 vm_page_aflag_clear(m, PGA_WRITEABLE); 5785 pmap_delayed_invl_wait(m); 5786 } 5787 5788 static __inline boolean_t 5789 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 5790 { 5791 5792 if (!pmap_emulate_ad_bits(pmap)) 5793 return (TRUE); 5794 5795 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 5796 5797 /* 5798 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration, 5799 * so we don't allow the referenced (aka EPT_PG_READ) bit to be cleared 5800 * if the EPT_PG_WRITE bit is set. 5801 */ 5802 if ((pte & EPT_PG_WRITE) != 0) 5803 return (FALSE); 5804 5805 /* 5806 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set. 5807 */ 5808 if ((pte & EPT_PG_EXECUTE) == 0 || 5809 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 5810 return (TRUE); 5811 else 5812 return (FALSE); 5813 } 5814 5815 /* 5816 * pmap_ts_referenced: 5817 * 5818 * Return a count of reference bits for a page, clearing those bits. 5819 * It is not necessary for every reference bit to be cleared, but it 5820 * is necessary that 0 only be returned when there are truly no 5821 * reference bits set. 5822 * 5823 * As an optimization, update the page's dirty field if a modified bit is 5824 * found while counting reference bits. This opportunistic update can be 5825 * performed at low cost and can eliminate the need for some future calls 5826 * to pmap_is_modified(). However, since this function stops after 5827 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5828 * dirty pages. Those dirty pages will only be detected by a future call 5829 * to pmap_is_modified(). 5830 * 5831 * A DI block is not needed within this function, because 5832 * invalidations are performed before the PV list lock is 5833 * released.
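 *
 * A usage sketch (illustrative; the policy around the call is an
 * assumption, not part of this interface): a pageout scan might
 * accumulate
 *
 *	act_count += pmap_ts_referenced(m);
 *
 * and deactivate "m" only when the accumulated count remains 0,
 * relying on the guarantee above that 0 is returned only when no
 * reference bits were found.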
5834 */ 5835 int 5836 pmap_ts_referenced(vm_page_t m) 5837 { 5838 struct md_page *pvh; 5839 pv_entry_t pv, pvf; 5840 pmap_t pmap; 5841 struct rwlock *lock; 5842 pd_entry_t oldpde, *pde; 5843 pt_entry_t *pte, PG_A, PG_M, PG_RW; 5844 vm_offset_t va; 5845 vm_paddr_t pa; 5846 int cleared, md_gen, not_cleared, pvh_gen; 5847 struct spglist free; 5848 boolean_t demoted; 5849 5850 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5851 ("pmap_ts_referenced: page %p is not managed", m)); 5852 SLIST_INIT(&free); 5853 cleared = 0; 5854 pa = VM_PAGE_TO_PHYS(m); 5855 lock = PHYS_TO_PV_LIST_LOCK(pa); 5856 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 5857 rw_wlock(lock); 5858 retry: 5859 not_cleared = 0; 5860 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5861 goto small_mappings; 5862 pv = pvf; 5863 do { 5864 if (pvf == NULL) 5865 pvf = pv; 5866 pmap = PV_PMAP(pv); 5867 if (!PMAP_TRYLOCK(pmap)) { 5868 pvh_gen = pvh->pv_gen; 5869 rw_wunlock(lock); 5870 PMAP_LOCK(pmap); 5871 rw_wlock(lock); 5872 if (pvh_gen != pvh->pv_gen) { 5873 PMAP_UNLOCK(pmap); 5874 goto retry; 5875 } 5876 } 5877 PG_A = pmap_accessed_bit(pmap); 5878 PG_M = pmap_modified_bit(pmap); 5879 PG_RW = pmap_rw_bit(pmap); 5880 va = pv->pv_va; 5881 pde = pmap_pde(pmap, pv->pv_va); 5882 oldpde = *pde; 5883 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5884 /* 5885 * Although "oldpde" is mapping a 2MB page, because 5886 * this function is called at a 4KB page granularity, 5887 * we only update the 4KB page under test. 5888 */ 5889 vm_page_dirty(m); 5890 } 5891 if ((oldpde & PG_A) != 0) { 5892 /* 5893 * Since this reference bit is shared by 512 4KB 5894 * pages, it should not be cleared every time it is 5895 * tested. Apply a simple "hash" function on the 5896 * physical page number, the virtual superpage number, 5897 * and the pmap address to select one 4KB page out of 5898 * the 512 on which testing the reference bit will 5899 * result in clearing that reference bit. This 5900 * function is designed to avoid the selection of the 5901 * same 4KB page for every 2MB page mapping. 5902 * 5903 * On demotion, a mapping that hasn't been referenced 5904 * is simply destroyed. To avoid the possibility of a 5905 * subsequent page fault on a demoted wired mapping, 5906 * always leave its reference bit set. Moreover, 5907 * since the superpage is wired, the current state of 5908 * its reference bit won't affect page replacement. 5909 */ 5910 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5911 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5912 (oldpde & PG_W) == 0) { 5913 if (safe_to_clear_referenced(pmap, oldpde)) { 5914 atomic_clear_long(pde, PG_A); 5915 pmap_invalidate_page(pmap, pv->pv_va); 5916 demoted = FALSE; 5917 } else if (pmap_demote_pde_locked(pmap, pde, 5918 pv->pv_va, &lock)) { 5919 /* 5920 * Remove the mapping to a single page 5921 * so that a subsequent access may 5922 * repromote. Since the underlying 5923 * page table page is fully populated, 5924 * this removal never frees a page 5925 * table page. 5926 */ 5927 demoted = TRUE; 5928 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5929 PG_PS_FRAME); 5930 pte = pmap_pde_to_pte(pde, va); 5931 pmap_remove_pte(pmap, pte, va, *pde, 5932 NULL, &lock); 5933 pmap_invalidate_page(pmap, va); 5934 } else 5935 demoted = TRUE; 5936 5937 if (demoted) { 5938 /* 5939 * The superpage mapping was removed 5940 * entirely and therefore 'pv' is no 5941 * longer valid. 
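					 * Clearing "pv" below keeps the
					 * stale entry from being rotated or
					 * followed, and clearing "pvf"
					 * restarts the loop termination
					 * check.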
5942 */ 5943 if (pvf == pv) 5944 pvf = NULL; 5945 pv = NULL; 5946 } 5947 cleared++; 5948 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5949 ("inconsistent pv lock %p %p for page %p", 5950 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5951 } else 5952 not_cleared++; 5953 } 5954 PMAP_UNLOCK(pmap); 5955 /* Rotate the PV list if it has more than one entry. */ 5956 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5957 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5958 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5959 pvh->pv_gen++; 5960 } 5961 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5962 goto out; 5963 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5964 small_mappings: 5965 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5966 goto out; 5967 pv = pvf; 5968 do { 5969 if (pvf == NULL) 5970 pvf = pv; 5971 pmap = PV_PMAP(pv); 5972 if (!PMAP_TRYLOCK(pmap)) { 5973 pvh_gen = pvh->pv_gen; 5974 md_gen = m->md.pv_gen; 5975 rw_wunlock(lock); 5976 PMAP_LOCK(pmap); 5977 rw_wlock(lock); 5978 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5979 PMAP_UNLOCK(pmap); 5980 goto retry; 5981 } 5982 } 5983 PG_A = pmap_accessed_bit(pmap); 5984 PG_M = pmap_modified_bit(pmap); 5985 PG_RW = pmap_rw_bit(pmap); 5986 pde = pmap_pde(pmap, pv->pv_va); 5987 KASSERT((*pde & PG_PS) == 0, 5988 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 5989 m)); 5990 pte = pmap_pde_to_pte(pde, pv->pv_va); 5991 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5992 vm_page_dirty(m); 5993 if ((*pte & PG_A) != 0) { 5994 if (safe_to_clear_referenced(pmap, *pte)) { 5995 atomic_clear_long(pte, PG_A); 5996 pmap_invalidate_page(pmap, pv->pv_va); 5997 cleared++; 5998 } else if ((*pte & PG_W) == 0) { 5999 /* 6000 * Wired pages cannot be paged out so 6001 * doing accessed bit emulation for 6002 * them is wasted effort. We do the 6003 * hard work for unwired pages only. 6004 */ 6005 pmap_remove_pte(pmap, pte, pv->pv_va, 6006 *pde, &free, &lock); 6007 pmap_invalidate_page(pmap, pv->pv_va); 6008 cleared++; 6009 if (pvf == pv) 6010 pvf = NULL; 6011 pv = NULL; 6012 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 6013 ("inconsistent pv lock %p %p for page %p", 6014 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 6015 } else 6016 not_cleared++; 6017 } 6018 PMAP_UNLOCK(pmap); 6019 /* Rotate the PV list if it has more than one entry. */ 6020 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 6021 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6022 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 6023 m->md.pv_gen++; 6024 } 6025 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 6026 not_cleared < PMAP_TS_REFERENCED_MAX); 6027 out: 6028 rw_wunlock(lock); 6029 pmap_free_zero_pages(&free); 6030 return (cleared + not_cleared); 6031 } 6032 6033 /* 6034 * Apply the given advice to the specified range of addresses within the 6035 * given pmap. Depending on the advice, clear the referenced and/or 6036 * modified flags in each mapping and set the mapped page's dirty field. 6037 */ 6038 void 6039 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 6040 { 6041 struct rwlock *lock; 6042 pml4_entry_t *pml4e; 6043 pdp_entry_t *pdpe; 6044 pd_entry_t oldpde, *pde; 6045 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 6046 vm_offset_t va, va_next; 6047 vm_page_t m; 6048 boolean_t anychanged; 6049 6050 if (advice != MADV_DONTNEED && advice != MADV_FREE) 6051 return; 6052 6053 /* 6054 * A/D bit emulation requires an alternate code path when clearing 6055 * the modified and accessed bits below. 
Since this function is 6056 * advisory in nature we skip it entirely for pmaps that require 6057 * A/D bit emulation. 6058 */ 6059 if (pmap_emulate_ad_bits(pmap)) 6060 return; 6061 6062 PG_A = pmap_accessed_bit(pmap); 6063 PG_G = pmap_global_bit(pmap); 6064 PG_M = pmap_modified_bit(pmap); 6065 PG_V = pmap_valid_bit(pmap); 6066 PG_RW = pmap_rw_bit(pmap); 6067 anychanged = FALSE; 6068 pmap_delayed_invl_started(); 6069 PMAP_LOCK(pmap); 6070 for (; sva < eva; sva = va_next) { 6071 pml4e = pmap_pml4e(pmap, sva); 6072 if ((*pml4e & PG_V) == 0) { 6073 va_next = (sva + NBPML4) & ~PML4MASK; 6074 if (va_next < sva) 6075 va_next = eva; 6076 continue; 6077 } 6078 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6079 if ((*pdpe & PG_V) == 0) { 6080 va_next = (sva + NBPDP) & ~PDPMASK; 6081 if (va_next < sva) 6082 va_next = eva; 6083 continue; 6084 } 6085 va_next = (sva + NBPDR) & ~PDRMASK; 6086 if (va_next < sva) 6087 va_next = eva; 6088 pde = pmap_pdpe_to_pde(pdpe, sva); 6089 oldpde = *pde; 6090 if ((oldpde & PG_V) == 0) 6091 continue; 6092 else if ((oldpde & PG_PS) != 0) { 6093 if ((oldpde & PG_MANAGED) == 0) 6094 continue; 6095 lock = NULL; 6096 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 6097 if (lock != NULL) 6098 rw_wunlock(lock); 6099 6100 /* 6101 * The large page mapping was destroyed. 6102 */ 6103 continue; 6104 } 6105 6106 /* 6107 * Unless the page mappings are wired, remove the 6108 * mapping to a single page so that a subsequent 6109 * access may repromote. Since the underlying page 6110 * table page is fully populated, this removal never 6111 * frees a page table page. 6112 */ 6113 if ((oldpde & PG_W) == 0) { 6114 pte = pmap_pde_to_pte(pde, sva); 6115 KASSERT((*pte & PG_V) != 0, 6116 ("pmap_advise: invalid PTE")); 6117 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 6118 &lock); 6119 anychanged = TRUE; 6120 } 6121 if (lock != NULL) 6122 rw_wunlock(lock); 6123 } 6124 if (va_next > eva) 6125 va_next = eva; 6126 va = va_next; 6127 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6128 sva += PAGE_SIZE) { 6129 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 6130 goto maybe_invlrng; 6131 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6132 if (advice == MADV_DONTNEED) { 6133 /* 6134 * Future calls to pmap_is_modified() 6135 * can be avoided by making the page 6136 * dirty now. 6137 */ 6138 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6139 vm_page_dirty(m); 6140 } 6141 atomic_clear_long(pte, PG_M | PG_A); 6142 } else if ((*pte & PG_A) != 0) 6143 atomic_clear_long(pte, PG_A); 6144 else 6145 goto maybe_invlrng; 6146 6147 if ((*pte & PG_G) != 0) { 6148 if (va == va_next) 6149 va = sva; 6150 } else 6151 anychanged = TRUE; 6152 continue; 6153 maybe_invlrng: 6154 if (va != va_next) { 6155 pmap_invalidate_range(pmap, va, sva); 6156 va = va_next; 6157 } 6158 } 6159 if (va != va_next) 6160 pmap_invalidate_range(pmap, va, sva); 6161 } 6162 if (anychanged) 6163 pmap_invalidate_all(pmap); 6164 PMAP_UNLOCK(pmap); 6165 pmap_delayed_invl_finished(); 6166 } 6167 6168 /* 6169 * Clear the modify bits on the specified physical page. 
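 * The page must not be exclusive busied; pmap_clear_modify() asserts this
 * below, because an exclusive busy would allow PGA_WRITEABLE to be set
 * concurrently.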
6170 */ 6171 void 6172 pmap_clear_modify(vm_page_t m) 6173 { 6174 struct md_page *pvh; 6175 pmap_t pmap; 6176 pv_entry_t next_pv, pv; 6177 pd_entry_t oldpde, *pde; 6178 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 6179 struct rwlock *lock; 6180 vm_offset_t va; 6181 int md_gen, pvh_gen; 6182 6183 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6184 ("pmap_clear_modify: page %p is not managed", m)); 6185 VM_OBJECT_ASSERT_WLOCKED(m->object); 6186 KASSERT(!vm_page_xbusied(m), 6187 ("pmap_clear_modify: page %p is exclusive busied", m)); 6188 6189 /* 6190 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 6191 * If the object containing the page is locked and the page is not 6192 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 6193 */ 6194 if ((m->aflags & PGA_WRITEABLE) == 0) 6195 return; 6196 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6197 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6198 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6199 rw_wlock(lock); 6200 restart: 6201 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6202 pmap = PV_PMAP(pv); 6203 if (!PMAP_TRYLOCK(pmap)) { 6204 pvh_gen = pvh->pv_gen; 6205 rw_wunlock(lock); 6206 PMAP_LOCK(pmap); 6207 rw_wlock(lock); 6208 if (pvh_gen != pvh->pv_gen) { 6209 PMAP_UNLOCK(pmap); 6210 goto restart; 6211 } 6212 } 6213 PG_M = pmap_modified_bit(pmap); 6214 PG_V = pmap_valid_bit(pmap); 6215 PG_RW = pmap_rw_bit(pmap); 6216 va = pv->pv_va; 6217 pde = pmap_pde(pmap, va); 6218 oldpde = *pde; 6219 if ((oldpde & PG_RW) != 0) { 6220 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 6221 if ((oldpde & PG_W) == 0) { 6222 /* 6223 * Write protect the mapping to a 6224 * single page so that a subsequent 6225 * write access may repromote. 6226 */ 6227 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6228 PG_PS_FRAME); 6229 pte = pmap_pde_to_pte(pde, va); 6230 oldpte = *pte; 6231 if ((oldpte & PG_V) != 0) { 6232 while (!atomic_cmpset_long(pte, 6233 oldpte, 6234 oldpte & ~(PG_M | PG_RW))) 6235 oldpte = *pte; 6236 vm_page_dirty(m); 6237 pmap_invalidate_page(pmap, va); 6238 } 6239 } 6240 } 6241 } 6242 PMAP_UNLOCK(pmap); 6243 } 6244 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6245 pmap = PV_PMAP(pv); 6246 if (!PMAP_TRYLOCK(pmap)) { 6247 md_gen = m->md.pv_gen; 6248 pvh_gen = pvh->pv_gen; 6249 rw_wunlock(lock); 6250 PMAP_LOCK(pmap); 6251 rw_wlock(lock); 6252 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6253 PMAP_UNLOCK(pmap); 6254 goto restart; 6255 } 6256 } 6257 PG_M = pmap_modified_bit(pmap); 6258 PG_RW = pmap_rw_bit(pmap); 6259 pde = pmap_pde(pmap, pv->pv_va); 6260 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 6261 " a 2mpage in page %p's pv list", m)); 6262 pte = pmap_pde_to_pte(pde, pv->pv_va); 6263 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6264 atomic_clear_long(pte, PG_M); 6265 pmap_invalidate_page(pmap, pv->pv_va); 6266 } 6267 PMAP_UNLOCK(pmap); 6268 } 6269 rw_wunlock(lock); 6270 } 6271 6272 /* 6273 * Miscellaneous support routines follow 6274 */ 6275 6276 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 6277 static __inline void 6278 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 6279 { 6280 u_int opte, npte; 6281 6282 /* 6283 * The cache mode bits are all in the low 32-bits of the 6284 * PTE, so we can just spin on updating the low 32-bits. 6285 */ 6286 do { 6287 opte = *(u_int *)pte; 6288 npte = opte & ~mask; 6289 npte |= cache_bits; 6290 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 6291 } 6292 6293 /* Adjust the cache mode for a 2MB page mapped via a PDE. 
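 This helper
 * is also applied to 1GB page PDPEs by pmap_change_attr_locked(), since a
 * PDPE keeps its cache mode bits in the same positions.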
*/ 6294 static __inline void 6295 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 6296 { 6297 u_int opde, npde; 6298 6299 /* 6300 * The cache mode bits are all in the low 32-bits of the 6301 * PDE, so we can just spin on updating the low 32-bits. 6302 */ 6303 do { 6304 opde = *(u_int *)pde; 6305 npde = opde & ~mask; 6306 npde |= cache_bits; 6307 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 6308 } 6309 6310 /* 6311 * Map a set of physical memory pages into the kernel virtual 6312 * address space. Return a pointer to where it is mapped. This 6313 * routine is intended to be used for mapping device memory, 6314 * NOT real memory. 6315 */ 6316 void * 6317 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 6318 { 6319 struct pmap_preinit_mapping *ppim; 6320 vm_offset_t va, offset; 6321 vm_size_t tmpsize; 6322 int i; 6323 6324 offset = pa & PAGE_MASK; 6325 size = round_page(offset + size); 6326 pa = trunc_page(pa); 6327 6328 if (!pmap_initialized) { 6329 va = 0; 6330 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6331 ppim = pmap_preinit_mapping + i; 6332 if (ppim->va == 0) { 6333 ppim->pa = pa; 6334 ppim->sz = size; 6335 ppim->mode = mode; 6336 ppim->va = virtual_avail; 6337 virtual_avail += size; 6338 va = ppim->va; 6339 break; 6340 } 6341 } 6342 if (va == 0) 6343 panic("%s: too many preinit mappings", __func__); 6344 } else { 6345 /* 6346 * If we have a preinit mapping, re-use it. 6347 */ 6348 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6349 ppim = pmap_preinit_mapping + i; 6350 if (ppim->pa == pa && ppim->sz == size && 6351 ppim->mode == mode) 6352 return ((void *)(ppim->va + offset)); 6353 } 6354 /* 6355 * If the specified range of physical addresses fits within 6356 * the direct map window, use the direct map. 6357 */ 6358 if (pa < dmaplimit && pa + size < dmaplimit) { 6359 va = PHYS_TO_DMAP(pa); 6360 if (!pmap_change_attr(va, size, mode)) 6361 return ((void *)(va + offset)); 6362 } 6363 va = kva_alloc(size); 6364 if (va == 0) 6365 panic("%s: Couldn't allocate KVA", __func__); 6366 } 6367 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6368 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 6369 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 6370 pmap_invalidate_cache_range(va, va + tmpsize, FALSE); 6371 return ((void *)(va + offset)); 6372 } 6373 6374 void * 6375 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 6376 { 6377 6378 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 6379 } 6380 6381 void * 6382 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6383 { 6384 6385 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 6386 } 6387 6388 void 6389 pmap_unmapdev(vm_offset_t va, vm_size_t size) 6390 { 6391 struct pmap_preinit_mapping *ppim; 6392 vm_offset_t offset; 6393 int i; 6394 6395 /* If we gave a direct map region in pmap_mapdev, do nothing */ 6396 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 6397 return; 6398 offset = va & PAGE_MASK; 6399 size = round_page(offset + size); 6400 va = trunc_page(va); 6401 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 6402 ppim = pmap_preinit_mapping + i; 6403 if (ppim->va == va && ppim->sz == size) { 6404 if (pmap_initialized) 6405 return; 6406 ppim->pa = 0; 6407 ppim->va = 0; 6408 ppim->sz = 0; 6409 ppim->mode = 0; 6410 if (va + size == virtual_avail) 6411 virtual_avail = va; 6412 return; 6413 } 6414 } 6415 if (pmap_initialized) 6416 kva_free(va, size); 6417 } 6418 6419 /* 6420 * Tries to demote a 1GB page mapping. 
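 * The demotion fails, returning FALSE, if the page directory page needed
 * to hold the resulting 512 2MB page mappings cannot be allocated.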
6421 */ 6422 static boolean_t 6423 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 6424 { 6425 pdp_entry_t newpdpe, oldpdpe; 6426 pd_entry_t *firstpde, newpde, *pde; 6427 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6428 vm_paddr_t mpdepa; 6429 vm_page_t mpde; 6430 6431 PG_A = pmap_accessed_bit(pmap); 6432 PG_M = pmap_modified_bit(pmap); 6433 PG_V = pmap_valid_bit(pmap); 6434 PG_RW = pmap_rw_bit(pmap); 6435 6436 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6437 oldpdpe = *pdpe; 6438 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 6439 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 6440 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 6441 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 6442 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 6443 " in pmap %p", va, pmap); 6444 return (FALSE); 6445 } 6446 mpdepa = VM_PAGE_TO_PHYS(mpde); 6447 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); 6448 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 6449 KASSERT((oldpdpe & PG_A) != 0, 6450 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 6451 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 6452 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 6453 newpde = oldpdpe; 6454 6455 /* 6456 * Initialize the page directory page. 6457 */ 6458 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 6459 *pde = newpde; 6460 newpde += NBPDR; 6461 } 6462 6463 /* 6464 * Demote the mapping. 6465 */ 6466 *pdpe = newpdpe; 6467 6468 /* 6469 * Invalidate a stale recursive mapping of the page directory page. 6470 */ 6471 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 6472 6473 pmap_pdpe_demotions++; 6474 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 6475 " in pmap %p", va, pmap); 6476 return (TRUE); 6477 } 6478 6479 /* 6480 * Sets the memory attribute for the specified page. 6481 */ 6482 void 6483 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6484 { 6485 6486 m->md.pat_mode = ma; 6487 6488 /* 6489 * If "m" is a normal page, update its direct mapping. This update 6490 * can be relied upon to perform any cache operations that are 6491 * required for data coherence. 6492 */ 6493 if ((m->flags & PG_FICTITIOUS) == 0 && 6494 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6495 m->md.pat_mode)) 6496 panic("memory attribute change on the direct map failed"); 6497 } 6498 6499 /* 6500 * Changes the specified virtual address range's memory type to that given by 6501 * the parameter "mode". The specified virtual address range must be 6502 * completely contained within either the direct map or the kernel map. If 6503 * the virtual address range is contained within the kernel map, then the 6504 * memory type for each of the corresponding ranges of the direct map is also 6505 * changed. (The corresponding ranges of the direct map are those ranges that 6506 * map the same physical pages as the specified virtual address range.) These 6507 * changes to the direct map are necessary because Intel describes the 6508 * behavior of their processors as "undefined" if two or more mappings to the 6509 * same physical page have different memory types. 6510 * 6511 * Returns zero if the change completed successfully, and either EINVAL or 6512 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6513 * of the virtual address range was not mapped, and ENOMEM is returned if 6514 * there was insufficient memory available to complete the change. 
In the 6515 * latter case, the memory type may have been changed on some part of the 6516 * virtual address range or the direct map. 6517 */ 6518 int 6519 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6520 { 6521 int error; 6522 6523 PMAP_LOCK(kernel_pmap); 6524 error = pmap_change_attr_locked(va, size, mode); 6525 PMAP_UNLOCK(kernel_pmap); 6526 return (error); 6527 } 6528 6529 static int 6530 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 6531 { 6532 vm_offset_t base, offset, tmpva; 6533 vm_paddr_t pa_start, pa_end, pa_end1; 6534 pdp_entry_t *pdpe; 6535 pd_entry_t *pde; 6536 pt_entry_t *pte; 6537 int cache_bits_pte, cache_bits_pde, error; 6538 boolean_t changed; 6539 6540 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6541 base = trunc_page(va); 6542 offset = va & PAGE_MASK; 6543 size = round_page(offset + size); 6544 6545 /* 6546 * Only supported on kernel virtual addresses, including the direct 6547 * map but excluding the recursive map. 6548 */ 6549 if (base < DMAP_MIN_ADDRESS) 6550 return (EINVAL); 6551 6552 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 6553 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 6554 changed = FALSE; 6555 6556 /* 6557 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6558 * into 4KB pages if required. 6559 */ 6560 for (tmpva = base; tmpva < base + size; ) { 6561 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6562 if (pdpe == NULL || *pdpe == 0) 6563 return (EINVAL); 6564 if (*pdpe & PG_PS) { 6565 /* 6566 * If the current 1GB page already has the required 6567 * memory type, then we need not demote this page. Just 6568 * increment tmpva to the next 1GB page frame. 6569 */ 6570 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 6571 tmpva = trunc_1gpage(tmpva) + NBPDP; 6572 continue; 6573 } 6574 6575 /* 6576 * If the current offset aligns with a 1GB page frame 6577 * and there is at least 1GB left within the range, then 6578 * we need not break down this page into 2MB pages. 6579 */ 6580 if ((tmpva & PDPMASK) == 0 && 6581 tmpva + PDPMASK < base + size) { 6582 tmpva += NBPDP; 6583 continue; 6584 } 6585 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 6586 return (ENOMEM); 6587 } 6588 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6589 if (*pde == 0) 6590 return (EINVAL); 6591 if (*pde & PG_PS) { 6592 /* 6593 * If the current 2MB page already has the required 6594 * memory type, then we need not demote this page. Just 6595 * increment tmpva to the next 2MB page frame. 6596 */ 6597 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 6598 tmpva = trunc_2mpage(tmpva) + NBPDR; 6599 continue; 6600 } 6601 6602 /* 6603 * If the current offset aligns with a 2MB page frame 6604 * and there is at least 2MB left within the range, then 6605 * we need not break down this page into 4KB pages. 6606 */ 6607 if ((tmpva & PDRMASK) == 0 && 6608 tmpva + PDRMASK < base + size) { 6609 tmpva += NBPDR; 6610 continue; 6611 } 6612 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 6613 return (ENOMEM); 6614 } 6615 pte = pmap_pde_to_pte(pde, tmpva); 6616 if (*pte == 0) 6617 return (EINVAL); 6618 tmpva += PAGE_SIZE; 6619 } 6620 error = 0; 6621 6622 /* 6623 * Ok, all the pages exist, so run through them updating their 6624 * cache mode if required. 
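	 * While doing so, coalesce the kernel map pages into runs of
	 * contiguous physical addresses and recursively apply the same mode
	 * change to each run's direct map range.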
6625 */ 6626 pa_start = pa_end = 0; 6627 for (tmpva = base; tmpva < base + size; ) { 6628 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6629 if (*pdpe & PG_PS) { 6630 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 6631 pmap_pde_attr(pdpe, cache_bits_pde, 6632 X86_PG_PDE_CACHE); 6633 changed = TRUE; 6634 } 6635 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6636 (*pdpe & PG_PS_FRAME) < dmaplimit) { 6637 if (pa_start == pa_end) { 6638 /* Start physical address run. */ 6639 pa_start = *pdpe & PG_PS_FRAME; 6640 pa_end = pa_start + NBPDP; 6641 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 6642 pa_end += NBPDP; 6643 else { 6644 /* Run ended, update direct map. */ 6645 error = pmap_change_attr_locked( 6646 PHYS_TO_DMAP(pa_start), 6647 pa_end - pa_start, mode); 6648 if (error != 0) 6649 break; 6650 /* Start physical address run. */ 6651 pa_start = *pdpe & PG_PS_FRAME; 6652 pa_end = pa_start + NBPDP; 6653 } 6654 } 6655 tmpva = trunc_1gpage(tmpva) + NBPDP; 6656 continue; 6657 } 6658 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6659 if (*pde & PG_PS) { 6660 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 6661 pmap_pde_attr(pde, cache_bits_pde, 6662 X86_PG_PDE_CACHE); 6663 changed = TRUE; 6664 } 6665 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6666 (*pde & PG_PS_FRAME) < dmaplimit) { 6667 if (pa_start == pa_end) { 6668 /* Start physical address run. */ 6669 pa_start = *pde & PG_PS_FRAME; 6670 pa_end = pa_start + NBPDR; 6671 } else if (pa_end == (*pde & PG_PS_FRAME)) 6672 pa_end += NBPDR; 6673 else { 6674 /* Run ended, update direct map. */ 6675 error = pmap_change_attr_locked( 6676 PHYS_TO_DMAP(pa_start), 6677 pa_end - pa_start, mode); 6678 if (error != 0) 6679 break; 6680 /* Start physical address run. */ 6681 pa_start = *pde & PG_PS_FRAME; 6682 pa_end = pa_start + NBPDR; 6683 } 6684 } 6685 tmpva = trunc_2mpage(tmpva) + NBPDR; 6686 } else { 6687 pte = pmap_pde_to_pte(pde, tmpva); 6688 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 6689 pmap_pte_attr(pte, cache_bits_pte, 6690 X86_PG_PTE_CACHE); 6691 changed = TRUE; 6692 } 6693 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6694 (*pte & PG_PS_FRAME) < dmaplimit) { 6695 if (pa_start == pa_end) { 6696 /* Start physical address run. */ 6697 pa_start = *pte & PG_FRAME; 6698 pa_end = pa_start + PAGE_SIZE; 6699 } else if (pa_end == (*pte & PG_FRAME)) 6700 pa_end += PAGE_SIZE; 6701 else { 6702 /* Run ended, update direct map. */ 6703 error = pmap_change_attr_locked( 6704 PHYS_TO_DMAP(pa_start), 6705 pa_end - pa_start, mode); 6706 if (error != 0) 6707 break; 6708 /* Start physical address run. */ 6709 pa_start = *pte & PG_FRAME; 6710 pa_end = pa_start + PAGE_SIZE; 6711 } 6712 } 6713 tmpva += PAGE_SIZE; 6714 } 6715 } 6716 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 6717 pa_end1 = MIN(pa_end, dmaplimit); 6718 if (pa_start != pa_end1) 6719 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 6720 pa_end1 - pa_start, mode); 6721 } 6722 6723 /* 6724 * Flush CPU caches if required to make sure any data isn't cached that 6725 * shouldn't be, etc. 6726 */ 6727 if (changed) { 6728 pmap_invalidate_range(kernel_pmap, base, tmpva); 6729 pmap_invalidate_cache_range(base, tmpva, FALSE); 6730 } 6731 return (error); 6732 } 6733 6734 /* 6735 * Demotes any mapping within the direct map region that covers more than the 6736 * specified range of physical addresses. This range's size must be a power 6737 * of two and its starting address must be a multiple of its size. Since the 6738 * demotion does not change any attributes of the mapping, a TLB invalidation 6739 * is not mandatory. 
The caller may, however, request a TLB invalidation. 6740 */ 6741 void 6742 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 6743 { 6744 pdp_entry_t *pdpe; 6745 pd_entry_t *pde; 6746 vm_offset_t va; 6747 boolean_t changed; 6748 6749 if (len == 0) 6750 return; 6751 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 6752 KASSERT((base & (len - 1)) == 0, 6753 ("pmap_demote_DMAP: base is not a multiple of len")); 6754 if (len < NBPDP && base < dmaplimit) { 6755 va = PHYS_TO_DMAP(base); 6756 changed = FALSE; 6757 PMAP_LOCK(kernel_pmap); 6758 pdpe = pmap_pdpe(kernel_pmap, va); 6759 if ((*pdpe & X86_PG_V) == 0) 6760 panic("pmap_demote_DMAP: invalid PDPE"); 6761 if ((*pdpe & PG_PS) != 0) { 6762 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 6763 panic("pmap_demote_DMAP: PDPE failed"); 6764 changed = TRUE; 6765 } 6766 if (len < NBPDR) { 6767 pde = pmap_pdpe_to_pde(pdpe, va); 6768 if ((*pde & X86_PG_V) == 0) 6769 panic("pmap_demote_DMAP: invalid PDE"); 6770 if ((*pde & PG_PS) != 0) { 6771 if (!pmap_demote_pde(kernel_pmap, pde, va)) 6772 panic("pmap_demote_DMAP: PDE failed"); 6773 changed = TRUE; 6774 } 6775 } 6776 if (changed && invalidate) 6777 pmap_invalidate_page(kernel_pmap, va); 6778 PMAP_UNLOCK(kernel_pmap); 6779 } 6780 } 6781 6782 /* 6783 * perform the pmap work for mincore 6784 */ 6785 int 6786 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6787 { 6788 pd_entry_t *pdep; 6789 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 6790 vm_paddr_t pa; 6791 int val; 6792 6793 PG_A = pmap_accessed_bit(pmap); 6794 PG_M = pmap_modified_bit(pmap); 6795 PG_V = pmap_valid_bit(pmap); 6796 PG_RW = pmap_rw_bit(pmap); 6797 6798 PMAP_LOCK(pmap); 6799 retry: 6800 pdep = pmap_pde(pmap, addr); 6801 if (pdep != NULL && (*pdep & PG_V)) { 6802 if (*pdep & PG_PS) { 6803 pte = *pdep; 6804 /* Compute the physical address of the 4KB page. */ 6805 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 6806 PG_FRAME; 6807 val = MINCORE_SUPER; 6808 } else { 6809 pte = *pmap_pde_to_pte(pdep, addr); 6810 pa = pte & PG_FRAME; 6811 val = 0; 6812 } 6813 } else { 6814 pte = 0; 6815 pa = 0; 6816 val = 0; 6817 } 6818 if ((pte & PG_V) != 0) { 6819 val |= MINCORE_INCORE; 6820 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6821 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6822 if ((pte & PG_A) != 0) 6823 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6824 } 6825 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6826 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 6827 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 6828 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
 */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Allocate a per-CPU PCID for the given pmap.  The previously allocated
 * PCID is reused, and CR3_PCID_SAVE returned, when the pmap's generation
 * matches this CPU's current generation.  When the PCID namespace is
 * exhausted, the generation is advanced, which implicitly invalidates every
 * PCID previously handed out on this CPU.
 */
static uint64_t
pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
{
	uint32_t gen, new_gen, pcid_next;

	CRITICAL_ASSERT(curthread);
	gen = PCPU_GET(pcid_gen);
	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
	    pmap->pm_pcids[cpuid].pm_gen == gen)
		return (CR3_PCID_SAVE);
	pcid_next = PCPU_GET(pcid_next);
	KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
	    cpuid, pcid_next));
	if (pcid_next == PMAP_PCID_OVERMAX) {
		new_gen = gen + 1;
		if (new_gen == 0)
			new_gen = 1;
		PCPU_SET(pcid_gen, new_gen);
		pcid_next = PMAP_PCID_KERN + 1;
	} else {
		new_gen = gen;
	}
	pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
	pmap->pm_pcids[cpuid].pm_gen = new_gen;
	PCPU_SET(pcid_next, pcid_next + 1);
	return (0);
}

void
pmap_activate_sw(struct thread *td)
{
	pmap_t oldpmap, pmap;
	uint64_t cached, cr3;
	register_t rflags;
	u_int cpuid;

	oldpmap = PCPU_GET(curpmap);
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (oldpmap == pmap)
		return;
	cpuid = PCPU_GET(cpuid);
#ifdef SMP
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_SET(cpuid, &pmap->pm_active);
#endif
	cr3 = rcr3();
	if (pmap_pcid_enabled) {
		cached = pmap_pcid_alloc(pmap, cpuid);
		KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
		    pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
		    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
		    pmap->pm_pcids[cpuid].pm_pcid));
		KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
		    pmap == kernel_pmap,
		    ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
		    td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));

		/*
		 * If the INVPCID instruction is not available,
		 * invltlb_pcid_handler() is used to handle the
		 * invalidate_all IPI, which checks for curpmap ==
		 * smp_tlb_pmap.  The sequence of operations below has
		 * a window where %CR3 is loaded with the new pmap's
		 * PML4 address, but the curpmap value is not yet
		 * updated.  This causes the invltlb IPI handler,
		 * called between the updates, to execute as a NOP,
		 * which leaves stale TLB entries.
		 *
		 * Note that the most typical use of
		 * pmap_activate_sw(), from the context switch, is
		 * immune to this race, because interrupts are
		 * disabled (while the thread lock is owned), and the
		 * IPI happens after curpmap is updated.  Protect
		 * other callers in a similar way, by disabling
		 * interrupts around the %cr3 register reload and
		 * curpmap assignment.
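		 *
		 * Concretely, the problematic interleaving is:
		 *	1. %cr3 is loaded with the new pmap's PML4;
		 *	2. the invalidate_all IPI arrives, and its handler
		 *	   finds curpmap != smp_tlb_pmap, so no flush is
		 *	   performed;
		 *	3. curpmap is updated, leaving stale TLB entries
		 *	   for the new pmap in place.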
6914 */ 6915 if (!invpcid_works) 6916 rflags = intr_disable(); 6917 6918 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { 6919 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | 6920 cached); 6921 if (cached) 6922 PCPU_INC(pm_save_cnt); 6923 } 6924 PCPU_SET(curpmap, pmap); 6925 if (!invpcid_works) 6926 intr_restore(rflags); 6927 } else if (cr3 != pmap->pm_cr3) { 6928 load_cr3(pmap->pm_cr3); 6929 PCPU_SET(curpmap, pmap); 6930 } 6931 #ifdef SMP 6932 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6933 #else 6934 CPU_CLR(cpuid, &oldpmap->pm_active); 6935 #endif 6936 } 6937 6938 void 6939 pmap_activate(struct thread *td) 6940 { 6941 6942 critical_enter(); 6943 pmap_activate_sw(td); 6944 critical_exit(); 6945 } 6946 6947 void 6948 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 6949 { 6950 } 6951 6952 /* 6953 * Increase the starting virtual address of the given mapping if a 6954 * different alignment might result in more superpage mappings. 6955 */ 6956 void 6957 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6958 vm_offset_t *addr, vm_size_t size) 6959 { 6960 vm_offset_t superpage_offset; 6961 6962 if (size < NBPDR) 6963 return; 6964 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6965 offset += ptoa(object->pg_color); 6966 superpage_offset = offset & PDRMASK; 6967 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 6968 (*addr & PDRMASK) == superpage_offset) 6969 return; 6970 if ((*addr & PDRMASK) < superpage_offset) 6971 *addr = (*addr & ~PDRMASK) + superpage_offset; 6972 else 6973 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 6974 } 6975 6976 #ifdef INVARIANTS 6977 static unsigned long num_dirty_emulations; 6978 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 6979 &num_dirty_emulations, 0, NULL); 6980 6981 static unsigned long num_accessed_emulations; 6982 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 6983 &num_accessed_emulations, 0, NULL); 6984 6985 static unsigned long num_superpage_accessed_emulations; 6986 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 6987 &num_superpage_accessed_emulations, 0, NULL); 6988 6989 static unsigned long ad_emulation_superpage_promotions; 6990 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 6991 &ad_emulation_superpage_promotions, 0, NULL); 6992 #endif /* INVARIANTS */ 6993 6994 int 6995 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 6996 { 6997 int rv; 6998 struct rwlock *lock; 6999 vm_page_t m, mpte; 7000 pd_entry_t *pde; 7001 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 7002 7003 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 7004 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 7005 7006 if (!pmap_emulate_ad_bits(pmap)) 7007 return (-1); 7008 7009 PG_A = pmap_accessed_bit(pmap); 7010 PG_M = pmap_modified_bit(pmap); 7011 PG_V = pmap_valid_bit(pmap); 7012 PG_RW = pmap_rw_bit(pmap); 7013 7014 rv = -1; 7015 lock = NULL; 7016 PMAP_LOCK(pmap); 7017 7018 pde = pmap_pde(pmap, va); 7019 if (pde == NULL || (*pde & PG_V) == 0) 7020 goto done; 7021 7022 if ((*pde & PG_PS) != 0) { 7023 if (ftype == VM_PROT_READ) { 7024 #ifdef INVARIANTS 7025 atomic_add_long(&num_superpage_accessed_emulations, 1); 7026 #endif 7027 *pde |= PG_A; 7028 rv = 0; 7029 } 7030 goto done; 7031 } 7032 7033 pte = pmap_pde_to_pte(pde, va); 7034 if ((*pte & PG_V) == 0) 7035 goto done; 7036 7037 if (ftype == VM_PROT_WRITE) { 7038 if ((*pte & PG_RW) == 0) 7039 goto done; 7040 /* 7041 * 
Set the modified and accessed bits simultaneously.
		 *
		 * Intel EPT PTEs that do software emulation of A/D bits map
		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE, respectively.
		 * An EPT misconfiguration is triggered if the PTE is writable
		 * but not readable (WR=10).  This is avoided by setting PG_A
		 * and PG_M simultaneously.
		 */
		*pte |= PG_M | PG_A;
	} else {
		*pte |= PG_A;
	}

	/* try to promote the mapping */
	if (va < VM_MAXUSER_ADDRESS)
		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	else
		mpte = NULL;

	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);

	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
	    pmap_ps_enabled(pmap) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0) {
		pmap_promote_pde(pmap, pde, va, &lock);
#ifdef INVARIANTS
		atomic_add_long(&ad_emulation_superpage_promotions, 1);
#endif
	}
#ifdef INVARIANTS
	if (ftype == VM_PROT_WRITE)
		atomic_add_long(&num_dirty_emulations, 1);
	else
		atomic_add_long(&num_accessed_emulations, 1);
#endif
	rv = 0;		/* success */
done:
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Copy the page table entries that translate "va", from the PML4 entry
 * down to the PTE, into the "ptr" array, and report via "*num" how many
 * levels were valid.
 */
void
pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
{
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	int idx;

	idx = 0;
	PG_V = pmap_valid_bit(pmap);
	PMAP_LOCK(pmap);

	pml4 = pmap_pml4e(pmap, va);
	ptr[idx++] = *pml4;
	if ((*pml4 & PG_V) == 0)
		goto done;

	pdp = pmap_pml4e_to_pdpe(pml4, va);
	ptr[idx++] = *pdp;
	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
		goto done;

	pde = pmap_pdpe_to_pde(pdp, va);
	ptr[idx++] = *pde;
	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
		goto done;

	pte = pmap_pde_to_pte(pde, va);
	ptr[idx++] = *pte;

done:
	PMAP_UNLOCK(pmap);
	*num = idx;
}

/**
 * Get the kernel virtual address of a set of physical pages.  If there are
 * physical addresses not covered by the DMAP, perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page		The pages for which the caller wishes to obtain
 *			virtual addresses in the kernel memory map.
 * \param vaddr		On return contains the kernel virtual memory address
 *			of the pages passed in the page parameter.
 * \param count		Number of pages passed in.
 * \param can_fault	TRUE if the thread using the mapped pages can take
 *			page faults, FALSE otherwise.
 *
 * \returns TRUE if the caller must call pmap_unmap_io_transient when
 *	    finished or FALSE otherwise.
 *
 */
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	boolean_t needs_mapping;
	pt_entry_t *pte;
	int cache_bits, error, i;

	/*
	 * Allocate any KVA space that we need; this is done in a separate
	 * loop to prevent calling vmem_alloc while pinned.
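	 * In particular, vmem_alloc() is called with M_WAITOK and so may
	 * sleep; keeping it out of the pinned section below avoids sleeping
	 * while the thread is pinned to a CPU.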
	 */
	needs_mapping = FALSE;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(paddr >= dmaplimit)) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = TRUE;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (FALSE);

	/*
	 * NB:  The sequence of updating a page table followed by accesses
	 * to the corresponding pages used in the !DMAP case is subject to
	 * the situation described in the "AMD64 Architecture Programmer's
	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
	 * Coherency Considerations".  Therefore, issuing the INVLPG right
	 * after modifying the PTE bits is crucial.
	 */
	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= dmaplimit) {
			if (can_fault) {
				/*
				 * Slow path: since we can take page faults
				 * while the mappings are active, don't pin
				 * the thread to the CPU; instead, add a
				 * global mapping visible to all CPUs.
				 */
				pmap_qenter(vaddr[i], &page[i], 1);
			} else {
				pte = vtopte(vaddr[i]);
				cache_bits = pmap_cache_bits(kernel_pmap,
				    page[i]->md.pat_mode, 0);
				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
				    cache_bits);
				invlpg(vaddr[i]);
			}
		}
	}

	return (needs_mapping);
}

void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (paddr >= dmaplimit) {
			if (can_fault)
				pmap_qremove(vaddr[i], 1);
			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
		}
	}
}

/*
 * Temporarily map a single page, using the direct map when the page's
 * physical address permits and otherwise a dedicated KVA frame ("qframe")
 * that is protected by a spin mutex.
 */
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
	vm_paddr_t paddr;

	paddr = VM_PAGE_TO_PHYS(m);
	if (paddr < dmaplimit)
		return (PHYS_TO_DMAP(paddr));
	mtx_lock_spin(&qframe_mtx);
	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
	return (qframe);
}

void
pmap_quick_remove_page(vm_offset_t addr)
{

	if (addr != qframe)
		return;
	pte_store(vtopte(qframe), 0);
	invlpg(qframe);
	mtx_unlock_spin(&qframe_mtx);
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>

/*
 * Print each level of the page table walk for the given virtual address,
 * using the pmap of the thread being debugged when one is known.
 */
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	pmap_t pmap;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pde;
	pt_entry_t *pte, PG_V;
	vm_offset_t va;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);
	pml4 = pmap_pml4e(pmap, va);
	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
	if ((*pml4 & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	pdp = pmap_pml4e_to_pdpe(pml4, va);
	db_printf(" pdpe %#016lx", *pdp);
	if ((*pdp & PG_V) == 0 || (*pdp
& PG_PS) != 0) { 7283 db_printf("\n"); 7284 return; 7285 } 7286 pde = pmap_pdpe_to_pde(pdp, va); 7287 db_printf(" pde %#016lx", *pde); 7288 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 7289 db_printf("\n"); 7290 return; 7291 } 7292 pte = pmap_pde_to_pte(pde, va); 7293 db_printf(" pte %#016lx\n", *pte); 7294 } 7295 7296 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 7297 { 7298 vm_paddr_t a; 7299 7300 if (have_addr) { 7301 a = (vm_paddr_t)addr; 7302 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 7303 } else { 7304 db_printf("show phys2dmap addr\n"); 7305 } 7306 } 7307 #endif 7308
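
/*
 * Example DDB usage of the commands above (an illustrative sketch, not
 * output captured from a real session; the address is hypothetical and the
 * values printed follow the db_printf() formats in pmap_print_pte() and
 * pmap_phys2dmap()):
 *
 *	db> show pte 0xffffffff80201000
 *	VA 0xffffffff80201000 pml4e 0x... pdpe 0x... pde 0x... pte 0x...
 *
 *	db> show phys2dmap 0x200000
 *	0x... (the DMAP address returned by PHYS_TO_DMAP(0x200000))
 */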