1 /* $OpenBSD: pmap.c,v 1.141 2020/12/16 21:11:35 bluhm Exp $ */ 2 /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1997 Charles D. Cranor and Washington University. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Copyright 2001 (c) Wasabi Systems, Inc. 31 * All rights reserved. 32 * 33 * Written by Frank van der Linden for Wasabi Systems, Inc. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgement: 45 * This product includes software developed for the NetBSD Project by 46 * Wasabi Systems, Inc. 47 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 48 * or promote products derived from this software without specific prior 49 * written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 53 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 54 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 55 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 56 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 57 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 58 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 59 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 60 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 61 * POSSIBILITY OF SUCH DAMAGE. 62 */ 63 64 /* 65 * This is the i386 pmap modified and generalized to support x86-64 66 * as well. 
The idea is to hide the upper N levels of the page tables 67 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 68 * is mostly untouched, except that it uses some more generalized 69 * macros and interfaces. 70 * 71 * This pmap has been tested on the i386 as well, and it can be easily 72 * adapted to PAE. 73 * 74 * fvdl@wasabisystems.com 18-Jun-2001 75 */ 76 77 /* 78 * pmap.c: i386 pmap module rewrite 79 * Chuck Cranor <chuck@ccrc.wustl.edu> 80 * 11-Aug-97 81 * 82 * history of this pmap module: in addition to my own input, i used 83 * the following references for this rewrite of the i386 pmap: 84 * 85 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 86 * BSD hp300 pmap done by Mike Hibler at University of Utah. 87 * it was then ported to the i386 by William Jolitz of UUNET 88 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 89 * project fixed some bugs and provided some speed ups. 90 * 91 * [2] the FreeBSD i386 pmap. this pmap seems to be the 92 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 93 * and David Greenman. 94 * 95 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 96 * between several processors. the VAX version was done by 97 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 98 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 99 * David Golub, and Richard Draves. the alpha version was 100 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 101 * (NetBSD/alpha). 102 */ 103 104 #include <sys/param.h> 105 #include <sys/systm.h> 106 #include <sys/atomic.h> 107 #include <sys/proc.h> 108 #include <sys/pool.h> 109 #include <sys/user.h> 110 #include <sys/mutex.h> 111 112 #include <uvm/uvm.h> 113 114 #include <machine/cpu.h> 115 #ifdef MULTIPROCESSOR 116 #include <machine/i82489reg.h> 117 #include <machine/i82489var.h> 118 #endif 119 120 #include "vmm.h" 121 122 #if NVMM > 0 123 #include <machine/vmmvar.h> 124 #endif /* NVMM > 0 */ 125 126 #include "acpi.h" 127 128 /* #define PMAP_DEBUG */ 129 130 #ifdef PMAP_DEBUG 131 #define DPRINTF(x...) do { printf(x); } while(0) 132 #else 133 #define DPRINTF(x...) 134 #endif /* PMAP_DEBUG */ 135 136 137 /* 138 * general info: 139 * 140 * - for an explanation of how the i386 MMU hardware works see 141 * the comments in <machine/pte.h>. 142 * 143 * - for an explanation of the general memory structure used by 144 * this pmap (including the recursive mapping), see the comments 145 * in <machine/pmap.h>. 146 * 147 * this file contains the code for the "pmap module." the module's 148 * job is to manage the hardware's virtual to physical address mappings. 149 * note that there are two levels of mapping in the VM system: 150 * 151 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 152 * to map ranges of virtual address space to objects/files. for 153 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 154 * to the file /bin/ls starting at offset zero." note that 155 * the upper layer mapping is not concerned with how individual 156 * vm_pages are mapped. 157 * 158 * [2] the lower layer of the VM system (the pmap) maintains the mappings 159 * from virtual addresses. it is concerned with which vm_page is 160 * mapped where. for example, when you run /bin/ls and start 161 * at page 0x1000 the fault routine may lookup the correct page 162 * of the /bin/ls file and then ask the pmap layer to establish 163 * a mapping for it. 
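 *
 *     as a rough sketch of [2] (exact arguments and flags vary by
 *     caller; "map", "va" and "pg" here are just placeholder names),
 *     that request into this module looks something like:
 *
 *         error = pmap_enter(map->pmap, trunc_page(va),
 *             VM_PAGE_TO_PHYS(pg), prot, PMAP_CANFAIL);
 *
 *     i.e. the pmap layer is only told "map this physical page at this
 *     virtual address with this protection"; it never sees the backing
 *     object or file.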
164 * 165 * note that information in the lower layer of the VM system can be 166 * thrown away since it can easily be reconstructed from the info 167 * in the upper layer. 168 * 169 * data structures we use include: 170 * - struct pmap: describes the address space of one process 171 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 172 * - struct pg_to_free: a list of virtual addresses whose mappings 173 * have been changed. used for TLB flushing. 174 */ 175 176 /* 177 * memory allocation 178 * 179 * - there are three data structures that we must dynamically allocate: 180 * 181 * [A] new process' page directory page (PDP) 182 * - plan 1: done at pmap_create() we use 183 * pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation. 184 * 185 * if we are low in free physical memory then we sleep in 186 * pool_get() -- in this case this is ok since we are creating 187 * a new pmap and should not be holding any locks. 188 * 189 * XXX: the fork code currently has no way to return an "out of 190 * memory, try again" error code since uvm_fork [fka vm_fork] 191 * is a void function. 192 * 193 * [B] new page tables pages (PTP) 194 * call uvm_pagealloc() 195 * => success: zero page, add to pm_pdir 196 * => failure: we are out of free vm_pages, let pmap_enter() 197 * tell UVM about it. 198 * 199 * note: for kernel PTPs, we start with NKPTP of them. as we map 200 * kernel memory (at uvm_map time) we check to see if we've grown 201 * the kernel pmap. if so, we call the optional function 202 * pmap_growkernel() to grow the kernel PTPs in advance. 203 * 204 * [C] pv_entry structures 205 * - try to allocate one from the pool. 206 * If we fail, we simply let pmap_enter() tell UVM about it. 207 */ 208 209 long nkptp[] = NKPTP_INITIALIZER; 210 211 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 212 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 213 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 214 const long nbpd[] = NBPD_INITIALIZER; 215 pd_entry_t *const normal_pdes[] = PDES_INITIALIZER; 216 217 #define pmap_pte_set(p, n) atomic_swap_64(p, n) 218 #define pmap_pte_clearbits(p, b) x86_atomic_clearbits_u64(p, b) 219 #define pmap_pte_setbits(p, b) x86_atomic_setbits_u64(p, b) 220 221 /* 222 * global data structures 223 */ 224 225 struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 226 227 /* 228 * pmap_pg_wc: if our processor supports PAT then we set this 229 * to be the pte bits for Write Combining. Else we fall back to 230 * UC- so mtrrs can override the cacheability; 231 */ 232 int pmap_pg_wc = PG_UCMINUS; 233 234 /* 235 * pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID) 236 * 237 * The next three are zero unless and until PCID support is enabled so code 238 * can just 'or' them in as needed without tests. 239 * cr3_pcid: CR3_REUSE_PCID 240 * cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP 241 */ 242 #if PCID_KERN != 0 243 # error "pmap.c assumes PCID_KERN is zero" 244 #endif 245 int pmap_use_pcid; 246 static u_int cr3_pcid_proc; 247 static u_int cr3_pcid_temp; 248 /* these two are accessed from locore.o */ 249 paddr_t cr3_reuse_pcid; 250 paddr_t cr3_pcid_proc_intel; 251 252 /* 253 * other data structures 254 */ 255 256 pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 257 int pmap_initialized = 0; /* pmap_init done yet? */ 258 259 /* 260 * pv management structures. 
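 *
 * each managed vm_page carries a singly linked list of pv_entry
 * structures, one for every <PMAP,VA> that currently maps it; roughly:
 *
 *     pg->mdpage.pv_list --> { pv_pmap, pv_va, pv_ptp, pv_next } --> ...
 *
 * the list head lives in pg->mdpage and is protected by
 * pg->mdpage.pv_mtx; the entries themselves come from pmap_pv_pool,
 * declared below.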
261 */ 262 struct pool pmap_pv_pool; 263 264 /* 265 * linked list of all non-kernel pmaps 266 */ 267 268 struct pmap_head pmaps; 269 270 /* 271 * pool that pmap structures are allocated from 272 */ 273 274 struct pool pmap_pmap_pool; 275 276 /* 277 * When we're freeing a ptp, we need to delay the freeing until all 278 * tlb shootdown has been done. This is the list of the to-be-freed pages. 279 */ 280 TAILQ_HEAD(pg_to_free, vm_page); 281 282 /* 283 * pool that PDPs are allocated from 284 */ 285 286 struct pool pmap_pdp_pool; 287 void pmap_pdp_ctor(pd_entry_t *); 288 void pmap_pdp_ctor_intel(pd_entry_t *); 289 290 extern vaddr_t msgbuf_vaddr; 291 extern paddr_t msgbuf_paddr; 292 293 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 294 extern paddr_t idt_paddr; 295 296 extern vaddr_t lo32_vaddr; 297 extern vaddr_t lo32_paddr; 298 299 vaddr_t virtual_avail; 300 extern int end; 301 302 /* 303 * local prototypes 304 */ 305 306 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *, 307 vaddr_t, struct vm_page *); 308 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t); 309 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 310 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs); 311 void pmap_free_ptp(struct pmap *, struct vm_page *, 312 vaddr_t, struct pg_to_free *); 313 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *); 314 #ifdef MULTIPROCESSOR 315 static int pmap_is_active(struct pmap *, int); 316 #endif 317 paddr_t pmap_map_ptes(struct pmap *); 318 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t); 319 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 320 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t); 321 void pmap_do_remove_ept(struct pmap *, vaddr_t); 322 int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t); 323 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 324 vaddr_t, int, struct pv_entry **); 325 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, 326 vaddr_t, vaddr_t, int, struct pv_entry **); 327 #define PMAP_REMOVE_ALL 0 /* remove all mappings */ 328 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 329 330 void pmap_unmap_ptes(struct pmap *, paddr_t); 331 int pmap_get_physpage(vaddr_t, int, paddr_t *); 332 int pmap_pdes_valid(vaddr_t, pd_entry_t *); 333 void pmap_alloc_level(vaddr_t, int, long *); 334 335 static inline 336 void pmap_sync_flags_pte(struct vm_page *, u_long); 337 338 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int); 339 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int); 340 void pmap_tlb_shoottlb(struct pmap *, int); 341 #ifdef MULTIPROCESSOR 342 void pmap_tlb_shootwait(void); 343 #else 344 #define pmap_tlb_shootwait() 345 #endif 346 347 /* 348 * p m a p i n l i n e h e l p e r f u n c t i o n s 349 */ 350 351 /* 352 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 353 * of course the kernel is always loaded 354 */ 355 356 static __inline int 357 pmap_is_curpmap(struct pmap *pmap) 358 { 359 return((pmap == pmap_kernel()) || 360 (pmap->pm_pdirpa == (rcr3() & CR3_PADDR))); 361 } 362 363 /* 364 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 
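 *	(tracked via the pm_cpus bitmask of CPUs the pmap is loaded on;
 *	the kernel's pmap is considered active on every CPU)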
365 */ 366 367 #ifdef MULTIPROCESSOR 368 static __inline int 369 pmap_is_active(struct pmap *pmap, int cpu_id) 370 { 371 return (pmap == pmap_kernel() || 372 (pmap->pm_cpus & (1ULL << cpu_id)) != 0); 373 } 374 #endif 375 376 static __inline u_int 377 pmap_pte2flags(u_long pte) 378 { 379 return (((pte & PG_U) ? PG_PMAP_REF : 0) | 380 ((pte & PG_M) ? PG_PMAP_MOD : 0)); 381 } 382 383 static inline void 384 pmap_sync_flags_pte(struct vm_page *pg, u_long pte) 385 { 386 if (pte & (PG_U|PG_M)) { 387 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte)); 388 } 389 } 390 391 /* 392 * pmap_map_ptes: map a pmap's PTEs into KVM 393 * 394 * This should not be done for EPT pmaps 395 */ 396 paddr_t 397 pmap_map_ptes(struct pmap *pmap) 398 { 399 paddr_t cr3; 400 401 KASSERT(pmap->pm_type != PMAP_TYPE_EPT); 402 403 /* the kernel's pmap is always accessible */ 404 if (pmap == pmap_kernel()) 405 return 0; 406 407 /* 408 * Lock the target map before switching to its page tables to 409 * guarantee other CPUs have finished changing the tables before 410 * we potentially start caching table and TLB entries. 411 */ 412 mtx_enter(&pmap->pm_mtx); 413 414 cr3 = rcr3(); 415 KASSERT((cr3 & CR3_PCID) == PCID_KERN || 416 (cr3 & CR3_PCID) == PCID_PROC); 417 if (pmap->pm_pdirpa == (cr3 & CR3_PADDR)) 418 cr3 = 0; 419 else { 420 cr3 |= cr3_reuse_pcid; 421 lcr3(pmap->pm_pdirpa | cr3_pcid_temp); 422 } 423 424 return cr3; 425 } 426 427 void 428 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3) 429 { 430 if (pmap != pmap_kernel()) 431 mtx_leave(&pmap->pm_mtx); 432 433 if (save_cr3 != 0) 434 lcr3(save_cr3); 435 } 436 437 int 438 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs) 439 { 440 u_long mask, shift; 441 pd_entry_t pde; 442 paddr_t pdpa; 443 int lev; 444 445 pdpa = pm->pm_pdirpa; 446 shift = L4_SHIFT; 447 mask = L4_MASK; 448 for (lev = PTP_LEVELS; lev > 0; lev--) { 449 *pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa); 450 *offs = (VA_SIGN_POS(va) & mask) >> shift; 451 pde = (*pd)[*offs]; 452 453 /* Large pages are different, break early if we run into one. */ 454 if ((pde & (PG_PS|PG_V)) != PG_V) 455 return (lev - 1); 456 457 pdpa = ((*pd)[*offs] & PG_FRAME); 458 /* 4096/8 == 512 == 2^9 entries per level */ 459 shift -= 9; 460 mask >>= 9; 461 } 462 463 return (0); 464 } 465 466 /* 467 * p m a p k e n t e r f u n c t i o n s 468 * 469 * functions to quickly enter/remove pages from the kernel address 470 * space. pmap_kremove is exported to MI kernel. we make use of 471 * the recursive PTE mappings. 472 */ 473 474 /* 475 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 476 * 477 * => no need to lock anything, assume va is already allocated 478 * => should be faster than normal pmap enter function 479 */ 480 481 void 482 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 483 { 484 pt_entry_t *pte, opte, npte; 485 486 pte = kvtopte(va); 487 488 npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) | 489 ((pa & PMAP_NOCACHE) ? PG_N : 0) | 490 ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V; 491 492 /* special 1:1 mappings in the first 2MB must not be global */ 493 if (va >= (vaddr_t)NBPD_L2) 494 npte |= pg_g_kern; 495 496 if (!(prot & PROT_EXEC)) 497 npte |= pg_nx; 498 opte = pmap_pte_set(pte, npte); 499 #ifdef LARGEPAGES 500 /* XXX For now... 
*/ 501 if (opte & PG_PS) 502 panic("%s: PG_PS", __func__); 503 #endif 504 if (pmap_valid_entry(opte)) { 505 if (pa & PMAP_NOCACHE && (opte & PG_N) == 0) 506 wbinvd_on_all_cpus(); 507 /* This shouldn't happen */ 508 pmap_tlb_shootpage(pmap_kernel(), va, 1); 509 pmap_tlb_shootwait(); 510 } 511 } 512 513 /* 514 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 515 * 516 * => no need to lock anything 517 * => caller must dispose of any vm_page mapped in the va range 518 * => note: not an inline function 519 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 520 * => we assume kernel only unmaps valid addresses and thus don't bother 521 * checking the valid bit before doing TLB flushing 522 */ 523 524 void 525 pmap_kremove(vaddr_t sva, vsize_t len) 526 { 527 pt_entry_t *pte, opte; 528 vaddr_t va, eva; 529 530 eva = sva + len; 531 532 for (va = sva; va != eva; va += PAGE_SIZE) { 533 pte = kvtopte(va); 534 535 opte = pmap_pte_set(pte, 0); 536 #ifdef LARGEPAGES 537 KASSERT((opte & PG_PS) == 0); 538 #endif 539 KASSERT((opte & PG_PVLIST) == 0); 540 } 541 542 pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1); 543 pmap_tlb_shootwait(); 544 } 545 546 /* 547 * pmap_set_pml4_early 548 * 549 * Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned 550 * is the pml4 entry for 'early mappings' (see pmap.h). This function is used 551 * by display drivers that need to map their framebuffers early, before the 552 * pmap is fully initialized (eg, to show panic messages). 553 * 554 * Users of this function must call pmap_clear_pml4_early to remove the 555 * mapping when finished. 556 * 557 * Parameters: 558 * pa: phys addr to map 559 * 560 * Return value: 561 * VA mapping to 'pa'. This mapping is 2GB in size and starts at the base 562 * of the 2MB region containing 'va'. 563 */ 564 vaddr_t 565 pmap_set_pml4_early(paddr_t pa) 566 { 567 extern paddr_t early_pte_pages; 568 pt_entry_t *pml4e, *pte; 569 int i, j, off; 570 paddr_t curpa; 571 vaddr_t va; 572 573 pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE); 574 pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW; 575 576 off = pa & PAGE_MASK_L2; 577 curpa = pa & L2_FRAME; 578 579 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages); 580 memset(pte, 0, 3 * NBPG); 581 582 pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW; 583 pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW; 584 585 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG); 586 for (i = 0; i < 2; i++) { 587 /* 2 early pages of mappings */ 588 for (j = 0; j < 512; j++) { 589 /* j[0..511] : 2MB mappings per page */ 590 pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS; 591 curpa += (2 * 1024 * 1024); 592 } 593 } 594 595 va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off; 596 return VA_SIGN_NEG(va); 597 } 598 599 /* 600 * pmap_clear_pml4_early 601 * 602 * Clears the mapping previously established with pmap_set_pml4_early. 603 */ 604 void 605 pmap_clear_pml4_early(void) 606 { 607 extern paddr_t early_pte_pages; 608 pt_entry_t *pml4e, *pte; 609 610 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages); 611 memset(pte, 0, 3 * NBPG); 612 613 pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir; 614 pml4e[PDIR_SLOT_EARLY] = 0; 615 tlbflush(); 616 } 617 618 /* 619 * p m a p i n i t f u n c t i o n s 620 * 621 * pmap_bootstrap and pmap_init are called during system startup 622 * to init the pmap module. pmap_bootstrap() does a low level 623 * init just to get things rolling. pmap_init() finishes the job. 
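 *
 * on this platform nearly all of the work is in pmap_bootstrap(): it
 * sets up protection_codes[], the kernel pmap, PCID use, the direct
 * map and the pmap/pv/PDP pools, so pmap_init() only has to set
 * pmap_initialized.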
624 */ 625 626 /* 627 * pmap_bootstrap: get the system in a state where it can run with VM 628 * properly enabled (called before main()). the VM system is 629 * fully init'd later... 630 */ 631 632 paddr_t 633 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) 634 { 635 vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS; 636 struct pmap *kpm; 637 int curslot, i, j, p; 638 long ndmpdp; 639 paddr_t dmpd, dmpdp, start_cur, cur_pa; 640 vaddr_t kva, kva_end; 641 pt_entry_t *pml3, *pml2; 642 643 /* 644 * define the boundaries of the managed kernel virtual address 645 * space. 646 */ 647 648 virtual_avail = kva_start; /* first free KVA */ 649 650 /* 651 * set up protection_codes: we need to be able to convert from 652 * a MI protection code (some combo of VM_PROT...) to something 653 * we can jam into a i386 PTE. 654 */ 655 656 protection_codes[PROT_NONE] = pg_nx; /* --- */ 657 protection_codes[PROT_EXEC] = PG_RO; /* --x */ 658 protection_codes[PROT_READ] = PG_RO | pg_nx; /* -r- */ 659 protection_codes[PROT_READ | PROT_EXEC] = PG_RO; /* -rx */ 660 protection_codes[PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 661 protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW; /* w-x */ 662 protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */ 663 protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW; /* wrx */ 664 665 /* 666 * now we init the kernel's pmap 667 * 668 * the kernel pmap's pm_obj is not used for much. however, in 669 * user pmaps the pm_obj contains the list of active PTPs. 670 * the pm_obj currently does not have a pager. 671 */ 672 673 kpm = pmap_kernel(); 674 for (i = 0; i < PTP_LEVELS - 1; i++) { 675 uvm_objinit(&kpm->pm_obj[i], NULL, 1); 676 kpm->pm_ptphint[i] = NULL; 677 } 678 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 679 kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE); 680 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3; 681 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 682 atop(kva_start - VM_MIN_KERNEL_ADDRESS); 683 /* 684 * the above is just a rough estimate and not critical to the proper 685 * operation of the system. 686 */ 687 688 kpm->pm_type = PMAP_TYPE_NORMAL; 689 690 curpcb->pcb_pmap = kpm; /* proc0's pcb */ 691 692 /* 693 * Configure and enable PCID use if supported. 694 * Currently we require INVPCID support. 695 */ 696 if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) { 697 uint32_t ebx, dummy; 698 CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy); 699 if (ebx & SEFF0EBX_INVPCID) { 700 pmap_use_pcid = 1; 701 lcr4( rcr4() | CR4_PCIDE ); 702 cr3_pcid_proc = PCID_PROC; 703 cr3_pcid_temp = PCID_TEMP; 704 cr3_reuse_pcid = CR3_REUSE_PCID; 705 cr3_pcid_proc_intel = PCID_PROC_INTEL; 706 } 707 } 708 709 /* 710 * Add PG_G attribute to already mapped kernel pages. pg_g_kern 711 * is calculated in locore0.S and may be set to: 712 * 713 * 0 if this CPU does not safely support global pages in the kernel 714 * (Intel/Meltdown) 715 * PG_G if this CPU does safely support global pages in the kernel 716 * (AMD) 717 */ 718 #if KERNBASE == VM_MIN_KERNEL_ADDRESS 719 for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; 720 #else 721 kva_end = roundup((vaddr_t)&end, PAGE_SIZE); 722 for (kva = KERNBASE; kva < kva_end ; 723 #endif 724 kva += PAGE_SIZE) { 725 unsigned long p1i = pl1_i(kva); 726 if (pmap_valid_entry(PTE_BASE[p1i])) 727 PTE_BASE[p1i] |= pg_g_kern; 728 } 729 730 /* 731 * Map the direct map. The first 4GB were mapped in locore, here 732 * we map the rest if it exists. 
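	 * (the direct map linearly maps physical memory at
	 * PMAP_DIRECT_BASE, so PMAP_DIRECT_MAP(pa) turns a physical
	 * address straight into a usable VA; pmap_extract(),
	 * pmap_zero_page() and pmap_copy_page() rely on it later.)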
We actually use the direct map 733 * here to set up the page tables, we're assuming that we're still 734 * operating in the lower 4GB of memory. 735 * 736 * Map (up to) the first 512GB of physical memory first. This part 737 * is handled differently than physical memory > 512GB since we have 738 * already mapped part of this range in locore0. 739 */ 740 ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT; 741 if (ndmpdp < NDML2_ENTRIES) 742 ndmpdp = NDML2_ENTRIES; /* At least 4GB */ 743 if (ndmpdp > 512) 744 ndmpdp = 512; /* At most 512GB */ 745 746 dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME; 747 748 dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE; 749 750 for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) { 751 paddr_t pdp; 752 vaddr_t va; 753 754 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]); 755 va = PMAP_DIRECT_MAP(pdp); 756 757 *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT); 758 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U | 759 PG_M | pg_nx; 760 } 761 762 for (i = NDML2_ENTRIES; i < ndmpdp; i++) { 763 paddr_t pdp; 764 vaddr_t va; 765 766 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 767 va = PMAP_DIRECT_MAP(pdp); 768 769 *((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT); 770 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx; 771 } 772 773 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U | 774 PG_M | pg_nx; 775 776 /* Map any remaining physical memory > 512GB */ 777 for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) { 778 /* 779 * Start of current range starts at PA (curslot) * 512GB 780 */ 781 start_cur = (paddr_t)(curslot * NBPD_L4); 782 if (max_pa > start_cur) { 783 /* Next 512GB, new PML4e and L3(512GB) page */ 784 dmpd = first_avail; first_avail += PAGE_SIZE; 785 pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd); 786 kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd | 787 PG_KW | PG_V | PG_U | PG_M | pg_nx; 788 789 /* Calculate full 1GB pages in this 512GB region */ 790 p = ((max_pa - start_cur) >> L3_SHIFT); 791 792 /* Check if a partial (<1GB) page remains */ 793 if (max_pa & L2_MASK) 794 p++; 795 796 /* 797 * Handle the case where this range is full and there 798 * is still more memory after (p would be > 512). 799 */ 800 if (p > NPDPG) 801 p = NPDPG; 802 803 /* Allocate 'p' L2(1GB) pages and populate */ 804 for (i = 0; i < p; i++) { 805 dmpd = first_avail; first_avail += PAGE_SIZE; 806 pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd); 807 pml3[i] = dmpd | 808 PG_RW | PG_V | PG_U | PG_M | pg_nx; 809 810 cur_pa = start_cur + (i << L3_SHIFT); 811 j = 0; 812 813 while (cur_pa < max_pa && j < NPDPG) { 814 pml2[j] = curslot * NBPD_L4 + 815 (uint64_t)i * NBPD_L3 + 816 (uint64_t)j * NBPD_L2; 817 pml2[j] |= PG_RW | PG_V | pg_g_kern | 818 PG_U | PG_M | pg_nx | PG_PS; 819 cur_pa += NBPD_L2; 820 j++; 821 } 822 } 823 } 824 } 825 826 tlbflush(); 827 828 msgbuf_vaddr = virtual_avail; 829 virtual_avail += round_page(MSGBUFSIZE); 830 831 idt_vaddr = virtual_avail; 832 virtual_avail += 2 * PAGE_SIZE; 833 idt_paddr = first_avail; /* steal a page */ 834 first_avail += 2 * PAGE_SIZE; 835 836 #if defined(MULTIPROCESSOR) || \ 837 (NACPI > 0 && !defined(SMALL_KERNEL)) 838 /* 839 * Grab a page below 4G for things that need it (i.e. 840 * having an initial %cr3 for the MP trampoline). 841 */ 842 lo32_vaddr = virtual_avail; 843 virtual_avail += PAGE_SIZE; 844 lo32_paddr = first_avail; 845 first_avail += PAGE_SIZE; 846 #endif 847 848 /* 849 * init the global lists. 850 */ 851 LIST_INIT(&pmaps); 852 853 /* 854 * initialize the pmap pools. 
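	 * (pmap_create() later takes whole pmap structures from
	 * pmap_pmap_pool with pool_get(&pmap_pmap_pool, PR_WAITOK), and
	 * pv entries cycle through pmap_pv_pool via pool_get()/pool_put().)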
855 */ 856 857 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0, 858 "pmappl", NULL); 859 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0, 860 "pvpl", &pool_allocator_single); 861 pool_sethiwat(&pmap_pv_pool, 32 * 1024); 862 863 /* 864 * initialize the PDE pool. 865 */ 866 867 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0, 868 "pdppl", &pool_allocator_single); 869 870 kpm->pm_pdir_intel = NULL; 871 kpm->pm_pdirpa_intel = 0; 872 873 /* 874 * ensure the TLB is sync'd with reality by flushing it... 875 */ 876 877 tlbflush(); 878 879 return first_avail; 880 } 881 882 /* 883 * pmap_randomize 884 * 885 * Randomizes the location of the kernel pmap 886 */ 887 void 888 pmap_randomize(void) 889 { 890 pd_entry_t *pml4va, *oldpml4va; 891 paddr_t pml4pa; 892 int i; 893 894 pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait); 895 if (pml4va == NULL) 896 panic("%s: km_alloc failed", __func__); 897 898 /* Copy old PML4 page to new one */ 899 oldpml4va = pmap_kernel()->pm_pdir; 900 memcpy(pml4va, oldpml4va, PAGE_SIZE); 901 902 /* Switch to new PML4 */ 903 pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa); 904 lcr3(pml4pa); 905 906 /* Fixup pmap_kernel and proc0's %cr3 */ 907 pmap_kernel()->pm_pdirpa = pml4pa; 908 pmap_kernel()->pm_pdir = pml4va; 909 proc0.p_addr->u_pcb.pcb_cr3 = pml4pa; 910 911 /* Fixup recursive PTE PML4E slot. We are only changing the PA */ 912 pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~PG_FRAME); 913 914 for (i = 0; i < NPDPG; i++) { 915 /* PTE slot already handled earlier */ 916 if (i == PDIR_SLOT_PTE) 917 continue; 918 919 if (pml4va[i] & PG_FRAME) 920 pmap_randomize_level(&pml4va[i], 3); 921 } 922 923 /* Wipe out bootstrap PML4 */ 924 memset(oldpml4va, 0, PAGE_SIZE); 925 tlbflush(); 926 } 927 928 void 929 pmap_randomize_level(pd_entry_t *pde, int level) 930 { 931 pd_entry_t *new_pd_va; 932 paddr_t old_pd_pa, new_pd_pa; 933 vaddr_t old_pd_va; 934 struct vm_page *pg; 935 int i; 936 937 if (level == 0) 938 return; 939 940 if (level < PTP_LEVELS - 1 && (*pde & PG_PS)) 941 return; 942 943 new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait); 944 if (new_pd_va == NULL) 945 panic("%s: cannot allocate page for L%d page directory", 946 __func__, level); 947 948 old_pd_pa = *pde & PG_FRAME; 949 old_pd_va = PMAP_DIRECT_MAP(old_pd_pa); 950 pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa); 951 memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE); 952 *pde = new_pd_pa | (*pde & ~PG_FRAME); 953 954 tlbflush(); 955 memset((void *)old_pd_va, 0, PAGE_SIZE); 956 957 pg = PHYS_TO_VM_PAGE(old_pd_pa); 958 if (pg != NULL) { 959 pg->wire_count--; 960 pmap_kernel()->pm_stats.resident_count--; 961 if (pg->wire_count <= 1) 962 uvm_pagefree(pg); 963 } 964 965 for (i = 0; i < NPDPG; i++) 966 if (new_pd_va[i] & PG_FRAME) 967 pmap_randomize_level(&new_pd_va[i], level - 1); 968 } 969 970 /* 971 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 972 * trampoline code can be entered. 
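 * (the loop below steals one page from first_avail for each level of
 * the tree below the PML4, zeroes it through the direct map and wires
 * it in at index pl_i(0, level), so page table pages covering VA 0
 * exist before the trampolines need them.)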
973 */ 974 paddr_t 975 pmap_prealloc_lowmem_ptps(paddr_t first_avail) 976 { 977 pd_entry_t *pdes; 978 int level; 979 paddr_t newp; 980 981 pdes = pmap_kernel()->pm_pdir; 982 level = PTP_LEVELS; 983 for (;;) { 984 newp = first_avail; first_avail += PAGE_SIZE; 985 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE); 986 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 987 level--; 988 if (level <= 1) 989 break; 990 pdes = normal_pdes[level - 2]; 991 } 992 993 return first_avail; 994 } 995 996 /* 997 * pmap_init: no further initialization required on this platform 998 */ 999 void 1000 pmap_init(void) 1001 { 1002 pmap_initialized = 1; 1003 } 1004 1005 /* 1006 * p v _ e n t r y f u n c t i o n s 1007 */ 1008 1009 /* 1010 * main pv_entry manipulation functions: 1011 * pmap_enter_pv: enter a mapping onto a pv list 1012 * pmap_remove_pv: remove a mapping from a pv list 1013 */ 1014 1015 /* 1016 * pmap_enter_pv: enter a mapping onto a pv list 1017 * 1018 * => caller should adjust ptp's wire_count before calling 1019 * 1020 * pve: preallocated pve for us to use 1021 * ptp: PTP in pmap that maps this VA 1022 */ 1023 1024 void 1025 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap, 1026 vaddr_t va, struct vm_page *ptp) 1027 { 1028 pve->pv_pmap = pmap; 1029 pve->pv_va = va; 1030 pve->pv_ptp = ptp; /* NULL for kernel pmap */ 1031 mtx_enter(&pg->mdpage.pv_mtx); 1032 pve->pv_next = pg->mdpage.pv_list; /* add to ... */ 1033 pg->mdpage.pv_list = pve; /* ... list */ 1034 mtx_leave(&pg->mdpage.pv_mtx); 1035 } 1036 1037 /* 1038 * pmap_remove_pv: try to remove a mapping from a pv_list 1039 * 1040 * => caller should adjust ptp's wire_count and free PTP if needed 1041 * => we return the removed pve 1042 */ 1043 1044 struct pv_entry * 1045 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va) 1046 { 1047 struct pv_entry *pve, **prevptr; 1048 1049 mtx_enter(&pg->mdpage.pv_mtx); 1050 prevptr = &pg->mdpage.pv_list; 1051 while ((pve = *prevptr) != NULL) { 1052 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */ 1053 *prevptr = pve->pv_next; /* remove it! 
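							   (prevptr points either at the
							   list head pg->mdpage.pv_list or at
							   the previous pve's pv_next, so this
							   single store unlinks pve with no
							   special case for the head)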
*/ 1054 break; 1055 } 1056 prevptr = &pve->pv_next; /* previous pointer */ 1057 } 1058 mtx_leave(&pg->mdpage.pv_mtx); 1059 return(pve); /* return removed pve */ 1060 } 1061 1062 /* 1063 * p t p f u n c t i o n s 1064 */ 1065 1066 struct vm_page * 1067 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1068 { 1069 int lidx = level - 1; 1070 struct vm_page *pg; 1071 1072 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1073 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) 1074 return (pmap->pm_ptphint[lidx]); 1075 1076 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1077 1078 return pg; 1079 } 1080 1081 void 1082 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level, 1083 struct pg_to_free *pagelist) 1084 { 1085 int lidx; 1086 struct uvm_object *obj; 1087 1088 lidx = level - 1; 1089 1090 obj = &pmap->pm_obj[lidx]; 1091 pmap->pm_stats.resident_count--; 1092 if (pmap->pm_ptphint[lidx] == ptp) 1093 pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt); 1094 ptp->wire_count = 0; 1095 uvm_pagerealloc(ptp, NULL, 0); 1096 TAILQ_INSERT_TAIL(pagelist, ptp, pageq); 1097 } 1098 1099 void 1100 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1101 struct pg_to_free *pagelist) 1102 { 1103 unsigned long index; 1104 int level; 1105 vaddr_t invaladdr; 1106 1107 level = 1; 1108 do { 1109 pmap_freepage(pmap, ptp, level, pagelist); 1110 index = pl_i(va, level + 1); 1111 pmap_pte_set(&normal_pdes[level - 1][index], 0); 1112 if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) { 1113 /* Zap special meltdown PML4e */ 1114 pmap_pte_set(&pmap->pm_pdir_intel[index], 0); 1115 DPRINTF("%s: cleared meltdown PML4e @ index %lu " 1116 "(va range start 0x%llx)\n", __func__, index, 1117 (uint64_t)(index << L4_SHIFT)); 1118 } 1119 invaladdr = level == 1 ? (vaddr_t)PTE_BASE : 1120 (vaddr_t)normal_pdes[level - 2]; 1121 pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE, 1122 pmap_is_curpmap(curpcb->pcb_pmap)); 1123 if (level < PTP_LEVELS - 1) { 1124 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1125 ptp->wire_count--; 1126 if (ptp->wire_count > 1) 1127 break; 1128 } 1129 } while (++level < PTP_LEVELS); 1130 } 1131 1132 /* 1133 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1134 * 1135 * => pmap should NOT be pmap_kernel() 1136 */ 1137 1138 struct vm_page * 1139 pmap_get_ptp(struct pmap *pmap, vaddr_t va) 1140 { 1141 struct vm_page *ptp, *pptp; 1142 int i; 1143 unsigned long index; 1144 pd_entry_t *pva, *pva_intel; 1145 paddr_t ppa, pa; 1146 struct uvm_object *obj; 1147 1148 ptp = NULL; 1149 pa = (paddr_t)-1; 1150 1151 /* 1152 * Loop through all page table levels seeing if we need to 1153 * add a new page to that level. 1154 */ 1155 for (i = PTP_LEVELS; i > 1; i--) { 1156 /* 1157 * Save values from previous round. 
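		 * pptp/ppa carry the page table page (and its physical
		 * address) found or allocated at the level above: if a new
		 * page is allocated at this level, the parent's wire_count
		 * must be bumped, and ppa is also the hint passed to
		 * pmap_find_ptp() when the parent has to be looked up.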
1158 */ 1159 pptp = ptp; 1160 ppa = pa; 1161 1162 index = pl_i(va, i); 1163 pva = normal_pdes[i - 2]; 1164 1165 if (pmap_valid_entry(pva[index])) { 1166 ppa = pva[index] & PG_FRAME; 1167 ptp = NULL; 1168 continue; 1169 } 1170 1171 obj = &pmap->pm_obj[i-2]; 1172 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1173 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1174 1175 if (ptp == NULL) 1176 return NULL; 1177 1178 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 1179 ptp->wire_count = 1; 1180 pmap->pm_ptphint[i - 2] = ptp; 1181 pa = VM_PAGE_TO_PHYS(ptp); 1182 pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); 1183 1184 /* 1185 * Meltdown Special case - if we are adding a new PML4e for 1186 * usermode addresses, just copy the PML4e to the U-K page 1187 * table. 1188 */ 1189 if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS && 1190 va < VM_MAXUSER_ADDRESS) { 1191 pva_intel = pmap->pm_pdir_intel; 1192 pva_intel[index] = pva[index]; 1193 DPRINTF("%s: copying usermode PML4e (content=0x%llx) " 1194 "from 0x%llx -> 0x%llx\n", __func__, pva[index], 1195 (uint64_t)&pva[index], (uint64_t)&pva_intel[index]); 1196 } 1197 1198 pmap->pm_stats.resident_count++; 1199 /* 1200 * If we're not in the top level, increase the 1201 * wire count of the parent page. 1202 */ 1203 if (i < PTP_LEVELS) { 1204 if (pptp == NULL) 1205 pptp = pmap_find_ptp(pmap, va, ppa, i); 1206 #ifdef DIAGNOSTIC 1207 if (pptp == NULL) 1208 panic("%s: pde page disappeared", __func__); 1209 #endif 1210 pptp->wire_count++; 1211 } 1212 } 1213 1214 /* 1215 * ptp is not NULL if we just allocated a new ptp. If it's 1216 * still NULL, we must look up the existing one. 1217 */ 1218 if (ptp == NULL) { 1219 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1220 #ifdef DIAGNOSTIC 1221 if (ptp == NULL) { 1222 printf("va %lx ppa %lx\n", (unsigned long)va, 1223 (unsigned long)ppa); 1224 panic("%s: unmanaged user PTP", __func__); 1225 } 1226 #endif 1227 } 1228 1229 pmap->pm_ptphint[0] = ptp; 1230 return(ptp); 1231 } 1232 1233 /* 1234 * p m a p l i f e c y c l e f u n c t i o n s 1235 */ 1236 1237 /* 1238 * pmap_pdp_ctor: constructor for the PDP cache. 1239 */ 1240 1241 void 1242 pmap_pdp_ctor(pd_entry_t *pdir) 1243 { 1244 paddr_t pdirpa; 1245 int npde, i; 1246 struct pmap *kpm = pmap_kernel(); 1247 1248 /* fetch the physical address of the page directory. */ 1249 (void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa); 1250 1251 /* zero init area */ 1252 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 1253 1254 /* put in recursive PDE to map the PTEs */ 1255 pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx; 1256 1257 npde = nkptp[PTP_LEVELS - 1]; 1258 1259 /* put in kernel VM PDEs */ 1260 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 1261 npde * sizeof(pd_entry_t)); 1262 1263 /* zero the rest */ 1264 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 1265 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 1266 1267 for (i = 0; i < NUM_L4_SLOT_DIRECT; i++) 1268 pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i]; 1269 1270 #if VM_MIN_KERNEL_ADDRESS != KERNBASE 1271 pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)]; 1272 #endif 1273 } 1274 1275 void 1276 pmap_pdp_ctor_intel(pd_entry_t *pdir) 1277 { 1278 struct pmap *kpm = pmap_kernel(); 1279 1280 /* Copy PML4es from pmap_kernel's U-K view */ 1281 memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE); 1282 } 1283 1284 /* 1285 * pmap_create: create a pmap 1286 * 1287 * => note: old pmap interface took a "size" args which allowed for 1288 * the creation of "software only" pmaps (not in bsd). 
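 * => the new PDP is set up by pmap_pdp_ctor() above; roughly:
 *        user slots                    zeroed
 *        PDIR_SLOT_PTE                 recursive mapping of the PDP
 *        PDIR_SLOT_KERN onward         kernel PDEs copied from PDP_BASE
 *        PDIR_SLOT_DIRECT onward       direct map slots copied from
 *                                      the kernel pmap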
1289 */ 1290 1291 struct pmap * 1292 pmap_create(void) 1293 { 1294 struct pmap *pmap; 1295 int i; 1296 1297 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK); 1298 1299 mtx_init(&pmap->pm_mtx, IPL_VM); 1300 1301 /* init uvm_object */ 1302 for (i = 0; i < PTP_LEVELS - 1; i++) { 1303 uvm_objinit(&pmap->pm_obj[i], NULL, 1); 1304 pmap->pm_ptphint[i] = NULL; 1305 } 1306 pmap->pm_stats.wired_count = 0; 1307 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 1308 pmap->pm_cpus = 0; 1309 pmap->pm_type = PMAP_TYPE_NORMAL; 1310 1311 /* allocate PDP */ 1312 1313 /* 1314 * note that there is no need to splvm to protect us from 1315 * malloc since malloc allocates out of a submap and we should 1316 * have already allocated kernel PTPs to cover the range... 1317 */ 1318 1319 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 1320 pmap_pdp_ctor(pmap->pm_pdir); 1321 1322 pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; 1323 1324 /* 1325 * Intel CPUs need a special page table to be used during usermode 1326 * execution, one that lacks all kernel mappings. 1327 */ 1328 if (cpu_meltdown) { 1329 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK); 1330 pmap_pdp_ctor_intel(pmap->pm_pdir_intel); 1331 pmap->pm_stats.resident_count++; 1332 if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel, 1333 &pmap->pm_pdirpa_intel)) 1334 panic("%s: unknown PA mapping for meltdown PML4", 1335 __func__); 1336 } else { 1337 pmap->pm_pdir_intel = NULL; 1338 pmap->pm_pdirpa_intel = 0; 1339 } 1340 1341 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 1342 return (pmap); 1343 } 1344 1345 /* 1346 * pmap_destroy: drop reference count on pmap. free pmap if 1347 * reference count goes to zero. 1348 */ 1349 1350 void 1351 pmap_destroy(struct pmap *pmap) 1352 { 1353 struct vm_page *pg; 1354 int refs; 1355 int i; 1356 1357 /* 1358 * drop reference count 1359 */ 1360 1361 refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs); 1362 if (refs > 0) { 1363 return; 1364 } 1365 1366 /* 1367 * reference count is zero, free pmap resources and then free pmap. 1368 */ 1369 1370 #ifdef DIAGNOSTIC 1371 if (__predict_false(pmap->pm_cpus != 0)) 1372 printf("%s: pmap %p cpus=0x%llx\n", __func__, 1373 (void *)pmap, pmap->pm_cpus); 1374 #endif 1375 1376 /* 1377 * remove it from global list of pmaps 1378 */ 1379 LIST_REMOVE(pmap, pm_list); 1380 1381 /* 1382 * free any remaining PTPs 1383 */ 1384 1385 for (i = 0; i < PTP_LEVELS - 1; i++) { 1386 while ((pg = RBT_ROOT(uvm_objtree, 1387 &pmap->pm_obj[i].memt)) != NULL) { 1388 KASSERT((pg->pg_flags & PG_BUSY) == 0); 1389 1390 pg->wire_count = 0; 1391 pmap->pm_stats.resident_count--; 1392 1393 uvm_pagefree(pg); 1394 } 1395 } 1396 1397 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 1398 1399 if (pmap->pm_pdir_intel != NULL) { 1400 pmap->pm_stats.resident_count--; 1401 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); 1402 } 1403 1404 pool_put(&pmap_pmap_pool, pmap); 1405 } 1406 1407 /* 1408 * Add a reference to the specified pmap. 
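 * (the count lives in pm_obj[0].uo_refs; pmap_destroy() above drops it
 * and only tears the pmap down once it reaches zero.)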
1409 */ 1410 1411 void 1412 pmap_reference(struct pmap *pmap) 1413 { 1414 atomic_inc_int(&pmap->pm_obj[0].uo_refs); 1415 } 1416 1417 /* 1418 * pmap_activate: activate a process' pmap (fill in %cr3) 1419 * 1420 * => called from cpu_fork() and when switching pmaps during exec 1421 * => if p is the curproc, then load it into the MMU 1422 */ 1423 1424 void 1425 pmap_activate(struct proc *p) 1426 { 1427 struct pcb *pcb = &p->p_addr->u_pcb; 1428 struct pmap *pmap = p->p_vmspace->vm_map.pmap; 1429 1430 pcb->pcb_pmap = pmap; 1431 pcb->pcb_cr3 = pmap->pm_pdirpa; 1432 pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc : 1433 (PCID_KERN | cr3_reuse_pcid); 1434 1435 if (p == curproc) { 1436 lcr3(pcb->pcb_cr3); 1437 1438 /* in case we return to userspace without context switching */ 1439 if (cpu_meltdown) { 1440 struct cpu_info *self = curcpu(); 1441 1442 self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid; 1443 self->ci_user_cr3 = pmap->pm_pdirpa_intel | 1444 cr3_pcid_proc_intel; 1445 } 1446 1447 /* 1448 * mark the pmap in use by this processor. 1449 */ 1450 x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number())); 1451 } 1452 } 1453 1454 /* 1455 * pmap_deactivate: deactivate a process' pmap 1456 */ 1457 1458 void 1459 pmap_deactivate(struct proc *p) 1460 { 1461 struct pmap *pmap = p->p_vmspace->vm_map.pmap; 1462 1463 /* 1464 * mark the pmap no longer in use by this processor. 1465 */ 1466 x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number())); 1467 } 1468 1469 /* 1470 * end of lifecycle functions 1471 */ 1472 1473 /* 1474 * some misc. functions 1475 */ 1476 1477 int 1478 pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde) 1479 { 1480 int i; 1481 unsigned long index; 1482 pd_entry_t pde; 1483 1484 for (i = PTP_LEVELS; i > 1; i--) { 1485 index = pl_i(va, i); 1486 pde = normal_pdes[i - 2][index]; 1487 if (!pmap_valid_entry(pde)) 1488 return 0; 1489 } 1490 if (lastpde != NULL) 1491 *lastpde = pde; 1492 return 1; 1493 } 1494 1495 /* 1496 * pmap_extract: extract a PA for the given VA 1497 */ 1498 1499 int 1500 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 1501 { 1502 pt_entry_t *ptes; 1503 int level, offs; 1504 1505 if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE && 1506 va < PMAP_DIRECT_END) { 1507 *pap = va - PMAP_DIRECT_BASE; 1508 return 1; 1509 } 1510 1511 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 1512 1513 if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) { 1514 if (pap != NULL) 1515 *pap = (ptes[offs] & PG_FRAME) | (va & PAGE_MASK); 1516 return 1; 1517 } 1518 if (level == 1 && (ptes[offs] & (PG_PS|PG_V)) == (PG_PS|PG_V)) { 1519 if (pap != NULL) 1520 *pap = (ptes[offs] & PG_LGFRAME) | (va & PAGE_MASK_L2); 1521 return 1; 1522 } 1523 1524 return 0; 1525 } 1526 1527 /* 1528 * pmap_zero_page: zero a page 1529 */ 1530 1531 void 1532 pmap_zero_page(struct vm_page *pg) 1533 { 1534 pagezero(pmap_map_direct(pg)); 1535 } 1536 1537 /* 1538 * pmap_flush_cache: flush the cache for a virtual address. 1539 */ 1540 void 1541 pmap_flush_cache(vaddr_t addr, vsize_t len) 1542 { 1543 vaddr_t i; 1544 1545 if (curcpu()->ci_cflushsz == 0) { 1546 wbinvd_on_all_cpus(); 1547 return; 1548 } 1549 1550 /* all cpus that have clflush also have mfence. 
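	   clflush itself is only weakly ordered, which is why the flush
	   loop below is bracketed by mfence; ci_cflushsz is the clflush
	   line size and thus the stride of the loop.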
*/ 1551 mfence(); 1552 for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz) 1553 clflush(i); 1554 mfence(); 1555 } 1556 1557 /* 1558 * pmap_copy_page: copy a page 1559 */ 1560 1561 void 1562 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg) 1563 { 1564 vaddr_t srcva = pmap_map_direct(srcpg); 1565 vaddr_t dstva = pmap_map_direct(dstpg); 1566 1567 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 1568 } 1569 1570 /* 1571 * p m a p r e m o v e f u n c t i o n s 1572 * 1573 * functions that remove mappings 1574 */ 1575 1576 /* 1577 * pmap_remove_ptes: remove PTEs from a PTP 1578 * 1579 * => must have proper locking on pmap_master_lock 1580 * => PTP must be mapped into KVA 1581 * => PTP should be null if pmap == pmap_kernel() 1582 */ 1583 1584 void 1585 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 1586 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs) 1587 { 1588 struct pv_entry *pve; 1589 pt_entry_t *pte = (pt_entry_t *) ptpva; 1590 struct vm_page *pg; 1591 pt_entry_t opte; 1592 1593 /* 1594 * note that ptpva points to the PTE that maps startva. this may 1595 * or may not be the first PTE in the PTP. 1596 * 1597 * we loop through the PTP while there are still PTEs to look at 1598 * and the wire_count is greater than 1 (because we use the wire_count 1599 * to keep track of the number of real PTEs in the PTP). 1600 */ 1601 1602 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 1603 ; pte++, startva += PAGE_SIZE) { 1604 if (!pmap_valid_entry(*pte)) 1605 continue; /* VA not mapped */ 1606 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 1607 continue; 1608 } 1609 1610 /* atomically save the old PTE and zap! it */ 1611 opte = pmap_pte_set(pte, 0); 1612 1613 if (opte & PG_W) 1614 pmap->pm_stats.wired_count--; 1615 pmap->pm_stats.resident_count--; 1616 1617 if (ptp != NULL) 1618 ptp->wire_count--; /* dropping a PTE */ 1619 1620 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 1621 1622 /* 1623 * if we are not on a pv list we are done. 1624 */ 1625 1626 if ((opte & PG_PVLIST) == 0) { 1627 #ifdef DIAGNOSTIC 1628 if (pg != NULL) 1629 panic("%s: managed page without PG_PVLIST: " 1630 "va 0x%lx, opte 0x%llx", __func__, 1631 startva, opte); 1632 #endif 1633 continue; 1634 } 1635 1636 #ifdef DIAGNOSTIC 1637 if (pg == NULL) 1638 panic("%s: unmanaged page marked PG_PVLIST: " 1639 "va 0x%lx, opte 0x%llx", __func__, 1640 startva, opte); 1641 #endif 1642 1643 /* sync R/M bits */ 1644 pmap_sync_flags_pte(pg, opte); 1645 pve = pmap_remove_pv(pg, pmap, startva); 1646 if (pve != NULL) { 1647 pve->pv_next = *free_pvs; 1648 *free_pvs = pve; 1649 } 1650 1651 /* end of "for" loop: time for next pte */ 1652 } 1653 } 1654 1655 /* 1656 * pmap_remove_pte: remove a single PTE from a PTP 1657 * 1658 * => must have proper locking on pmap_master_lock 1659 * => PTP must be mapped into KVA 1660 * => PTP should be null if pmap == pmap_kernel() 1661 * => returns true if we removed a mapping 1662 */ 1663 1664 int 1665 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 1666 vaddr_t va, int flags, struct pv_entry **free_pvs) 1667 { 1668 struct pv_entry *pve; 1669 struct vm_page *pg; 1670 pt_entry_t opte; 1671 1672 if (!pmap_valid_entry(*pte)) 1673 return 0; /* VA not mapped */ 1674 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 1675 return 0; 1676 } 1677 1678 /* atomically save the old PTE and zap! 
it */ 1679 opte = pmap_pte_set(pte, 0); 1680 1681 if (opte & PG_W) 1682 pmap->pm_stats.wired_count--; 1683 pmap->pm_stats.resident_count--; 1684 1685 if (ptp != NULL) 1686 ptp->wire_count--; /* dropping a PTE */ 1687 1688 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 1689 1690 /* 1691 * if we are not on a pv list we are done. 1692 */ 1693 if ((opte & PG_PVLIST) == 0) { 1694 #ifdef DIAGNOSTIC 1695 if (pg != NULL) 1696 panic("%s: managed page without PG_PVLIST: " 1697 "va 0x%lx, opte 0x%llx", __func__, va, opte); 1698 #endif 1699 return 1; 1700 } 1701 1702 #ifdef DIAGNOSTIC 1703 if (pg == NULL) 1704 panic("%s: unmanaged page marked PG_PVLIST: " 1705 "va 0x%lx, opte 0x%llx", __func__, va, opte); 1706 #endif 1707 1708 /* sync R/M bits */ 1709 pmap_sync_flags_pte(pg, opte); 1710 pve = pmap_remove_pv(pg, pmap, va); 1711 if (pve != NULL) { 1712 pve->pv_next = *free_pvs; 1713 *free_pvs = pve; 1714 } 1715 1716 return 1; 1717 } 1718 1719 /* 1720 * pmap_remove: top level mapping removal function 1721 * 1722 * => caller should not be holding any pmap locks 1723 */ 1724 1725 void 1726 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 1727 { 1728 if (pmap->pm_type == PMAP_TYPE_EPT) 1729 pmap_remove_ept(pmap, sva, eva); 1730 else 1731 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 1732 } 1733 1734 /* 1735 * pmap_do_remove: mapping removal guts 1736 * 1737 * => caller should not be holding any pmap locks 1738 */ 1739 1740 void 1741 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 1742 { 1743 pd_entry_t pde; 1744 int result; 1745 paddr_t ptppa; 1746 vaddr_t blkendva; 1747 struct vm_page *ptp; 1748 struct pv_entry *pve; 1749 struct pv_entry *free_pvs = NULL; 1750 vaddr_t va; 1751 int shootall = 0, shootself; 1752 struct pg_to_free empty_ptps; 1753 paddr_t scr3; 1754 1755 TAILQ_INIT(&empty_ptps); 1756 1757 scr3 = pmap_map_ptes(pmap); 1758 shootself = (scr3 == 0); 1759 1760 /* 1761 * removing one page? take shortcut function. 1762 */ 1763 1764 if (sva + PAGE_SIZE == eva) { 1765 if (pmap_pdes_valid(sva, &pde)) { 1766 1767 /* PA of the PTP */ 1768 ptppa = pde & PG_FRAME; 1769 1770 /* get PTP if non-kernel mapping */ 1771 1772 if (pmap == pmap_kernel()) { 1773 /* we never free kernel PTPs */ 1774 ptp = NULL; 1775 } else { 1776 ptp = pmap_find_ptp(pmap, sva, ptppa, 1); 1777 #ifdef DIAGNOSTIC 1778 if (ptp == NULL) 1779 panic("%s: unmanaged PTP detected", 1780 __func__); 1781 #endif 1782 } 1783 1784 /* do it! */ 1785 result = pmap_remove_pte(pmap, ptp, 1786 &PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs); 1787 1788 /* 1789 * if mapping removed and the PTP is no longer 1790 * being used, free it! 1791 */ 1792 1793 if (result && ptp && ptp->wire_count <= 1) 1794 pmap_free_ptp(pmap, ptp, sva, &empty_ptps); 1795 pmap_tlb_shootpage(pmap, sva, shootself); 1796 pmap_unmap_ptes(pmap, scr3); 1797 pmap_tlb_shootwait(); 1798 } else { 1799 pmap_unmap_ptes(pmap, scr3); 1800 } 1801 1802 goto cleanup; 1803 } 1804 1805 if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS) 1806 shootall = 1; 1807 1808 for (va = sva; va < eva; va = blkendva) { 1809 /* determine range of block */ 1810 blkendva = x86_round_pdr(va + 1); 1811 if (blkendva > eva) 1812 blkendva = eva; 1813 1814 /* 1815 * XXXCDC: our PTE mappings should never be removed 1816 * with pmap_remove! if we allow this (and why would 1817 * we?) then we end up freeing the pmap's page 1818 * directory page (PDP) before we are finished using 1819 * it when we hit in in the recursive mapping. this 1820 * is BAD. 
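		 * (the recursive mapping is the PDIR_SLOT_PTE entry that
		 * points the PDP back at itself, set up in pmap_pdp_ctor();
		 * it is what makes PTE_BASE[] a linear window onto this
		 * pmap's own PTEs, and it is why the PDIR_SLOT_PTE block is
		 * skipped just below.)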
1821 * 1822 * long term solution is to move the PTEs out of user 1823 * address space. and into kernel address space (up 1824 * with APTE). then we can set VM_MAXUSER_ADDRESS to 1825 * be VM_MAX_ADDRESS. 1826 */ 1827 1828 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 1829 /* XXXCDC: ugly hack to avoid freeing PDP here */ 1830 continue; 1831 1832 if (!pmap_pdes_valid(va, &pde)) 1833 continue; 1834 1835 /* PA of the PTP */ 1836 ptppa = pde & PG_FRAME; 1837 1838 /* get PTP if non-kernel mapping */ 1839 if (pmap == pmap_kernel()) { 1840 /* we never free kernel PTPs */ 1841 ptp = NULL; 1842 } else { 1843 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 1844 #ifdef DIAGNOSTIC 1845 if (ptp == NULL) 1846 panic("%s: unmanaged PTP detected", __func__); 1847 #endif 1848 } 1849 pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)], 1850 va, blkendva, flags, &free_pvs); 1851 1852 /* if PTP is no longer being used, free it! */ 1853 if (ptp && ptp->wire_count <= 1) { 1854 pmap_free_ptp(pmap, ptp, va, &empty_ptps); 1855 } 1856 } 1857 1858 if (shootall) 1859 pmap_tlb_shoottlb(pmap, shootself); 1860 else 1861 pmap_tlb_shootrange(pmap, sva, eva, shootself); 1862 1863 pmap_unmap_ptes(pmap, scr3); 1864 pmap_tlb_shootwait(); 1865 1866 cleanup: 1867 while ((pve = free_pvs) != NULL) { 1868 free_pvs = pve->pv_next; 1869 pool_put(&pmap_pv_pool, pve); 1870 } 1871 1872 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1873 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1874 uvm_pagefree(ptp); 1875 } 1876 } 1877 1878 /* 1879 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 1880 * 1881 * => R/M bits are sync'd back to attrs 1882 */ 1883 1884 void 1885 pmap_page_remove(struct vm_page *pg) 1886 { 1887 struct pv_entry *pve; 1888 struct pmap *pm; 1889 pt_entry_t opte; 1890 #ifdef DIAGNOSTIC 1891 pd_entry_t pde; 1892 #endif 1893 struct pg_to_free empty_ptps; 1894 struct vm_page *ptp; 1895 paddr_t scr3; 1896 int shootself; 1897 1898 TAILQ_INIT(&empty_ptps); 1899 1900 mtx_enter(&pg->mdpage.pv_mtx); 1901 while ((pve = pg->mdpage.pv_list) != NULL) { 1902 pmap_reference(pve->pv_pmap); 1903 pm = pve->pv_pmap; 1904 mtx_leave(&pg->mdpage.pv_mtx); 1905 1906 /* XXX use direct map? */ 1907 scr3 = pmap_map_ptes(pm); /* locks pmap */ 1908 shootself = (scr3 == 0); 1909 1910 /* 1911 * We dropped the pvlist lock before grabbing the pmap 1912 * lock to avoid lock ordering problems. This means 1913 * we have to check the pvlist again since somebody 1914 * else might have modified it. All we care about is 1915 * that the pvlist entry matches the pmap we just 1916 * locked. If it doesn't, unlock the pmap and try 1917 * again. 
1918 */ 1919 mtx_enter(&pg->mdpage.pv_mtx); 1920 if ((pve = pg->mdpage.pv_list) == NULL || 1921 pve->pv_pmap != pm) { 1922 mtx_leave(&pg->mdpage.pv_mtx); 1923 pmap_unmap_ptes(pm, scr3); /* unlocks pmap */ 1924 pmap_destroy(pm); 1925 mtx_enter(&pg->mdpage.pv_mtx); 1926 continue; 1927 } 1928 1929 pg->mdpage.pv_list = pve->pv_next; 1930 mtx_leave(&pg->mdpage.pv_mtx); 1931 1932 #ifdef DIAGNOSTIC 1933 if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) && 1934 (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { 1935 printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__, 1936 pg, pve->pv_va, pve->pv_ptp); 1937 printf("%s: PTP's phys addr: " 1938 "actual=%lx, recorded=%lx\n", __func__, 1939 (unsigned long)(pde & PG_FRAME), 1940 VM_PAGE_TO_PHYS(pve->pv_ptp)); 1941 panic("%s: mapped managed page has " 1942 "invalid pv_ptp field", __func__); 1943 } 1944 #endif 1945 1946 /* atomically save the old PTE and zap it */ 1947 opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0); 1948 1949 if (opte & PG_W) 1950 pve->pv_pmap->pm_stats.wired_count--; 1951 pve->pv_pmap->pm_stats.resident_count--; 1952 1953 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself); 1954 1955 pmap_sync_flags_pte(pg, opte); 1956 1957 /* update the PTP reference count. free if last reference. */ 1958 if (pve->pv_ptp != NULL) { 1959 pve->pv_ptp->wire_count--; 1960 if (pve->pv_ptp->wire_count <= 1) { 1961 pmap_free_ptp(pve->pv_pmap, pve->pv_ptp, 1962 pve->pv_va, &empty_ptps); 1963 } 1964 } 1965 pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */ 1966 pmap_destroy(pve->pv_pmap); 1967 pool_put(&pmap_pv_pool, pve); 1968 mtx_enter(&pg->mdpage.pv_mtx); 1969 } 1970 mtx_leave(&pg->mdpage.pv_mtx); 1971 1972 pmap_tlb_shootwait(); 1973 1974 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1975 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1976 uvm_pagefree(ptp); 1977 } 1978 } 1979 1980 /* 1981 * p m a p a t t r i b u t e f u n c t i o n s 1982 * functions that test/change managed page's attributes 1983 * since a page can be mapped multiple times we must check each PTE that 1984 * maps it by going down the pv lists. 
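 *
 * the MI attribute helpers in pmap.h are thin wrappers around the two
 * functions below, roughly:
 *
 *     pmap_is_modified(pg)     ->  pmap_test_attrs(pg, PG_M)
 *     pmap_is_referenced(pg)   ->  pmap_test_attrs(pg, PG_U)
 *     pmap_clear_modify(pg)    ->  pmap_clear_attrs(pg, PG_M)
 *     pmap_clear_reference(pg) ->  pmap_clear_attrs(pg, PG_U)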
1985 */ 1986 1987 /* 1988 * pmap_test_attrs: test a page's attributes 1989 */ 1990 1991 int 1992 pmap_test_attrs(struct vm_page *pg, unsigned int testbits) 1993 { 1994 struct pv_entry *pve; 1995 pt_entry_t *ptes; 1996 int level, offs; 1997 u_long mybits, testflags; 1998 1999 testflags = pmap_pte2flags(testbits); 2000 2001 if (pg->pg_flags & testflags) 2002 return 1; 2003 2004 mybits = 0; 2005 mtx_enter(&pg->mdpage.pv_mtx); 2006 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0; 2007 pve = pve->pv_next) { 2008 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, 2009 &offs); 2010 mybits |= (ptes[offs] & testbits); 2011 } 2012 mtx_leave(&pg->mdpage.pv_mtx); 2013 2014 if (mybits == 0) 2015 return 0; 2016 2017 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits)); 2018 2019 return 1; 2020 } 2021 2022 /* 2023 * pmap_clear_attrs: change a page's attributes 2024 * 2025 * => we return 1 if we cleared one of the bits we were asked to 2026 */ 2027 2028 int 2029 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits) 2030 { 2031 struct pv_entry *pve; 2032 pt_entry_t *ptes, opte; 2033 u_long clearflags; 2034 int result, level, offs; 2035 2036 clearflags = pmap_pte2flags(clearbits); 2037 2038 result = pg->pg_flags & clearflags; 2039 if (result) 2040 atomic_clearbits_int(&pg->pg_flags, clearflags); 2041 2042 mtx_enter(&pg->mdpage.pv_mtx); 2043 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) { 2044 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, 2045 &offs); 2046 opte = ptes[offs]; 2047 if (opte & clearbits) { 2048 result = 1; 2049 pmap_pte_clearbits(&ptes[offs], (opte & clearbits)); 2050 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, 2051 pmap_is_curpmap(pve->pv_pmap)); 2052 } 2053 } 2054 mtx_leave(&pg->mdpage.pv_mtx); 2055 2056 pmap_tlb_shootwait(); 2057 2058 return (result != 0); 2059 } 2060 2061 /* 2062 * p m a p p r o t e c t i o n f u n c t i o n s 2063 */ 2064 2065 /* 2066 * pmap_page_protect: change the protection of all recorded mappings 2067 * of a managed page 2068 * 2069 * => NOTE: this is an inline function in pmap.h 2070 */ 2071 2072 /* see pmap.h */ 2073 2074 /* 2075 * pmap_protect: set the protection in of the pages in a pmap 2076 * 2077 * => NOTE: this is an inline function in pmap.h 2078 */ 2079 2080 /* see pmap.h */ 2081 2082 /* 2083 * pmap_write_protect: write-protect pages in a pmap 2084 */ 2085 2086 void 2087 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 2088 { 2089 pt_entry_t nx, *spte, *epte; 2090 vaddr_t blockend; 2091 int shootall = 0, shootself; 2092 vaddr_t va; 2093 paddr_t scr3; 2094 2095 scr3 = pmap_map_ptes(pmap); 2096 shootself = (scr3 == 0); 2097 2098 /* should be ok, but just in case ... */ 2099 sva &= PG_FRAME; 2100 eva &= PG_FRAME; 2101 2102 nx = 0; 2103 if (!(prot & PROT_EXEC)) 2104 nx = pg_nx; 2105 2106 if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS) 2107 shootall = 1; 2108 2109 for (va = sva; va < eva ; va = blockend) { 2110 blockend = (va & L2_FRAME) + NBPD_L2; 2111 if (blockend > eva) 2112 blockend = eva; 2113 2114 /* 2115 * XXXCDC: our PTE mappings should never be write-protected! 2116 * 2117 * long term solution is to move the PTEs out of user 2118 * address space. and into kernel address space (up 2119 * with APTE). then we can set VM_MAXUSER_ADDRESS to 2120 * be VM_MAX_ADDRESS. 2121 */ 2122 2123 /* XXXCDC: ugly hack to avoid freeing PDP here */ 2124 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 2125 continue; 2126 2127 /* empty block? 
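		   (no page table pages under this 2MB block means there is
		   nothing mapped here to write protect)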
*/ 2128 if (!pmap_pdes_valid(va, NULL)) 2129 continue; 2130 2131 #ifdef DIAGNOSTIC 2132 if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS) 2133 panic("%s: PTE space", __func__); 2134 #endif 2135 2136 spte = &PTE_BASE[pl1_i(va)]; 2137 epte = &PTE_BASE[pl1_i(blockend)]; 2138 2139 for (/*null */; spte < epte ; spte++) { 2140 if (!pmap_valid_entry(*spte)) 2141 continue; 2142 pmap_pte_clearbits(spte, PG_RW); 2143 pmap_pte_setbits(spte, nx); 2144 } 2145 } 2146 2147 if (shootall) 2148 pmap_tlb_shoottlb(pmap, shootself); 2149 else 2150 pmap_tlb_shootrange(pmap, sva, eva, shootself); 2151 2152 pmap_unmap_ptes(pmap, scr3); 2153 pmap_tlb_shootwait(); 2154 } 2155 2156 /* 2157 * end of protection functions 2158 */ 2159 2160 /* 2161 * pmap_unwire: clear the wired bit in the PTE 2162 * 2163 * => mapping should already be in map 2164 */ 2165 2166 void 2167 pmap_unwire(struct pmap *pmap, vaddr_t va) 2168 { 2169 pt_entry_t *ptes; 2170 int level, offs; 2171 2172 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 2173 2174 if (level == 0) { 2175 2176 #ifdef DIAGNOSTIC 2177 if (!pmap_valid_entry(ptes[offs])) 2178 panic("%s: invalid (unmapped) va 0x%lx", __func__, va); 2179 #endif 2180 if (__predict_true((ptes[offs] & PG_W) != 0)) { 2181 pmap_pte_clearbits(&ptes[offs], PG_W); 2182 pmap->pm_stats.wired_count--; 2183 } 2184 #ifdef DIAGNOSTIC 2185 else { 2186 printf("%s: wiring for pmap %p va 0x%lx " 2187 "didn't change!\n", __func__, pmap, va); 2188 } 2189 #endif 2190 } 2191 #ifdef DIAGNOSTIC 2192 else { 2193 panic("%s: invalid PDE", __func__); 2194 } 2195 #endif 2196 } 2197 2198 /* 2199 * pmap_collect: free resources held by a pmap 2200 * 2201 * => optional function. 2202 * => called when a process is swapped out to free memory. 2203 */ 2204 2205 void 2206 pmap_collect(struct pmap *pmap) 2207 { 2208 /* 2209 * free all of the pt pages by removing the physical mappings 2210 * for its entire address space. 
2211 */ 2212 2213 /* pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS, 2214 PMAP_REMOVE_SKIPWIRED); 2215 */ 2216 } 2217 2218 /* 2219 * pmap_copy: copy mappings from one pmap to another 2220 * 2221 * => optional function 2222 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 2223 */ 2224 2225 /* 2226 * defined as macro in pmap.h 2227 */ 2228 2229 void 2230 pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot) 2231 { 2232 uint64_t l4idx, l3idx, l2idx, l1idx; 2233 pd_entry_t *pd, *ptp; 2234 paddr_t npa; 2235 struct pmap *pmap = pmap_kernel(); 2236 pt_entry_t *ptes; 2237 int level, offs; 2238 2239 /* If CPU is secure, no need to do anything */ 2240 if (!cpu_meltdown) 2241 return; 2242 2243 /* Must be kernel VA */ 2244 if (va < VM_MIN_KERNEL_ADDRESS) 2245 panic("%s: invalid special mapping va 0x%lx requested", 2246 __func__, va); 2247 2248 if (pmap->pm_pdir_intel == NULL) 2249 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, 2250 PR_WAITOK | PR_ZERO); 2251 2252 l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */ 2253 l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ 2254 l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */ 2255 l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */ 2256 2257 DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld " 2258 "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va, 2259 (uint64_t)pa, l4idx, l3idx, l2idx, l1idx); 2260 2261 /* Start at PML4 / top level */ 2262 pd = pmap->pm_pdir_intel; 2263 2264 if (pd == NULL) 2265 panic("%s: PML4 not initialized for pmap @ %p", __func__, 2266 pmap); 2267 2268 /* npa = physaddr of PDPT */ 2269 npa = pd[l4idx] & PMAP_PA_MASK; 2270 2271 /* Valid PML4e for the 512GB region containing va? */ 2272 if (!npa) { 2273 /* No valid PML4E - allocate PDPT page and set PML4E */ 2274 2275 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); 2276 2277 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) 2278 panic("%s: can't locate PDPT page", __func__); 2279 2280 pd[l4idx] = (npa | PG_RW | PG_V); 2281 2282 DPRINTF("%s: allocated new PDPT page at phys 0x%llx, " 2283 "setting PML4e[%lld] = 0x%llx\n", __func__, 2284 (uint64_t)npa, l4idx, pd[l4idx]); 2285 } 2286 2287 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2288 if (pd == NULL) 2289 panic("%s: can't locate PDPT @ pa=0x%llx", __func__, 2290 (uint64_t)npa); 2291 2292 /* npa = physaddr of PD page */ 2293 npa = pd[l3idx] & PMAP_PA_MASK; 2294 2295 /* Valid PDPTe for the 1GB region containing va? */ 2296 if (!npa) { 2297 /* No valid PDPTe - allocate PD page and set PDPTe */ 2298 2299 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); 2300 2301 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) 2302 panic("%s: can't locate PD page", __func__); 2303 2304 pd[l3idx] = (npa | PG_RW | PG_V); 2305 2306 DPRINTF("%s: allocated new PD page at phys 0x%llx, " 2307 "setting PDPTe[%lld] = 0x%llx\n", __func__, 2308 (uint64_t)npa, l3idx, pd[l3idx]); 2309 } 2310 2311 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2312 if (pd == NULL) 2313 panic("%s: can't locate PD page @ pa=0x%llx", __func__, 2314 (uint64_t)npa); 2315 2316 /* npa = physaddr of PT page */ 2317 npa = pd[l2idx] & PMAP_PA_MASK; 2318 2319 /* Valid PDE for the 2MB region containing va? 
*/ 2320 if (!npa) { 2321 /* No valid PDE - allocate PT page and set PDE */ 2322 2323 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); 2324 2325 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) 2326 panic("%s: can't locate PT page", __func__); 2327 2328 pd[l2idx] = (npa | PG_RW | PG_V); 2329 2330 DPRINTF("%s: allocated new PT page at phys 0x%llx, " 2331 "setting PDE[%lld] = 0x%llx\n", __func__, 2332 (uint64_t)npa, l2idx, pd[l2idx]); 2333 } 2334 2335 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2336 if (pd == NULL) 2337 panic("%s: can't locate PT page @ pa=0x%llx", __func__, 2338 (uint64_t)npa); 2339 2340 DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot " 2341 "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd, 2342 (uint64_t)prot, (uint64_t)pd[l1idx]); 2343 2344 pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W; 2345 2346 /* 2347 * Look up the corresponding U+K entry. If we're installing the 2348 * same PA into the U-K map then set the PG_G bit on both and copy 2349 * the cache-control bits from the U+K entry to the U-K entry. 2350 */ 2351 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 2352 if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) { 2353 if (((pd[l1idx] ^ ptes[offs]) & PG_FRAME) == 0) { 2354 pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT)); 2355 ptes[offs] |= PG_G; 2356 } else { 2357 DPRINTF("%s: special diffing mapping at %llx\n", 2358 __func__, (long long)va); 2359 } 2360 } else 2361 DPRINTF("%s: no U+K mapping for special mapping?\n", __func__); 2362 2363 DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]); 2364 } 2365 2366 void 2367 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa) 2368 { 2369 vaddr_t v; 2370 #if NVMM > 0 2371 struct vmx_invept_descriptor vid; 2372 #endif /* NVMM > 0 */ 2373 2374 DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa, 2375 (uint64_t)egpa); 2376 for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE) 2377 pmap_do_remove_ept(pmap, v); 2378 2379 #if NVMM > 0 2380 if (pmap->eptp != 0) { 2381 memset(&vid, 0, sizeof(vid)); 2382 vid.vid_eptp = pmap->eptp; 2383 DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__, 2384 vid.vid_eptp); 2385 invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid); 2386 } 2387 #endif /* NVMM > 0 */ 2388 } 2389 2390 void 2391 pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa) 2392 { 2393 uint64_t l4idx, l3idx, l2idx, l1idx; 2394 struct vm_page *pg3, *pg2, *pg1; 2395 paddr_t npa3, npa2, npa1; 2396 pd_entry_t *pd4, *pd3, *pd2, *pd1; 2397 pd_entry_t *pptes; 2398 2399 l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */ 2400 l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ 2401 l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */ 2402 l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */ 2403 2404 /* Start at PML4 / top level */ 2405 pd4 = (pd_entry_t *)pmap->pm_pdir; 2406 2407 if (pd4 == NULL) 2408 return; 2409 2410 /* npa3 = physaddr of PDPT */ 2411 npa3 = pd4[l4idx] & PMAP_PA_MASK; 2412 if (!npa3) 2413 return; 2414 pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3); 2415 pg3 = PHYS_TO_VM_PAGE(npa3); 2416 2417 /* npa2 = physaddr of PD page */ 2418 npa2 = pd3[l3idx] & PMAP_PA_MASK; 2419 if (!npa2) 2420 return; 2421 pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2); 2422 pg2 = PHYS_TO_VM_PAGE(npa2); 2423 2424 /* npa1 = physaddr of PT page */ 2425 npa1 = pd2[l2idx] & PMAP_PA_MASK; 2426 if (!npa1) 2427 return; 2428 pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1); 2429 pg1 = PHYS_TO_VM_PAGE(npa1); 2430 2431 if (pd1[l1idx] == 0) 2432 return; 2433 2434 pd1[l1idx] = 0; 2435 
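	/*
	 * The leaf EPT PTE is now clear.  Drop this pmap's resident count
	 * and the PT page's wire_count; if only the wire_count's initial
	 * self-reference remains, the PT page is empty, so free it and
	 * clear the PDE that pointed to it.  The same check then cascades
	 * up through the PD and PDPT pages in the code below.
	 */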
pg1->wire_count--; 2436 pmap->pm_stats.resident_count--; 2437 2438 if (pg1->wire_count > 1) 2439 return; 2440 2441 pg1->wire_count = 0; 2442 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2); 2443 pptes[l2idx] = 0; 2444 uvm_pagefree(pg1); 2445 pmap->pm_stats.resident_count--; 2446 2447 pg2->wire_count--; 2448 if (pg2->wire_count > 1) 2449 return; 2450 2451 pg2->wire_count = 0; 2452 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3); 2453 pptes[l3idx] = 0; 2454 uvm_pagefree(pg2); 2455 pmap->pm_stats.resident_count--; 2456 2457 pg3->wire_count--; 2458 if (pg3->wire_count > 1) 2459 return; 2460 2461 pg3->wire_count = 0; 2462 pptes = pd4; 2463 pptes[l4idx] = 0; 2464 uvm_pagefree(pg3); 2465 pmap->pm_stats.resident_count--; 2466 } 2467 2468 int 2469 pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot) 2470 { 2471 uint64_t l4idx, l3idx, l2idx, l1idx; 2472 pd_entry_t *pd, npte; 2473 struct vm_page *ptp, *pptp; 2474 paddr_t npa; 2475 struct uvm_object *obj; 2476 2477 if (gpa > MAXDSIZ) 2478 return ENOMEM; 2479 2480 l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */ 2481 l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ 2482 l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */ 2483 l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */ 2484 2485 /* Start at PML4 / top level */ 2486 pd = (pd_entry_t *)pmap->pm_pdir; 2487 2488 if (pd == NULL) 2489 return ENOMEM; 2490 2491 /* npa = physaddr of PDPT */ 2492 npa = pd[l4idx] & PMAP_PA_MASK; 2493 2494 /* Valid PML4e for the 512GB region containing gpa? */ 2495 if (!npa) { 2496 /* No valid PML4e - allocate PDPT page and set PML4e */ 2497 obj = &pmap->pm_obj[2]; /* PML4 UVM object */ 2498 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL, 2499 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2500 2501 if (ptp == NULL) 2502 return ENOMEM; 2503 2504 /* 2505 * New PDPT page - we are setting the first entry, so set 2506 * the wired count to 1 2507 */ 2508 ptp->wire_count = 1; 2509 2510 /* Calculate phys address of this new PDPT page */ 2511 npa = VM_PAGE_TO_PHYS(ptp); 2512 2513 /* 2514 * Higher levels get full perms; specific permissions are 2515 * entered at the lowest level. 2516 */ 2517 pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X); 2518 2519 pmap->pm_stats.resident_count++; 2520 2521 pptp = ptp; 2522 } else { 2523 /* Already allocated PML4e */ 2524 pptp = PHYS_TO_VM_PAGE(npa); 2525 } 2526 2527 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2528 if (pd == NULL) 2529 panic("%s: can't locate PDPT @ pa=0x%llx", __func__, 2530 (uint64_t)npa); 2531 2532 /* npa = physaddr of PD page */ 2533 npa = pd[l3idx] & PMAP_PA_MASK; 2534 2535 /* Valid PDPTe for the 1GB region containing gpa? */ 2536 if (!npa) { 2537 /* No valid PDPTe - allocate PD page and set PDPTe */ 2538 obj = &pmap->pm_obj[1]; /* PDPT UVM object */ 2539 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL, 2540 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2541 2542 if (ptp == NULL) 2543 return ENOMEM; 2544 2545 /* 2546 * New PD page - we are setting the first entry, so set 2547 * the wired count to 1 2548 */ 2549 ptp->wire_count = 1; 2550 pptp->wire_count++; 2551 2552 npa = VM_PAGE_TO_PHYS(ptp); 2553 2554 /* 2555 * Higher levels get full perms; specific permissions are 2556 * entered at the lowest level. 
		 */
		pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X);

		pmap->pm_stats.resident_count++;

		pptp = ptp;
	} else {
		/* Already allocated PDPTe */
		pptp = PHYS_TO_VM_PAGE(npa);
	}

	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
	if (pd == NULL)
		panic("%s: can't locate PD page @ pa=0x%llx", __func__,
		    (uint64_t)npa);

	/* npa = physaddr of PT page */
	npa = pd[l2idx] & PMAP_PA_MASK;

	/* Valid PDE for the 2MB region containing gpa? */
	if (!npa) {
		/* No valid PDE - allocate PT page and set PDE */
		obj = &pmap->pm_obj[0];	/* PDE UVM object */
		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL,
		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);

		if (ptp == NULL)
			return ENOMEM;

		pptp->wire_count++;

		npa = VM_PAGE_TO_PHYS(ptp);

		/*
		 * Higher levels get full perms; specific permissions are
		 * entered at the lowest level.
		 */
		pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X);

		pmap->pm_stats.resident_count++;

	} else {
		/* Find final ptp */
		ptp = PHYS_TO_VM_PAGE(npa);
		if (ptp == NULL)
			panic("%s: ptp page vanished?", __func__);
	}

	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
	if (pd == NULL)
		panic("%s: can't locate PT page @ pa=0x%llx", __func__,
		    (uint64_t)npa);

	npte = hpa | EPT_WB;
	if (prot & PROT_READ)
		npte |= EPT_R;
	if (prot & PROT_WRITE)
		npte |= EPT_W;
	if (prot & PROT_EXEC)
		npte |= EPT_X;

	if (pd[l1idx] == 0) {
		ptp->wire_count++;
		pmap->pm_stats.resident_count++;
	} else {
		/* XXX flush ept */
	}

	pd[l1idx] = npte;

	return 0;
}

/*
 * pmap_enter: enter a mapping into a pmap
 *
 * => must be done "now" ...
no lazy-evaluation 2634 */ 2635 2636 int 2637 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags) 2638 { 2639 pt_entry_t opte, npte; 2640 struct vm_page *ptp, *pg = NULL; 2641 struct pv_entry *pve, *opve = NULL; 2642 int ptpdelta, wireddelta, resdelta; 2643 int wired = (flags & PMAP_WIRED) != 0; 2644 int nocache = (pa & PMAP_NOCACHE) != 0; 2645 int wc = (pa & PMAP_WC) != 0; 2646 int error, shootself; 2647 paddr_t scr3; 2648 2649 if (pmap->pm_type == PMAP_TYPE_EPT) 2650 return pmap_enter_ept(pmap, va, pa, prot); 2651 2652 KASSERT(!(wc && nocache)); 2653 pa &= PMAP_PA_MASK; 2654 2655 #ifdef DIAGNOSTIC 2656 if (va == (vaddr_t) PDP_BASE) 2657 panic("%s: trying to map over PDP!", __func__); 2658 2659 /* sanity check: kernel PTPs should already have been pre-allocated */ 2660 if (va >= VM_MIN_KERNEL_ADDRESS && 2661 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 2662 panic("%s: missing kernel PTP for va %lx!", __func__, va); 2663 2664 #endif 2665 2666 pve = pool_get(&pmap_pv_pool, PR_NOWAIT); 2667 if (pve == NULL) { 2668 if (flags & PMAP_CANFAIL) { 2669 error = ENOMEM; 2670 goto out; 2671 } 2672 panic("%s: no pv entries available", __func__); 2673 } 2674 2675 /* 2676 * map in ptes and get a pointer to our PTP (unless we are the kernel) 2677 */ 2678 2679 scr3 = pmap_map_ptes(pmap); 2680 shootself = (scr3 == 0); 2681 if (pmap == pmap_kernel()) { 2682 ptp = NULL; 2683 } else { 2684 ptp = pmap_get_ptp(pmap, va); 2685 if (ptp == NULL) { 2686 if (flags & PMAP_CANFAIL) { 2687 pmap_unmap_ptes(pmap, scr3); 2688 error = ENOMEM; 2689 goto out; 2690 } 2691 panic("%s: get ptp failed", __func__); 2692 } 2693 } 2694 opte = PTE_BASE[pl1_i(va)]; /* old PTE */ 2695 2696 /* 2697 * is there currently a valid mapping at our VA? 2698 */ 2699 2700 if (pmap_valid_entry(opte)) { 2701 /* 2702 * first, calculate pm_stats updates. resident count will not 2703 * change since we are replacing/changing a valid mapping. 2704 * wired count might change... 2705 */ 2706 2707 resdelta = 0; 2708 if (wired && (opte & PG_W) == 0) 2709 wireddelta = 1; 2710 else if (!wired && (opte & PG_W) != 0) 2711 wireddelta = -1; 2712 else 2713 wireddelta = 0; 2714 ptpdelta = 0; 2715 2716 /* 2717 * is the currently mapped PA the same as the one we 2718 * want to map? 
2719 */ 2720 2721 if ((opte & PG_FRAME) == pa) { 2722 2723 /* if this is on the PVLIST, sync R/M bit */ 2724 if (opte & PG_PVLIST) { 2725 pg = PHYS_TO_VM_PAGE(pa); 2726 #ifdef DIAGNOSTIC 2727 if (pg == NULL) 2728 panic("%s: same pa, PG_PVLIST " 2729 "mapping with unmanaged page: " 2730 "va 0x%lx, opte 0x%llx, pa 0x%lx", 2731 __func__, va, opte, pa); 2732 #endif 2733 pmap_sync_flags_pte(pg, opte); 2734 } else { 2735 #ifdef DIAGNOSTIC 2736 if (PHYS_TO_VM_PAGE(pa) != NULL) 2737 panic("%s: same pa, no PG_PVLIST " 2738 "mapping with managed page: " 2739 "va 0x%lx, opte 0x%llx, pa 0x%lx", 2740 __func__, va, opte, pa); 2741 #endif 2742 } 2743 goto enter_now; 2744 } 2745 2746 /* 2747 * changing PAs: we must remove the old one first 2748 */ 2749 2750 /* 2751 * if current mapping is on a pvlist, 2752 * remove it (sync R/M bits) 2753 */ 2754 2755 if (opte & PG_PVLIST) { 2756 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2757 #ifdef DIAGNOSTIC 2758 if (pg == NULL) 2759 panic("%s: PG_PVLIST mapping with unmanaged " 2760 "page: va 0x%lx, opte 0x%llx, pa 0x%lx", 2761 __func__, va, opte, pa); 2762 #endif 2763 pmap_sync_flags_pte(pg, opte); 2764 opve = pmap_remove_pv(pg, pmap, va); 2765 pg = NULL; /* This is not the page we are looking for */ 2766 } 2767 } else { /* opte not valid */ 2768 resdelta = 1; 2769 if (wired) 2770 wireddelta = 1; 2771 else 2772 wireddelta = 0; 2773 if (ptp != NULL) 2774 ptpdelta = 1; 2775 else 2776 ptpdelta = 0; 2777 } 2778 2779 /* 2780 * pve is either NULL or points to a now-free pv_entry structure 2781 * (the latter case is if we called pmap_remove_pv above). 2782 * 2783 * if this entry is to be on a pvlist, enter it now. 2784 */ 2785 2786 if (pmap_initialized) 2787 pg = PHYS_TO_VM_PAGE(pa); 2788 2789 if (pg != NULL) { 2790 pmap_enter_pv(pg, pve, pmap, va, ptp); 2791 pve = NULL; 2792 } 2793 2794 enter_now: 2795 /* 2796 * at this point pg is !NULL if we want the PG_PVLIST bit set 2797 */ 2798 2799 pmap->pm_stats.resident_count += resdelta; 2800 pmap->pm_stats.wired_count += wireddelta; 2801 if (ptp != NULL) 2802 ptp->wire_count += ptpdelta; 2803 2804 KASSERT(pg == PHYS_TO_VM_PAGE(pa)); 2805 2806 npte = pa | protection_codes[prot] | PG_V; 2807 if (pg != NULL) { 2808 npte |= PG_PVLIST; 2809 /* 2810 * make sure that if the page is write combined all 2811 * instances of pmap_enter make it so. 2812 */ 2813 if (pg->pg_flags & PG_PMAP_WC) { 2814 KASSERT(nocache == 0); 2815 wc = 1; 2816 } 2817 } 2818 if (wc) 2819 npte |= pmap_pg_wc; 2820 if (wired) 2821 npte |= PG_W; 2822 if (nocache) 2823 npte |= PG_N; 2824 if (va < VM_MAXUSER_ADDRESS) 2825 npte |= PG_u; 2826 else if (va < VM_MAX_ADDRESS) 2827 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 2828 if (pmap == pmap_kernel()) 2829 npte |= pg_g_kern; 2830 2831 PTE_BASE[pl1_i(va)] = npte; /* zap! */ 2832 2833 /* 2834 * If we changed anything other than modified/used bits, 2835 * flush the TLB. (is this overkill?) 
2836 */ 2837 if (pmap_valid_entry(opte)) { 2838 if (nocache && (opte & PG_N) == 0) 2839 wbinvd_on_all_cpus(); 2840 pmap_tlb_shootpage(pmap, va, shootself); 2841 } 2842 2843 pmap_unmap_ptes(pmap, scr3); 2844 pmap_tlb_shootwait(); 2845 2846 error = 0; 2847 2848 out: 2849 if (pve != NULL) 2850 pool_put(&pmap_pv_pool, pve); 2851 if (opve != NULL) 2852 pool_put(&pmap_pv_pool, opve); 2853 2854 return error; 2855 } 2856 2857 int 2858 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 2859 { 2860 struct vm_page *ptp; 2861 struct pmap *kpm = pmap_kernel(); 2862 2863 if (uvm.page_init_done == 0) { 2864 vaddr_t va; 2865 2866 /* 2867 * we're growing the kernel pmap early (from 2868 * uvm_pageboot_alloc()). this case must be 2869 * handled a little differently. 2870 */ 2871 2872 va = pmap_steal_memory(PAGE_SIZE, NULL, NULL); 2873 *paddrp = PMAP_DIRECT_UNMAP(va); 2874 } else { 2875 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 2876 ptp_va2o(va, level), NULL, 2877 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2878 if (ptp == NULL) 2879 panic("%s: out of memory", __func__); 2880 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 2881 ptp->wire_count = 1; 2882 *paddrp = VM_PAGE_TO_PHYS(ptp); 2883 } 2884 kpm->pm_stats.resident_count++; 2885 return 1; 2886 } 2887 2888 /* 2889 * Allocate the amount of specified ptps for a ptp level, and populate 2890 * all levels below accordingly, mapping virtual addresses starting at 2891 * kva. 2892 * 2893 * Used by pmap_growkernel. 2894 */ 2895 void 2896 pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps) 2897 { 2898 unsigned long i; 2899 vaddr_t va; 2900 paddr_t pa; 2901 unsigned long index, endindex; 2902 int level; 2903 pd_entry_t *pdep; 2904 2905 for (level = lvl; level > 1; level--) { 2906 if (level == PTP_LEVELS) 2907 pdep = pmap_kernel()->pm_pdir; 2908 else 2909 pdep = normal_pdes[level - 2]; 2910 va = kva; 2911 index = pl_i(kva, level); 2912 endindex = index + needed_ptps[level - 1]; 2913 /* 2914 * XXX special case for first time call. 2915 */ 2916 if (nkptp[level - 1] != 0) 2917 index++; 2918 else 2919 endindex--; 2920 2921 for (i = index; i <= endindex; i++) { 2922 pmap_get_physpage(va, level - 1, &pa); 2923 pdep[i] = pa | PG_RW | PG_V | pg_nx; 2924 nkptp[level - 1]++; 2925 va += nbpd[level - 1]; 2926 } 2927 } 2928 } 2929 2930 /* 2931 * pmap_growkernel: increase usage of KVM space 2932 * 2933 * => we allocate new PTPs for the kernel and install them in all 2934 * the pmaps on the system. 2935 */ 2936 2937 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS; 2938 2939 vaddr_t 2940 pmap_growkernel(vaddr_t maxkvaddr) 2941 { 2942 struct pmap *kpm = pmap_kernel(), *pm; 2943 int s, i; 2944 unsigned newpdes; 2945 long needed_kptp[PTP_LEVELS], target_nptp, old; 2946 2947 if (maxkvaddr <= pmap_maxkvaddr) 2948 return pmap_maxkvaddr; 2949 2950 maxkvaddr = x86_round_pdr(maxkvaddr); 2951 old = nkptp[PTP_LEVELS - 1]; 2952 /* 2953 * This loop could be optimized more, but pmap_growkernel() 2954 * is called infrequently. 2955 */ 2956 for (i = PTP_LEVELS - 1; i >= 1; i--) { 2957 target_nptp = pl_i(maxkvaddr, i + 1) - 2958 pl_i(VM_MIN_KERNEL_ADDRESS, i + 1); 2959 /* 2960 * XXX only need to check toplevel. 2961 */ 2962 if (target_nptp > nkptpmax[i]) 2963 panic("%s: out of KVA space", __func__); 2964 needed_kptp[i] = target_nptp - nkptp[i] + 1; 2965 } 2966 2967 2968 s = splhigh(); /* to be safe */ 2969 pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 2970 2971 /* 2972 * If the number of top level entries changed, update all 2973 * pmaps. 
2974 */ 2975 if (needed_kptp[PTP_LEVELS - 1] != 0) { 2976 newpdes = nkptp[PTP_LEVELS - 1] - old; 2977 LIST_FOREACH(pm, &pmaps, pm_list) { 2978 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 2979 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 2980 newpdes * sizeof (pd_entry_t)); 2981 } 2982 } 2983 pmap_maxkvaddr = maxkvaddr; 2984 splx(s); 2985 2986 return maxkvaddr; 2987 } 2988 2989 vaddr_t 2990 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end) 2991 { 2992 int segno; 2993 u_int npg; 2994 vaddr_t va; 2995 paddr_t pa; 2996 struct vm_physseg *seg; 2997 2998 size = round_page(size); 2999 npg = atop(size); 3000 3001 for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) { 3002 if (seg->avail_end - seg->avail_start < npg) 3003 continue; 3004 /* 3005 * We can only steal at an ``unused'' segment boundary, 3006 * i.e. either at the start or at the end. 3007 */ 3008 if (seg->avail_start == seg->start || 3009 seg->avail_end == seg->end) 3010 break; 3011 } 3012 if (segno == vm_nphysseg) { 3013 panic("%s: out of memory", __func__); 3014 } else { 3015 if (seg->avail_start == seg->start) { 3016 pa = ptoa(seg->avail_start); 3017 seg->avail_start += npg; 3018 seg->start += npg; 3019 } else { 3020 pa = ptoa(seg->avail_end) - size; 3021 seg->avail_end -= npg; 3022 seg->end -= npg; 3023 } 3024 /* 3025 * If all the segment has been consumed now, remove it. 3026 * Note that the crash dump code still knows about it 3027 * and will dump it correctly. 3028 */ 3029 if (seg->start == seg->end) { 3030 if (vm_nphysseg-- == 1) 3031 panic("%s: out of memory", __func__); 3032 while (segno < vm_nphysseg) { 3033 seg[0] = seg[1]; /* struct copy */ 3034 seg++; 3035 segno++; 3036 } 3037 } 3038 3039 va = PMAP_DIRECT_MAP(pa); 3040 memset((void *)va, 0, size); 3041 } 3042 3043 if (start != NULL) 3044 *start = virtual_avail; 3045 if (end != NULL) 3046 *end = VM_MAX_KERNEL_ADDRESS; 3047 3048 return (va); 3049 } 3050 3051 void 3052 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp) 3053 { 3054 *vstartp = virtual_avail; 3055 *vendp = VM_MAX_KERNEL_ADDRESS; 3056 } 3057 3058 /* 3059 * pmap_convert 3060 * 3061 * Converts 'pmap' to the new 'mode'. 3062 * 3063 * Parameters: 3064 * pmap: the pmap to convert 3065 * mode: the new mode (see pmap.h, PMAP_TYPE_xxx) 3066 * 3067 * Return value: 3068 * always 0 3069 */ 3070 int 3071 pmap_convert(struct pmap *pmap, int mode) 3072 { 3073 pt_entry_t *pte; 3074 3075 pmap->pm_type = mode; 3076 3077 if (mode == PMAP_TYPE_EPT) { 3078 /* Clear PML4 */ 3079 pte = (pt_entry_t *)pmap->pm_pdir; 3080 memset(pte, 0, PAGE_SIZE); 3081 3082 /* Give back the meltdown pdir */ 3083 if (pmap->pm_pdir_intel != NULL) { 3084 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); 3085 pmap->pm_pdir_intel = NULL; 3086 } 3087 } 3088 3089 return (0); 3090 } 3091 3092 #ifdef MULTIPROCESSOR 3093 /* 3094 * Locking for tlb shootdown. 3095 * 3096 * We lock by setting tlb_shoot_wait to the number of cpus that will 3097 * receive our tlb shootdown. After sending the IPIs, we don't need to 3098 * worry about locking order or interrupts spinning for the lock because 3099 * the call that grabs the "lock" isn't the one that releases it. And 3100 * there is nothing that can block the IPI that releases the lock. 3101 * 3102 * The functions are organized so that we first count the number of 3103 * cpus we need to send the IPI to, then we grab the counter, then 3104 * we send the IPIs, then we finally do our own shootdown. 3105 * 3106 * Our shootdown is last to make it parallel with the other cpus 3107 * to shorten the spin time. 
 *
 * Notice that we depend on failures to send IPIs only being able to
 * happen during boot. If they happen later, the above assumption
 * doesn't hold since we can end up in situations where no one will
 * release the lock if we get an interrupt in a bad moment.
 *
 * a simplified sketch of this counting protocol appears at the end of
 * this file.
 */
#ifdef MP_LOCKDEBUG
#include <ddb/db_output.h>
extern int __mp_lock_spinout;
#endif

volatile long tlb_shoot_wait __attribute__((section(".kudata")));

volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));

void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;
	int is_kva = va >= VM_MIN_KERNEL_ADDRESS;

	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
			continue;
		if (!is_kva && !pmap_is_active(pm, ci->ci_cpuid))
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	if (wait > 0) {
		int s = splvm();

		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
#ifdef MP_LOCKDEBUG
			int nticks = __mp_lock_spinout;
#endif
			while (tlb_shoot_wait != 0) {
				CPU_BUSY_CYCLE();
#ifdef MP_LOCKDEBUG

				if (--nticks <= 0) {
					db_printf("%s: spun out", __func__);
					db_enter();
					nticks = __mp_lock_spinout;
				}
#endif
			}
		}
		tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
		tlb_shoot_addr1 = va;
		CPU_INFO_FOREACH(cii, ci) {
			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
				continue;
			if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
				panic("%s: ipi failed", __func__);
		}
		splx(s);
	}

	if (!pmap_use_pcid) {
		if (shootself)
			pmap_update_pg(va);
	} else if (is_kva) {
		invpcid(INVPCID_ADDR, PCID_PROC, va);
		invpcid(INVPCID_ADDR, PCID_KERN, va);
	} else if (shootself) {
		invpcid(INVPCID_ADDR, PCID_PROC, va);
		if (cpu_meltdown)
			invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
	}
}

void
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;
	int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
	vaddr_t va;

	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
			continue;
		if (!is_kva && !pmap_is_active(pm, ci->ci_cpuid))
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	if (wait > 0) {
		int s = splvm();

		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
#ifdef MP_LOCKDEBUG
			int nticks = __mp_lock_spinout;
#endif
			while (tlb_shoot_wait != 0) {
				CPU_BUSY_CYCLE();
#ifdef MP_LOCKDEBUG

				if (--nticks <= 0) {
					db_printf("%s: spun out", __func__);
					db_enter();
					nticks = __mp_lock_spinout;
				}
#endif
			}
		}
		tlb_shoot_first_pcid = is_kva ?
PCID_KERN : PCID_PROC; 3225 tlb_shoot_addr1 = sva; 3226 tlb_shoot_addr2 = eva; 3227 CPU_INFO_FOREACH(cii, ci) { 3228 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 3229 continue; 3230 if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0) 3231 panic("%s: ipi failed", __func__); 3232 } 3233 splx(s); 3234 } 3235 3236 if (!pmap_use_pcid) { 3237 if (shootself) { 3238 for (va = sva; va < eva; va += PAGE_SIZE) 3239 pmap_update_pg(va); 3240 } 3241 } else if (is_kva) { 3242 for (va = sva; va < eva; va += PAGE_SIZE) { 3243 invpcid(INVPCID_ADDR, PCID_PROC, va); 3244 invpcid(INVPCID_ADDR, PCID_KERN, va); 3245 } 3246 } else if (shootself) { 3247 if (cpu_meltdown) { 3248 for (va = sva; va < eva; va += PAGE_SIZE) { 3249 invpcid(INVPCID_ADDR, PCID_PROC, va); 3250 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3251 } 3252 } else { 3253 for (va = sva; va < eva; va += PAGE_SIZE) 3254 invpcid(INVPCID_ADDR, PCID_PROC, va); 3255 } 3256 } 3257 } 3258 3259 void 3260 pmap_tlb_shoottlb(struct pmap *pm, int shootself) 3261 { 3262 struct cpu_info *ci, *self = curcpu(); 3263 CPU_INFO_ITERATOR cii; 3264 long wait = 0; 3265 u_int64_t mask = 0; 3266 3267 KASSERT(pm != pmap_kernel()); 3268 3269 CPU_INFO_FOREACH(cii, ci) { 3270 if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) || 3271 !(ci->ci_flags & CPUF_RUNNING)) 3272 continue; 3273 mask |= (1ULL << ci->ci_cpuid); 3274 wait++; 3275 } 3276 3277 if (wait) { 3278 int s = splvm(); 3279 3280 while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) { 3281 #ifdef MP_LOCKDEBUG 3282 int nticks = __mp_lock_spinout; 3283 #endif 3284 while (tlb_shoot_wait != 0) { 3285 CPU_BUSY_CYCLE(); 3286 #ifdef MP_LOCKDEBUG 3287 3288 if (--nticks <= 0) { 3289 db_printf("%s: spun out", __func__); 3290 db_enter(); 3291 nticks = __mp_lock_spinout; 3292 } 3293 #endif 3294 } 3295 } 3296 3297 CPU_INFO_FOREACH(cii, ci) { 3298 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 3299 continue; 3300 if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0) 3301 panic("%s: ipi failed", __func__); 3302 } 3303 splx(s); 3304 } 3305 3306 if (shootself) { 3307 if (!pmap_use_pcid) 3308 tlbflush(); 3309 else { 3310 invpcid(INVPCID_PCID, PCID_PROC, 0); 3311 if (cpu_meltdown) 3312 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0); 3313 } 3314 } 3315 } 3316 3317 void 3318 pmap_tlb_shootwait(void) 3319 { 3320 #ifdef MP_LOCKDEBUG 3321 int nticks = __mp_lock_spinout; 3322 #endif 3323 while (tlb_shoot_wait != 0) { 3324 CPU_BUSY_CYCLE(); 3325 #ifdef MP_LOCKDEBUG 3326 if (--nticks <= 0) { 3327 db_printf("%s: spun out", __func__); 3328 db_enter(); 3329 nticks = __mp_lock_spinout; 3330 } 3331 #endif 3332 } 3333 } 3334 3335 #else /* MULTIPROCESSOR */ 3336 3337 void 3338 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself) 3339 { 3340 if (!pmap_use_pcid) { 3341 if (shootself) 3342 pmap_update_pg(va); 3343 } else if (va >= VM_MIN_KERNEL_ADDRESS) { 3344 invpcid(INVPCID_ADDR, PCID_PROC, va); 3345 invpcid(INVPCID_ADDR, PCID_KERN, va); 3346 } else if (shootself) { 3347 invpcid(INVPCID_ADDR, PCID_PROC, va); 3348 if (cpu_meltdown) 3349 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3350 } 3351 } 3352 3353 void 3354 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself) 3355 { 3356 vaddr_t va; 3357 3358 if (!pmap_use_pcid) { 3359 if (shootself) { 3360 for (va = sva; va < eva; va += PAGE_SIZE) 3361 pmap_update_pg(va); 3362 } 3363 } else if (sva >= VM_MIN_KERNEL_ADDRESS) { 3364 for (va = sva; va < eva; va += PAGE_SIZE) { 3365 invpcid(INVPCID_ADDR, PCID_PROC, va); 3366 invpcid(INVPCID_ADDR, PCID_KERN, va); 3367 } 3368 } else if (shootself) { 3369 if 
(cpu_meltdown) { 3370 for (va = sva; va < eva; va += PAGE_SIZE) { 3371 invpcid(INVPCID_ADDR, PCID_PROC, va); 3372 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3373 } 3374 } else { 3375 for (va = sva; va < eva; va += PAGE_SIZE) 3376 invpcid(INVPCID_ADDR, PCID_PROC, va); 3377 } 3378 } 3379 } 3380 3381 void 3382 pmap_tlb_shoottlb(struct pmap *pm, int shootself) 3383 { 3384 if (shootself) { 3385 if (!pmap_use_pcid) 3386 tlbflush(); 3387 else { 3388 invpcid(INVPCID_PCID, PCID_PROC, 0); 3389 if (cpu_meltdown) 3390 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0); 3391 } 3392 } 3393 } 3394 #endif /* MULTIPROCESSOR */ 3395
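
/*
 * Illustrative sketch, not part of the original pmap.c: the TLB
 * shootdown "lock" described in the comment above the MULTIPROCESSOR
 * functions, reduced to its essentials.  The initiator counts the
 * remote CPUs, installs that count in tlb_shoot_wait with a
 * compare-and-swap (spinning while a previous shootdown is still
 * draining), publishes the address, sends the IPIs and then does its
 * own invalidation in parallel while each remote handler counts the
 * "lock" back down to zero.  example_shootpage() and
 * example_ipi_handler() are hypothetical names; the PCID and
 * MP_LOCKDEBUG handling of the real code is omitted, and the real IPI
 * handlers are defined elsewhere in the kernel.
 */
#if 0
void
example_shootpage(struct pmap *pm, vaddr_t va)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;

	/* count the CPUs that must invalidate this mapping */
	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self || !(ci->ci_flags & CPUF_RUNNING) ||
		    !pmap_is_active(pm, ci->ci_cpuid))
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	if (wait > 0) {
		int s = splvm();

		/* "grab the lock": wait until the previous shootdown drains */
		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
			while (tlb_shoot_wait != 0)
				CPU_BUSY_CYCLE();
		}

		/* publish the address, then send the IPIs */
		tlb_shoot_addr1 = va;
		CPU_INFO_FOREACH(cii, ci) {
			if (mask & (1ULL << ci->ci_cpuid))
				x86_fast_ipi(ci, LAPIC_IPI_INVLPG);
		}
		splx(s);
	}

	/* our own invalidation runs in parallel with the remote CPUs */
	pmap_update_pg(va);
}

/* each remote CPU invalidates, then releases its share of the "lock" */
void
example_ipi_handler(void)
{
	pmap_update_pg(tlb_shoot_addr1);
	atomic_dec_long((volatile unsigned long *)&tlb_shoot_wait);
}
#endif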