1 /* $OpenBSD: pmap.c,v 1.99 2016/06/07 06:23:19 dlg Exp $ */ 2 /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1997 Charles D. Cranor and Washington University. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Copyright 2001 (c) Wasabi Systems, Inc. 31 * All rights reserved. 32 * 33 * Written by Frank van der Linden for Wasabi Systems, Inc. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgement: 45 * This product includes software developed for the NetBSD Project by 46 * Wasabi Systems, Inc. 47 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 48 * or promote products derived from this software without specific prior 49 * written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 53 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 54 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 55 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 56 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 57 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 58 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 59 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 60 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 61 * POSSIBILITY OF SUCH DAMAGE. 62 */ 63 64 /* 65 * This is the i386 pmap modified and generalized to support x86-64 66 * as well. 
The idea is to hide the upper N levels of the page tables 67 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 68 * is mostly untouched, except that it uses some more generalized 69 * macros and interfaces. 70 * 71 * This pmap has been tested on the i386 as well, and it can be easily 72 * adapted to PAE. 73 * 74 * fvdl@wasabisystems.com 18-Jun-2001 75 */ 76 77 /* 78 * pmap.c: i386 pmap module rewrite 79 * Chuck Cranor <chuck@ccrc.wustl.edu> 80 * 11-Aug-97 81 * 82 * history of this pmap module: in addition to my own input, i used 83 * the following references for this rewrite of the i386 pmap: 84 * 85 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 86 * BSD hp300 pmap done by Mike Hibler at University of Utah. 87 * it was then ported to the i386 by William Jolitz of UUNET 88 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 89 * project fixed some bugs and provided some speed ups. 90 * 91 * [2] the FreeBSD i386 pmap. this pmap seems to be the 92 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 93 * and David Greenman. 94 * 95 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 96 * between several processors. the VAX version was done by 97 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 98 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 99 * David Golub, and Richard Draves. the alpha version was 100 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 101 * (NetBSD/alpha). 102 */ 103 104 #include <sys/param.h> 105 #include <sys/systm.h> 106 #include <sys/atomic.h> 107 #include <sys/proc.h> 108 #include <sys/malloc.h> 109 #include <sys/pool.h> 110 #include <sys/user.h> 111 #include <sys/kernel.h> 112 #include <sys/mutex.h> 113 #include <sys/sched.h> 114 115 #include <uvm/uvm.h> 116 117 #include <machine/lock.h> 118 #include <machine/cpu.h> 119 #include <machine/specialreg.h> 120 #ifdef MULTIPROCESSOR 121 #include <machine/i82489reg.h> 122 #include <machine/i82489var.h> 123 #endif 124 125 126 #include <machine/isa_machdep.h> 127 128 #include "acpi.h" 129 130 /* 131 * general info: 132 * 133 * - for an explanation of how the i386 MMU hardware works see 134 * the comments in <machine/pte.h>. 135 * 136 * - for an explanation of the general memory structure used by 137 * this pmap (including the recursive mapping), see the comments 138 * in <machine/pmap.h>. 139 * 140 * this file contains the code for the "pmap module." the module's 141 * job is to manage the hardware's virtual to physical address mappings. 142 * note that there are two levels of mapping in the VM system: 143 * 144 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 145 * to map ranges of virtual address space to objects/files. for 146 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 147 * to the file /bin/ls starting at offset zero." note that 148 * the upper layer mapping is not concerned with how individual 149 * vm_pages are mapped. 150 * 151 * [2] the lower layer of the VM system (the pmap) maintains the mappings 152 * from virtual addresses. it is concerned with which vm_page is 153 * mapped where. for example, when you run /bin/ls and start 154 * at page 0x1000 the fault routine may lookup the correct page 155 * of the /bin/ls file and then ask the pmap layer to establish 156 * a mapping for it. 157 * 158 * note that information in the lower layer of the VM system can be 159 * thrown away since it can easily be reconstructed from the info 160 * in the upper layer. 
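 *
 * as a rough illustration (not a literal call chain; the names here
 * are simplified for illustration only), a fault on a page of /bin/ls
 * mapped as above ends up asking this module for a mapping
 * something like:
 *
 *	pg = <vm_page of /bin/ls found by the upper layer fault code>
 *	pmap_enter(map->pmap, trunc_page(fault_va),
 *	    VM_PAGE_TO_PHYS(pg), PROT_READ, PMAP_CANFAIL);
 *
 * if the pmap-level mapping is later thrown away, the upper layer
 * simply re-establishes it on the next fault.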
161 *
162 * data structures we use include:
163 * - struct pmap: describes the address space of one process
164 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
165 * - struct pg_to_free: a list of PTP pages queued to be freed once
166 * the TLB shootdown that removed their mappings has completed.
167 */
168
169 /*
170 * memory allocation
171 *
172 * - there are three data structures that we must dynamically allocate:
173 *
174 * [A] new process' page directory page (PDP)
175 * - plan 1: done at pmap_create() we use
176 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this
177 * allocation.
178 *
179 * if we are low in free physical memory then we sleep in
180 * uvm_km_alloc -- in this case this is ok since we are creating
181 * a new pmap and should not be holding any locks.
182 *
183 * if the kernel is totally out of virtual space
184 * (i.e. uvm_km_alloc returns NULL), then we panic.
185 *
186 * XXX: the fork code currently has no way to return an "out of
187 * memory, try again" error code since uvm_fork [fka vm_fork]
188 * is a void function.
189 *
190 * [B] new page table pages (PTP)
191 * call uvm_pagealloc()
192 * => success: zero page, add to pm_pdir
193 * => failure: we are out of free vm_pages, let pmap_enter()
194 * tell UVM about it.
195 *
196 * note: for kernel PTPs, we start with NKPTP of them. as we map
197 * kernel memory (at uvm_map time) we check to see if we've grown
198 * the kernel pmap. if so, we call the optional function
199 * pmap_growkernel() to grow the kernel PTPs in advance.
200 *
201 * [C] pv_entry structures
202 * - try to allocate one from the pool.
203 * If we fail, we simply let pmap_enter() tell UVM about it.
204 */
205
206 vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
207 int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
208 long nkptp[] = NKPTP_INITIALIZER;
209 long nkptpmax[] = NKPTPMAX_INITIALIZER;
210 long nbpd[] = NBPD_INITIALIZER;
211 pd_entry_t *normal_pdes[] = PDES_INITIALIZER;
212
213 #define pmap_pte_set(p, n) atomic_swap_64(p, n)
214 #define pmap_pte_clearbits(p, b) x86_atomic_clearbits_u64(p, b)
215 #define pmap_pte_setbits(p, b) x86_atomic_setbits_u64(p, b)
216
217 /*
218 * global data structures
219 */
220
221 struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
222
223 /*
224 * pmap_pg_wc: if our processor supports PAT then we set this
225 * to be the pte bits for Write Combining. Else we fall back to
226 * UC- so mtrrs can override the cacheability.
227 */
228 int pmap_pg_wc = PG_UCMINUS;
229
230 /*
231 * other data structures
232 */
233
234 pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
235 boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
236
237 /*
238 * pv management structures.
239 */
240 struct pool pmap_pv_pool;
241
242 /*
243 * linked list of all non-kernel pmaps
244 */
245
246 struct pmap_head pmaps;
247
248 /*
249 * pool that pmap structures are allocated from
250 */
251
252 struct pool pmap_pmap_pool;
253
254 /*
255 * When we're freeing a ptp, we need to delay the freeing until all
256 * tlb shootdown has been done. This is the list of the to-be-freed pages.
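 *
 * the usage pattern (see pmap_do_remove() and pmap_page_remove() for
 * the real thing; this is only an illustrative sketch) is roughly:
 *
 *	struct pg_to_free empty_ptps;
 *
 *	TAILQ_INIT(&empty_ptps);
 *	... pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps); ...
 *	pmap_tlb_shootwait();
 *	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
 *		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
 *		uvm_pagefree(ptp);
 *	}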
257 */ 258 TAILQ_HEAD(pg_to_free, vm_page); 259 260 /* 261 * pool that PDPs are allocated from 262 */ 263 264 struct pool pmap_pdp_pool; 265 void pmap_pdp_ctor(pd_entry_t *); 266 267 extern vaddr_t msgbuf_vaddr; 268 extern paddr_t msgbuf_paddr; 269 270 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 271 extern paddr_t idt_paddr; 272 273 extern vaddr_t lo32_vaddr; 274 extern vaddr_t lo32_paddr; 275 276 vaddr_t virtual_avail; 277 extern int end; 278 279 /* 280 * local prototypes 281 */ 282 283 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *, 284 vaddr_t, struct vm_page *); 285 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **); 286 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 287 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs); 288 void pmap_free_ptp(struct pmap *, struct vm_page *, 289 vaddr_t, pt_entry_t *, pd_entry_t **, struct pg_to_free *); 290 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *); 291 static boolean_t pmap_is_active(struct pmap *, int); 292 void pmap_map_ptes(struct pmap *, pt_entry_t **, pd_entry_t ***, paddr_t *); 293 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t); 294 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 295 boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 296 vaddr_t, int, struct pv_entry **); 297 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, 298 vaddr_t, vaddr_t, int, struct pv_entry **); 299 #define PMAP_REMOVE_ALL 0 /* remove all mappings */ 300 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 301 302 void pmap_unmap_ptes(struct pmap *, paddr_t); 303 boolean_t pmap_get_physpage(vaddr_t, int, paddr_t *); 304 boolean_t pmap_pdes_valid(vaddr_t, pd_entry_t **, pd_entry_t *); 305 void pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *); 306 307 void pmap_sync_flags_pte(struct vm_page *, u_long); 308 309 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int); 310 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int); 311 void pmap_tlb_shoottlb(struct pmap *, int); 312 #ifdef MULTIPROCESSOR 313 void pmap_tlb_shootwait(void); 314 #else 315 #define pmap_tlb_shootwait() 316 #endif 317 318 319 /* 320 * p m a p i n l i n e h e l p e r f u n c t i o n s 321 */ 322 323 /* 324 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 325 * of course the kernel is always loaded 326 */ 327 328 static __inline boolean_t 329 pmap_is_curpmap(struct pmap *pmap) 330 { 331 return((pmap == pmap_kernel()) || 332 (pmap->pm_pdirpa == (paddr_t) rcr3())); 333 } 334 335 /* 336 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 337 */ 338 339 static __inline boolean_t 340 pmap_is_active(struct pmap *pmap, int cpu_id) 341 { 342 return (pmap == pmap_kernel() || 343 (pmap->pm_cpus & (1ULL << cpu_id)) != 0); 344 } 345 346 static __inline u_int 347 pmap_pte2flags(u_long pte) 348 { 349 return (((pte & PG_U) ? PG_PMAP_REF : 0) | 350 ((pte & PG_M) ? 
PG_PMAP_MOD : 0)); 351 } 352 353 void 354 pmap_sync_flags_pte(struct vm_page *pg, u_long pte) 355 { 356 if (pte & (PG_U|PG_M)) { 357 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte)); 358 } 359 } 360 361 /* 362 * pmap_map_ptes: map a pmap's PTEs into KVM 363 */ 364 365 void 366 pmap_map_ptes(struct pmap *pmap, pt_entry_t **ptepp, pd_entry_t ***pdeppp, paddr_t *save_cr3) 367 { 368 paddr_t cr3 = rcr3(); 369 370 /* the kernel's pmap is always accessible */ 371 if (pmap == pmap_kernel() || pmap->pm_pdirpa == cr3) { 372 *save_cr3 = 0; 373 } else { 374 *save_cr3 = cr3; 375 376 /* 377 * Not sure if we need this, but better be safe. 378 * We don't have the current pmap in order to unset its 379 * active bit, but this just means that we may receive 380 * an unneccessary cross-CPU TLB flush now and then. 381 */ 382 x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number())); 383 384 lcr3(pmap->pm_pdirpa); 385 } 386 387 if (pmap != pmap_kernel()) 388 mtx_enter(&pmap->pm_mtx); 389 390 *ptepp = PTE_BASE; 391 *pdeppp = normal_pdes; 392 } 393 394 void 395 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3) 396 { 397 if (pmap != pmap_kernel()) 398 mtx_leave(&pmap->pm_mtx); 399 400 if (save_cr3 != 0) { 401 x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number())); 402 lcr3(save_cr3); 403 } 404 } 405 406 /* 407 * pmap_fix_ept 408 * 409 * Fixes up an EPT PTE for vaddr 'va' by reconfiguring the low bits to 410 * conform to the EPT format (separate R/W/X bits and various "must be 411 * 0 bits") 412 * 413 * Parameters: 414 * pm: The pmap in question 415 * va: The VA to fix up 416 */ 417 void 418 pmap_fix_ept(struct pmap *pm, vaddr_t va) 419 { 420 u_long mask, shift; 421 pd_entry_t pde, *pd; 422 paddr_t pdpa; 423 int lev, offs; 424 425 pdpa = pm->pm_pdirpa; 426 shift = L4_SHIFT; 427 mask = L4_MASK; 428 for (lev = PTP_LEVELS; lev > 0; lev--) { 429 pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa); 430 offs = (VA_SIGN_POS(va) & mask) >> shift; 431 432 pd[offs] |= EPT_R | EPT_W | EPT_X; 433 /* 434 * Levels 3-4 have bits 3:7 'must be 0' 435 * Level 2 has bits 3:6 'must be 0', and bit 7 is always 436 * 0 in our EPT format (thus, bits 3:7 == 0) 437 */ 438 switch(lev) { 439 case 4: 440 case 3: 441 case 2: 442 /* Bits 3:7 = 0 */ 443 pd[offs] &= ~(0xF8); 444 break; 445 case 1: pd[offs] |= EPT_WB; 446 break; 447 } 448 449 pde = pd[offs]; 450 451 /* Large pages are different, break early if we run into one. */ 452 if ((pde & (PG_PS|PG_V)) != PG_V) 453 panic("pmap_fix_ept: large page in EPT"); 454 455 pdpa = (pd[offs] & PG_FRAME); 456 /* 4096/8 == 512 == 2^9 entries per level */ 457 shift -= 9; 458 mask >>= 9; 459 } 460 } 461 462 int 463 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs) 464 { 465 u_long mask, shift; 466 pd_entry_t pde; 467 paddr_t pdpa; 468 int lev; 469 470 pdpa = pm->pm_pdirpa; 471 shift = L4_SHIFT; 472 mask = L4_MASK; 473 for (lev = PTP_LEVELS; lev > 0; lev--) { 474 *pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa); 475 *offs = (VA_SIGN_POS(va) & mask) >> shift; 476 pde = (*pd)[*offs]; 477 478 /* Large pages are different, break early if we run into one. */ 479 if ((pde & (PG_PS|PG_V)) != PG_V) 480 return (lev - 1); 481 482 pdpa = ((*pd)[*offs] & PG_FRAME); 483 /* 4096/8 == 512 == 2^9 entries per level */ 484 shift -= 9; 485 mask >>= 9; 486 } 487 488 return (0); 489 } 490 491 492 /* 493 * p m a p k e n t e r f u n c t i o n s 494 * 495 * functions to quickly enter/remove pages from the kernel address 496 * space. pmap_kremove is exported to MI kernel. 
we make use of 497 * the recursive PTE mappings. 498 */ 499 500 /* 501 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 502 * 503 * => no need to lock anything, assume va is already allocated 504 * => should be faster than normal pmap enter function 505 */ 506 507 void 508 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 509 { 510 pt_entry_t *pte, opte, npte; 511 512 pte = kvtopte(va); 513 514 npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) | 515 ((pa & PMAP_NOCACHE) ? PG_N : 0) | 516 ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V; 517 518 /* special 1:1 mappings in the first 2MB must not be global */ 519 if (va >= (vaddr_t)NBPD_L2) 520 npte |= PG_G; 521 522 if (!(prot & PROT_EXEC)) 523 npte |= pg_nx; 524 opte = pmap_pte_set(pte, npte); 525 #ifdef LARGEPAGES 526 /* XXX For now... */ 527 if (opte & PG_PS) 528 panic("%s: PG_PS", __func__); 529 #endif 530 if (pmap_valid_entry(opte)) { 531 if (pa & PMAP_NOCACHE && (opte & PG_N) == 0) 532 wbinvd(); 533 /* This shouldn't happen */ 534 pmap_tlb_shootpage(pmap_kernel(), va, 1); 535 pmap_tlb_shootwait(); 536 } 537 } 538 539 /* 540 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 541 * 542 * => no need to lock anything 543 * => caller must dispose of any vm_page mapped in the va range 544 * => note: not an inline function 545 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 546 * => we assume kernel only unmaps valid addresses and thus don't bother 547 * checking the valid bit before doing TLB flushing 548 */ 549 550 void 551 pmap_kremove(vaddr_t sva, vsize_t len) 552 { 553 pt_entry_t *pte, opte; 554 vaddr_t va, eva; 555 556 eva = sva + len; 557 558 for (va = sva; va != eva; va += PAGE_SIZE) { 559 pte = kvtopte(va); 560 561 opte = pmap_pte_set(pte, 0); 562 #ifdef LARGEPAGES 563 KASSERT((opte & PG_PS) == 0); 564 #endif 565 KASSERT((opte & PG_PVLIST) == 0); 566 } 567 568 pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1); 569 pmap_tlb_shootwait(); 570 } 571 572 /* 573 * p m a p i n i t f u n c t i o n s 574 * 575 * pmap_bootstrap and pmap_init are called during system startup 576 * to init the pmap module. pmap_bootstrap() does a low level 577 * init just to get things rolling. pmap_init() finishes the job. 578 */ 579 580 /* 581 * pmap_bootstrap: get the system in a state where it can run with VM 582 * properly enabled (called before main()). the VM system is 583 * fully init'd later... 584 * 585 * => on i386, locore.s has already enabled the MMU by allocating 586 * a PDP for the kernel, and nkpde PTP's for the kernel. 587 * => kva_start is the first free virtual address in kernel space 588 */ 589 590 paddr_t 591 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) 592 { 593 vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS; 594 struct pmap *kpm; 595 int i; 596 unsigned long p1i; 597 long ndmpdp; 598 paddr_t dmpd, dmpdp; 599 600 /* 601 * define the boundaries of the managed kernel virtual address 602 * space. 603 */ 604 605 virtual_avail = kva_start; /* first free KVA */ 606 607 /* 608 * set up protection_codes: we need to be able to convert from 609 * a MI protection code (some combo of VM_PROT...) to something 610 * we can jam into a i386 PTE. 
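 *
 * worked example (a sketch; pmap_enter() below is where this really
 * happens, and it may also add PG_PVLIST, PG_W and the cache bits):
 * a read-only user mapping ends up roughly as
 *
 *	npte = pa | protection_codes[PROT_READ] | PG_V | PG_u
 *	     = pa | PG_RO | pg_nx | PG_V | PG_u
 *
 * i.e. present, user-accessible, not writable, not executable.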
611 */ 612 613 protection_codes[PROT_NONE] = pg_nx; /* --- */ 614 protection_codes[PROT_EXEC] = PG_RO; /* --x */ 615 protection_codes[PROT_READ] = PG_RO | pg_nx; /* -r- */ 616 protection_codes[PROT_READ | PROT_EXEC] = PG_RO; /* -rx */ 617 protection_codes[PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 618 protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW; /* w-x */ 619 protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */ 620 protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW; /* wrx */ 621 622 /* 623 * now we init the kernel's pmap 624 * 625 * the kernel pmap's pm_obj is not used for much. however, in 626 * user pmaps the pm_obj contains the list of active PTPs. 627 * the pm_obj currently does not have a pager. it might be possible 628 * to add a pager that would allow a process to read-only mmap its 629 * own page tables (fast user level vtophys?). this may or may not 630 * be useful. 631 */ 632 633 kpm = pmap_kernel(); 634 for (i = 0; i < PTP_LEVELS - 1; i++) { 635 uvm_objinit(&kpm->pm_obj[i], NULL, 1); 636 kpm->pm_ptphint[i] = NULL; 637 } 638 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 639 kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE); 640 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3; 641 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 642 atop(kva_start - VM_MIN_KERNEL_ADDRESS); 643 644 kpm->pm_type = PMAP_TYPE_NORMAL; 645 646 /* 647 * the above is just a rough estimate and not critical to the proper 648 * operation of the system. 649 */ 650 651 curpcb->pcb_pmap = kpm; /* proc0's pcb */ 652 653 /* 654 * enable global TLB entries. 655 */ 656 /* add PG_G attribute to already mapped kernel pages */ 657 #if KERNBASE == VM_MIN_KERNEL_ADDRESS 658 for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; 659 #else 660 kva_end = roundup((vaddr_t)&end, PAGE_SIZE); 661 for (kva = KERNBASE; kva < kva_end ; 662 #endif 663 kva += PAGE_SIZE) { 664 p1i = pl1_i(kva); 665 if (pmap_valid_entry(PTE_BASE[p1i])) 666 PTE_BASE[p1i] |= PG_G; 667 } 668 669 /* 670 * Map the direct map. The first 4GB were mapped in locore, here 671 * we map the rest if it exists. We actually use the direct map 672 * here to set up the page tables, we're assuming that we're still 673 * operating in the lower 4GB of memory. 
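 *
 * sizing sketch (assuming the usual amd64 constants, i.e. each L3
 * slot covers NBPD_L3 = 1GB and NDML2_ENTRIES = 4): with max_pa of
 * 6GB, ndmpdp below works out to 6, so 6 pages of L2 tables are
 * carved out of first_avail and filled with 2MB large-page entries,
 * while the first 4 L3 slots keep the tables locore already built;
 * a machine with only 2GB of RAM still gets the 4GB minimum.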
674 */ 675 ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT; 676 if (ndmpdp < NDML2_ENTRIES) 677 ndmpdp = NDML2_ENTRIES; /* At least 4GB */ 678 679 dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME; 680 681 dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE; 682 683 for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) { 684 paddr_t pdp; 685 vaddr_t va; 686 687 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]); 688 va = PMAP_DIRECT_MAP(pdp); 689 690 *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT); 691 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | PG_G | PG_U | 692 PG_M | pg_nx; 693 } 694 695 for (i = NDML2_ENTRIES; i < ndmpdp; i++) { 696 paddr_t pdp; 697 vaddr_t va; 698 699 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 700 va = PMAP_DIRECT_MAP(pdp); 701 702 *((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT); 703 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx; 704 } 705 706 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U | 707 PG_M | pg_nx; 708 709 tlbflush(); 710 711 msgbuf_vaddr = virtual_avail; 712 virtual_avail += round_page(MSGBUFSIZE); 713 714 idt_vaddr = virtual_avail; 715 virtual_avail += 2 * PAGE_SIZE; 716 idt_paddr = first_avail; /* steal a page */ 717 first_avail += 2 * PAGE_SIZE; 718 719 #if defined(MULTIPROCESSOR) || \ 720 (NACPI > 0 && !defined(SMALL_KERNEL)) 721 /* 722 * Grab a page below 4G for things that need it (i.e. 723 * having an initial %cr3 for the MP trampoline). 724 */ 725 lo32_vaddr = virtual_avail; 726 virtual_avail += PAGE_SIZE; 727 lo32_paddr = first_avail; 728 first_avail += PAGE_SIZE; 729 #endif 730 731 /* 732 * init the global lists. 733 */ 734 LIST_INIT(&pmaps); 735 736 /* 737 * initialize the pmap pool. 738 */ 739 740 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, 741 "pmappl", NULL); 742 pool_setipl(&pmap_pmap_pool, IPL_NONE); 743 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, 0, 0, "pvpl", 744 &pool_allocator_single); 745 pool_setipl(&pmap_pv_pool, IPL_VM); 746 pool_sethiwat(&pmap_pv_pool, 32 * 1024); 747 748 /* 749 * initialize the PDE pool. 750 */ 751 752 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, PR_WAITOK, "pdppl", NULL); 753 pool_setipl(&pmap_pdp_pool, IPL_NONE); 754 755 /* 756 * ensure the TLB is sync'd with reality by flushing it... 757 */ 758 759 tlbflush(); 760 761 return first_avail; 762 } 763 764 /* 765 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 766 * trampoline code can be entered. 767 */ 768 paddr_t 769 pmap_prealloc_lowmem_ptps(paddr_t first_avail) 770 { 771 pd_entry_t *pdes; 772 int level; 773 paddr_t newp; 774 775 pdes = pmap_kernel()->pm_pdir; 776 level = PTP_LEVELS; 777 for (;;) { 778 newp = first_avail; first_avail += PAGE_SIZE; 779 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE); 780 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 781 level--; 782 if (level <= 1) 783 break; 784 pdes = normal_pdes[level - 2]; 785 } 786 787 return first_avail; 788 } 789 790 /* 791 * pmap_init: called from uvm_init, our job is to get the pmap 792 * system ready to manage mappings... this mainly means initing 793 * the pv_entry stuff. 
794 */ 795 796 void 797 pmap_init(void) 798 { 799 /* 800 * done: pmap module is up (and ready for business) 801 */ 802 803 pmap_initialized = TRUE; 804 } 805 806 /* 807 * p v _ e n t r y f u n c t i o n s 808 */ 809 810 /* 811 * main pv_entry manipulation functions: 812 * pmap_enter_pv: enter a mapping onto a pv list 813 * pmap_remove_pv: remove a mapping from a pv list 814 */ 815 816 /* 817 * pmap_enter_pv: enter a mapping onto a pv list 818 * 819 * => caller should adjust ptp's wire_count before calling 820 * 821 * pve: preallocated pve for us to use 822 * ptp: PTP in pmap that maps this VA 823 */ 824 825 void 826 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap, 827 vaddr_t va, struct vm_page *ptp) 828 { 829 pve->pv_pmap = pmap; 830 pve->pv_va = va; 831 pve->pv_ptp = ptp; /* NULL for kernel pmap */ 832 mtx_enter(&pg->mdpage.pv_mtx); 833 pve->pv_next = pg->mdpage.pv_list; /* add to ... */ 834 pg->mdpage.pv_list = pve; /* ... list */ 835 mtx_leave(&pg->mdpage.pv_mtx); 836 } 837 838 /* 839 * pmap_remove_pv: try to remove a mapping from a pv_list 840 * 841 * => caller should adjust ptp's wire_count and free PTP if needed 842 * => we return the removed pve 843 */ 844 845 struct pv_entry * 846 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va) 847 { 848 struct pv_entry *pve, **prevptr; 849 850 mtx_enter(&pg->mdpage.pv_mtx); 851 prevptr = &pg->mdpage.pv_list; 852 while ((pve = *prevptr) != NULL) { 853 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */ 854 *prevptr = pve->pv_next; /* remove it! */ 855 break; 856 } 857 prevptr = &pve->pv_next; /* previous pointer */ 858 } 859 mtx_leave(&pg->mdpage.pv_mtx); 860 return(pve); /* return removed pve */ 861 } 862 863 /* 864 * p t p f u n c t i o n s 865 */ 866 867 struct vm_page * 868 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 869 { 870 int lidx = level - 1; 871 struct vm_page *pg; 872 873 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 874 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) 875 return (pmap->pm_ptphint[lidx]); 876 877 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 878 879 return pg; 880 } 881 882 void 883 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level, 884 struct pg_to_free *pagelist) 885 { 886 int lidx; 887 struct uvm_object *obj; 888 889 lidx = level - 1; 890 891 obj = &pmap->pm_obj[lidx]; 892 pmap->pm_stats.resident_count--; 893 if (pmap->pm_ptphint[lidx] == ptp) 894 pmap->pm_ptphint[lidx] = RB_ROOT(&obj->memt); 895 ptp->wire_count = 0; 896 uvm_pagerealloc(ptp, NULL, 0); 897 TAILQ_INSERT_TAIL(pagelist, ptp, pageq); 898 } 899 900 void 901 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 902 pt_entry_t *ptes, pd_entry_t **pdes, struct pg_to_free *pagelist) 903 { 904 unsigned long index; 905 int level; 906 vaddr_t invaladdr; 907 pd_entry_t opde; 908 909 level = 1; 910 do { 911 pmap_freepage(pmap, ptp, level, pagelist); 912 index = pl_i(va, level + 1); 913 opde = pmap_pte_set(&pdes[level - 1][index], 0); 914 invaladdr = level == 1 ? (vaddr_t)ptes : 915 (vaddr_t)pdes[level - 2]; 916 pmap_tlb_shootpage(curpcb->pcb_pmap, 917 invaladdr + index * PAGE_SIZE, 918 pmap_is_curpmap(curpcb->pcb_pmap)); 919 #if defined(MULTIPROCESSOR) 920 invaladdr = level == 1 ? 
(vaddr_t)PTE_BASE : 921 (vaddr_t)normal_pdes[level - 2]; 922 pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE, 923 pmap_is_curpmap(curpcb->pcb_pmap)); 924 #endif 925 if (level < PTP_LEVELS - 1) { 926 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 927 ptp->wire_count--; 928 if (ptp->wire_count > 1) 929 break; 930 } 931 } while (++level < PTP_LEVELS); 932 } 933 934 /* 935 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 936 * 937 * => pmap should NOT be pmap_kernel() 938 */ 939 940 941 struct vm_page * 942 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) 943 { 944 struct vm_page *ptp, *pptp; 945 int i; 946 unsigned long index; 947 pd_entry_t *pva; 948 paddr_t ppa, pa; 949 struct uvm_object *obj; 950 951 ptp = NULL; 952 pa = (paddr_t)-1; 953 954 /* 955 * Loop through all page table levels seeing if we need to 956 * add a new page to that level. 957 */ 958 for (i = PTP_LEVELS; i > 1; i--) { 959 /* 960 * Save values from previous round. 961 */ 962 pptp = ptp; 963 ppa = pa; 964 965 index = pl_i(va, i); 966 pva = pdes[i - 2]; 967 968 if (pmap_valid_entry(pva[index])) { 969 ppa = pva[index] & PG_FRAME; 970 ptp = NULL; 971 continue; 972 } 973 974 obj = &pmap->pm_obj[i-2]; 975 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 976 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 977 978 if (ptp == NULL) 979 return NULL; 980 981 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 982 ptp->wire_count = 1; 983 pmap->pm_ptphint[i - 2] = ptp; 984 pa = VM_PAGE_TO_PHYS(ptp); 985 pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); 986 pmap->pm_stats.resident_count++; 987 /* 988 * If we're not in the top level, increase the 989 * wire count of the parent page. 990 */ 991 if (i < PTP_LEVELS) { 992 if (pptp == NULL) 993 pptp = pmap_find_ptp(pmap, va, ppa, i); 994 #ifdef DIAGNOSTIC 995 if (pptp == NULL) 996 panic("%s: pde page disappeared", __func__); 997 #endif 998 pptp->wire_count++; 999 } 1000 } 1001 1002 /* 1003 * ptp is not NULL if we just allocated a new ptp. If it's 1004 * still NULL, we must look up the existing one. 1005 */ 1006 if (ptp == NULL) { 1007 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1008 #ifdef DIAGNOSTIC 1009 if (ptp == NULL) { 1010 printf("va %lx ppa %lx\n", (unsigned long)va, 1011 (unsigned long)ppa); 1012 panic("%s: unmanaged user PTP", __func__); 1013 } 1014 #endif 1015 } 1016 1017 pmap->pm_ptphint[0] = ptp; 1018 return(ptp); 1019 } 1020 1021 /* 1022 * p m a p l i f e c y c l e f u n c t i o n s 1023 */ 1024 1025 /* 1026 * pmap_pdp_ctor: constructor for the PDP cache. 1027 */ 1028 1029 void 1030 pmap_pdp_ctor(pd_entry_t *pdir) 1031 { 1032 paddr_t pdirpa; 1033 int npde; 1034 1035 /* fetch the physical address of the page directory. 
*/ 1036 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 1037 1038 /* zero init area */ 1039 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 1040 1041 /* put in recursive PDE to map the PTEs */ 1042 pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx; 1043 1044 npde = nkptp[PTP_LEVELS - 1]; 1045 1046 /* put in kernel VM PDEs */ 1047 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 1048 npde * sizeof(pd_entry_t)); 1049 1050 /* zero the rest */ 1051 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 1052 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 1053 1054 pdir[PDIR_SLOT_DIRECT] = pmap_kernel()->pm_pdir[PDIR_SLOT_DIRECT]; 1055 1056 #if VM_MIN_KERNEL_ADDRESS != KERNBASE 1057 pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)]; 1058 #endif 1059 } 1060 1061 /* 1062 * pmap_create: create a pmap 1063 * 1064 * => note: old pmap interface took a "size" args which allowed for 1065 * the creation of "software only" pmaps (not in bsd). 1066 */ 1067 1068 struct pmap * 1069 pmap_create(void) 1070 { 1071 struct pmap *pmap; 1072 int i; 1073 1074 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK); 1075 1076 mtx_init(&pmap->pm_mtx, IPL_VM); 1077 1078 /* init uvm_object */ 1079 for (i = 0; i < PTP_LEVELS - 1; i++) { 1080 uvm_objinit(&pmap->pm_obj[i], NULL, 1); 1081 pmap->pm_ptphint[i] = NULL; 1082 } 1083 pmap->pm_stats.wired_count = 0; 1084 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 1085 pmap->pm_cpus = 0; 1086 pmap->pm_type = PMAP_TYPE_NORMAL; 1087 1088 /* allocate PDP */ 1089 1090 /* 1091 * note that there is no need to splvm to protect us from 1092 * malloc since malloc allocates out of a submap and we should 1093 * have already allocated kernel PTPs to cover the range... 1094 */ 1095 1096 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 1097 pmap_pdp_ctor(pmap->pm_pdir); 1098 1099 pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; 1100 1101 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 1102 return (pmap); 1103 } 1104 1105 /* 1106 * pmap_destroy: drop reference count on pmap. free pmap if 1107 * reference count goes to zero. 1108 */ 1109 1110 void 1111 pmap_destroy(struct pmap *pmap) 1112 { 1113 struct vm_page *pg; 1114 int refs; 1115 int i; 1116 1117 /* 1118 * drop reference count 1119 */ 1120 1121 refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs); 1122 if (refs > 0) { 1123 return; 1124 } 1125 1126 /* 1127 * reference count is zero, free pmap resources and then free pmap. 1128 */ 1129 1130 #ifdef DIAGNOSTIC 1131 if (__predict_false(pmap->pm_cpus != 0)) 1132 printf("%s: pmap %p cpus=0x%llx\n", __func__, 1133 (void *)pmap, pmap->pm_cpus); 1134 #endif 1135 1136 /* 1137 * remove it from global list of pmaps 1138 */ 1139 LIST_REMOVE(pmap, pm_list); 1140 1141 /* 1142 * free any remaining PTPs 1143 */ 1144 1145 for (i = 0; i < PTP_LEVELS - 1; i++) { 1146 while ((pg = RB_ROOT(&pmap->pm_obj[i].memt)) != NULL) { 1147 KASSERT((pg->pg_flags & PG_BUSY) == 0); 1148 1149 pg->wire_count = 0; 1150 uvm_pagefree(pg); 1151 } 1152 } 1153 1154 /* XXX: need to flush it out of other processor's space? */ 1155 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 1156 1157 pool_put(&pmap_pmap_pool, pmap); 1158 } 1159 1160 /* 1161 * Add a reference to the specified pmap. 
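 *
 * note: a reference taken here must eventually be dropped with
 * pmap_destroy(). pmap_page_remove() below shows the pattern when
 * working on a pmap found through a pv list, roughly:
 *
 *	pmap_reference(pve->pv_pmap);
 *	... remove the mapping from that pmap ...
 *	pmap_destroy(pve->pv_pmap);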
1162 */ 1163 1164 void 1165 pmap_reference(struct pmap *pmap) 1166 { 1167 atomic_inc_int(&pmap->pm_obj[0].uo_refs); 1168 } 1169 1170 /* 1171 * pmap_activate: activate a process' pmap (fill in %cr3) 1172 * 1173 * => called from cpu_fork() and when switching pmaps during exec 1174 * => if p is the curproc, then load it into the MMU 1175 */ 1176 1177 void 1178 pmap_activate(struct proc *p) 1179 { 1180 struct pcb *pcb = &p->p_addr->u_pcb; 1181 struct pmap *pmap = p->p_vmspace->vm_map.pmap; 1182 1183 pcb->pcb_pmap = pmap; 1184 pcb->pcb_cr3 = pmap->pm_pdirpa; 1185 if (p == curproc) { 1186 lcr3(pcb->pcb_cr3); 1187 1188 /* 1189 * mark the pmap in use by this processor. 1190 */ 1191 x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number())); 1192 } 1193 } 1194 1195 /* 1196 * pmap_deactivate: deactivate a process' pmap 1197 */ 1198 1199 void 1200 pmap_deactivate(struct proc *p) 1201 { 1202 struct pmap *pmap = p->p_vmspace->vm_map.pmap; 1203 1204 /* 1205 * mark the pmap no longer in use by this processor. 1206 */ 1207 x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number())); 1208 } 1209 1210 /* 1211 * end of lifecycle functions 1212 */ 1213 1214 /* 1215 * some misc. functions 1216 */ 1217 1218 boolean_t 1219 pmap_pdes_valid(vaddr_t va, pd_entry_t **pdes, pd_entry_t *lastpde) 1220 { 1221 int i; 1222 unsigned long index; 1223 pd_entry_t pde; 1224 1225 for (i = PTP_LEVELS; i > 1; i--) { 1226 index = pl_i(va, i); 1227 pde = pdes[i - 2][index]; 1228 if (!pmap_valid_entry(pde)) 1229 return FALSE; 1230 } 1231 if (lastpde != NULL) 1232 *lastpde = pde; 1233 return TRUE; 1234 } 1235 1236 /* 1237 * pmap_extract: extract a PA for the given VA 1238 */ 1239 1240 boolean_t 1241 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 1242 { 1243 pt_entry_t *ptes; 1244 int level, offs; 1245 1246 if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE && 1247 va < PMAP_DIRECT_END) { 1248 *pap = va - PMAP_DIRECT_BASE; 1249 return (TRUE); 1250 } 1251 1252 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 1253 1254 if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) { 1255 if (pap != NULL) 1256 *pap = (ptes[offs] & PG_FRAME) | (va & PAGE_MASK); 1257 return (TRUE); 1258 } 1259 if (level == 1 && (ptes[offs] & (PG_PS|PG_V)) == (PG_PS|PG_V)) { 1260 if (pap != NULL) 1261 *pap = (ptes[offs] & PG_LGFRAME) | (va & PAGE_MASK_L2); 1262 return (TRUE); 1263 } 1264 1265 return FALSE; 1266 } 1267 1268 /* 1269 * pmap_zero_page: zero a page 1270 */ 1271 1272 void 1273 pmap_zero_page(struct vm_page *pg) 1274 { 1275 pagezero(pmap_map_direct(pg)); 1276 } 1277 1278 /* 1279 * pmap_flush_cache: flush the cache for a virtual address. 1280 */ 1281 void 1282 pmap_flush_cache(vaddr_t addr, vsize_t len) 1283 { 1284 vaddr_t i; 1285 1286 if (curcpu()->ci_cflushsz == 0) { 1287 wbinvd(); 1288 return; 1289 } 1290 1291 /* all cpus that have clflush also have mfence. 
*/ 1292 mfence(); 1293 for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz) 1294 clflush(i); 1295 mfence(); 1296 } 1297 1298 /* 1299 * pmap_copy_page: copy a page 1300 */ 1301 1302 void 1303 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg) 1304 { 1305 vaddr_t srcva = pmap_map_direct(srcpg); 1306 vaddr_t dstva = pmap_map_direct(dstpg); 1307 1308 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 1309 } 1310 1311 /* 1312 * p m a p r e m o v e f u n c t i o n s 1313 * 1314 * functions that remove mappings 1315 */ 1316 1317 /* 1318 * pmap_remove_ptes: remove PTEs from a PTP 1319 * 1320 * => must have proper locking on pmap_master_lock 1321 * => PTP must be mapped into KVA 1322 * => PTP should be null if pmap == pmap_kernel() 1323 */ 1324 1325 void 1326 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 1327 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs) 1328 { 1329 struct pv_entry *pve; 1330 pt_entry_t *pte = (pt_entry_t *) ptpva; 1331 struct vm_page *pg; 1332 pt_entry_t opte; 1333 1334 /* 1335 * note that ptpva points to the PTE that maps startva. this may 1336 * or may not be the first PTE in the PTP. 1337 * 1338 * we loop through the PTP while there are still PTEs to look at 1339 * and the wire_count is greater than 1 (because we use the wire_count 1340 * to keep track of the number of real PTEs in the PTP). 1341 */ 1342 1343 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 1344 ; pte++, startva += PAGE_SIZE) { 1345 if (!pmap_valid_entry(*pte)) 1346 continue; /* VA not mapped */ 1347 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 1348 continue; 1349 } 1350 1351 /* atomically save the old PTE and zap! it */ 1352 opte = pmap_pte_set(pte, 0); 1353 1354 if (opte & PG_W) 1355 pmap->pm_stats.wired_count--; 1356 pmap->pm_stats.resident_count--; 1357 1358 if (ptp) 1359 ptp->wire_count--; /* dropping a PTE */ 1360 1361 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 1362 1363 /* 1364 * if we are not on a pv list we are done. 1365 */ 1366 1367 if ((opte & PG_PVLIST) == 0) { 1368 #ifdef DIAGNOSTIC 1369 if (pg != NULL) 1370 panic("%s: managed page without PG_PVLIST " 1371 "for 0x%lx", __func__, startva); 1372 #endif 1373 continue; 1374 } 1375 1376 #ifdef DIAGNOSTIC 1377 if (pg == NULL) 1378 panic("%s: unmanaged page marked PG_PVLIST, " 1379 "va = 0x%lx, pa = 0x%lx", __func__, 1380 startva, (u_long)(opte & PG_FRAME)); 1381 #endif 1382 1383 /* sync R/M bits */ 1384 pmap_sync_flags_pte(pg, opte); 1385 pve = pmap_remove_pv(pg, pmap, startva); 1386 if (pve) { 1387 pve->pv_next = *free_pvs; 1388 *free_pvs = pve; 1389 } 1390 1391 /* end of "for" loop: time for next pte */ 1392 } 1393 } 1394 1395 1396 /* 1397 * pmap_remove_pte: remove a single PTE from a PTP 1398 * 1399 * => must have proper locking on pmap_master_lock 1400 * => PTP must be mapped into KVA 1401 * => PTP should be null if pmap == pmap_kernel() 1402 * => returns true if we removed a mapping 1403 */ 1404 1405 boolean_t 1406 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 1407 vaddr_t va, int flags, struct pv_entry **free_pvs) 1408 { 1409 struct pv_entry *pve; 1410 struct vm_page *pg; 1411 pt_entry_t opte; 1412 1413 if (!pmap_valid_entry(*pte)) 1414 return(FALSE); /* VA not mapped */ 1415 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 1416 return(FALSE); 1417 } 1418 1419 /* atomically save the old PTE and zap! 
it */ 1420 opte = pmap_pte_set(pte, 0); 1421 1422 if (opte & PG_W) 1423 pmap->pm_stats.wired_count--; 1424 pmap->pm_stats.resident_count--; 1425 1426 if (ptp) 1427 ptp->wire_count--; /* dropping a PTE */ 1428 1429 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 1430 1431 /* 1432 * if we are not on a pv list we are done. 1433 */ 1434 if ((opte & PG_PVLIST) == 0) { 1435 #ifdef DIAGNOSTIC 1436 if (pg != NULL) 1437 panic("%s: managed page without PG_PVLIST for 0x%lx", 1438 __func__, va); 1439 #endif 1440 return(TRUE); 1441 } 1442 1443 #ifdef DIAGNOSTIC 1444 if (pg == NULL) 1445 panic("%s: unmanaged page marked PG_PVLIST, va = 0x%lx, " 1446 "pa = 0x%lx", __func__, va, (u_long)(opte & PG_FRAME)); 1447 #endif 1448 1449 /* sync R/M bits */ 1450 pmap_sync_flags_pte(pg, opte); 1451 pve = pmap_remove_pv(pg, pmap, va); 1452 if (pve) { 1453 pve->pv_next = *free_pvs; 1454 *free_pvs = pve; 1455 } 1456 1457 return(TRUE); 1458 } 1459 1460 /* 1461 * pmap_remove: top level mapping removal function 1462 * 1463 * => caller should not be holding any pmap locks 1464 */ 1465 1466 void 1467 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 1468 { 1469 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 1470 } 1471 1472 /* 1473 * pmap_do_remove: mapping removal guts 1474 * 1475 * => caller should not be holding any pmap locks 1476 */ 1477 1478 void 1479 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 1480 { 1481 pt_entry_t *ptes; 1482 pd_entry_t **pdes, pde; 1483 boolean_t result; 1484 paddr_t ptppa; 1485 vaddr_t blkendva; 1486 struct vm_page *ptp; 1487 struct pv_entry *pve; 1488 struct pv_entry *free_pvs = NULL; 1489 vaddr_t va; 1490 int shootall = 0, shootself; 1491 struct pg_to_free empty_ptps; 1492 paddr_t scr3; 1493 1494 TAILQ_INIT(&empty_ptps); 1495 1496 pmap_map_ptes(pmap, &ptes, &pdes, &scr3); 1497 shootself = (scr3 == 0); 1498 1499 /* 1500 * removing one page? take shortcut function. 1501 */ 1502 1503 if (sva + PAGE_SIZE == eva) { 1504 if (pmap_pdes_valid(sva, pdes, &pde)) { 1505 1506 /* PA of the PTP */ 1507 ptppa = pde & PG_FRAME; 1508 1509 /* get PTP if non-kernel mapping */ 1510 1511 if (pmap == pmap_kernel()) { 1512 /* we never free kernel PTPs */ 1513 ptp = NULL; 1514 } else { 1515 ptp = pmap_find_ptp(pmap, sva, ptppa, 1); 1516 #ifdef DIAGNOSTIC 1517 if (ptp == NULL) 1518 panic("%s: unmanaged PTP detected", 1519 __func__); 1520 #endif 1521 } 1522 1523 /* do it! */ 1524 result = pmap_remove_pte(pmap, ptp, 1525 &ptes[pl1_i(sva)], sva, flags, &free_pvs); 1526 1527 /* 1528 * if mapping removed and the PTP is no longer 1529 * being used, free it! 1530 */ 1531 1532 if (result && ptp && ptp->wire_count <= 1) 1533 pmap_free_ptp(pmap, ptp, sva, ptes, pdes, 1534 &empty_ptps); 1535 pmap_tlb_shootpage(pmap, sva, shootself); 1536 pmap_unmap_ptes(pmap, scr3); 1537 pmap_tlb_shootwait(); 1538 } else { 1539 pmap_unmap_ptes(pmap, scr3); 1540 } 1541 1542 goto cleanup; 1543 } 1544 1545 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel()) 1546 shootall = 1; 1547 1548 for (va = sva; va < eva; va = blkendva) { 1549 /* determine range of block */ 1550 blkendva = x86_round_pdr(va + 1); 1551 if (blkendva > eva) 1552 blkendva = eva; 1553 1554 /* 1555 * XXXCDC: our PTE mappings should never be removed 1556 * with pmap_remove! if we allow this (and why would 1557 * we?) then we end up freeing the pmap's page 1558 * directory page (PDP) before we are finished using 1559 * it when we hit in in the recursive mapping. this 1560 * is BAD. 
1561 * 1562 * long term solution is to move the PTEs out of user 1563 * address space. and into kernel address space (up 1564 * with APTE). then we can set VM_MAXUSER_ADDRESS to 1565 * be VM_MAX_ADDRESS. 1566 */ 1567 1568 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 1569 /* XXXCDC: ugly hack to avoid freeing PDP here */ 1570 continue; 1571 1572 if (!pmap_pdes_valid(va, pdes, &pde)) 1573 continue; 1574 1575 /* PA of the PTP */ 1576 ptppa = pde & PG_FRAME; 1577 1578 /* get PTP if non-kernel mapping */ 1579 if (pmap == pmap_kernel()) { 1580 /* we never free kernel PTPs */ 1581 ptp = NULL; 1582 } else { 1583 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 1584 #ifdef DIAGNOSTIC 1585 if (ptp == NULL) 1586 panic("%s: unmanaged PTP detected", __func__); 1587 #endif 1588 } 1589 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], 1590 va, blkendva, flags, &free_pvs); 1591 1592 /* if PTP is no longer being used, free it! */ 1593 if (ptp && ptp->wire_count <= 1) { 1594 pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps); 1595 } 1596 } 1597 1598 if (shootall) 1599 pmap_tlb_shoottlb(pmap, shootself); 1600 else 1601 pmap_tlb_shootrange(pmap, sva, eva, shootself); 1602 1603 pmap_unmap_ptes(pmap, scr3); 1604 pmap_tlb_shootwait(); 1605 1606 cleanup: 1607 while ((pve = free_pvs) != NULL) { 1608 free_pvs = pve->pv_next; 1609 pool_put(&pmap_pv_pool, pve); 1610 } 1611 1612 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1613 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1614 uvm_pagefree(ptp); 1615 } 1616 } 1617 1618 /* 1619 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 1620 * 1621 * => R/M bits are sync'd back to attrs 1622 */ 1623 1624 void 1625 pmap_page_remove(struct vm_page *pg) 1626 { 1627 struct pv_entry *pve; 1628 struct pmap *pm; 1629 pt_entry_t *ptes, opte; 1630 pd_entry_t **pdes; 1631 #ifdef DIAGNOSTIC 1632 pd_entry_t pde; 1633 #endif 1634 struct pg_to_free empty_ptps; 1635 struct vm_page *ptp; 1636 paddr_t scr3; 1637 int shootself; 1638 1639 TAILQ_INIT(&empty_ptps); 1640 1641 mtx_enter(&pg->mdpage.pv_mtx); 1642 while ((pve = pg->mdpage.pv_list) != NULL) { 1643 pmap_reference(pve->pv_pmap); 1644 pm = pve->pv_pmap; 1645 mtx_leave(&pg->mdpage.pv_mtx); 1646 1647 /* XXX use direct map? */ 1648 pmap_map_ptes(pm, &ptes, &pdes, &scr3); /* locks pmap */ 1649 shootself = (scr3 == 0); 1650 1651 /* 1652 * We dropped the pvlist lock before grabbing the pmap 1653 * lock to avoid lock ordering problems. This means 1654 * we have to check the pvlist again since somebody 1655 * else might have modified it. All we care about is 1656 * that the pvlist entry matches the pmap we just 1657 * locked. If it doesn't, unlock the pmap and try 1658 * again. 
1659 */ 1660 mtx_enter(&pg->mdpage.pv_mtx); 1661 if ((pve = pg->mdpage.pv_list) == NULL || 1662 pve->pv_pmap != pm) { 1663 mtx_leave(&pg->mdpage.pv_mtx); 1664 pmap_unmap_ptes(pm, scr3); /* unlocks pmap */ 1665 pmap_destroy(pm); 1666 mtx_enter(&pg->mdpage.pv_mtx); 1667 continue; 1668 } 1669 1670 pg->mdpage.pv_list = pve->pv_next; 1671 mtx_leave(&pg->mdpage.pv_mtx); 1672 1673 #ifdef DIAGNOSTIC 1674 if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) && 1675 (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { 1676 printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__, 1677 pg, pve->pv_va, pve->pv_ptp); 1678 printf("%s: PTP's phys addr: " 1679 "actual=%lx, recorded=%lx\n", __func__, 1680 (unsigned long)(pde & PG_FRAME), 1681 VM_PAGE_TO_PHYS(pve->pv_ptp)); 1682 panic("%s: mapped managed page has " 1683 "invalid pv_ptp field", __func__); 1684 } 1685 #endif 1686 1687 /* atomically save the old PTE and zap it */ 1688 opte = pmap_pte_set(&ptes[pl1_i(pve->pv_va)], 0); 1689 1690 if (opte & PG_W) 1691 pve->pv_pmap->pm_stats.wired_count--; 1692 pve->pv_pmap->pm_stats.resident_count--; 1693 1694 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself); 1695 1696 pmap_sync_flags_pte(pg, opte); 1697 1698 /* update the PTP reference count. free if last reference. */ 1699 if (pve->pv_ptp) { 1700 pve->pv_ptp->wire_count--; 1701 if (pve->pv_ptp->wire_count <= 1) { 1702 pmap_free_ptp(pve->pv_pmap, pve->pv_ptp, 1703 pve->pv_va, ptes, pdes, &empty_ptps); 1704 } 1705 } 1706 pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */ 1707 pmap_destroy(pve->pv_pmap); 1708 pool_put(&pmap_pv_pool, pve); 1709 mtx_enter(&pg->mdpage.pv_mtx); 1710 } 1711 mtx_leave(&pg->mdpage.pv_mtx); 1712 1713 pmap_tlb_shootwait(); 1714 1715 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1716 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1717 uvm_pagefree(ptp); 1718 } 1719 } 1720 1721 /* 1722 * p m a p a t t r i b u t e f u n c t i o n s 1723 * functions that test/change managed page's attributes 1724 * since a page can be mapped multiple times we must check each PTE that 1725 * maps it by going down the pv lists. 
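 *
 * typical use (a sketch; the MI helpers such as pmap_is_modified()
 * and pmap_clear_modify() are expected to be thin wrappers in pmap.h
 * around these): testing and clearing a page's modified bit amounts to
 *
 *	modified = pmap_test_attrs(pg, PG_M);
 *	(void)pmap_clear_attrs(pg, PG_M);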
1726 */ 1727 1728 /* 1729 * pmap_test_attrs: test a page's attributes 1730 */ 1731 1732 boolean_t 1733 pmap_test_attrs(struct vm_page *pg, unsigned int testbits) 1734 { 1735 struct pv_entry *pve; 1736 pt_entry_t *ptes; 1737 int level, offs; 1738 u_long mybits, testflags; 1739 1740 testflags = pmap_pte2flags(testbits); 1741 1742 if (pg->pg_flags & testflags) 1743 return (TRUE); 1744 1745 mybits = 0; 1746 mtx_enter(&pg->mdpage.pv_mtx); 1747 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0; 1748 pve = pve->pv_next) { 1749 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, 1750 &offs); 1751 mybits |= (ptes[offs] & testbits); 1752 } 1753 mtx_leave(&pg->mdpage.pv_mtx); 1754 1755 if (mybits == 0) 1756 return (FALSE); 1757 1758 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits)); 1759 1760 return (TRUE); 1761 } 1762 1763 /* 1764 * pmap_clear_attrs: change a page's attributes 1765 * 1766 * => we return TRUE if we cleared one of the bits we were asked to 1767 */ 1768 1769 boolean_t 1770 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits) 1771 { 1772 struct pv_entry *pve; 1773 pt_entry_t *ptes, opte; 1774 u_long clearflags; 1775 int result, level, offs; 1776 1777 clearflags = pmap_pte2flags(clearbits); 1778 1779 result = pg->pg_flags & clearflags; 1780 if (result) 1781 atomic_clearbits_int(&pg->pg_flags, clearflags); 1782 1783 mtx_enter(&pg->mdpage.pv_mtx); 1784 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) { 1785 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, 1786 &offs); 1787 opte = ptes[offs]; 1788 if (opte & clearbits) { 1789 result = 1; 1790 pmap_pte_clearbits(&ptes[offs], (opte & clearbits)); 1791 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, 1792 pmap_is_curpmap(pve->pv_pmap)); 1793 } 1794 } 1795 mtx_leave(&pg->mdpage.pv_mtx); 1796 1797 pmap_tlb_shootwait(); 1798 1799 return (result != 0); 1800 } 1801 1802 /* 1803 * p m a p p r o t e c t i o n f u n c t i o n s 1804 */ 1805 1806 /* 1807 * pmap_page_protect: change the protection of all recorded mappings 1808 * of a managed page 1809 * 1810 * => NOTE: this is an inline function in pmap.h 1811 */ 1812 1813 /* see pmap.h */ 1814 1815 /* 1816 * pmap_protect: set the protection in of the pages in a pmap 1817 * 1818 * => NOTE: this is an inline function in pmap.h 1819 */ 1820 1821 /* see pmap.h */ 1822 1823 /* 1824 * pmap_write_protect: write-protect pages in a pmap 1825 */ 1826 1827 void 1828 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 1829 { 1830 pt_entry_t nx, *ptes, *spte, *epte; 1831 pd_entry_t **pdes; 1832 vaddr_t blockend; 1833 int shootall = 0, shootself; 1834 vaddr_t va; 1835 paddr_t scr3; 1836 1837 pmap_map_ptes(pmap, &ptes, &pdes, &scr3); 1838 shootself = (scr3 == 0); 1839 1840 /* should be ok, but just in case ... */ 1841 sva &= PG_FRAME; 1842 eva &= PG_FRAME; 1843 1844 nx = 0; 1845 if (!(prot & PROT_EXEC)) 1846 nx = pg_nx; 1847 1848 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel()) 1849 shootall = 1; 1850 1851 for (va = sva; va < eva ; va = blockend) { 1852 blockend = (va & L2_FRAME) + NBPD_L2; 1853 if (blockend > eva) 1854 blockend = eva; 1855 1856 /* 1857 * XXXCDC: our PTE mappings should never be write-protected! 1858 * 1859 * long term solution is to move the PTEs out of user 1860 * address space. and into kernel address space (up 1861 * with APTE). then we can set VM_MAXUSER_ADDRESS to 1862 * be VM_MAX_ADDRESS. 
1863 */ 1864 1865 /* XXXCDC: ugly hack to avoid freeing PDP here */ 1866 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 1867 continue; 1868 1869 /* empty block? */ 1870 if (!pmap_pdes_valid(va, pdes, NULL)) 1871 continue; 1872 1873 #ifdef DIAGNOSTIC 1874 if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS) 1875 panic("%s: PTE space", __func__); 1876 #endif 1877 1878 spte = &ptes[pl1_i(va)]; 1879 epte = &ptes[pl1_i(blockend)]; 1880 1881 for (/*null */; spte < epte ; spte++) { 1882 if (!pmap_valid_entry(*spte)) 1883 continue; 1884 pmap_pte_clearbits(spte, PG_RW); 1885 pmap_pte_setbits(spte, nx); 1886 } 1887 } 1888 1889 if (shootall) 1890 pmap_tlb_shoottlb(pmap, shootself); 1891 else 1892 pmap_tlb_shootrange(pmap, sva, eva, shootself); 1893 1894 pmap_unmap_ptes(pmap, scr3); 1895 pmap_tlb_shootwait(); 1896 } 1897 1898 /* 1899 * end of protection functions 1900 */ 1901 1902 /* 1903 * pmap_unwire: clear the wired bit in the PTE 1904 * 1905 * => mapping should already be in map 1906 */ 1907 1908 void 1909 pmap_unwire(struct pmap *pmap, vaddr_t va) 1910 { 1911 pt_entry_t *ptes; 1912 int level, offs; 1913 1914 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 1915 1916 if (level == 0) { 1917 1918 #ifdef DIAGNOSTIC 1919 if (!pmap_valid_entry(ptes[offs])) 1920 panic("%s: invalid (unmapped) va 0x%lx", __func__, va); 1921 #endif 1922 if (__predict_true((ptes[offs] & PG_W) != 0)) { 1923 pmap_pte_clearbits(&ptes[offs], PG_W); 1924 pmap->pm_stats.wired_count--; 1925 } 1926 #ifdef DIAGNOSTIC 1927 else { 1928 printf("%s: wiring for pmap %p va 0x%lx " 1929 "didn't change!\n", __func__, pmap, va); 1930 } 1931 #endif 1932 } 1933 #ifdef DIAGNOSTIC 1934 else { 1935 panic("%s: invalid PDE", __func__); 1936 } 1937 #endif 1938 } 1939 1940 /* 1941 * pmap_collect: free resources held by a pmap 1942 * 1943 * => optional function. 1944 * => called when a process is swapped out to free memory. 1945 */ 1946 1947 void 1948 pmap_collect(struct pmap *pmap) 1949 { 1950 /* 1951 * free all of the pt pages by removing the physical mappings 1952 * for its entire address space. 1953 */ 1954 1955 /* pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS, 1956 PMAP_REMOVE_SKIPWIRED); 1957 */ 1958 } 1959 1960 /* 1961 * pmap_copy: copy mappings from one pmap to another 1962 * 1963 * => optional function 1964 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 1965 */ 1966 1967 /* 1968 * defined as macro in pmap.h 1969 */ 1970 1971 /* 1972 * pmap_enter: enter a mapping into a pmap 1973 * 1974 * => must be done "now" ... 
no lazy-evaluation 1975 */ 1976 1977 int 1978 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags) 1979 { 1980 pt_entry_t *ptes, opte, npte; 1981 pd_entry_t **pdes; 1982 struct vm_page *ptp, *pg = NULL; 1983 struct pv_entry *pve, *opve = NULL; 1984 int ptpdelta, wireddelta, resdelta; 1985 boolean_t wired = (flags & PMAP_WIRED) != 0; 1986 boolean_t nocache = (pa & PMAP_NOCACHE) != 0; 1987 boolean_t wc = (pa & PMAP_WC) != 0; 1988 int error, shootself; 1989 paddr_t scr3; 1990 1991 KASSERT(!(wc && nocache)); 1992 pa &= PMAP_PA_MASK; 1993 1994 #ifdef DIAGNOSTIC 1995 if (va == (vaddr_t) PDP_BASE) 1996 panic("%s: trying to map over PDP!", __func__); 1997 1998 /* sanity check: kernel PTPs should already have been pre-allocated */ 1999 if (va >= VM_MIN_KERNEL_ADDRESS && 2000 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 2001 panic("%s: missing kernel PTP for va %lx!", __func__, va); 2002 2003 #endif 2004 2005 pve = pool_get(&pmap_pv_pool, PR_NOWAIT); 2006 if (pve == NULL) { 2007 if (flags & PMAP_CANFAIL) { 2008 error = ENOMEM; 2009 goto out; 2010 } 2011 panic("%s: no pv entries available", __func__); 2012 } 2013 2014 /* 2015 * map in ptes and get a pointer to our PTP (unless we are the kernel) 2016 */ 2017 2018 pmap_map_ptes(pmap, &ptes, &pdes, &scr3); 2019 shootself = (scr3 == 0); 2020 if (pmap == pmap_kernel()) { 2021 ptp = NULL; 2022 } else { 2023 ptp = pmap_get_ptp(pmap, va, pdes); 2024 if (ptp == NULL) { 2025 if (flags & PMAP_CANFAIL) { 2026 pmap_unmap_ptes(pmap, scr3); 2027 error = ENOMEM; 2028 goto out; 2029 } 2030 panic("%s: get ptp failed", __func__); 2031 } 2032 } 2033 opte = ptes[pl1_i(va)]; /* old PTE */ 2034 2035 /* 2036 * is there currently a valid mapping at our VA? 2037 */ 2038 2039 if (pmap_valid_entry(opte)) { 2040 /* 2041 * first, calculate pm_stats updates. resident count will not 2042 * change since we are replacing/changing a valid mapping. 2043 * wired count might change... 2044 */ 2045 2046 resdelta = 0; 2047 if (wired && (opte & PG_W) == 0) 2048 wireddelta = 1; 2049 else if (!wired && (opte & PG_W) != 0) 2050 wireddelta = -1; 2051 else 2052 wireddelta = 0; 2053 ptpdelta = 0; 2054 2055 /* 2056 * is the currently mapped PA the same as the one we 2057 * want to map? 

		/*
		 * is the currently mapped PA the same as the one we
		 * want to map?
		 */

		if ((opte & PG_FRAME) == pa) {

			/* if this is on the PVLIST, sync R/M bit */
			if (opte & PG_PVLIST) {
				pg = PHYS_TO_VM_PAGE(pa);
#ifdef DIAGNOSTIC
				if (pg == NULL)
					panic("%s: same pa PG_PVLIST "
					    "mapping with unmanaged page "
					    "pa = 0x%lx (0x%lx)", __func__,
					    pa, atop(pa));
#endif
				pmap_sync_flags_pte(pg, opte);
			} else {
#ifdef DIAGNOSTIC
				if (PHYS_TO_VM_PAGE(pa) != NULL)
					panic("%s: same pa, managed "
					    "page, no PG_PVLIST pa: 0x%lx\n",
					    __func__, pa);
#endif
			}
			goto enter_now;
		}

		/*
		 * changing PAs: we must remove the old one first
		 */

		/*
		 * if current mapping is on a pvlist,
		 * remove it (sync R/M bits)
		 */

		if (opte & PG_PVLIST) {
			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
#ifdef DIAGNOSTIC
			if (pg == NULL)
				panic("%s: PG_PVLIST mapping with unmanaged "
				    "page pa = 0x%lx (0x%lx)",
				    __func__, pa, atop(pa));
#endif
			pmap_sync_flags_pte(pg, opte);
			opve = pmap_remove_pv(pg, pmap, va);
			pg = NULL; /* This is not the page we are looking for */
		}
	} else {	/* opte not valid */
		resdelta = 1;
		if (wired)
			wireddelta = 1;
		else
			wireddelta = 0;
		if (ptp)
			ptpdelta = 1;
		else
			ptpdelta = 0;
	}

	/*
	 * pve is either NULL or points to a now-free pv_entry structure
	 * (the latter case is if we called pmap_remove_pv above).
	 *
	 * if this entry is to be on a pvlist, enter it now.
	 */

	if (pmap_initialized)
		pg = PHYS_TO_VM_PAGE(pa);

	if (pg != NULL) {
		pmap_enter_pv(pg, pve, pmap, va, ptp);
		pve = NULL;
	}

enter_now:
	/*
	 * at this point pg is !NULL if we want the PG_PVLIST bit set
	 */

	pmap->pm_stats.resident_count += resdelta;
	pmap->pm_stats.wired_count += wireddelta;
	if (ptp)
		ptp->wire_count += ptpdelta;

	KASSERT(pg == PHYS_TO_VM_PAGE(pa));

	npte = pa | protection_codes[prot] | PG_V;
	if (pg != NULL) {
		npte |= PG_PVLIST;
		/*
		 * make sure that if the page is write-combined, all
		 * instances of pmap_enter make it so.
		 */
		if (pg->pg_flags & PG_PMAP_WC) {
			KASSERT(nocache == 0);
			wc = TRUE;
		}
	}
	if (wc)
		npte |= pmap_pg_wc;
	if (wired)
		npte |= PG_W;
	if (nocache)
		npte |= PG_N;
	if (va < VM_MAXUSER_ADDRESS)
		npte |= PG_u;
	else if (va < VM_MAX_ADDRESS)
		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
	if (pmap == pmap_kernel())
		npte |= PG_G;

	ptes[pl1_i(va)] = npte;		/* zap! */
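
	/*
	 * To illustrate the composition above: a managed, wired,
	 * cacheable userland read/write mapping ends up with roughly
	 *
	 *	npte == pa | protection_codes[PROT_READ | PROT_WRITE] |
	 *	    PG_V | PG_PVLIST | PG_W | PG_u;
	 *
	 * with PG_G added only for pmap_kernel() mappings, and PG_N or
	 * the write-combining bits reflecting the PMAP_NOCACHE/PMAP_WC
	 * hints handled above.
	 */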

	/*
	 * If we changed anything other than modified/used bits,
	 * flush the TLB.  (is this overkill?)
	 */
	if (pmap_valid_entry(opte)) {
		if (nocache && (opte & PG_N) == 0)
			wbinvd();
		pmap_tlb_shootpage(pmap, va, shootself);
	}

	pmap_unmap_ptes(pmap, scr3);
	pmap_tlb_shootwait();

	error = 0;

	if (pmap->pm_type == PMAP_TYPE_EPT)
		pmap_fix_ept(pmap, va);

out:
	if (pve)
		pool_put(&pmap_pv_pool, pve);
	if (opve)
		pool_put(&pmap_pv_pool, opve);

	return error;
}

boolean_t
pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
{
	struct vm_page *ptp;
	struct pmap *kpm = pmap_kernel();

	if (uvm.page_init_done == FALSE) {
		vaddr_t va;

		/*
		 * we're growing the kernel pmap early (from
		 * uvm_pageboot_alloc()).  this case must be
		 * handled a little differently.
		 */

		va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
		*paddrp = PMAP_DIRECT_UNMAP(va);
	} else {
		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
		    ptp_va2o(va, level), NULL,
		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
		if (ptp == NULL)
			panic("%s: out of memory", __func__);
		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
		ptp->wire_count = 1;
		*paddrp = VM_PAGE_TO_PHYS(ptp);
	}
	kpm->pm_stats.resident_count++;
	return TRUE;
}

/*
 * Allocate the specified number of PTPs for a PTP level, and populate
 * all levels below accordingly, mapping virtual addresses starting at
 * kva.
 *
 * Used by pmap_growkernel.
 */
void
pmap_alloc_level(pd_entry_t **pdes, vaddr_t kva, int lvl, long *needed_ptps)
{
	unsigned long i;
	vaddr_t va;
	paddr_t pa;
	unsigned long index, endindex;
	int level;
	pd_entry_t *pdep;

	for (level = lvl; level > 1; level--) {
		if (level == PTP_LEVELS)
			pdep = pmap_kernel()->pm_pdir;
		else
			pdep = pdes[level - 2];
		va = kva;
		index = pl_i(kva, level);
		endindex = index + needed_ptps[level - 1];
		/*
		 * XXX special case for first time call.
		 */
		if (nkptp[level - 1] != 0)
			index++;
		else
			endindex--;

		for (i = index; i <= endindex; i++) {
			pmap_get_physpage(va, level - 1, &pa);
			pdep[i] = pa | PG_RW | PG_V | pg_nx;
			nkptp[level - 1]++;
			va += nbpd[level - 1];
		}
	}
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *    the pmaps on the system.
 */

static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;

vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
	struct pmap *kpm = pmap_kernel(), *pm;
	int s, i;
	unsigned newpdes;
	long needed_kptp[PTP_LEVELS], target_nptp, old;

	if (maxkvaddr <= pmap_maxkvaddr)
		return pmap_maxkvaddr;

	maxkvaddr = x86_round_pdr(maxkvaddr);
	old = nkptp[PTP_LEVELS - 1];
	/*
	 * This loop could be optimized more, but pmap_growkernel()
	 * is called infrequently.
	 */
	for (i = PTP_LEVELS - 1; i >= 1; i--) {
		target_nptp = pl_i(maxkvaddr, i + 1) -
		    pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
		/*
		 * XXX only need to check toplevel.
		 */
2302 */ 2303 if (target_nptp > nkptpmax[i]) 2304 panic("%s: out of KVA space", __func__); 2305 needed_kptp[i] = target_nptp - nkptp[i] + 1; 2306 } 2307 2308 2309 s = splhigh(); /* to be safe */ 2310 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, 2311 needed_kptp); 2312 2313 /* 2314 * If the number of top level entries changed, update all 2315 * pmaps. 2316 */ 2317 if (needed_kptp[PTP_LEVELS - 1] != 0) { 2318 newpdes = nkptp[PTP_LEVELS - 1] - old; 2319 LIST_FOREACH(pm, &pmaps, pm_list) { 2320 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 2321 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 2322 newpdes * sizeof (pd_entry_t)); 2323 } 2324 2325 /* Invalidate the PDP cache. */ 2326 #if 0 2327 pool_cache_invalidate(&pmap_pdp_cache); 2328 #endif 2329 } 2330 pmap_maxkvaddr = maxkvaddr; 2331 splx(s); 2332 2333 return maxkvaddr; 2334 } 2335 2336 vaddr_t 2337 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end) 2338 { 2339 int segno; 2340 u_int npg; 2341 vaddr_t va; 2342 paddr_t pa; 2343 struct vm_physseg *seg; 2344 2345 size = round_page(size); 2346 npg = atop(size); 2347 2348 for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) { 2349 if (seg->avail_end - seg->avail_start < npg) 2350 continue; 2351 /* 2352 * We can only steal at an ``unused'' segment boundary, 2353 * i.e. either at the start or at the end. 2354 */ 2355 if (seg->avail_start == seg->start || 2356 seg->avail_end == seg->end) 2357 break; 2358 } 2359 if (segno == vm_nphysseg) { 2360 panic("%s: out of memory", __func__); 2361 } else { 2362 if (seg->avail_start == seg->start) { 2363 pa = ptoa(seg->avail_start); 2364 seg->avail_start += npg; 2365 seg->start += npg; 2366 } else { 2367 pa = ptoa(seg->avail_end) - size; 2368 seg->avail_end -= npg; 2369 seg->end -= npg; 2370 } 2371 /* 2372 * If all the segment has been consumed now, remove it. 2373 * Note that the crash dump code still knows about it 2374 * and will dump it correctly. 2375 */ 2376 if (seg->start == seg->end) { 2377 if (vm_nphysseg-- == 1) 2378 panic("%s: out of memory", __func__); 2379 while (segno < vm_nphysseg) { 2380 seg[0] = seg[1]; /* struct copy */ 2381 seg++; 2382 segno++; 2383 } 2384 } 2385 2386 va = PMAP_DIRECT_MAP(pa); 2387 memset((void *)va, 0, size); 2388 } 2389 2390 if (start != NULL) 2391 *start = virtual_avail; 2392 if (end != NULL) 2393 *end = VM_MAX_KERNEL_ADDRESS; 2394 2395 return (va); 2396 } 2397 2398 #ifdef DEBUG 2399 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 2400 2401 /* 2402 * pmap_dump: dump all the mappings from a pmap 2403 * 2404 * => caller should not be holding any pmap locks 2405 */ 2406 2407 void 2408 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 2409 { 2410 pt_entry_t *ptes, *pte; 2411 pd_entry_t **pdes; 2412 vaddr_t blkendva; 2413 paddr_t scr3; 2414 2415 /* 2416 * if end is out of range truncate. 2417 * if (end == start) update to max. 2418 */ 2419 2420 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 2421 eva = VM_MAXUSER_ADDRESS; 2422 2423 pmap_map_ptes(pmap, &ptes, &pdes, &scr3); 2424 2425 /* 2426 * dumping a range of pages: we dump in PTP sized blocks (4MB) 2427 */ 2428 2429 for (/* null */ ; sva < eva ; sva = blkendva) { 2430 2431 /* determine range of block */ 2432 blkendva = x86_round_pdr(sva+1); 2433 if (blkendva > eva) 2434 blkendva = eva; 2435 2436 /* valid block? 
		if (!pmap_pdes_valid(sva, pdes, NULL))
			continue;

		pte = &ptes[pl1_i(sva)];
		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
			if (!pmap_valid_entry(*pte))
				continue;
			printf("va %#lx -> pa %#llx (pte=%#llx)\n",
			    sva, *pte & PG_FRAME, *pte);
		}
	}
	pmap_unmap_ptes(pmap, scr3);
}
#endif

void
pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
{
	*vstartp = virtual_avail;
	*vendp = VM_MAX_KERNEL_ADDRESS;
}

/*
 * pmap_convert
 *
 * Converts 'pmap' to the new 'mode'.
 *
 * Parameters:
 *  pmap: the pmap to convert
 *  mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
 *
 * Return value:
 *  always 0
 */
int
pmap_convert(struct pmap *pmap, int mode)
{
	pt_entry_t *pte;

	pmap->pm_type = mode;

	if (mode == PMAP_TYPE_EPT) {
		/* Clear low 512GB region (first PML4E) */
		pte = (pt_entry_t *)pmap->pm_pdir;
		*pte = 0;
	}

	return (0);
}

#ifdef MULTIPROCESSOR
/*
 * Locking for tlb shootdown.
 *
 * We lock by setting tlb_shoot_wait to the number of cpus that will
 * receive our tlb shootdown.  After sending the IPIs, we don't need to
 * worry about locking order or interrupts spinning for the lock because
 * the call that grabs the "lock" isn't the one that releases it.  And
 * there is nothing that can block the IPI that releases the lock.
 *
 * The functions are organized so that we first count the number of
 * cpus we need to send the IPI to, then we grab the counter, then
 * we send the IPIs, then we finally do our own shootdown.
 *
 * Our shootdown is last to make it parallel with the other cpus
 * to shorten the spin time.
 *
 * Notice that we depend on failures to send IPIs only being able to
 * happen during boot.  If they happen later, the above assumption
 * doesn't hold since we can end up in situations where no one will
 * release the lock if we get an interrupt in a bad moment.
 */
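
/*
 * As a sketch, the pattern each of the shoot* functions below follows
 * (this is not a separate interface) is:
 *
 *	wait = number of other CPUs that have this pmap active;
 *	while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0)
 *		spin;				(take the "lock")
 *	tlb_shoot_addr1/addr2 = target;		(publish the arguments)
 *	send LAPIC_IPI_INVL* to every CPU in the mask;
 *	flush our own TLB if shootself;
 *
 * The IPI handlers on the remote CPUs count tlb_shoot_wait back down
 * to zero, which is what releases the "lock" that pmap_tlb_shootwait()
 * spins on.
 */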

volatile long tlb_shoot_wait;

volatile vaddr_t tlb_shoot_addr1;
volatile vaddr_t tlb_shoot_addr2;

void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;

	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
		    !(ci->ci_flags & CPUF_RUNNING))
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	if (wait > 0) {
		int s = splvm();

		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
			while (tlb_shoot_wait != 0)
				SPINLOCK_SPIN_HOOK;
		}
		tlb_shoot_addr1 = va;
		CPU_INFO_FOREACH(cii, ci) {
			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
				continue;
			if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
				panic("%s: ipi failed", __func__);
		}
		splx(s);
	}

	if (shootself)
		pmap_update_pg(va);
}

void
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;
	vaddr_t va;

	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
		    !(ci->ci_flags & CPUF_RUNNING))
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	if (wait > 0) {
		int s = splvm();

		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
			while (tlb_shoot_wait != 0)
				SPINLOCK_SPIN_HOOK;
		}
		tlb_shoot_addr1 = sva;
		tlb_shoot_addr2 = eva;
		CPU_INFO_FOREACH(cii, ci) {
			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
				continue;
			if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
				panic("%s: ipi failed", __func__);
		}
		splx(s);
	}

	if (shootself)
		for (va = sva; va < eva; va += PAGE_SIZE)
			pmap_update_pg(va);
}

void
pmap_tlb_shoottlb(struct pmap *pm, int shootself)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;

	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
		    !(ci->ci_flags & CPUF_RUNNING))
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	if (wait) {
		int s = splvm();

		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
			while (tlb_shoot_wait != 0)
				SPINLOCK_SPIN_HOOK;
		}

		CPU_INFO_FOREACH(cii, ci) {
			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
				continue;
			if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
				panic("%s: ipi failed", __func__);
		}
		splx(s);
	}

	if (shootself)
		tlbflush();
}

void
pmap_tlb_shootwait(void)
{
	while (tlb_shoot_wait != 0)
		SPINLOCK_SPIN_HOOK;
}

#else

void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
{
	if (shootself)
		pmap_update_pg(va);
}

void
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
{
	vaddr_t va;

	if (!shootself)
		return;

	for (va = sva; va < eva; va += PAGE_SIZE)
		pmap_update_pg(va);
}

void
pmap_tlb_shoottlb(struct pmap *pm, int shootself)
{
	if (shootself)
		tlbflush();
}
#endif /* MULTIPROCESSOR */
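
/*
 * Caller-side sketch (illustrative only; "ptes", "scr3" and "shootself"
 * come from a surrounding pmap_map_ptes() call, as in pmap_enter()
 * above): code that narrows a PTE's permissions typically pairs the
 * shootdown with a wait once the PTEs are unmapped again:
 *
 *	pmap_pte_clearbits(&ptes[pl1_i(va)], PG_RW);
 *	pmap_tlb_shootpage(pmap, va, shootself);
 *	pmap_unmap_ptes(pmap, scr3);
 *	pmap_tlb_shootwait();
 */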