1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * Copyright (c) 2011-2017 Matthew Dillon 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 /* 45 * Manage physical address maps for x86-64 systems. 
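 *
 * The tables managed here form the standard four-level x86-64 hierarchy:
 *
 *	PML4 (level 4) -> PDP (level 3) -> PD (level 2) -> PT (level 1)
 *
 * Each level holds 512 8-byte entries and translates 9 bits of the 48-bit
 * virtual address; the remaining low 12 bits are the byte offset within
 * the 4KB page.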
46 */ 47 48 #if 0 /* JG */ 49 #include "opt_disable_pse.h" 50 #include "opt_pmap.h" 51 #endif 52 #include "opt_msgbuf.h" 53 54 #include <sys/param.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/msgbuf.h> 58 #include <sys/vmmeter.h> 59 #include <sys/mman.h> 60 #include <sys/systm.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/user.h> 76 #include <sys/thread2.h> 77 #include <sys/spinlock2.h> 78 #include <vm/vm_page2.h> 79 80 #include <machine/cputypes.h> 81 #include <machine/md_var.h> 82 #include <machine/specialreg.h> 83 #include <machine/smp.h> 84 #include <machine_base/apic/apicreg.h> 85 #include <machine/globaldata.h> 86 #include <machine/pmap.h> 87 #include <machine/pmap_inval.h> 88 #include <machine/inttypes.h> 89 90 #include <ddb/ddb.h> 91 92 #define PMAP_KEEP_PDIRS 93 #ifndef PMAP_SHPGPERPROC 94 #define PMAP_SHPGPERPROC 2000 95 #endif 96 97 #if defined(DIAGNOSTIC) 98 #define PMAP_DIAGNOSTIC 99 #endif 100 101 #define MINPV 2048 102 103 /* 104 * pmap debugging will report who owns a pv lock when blocking. 105 */ 106 #ifdef PMAP_DEBUG 107 108 #define PMAP_DEBUG_DECL ,const char *func, int lineno 109 #define PMAP_DEBUG_ARGS , __func__, __LINE__ 110 #define PMAP_DEBUG_COPY , func, lineno 111 112 #define pv_get(pmap, pindex, pmarkp) _pv_get(pmap, pindex, pmarkp \ 113 PMAP_DEBUG_ARGS) 114 #define pv_lock(pv) _pv_lock(pv \ 115 PMAP_DEBUG_ARGS) 116 #define pv_hold_try(pv) _pv_hold_try(pv \ 117 PMAP_DEBUG_ARGS) 118 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \ 119 PMAP_DEBUG_ARGS) 120 121 #define pv_free(pv, pvp) _pv_free(pv, pvp PMAP_DEBUG_ARGS) 122 123 #else 124 125 #define PMAP_DEBUG_DECL 126 #define PMAP_DEBUG_ARGS 127 #define PMAP_DEBUG_COPY 128 129 #define pv_get(pmap, pindex, pmarkp) _pv_get(pmap, pindex, pmarkp) 130 #define pv_lock(pv) _pv_lock(pv) 131 #define pv_hold_try(pv) _pv_hold_try(pv) 132 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp) 133 #define pv_free(pv, pvp) _pv_free(pv, pvp) 134 135 #endif 136 137 /* 138 * Get PDEs and PTEs for user/kernel address space 139 */ 140 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 141 142 #define pmap_pde_v(pmap, pte) ((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 143 #define pmap_pte_w(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0) 144 #define pmap_pte_m(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0) 145 #define pmap_pte_u(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0) 146 #define pmap_pte_v(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 147 148 /* 149 * Given a map and a machine independent protection code, 150 * convert to a vax protection code. 
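 *
 * For illustration, a hypothetical helper equivalent to the pte_prot()
 * macro below (the real code uses the macro directly): the per-pmap table
 * is indexed by the low three VM_PROT_* bits, so the conversion is just a
 * masked array lookup.
 *
 *	static __inline uint64_t
 *	example_pte_prot(pmap_t pmap, vm_prot_t prot)
 *	{
 *		int idx = prot & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
 *
 *		return (pmap->protection_codes[idx]);
 *	}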
151 */ 152 #define pte_prot(m, p) \ 153 (m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 154 static uint64_t protection_codes[PROTECTION_CODES_SIZE]; 155 156 struct pmap kernel_pmap; 157 158 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects"); 159 160 vm_paddr_t avail_start; /* PA of first available physical page */ 161 vm_paddr_t avail_end; /* PA of last available physical page */ 162 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 163 vm_offset_t virtual2_end; 164 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 165 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 166 vm_offset_t KvaStart; /* VA start of KVA space */ 167 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 168 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 169 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 170 //static int pgeflag; /* PG_G or-in */ 171 //static int pseflag; /* PG_PS or-in */ 172 uint64_t PatMsr; 173 174 static int ndmpdp; 175 static vm_paddr_t dmaplimit; 176 static int nkpt; 177 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 178 179 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 180 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */ 181 182 static uint64_t KPTbase; 183 static uint64_t KPTphys; 184 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 185 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 186 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 187 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 188 189 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 190 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 191 192 /* 193 * Data for the pv entry allocation mechanism 194 */ 195 static vm_zone_t pvzone; 196 static struct vm_zone pvzone_store; 197 static int pv_entry_max=0, pv_entry_high_water=0; 198 static int pmap_pagedaemon_waken = 0; 199 static struct pv_entry *pvinit; 200 201 /* 202 * All those kernel PT submaps that BSD is so fond of 203 */ 204 pt_entry_t *CMAP1 = NULL, *ptmmap; 205 caddr_t CADDR1 = NULL, ptvmmap = NULL; 206 static pt_entry_t *msgbufmap; 207 struct msgbuf *msgbufp=NULL; 208 209 /* 210 * PMAP default PG_* bits. Needed to be able to add 211 * EPT/NPT pagetable pmap_bits for the VMM module 212 */ 213 uint64_t pmap_bits_default[] = { 214 REGULAR_PMAP, /* TYPE_IDX 0 */ 215 X86_PG_V, /* PG_V_IDX 1 */ 216 X86_PG_RW, /* PG_RW_IDX 2 */ 217 X86_PG_U, /* PG_U_IDX 3 */ 218 X86_PG_A, /* PG_A_IDX 4 */ 219 X86_PG_M, /* PG_M_IDX 5 */ 220 X86_PG_PS, /* PG_PS_IDX3 6 */ 221 X86_PG_G, /* PG_G_IDX 7 */ 222 X86_PG_AVAIL1, /* PG_AVAIL1_IDX 8 */ 223 X86_PG_AVAIL2, /* PG_AVAIL2_IDX 9 */ 224 X86_PG_AVAIL3, /* PG_AVAIL3_IDX 10 */ 225 X86_PG_NC_PWT | X86_PG_NC_PCD, /* PG_N_IDX 11 */ 226 X86_PG_NX, /* PG_NX_IDX 12 */ 227 }; 228 /* 229 * Crashdump maps. 
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

static int pmap_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW,
    &pmap_debug, 0, "Debug pmap's");
#ifdef PMAP_DEBUG2
static int pmap_enter_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
    &pmap_enter_debug, 0, "Debug pmap_enter's");
#endif
static int pmap_yield_count = 64;
SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
    &pmap_yield_count, 0, "Yield during init_pt/release");
static int pmap_mmu_optimize = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
    &pmap_mmu_optimize, 0, "Share page table pages when possible");
int pmap_fast_kernel_cpusync = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
    &pmap_fast_kernel_cpusync, 0, "Fast kernel pmap tlb cpusyncs");
int pmap_dynamic_delete = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
    &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs");
int pmap_lock_delay = 100;
SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW,
    &pmap_lock_delay, 0, "Spin loops");

static int pmap_nx_enable = 0;
/* needs manual TUNABLE in early probe, see below */

#define DISABLE_PSE

/* Standard user access functions */
extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
    size_t *lencopied);
extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
extern int std_fubyte (const uint8_t *base);
extern int std_subyte (uint8_t *base, uint8_t byte);
extern int32_t std_fuword32 (const uint32_t *base);
extern int64_t std_fuword64 (const uint64_t *base);
extern int std_suword64 (uint64_t *base, uint64_t word);
extern int std_suword32 (uint32_t *base, int word);
extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v);
extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v);

static void pv_hold(pv_entry_t pv);
static int _pv_hold_try(pv_entry_t pv
    PMAP_DEBUG_DECL);
static void pv_drop(pv_entry_t pv);
static void _pv_lock(pv_entry_t pv
    PMAP_DEBUG_DECL);
static void pv_unlock(pv_entry_t pv);
static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
    PMAP_DEBUG_DECL);
static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp
    PMAP_DEBUG_DECL);
static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL);
static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex,
    vm_pindex_t **pmarkp, int *errorp);
static void pv_put(pv_entry_t pv);
static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
    pv_entry_t *pvpp);
static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
    pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va);
static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
    pmap_inval_bulk_t *bulk, int destroy);
static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp,
    pmap_inval_bulk_t *bulk);

struct pmap_scan_info;
static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
    pv_entry_t pte_pv, vm_pindex_t *pte_placemark,
    pv_entry_t pt_pv, int sharept,
    vm_offset_t va,
pt_entry_t *ptep, void *arg __unused); 308 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 309 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 310 pv_entry_t pt_pv, int sharept, 311 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 312 313 static void i386_protection_init (void); 314 static void create_pagetables(vm_paddr_t *firstaddr); 315 static void pmap_remove_all (vm_page_t m); 316 static boolean_t pmap_testbit (vm_page_t m, int bit); 317 318 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va); 319 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 320 321 static void pmap_pinit_defaults(struct pmap *pmap); 322 static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark); 323 static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark); 324 325 static unsigned pdir4mb; 326 327 static int 328 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 329 { 330 if (pv1->pv_pindex < pv2->pv_pindex) 331 return(-1); 332 if (pv1->pv_pindex > pv2->pv_pindex) 333 return(1); 334 return(0); 335 } 336 337 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 338 pv_entry_compare, vm_pindex_t, pv_pindex); 339 340 static __inline 341 void 342 pmap_page_stats_adding(vm_page_t m) 343 { 344 globaldata_t gd = mycpu; 345 346 if (TAILQ_EMPTY(&m->md.pv_list)) { 347 ++gd->gd_vmtotal.t_arm; 348 } else if (TAILQ_FIRST(&m->md.pv_list) == 349 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { 350 ++gd->gd_vmtotal.t_armshr; 351 ++gd->gd_vmtotal.t_avmshr; 352 } else { 353 ++gd->gd_vmtotal.t_avmshr; 354 } 355 } 356 357 static __inline 358 void 359 pmap_page_stats_deleting(vm_page_t m) 360 { 361 globaldata_t gd = mycpu; 362 363 if (TAILQ_EMPTY(&m->md.pv_list)) { 364 --gd->gd_vmtotal.t_arm; 365 } else if (TAILQ_FIRST(&m->md.pv_list) == 366 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { 367 --gd->gd_vmtotal.t_armshr; 368 --gd->gd_vmtotal.t_avmshr; 369 } else { 370 --gd->gd_vmtotal.t_avmshr; 371 } 372 } 373 374 /* 375 * This is an ineligent crowbar to prevent heavily threaded programs 376 * from creating long live-locks in the pmap code when pmap_mmu_optimize 377 * is enabled. Without it a pmap-local page table page can wind up being 378 * constantly created and destroyed (without injury, but also without 379 * progress) as the optimization tries to switch to the object's shared page 380 * table page. 381 */ 382 static __inline void 383 pmap_softwait(pmap_t pmap) 384 { 385 while (pmap->pm_softhold) { 386 tsleep_interlock(&pmap->pm_softhold, 0); 387 if (pmap->pm_softhold) 388 tsleep(&pmap->pm_softhold, PINTERLOCKED, "mmopt", 0); 389 } 390 } 391 392 static __inline void 393 pmap_softhold(pmap_t pmap) 394 { 395 while (atomic_swap_int(&pmap->pm_softhold, 1) == 1) { 396 tsleep_interlock(&pmap->pm_softhold, 0); 397 if (atomic_swap_int(&pmap->pm_softhold, 1) == 1) 398 tsleep(&pmap->pm_softhold, PINTERLOCKED, "mmopt", 0); 399 } 400 } 401 402 static __inline void 403 pmap_softdone(pmap_t pmap) 404 { 405 atomic_swap_int(&pmap->pm_softhold, 0); 406 wakeup(&pmap->pm_softhold); 407 } 408 409 /* 410 * Move the kernel virtual free pointer to the next 411 * 2MB. This is used to help improve performance 412 * by using a large (2MB) page for much of the kernel 413 * (.text, .data, .bss) 414 */ 415 static 416 vm_offset_t 417 pmap_kmem_choose(vm_offset_t addr) 418 { 419 vm_offset_t newaddr = addr; 420 421 newaddr = roundup2(addr, NBPDR); 422 return newaddr; 423 } 424 425 /* 426 * Returns the pindex of a page table entry (representing a terminal page). 
427 * There are NUPTE_TOTAL page table entries possible (a huge number) 428 * 429 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 430 * We want to properly translate negative KVAs. 431 */ 432 static __inline 433 vm_pindex_t 434 pmap_pte_pindex(vm_offset_t va) 435 { 436 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 437 } 438 439 /* 440 * Returns the pindex of a page table. 441 */ 442 static __inline 443 vm_pindex_t 444 pmap_pt_pindex(vm_offset_t va) 445 { 446 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 447 } 448 449 /* 450 * Returns the pindex of a page directory. 451 */ 452 static __inline 453 vm_pindex_t 454 pmap_pd_pindex(vm_offset_t va) 455 { 456 return (NUPTE_TOTAL + NUPT_TOTAL + 457 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 458 } 459 460 static __inline 461 vm_pindex_t 462 pmap_pdp_pindex(vm_offset_t va) 463 { 464 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 465 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 466 } 467 468 static __inline 469 vm_pindex_t 470 pmap_pml4_pindex(void) 471 { 472 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 473 } 474 475 /* 476 * Return various clipped indexes for a given VA 477 * 478 * Returns the index of a pt in a page directory, representing a page 479 * table. 480 */ 481 static __inline 482 vm_pindex_t 483 pmap_pt_index(vm_offset_t va) 484 { 485 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 486 } 487 488 /* 489 * Returns the index of a pd in a page directory page, representing a page 490 * directory. 491 */ 492 static __inline 493 vm_pindex_t 494 pmap_pd_index(vm_offset_t va) 495 { 496 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 497 } 498 499 /* 500 * Returns the index of a pdp in the pml4 table, representing a page 501 * directory page. 502 */ 503 static __inline 504 vm_pindex_t 505 pmap_pdp_index(vm_offset_t va) 506 { 507 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 508 } 509 510 /* 511 * Locate the requested pt_entry 512 */ 513 static __inline 514 pv_entry_t 515 pv_entry_lookup(pmap_t pmap, vm_pindex_t pindex) 516 { 517 pv_entry_t pv; 518 519 if (pindex < pmap_pt_pindex(0)) 520 pv = pmap->pm_pvhint_pte; 521 else if (pindex < pmap_pd_pindex(0)) 522 pv = pmap->pm_pvhint_pt; 523 else 524 pv = NULL; 525 cpu_ccfence(); 526 if (pv == NULL || pv->pv_pmap != pmap) { 527 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 528 pindex); 529 } else if (pv->pv_pindex != pindex) { 530 pv = pv_entry_rb_tree_RB_LOOKUP_REL(&pmap->pm_pvroot, 531 pindex, pv); 532 } 533 return pv; 534 } 535 536 /* 537 * pmap_pte_quick: 538 * 539 * Super fast pmap_pte routine best used when scanning the pv lists. 540 * This eliminates many course-grained invltlb calls. Note that many of 541 * the pv list scans are across different pmaps and it is very wasteful 542 * to do an entire invltlb when checking a single mapping. 543 */ 544 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 545 546 static 547 pt_entry_t * 548 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 549 { 550 return pmap_pte(pmap, va); 551 } 552 553 /* 554 * The placemarker hash must be broken up into four zones so lock 555 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp). 556 * 557 * Placemarkers are used to 'lock' page table indices that do not have 558 * a pv_entry. This allows the pmap to support managed and unmanaged 559 * pages and shared page tables. 
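 *
 * Illustrative arithmetic (using a hypothetical PM_PLACEMARKS of 64, so
 * PM_PLACE_BASE is 16): the four zones occupy hash slots [0,15] for PTEs,
 * [16,31] for PTs, [32,47] for PDs and [48,63] for PDPs/PML4Es, and a
 * pindex hashes into its zone with
 *
 *	hi = zone_base + (pindex & (PM_PLACE_BASE - 1));
 *
 * so placemarkers for different page table levels can never collide on the
 * same slot, which preserves the pte -> pt -> pd -> pdp lock order.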
560 */ 561 #define PM_PLACE_BASE (PM_PLACEMARKS >> 2) 562 563 static __inline 564 vm_pindex_t * 565 pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex) 566 { 567 int hi; 568 569 if (pindex < pmap_pt_pindex(0)) /* zone 0 - PTE */ 570 hi = 0; 571 else if (pindex < pmap_pd_pindex(0)) /* zone 1 - PT */ 572 hi = PM_PLACE_BASE; 573 else if (pindex < pmap_pdp_pindex(0)) /* zone 2 - PD */ 574 hi = PM_PLACE_BASE << 1; 575 else /* zone 3 - PDP (and PML4E) */ 576 hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1); 577 hi += pindex & (PM_PLACE_BASE - 1); 578 579 return (&pmap->pm_placemarks[hi]); 580 } 581 582 583 /* 584 * Generic procedure to index a pte from a pt, pd, or pdp. 585 * 586 * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT 587 * a page table page index but is instead of PV lookup index. 588 */ 589 static 590 void * 591 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) 592 { 593 pt_entry_t *pte; 594 595 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); 596 return(&pte[pindex]); 597 } 598 599 /* 600 * Return pointer to PDP slot in the PML4 601 */ 602 static __inline 603 pml4_entry_t * 604 pmap_pdp(pmap_t pmap, vm_offset_t va) 605 { 606 return (&pmap->pm_pml4[pmap_pdp_index(va)]); 607 } 608 609 /* 610 * Return pointer to PD slot in the PDP given a pointer to the PDP 611 */ 612 static __inline 613 pdp_entry_t * 614 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va) 615 { 616 pdp_entry_t *pd; 617 618 pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME); 619 return (&pd[pmap_pd_index(va)]); 620 } 621 622 /* 623 * Return pointer to PD slot in the PDP. 624 */ 625 static __inline 626 pdp_entry_t * 627 pmap_pd(pmap_t pmap, vm_offset_t va) 628 { 629 pml4_entry_t *pdp; 630 631 pdp = pmap_pdp(pmap, va); 632 if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0) 633 return NULL; 634 return (pmap_pdp_to_pd(*pdp, va)); 635 } 636 637 /* 638 * Return pointer to PT slot in the PD given a pointer to the PD 639 */ 640 static __inline 641 pd_entry_t * 642 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 643 { 644 pd_entry_t *pt; 645 646 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 647 return (&pt[pmap_pt_index(va)]); 648 } 649 650 /* 651 * Return pointer to PT slot in the PD 652 * 653 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 654 * so we cannot lookup the PD via the PDP. Instead we 655 * must look it up via the pmap. 
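 *
 * Taken together these helpers implement a software page-table walk.  A
 * hedged sketch of resolving a PTE pointer by hand (validity and PG_PS
 * checks omitted; pmap_pt_to_pte() is defined below):
 *
 *	pml4_entry_t *pml4e = pmap_pdp(pmap, va);
 *	pdp_entry_t  *pdpe  = pmap_pdp_to_pd(*pml4e, va);
 *	pd_entry_t   *pde   = pmap_pd_to_pt(*pdpe, va);
 *	pt_entry_t   *pte   = pmap_pt_to_pte(*pde, va);
 *	vm_paddr_t    pa    = (*pte & PG_FRAME) | (va & PAGE_MASK);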
656 */ 657 static __inline 658 pd_entry_t * 659 pmap_pt(pmap_t pmap, vm_offset_t va) 660 { 661 pdp_entry_t *pd; 662 pv_entry_t pv; 663 vm_pindex_t pd_pindex; 664 vm_paddr_t phys; 665 666 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 667 pd_pindex = pmap_pd_pindex(va); 668 spin_lock_shared(&pmap->pm_spin); 669 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 670 if (pv == NULL || pv->pv_m == NULL) { 671 spin_unlock_shared(&pmap->pm_spin); 672 return NULL; 673 } 674 phys = VM_PAGE_TO_PHYS(pv->pv_m); 675 spin_unlock_shared(&pmap->pm_spin); 676 return (pmap_pd_to_pt(phys, va)); 677 } else { 678 pd = pmap_pd(pmap, va); 679 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0) 680 return NULL; 681 return (pmap_pd_to_pt(*pd, va)); 682 } 683 } 684 685 /* 686 * Return pointer to PTE slot in the PT given a pointer to the PT 687 */ 688 static __inline 689 pt_entry_t * 690 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 691 { 692 pt_entry_t *pte; 693 694 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 695 return (&pte[pmap_pte_index(va)]); 696 } 697 698 /* 699 * Return pointer to PTE slot in the PT 700 */ 701 static __inline 702 pt_entry_t * 703 pmap_pte(pmap_t pmap, vm_offset_t va) 704 { 705 pd_entry_t *pt; 706 707 pt = pmap_pt(pmap, va); 708 if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0) 709 return NULL; 710 if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0) 711 return ((pt_entry_t *)pt); 712 return (pmap_pt_to_pte(*pt, va)); 713 } 714 715 /* 716 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 717 * the PT layer. This will speed up core pmap operations considerably. 718 * 719 * NOTE: The pmap spinlock does not need to be held but the passed-in pv 720 * must be in a known associated state (typically by being locked when 721 * the pmap spinlock isn't held). We allow the race for that case. 722 * 723 * NOTE: pm_pvhint* is only accessed (read) with the spin-lock held, using 724 * cpu_ccfence() to prevent compiler optimizations from reloading the 725 * field. 726 */ 727 static __inline 728 void 729 pv_cache(pv_entry_t pv, vm_pindex_t pindex) 730 { 731 if (pindex < pmap_pt_pindex(0)) { 732 if (pv->pv_pmap) 733 pv->pv_pmap->pm_pvhint_pte = pv; 734 } else if (pindex < pmap_pd_pindex(0)) { 735 if (pv->pv_pmap) 736 pv->pv_pmap->pm_pvhint_pt = pv; 737 } 738 } 739 740 741 /* 742 * Return address of PT slot in PD (KVM only) 743 * 744 * Cannot be used for user page tables because it might interfere with 745 * the shared page-table-page optimization (pmap_mmu_optimize). 746 */ 747 static __inline 748 pd_entry_t * 749 vtopt(vm_offset_t va) 750 { 751 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 752 NPML4EPGSHIFT)) - 1); 753 754 return (PDmap + ((va >> PDRSHIFT) & mask)); 755 } 756 757 /* 758 * KVM - return address of PTE slot in PT 759 */ 760 static __inline 761 pt_entry_t * 762 vtopte(vm_offset_t va) 763 { 764 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 765 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 766 767 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 768 } 769 770 /* 771 * Returns the physical address translation from va for a user address. 772 * (vm_paddr_t)-1 is returned on failure. 
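 *
 * Worked example of the masking above (assuming the usual 9-bit-per-level
 * constants, so NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT
 * is 36): the mask is 2^36 - 1 and the lookup simply indexes the
 * recursively mapped PTmap[] by the virtual page number, e.g.
 *
 *	pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & ((1ul << 36) - 1)));
 *	if (pte & pmap->pmap_bits[PG_V_IDX])
 *		pa = (pte & PG_FRAME) | (va & PAGE_MASK);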
773 */ 774 vm_paddr_t 775 uservtophys(vm_offset_t va) 776 { 777 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 778 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 779 vm_paddr_t pa; 780 pt_entry_t pte; 781 pmap_t pmap; 782 783 pmap = vmspace_pmap(mycpu->gd_curthread->td_lwp->lwp_vmspace); 784 pa = (vm_paddr_t)-1; 785 if (va < VM_MAX_USER_ADDRESS) { 786 pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & mask)); 787 if (pte & pmap->pmap_bits[PG_V_IDX]) 788 pa = (pte & PG_FRAME) | (va & PAGE_MASK); 789 } 790 return pa; 791 } 792 793 static uint64_t 794 allocpages(vm_paddr_t *firstaddr, long n) 795 { 796 uint64_t ret; 797 798 ret = *firstaddr; 799 bzero((void *)ret, n * PAGE_SIZE); 800 *firstaddr += n * PAGE_SIZE; 801 return (ret); 802 } 803 804 static 805 void 806 create_pagetables(vm_paddr_t *firstaddr) 807 { 808 long i; /* must be 64 bits */ 809 long nkpt_base; 810 long nkpt_phys; 811 int j; 812 813 /* 814 * We are running (mostly) V=P at this point 815 * 816 * Calculate NKPT - number of kernel page tables. We have to 817 * accomodoate prealloction of the vm_page_array, dump bitmap, 818 * MSGBUF_SIZE, and other stuff. Be generous. 819 * 820 * Maxmem is in pages. 821 * 822 * ndmpdp is the number of 1GB pages we wish to map. 823 */ 824 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 825 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 826 ndmpdp = 4; 827 KKASSERT(ndmpdp <= NKPDPE * NPDEPG); 828 829 /* 830 * Starting at the beginning of kvm (not KERNBASE). 831 */ 832 nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR; 833 nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR; 834 nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E + 835 ndmpdp) + 511) / 512; 836 nkpt_phys += 128; 837 838 /* 839 * Starting at KERNBASE - map 2G worth of page table pages. 840 * KERNBASE is offset -2G from the end of kvm. 841 */ 842 nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */ 843 844 /* 845 * Allocate pages 846 */ 847 KPTbase = allocpages(firstaddr, nkpt_base); 848 KPTphys = allocpages(firstaddr, nkpt_phys); 849 KPML4phys = allocpages(firstaddr, 1); 850 KPDPphys = allocpages(firstaddr, NKPML4E); 851 KPDphys = allocpages(firstaddr, NKPDPE); 852 853 /* 854 * Calculate the page directory base for KERNBASE, 855 * that is where we start populating the page table pages. 856 * Basically this is the end - 2. 857 */ 858 KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT); 859 860 DMPDPphys = allocpages(firstaddr, NDMPML4E); 861 if ((amd_feature & AMDID_PAGE1GB) == 0) 862 DMPDphys = allocpages(firstaddr, ndmpdp); 863 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 864 865 /* 866 * Fill in the underlying page table pages for the area around 867 * KERNBASE. This remaps low physical memory to KERNBASE. 868 * 869 * Read-only from zero to physfree 870 * XXX not fully used, underneath 2M pages 871 */ 872 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 873 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 874 ((pt_entry_t *)KPTbase)[i] |= 875 pmap_bits_default[PG_RW_IDX] | 876 pmap_bits_default[PG_V_IDX] | 877 pmap_bits_default[PG_G_IDX]; 878 } 879 880 /* 881 * Now map the initial kernel page tables. One block of page 882 * tables is placed at the beginning of kernel virtual memory, 883 * and another block is placed at KERNBASE to map the kernel binary, 884 * data, bss, and initial pre-allocations. 
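 *
 * Worked example of the sizing above (hypothetical machine, for
 * illustration only): with 16GB of physical memory, ndmpdp computes to 16
 * 1GB direct-map slots, and nkpt_base is (NPDPEPG - KPDPI) * NPTEPG,
 * typically 2 * 512 = 1024 page table pages, each mapping 2MB, which
 * covers the 2GB KERNBASE window.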
885 */ 886 for (i = 0; i < nkpt_base; i++) { 887 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 888 ((pd_entry_t *)KPDbase)[i] |= 889 pmap_bits_default[PG_RW_IDX] | 890 pmap_bits_default[PG_V_IDX]; 891 } 892 for (i = 0; i < nkpt_phys; i++) { 893 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 894 ((pd_entry_t *)KPDphys)[i] |= 895 pmap_bits_default[PG_RW_IDX] | 896 pmap_bits_default[PG_V_IDX]; 897 } 898 899 /* 900 * Map from zero to end of allocations using 2M pages as an 901 * optimization. This will bypass some of the KPTBase pages 902 * above in the KERNBASE area. 903 */ 904 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 905 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 906 ((pd_entry_t *)KPDbase)[i] |= 907 pmap_bits_default[PG_RW_IDX] | 908 pmap_bits_default[PG_V_IDX] | 909 pmap_bits_default[PG_PS_IDX] | 910 pmap_bits_default[PG_G_IDX]; 911 } 912 913 /* 914 * And connect up the PD to the PDP. The kernel pmap is expected 915 * to pre-populate all of its PDs. See NKPDPE in vmparam.h. 916 */ 917 for (i = 0; i < NKPDPE; i++) { 918 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] = 919 KPDphys + (i << PAGE_SHIFT); 920 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |= 921 pmap_bits_default[PG_RW_IDX] | 922 pmap_bits_default[PG_V_IDX] | 923 pmap_bits_default[PG_U_IDX]; 924 } 925 926 /* 927 * Now set up the direct map space using either 2MB or 1GB pages 928 * Preset PG_M and PG_A because demotion expects it. 929 * 930 * When filling in entries in the PD pages make sure any excess 931 * entries are set to zero as we allocated enough PD pages 932 */ 933 if ((amd_feature & AMDID_PAGE1GB) == 0) { 934 for (i = 0; i < NPDEPG * ndmpdp; i++) { 935 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 936 ((pd_entry_t *)DMPDphys)[i] |= 937 pmap_bits_default[PG_RW_IDX] | 938 pmap_bits_default[PG_V_IDX] | 939 pmap_bits_default[PG_PS_IDX] | 940 pmap_bits_default[PG_G_IDX] | 941 pmap_bits_default[PG_M_IDX] | 942 pmap_bits_default[PG_A_IDX]; 943 } 944 945 /* 946 * And the direct map space's PDP 947 */ 948 for (i = 0; i < ndmpdp; i++) { 949 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 950 (i << PAGE_SHIFT); 951 ((pdp_entry_t *)DMPDPphys)[i] |= 952 pmap_bits_default[PG_RW_IDX] | 953 pmap_bits_default[PG_V_IDX] | 954 pmap_bits_default[PG_U_IDX]; 955 } 956 } else { 957 for (i = 0; i < ndmpdp; i++) { 958 ((pdp_entry_t *)DMPDPphys)[i] = 959 (vm_paddr_t)i << PDPSHIFT; 960 ((pdp_entry_t *)DMPDPphys)[i] |= 961 pmap_bits_default[PG_RW_IDX] | 962 pmap_bits_default[PG_V_IDX] | 963 pmap_bits_default[PG_PS_IDX] | 964 pmap_bits_default[PG_G_IDX] | 965 pmap_bits_default[PG_M_IDX] | 966 pmap_bits_default[PG_A_IDX]; 967 } 968 } 969 970 /* And recursively map PML4 to itself in order to get PTmap */ 971 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 972 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= 973 pmap_bits_default[PG_RW_IDX] | 974 pmap_bits_default[PG_V_IDX] | 975 pmap_bits_default[PG_U_IDX]; 976 977 /* 978 * Connect the Direct Map slots up to the PML4 979 */ 980 for (j = 0; j < NDMPML4E; ++j) { 981 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 982 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 983 pmap_bits_default[PG_RW_IDX] | 984 pmap_bits_default[PG_V_IDX] | 985 pmap_bits_default[PG_U_IDX]; 986 } 987 988 /* 989 * Connect the KVA slot up to the PML4 990 */ 991 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 992 ((pdp_entry_t *)KPML4phys)[KPML4I] |= 993 pmap_bits_default[PG_RW_IDX] | 994 pmap_bits_default[PG_V_IDX] | 995 pmap_bits_default[PG_U_IDX]; 996 } 997 998 /* 999 * Bootstrap the system enough to run with 
virtual memory. 1000 * 1001 * On the i386 this is called after mapping has already been enabled 1002 * and just syncs the pmap module with what has already been done. 1003 * [We can't call it easily with mapping off since the kernel is not 1004 * mapped with PA == VA, hence we would have to relocate every address 1005 * from the linked base (virtual) address "KERNBASE" to the actual 1006 * (physical) address starting relative to 0] 1007 */ 1008 void 1009 pmap_bootstrap(vm_paddr_t *firstaddr) 1010 { 1011 vm_offset_t va; 1012 pt_entry_t *pte; 1013 int i; 1014 1015 KvaStart = VM_MIN_KERNEL_ADDRESS; 1016 KvaEnd = VM_MAX_KERNEL_ADDRESS; 1017 KvaSize = KvaEnd - KvaStart; 1018 1019 avail_start = *firstaddr; 1020 1021 /* 1022 * Create an initial set of page tables to run the kernel in. 1023 */ 1024 create_pagetables(firstaddr); 1025 1026 virtual2_start = KvaStart; 1027 virtual2_end = PTOV_OFFSET; 1028 1029 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 1030 virtual_start = pmap_kmem_choose(virtual_start); 1031 1032 virtual_end = VM_MAX_KERNEL_ADDRESS; 1033 1034 /* XXX do %cr0 as well */ 1035 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 1036 load_cr3(KPML4phys); 1037 1038 /* 1039 * Initialize protection array. 1040 */ 1041 i386_protection_init(); 1042 1043 /* 1044 * The kernel's pmap is statically allocated so we don't have to use 1045 * pmap_create, which is unlikely to work correctly at this part of 1046 * the boot sequence (XXX and which no longer exists). 1047 */ 1048 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 1049 kernel_pmap.pm_count = 1; 1050 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 1051 RB_INIT(&kernel_pmap.pm_pvroot); 1052 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 1053 for (i = 0; i < PM_PLACEMARKS; ++i) 1054 kernel_pmap.pm_placemarks[i] = PM_NOPLACEMARK; 1055 1056 /* 1057 * Reserve some special page table entries/VA space for temporary 1058 * mapping of pages. 1059 */ 1060 #define SYSMAP(c, p, v, n) \ 1061 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1062 1063 va = virtual_start; 1064 pte = vtopte(va); 1065 1066 /* 1067 * CMAP1/CMAP2 are used for zeroing and copying pages. 1068 */ 1069 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 1070 1071 /* 1072 * Crashdump maps. 1073 */ 1074 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 1075 1076 /* 1077 * ptvmmap is used for reading arbitrary physical pages via 1078 * /dev/mem. 1079 */ 1080 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 1081 1082 /* 1083 * msgbufp is used to map the system message buffer. 1084 * XXX msgbufmap is not used. 1085 */ 1086 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1087 atop(round_page(MSGBUF_SIZE))) 1088 1089 virtual_start = va; 1090 virtual_start = pmap_kmem_choose(virtual_start); 1091 1092 *CMAP1 = 0; 1093 1094 /* 1095 * PG_G is terribly broken on SMP because we IPI invltlb's in some 1096 * cases rather then invl1pg. Actually, I don't even know why it 1097 * works under UP because self-referential page table mappings 1098 */ 1099 // pgeflag = 0; 1100 1101 /* 1102 * Initialize the 4MB page size flag 1103 */ 1104 // pseflag = 0; 1105 /* 1106 * The 4MB page version of the initial 1107 * kernel page mapping. 
1108 */ 1109 pdir4mb = 0; 1110 1111 #if !defined(DISABLE_PSE) 1112 if (cpu_feature & CPUID_PSE) { 1113 pt_entry_t ptditmp; 1114 /* 1115 * Note that we have enabled PSE mode 1116 */ 1117 // pseflag = kernel_pmap.pmap_bits[PG_PS_IDX]; 1118 ptditmp = *(PTmap + x86_64_btop(KERNBASE)); 1119 ptditmp &= ~(NBPDR - 1); 1120 ptditmp |= pmap_bits_default[PG_V_IDX] | 1121 pmap_bits_default[PG_RW_IDX] | 1122 pmap_bits_default[PG_PS_IDX] | 1123 pmap_bits_default[PG_U_IDX]; 1124 // pgeflag; 1125 pdir4mb = ptditmp; 1126 } 1127 #endif 1128 cpu_invltlb(); 1129 1130 /* Initialize the PAT MSR */ 1131 pmap_init_pat(); 1132 pmap_pinit_defaults(&kernel_pmap); 1133 1134 TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync", 1135 &pmap_fast_kernel_cpusync); 1136 1137 } 1138 1139 /* 1140 * Setup the PAT MSR. 1141 */ 1142 void 1143 pmap_init_pat(void) 1144 { 1145 uint64_t pat_msr; 1146 u_long cr0, cr4; 1147 1148 /* 1149 * Default values mapping PATi,PCD,PWT bits at system reset. 1150 * The default values effectively ignore the PATi bit by 1151 * repeating the encodings for 0-3 in 4-7, and map the PCD 1152 * and PWT bit combinations to the expected PAT types. 1153 */ 1154 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 1155 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 1156 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 1157 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 1158 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 1159 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 1160 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 1161 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 1162 pat_pte_index[PAT_WRITE_BACK] = 0; 1163 pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; 1164 pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; 1165 pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; 1166 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 1167 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 1168 1169 if (cpu_feature & CPUID_PAT) { 1170 /* 1171 * If we support the PAT then set-up entries for 1172 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 1173 * 5 and 6. 1174 */ 1175 pat_msr = (pat_msr & ~PAT_MASK(5)) | 1176 PAT_VALUE(5, PAT_WRITE_PROTECTED); 1177 pat_msr = (pat_msr & ~PAT_MASK(6)) | 1178 PAT_VALUE(6, PAT_WRITE_COMBINING); 1179 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | X86_PG_NC_PWT; 1180 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PCD; 1181 1182 /* 1183 * Then enable the PAT 1184 */ 1185 1186 /* Disable PGE. */ 1187 cr4 = rcr4(); 1188 load_cr4(cr4 & ~CR4_PGE); 1189 1190 /* Disable caches (CD = 1, NW = 0). */ 1191 cr0 = rcr0(); 1192 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1193 1194 /* Flushes caches and TLBs. */ 1195 wbinvd(); 1196 cpu_invltlb(); 1197 1198 /* Update PAT and index table. */ 1199 wrmsr(MSR_PAT, pat_msr); 1200 1201 /* Flush caches and TLBs again. */ 1202 wbinvd(); 1203 cpu_invltlb(); 1204 1205 /* Restore caches and PGE. */ 1206 load_cr0(cr0); 1207 load_cr4(cr4); 1208 PatMsr = pat_msr; 1209 } 1210 } 1211 1212 /* 1213 * Set 4mb pdir for mp startup 1214 */ 1215 void 1216 pmap_set_opt(void) 1217 { 1218 if (cpu_feature & CPUID_PSE) { 1219 load_cr4(rcr4() | CR4_PSE); 1220 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */ 1221 cpu_invltlb(); 1222 } 1223 } 1224 } 1225 1226 /* 1227 * Initialize the pmap module. 1228 * Called by vm_init, to initialize any structures that the pmap 1229 * system needs to map virtual memory. 1230 * pmap_init has been enhanced to support in a fairly consistant 1231 * way, discontiguous physical memory. 
1232 */ 1233 void 1234 pmap_init(void) 1235 { 1236 int i; 1237 int initial_pvs; 1238 1239 /* 1240 * Allocate memory for random pmap data structures. Includes the 1241 * pv_head_table. 1242 */ 1243 1244 for (i = 0; i < vm_page_array_size; i++) { 1245 vm_page_t m; 1246 1247 m = &vm_page_array[i]; 1248 TAILQ_INIT(&m->md.pv_list); 1249 } 1250 1251 /* 1252 * init the pv free list 1253 */ 1254 initial_pvs = vm_page_array_size; 1255 if (initial_pvs < MINPV) 1256 initial_pvs = MINPV; 1257 pvzone = &pvzone_store; 1258 pvinit = (void *)kmem_alloc(&kernel_map, 1259 initial_pvs * sizeof (struct pv_entry), 1260 VM_SUBSYS_PVENTRY); 1261 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 1262 pvinit, initial_pvs); 1263 1264 /* 1265 * Now it is safe to enable pv_table recording. 1266 */ 1267 pmap_initialized = TRUE; 1268 } 1269 1270 /* 1271 * Initialize the address space (zone) for the pv_entries. Set a 1272 * high water mark so that the system can recover from excessive 1273 * numbers of pv entries. 1274 */ 1275 void 1276 pmap_init2(void) 1277 { 1278 int shpgperproc = PMAP_SHPGPERPROC; 1279 int entry_max; 1280 1281 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1282 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 1283 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1284 pv_entry_high_water = 9 * (pv_entry_max / 10); 1285 1286 /* 1287 * Subtract out pages already installed in the zone (hack) 1288 */ 1289 entry_max = pv_entry_max - vm_page_array_size; 1290 if (entry_max <= 0) 1291 entry_max = 1; 1292 1293 zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT); 1294 1295 /* 1296 * Enable dynamic deletion of empty higher-level page table pages 1297 * by default only if system memory is < 8GB (use 7GB for slop). 1298 * This can save a little memory, but imposes significant 1299 * performance overhead for things like bulk builds, and for programs 1300 * which do a lot of memory mapping and memory unmapping. 1301 */ 1302 if (pmap_dynamic_delete < 0) { 1303 if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE) 1304 pmap_dynamic_delete = 1; 1305 else 1306 pmap_dynamic_delete = 0; 1307 } 1308 } 1309 1310 /* 1311 * Typically used to initialize a fictitious page by vm/device_pager.c 1312 */ 1313 void 1314 pmap_page_init(struct vm_page *m) 1315 { 1316 vm_page_init(m); 1317 TAILQ_INIT(&m->md.pv_list); 1318 } 1319 1320 /*************************************************** 1321 * Low level helper routines..... 1322 ***************************************************/ 1323 1324 /* 1325 * this routine defines the region(s) of memory that should 1326 * not be tested for the modified bit. 1327 */ 1328 static __inline 1329 int 1330 pmap_track_modified(vm_pindex_t pindex) 1331 { 1332 vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT; 1333 if ((va < clean_sva) || (va >= clean_eva)) 1334 return 1; 1335 else 1336 return 0; 1337 } 1338 1339 /* 1340 * Extract the physical page address associated with the map/VA pair. 1341 * The page must be wired for this to work reliably. 
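 *
 * Hedged usage sketch (illustration only; the opaque handle keeps the
 * backing page table pv locked until the caller is done with the
 * translation):
 *
 *	void *handle;
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(pmap, va, &handle);
 *	if (pa) {
 *		...use the physical address...
 *	}
 *	pmap_extract_done(handle);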
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep)
{
	vm_paddr_t rtval;
	pv_entry_t pt_pv;
	pt_entry_t *ptep;

	rtval = 0;
	if (va >= VM_MAX_USER_ADDRESS) {
		/*
		 * Kernel page directories might be direct-mapped and
		 * there is typically no PV tracking of pte's
		 */
		pd_entry_t *pt;

		pt = pmap_pt(pmap, va);
		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
				rtval = *pt & PG_PS_FRAME;
				rtval |= va & PDRMASK;
			} else {
				ptep = pmap_pt_to_pte(*pt, va);
				if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
					rtval = *ptep & PG_FRAME;
					rtval |= va & PAGE_MASK;
				}
			}
		}
		if (handlep)
			*handlep = NULL;
	} else {
		/*
		 * User pages currently do not direct-map the page directory
		 * and some pages might not use managed PVs.  But all PT's
		 * will have a PV.
		 */
		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
		if (pt_pv) {
			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
				rtval = *ptep & PG_FRAME;
				rtval |= va & PAGE_MASK;
			}
			if (handlep)
				*handlep = pt_pv;	/* locked until done */
			else
				pv_put(pt_pv);
		} else if (handlep) {
			*handlep = NULL;
		}
	}
	return rtval;
}

void
pmap_extract_done(void *handle)
{
	if (handle)
		pv_put((pv_entry_t)handle);
}

/*
 * Similar to extract but checks protections, SMP-friendly short-cut for
 * vm_fault_page[_quick]().  Can return NULL to cause the caller to
 * fall-through to the real fault code.  Does not work with HVM page
 * tables.
 *
 * If busyp is NULL the returned page, if not NULL, is held (and not busied).
 *
 * If busyp is not NULL and this function sets *busyp non-zero, the returned
 * page is busied (and not held).
 *
 * If busyp is not NULL and this function sets *busyp to zero, the returned
 * page is held (and not busied).
 *
 * If VM_PROT_WRITE is set in prot, and the pte is already writable, the
 * returned page will be dirtied.  If the pte is not already writable NULL
 * is returned.  In other words, if the bit is set and a vm_page_t is
 * returned, any COW will already have happened and that page can be
 * written by the caller.
 *
 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING
 * OR WRITING AS-IS.
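 *
 * Hedged usage sketch (illustration only):
 *
 *	int busied;
 *	vm_page_t m;
 *
 *	m = pmap_fault_page_quick(pmap, va, VM_PROT_READ, &busied);
 *	if (m == NULL) {
 *		...fall through to the full vm_fault path...
 *	} else if (busied) {
 *		...use the page, then vm_page_wakeup(m) to release the busy...
 *	} else {
 *		...use the page, then vm_page_unhold(m) to release the hold...
 *	}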
1426 */ 1427 vm_page_t 1428 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp) 1429 { 1430 if (pmap && 1431 va < VM_MAX_USER_ADDRESS && 1432 (pmap->pm_flags & PMAP_HVM) == 0) { 1433 pv_entry_t pt_pv; 1434 pv_entry_t pte_pv; 1435 pt_entry_t *ptep; 1436 pt_entry_t req; 1437 vm_page_t m; 1438 int error; 1439 1440 req = pmap->pmap_bits[PG_V_IDX] | 1441 pmap->pmap_bits[PG_U_IDX]; 1442 if (prot & VM_PROT_WRITE) 1443 req |= pmap->pmap_bits[PG_RW_IDX]; 1444 1445 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1446 if (pt_pv == NULL) 1447 return (NULL); 1448 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1449 if ((*ptep & req) != req) { 1450 pv_put(pt_pv); 1451 return (NULL); 1452 } 1453 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error); 1454 if (pte_pv && error == 0) { 1455 m = pte_pv->pv_m; 1456 if (prot & VM_PROT_WRITE) { 1457 /* interlocked by presence of pv_entry */ 1458 vm_page_dirty(m); 1459 } 1460 if (busyp) { 1461 if (prot & VM_PROT_WRITE) { 1462 if (vm_page_busy_try(m, TRUE)) 1463 m = NULL; 1464 *busyp = 1; 1465 } else { 1466 vm_page_hold(m); 1467 *busyp = 0; 1468 } 1469 } else { 1470 vm_page_hold(m); 1471 } 1472 pv_put(pte_pv); 1473 } else if (pte_pv) { 1474 pv_drop(pte_pv); 1475 m = NULL; 1476 } else { 1477 /* error, since we didn't request a placemarker */ 1478 m = NULL; 1479 } 1480 pv_put(pt_pv); 1481 return(m); 1482 } else { 1483 return(NULL); 1484 } 1485 } 1486 1487 /* 1488 * Extract the physical page address associated kernel virtual address. 1489 */ 1490 vm_paddr_t 1491 pmap_kextract(vm_offset_t va) 1492 { 1493 pd_entry_t pt; /* pt entry in pd */ 1494 vm_paddr_t pa; 1495 1496 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1497 pa = DMAP_TO_PHYS(va); 1498 } else { 1499 pt = *vtopt(va); 1500 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) { 1501 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1502 } else { 1503 /* 1504 * Beware of a concurrent promotion that changes the 1505 * PDE at this point! For example, vtopte() must not 1506 * be used to access the PTE because it would use the 1507 * new PDE. It is, however, safe to use the old PDE 1508 * because the page table page is preserved by the 1509 * promotion. 1510 */ 1511 pa = *pmap_pt_to_pte(pt, va); 1512 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1513 } 1514 } 1515 return pa; 1516 } 1517 1518 /*************************************************** 1519 * Low level mapping routines..... 1520 ***************************************************/ 1521 1522 /* 1523 * Routine: pmap_kenter 1524 * Function: 1525 * Add a wired page to the KVA 1526 * NOTE! note that in order for the mapping to take effect -- you 1527 * should do an invltlb after doing the pmap_kenter(). 1528 */ 1529 void 1530 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1531 { 1532 pt_entry_t *ptep; 1533 pt_entry_t npte; 1534 1535 npte = pa | 1536 kernel_pmap.pmap_bits[PG_RW_IDX] | 1537 kernel_pmap.pmap_bits[PG_V_IDX]; 1538 // pgeflag; 1539 ptep = vtopte(va); 1540 #if 1 1541 pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte); 1542 #else 1543 /* FUTURE */ 1544 if (*ptep) 1545 pmap_inval_smp(&kernel_pmap, va, ptep, npte); 1546 else 1547 *ptep = npte; 1548 #endif 1549 } 1550 1551 /* 1552 * Similar to pmap_kenter(), except we only invalidate the mapping on the 1553 * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't 1554 * (caller can conditionalize calling smp_invltlb()). 
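 *
 * Hedged usage sketch (illustration only; kva is assumed to be reserved
 * kernel virtual address space owned by the caller).  The _quick variants
 * only invalidate the local TLB, and the return value lets the caller
 * decide whether an smp_invltlb() is needed:
 *
 *	pmap_kenter_quick(kva, VM_PAGE_TO_PHYS(m));
 *	...access the page through kva...
 *	pmap_kremove_quick(kva);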
1555 */ 1556 int 1557 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1558 { 1559 pt_entry_t *ptep; 1560 pt_entry_t npte; 1561 int res; 1562 1563 npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] | 1564 kernel_pmap.pmap_bits[PG_V_IDX]; 1565 // npte |= pgeflag; 1566 ptep = vtopte(va); 1567 #if 1 1568 res = 1; 1569 #else 1570 /* FUTURE */ 1571 res = (*ptep != 0); 1572 #endif 1573 atomic_swap_long(ptep, npte); 1574 cpu_invlpg((void *)va); 1575 1576 return res; 1577 } 1578 1579 /* 1580 * Enter addresses into the kernel pmap but don't bother 1581 * doing any tlb invalidations. Caller will do a rollup 1582 * invalidation via pmap_rollup_inval(). 1583 */ 1584 int 1585 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 1586 { 1587 pt_entry_t *ptep; 1588 pt_entry_t npte; 1589 int res; 1590 1591 npte = pa | 1592 kernel_pmap.pmap_bits[PG_RW_IDX] | 1593 kernel_pmap.pmap_bits[PG_V_IDX]; 1594 // pgeflag; 1595 ptep = vtopte(va); 1596 #if 1 1597 res = 1; 1598 #else 1599 /* FUTURE */ 1600 res = (*ptep != 0); 1601 #endif 1602 atomic_swap_long(ptep, npte); 1603 cpu_invlpg((void *)va); 1604 1605 return res; 1606 } 1607 1608 /* 1609 * remove a page from the kernel pagetables 1610 */ 1611 void 1612 pmap_kremove(vm_offset_t va) 1613 { 1614 pt_entry_t *ptep; 1615 1616 ptep = vtopte(va); 1617 pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0); 1618 } 1619 1620 void 1621 pmap_kremove_quick(vm_offset_t va) 1622 { 1623 pt_entry_t *ptep; 1624 1625 ptep = vtopte(va); 1626 (void)pte_load_clear(ptep); 1627 cpu_invlpg((void *)va); 1628 } 1629 1630 /* 1631 * Remove addresses from the kernel pmap but don't bother 1632 * doing any tlb invalidations. Caller will do a rollup 1633 * invalidation via pmap_rollup_inval(). 1634 */ 1635 void 1636 pmap_kremove_noinval(vm_offset_t va) 1637 { 1638 pt_entry_t *ptep; 1639 1640 ptep = vtopte(va); 1641 (void)pte_load_clear(ptep); 1642 } 1643 1644 /* 1645 * XXX these need to be recoded. They are not used in any critical path. 1646 */ 1647 void 1648 pmap_kmodify_rw(vm_offset_t va) 1649 { 1650 atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]); 1651 cpu_invlpg((void *)va); 1652 } 1653 1654 /* NOT USED 1655 void 1656 pmap_kmodify_nc(vm_offset_t va) 1657 { 1658 atomic_set_long(vtopte(va), PG_N); 1659 cpu_invlpg((void *)va); 1660 } 1661 */ 1662 1663 /* 1664 * Used to map a range of physical addresses into kernel virtual 1665 * address space during the low level boot, typically to map the 1666 * dump bitmap, message buffer, and vm_page_array. 1667 * 1668 * These mappings are typically made at some pointer after the end of the 1669 * kernel text+data. 1670 * 1671 * We could return PHYS_TO_DMAP(start) here and not allocate any 1672 * via (*virtp), but then kmem from userland and kernel dumps won't 1673 * have access to the related pointers. 1674 */ 1675 vm_offset_t 1676 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1677 { 1678 vm_offset_t va; 1679 vm_offset_t va_start; 1680 1681 /*return PHYS_TO_DMAP(start);*/ 1682 1683 va_start = *virtp; 1684 va = va_start; 1685 1686 while (start < end) { 1687 pmap_kenter_quick(va, start); 1688 va += PAGE_SIZE; 1689 start += PAGE_SIZE; 1690 } 1691 *virtp = va; 1692 return va_start; 1693 } 1694 1695 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1696 1697 /* 1698 * Remove the specified set of pages from the data and instruction caches. 
1699 * 1700 * In contrast to pmap_invalidate_cache_range(), this function does not 1701 * rely on the CPU's self-snoop feature, because it is intended for use 1702 * when moving pages into a different cache domain. 1703 */ 1704 void 1705 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1706 { 1707 vm_offset_t daddr, eva; 1708 int i; 1709 1710 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1711 (cpu_feature & CPUID_CLFSH) == 0) 1712 wbinvd(); 1713 else { 1714 cpu_mfence(); 1715 for (i = 0; i < count; i++) { 1716 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1717 eva = daddr + PAGE_SIZE; 1718 for (; daddr < eva; daddr += cpu_clflush_line_size) 1719 clflush(daddr); 1720 } 1721 cpu_mfence(); 1722 } 1723 } 1724 1725 void 1726 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1727 { 1728 KASSERT((sva & PAGE_MASK) == 0, 1729 ("pmap_invalidate_cache_range: sva not page-aligned")); 1730 KASSERT((eva & PAGE_MASK) == 0, 1731 ("pmap_invalidate_cache_range: eva not page-aligned")); 1732 1733 if (cpu_feature & CPUID_SS) { 1734 ; /* If "Self Snoop" is supported, do nothing. */ 1735 } else { 1736 /* Globally invalidate caches */ 1737 cpu_wbinvd_on_all_cpus(); 1738 } 1739 } 1740 1741 /* 1742 * Invalidate the specified range of virtual memory on all cpus associated 1743 * with the pmap. 1744 */ 1745 void 1746 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1747 { 1748 pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); 1749 } 1750 1751 /* 1752 * Add a list of wired pages to the kva. This routine is used for temporary 1753 * kernel mappings such as those found in buffer cache buffer. Page 1754 * modifications and accesses are not tracked or recorded. 1755 * 1756 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed 1757 * semantics as previous mappings may have been zerod without any 1758 * invalidation. 1759 * 1760 * The page *must* be wired. 1761 */ 1762 static __inline void 1763 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 1764 { 1765 vm_offset_t end_va; 1766 vm_offset_t va; 1767 1768 end_va = beg_va + count * PAGE_SIZE; 1769 1770 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1771 pt_entry_t pte; 1772 pt_entry_t *ptep; 1773 1774 ptep = vtopte(va); 1775 pte = VM_PAGE_TO_PHYS(*m) | 1776 kernel_pmap.pmap_bits[PG_RW_IDX] | 1777 kernel_pmap.pmap_bits[PG_V_IDX] | 1778 kernel_pmap.pmap_cache_bits[(*m)->pat_mode]; 1779 // pgeflag; 1780 atomic_swap_long(ptep, pte); 1781 m++; 1782 } 1783 if (doinval) 1784 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1785 } 1786 1787 void 1788 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 1789 { 1790 _pmap_qenter(beg_va, m, count, 1); 1791 } 1792 1793 void 1794 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 1795 { 1796 _pmap_qenter(beg_va, m, count, 0); 1797 } 1798 1799 /* 1800 * This routine jerks page mappings from the kernel -- it is meant only 1801 * for temporary mappings such as those found in buffer cache buffers. 1802 * No recording modified or access status occurs. 
1803 * 1804 * MPSAFE, INTERRUPT SAFE (cluster callback) 1805 */ 1806 void 1807 pmap_qremove(vm_offset_t beg_va, int count) 1808 { 1809 vm_offset_t end_va; 1810 vm_offset_t va; 1811 1812 end_va = beg_va + count * PAGE_SIZE; 1813 1814 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1815 pt_entry_t *pte; 1816 1817 pte = vtopte(va); 1818 (void)pte_load_clear(pte); 1819 cpu_invlpg((void *)va); 1820 } 1821 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1822 } 1823 1824 /* 1825 * This routine removes temporary kernel mappings, only invalidating them 1826 * on the current cpu. It should only be used under carefully controlled 1827 * conditions. 1828 */ 1829 void 1830 pmap_qremove_quick(vm_offset_t beg_va, int count) 1831 { 1832 vm_offset_t end_va; 1833 vm_offset_t va; 1834 1835 end_va = beg_va + count * PAGE_SIZE; 1836 1837 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1838 pt_entry_t *pte; 1839 1840 pte = vtopte(va); 1841 (void)pte_load_clear(pte); 1842 cpu_invlpg((void *)va); 1843 } 1844 } 1845 1846 /* 1847 * This routine removes temporary kernel mappings *without* invalidating 1848 * the TLB. It can only be used on permanent kva reservations such as those 1849 * found in buffer cache buffers, under carefully controlled circumstances. 1850 * 1851 * NOTE: Repopulating these KVAs requires unconditional invalidation. 1852 * (pmap_qenter() does unconditional invalidation). 1853 */ 1854 void 1855 pmap_qremove_noinval(vm_offset_t beg_va, int count) 1856 { 1857 vm_offset_t end_va; 1858 vm_offset_t va; 1859 1860 end_va = beg_va + count * PAGE_SIZE; 1861 1862 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1863 pt_entry_t *pte; 1864 1865 pte = vtopte(va); 1866 (void)pte_load_clear(pte); 1867 } 1868 } 1869 1870 /* 1871 * Create a new thread and optionally associate it with a (new) process. 1872 * NOTE! the new thread's cpu may not equal the current cpu. 1873 */ 1874 void 1875 pmap_init_thread(thread_t td) 1876 { 1877 /* enforce pcb placement & alignment */ 1878 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1879 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 1880 td->td_savefpu = &td->td_pcb->pcb_save; 1881 td->td_sp = (char *)td->td_pcb; /* no -16 */ 1882 } 1883 1884 /* 1885 * This routine directly affects the fork perf for a process. 1886 */ 1887 void 1888 pmap_init_proc(struct proc *p) 1889 { 1890 } 1891 1892 static void 1893 pmap_pinit_defaults(struct pmap *pmap) 1894 { 1895 bcopy(pmap_bits_default, pmap->pmap_bits, 1896 sizeof(pmap_bits_default)); 1897 bcopy(protection_codes, pmap->protection_codes, 1898 sizeof(protection_codes)); 1899 bcopy(pat_pte_index, pmap->pmap_cache_bits, 1900 sizeof(pat_pte_index)); 1901 pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; 1902 pmap->copyinstr = std_copyinstr; 1903 pmap->copyin = std_copyin; 1904 pmap->copyout = std_copyout; 1905 pmap->fubyte = std_fubyte; 1906 pmap->subyte = std_subyte; 1907 pmap->fuword32 = std_fuword32; 1908 pmap->fuword64 = std_fuword64; 1909 pmap->suword32 = std_suword32; 1910 pmap->suword64 = std_suword64; 1911 pmap->swapu32 = std_swapu32; 1912 pmap->swapu64 = std_swapu64; 1913 } 1914 /* 1915 * Initialize pmap0/vmspace0. 1916 * 1917 * On architectures where the kernel pmap is not integrated into the user 1918 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1919 * kernel_pmap should be used to directly access the kernel_pmap. 
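 *
 * NOTE: pmap_pinit_defaults() above also installs the per-pmap user access
 * vector (copyin/copyout/fuword/suword/...) pointing at the std_* routines;
 * a virtualized pmap can substitute its own versions.  A hedged sketch of
 * a consumer going through the indirection instead of calling std_copyin()
 * directly:
 *
 *	error = pmap->copyin(udaddr, kaddr, len);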
1920 */ 1921 void 1922 pmap_pinit0(struct pmap *pmap) 1923 { 1924 int i; 1925 1926 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1927 pmap->pm_count = 1; 1928 CPUMASK_ASSZERO(pmap->pm_active); 1929 pmap->pm_pvhint_pt = NULL; 1930 pmap->pm_pvhint_pte = NULL; 1931 RB_INIT(&pmap->pm_pvroot); 1932 spin_init(&pmap->pm_spin, "pmapinit0"); 1933 for (i = 0; i < PM_PLACEMARKS; ++i) 1934 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 1935 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1936 pmap_pinit_defaults(pmap); 1937 } 1938 1939 /* 1940 * Initialize a preallocated and zeroed pmap structure, 1941 * such as one in a vmspace structure. 1942 */ 1943 static void 1944 pmap_pinit_simple(struct pmap *pmap) 1945 { 1946 int i; 1947 1948 /* 1949 * Misc initialization 1950 */ 1951 pmap->pm_count = 1; 1952 CPUMASK_ASSZERO(pmap->pm_active); 1953 pmap->pm_pvhint_pt = NULL; 1954 pmap->pm_pvhint_pte = NULL; 1955 pmap->pm_flags = PMAP_FLAG_SIMPLE; 1956 1957 pmap_pinit_defaults(pmap); 1958 1959 /* 1960 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 1961 * for this). 1962 */ 1963 if (pmap->pm_pmlpv == NULL) { 1964 RB_INIT(&pmap->pm_pvroot); 1965 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1966 spin_init(&pmap->pm_spin, "pmapinitsimple"); 1967 for (i = 0; i < PM_PLACEMARKS; ++i) 1968 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 1969 } 1970 } 1971 1972 void 1973 pmap_pinit(struct pmap *pmap) 1974 { 1975 pv_entry_t pv; 1976 int j; 1977 1978 if (pmap->pm_pmlpv) { 1979 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { 1980 pmap_puninit(pmap); 1981 } 1982 } 1983 1984 pmap_pinit_simple(pmap); 1985 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 1986 1987 /* 1988 * No need to allocate page table space yet but we do need a valid 1989 * page directory table. 1990 */ 1991 if (pmap->pm_pml4 == NULL) { 1992 pmap->pm_pml4 = 1993 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, 1994 PAGE_SIZE, 1995 VM_SUBSYS_PML4); 1996 } 1997 1998 /* 1999 * Allocate the page directory page, which wires it even though 2000 * it isn't being entered into some higher level page table (it 2001 * being the highest level). If one is already cached we don't 2002 * have to do anything. 2003 */ 2004 if ((pv = pmap->pm_pmlpv) == NULL) { 2005 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2006 pmap->pm_pmlpv = pv; 2007 pmap_kenter((vm_offset_t)pmap->pm_pml4, 2008 VM_PAGE_TO_PHYS(pv->pv_m)); 2009 pv_put(pv); 2010 2011 /* 2012 * Install DMAP and KMAP. 2013 */ 2014 for (j = 0; j < NDMPML4E; ++j) { 2015 pmap->pm_pml4[DMPML4I + j] = 2016 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2017 pmap->pmap_bits[PG_RW_IDX] | 2018 pmap->pmap_bits[PG_V_IDX] | 2019 pmap->pmap_bits[PG_U_IDX]; 2020 } 2021 pmap->pm_pml4[KPML4I] = KPDPphys | 2022 pmap->pmap_bits[PG_RW_IDX] | 2023 pmap->pmap_bits[PG_V_IDX] | 2024 pmap->pmap_bits[PG_U_IDX]; 2025 2026 /* 2027 * install self-referential address mapping entry 2028 */ 2029 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 2030 pmap->pmap_bits[PG_V_IDX] | 2031 pmap->pmap_bits[PG_RW_IDX] | 2032 pmap->pmap_bits[PG_A_IDX] | 2033 pmap->pmap_bits[PG_M_IDX]; 2034 } else { 2035 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2036 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2037 } 2038 KKASSERT(pmap->pm_pml4[255] == 0); 2039 KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv); 2040 KKASSERT(pv->pv_entry.rbe_left == NULL); 2041 KKASSERT(pv->pv_entry.rbe_right == NULL); 2042 } 2043 2044 /* 2045 * Clean up a pmap structure so it can be physically freed. This routine 2046 * is called by the vmspace dtor function. 
A great deal of pmap data is 2047 * left passively mapped to improve vmspace management so we have a bit 2048 * of cleanup work to do here. 2049 */ 2050 void 2051 pmap_puninit(pmap_t pmap) 2052 { 2053 pv_entry_t pv; 2054 vm_page_t p; 2055 2056 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 2057 if ((pv = pmap->pm_pmlpv) != NULL) { 2058 if (pv_hold_try(pv) == 0) 2059 pv_lock(pv); 2060 KKASSERT(pv == pmap->pm_pmlpv); 2061 p = pmap_remove_pv_page(pv); 2062 pv_free(pv, NULL); 2063 pv = NULL; /* safety */ 2064 pmap_kremove((vm_offset_t)pmap->pm_pml4); 2065 vm_page_busy_wait(p, FALSE, "pgpun"); 2066 KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); 2067 vm_page_unwire(p, 0); 2068 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2069 2070 /* 2071 * XXX eventually clean out PML4 static entries and 2072 * use vm_page_free_zero() 2073 */ 2074 vm_page_free(p); 2075 pmap->pm_pmlpv = NULL; 2076 } 2077 if (pmap->pm_pml4) { 2078 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 2079 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 2080 pmap->pm_pml4 = NULL; 2081 } 2082 KKASSERT(pmap->pm_stats.resident_count == 0); 2083 KKASSERT(pmap->pm_stats.wired_count == 0); 2084 } 2085 2086 /* 2087 * This function is now unused (used to add the pmap to the pmap_list) 2088 */ 2089 void 2090 pmap_pinit2(struct pmap *pmap) 2091 { 2092 } 2093 2094 /* 2095 * This routine is called when various levels in the page table need to 2096 * be populated. This routine cannot fail. 2097 * 2098 * This function returns two locked pv_entry's, one representing the 2099 * requested pv and one representing the requested pv's parent pv. If 2100 * an intermediate page table does not exist it will be created, mapped, 2101 * wired, and the parent page table will be given an additional hold 2102 * count representing the presence of the child pv_entry. 2103 */ 2104 static 2105 pv_entry_t 2106 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 2107 { 2108 pt_entry_t *ptep; 2109 pv_entry_t pv; 2110 pv_entry_t pvp; 2111 pt_entry_t v; 2112 vm_pindex_t pt_pindex; 2113 vm_page_t m; 2114 int isnew; 2115 int ispt; 2116 2117 /* 2118 * If the pv already exists and we aren't being asked for the 2119 * parent page table page we can just return it. A locked+held pv 2120 * is returned. The pv will also have a second hold related to the 2121 * pmap association that we don't have to worry about. 2122 */ 2123 ispt = 0; 2124 pv = pv_alloc(pmap, ptepindex, &isnew); 2125 if (isnew == 0 && pvpp == NULL) 2126 return(pv); 2127 2128 /* 2129 * Special case terminal PVs. These are not page table pages so 2130 * no vm_page is allocated (the caller supplied the vm_page). If 2131 * pvpp is non-NULL we are being asked to also removed the pt_pv 2132 * for this pv. 2133 * 2134 * Note that pt_pv's are only returned for user VAs. We assert that 2135 * a pt_pv is not being requested for kernel VAs. The kernel 2136 * pre-wires all higher-level page tables so don't overload managed 2137 * higher-level page tables on top of it! 2138 */ 2139 if (ptepindex < pmap_pt_pindex(0)) { 2140 if (ptepindex >= NUPTE_USER) { 2141 /* kernel manages this manually for KVM */ 2142 KKASSERT(pvpp == NULL); 2143 } else { 2144 KKASSERT(pvpp != NULL); 2145 pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); 2146 pvp = pmap_allocpte(pmap, pt_pindex, NULL); 2147 if (isnew) 2148 vm_page_wire_quick(pvp->pv_m); 2149 *pvpp = pvp; 2150 } 2151 return(pv); 2152 } 2153 2154 /* 2155 * The kernel never uses managed PT/PD/PDP pages. 
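 *
 * (Pindex layout, inferred from the offset arithmetic below: terminal
 * pte pindexes occupy [0, NUPTE_TOTAL), PT pages start at NUPTE_TOTAL,
 * PD pages at NUPTE_TOTAL + NUPT_TOTAL, PDP pages at NUPTE_TOTAL +
 * NUPT_TOTAL + NUPD_TOTAL, and the single PML4 pindex comes last.)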
2156 */ 2157 KKASSERT(pmap != &kernel_pmap); 2158 2159 /* 2160 * Non-terminal PVs allocate a VM page to represent the page table, 2161 * so we have to resolve pvp and calculate ptepindex for the pvp 2162 * and then for the page table entry index in the pvp for 2163 * fall-through. 2164 */ 2165 if (ptepindex < pmap_pd_pindex(0)) { 2166 /* 2167 * pv is PT, pvp is PD 2168 */ 2169 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 2170 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 2171 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2172 2173 /* 2174 * PT index in PD 2175 */ 2176 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 2177 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 2178 ispt = 1; 2179 } else if (ptepindex < pmap_pdp_pindex(0)) { 2180 /* 2181 * pv is PD, pvp is PDP 2182 * 2183 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 2184 * the PD. 2185 */ 2186 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 2187 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2188 2189 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 2190 KKASSERT(pvpp == NULL); 2191 pvp = NULL; 2192 } else { 2193 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2194 } 2195 2196 /* 2197 * PD index in PDP 2198 */ 2199 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 2200 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 2201 } else if (ptepindex < pmap_pml4_pindex()) { 2202 /* 2203 * pv is PDP, pvp is the root pml4 table 2204 */ 2205 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2206 2207 /* 2208 * PDP index in PML4 2209 */ 2210 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 2211 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 2212 } else { 2213 /* 2214 * pv represents the top-level PML4, there is no parent. 2215 */ 2216 pvp = NULL; 2217 } 2218 2219 if (isnew == 0) 2220 goto notnew; 2221 2222 /* 2223 * (isnew) is TRUE, pv is not terminal. 2224 * 2225 * (1) Add a wire count to the parent page table (pvp). 2226 * (2) Allocate a VM page for the page table. 2227 * (3) Enter the VM page into the parent page table. 2228 * 2229 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 2230 */ 2231 if (pvp) 2232 vm_page_wire_quick(pvp->pv_m); 2233 2234 for (;;) { 2235 m = vm_page_alloc(NULL, pv->pv_pindex, 2236 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 2237 VM_ALLOC_INTERRUPT); 2238 if (m) 2239 break; 2240 vm_wait(0); 2241 } 2242 vm_page_wire(m); /* wire for mapping in parent */ 2243 vm_page_unmanage(m); /* m must be spinunlocked */ 2244 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 2245 m->valid = VM_PAGE_BITS_ALL; 2246 2247 vm_page_spin_lock(m); 2248 pmap_page_stats_adding(m); 2249 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2250 pv->pv_m = m; 2251 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 2252 vm_page_spin_unlock(m); 2253 2254 /* 2255 * (isnew) is TRUE, pv is not terminal. 2256 * 2257 * Wire the page into pvp. Bump the resident_count for the pmap. 2258 * There is no pvp for the top level, address the pm_pml4[] array 2259 * directly. 2260 * 2261 * If the caller wants the parent we return it, otherwise 2262 * we just put it away. 2263 * 2264 * No interlock is needed for pte 0 -> non-zero. 2265 * 2266 * In the situation where *ptep is valid we might have an unmanaged 2267 * page table page shared from another page table which we need to 2268 * unshare before installing our private page table page. 
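 *
 * (The pmap_inval_smp() replacement below returns the previous pte so
 * the shared page table page's extra wire_count can be dropped via
 * vm_page_unwire_quick().)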
2269 */ 2270 if (pvp) { 2271 v = VM_PAGE_TO_PHYS(m) | 2272 (pmap->pmap_bits[PG_U_IDX] | 2273 pmap->pmap_bits[PG_RW_IDX] | 2274 pmap->pmap_bits[PG_V_IDX] | 2275 pmap->pmap_bits[PG_A_IDX] | 2276 pmap->pmap_bits[PG_M_IDX]); 2277 ptep = pv_pte_lookup(pvp, ptepindex); 2278 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2279 pt_entry_t pte; 2280 2281 if (ispt == 0) { 2282 panic("pmap_allocpte: unexpected pte %p/%d", 2283 pvp, (int)ptepindex); 2284 } 2285 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, v); 2286 if (vm_page_unwire_quick( 2287 PHYS_TO_VM_PAGE(pte & PG_FRAME))) { 2288 panic("pmap_allocpte: shared pgtable " 2289 "pg bad wirecount"); 2290 } 2291 } else { 2292 pt_entry_t pte; 2293 2294 pte = atomic_swap_long(ptep, v); 2295 if (pte != 0) { 2296 kprintf("install pgtbl mixup 0x%016jx " 2297 "old/new 0x%016jx/0x%016jx\n", 2298 (intmax_t)ptepindex, pte, v); 2299 } 2300 } 2301 } 2302 vm_page_wakeup(m); 2303 2304 /* 2305 * (isnew) may be TRUE or FALSE, pv may or may not be terminal. 2306 */ 2307 notnew: 2308 if (pvp) { 2309 KKASSERT(pvp->pv_m != NULL); 2310 ptep = pv_pte_lookup(pvp, ptepindex); 2311 v = VM_PAGE_TO_PHYS(pv->pv_m) | 2312 (pmap->pmap_bits[PG_U_IDX] | 2313 pmap->pmap_bits[PG_RW_IDX] | 2314 pmap->pmap_bits[PG_V_IDX] | 2315 pmap->pmap_bits[PG_A_IDX] | 2316 pmap->pmap_bits[PG_M_IDX]); 2317 if (*ptep != v) { 2318 kprintf("mismatched upper level pt %016jx/%016jx\n", 2319 *ptep, v); 2320 } 2321 } 2322 if (pvpp) 2323 *pvpp = pvp; 2324 else if (pvp) 2325 pv_put(pvp); 2326 return (pv); 2327 } 2328 2329 /* 2330 * This version of pmap_allocpte() checks for possible segment optimizations 2331 * that would allow page-table sharing. It can be called for terminal 2332 * page or page table page ptepindex's. 2333 * 2334 * The function is called with page table page ptepindex's for fictitious 2335 * and unmanaged terminal pages. That is, we don't want to allocate a 2336 * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL 2337 * for this case. 2338 * 2339 * This function can return a pv and *pvpp associated with the passed in pmap 2340 * OR a pv and *pvpp associated with the shared pmap. In the latter case 2341 * an unmanaged page table page will be entered into the pass in pmap. 2342 */ 2343 static 2344 pv_entry_t 2345 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, 2346 vm_map_entry_t entry, vm_offset_t va) 2347 { 2348 vm_object_t object; 2349 pmap_t obpmap; 2350 pmap_t *obpmapp; 2351 vm_pindex_t *pt_placemark; 2352 vm_offset_t b; 2353 pv_entry_t pte_pv; /* in original or shared pmap */ 2354 pv_entry_t pt_pv; /* in original or shared pmap */ 2355 pv_entry_t proc_pd_pv; /* in original pmap */ 2356 pv_entry_t proc_pt_pv; /* in original pmap */ 2357 pv_entry_t xpv; /* PT in shared pmap */ 2358 pd_entry_t *pt; /* PT entry in PD of original pmap */ 2359 pd_entry_t opte; /* contents of *pt */ 2360 pd_entry_t npte; /* contents of *pt */ 2361 vm_page_t m; 2362 int softhold; 2363 2364 /* 2365 * Basic tests, require a non-NULL vm_map_entry, require proper 2366 * alignment and type for the vm_map_entry, require that the 2367 * underlying object already be allocated. 2368 * 2369 * We allow almost any type of object to use this optimization. 2370 * The object itself does NOT have to be sized to a multiple of the 2371 * segment size, but the memory mapping does. 2372 * 2373 * XXX don't handle devices currently, because VM_PAGE_TO_PHYS() 2374 * won't work as expected. 
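 *
 * (Illustrative qualifying case, for orientation: a VM_INHERIT_SHARE,
 * VM_MAPTYPE_NORMAL mapping backed by a non-device VM object whose
 * start and offset are SEG_SIZE aligned; each fully-covered segment can
 * then point its PD entry at a page table page owned by the object's
 * shared pmap instead of instantiating a private one.)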
2375 */ 2376 if (entry == NULL || 2377 pmap_mmu_optimize == 0 || /* not enabled */ 2378 (pmap->pm_flags & PMAP_HVM) || /* special pmap */ 2379 ptepindex >= pmap_pd_pindex(0) || /* not terminal or pt */ 2380 entry->inheritance != VM_INHERIT_SHARE || /* not shared */ 2381 entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ 2382 entry->object.vm_object == NULL || /* needs VM object */ 2383 entry->object.vm_object->type == OBJT_DEVICE || /* ick */ 2384 entry->object.vm_object->type == OBJT_MGTDEVICE || /* ick */ 2385 (entry->offset & SEG_MASK) || /* must be aligned */ 2386 (entry->start & SEG_MASK)) { 2387 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2388 } 2389 2390 /* 2391 * Make sure the full segment can be represented. 2392 */ 2393 b = va & ~(vm_offset_t)SEG_MASK; 2394 if (b < entry->start || b + SEG_SIZE > entry->end) 2395 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2396 2397 /* 2398 * If the full segment can be represented dive the VM object's 2399 * shared pmap, allocating as required. 2400 */ 2401 object = entry->object.vm_object; 2402 2403 if (entry->protection & VM_PROT_WRITE) 2404 obpmapp = &object->md.pmap_rw; 2405 else 2406 obpmapp = &object->md.pmap_ro; 2407 2408 #ifdef PMAP_DEBUG2 2409 if (pmap_enter_debug > 0) { 2410 --pmap_enter_debug; 2411 kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p " 2412 "obpmapp %p %p\n", 2413 va, entry->protection, object, 2414 obpmapp, *obpmapp); 2415 kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n", 2416 entry, entry->start, entry->end); 2417 } 2418 #endif 2419 2420 /* 2421 * We allocate what appears to be a normal pmap but because portions 2422 * of this pmap are shared with other unrelated pmaps we have to 2423 * set pm_active to point to all cpus. 2424 * 2425 * XXX Currently using pmap_spin to interlock the update, can't use 2426 * vm_object_hold/drop because the token might already be held 2427 * shared OR exclusive and we don't know. 2428 */ 2429 while ((obpmap = *obpmapp) == NULL) { 2430 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); 2431 pmap_pinit_simple(obpmap); 2432 pmap_pinit2(obpmap); 2433 spin_lock(&pmap_spin); 2434 if (*obpmapp != NULL) { 2435 /* 2436 * Handle race 2437 */ 2438 spin_unlock(&pmap_spin); 2439 pmap_release(obpmap); 2440 pmap_puninit(obpmap); 2441 kfree(obpmap, M_OBJPMAP); 2442 obpmap = *obpmapp; /* safety */ 2443 } else { 2444 obpmap->pm_active = smp_active_mask; 2445 obpmap->pm_flags |= PMAP_SEGSHARED; 2446 *obpmapp = obpmap; 2447 spin_unlock(&pmap_spin); 2448 } 2449 } 2450 2451 /* 2452 * Layering is: PTE, PT, PD, PDP, PML4. We have to return the 2453 * pte/pt using the shared pmap from the object but also adjust 2454 * the process pmap's page table page as a side effect. 2455 */ 2456 2457 /* 2458 * Resolve the terminal PTE and PT in the shared pmap. This is what 2459 * we will return. This is true if ptepindex represents a terminal 2460 * page, otherwise pte_pv is actually the PT and pt_pv is actually 2461 * the PD. 2462 */ 2463 pt_pv = NULL; 2464 pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); 2465 softhold = 0; 2466 retry: 2467 if (ptepindex >= pmap_pt_pindex(0)) 2468 xpv = pte_pv; 2469 else 2470 xpv = pt_pv; 2471 2472 /* 2473 * Resolve the PD in the process pmap so we can properly share the 2474 * page table page. Lock order is bottom-up (leaf first)! 2475 * 2476 * NOTE: proc_pt_pv can be NULL. 
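 *
 * (pt_placemark earmarks the process pmap's PT slot while we operate,
 * preventing another thread from instantiating it; it is released
 * further down via pv_placemarker_wakeup().)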
2477 */ 2478 proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b), &pt_placemark); 2479 proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); 2480 #ifdef PMAP_DEBUG2 2481 if (pmap_enter_debug > 0) { 2482 --pmap_enter_debug; 2483 kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n", 2484 proc_pt_pv, 2485 (proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1), 2486 proc_pd_pv, 2487 va); 2488 } 2489 #endif 2490 2491 /* 2492 * xpv is the page table page pv from the shared object 2493 * (for convenience), from above. 2494 * 2495 * Calculate the pte value for the PT to load into the process PD. 2496 * If we have to change it we must properly dispose of the previous 2497 * entry. 2498 */ 2499 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2500 npte = VM_PAGE_TO_PHYS(xpv->pv_m) | 2501 (pmap->pmap_bits[PG_U_IDX] | 2502 pmap->pmap_bits[PG_RW_IDX] | 2503 pmap->pmap_bits[PG_V_IDX] | 2504 pmap->pmap_bits[PG_A_IDX] | 2505 pmap->pmap_bits[PG_M_IDX]); 2506 2507 /* 2508 * Dispose of previous page table page if it was local to the 2509 * process pmap. If the old pt is not empty we cannot dispose of it 2510 * until we clean it out. This case should not arise very often so 2511 * it is not optimized. 2512 * 2513 * Leave pt_pv and pte_pv (in our object pmap) locked and intact 2514 * for the retry. 2515 */ 2516 if (proc_pt_pv) { 2517 pmap_inval_bulk_t bulk; 2518 2519 if (proc_pt_pv->pv_m->wire_count != 1) { 2520 /* 2521 * The page table has a bunch of stuff in it 2522 * which we have to scrap. 2523 */ 2524 if (softhold == 0) { 2525 softhold = 1; 2526 pmap_softhold(pmap); 2527 } 2528 pv_put(proc_pd_pv); 2529 pv_put(proc_pt_pv); 2530 pmap_remove(pmap, 2531 va & ~(vm_offset_t)SEG_MASK, 2532 (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK); 2533 } else { 2534 /* 2535 * The page table is empty and can be destroyed. 2536 * However, doing so leaves the pt slot unlocked, 2537 * so we have to loop-up to handle any races until 2538 * we get a NULL proc_pt_pv and a proper pt_placemark. 2539 */ 2540 pmap_inval_bulk_init(&bulk, proc_pt_pv->pv_pmap); 2541 pmap_release_pv(proc_pt_pv, proc_pd_pv, &bulk); 2542 pmap_inval_bulk_flush(&bulk); 2543 pv_put(proc_pd_pv); 2544 } 2545 goto retry; 2546 } 2547 2548 /* 2549 * Handle remaining cases. We are holding pt_placemark to lock 2550 * the page table page in the primary pmap while we manipulate 2551 * it. 2552 */ 2553 if (*pt == 0) { 2554 atomic_swap_long(pt, npte); 2555 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2556 vm_page_wire_quick(proc_pd_pv->pv_m); /* proc pd for sh pt */ 2557 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2558 } else if (*pt != npte) { 2559 opte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, pt, npte); 2560 2561 #if 0 2562 opte = pte_load_clear(pt); 2563 KKASSERT(opte && opte != npte); 2564 2565 *pt = npte; 2566 #endif 2567 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2568 2569 /* 2570 * Clean up opte, bump the wire_count for the process 2571 * PD page representing the new entry if it was 2572 * previously empty. 2573 * 2574 * If the entry was not previously empty and we have 2575 * a PT in the proc pmap then opte must match that 2576 * pt. The proc pt must be retired (this is done 2577 * later on in this procedure). 2578 * 2579 * NOTE: replacing valid pte, wire_count on proc_pd_pv 2580 * stays the same. 
2581 */ 2582 KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]); 2583 m = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2584 if (vm_page_unwire_quick(m)) { 2585 panic("pmap_allocpte_seg: " 2586 "bad wire count %p", 2587 m); 2588 } 2589 } 2590 2591 if (softhold) 2592 pmap_softdone(pmap); 2593 2594 /* 2595 * Remove our earmark on the page table page. 2596 */ 2597 pv_placemarker_wakeup(pmap, pt_placemark); 2598 2599 /* 2600 * The existing process page table was replaced and must be destroyed 2601 * here. 2602 */ 2603 if (proc_pd_pv) 2604 pv_put(proc_pd_pv); 2605 if (pvpp) 2606 *pvpp = pt_pv; 2607 else 2608 pv_put(pt_pv); 2609 return (pte_pv); 2610 } 2611 2612 /* 2613 * Release any resources held by the given physical map. 2614 * 2615 * Called when a pmap initialized by pmap_pinit is being released. Should 2616 * only be called if the map contains no valid mappings. 2617 */ 2618 struct pmap_release_info { 2619 pmap_t pmap; 2620 int retry; 2621 pv_entry_t pvp; 2622 }; 2623 2624 static int pmap_release_callback(pv_entry_t pv, void *data); 2625 2626 void 2627 pmap_release(struct pmap *pmap) 2628 { 2629 struct pmap_release_info info; 2630 2631 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2632 ("pmap still active! %016jx", 2633 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2634 2635 /* 2636 * There is no longer a pmap_list, if there were we would remove the 2637 * pmap from it here. 2638 */ 2639 2640 /* 2641 * Pull pv's off the RB tree in order from low to high and release 2642 * each page. 2643 */ 2644 info.pmap = pmap; 2645 do { 2646 info.retry = 0; 2647 info.pvp = NULL; 2648 2649 spin_lock(&pmap->pm_spin); 2650 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2651 pmap_release_callback, &info); 2652 spin_unlock(&pmap->pm_spin); 2653 2654 if (info.pvp) 2655 pv_put(info.pvp); 2656 } while (info.retry); 2657 2658 2659 /* 2660 * One resident page (the pml4 page) should remain. 2661 * No wired pages should remain. 2662 */ 2663 #if 1 2664 if (pmap->pm_stats.resident_count != 2665 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1) || 2666 pmap->pm_stats.wired_count != 0) { 2667 kprintf("fatal pmap problem - pmap %p flags %08x " 2668 "rescnt=%jd wirecnt=%jd\n", 2669 pmap, 2670 pmap->pm_flags, 2671 pmap->pm_stats.resident_count, 2672 pmap->pm_stats.wired_count); 2673 tsleep(pmap, 0, "DEAD", 0); 2674 } 2675 #else 2676 KKASSERT(pmap->pm_stats.resident_count == 2677 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1)); 2678 KKASSERT(pmap->pm_stats.wired_count == 0); 2679 #endif 2680 } 2681 2682 /* 2683 * Called from low to high. We must cache the proper parent pv so we 2684 * can adjust its wired count. 
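 *
 * (Caching the parent in info->pvp lets consecutive children of the
 * same parent page table reuse a single lookup/lock instead of
 * re-acquiring the parent for every pv visited by the RB scan.)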
2685 */ 2686 static int 2687 pmap_release_callback(pv_entry_t pv, void *data) 2688 { 2689 struct pmap_release_info *info = data; 2690 pmap_t pmap = info->pmap; 2691 vm_pindex_t pindex; 2692 int r; 2693 2694 /* 2695 * Acquire a held and locked pv, check for release race 2696 */ 2697 pindex = pv->pv_pindex; 2698 if (info->pvp == pv) { 2699 spin_unlock(&pmap->pm_spin); 2700 info->pvp = NULL; 2701 } else if (pv_hold_try(pv)) { 2702 spin_unlock(&pmap->pm_spin); 2703 } else { 2704 spin_unlock(&pmap->pm_spin); 2705 pv_lock(pv); 2706 pv_put(pv); 2707 info->retry = 1; 2708 spin_lock(&pmap->pm_spin); 2709 2710 return -1; 2711 } 2712 KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex); 2713 2714 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2715 /* 2716 * I am PTE, parent is PT 2717 */ 2718 pindex = pv->pv_pindex >> NPTEPGSHIFT; 2719 pindex += NUPTE_TOTAL; 2720 } else if (pv->pv_pindex < pmap_pd_pindex(0)) { 2721 /* 2722 * I am PT, parent is PD 2723 */ 2724 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; 2725 pindex += NUPTE_TOTAL + NUPT_TOTAL; 2726 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { 2727 /* 2728 * I am PD, parent is PDP 2729 */ 2730 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> 2731 NPDPEPGSHIFT; 2732 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2733 } else if (pv->pv_pindex < pmap_pml4_pindex()) { 2734 /* 2735 * I am PDP, parent is PML4 (there's only one) 2736 */ 2737 #if 0 2738 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL - 2739 NUPD_TOTAL) >> NPML4EPGSHIFT; 2740 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL; 2741 #endif 2742 pindex = pmap_pml4_pindex(); 2743 } else { 2744 /* 2745 * parent is NULL 2746 */ 2747 if (info->pvp) { 2748 pv_put(info->pvp); 2749 info->pvp = NULL; 2750 } 2751 pindex = 0; 2752 } 2753 if (pindex) { 2754 if (info->pvp && info->pvp->pv_pindex != pindex) { 2755 pv_put(info->pvp); 2756 info->pvp = NULL; 2757 } 2758 if (info->pvp == NULL) 2759 info->pvp = pv_get(pmap, pindex, NULL); 2760 } else { 2761 if (info->pvp) { 2762 pv_put(info->pvp); 2763 info->pvp = NULL; 2764 } 2765 } 2766 r = pmap_release_pv(pv, info->pvp, NULL); 2767 spin_lock(&pmap->pm_spin); 2768 2769 return(r); 2770 } 2771 2772 /* 2773 * Called with held (i.e. also locked) pv. This function will dispose of 2774 * the lock along with the pv. 2775 * 2776 * If the caller already holds the locked parent page table for pv it 2777 * must pass it as pvp, allowing us to avoid a deadlock, else it can 2778 * pass NULL for pvp. 2779 */ 2780 static int 2781 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2782 { 2783 vm_page_t p; 2784 2785 /* 2786 * The pmap is currently not spinlocked, pv is held+locked. 2787 * Remove the pv's page from its parent's page table. The 2788 * parent's page table page's wire_count will be decremented. 2789 * 2790 * This will clean out the pte at any level of the page table. 2791 * If smp != 0 all cpus are affected. 2792 * 2793 * Do not tear-down recursively, its faster to just let the 2794 * release run its course. 2795 */ 2796 pmap_remove_pv_pte(pv, pvp, bulk, 0); 2797 2798 /* 2799 * Terminal pvs are unhooked from their vm_pages. Because 2800 * terminal pages aren't page table pages they aren't wired 2801 * by us, so we have to be sure not to unwire them either. 
2802 */ 2803 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2804 pmap_remove_pv_page(pv); 2805 goto skip; 2806 } 2807 2808 /* 2809 * We leave the top-level page table page cached, wired, and 2810 * mapped in the pmap until the dtor function (pmap_puninit()) 2811 * gets called. 2812 * 2813 * Since we are leaving the top-level pv intact we need 2814 * to break out of what would otherwise be an infinite loop. 2815 */ 2816 if (pv->pv_pindex == pmap_pml4_pindex()) { 2817 pv_put(pv); 2818 return(-1); 2819 } 2820 2821 /* 2822 * For page table pages (other than the top-level page), 2823 * remove and free the vm_page. The representitive mapping 2824 * removed above by pmap_remove_pv_pte() did not undo the 2825 * last wire_count so we have to do that as well. 2826 */ 2827 p = pmap_remove_pv_page(pv); 2828 vm_page_busy_wait(p, FALSE, "pmaprl"); 2829 if (p->wire_count != 1) { 2830 kprintf("p->wire_count was %016lx %d\n", 2831 pv->pv_pindex, p->wire_count); 2832 } 2833 KKASSERT(p->wire_count == 1); 2834 KKASSERT(p->flags & PG_UNMANAGED); 2835 2836 vm_page_unwire(p, 0); 2837 KKASSERT(p->wire_count == 0); 2838 2839 vm_page_free(p); 2840 skip: 2841 pv_free(pv, pvp); 2842 2843 return 0; 2844 } 2845 2846 /* 2847 * This function will remove the pte associated with a pv from its parent. 2848 * Terminal pv's are supported. All cpus specified by (bulk) are properly 2849 * invalidated. 2850 * 2851 * The wire count will be dropped on the parent page table. The wire 2852 * count on the page being removed (pv->pv_m) from the parent page table 2853 * is NOT touched. Note that terminal pages will not have any additional 2854 * wire counts while page table pages will have at least one representing 2855 * the mapping, plus others representing sub-mappings. 2856 * 2857 * NOTE: Cannot be called on kernel page table pages, only KVM terminal 2858 * pages and user page table and terminal pages. 2859 * 2860 * NOTE: The pte being removed might be unmanaged, and the pv supplied might 2861 * be freshly allocated and not imply that the pte is managed. In this 2862 * case pv->pv_m should be NULL. 2863 * 2864 * The pv must be locked. The pvp, if supplied, must be locked. All 2865 * supplied pv's will remain locked on return. 2866 * 2867 * XXX must lock parent pv's if they exist to remove pte XXX 2868 */ 2869 static 2870 void 2871 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk, 2872 int destroy) 2873 { 2874 vm_pindex_t ptepindex = pv->pv_pindex; 2875 pmap_t pmap = pv->pv_pmap; 2876 vm_page_t p; 2877 int gotpvp = 0; 2878 2879 KKASSERT(pmap); 2880 2881 if (ptepindex == pmap_pml4_pindex()) { 2882 /* 2883 * We are the top level PML4E table, there is no parent. 2884 */ 2885 p = pmap->pm_pmlpv->pv_m; 2886 KKASSERT(pv->pv_m == p); /* debugging */ 2887 } else if (ptepindex >= pmap_pdp_pindex(0)) { 2888 /* 2889 * Remove a PDP page from the PML4E. This can only occur 2890 * with user page tables. We do not have to lock the 2891 * pml4 PV so just ignore pvp. 
2892 */ 2893 vm_pindex_t pml4_pindex; 2894 vm_pindex_t pdp_index; 2895 pml4_entry_t *pdp; 2896 2897 pdp_index = ptepindex - pmap_pdp_pindex(0); 2898 if (pvp == NULL) { 2899 pml4_pindex = pmap_pml4_pindex(); 2900 pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL); 2901 KKASSERT(pvp); 2902 gotpvp = 1; 2903 } 2904 2905 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; 2906 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0); 2907 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2908 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0); 2909 KKASSERT(pv->pv_m == p); /* debugging */ 2910 } else if (ptepindex >= pmap_pd_pindex(0)) { 2911 /* 2912 * Remove a PD page from the PDP 2913 * 2914 * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case 2915 * of a simple pmap because it stops at 2916 * the PD page. 2917 */ 2918 vm_pindex_t pdp_pindex; 2919 vm_pindex_t pd_index; 2920 pdp_entry_t *pd; 2921 2922 pd_index = ptepindex - pmap_pd_pindex(0); 2923 2924 if (pvp == NULL) { 2925 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 2926 (pd_index >> NPML4EPGSHIFT); 2927 pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL); 2928 gotpvp = 1; 2929 } 2930 2931 if (pvp) { 2932 pd = pv_pte_lookup(pvp, pd_index & 2933 ((1ul << NPDPEPGSHIFT) - 1)); 2934 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); 2935 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2936 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); 2937 } else { 2938 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 2939 p = pv->pv_m; /* degenerate test later */ 2940 } 2941 KKASSERT(pv->pv_m == p); /* debugging */ 2942 } else if (ptepindex >= pmap_pt_pindex(0)) { 2943 /* 2944 * Remove a PT page from the PD 2945 */ 2946 vm_pindex_t pd_pindex; 2947 vm_pindex_t pt_index; 2948 pd_entry_t *pt; 2949 2950 pt_index = ptepindex - pmap_pt_pindex(0); 2951 2952 if (pvp == NULL) { 2953 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 2954 (pt_index >> NPDPEPGSHIFT); 2955 pvp = pv_get(pv->pv_pmap, pd_pindex, NULL); 2956 KKASSERT(pvp); 2957 gotpvp = 1; 2958 } 2959 2960 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 2961 #if 0 2962 KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0, 2963 ("*pt unexpectedly invalid %016jx " 2964 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p", 2965 *pt, gotpvp, ptepindex, pt_index, pv, pvp)); 2966 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 2967 #else 2968 if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) { 2969 kprintf("*pt unexpectedly invalid %016jx " 2970 "gotpvp=%d ptepindex=%ld ptindex=%ld " 2971 "pv=%p pvp=%p\n", 2972 *pt, gotpvp, ptepindex, pt_index, pv, pvp); 2973 tsleep(pt, 0, "DEAD", 0); 2974 p = pv->pv_m; 2975 } else { 2976 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 2977 } 2978 #endif 2979 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); 2980 KKASSERT(pv->pv_m == p); /* debugging */ 2981 } else { 2982 /* 2983 * Remove a PTE from the PT page. The PV might exist even if 2984 * the PTE is not managed, in whichcase pv->pv_m should be 2985 * NULL. 2986 * 2987 * NOTE: Userland pmaps manage the parent PT/PD/PDP page 2988 * table pages but the kernel_pmap does not. 2989 * 2990 * NOTE: pv's must be locked bottom-up to avoid deadlocking. 2991 * pv is a pte_pv so we can safely lock pt_pv. 2992 * 2993 * NOTE: FICTITIOUS pages may have multiple physical mappings 2994 * so PHYS_TO_VM_PAGE() will not necessarily work for 2995 * terminal ptes. 
2996 */ 2997 vm_pindex_t pt_pindex; 2998 pt_entry_t *ptep; 2999 pt_entry_t pte; 3000 vm_offset_t va; 3001 3002 pt_pindex = ptepindex >> NPTEPGSHIFT; 3003 va = (vm_offset_t)ptepindex << PAGE_SHIFT; 3004 3005 if (ptepindex >= NUPTE_USER) { 3006 ptep = vtopte(ptepindex << PAGE_SHIFT); 3007 KKASSERT(pvp == NULL); 3008 /* pvp remains NULL */ 3009 } else { 3010 if (pvp == NULL) { 3011 pt_pindex = NUPTE_TOTAL + 3012 (ptepindex >> NPDPEPGSHIFT); 3013 pvp = pv_get(pv->pv_pmap, pt_pindex, NULL); 3014 KKASSERT(pvp); 3015 gotpvp = 1; 3016 } 3017 ptep = pv_pte_lookup(pvp, ptepindex & 3018 ((1ul << NPDPEPGSHIFT) - 1)); 3019 } 3020 pte = pmap_inval_bulk(bulk, va, ptep, 0); 3021 if (bulk == NULL) /* XXX */ 3022 cpu_invlpg((void *)va); /* XXX */ 3023 3024 /* 3025 * Now update the vm_page_t 3026 */ 3027 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) && 3028 (pte & pmap->pmap_bits[PG_V_IDX])) { 3029 /* 3030 * Valid managed page, adjust (p). 3031 */ 3032 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) { 3033 p = pv->pv_m; 3034 } else { 3035 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3036 KKASSERT(pv->pv_m == p); 3037 } 3038 if (pte & pmap->pmap_bits[PG_M_IDX]) { 3039 if (pmap_track_modified(ptepindex)) 3040 vm_page_dirty(p); 3041 } 3042 if (pte & pmap->pmap_bits[PG_A_IDX]) { 3043 vm_page_flag_set(p, PG_REFERENCED); 3044 } 3045 } else { 3046 /* 3047 * Unmanaged page, do not try to adjust the vm_page_t. 3048 * pv could be freshly allocated for a pmap_enter(), 3049 * replacing an unmanaged page with a managed one. 3050 * 3051 * pv->pv_m might reflect the new page and not the 3052 * existing page. 3053 * 3054 * We could extract p from the physical address and 3055 * adjust it but we explicitly do not for unmanaged 3056 * pages. 3057 */ 3058 p = NULL; 3059 } 3060 if (pte & pmap->pmap_bits[PG_W_IDX]) 3061 atomic_add_long(&pmap->pm_stats.wired_count, -1); 3062 if (pte & pmap->pmap_bits[PG_G_IDX]) 3063 cpu_invlpg((void *)va); 3064 } 3065 3066 /* 3067 * If requested, scrap the underlying pv->pv_m and the underlying 3068 * pv. If this is a page-table-page we must also free the page. 3069 * 3070 * pvp must be returned locked. 3071 */ 3072 if (destroy == 1) { 3073 /* 3074 * page table page (PT, PD, PDP, PML4), caller was responsible 3075 * for testing wired_count. 3076 */ 3077 KKASSERT(pv->pv_m->wire_count == 1); 3078 p = pmap_remove_pv_page(pv); 3079 pv_free(pv, pvp); 3080 pv = NULL; 3081 3082 vm_page_busy_wait(p, FALSE, "pgpun"); 3083 vm_page_unwire(p, 0); 3084 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 3085 vm_page_free(p); 3086 } else if (destroy == 2) { 3087 /* 3088 * Normal page, remove from pmap and leave the underlying 3089 * page untouched. 3090 */ 3091 pmap_remove_pv_page(pv); 3092 pv_free(pv, pvp); 3093 pv = NULL; /* safety */ 3094 } 3095 3096 /* 3097 * If we acquired pvp ourselves then we are responsible for 3098 * recursively deleting it. 3099 */ 3100 if (pvp && gotpvp) { 3101 /* 3102 * Recursively destroy higher-level page tables. 3103 * 3104 * This is optional. If we do not, they will still 3105 * be destroyed when the process exits. 3106 * 3107 * NOTE: Do not destroy pv_entry's with extra hold refs, 3108 * a caller may have unlocked it and intends to 3109 * continue to use it. 
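 *
 * (The (pvp->pv_hold & PV_HOLD_MASK) == 2 test below corresponds to
 * exactly our own hold plus the implicit pmap association, i.e. no
 * other caller currently holds pvp.)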
3110 */ 3111 if (pmap_dynamic_delete && 3112 pvp->pv_m && 3113 pvp->pv_m->wire_count == 1 && 3114 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 3115 pvp->pv_pindex != pmap_pml4_pindex()) { 3116 if (pmap_dynamic_delete == 2) 3117 kprintf("A %jd %08x\n", pvp->pv_pindex, pvp->pv_hold); 3118 if (pmap != &kernel_pmap) { 3119 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 3120 pvp = NULL; /* safety */ 3121 } else { 3122 kprintf("Attempt to remove kernel_pmap pindex " 3123 "%jd\n", pvp->pv_pindex); 3124 pv_put(pvp); 3125 } 3126 } else { 3127 pv_put(pvp); 3128 } 3129 } 3130 } 3131 3132 /* 3133 * Remove the vm_page association to a pv. The pv must be locked. 3134 */ 3135 static 3136 vm_page_t 3137 pmap_remove_pv_page(pv_entry_t pv) 3138 { 3139 vm_page_t m; 3140 3141 m = pv->pv_m; 3142 vm_page_spin_lock(m); 3143 KKASSERT(m && m == pv->pv_m); 3144 pv->pv_m = NULL; 3145 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3146 pmap_page_stats_deleting(m); 3147 if (TAILQ_EMPTY(&m->md.pv_list)) 3148 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3149 vm_page_spin_unlock(m); 3150 3151 return(m); 3152 } 3153 3154 /* 3155 * Grow the number of kernel page table entries, if needed. 3156 * 3157 * This routine is always called to validate any address space 3158 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 3159 * space below KERNBASE. 3160 * 3161 * kernel_map must be locked exclusively by the caller. 3162 */ 3163 void 3164 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 3165 { 3166 vm_paddr_t paddr; 3167 vm_offset_t ptppaddr; 3168 vm_page_t nkpg; 3169 pd_entry_t *pt, newpt; 3170 pdp_entry_t newpd; 3171 int update_kernel_vm_end; 3172 3173 /* 3174 * bootstrap kernel_vm_end on first real VM use 3175 */ 3176 if (kernel_vm_end == 0) { 3177 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 3178 nkpt = 0; 3179 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3180 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 3181 ~(PAGE_SIZE * NPTEPG - 1); 3182 nkpt++; 3183 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 3184 kernel_vm_end = kernel_map.max_offset; 3185 break; 3186 } 3187 } 3188 } 3189 3190 /* 3191 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 3192 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 3193 * do not want to force-fill 128G worth of page tables. 
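 *
 * (Growth below proceeds in PAGE_SIZE * NPTEPG chunks, i.e. one page
 * table page's worth of KVA at a time: 4KB * 512 = 2MB per step.)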
3194 */ 3195 if (kstart < KERNBASE) { 3196 if (kstart > kernel_vm_end) 3197 kstart = kernel_vm_end; 3198 KKASSERT(kend <= KERNBASE); 3199 update_kernel_vm_end = 1; 3200 } else { 3201 update_kernel_vm_end = 0; 3202 } 3203 3204 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 3205 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 3206 3207 if (kend - 1 >= kernel_map.max_offset) 3208 kend = kernel_map.max_offset; 3209 3210 while (kstart < kend) { 3211 pt = pmap_pt(&kernel_pmap, kstart); 3212 if (pt == NULL) { 3213 /* We need a new PD entry */ 3214 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3215 VM_ALLOC_NORMAL | 3216 VM_ALLOC_SYSTEM | 3217 VM_ALLOC_INTERRUPT); 3218 if (nkpg == NULL) { 3219 panic("pmap_growkernel: no memory to grow " 3220 "kernel"); 3221 } 3222 paddr = VM_PAGE_TO_PHYS(nkpg); 3223 pmap_zero_page(paddr); 3224 newpd = (pdp_entry_t) 3225 (paddr | 3226 kernel_pmap.pmap_bits[PG_V_IDX] | 3227 kernel_pmap.pmap_bits[PG_RW_IDX] | 3228 kernel_pmap.pmap_bits[PG_A_IDX] | 3229 kernel_pmap.pmap_bits[PG_M_IDX]); 3230 *pmap_pd(&kernel_pmap, kstart) = newpd; 3231 continue; /* try again */ 3232 } 3233 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3234 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3235 ~(PAGE_SIZE * NPTEPG - 1); 3236 if (kstart - 1 >= kernel_map.max_offset) { 3237 kstart = kernel_map.max_offset; 3238 break; 3239 } 3240 continue; 3241 } 3242 3243 /* 3244 * We need a new PT 3245 * 3246 * This index is bogus, but out of the way 3247 */ 3248 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3249 VM_ALLOC_NORMAL | 3250 VM_ALLOC_SYSTEM | 3251 VM_ALLOC_INTERRUPT); 3252 if (nkpg == NULL) 3253 panic("pmap_growkernel: no memory to grow kernel"); 3254 3255 vm_page_wire(nkpg); 3256 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 3257 pmap_zero_page(ptppaddr); 3258 newpt = (pd_entry_t)(ptppaddr | 3259 kernel_pmap.pmap_bits[PG_V_IDX] | 3260 kernel_pmap.pmap_bits[PG_RW_IDX] | 3261 kernel_pmap.pmap_bits[PG_A_IDX] | 3262 kernel_pmap.pmap_bits[PG_M_IDX]); 3263 atomic_swap_long(pmap_pt(&kernel_pmap, kstart), newpt); 3264 3265 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3266 ~(PAGE_SIZE * NPTEPG - 1); 3267 3268 if (kstart - 1 >= kernel_map.max_offset) { 3269 kstart = kernel_map.max_offset; 3270 break; 3271 } 3272 } 3273 3274 /* 3275 * Only update kernel_vm_end for areas below KERNBASE. 3276 */ 3277 if (update_kernel_vm_end && kernel_vm_end < kstart) 3278 kernel_vm_end = kstart; 3279 } 3280 3281 /* 3282 * Add a reference to the specified pmap. 3283 */ 3284 void 3285 pmap_reference(pmap_t pmap) 3286 { 3287 if (pmap != NULL) 3288 atomic_add_int(&pmap->pm_count, 1); 3289 } 3290 3291 /*************************************************** 3292 * page management routines. 3293 ***************************************************/ 3294 3295 /* 3296 * Hold a pv without locking it 3297 */ 3298 static void 3299 pv_hold(pv_entry_t pv) 3300 { 3301 atomic_add_int(&pv->pv_hold, 1); 3302 } 3303 3304 /* 3305 * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv 3306 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3307 * the pv properly. 3308 * 3309 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3310 * pv list via its page) must be held by the caller in order to stabilize 3311 * the pv. 3312 */ 3313 static int 3314 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3315 { 3316 u_int count; 3317 3318 /* 3319 * Critical path shortcut expects pv to already have one ref 3320 * (for the pv->pv_pmap). 
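 *
 * (pv_hold packs a hold count (PV_HOLD_MASK) together with the
 * PV_HOLD_LOCKED and PV_HOLD_WAITING flag bits; the fast path below
 * atomically converts "one hold, unlocked" into "two holds, locked".)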
3321 */
3322 if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) {
3323 #ifdef PMAP_DEBUG
3324 pv->pv_func = func;
3325 pv->pv_line = lineno;
3326 #endif
3327 return TRUE;
3328 }
3329
3330 for (;;) {
3331 count = pv->pv_hold;
3332 cpu_ccfence();
3333 if ((count & PV_HOLD_LOCKED) == 0) {
3334 if (atomic_cmpset_int(&pv->pv_hold, count,
3335 (count + 1) | PV_HOLD_LOCKED)) {
3336 #ifdef PMAP_DEBUG
3337 pv->pv_func = func;
3338 pv->pv_line = lineno;
3339 #endif
3340 return TRUE;
3341 }
3342 } else {
3343 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
3344 return FALSE;
3345 }
3346 /* retry */
3347 }
3348 }
3349
3350 /*
3351 * Drop a previously held pv_entry which could not be locked, allowing its
3352 * destruction.
3353 *
3354 * Must not be called with a spinlock held as we might zfree() the pv if it
3355 * is no longer associated with a pmap and this was the last hold count.
3356 */
3357 static void
3358 pv_drop(pv_entry_t pv)
3359 {
3360 u_int count;
3361
3362 for (;;) {
3363 count = pv->pv_hold;
3364 cpu_ccfence();
3365 KKASSERT((count & PV_HOLD_MASK) > 0);
3366 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) !=
3367 (PV_HOLD_LOCKED | 1));
3368 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
3369 if ((count & PV_HOLD_MASK) == 1) {
3370 #ifdef PMAP_DEBUG2
3371 if (pmap_enter_debug > 0) {
3372 --pmap_enter_debug;
3373 kprintf("pv_drop: free pv %p\n", pv);
3374 }
3375 #endif
3376 KKASSERT(count == 1);
3377 KKASSERT(pv->pv_pmap == NULL);
3378 zfree(pvzone, pv);
3379 }
3380 return;
3381 }
3382 /* retry */
3383 }
3384 }
3385
3386 /*
3387 * Find or allocate the requested PV entry, returning a locked, held pv.
3388 *
3389 * If (*isnew) is non-zero, the returned pv will have two hold counts, one
3390 * for the caller and one representing the pmap and vm_page association.
3391 *
3392 * If (*isnew) is zero, the returned pv will have only one hold count.
3393 *
3394 * Since both associations can only be adjusted while the pv is locked,
3395 * together they represent just one additional hold.
3396 */
3397 static
3398 pv_entry_t
3399 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
3400 {
3401 struct mdglobaldata *md = mdcpu;
3402 pv_entry_t pv;
3403 pv_entry_t pnew;
3404 int pmap_excl = 0;
3405
3406 pnew = NULL;
3407 if (md->gd_newpv) {
3408 #if 1
3409 pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL);
3410 #else
3411 crit_enter();
3412 pnew = md->gd_newpv; /* might race NULL */
3413 md->gd_newpv = NULL;
3414 crit_exit();
3415 #endif
3416 }
3417 if (pnew == NULL)
3418 pnew = zalloc(pvzone);
3419
3420 spin_lock_shared(&pmap->pm_spin);
3421 for (;;) {
3422 /*
3423 * Shortcut cache
3424 */
3425 pv = pv_entry_lookup(pmap, pindex);
3426 if (pv == NULL) {
3427 vm_pindex_t *pmark;
3428
3429 /*
3430 * Requires exclusive pmap spinlock
3431 */
3432 if (pmap_excl == 0) {
3433 pmap_excl = 1;
3434 if (!spin_lock_upgrade_try(&pmap->pm_spin)) {
3435 spin_unlock_shared(&pmap->pm_spin);
3436 spin_lock(&pmap->pm_spin);
3437 continue;
3438 }
3439 }
3440
3441 /*
3442 * We need to block if someone is holding our
3443 * placemarker. As long as we determine the
3444 * placemarker has not been acquired we do not
3445 * need to acquire it ourselves, since acquisition
3446 * also requires the pmap spin lock.
3447 *
3448 * However, we can race the wakeup.
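 *
 * (The ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0 test below
 * simply asks whether this hash slot currently holds our pindex,
 * ignoring the wakeup-request bit.)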
3449 */ 3450 pmark = pmap_placemarker_hash(pmap, pindex); 3451 3452 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3453 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3454 tsleep_interlock(pmark, 0); 3455 if (((*pmark ^ pindex) & 3456 ~PM_PLACEMARK_WAKEUP) == 0) { 3457 spin_unlock(&pmap->pm_spin); 3458 tsleep(pmark, PINTERLOCKED, "pvplc", 0); 3459 spin_lock(&pmap->pm_spin); 3460 } 3461 continue; 3462 } 3463 3464 /* 3465 * Setup the new entry 3466 */ 3467 pnew->pv_pmap = pmap; 3468 pnew->pv_pindex = pindex; 3469 pnew->pv_hold = PV_HOLD_LOCKED | 2; 3470 #ifdef PMAP_DEBUG 3471 pnew->pv_func = func; 3472 pnew->pv_line = lineno; 3473 if (pnew->pv_line_lastfree > 0) { 3474 pnew->pv_line_lastfree = 3475 -pnew->pv_line_lastfree; 3476 } 3477 #endif 3478 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 3479 atomic_add_long(&pmap->pm_stats.resident_count, 1); 3480 spin_unlock(&pmap->pm_spin); 3481 *isnew = 1; 3482 3483 KKASSERT(pv == NULL); 3484 return(pnew); 3485 } 3486 3487 /* 3488 * We already have an entry, cleanup the staged pnew if 3489 * we can get the lock, otherwise block and retry. 3490 */ 3491 if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) { 3492 if (pmap_excl) 3493 spin_unlock(&pmap->pm_spin); 3494 else 3495 spin_unlock_shared(&pmap->pm_spin); 3496 #if 1 3497 pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew); 3498 if (pnew) 3499 zfree(pvzone, pnew); 3500 #else 3501 crit_enter(); 3502 if (md->gd_newpv == NULL) 3503 md->gd_newpv = pnew; 3504 else 3505 zfree(pvzone, pnew); 3506 crit_exit(); 3507 #endif 3508 KKASSERT(pv->pv_pmap == pmap && 3509 pv->pv_pindex == pindex); 3510 *isnew = 0; 3511 return(pv); 3512 } 3513 if (pmap_excl) { 3514 spin_unlock(&pmap->pm_spin); 3515 _pv_lock(pv PMAP_DEBUG_COPY); 3516 pv_put(pv); 3517 spin_lock(&pmap->pm_spin); 3518 } else { 3519 spin_unlock_shared(&pmap->pm_spin); 3520 _pv_lock(pv PMAP_DEBUG_COPY); 3521 pv_put(pv); 3522 spin_lock_shared(&pmap->pm_spin); 3523 } 3524 } 3525 /* NOT REACHED */ 3526 } 3527 3528 /* 3529 * Find the requested PV entry, returning a locked+held pv or NULL 3530 */ 3531 static 3532 pv_entry_t 3533 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL) 3534 { 3535 pv_entry_t pv; 3536 int pmap_excl = 0; 3537 3538 spin_lock_shared(&pmap->pm_spin); 3539 for (;;) { 3540 /* 3541 * Shortcut cache 3542 */ 3543 pv = pv_entry_lookup(pmap, pindex); 3544 if (pv == NULL) { 3545 /* 3546 * Block if there is ANY placemarker. If we are to 3547 * return it, we must also aquire the spot, so we 3548 * have to block even if the placemarker is held on 3549 * a different address. 3550 * 3551 * OPTIMIZATION: If pmarkp is passed as NULL the 3552 * caller is just probing (or looking for a real 3553 * pv_entry), and in this case we only need to check 3554 * to see if the placemarker matches pindex. 
3555 */ 3556 vm_pindex_t *pmark; 3557 3558 /* 3559 * Requires exclusive pmap spinlock 3560 */ 3561 if (pmap_excl == 0) { 3562 pmap_excl = 1; 3563 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3564 spin_unlock_shared(&pmap->pm_spin); 3565 spin_lock(&pmap->pm_spin); 3566 continue; 3567 } 3568 } 3569 3570 pmark = pmap_placemarker_hash(pmap, pindex); 3571 3572 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3573 ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3574 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3575 tsleep_interlock(pmark, 0); 3576 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3577 ((*pmark ^ pindex) & 3578 ~PM_PLACEMARK_WAKEUP) == 0) { 3579 spin_unlock(&pmap->pm_spin); 3580 tsleep(pmark, PINTERLOCKED, "pvpld", 0); 3581 spin_lock(&pmap->pm_spin); 3582 } 3583 continue; 3584 } 3585 if (pmarkp) { 3586 if (atomic_swap_long(pmark, pindex) != 3587 PM_NOPLACEMARK) { 3588 panic("_pv_get: pmark race"); 3589 } 3590 *pmarkp = pmark; 3591 } 3592 spin_unlock(&pmap->pm_spin); 3593 return NULL; 3594 } 3595 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3596 pv_cache(pv, pindex); 3597 if (pmap_excl) 3598 spin_unlock(&pmap->pm_spin); 3599 else 3600 spin_unlock_shared(&pmap->pm_spin); 3601 KKASSERT(pv->pv_pmap == pmap && 3602 pv->pv_pindex == pindex); 3603 return(pv); 3604 } 3605 if (pmap_excl) { 3606 spin_unlock(&pmap->pm_spin); 3607 _pv_lock(pv PMAP_DEBUG_COPY); 3608 pv_put(pv); 3609 spin_lock(&pmap->pm_spin); 3610 } else { 3611 spin_unlock_shared(&pmap->pm_spin); 3612 _pv_lock(pv PMAP_DEBUG_COPY); 3613 pv_put(pv); 3614 spin_lock_shared(&pmap->pm_spin); 3615 } 3616 } 3617 } 3618 3619 /* 3620 * Lookup, hold, and attempt to lock (pmap,pindex). 3621 * 3622 * If the entry does not exist NULL is returned and *errorp is set to 0 3623 * 3624 * If the entry exists and could be successfully locked it is returned and 3625 * errorp is set to 0. 3626 * 3627 * If the entry exists but could NOT be successfully locked it is returned 3628 * held and *errorp is set to 1. 3629 * 3630 * If the entry is placemarked by someone else NULL is returned and *errorp 3631 * is set to 1. 3632 */ 3633 static 3634 pv_entry_t 3635 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) 3636 { 3637 pv_entry_t pv; 3638 3639 spin_lock_shared(&pmap->pm_spin); 3640 3641 pv = pv_entry_lookup(pmap, pindex); 3642 if (pv == NULL) { 3643 vm_pindex_t *pmark; 3644 3645 pmark = pmap_placemarker_hash(pmap, pindex); 3646 3647 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3648 *errorp = 1; 3649 } else if (pmarkp && 3650 atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { 3651 *errorp = 0; 3652 } else { 3653 /* 3654 * Can't set a placemark with a NULL pmarkp, or if 3655 * pmarkp is non-NULL but we failed to set our 3656 * placemark. 3657 */ 3658 *errorp = 1; 3659 } 3660 if (pmarkp) 3661 *pmarkp = pmark; 3662 spin_unlock_shared(&pmap->pm_spin); 3663 3664 return NULL; 3665 } 3666 3667 /* 3668 * XXX This has problems if the lock is shared, why? 
3669 */ 3670 if (pv_hold_try(pv)) { 3671 pv_cache(pv, pindex); /* overwrite ok (shared lock) */ 3672 spin_unlock_shared(&pmap->pm_spin); 3673 *errorp = 0; 3674 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3675 return(pv); /* lock succeeded */ 3676 } 3677 spin_unlock_shared(&pmap->pm_spin); 3678 *errorp = 1; 3679 3680 return (pv); /* lock failed */ 3681 } 3682 3683 /* 3684 * Lock a held pv, keeping the hold count 3685 */ 3686 static 3687 void 3688 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3689 { 3690 u_int count; 3691 3692 for (;;) { 3693 count = pv->pv_hold; 3694 cpu_ccfence(); 3695 if ((count & PV_HOLD_LOCKED) == 0) { 3696 if (atomic_cmpset_int(&pv->pv_hold, count, 3697 count | PV_HOLD_LOCKED)) { 3698 #ifdef PMAP_DEBUG 3699 pv->pv_func = func; 3700 pv->pv_line = lineno; 3701 #endif 3702 return; 3703 } 3704 continue; 3705 } 3706 tsleep_interlock(pv, 0); 3707 if (atomic_cmpset_int(&pv->pv_hold, count, 3708 count | PV_HOLD_WAITING)) { 3709 #ifdef PMAP_DEBUG2 3710 if (pmap_enter_debug > 0) { 3711 --pmap_enter_debug; 3712 kprintf("pv waiting on %s:%d\n", 3713 pv->pv_func, pv->pv_line); 3714 } 3715 #endif 3716 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3717 } 3718 /* retry */ 3719 } 3720 } 3721 3722 /* 3723 * Unlock a held and locked pv, keeping the hold count. 3724 */ 3725 static 3726 void 3727 pv_unlock(pv_entry_t pv) 3728 { 3729 u_int count; 3730 3731 for (;;) { 3732 count = pv->pv_hold; 3733 cpu_ccfence(); 3734 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3735 (PV_HOLD_LOCKED | 1)); 3736 if (atomic_cmpset_int(&pv->pv_hold, count, 3737 count & 3738 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3739 if (count & PV_HOLD_WAITING) 3740 wakeup(pv); 3741 break; 3742 } 3743 } 3744 } 3745 3746 /* 3747 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3748 * and the hold count drops to zero we will free it. 3749 * 3750 * Caller should not hold any spin locks. We are protected from hold races 3751 * by virtue of holds only occuring only with a pmap_spin or vm_page_spin 3752 * lock held. A pv cannot be located otherwise. 3753 */ 3754 static 3755 void 3756 pv_put(pv_entry_t pv) 3757 { 3758 #ifdef PMAP_DEBUG2 3759 if (pmap_enter_debug > 0) { 3760 --pmap_enter_debug; 3761 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3762 } 3763 #endif 3764 3765 /* 3766 * Normal put-aways must have a pv_m associated with the pv, 3767 * but allow the case where the pv has been destructed due 3768 * to pmap_dynamic_delete. 3769 */ 3770 KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL); 3771 3772 /* 3773 * Fast - shortcut most common condition 3774 */ 3775 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3776 return; 3777 3778 /* 3779 * Slow 3780 */ 3781 pv_unlock(pv); 3782 pv_drop(pv); 3783 } 3784 3785 /* 3786 * Remove the pmap association from a pv, require that pv_m already be removed, 3787 * then unlock and drop the pv. Any pte operations must have already been 3788 * completed. This call may result in a last-drop which will physically free 3789 * the pv. 3790 * 3791 * Removing the pmap association entails an additional drop. 3792 * 3793 * pv must be exclusively locked on call and will be disposed of on return. 
3794 */
3795 static
3796 void
3797 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL)
3798 {
3799 pmap_t pmap;
3800
3801 #ifdef PMAP_DEBUG
3802 pv->pv_func_lastfree = func;
3803 pv->pv_line_lastfree = lineno;
3804 #endif
3805 KKASSERT(pv->pv_m == NULL);
3806 KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >=
3807 (PV_HOLD_LOCKED|1));
3808 if ((pmap = pv->pv_pmap) != NULL) {
3809 spin_lock(&pmap->pm_spin);
3810 KKASSERT(pv->pv_pmap == pmap);
3811 if (pmap->pm_pvhint_pt == pv)
3812 pmap->pm_pvhint_pt = NULL;
3813 if (pmap->pm_pvhint_pte == pv)
3814 pmap->pm_pvhint_pte = NULL;
3815 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3816 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3817 pv->pv_pmap = NULL;
3818 pv->pv_pindex = 0;
3819 spin_unlock(&pmap->pm_spin);
3820
3821 /*
3822 * Try to shortcut three atomic ops, otherwise fall through
3823 * and do it normally. Drop two refs and the lock all in
3824 * one go.
3825 */
3826 if (pvp)
3827 vm_page_unwire_quick(pvp->pv_m);
3828 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3829 #ifdef PMAP_DEBUG2
3830 if (pmap_enter_debug > 0) {
3831 --pmap_enter_debug;
3832 kprintf("pv_free: free pv %p\n", pv);
3833 }
3834 #endif
3835 zfree(pvzone, pv);
3836 return;
3837 }
3838 pv_drop(pv); /* ref for pv_pmap */
3839 }
3840 pv_unlock(pv);
3841 pv_drop(pv);
3842 }
3843
3844 /*
3845 * This routine is very drastic, but can save the system
3846 * in a pinch.
3847 */
3848 void
3849 pmap_collect(void)
3850 {
3851 int i;
3852 vm_page_t m;
3853 static int warningdone=0;
3854
3855 if (pmap_pagedaemon_waken == 0)
3856 return;
3857 pmap_pagedaemon_waken = 0;
3858 if (warningdone < 5) {
3859 kprintf("pmap_collect: collecting pv entries -- "
3860 "suggest increasing PMAP_SHPGPERPROC\n");
3861 warningdone++;
3862 }
3863
3864 for (i = 0; i < vm_page_array_size; i++) {
3865 m = &vm_page_array[i];
3866 if (m->wire_count || m->hold_count)
3867 continue;
3868 if (vm_page_busy_try(m, TRUE) == 0) {
3869 if (m->wire_count == 0 && m->hold_count == 0) {
3870 pmap_remove_all(m);
3871 }
3872 vm_page_wakeup(m);
3873 }
3874 }
3875 }
3876
3877 /*
3878 * Scan the pmap for active page table entries and issue a callback.
3879 * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
3880 * its parent page table.
3881 *
3882 * pte_pv will be NULL if the page or page table is unmanaged.
3883 * pt_pv will point to the page table page containing the pte for the page.
3884 *
3885 * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
3886 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3887 * process pmap's PD and page to the callback function. This can be
3888 * confusing because the pt_pv is really a pd_pv, and the target page
3889 * table page is simply aliased by the pmap and not owned by it.
3890 *
3891 * It is assumed that the start and end are properly rounded to the page size.
3892 *
3893 * It is assumed that PD pages and above are managed and thus in the RB tree,
3894 * allowing us to use RB_SCAN from the PD pages down for ranged scans.
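 *
 * Illustrative caller setup (hypothetical sketch; my_callback is a
 * made-up name, the real callers live elsewhere in this file):
 *
 *	struct pmap_scan_info info;
 *
 *	info.pmap = pmap;
 *	info.sva = sva;
 *	info.eva = eva;
 *	info.func = my_callback;
 *	info.arg = NULL;
 *	pmap_scan(&info, 1);	- 1 requests SMP invalidation via the
 *				  bulk pmap_inval API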
3895 */ 3896 struct pmap_scan_info { 3897 struct pmap *pmap; 3898 vm_offset_t sva; 3899 vm_offset_t eva; 3900 vm_pindex_t sva_pd_pindex; 3901 vm_pindex_t eva_pd_pindex; 3902 void (*func)(pmap_t, struct pmap_scan_info *, 3903 pv_entry_t, vm_pindex_t *, pv_entry_t, 3904 int, vm_offset_t, 3905 pt_entry_t *, void *); 3906 void *arg; 3907 pmap_inval_bulk_t bulk_core; 3908 pmap_inval_bulk_t *bulk; 3909 int count; 3910 int stop; 3911 }; 3912 3913 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3914 static int pmap_scan_callback(pv_entry_t pv, void *data); 3915 3916 static void 3917 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3918 { 3919 struct pmap *pmap = info->pmap; 3920 pv_entry_t pd_pv; /* A page directory PV */ 3921 pv_entry_t pt_pv; /* A page table PV */ 3922 pv_entry_t pte_pv; /* A page table entry PV */ 3923 vm_pindex_t *pte_placemark; 3924 vm_pindex_t *pt_placemark; 3925 pt_entry_t *ptep; 3926 pt_entry_t oldpte; 3927 struct pv_entry dummy_pv; 3928 3929 info->stop = 0; 3930 if (pmap == NULL) 3931 return; 3932 if (info->sva == info->eva) 3933 return; 3934 if (smp_inval) { 3935 info->bulk = &info->bulk_core; 3936 pmap_inval_bulk_init(&info->bulk_core, pmap); 3937 } else { 3938 info->bulk = NULL; 3939 } 3940 3941 /* 3942 * Hold the token for stability; if the pmap is empty we have nothing 3943 * to do. 3944 */ 3945 #if 0 3946 if (pmap->pm_stats.resident_count == 0) { 3947 return; 3948 } 3949 #endif 3950 3951 info->count = 0; 3952 3953 /* 3954 * Special handling for scanning one page, which is a very common 3955 * operation (it is?). 3956 * 3957 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3958 */ 3959 if (info->sva + PAGE_SIZE == info->eva) { 3960 if (info->sva >= VM_MAX_USER_ADDRESS) { 3961 /* 3962 * Kernel mappings do not track wire counts on 3963 * page table pages and only maintain pd_pv and 3964 * pte_pv levels so pmap_scan() works. 3965 */ 3966 pt_pv = NULL; 3967 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3968 &pte_placemark); 3969 ptep = vtopte(info->sva); 3970 } else { 3971 /* 3972 * User pages which are unmanaged will not have a 3973 * pte_pv. User page table pages which are unmanaged 3974 * (shared from elsewhere) will also not have a pt_pv. 3975 * The func() callback will pass both pte_pv and pt_pv 3976 * as NULL in that case. 3977 * 3978 * We hold pte_placemark across the operation for 3979 * unmanaged pages. 3980 * 3981 * WARNING! We must hold pt_placemark across the 3982 * *ptep test to prevent misintepreting 3983 * a non-zero *ptep as a shared page 3984 * table page. Hold it across the function 3985 * callback as well for SMP safety. 
3986 */ 3987 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3988 &pte_placemark); 3989 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), 3990 &pt_placemark); 3991 if (pt_pv == NULL) { 3992 KKASSERT(pte_pv == NULL); 3993 pd_pv = pv_get(pmap, 3994 pmap_pd_pindex(info->sva), 3995 NULL); 3996 if (pd_pv) { 3997 ptep = pv_pte_lookup(pd_pv, 3998 pmap_pt_index(info->sva)); 3999 if (*ptep) { 4000 info->func(pmap, info, 4001 NULL, pt_placemark, 4002 pd_pv, 1, 4003 info->sva, ptep, 4004 info->arg); 4005 } else { 4006 pv_placemarker_wakeup(pmap, 4007 pt_placemark); 4008 } 4009 pv_put(pd_pv); 4010 } else { 4011 pv_placemarker_wakeup(pmap, 4012 pt_placemark); 4013 } 4014 pv_placemarker_wakeup(pmap, pte_placemark); 4015 goto fast_skip; 4016 } 4017 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 4018 } 4019 4020 /* 4021 * NOTE: *ptep can't be ripped out from under us if we hold 4022 * pte_pv (or pte_placemark) locked, but bits can 4023 * change. 4024 */ 4025 oldpte = *ptep; 4026 cpu_ccfence(); 4027 if (oldpte == 0) { 4028 KKASSERT(pte_pv == NULL); 4029 pv_placemarker_wakeup(pmap, pte_placemark); 4030 } else if (pte_pv) { 4031 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 4032 pmap->pmap_bits[PG_V_IDX])) == 4033 (pmap->pmap_bits[PG_MANAGED_IDX] | 4034 pmap->pmap_bits[PG_V_IDX]), 4035 ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p", 4036 *ptep, oldpte, info->sva, pte_pv)); 4037 info->func(pmap, info, pte_pv, NULL, pt_pv, 0, 4038 info->sva, ptep, info->arg); 4039 } else { 4040 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 4041 pmap->pmap_bits[PG_V_IDX])) == 4042 pmap->pmap_bits[PG_V_IDX], 4043 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", 4044 *ptep, oldpte, info->sva)); 4045 info->func(pmap, info, NULL, pte_placemark, pt_pv, 0, 4046 info->sva, ptep, info->arg); 4047 } 4048 if (pt_pv) 4049 pv_put(pt_pv); 4050 fast_skip: 4051 pmap_inval_bulk_flush(info->bulk); 4052 return; 4053 } 4054 4055 /* 4056 * Nominal scan case, RB_SCAN() for PD pages and iterate from 4057 * there. 4058 * 4059 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4060 * bounds, resulting in a pd_pindex of 0. To solve the 4061 * problem we use an inclusive range. 4062 */ 4063 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 4064 info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); 4065 4066 if (info->sva >= VM_MAX_USER_ADDRESS) { 4067 /* 4068 * The kernel does not currently maintain any pv_entry's for 4069 * higher-level page tables. 4070 */ 4071 bzero(&dummy_pv, sizeof(dummy_pv)); 4072 dummy_pv.pv_pindex = info->sva_pd_pindex; 4073 spin_lock(&pmap->pm_spin); 4074 while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { 4075 pmap_scan_callback(&dummy_pv, info); 4076 ++dummy_pv.pv_pindex; 4077 if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ 4078 break; 4079 } 4080 spin_unlock(&pmap->pm_spin); 4081 } else { 4082 /* 4083 * User page tables maintain local PML4, PDP, and PD 4084 * pv_entry's at the very least. PT pv's might be 4085 * unmanaged and thus not exist. PTE pv's might be 4086 * unmanaged and thus not exist. 4087 */ 4088 spin_lock(&pmap->pm_spin); 4089 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, 4090 pmap_scan_callback, info); 4091 spin_unlock(&pmap->pm_spin); 4092 } 4093 pmap_inval_bulk_flush(info->bulk); 4094 } 4095 4096 /* 4097 * WARNING! pmap->pm_spin held 4098 * 4099 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4100 * bounds, resulting in a pd_pindex of 0. To solve the 4101 * problem we use an inclusive range. 
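 *
 * Hedged worked example (illustrative): if eva sat at the very top of the
 * address space, an exclusive bound computed as pmap_pd_pindex(eva) would
 * wrap to pindex 0 and the range would look empty.  pmap_scan() therefore
 * computes pmap_pd_pindex(eva - PAGE_SIZE) and pmap_scan_cmp() treats
 * eva_pd_pindex as an inclusive upper bound below.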
4102 */ 4103 static int 4104 pmap_scan_cmp(pv_entry_t pv, void *data) 4105 { 4106 struct pmap_scan_info *info = data; 4107 if (pv->pv_pindex < info->sva_pd_pindex) 4108 return(-1); 4109 if (pv->pv_pindex > info->eva_pd_pindex) 4110 return(1); 4111 return(0); 4112 } 4113 4114 /* 4115 * pmap_scan() by PDs 4116 * 4117 * WARNING! pmap->pm_spin held 4118 */ 4119 static int 4120 pmap_scan_callback(pv_entry_t pv, void *data) 4121 { 4122 struct pmap_scan_info *info = data; 4123 struct pmap *pmap = info->pmap; 4124 pv_entry_t pd_pv; /* A page directory PV */ 4125 pv_entry_t pt_pv; /* A page table PV */ 4126 vm_pindex_t *pt_placemark; 4127 pt_entry_t *ptep; 4128 pt_entry_t oldpte; 4129 vm_offset_t sva; 4130 vm_offset_t eva; 4131 vm_offset_t va_next; 4132 vm_pindex_t pd_pindex; 4133 int error; 4134 4135 /* 4136 * Stop if requested 4137 */ 4138 if (info->stop) 4139 return -1; 4140 4141 /* 4142 * Pull the PD pindex from the pv before releasing the spinlock. 4143 * 4144 * WARNING: pv is faked for kernel pmap scans. 4145 */ 4146 pd_pindex = pv->pv_pindex; 4147 spin_unlock(&pmap->pm_spin); 4148 pv = NULL; /* invalid after spinlock unlocked */ 4149 4150 /* 4151 * Calculate the page range within the PD. SIMPLE pmaps are 4152 * direct-mapped for the entire 2^64 address space. Normal pmaps 4153 * reflect the user and kernel address space which requires 4154 * cannonicalization w/regards to converting pd_pindex's back 4155 * into addresses. 4156 */ 4157 sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT; 4158 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 4159 (sva & PML4_SIGNMASK)) { 4160 sva |= PML4_SIGNMASK; 4161 } 4162 eva = sva + NBPDP; /* can overflow */ 4163 if (sva < info->sva) 4164 sva = info->sva; 4165 if (eva < info->sva || eva > info->eva) 4166 eva = info->eva; 4167 4168 /* 4169 * NOTE: kernel mappings do not track page table pages, only 4170 * terminal pages. 4171 * 4172 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 4173 * However, for the scan to be efficient we try to 4174 * cache items top-down. 4175 */ 4176 pd_pv = NULL; 4177 pt_pv = NULL; 4178 4179 for (; sva < eva; sva = va_next) { 4180 if (info->stop) 4181 break; 4182 if (sva >= VM_MAX_USER_ADDRESS) { 4183 if (pt_pv) { 4184 pv_put(pt_pv); 4185 pt_pv = NULL; 4186 } 4187 goto kernel_skip; 4188 } 4189 4190 /* 4191 * PD cache, scan shortcut if it doesn't exist. 4192 */ 4193 if (pd_pv == NULL) { 4194 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4195 } else if (pd_pv->pv_pmap != pmap || 4196 pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 4197 pv_put(pd_pv); 4198 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4199 } 4200 if (pd_pv == NULL) { 4201 va_next = (sva + NBPDP) & ~PDPMASK; 4202 if (va_next < sva) 4203 va_next = eva; 4204 continue; 4205 } 4206 4207 /* 4208 * PT cache 4209 * 4210 * NOTE: The cached pt_pv can be removed from the pmap when 4211 * pmap_dynamic_delete is enabled. 
4212 */ 4213 if (pt_pv && (pt_pv->pv_pmap != pmap || 4214 pt_pv->pv_pindex != pmap_pt_pindex(sva))) { 4215 pv_put(pt_pv); 4216 pt_pv = NULL; 4217 } 4218 if (pt_pv == NULL) { 4219 pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva), 4220 &pt_placemark, &error); 4221 if (error) { 4222 pv_put(pd_pv); /* lock order */ 4223 pd_pv = NULL; 4224 if (pt_pv) { 4225 pv_lock(pt_pv); 4226 pv_put(pt_pv); 4227 pt_pv = NULL; 4228 } else { 4229 pv_placemarker_wait(pmap, pt_placemark); 4230 } 4231 va_next = sva; 4232 continue; 4233 } 4234 /* may have to re-check later if pt_pv is NULL here */ 4235 } 4236 4237 /* 4238 * If pt_pv is NULL we either have an shared page table 4239 * page and must issue a callback specific to that case, 4240 * or there is no page table page. 4241 * 4242 * Either way we can skip the page table page. 4243 * 4244 * WARNING! pt_pv can also be NULL due to a pv creation 4245 * race where we find it to be NULL and then 4246 * later see a pte_pv. But its possible the pt_pv 4247 * got created inbetween the two operations, so 4248 * we must check. 4249 */ 4250 if (pt_pv == NULL) { 4251 /* 4252 * Possible unmanaged (shared from another pmap) 4253 * page table page. 4254 * 4255 * WARNING! We must hold pt_placemark across the 4256 * *ptep test to prevent misintepreting 4257 * a non-zero *ptep as a shared page 4258 * table page. Hold it across the function 4259 * callback as well for SMP safety. 4260 */ 4261 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 4262 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 4263 info->func(pmap, info, NULL, pt_placemark, 4264 pd_pv, 1, 4265 sva, ptep, info->arg); 4266 } else { 4267 pv_placemarker_wakeup(pmap, pt_placemark); 4268 } 4269 4270 /* 4271 * Done, move to next page table page. 4272 */ 4273 va_next = (sva + NBPDR) & ~PDRMASK; 4274 if (va_next < sva) 4275 va_next = eva; 4276 continue; 4277 } 4278 4279 /* 4280 * From this point in the loop testing pt_pv for non-NULL 4281 * means we are in UVM, else if it is NULL we are in KVM. 4282 * 4283 * Limit our scan to either the end of the va represented 4284 * by the current page table page, or to the end of the 4285 * range being removed. 4286 */ 4287 kernel_skip: 4288 va_next = (sva + NBPDR) & ~PDRMASK; 4289 if (va_next < sva) 4290 va_next = eva; 4291 if (va_next > eva) 4292 va_next = eva; 4293 4294 /* 4295 * Scan the page table for pages. Some pages may not be 4296 * managed (might not have a pv_entry). 4297 * 4298 * There is no page table management for kernel pages so 4299 * pt_pv will be NULL in that case, but otherwise pt_pv 4300 * is non-NULL, locked, and referenced. 4301 */ 4302 4303 /* 4304 * At this point a non-NULL pt_pv means a UVA, and a NULL 4305 * pt_pv means a KVA. 4306 */ 4307 if (pt_pv) 4308 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 4309 else 4310 ptep = vtopte(sva); 4311 4312 while (sva < va_next) { 4313 pv_entry_t pte_pv; 4314 vm_pindex_t *pte_placemark; 4315 4316 /* 4317 * Yield every 64 pages, stop if requested. 4318 */ 4319 if ((++info->count & 63) == 0) 4320 lwkt_user_yield(); 4321 if (info->stop) 4322 break; 4323 4324 /* 4325 * We can shortcut our scan if *ptep == 0. This is 4326 * an unlocked check. 4327 */ 4328 if (*ptep == 0) { 4329 sva += PAGE_SIZE; 4330 ++ptep; 4331 continue; 4332 } 4333 cpu_ccfence(); 4334 4335 /* 4336 * Acquire the related pte_pv, if any. If *ptep == 0 4337 * the related pte_pv should not exist, but if *ptep 4338 * is not zero the pte_pv may or may not exist (e.g. 4339 * will not exist for an unmanaged page). 
4340 * 4341 * However a multitude of races are possible here 4342 * so if we cannot lock definite state we clean out 4343 * our cache and break the inner while() loop to 4344 * force a loop up to the top of the for(). 4345 * 4346 * XXX unlock/relock pd_pv, pt_pv, and re-test their 4347 * validity instead of looping up? 4348 */ 4349 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 4350 &pte_placemark, &error); 4351 if (error) { 4352 pv_put(pd_pv); /* lock order */ 4353 pd_pv = NULL; 4354 if (pt_pv) { 4355 pv_put(pt_pv); /* lock order */ 4356 pt_pv = NULL; 4357 } 4358 if (pte_pv) { /* block */ 4359 pv_lock(pte_pv); 4360 pv_put(pte_pv); 4361 pte_pv = NULL; 4362 } else { 4363 pv_placemarker_wait(pmap, 4364 pte_placemark); 4365 } 4366 va_next = sva; /* retry */ 4367 break; 4368 } 4369 4370 /* 4371 * Reload *ptep after successfully locking the 4372 * pindex. If *ptep == 0 we had better NOT have a 4373 * pte_pv. 4374 */ 4375 cpu_ccfence(); 4376 oldpte = *ptep; 4377 if (oldpte == 0) { 4378 if (pte_pv) { 4379 kprintf("Unexpected non-NULL pte_pv " 4380 "%p pt_pv %p " 4381 "*ptep = %016lx/%016lx\n", 4382 pte_pv, pt_pv, *ptep, oldpte); 4383 panic("Unexpected non-NULL pte_pv"); 4384 } else { 4385 pv_placemarker_wakeup(pmap, pte_placemark); 4386 } 4387 sva += PAGE_SIZE; 4388 ++ptep; 4389 continue; 4390 } 4391 4392 /* 4393 * We can't hold pd_pv across the callback (because 4394 * we don't pass it to the callback and the callback 4395 * might deadlock) 4396 */ 4397 if (pd_pv) { 4398 vm_page_wire_quick(pd_pv->pv_m); 4399 pv_unlock(pd_pv); 4400 } 4401 4402 /* 4403 * Ready for the callback. The locked pte_pv (if any) 4404 * is consumed by the callback. pte_pv will exist if 4405 * the page is managed, and will not exist if it 4406 * isn't. 4407 */ 4408 if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4409 /* 4410 * Managed pte 4411 */ 4412 KASSERT(pte_pv && 4413 (oldpte & pmap->pmap_bits[PG_V_IDX]), 4414 ("badC *ptep %016lx/%016lx sva %016lx " 4415 "pte_pv %p", 4416 *ptep, oldpte, sva, pte_pv)); 4417 /* 4418 * We must unlock pd_pv across the callback 4419 * to avoid deadlocks on any recursive 4420 * disposal. Re-check that it still exists 4421 * after re-locking. 4422 * 4423 * Call target disposes of pte_pv and may 4424 * destroy but will not dispose of pt_pv. 4425 */ 4426 info->func(pmap, info, pte_pv, NULL, 4427 pt_pv, 0, 4428 sva, ptep, info->arg); 4429 } else { 4430 /* 4431 * Unmanaged pte 4432 * 4433 * We must unlock pd_pv across the callback 4434 * to avoid deadlocks on any recursive 4435 * disposal. Re-check that it still exists 4436 * after re-locking. 4437 * 4438 * Call target disposes of pte_pv or 4439 * pte_placemark and may destroy but will 4440 * not dispose of pt_pv. 4441 */ 4442 KASSERT(pte_pv == NULL && 4443 (oldpte & pmap->pmap_bits[PG_V_IDX]), 4444 ("badD *ptep %016lx/%016lx sva %016lx " 4445 "pte_pv %p pte_pv->pv_m %p ", 4446 *ptep, oldpte, sva, 4447 pte_pv, (pte_pv ? pte_pv->pv_m : NULL))); 4448 if (pte_pv) 4449 kprintf("RaceD\n"); 4450 if (pte_pv) { 4451 info->func(pmap, info, 4452 pte_pv, NULL, 4453 pt_pv, 0, 4454 sva, ptep, info->arg); 4455 } else { 4456 info->func(pmap, info, 4457 NULL, pte_placemark, 4458 pt_pv, 0, 4459 sva, ptep, info->arg); 4460 } 4461 } 4462 if (pd_pv) { 4463 pv_lock(pd_pv); 4464 vm_page_unwire_quick(pd_pv->pv_m); 4465 if (pd_pv->pv_pmap == NULL) { 4466 va_next = sva; /* retry */ 4467 break; 4468 } 4469 } 4470 4471 /* 4472 * NOTE: The cached pt_pv can be removed from the 4473 * pmap when pmap_dynamic_delete is enabled, 4474 * which will cause ptep to become stale. 
4475 * 4476 * This also means that no pages remain under 4477 * the PT, so we can just break out of the inner 4478 * loop and let the outer loop clean everything 4479 * up. 4480 */ 4481 if (pt_pv && pt_pv->pv_pmap != pmap) 4482 break; 4483 pte_pv = NULL; 4484 sva += PAGE_SIZE; 4485 ++ptep; 4486 } 4487 } 4488 if (pd_pv) { 4489 pv_put(pd_pv); 4490 pd_pv = NULL; 4491 } 4492 if (pt_pv) { 4493 pv_put(pt_pv); 4494 pt_pv = NULL; 4495 } 4496 if ((++info->count & 7) == 0) 4497 lwkt_user_yield(); 4498 4499 /* 4500 * Relock before returning. 4501 */ 4502 spin_lock(&pmap->pm_spin); 4503 return (0); 4504 } 4505 4506 void 4507 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4508 { 4509 struct pmap_scan_info info; 4510 4511 info.pmap = pmap; 4512 info.sva = sva; 4513 info.eva = eva; 4514 info.func = pmap_remove_callback; 4515 info.arg = NULL; 4516 pmap_scan(&info, 1); 4517 #if 0 4518 cpu_invltlb(); 4519 if (eva - sva < 1024*1024) { 4520 while (sva < eva) { 4521 cpu_invlpg((void *)sva); 4522 sva += PAGE_SIZE; 4523 } 4524 } 4525 #endif 4526 } 4527 4528 static void 4529 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4530 { 4531 struct pmap_scan_info info; 4532 4533 info.pmap = pmap; 4534 info.sva = sva; 4535 info.eva = eva; 4536 info.func = pmap_remove_callback; 4537 info.arg = NULL; 4538 pmap_scan(&info, 0); 4539 } 4540 4541 static void 4542 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4543 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 4544 pv_entry_t pt_pv, int sharept, 4545 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4546 { 4547 pt_entry_t pte; 4548 4549 if (pte_pv) { 4550 /* 4551 * Managed entry 4552 * 4553 * This will also drop pt_pv's wire_count. Note that 4554 * terminal pages are not wired based on mmu presence. 4555 * 4556 * NOTE: If this is the kernel_pmap, pt_pv can be NULL. 4557 */ 4558 KKASSERT(pte_pv->pv_m != NULL); 4559 pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk, 2); 4560 pte_pv = NULL; /* safety */ 4561 4562 /* 4563 * Recursively destroy higher-level page tables. 4564 * 4565 * This is optional. If we do not, they will still 4566 * be destroyed when the process exits. 4567 * 4568 * NOTE: Do not destroy pv_entry's with extra hold refs, 4569 * a caller may have unlocked it and intends to 4570 * continue to use it. 4571 */ 4572 if (pmap_dynamic_delete && 4573 pt_pv && 4574 pt_pv->pv_m && 4575 pt_pv->pv_m->wire_count == 1 && 4576 (pt_pv->pv_hold & PV_HOLD_MASK) == 2 && 4577 pt_pv->pv_pindex != pmap_pml4_pindex()) { 4578 if (pmap_dynamic_delete == 2) 4579 kprintf("B %jd %08x\n", pt_pv->pv_pindex, pt_pv->pv_hold); 4580 pv_hold(pt_pv); /* extra hold */ 4581 pmap_remove_pv_pte(pt_pv, NULL, info->bulk, 1); 4582 pv_lock(pt_pv); /* prior extra hold + relock */ 4583 } 4584 } else if (sharept == 0) { 4585 /* 4586 * Unmanaged pte (pte_placemark is non-NULL) 4587 * 4588 * pt_pv's wire_count is still bumped by unmanaged pages 4589 * so we must decrement it manually. 4590 * 4591 * We have to unwire the target page table page. 4592 */ 4593 pte = pmap_inval_bulk(info->bulk, va, ptep, 0); 4594 if (pte & pmap->pmap_bits[PG_W_IDX]) 4595 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4596 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4597 if (vm_page_unwire_quick(pt_pv->pv_m)) 4598 panic("pmap_remove: insufficient wirecount"); 4599 pv_placemarker_wakeup(pmap, pte_placemark); 4600 } else { 4601 /* 4602 * Unmanaged page table (pt, pd, or pdp. Not pte) for 4603 * a shared page table. 
4604 * 4605 * pt_pv is actually the pd_pv for our pmap (not the shared 4606 * object pmap). 4607 * 4608 * We have to unwire the target page table page and we 4609 * have to unwire our page directory page. 4610 * 4611 * It is unclear how we can invalidate a segment so we 4612 * invalidate -1 which invlidates the tlb. 4613 */ 4614 pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0); 4615 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4616 KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0); 4617 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4618 panic("pmap_remove: shared pgtable1 bad wirecount"); 4619 if (vm_page_unwire_quick(pt_pv->pv_m)) 4620 panic("pmap_remove: shared pgtable2 bad wirecount"); 4621 pv_placemarker_wakeup(pmap, pte_placemark); 4622 } 4623 } 4624 4625 /* 4626 * Removes this physical page from all physical maps in which it resides. 4627 * Reflects back modify bits to the pager. 4628 * 4629 * This routine may not be called from an interrupt. 4630 */ 4631 static 4632 void 4633 pmap_remove_all(vm_page_t m) 4634 { 4635 pv_entry_t pv; 4636 pmap_inval_bulk_t bulk; 4637 4638 if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/) 4639 return; 4640 4641 vm_page_spin_lock(m); 4642 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4643 KKASSERT(pv->pv_m == m); 4644 if (pv_hold_try(pv)) { 4645 vm_page_spin_unlock(m); 4646 } else { 4647 vm_page_spin_unlock(m); 4648 pv_lock(pv); 4649 pv_put(pv); 4650 vm_page_spin_lock(m); 4651 continue; 4652 } 4653 KKASSERT(pv->pv_pmap && pv->pv_m == m); 4654 4655 /* 4656 * Holding no spinlocks, pv is locked. Once we scrap 4657 * pv we can no longer use it as a list iterator (but 4658 * we are doing a TAILQ_FIRST() so we are ok). 4659 */ 4660 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 4661 pmap_remove_pv_pte(pv, NULL, &bulk, 2); 4662 pv = NULL; /* safety */ 4663 pmap_inval_bulk_flush(&bulk); 4664 vm_page_spin_lock(m); 4665 } 4666 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 4667 vm_page_spin_unlock(m); 4668 } 4669 4670 /* 4671 * Removes the page from a particular pmap 4672 */ 4673 void 4674 pmap_remove_specific(pmap_t pmap, vm_page_t m) 4675 { 4676 pv_entry_t pv; 4677 pmap_inval_bulk_t bulk; 4678 4679 if (!pmap_initialized) 4680 return; 4681 4682 again: 4683 vm_page_spin_lock(m); 4684 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4685 if (pv->pv_pmap != pmap) 4686 continue; 4687 KKASSERT(pv->pv_m == m); 4688 if (pv_hold_try(pv)) { 4689 vm_page_spin_unlock(m); 4690 } else { 4691 vm_page_spin_unlock(m); 4692 pv_lock(pv); 4693 pv_put(pv); 4694 goto again; 4695 } 4696 KKASSERT(pv->pv_pmap == pmap && pv->pv_m == m); 4697 4698 /* 4699 * Holding no spinlocks, pv is locked. Once gone it can't 4700 * be used as an iterator. In fact, because we couldn't 4701 * necessarily lock it atomically it may have moved within 4702 * the list and ALSO cannot be used as an iterator. 4703 */ 4704 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 4705 pmap_remove_pv_pte(pv, NULL, &bulk, 2); 4706 pv = NULL; /* safety */ 4707 pmap_inval_bulk_flush(&bulk); 4708 goto again; 4709 } 4710 vm_page_spin_unlock(m); 4711 } 4712 4713 /* 4714 * Set the physical protection on the specified range of this map 4715 * as requested. This function is typically only used for debug watchpoints 4716 * and COW pages. 4717 * 4718 * This function may not be called from an interrupt if the map is 4719 * not the kernel_pmap. 4720 * 4721 * NOTE! For shared page table pages we just unmap the page. 
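 *
 * Hedged usage sketch (illustrative only): revoking write access on a
 * user range, e.g. for copy-on-write:
 *
 *	pmap_protect(pmap, start, end, VM_PROT_READ);
 *
 * Requests that still include VM_PROT_WRITE return without doing
 * anything, and requests with neither read nor execute degrade to a
 * full pmap_remove() of the range.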
4722 */ 4723 void 4724 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4725 { 4726 struct pmap_scan_info info; 4727 /* JG review for NX */ 4728 4729 if (pmap == NULL) 4730 return; 4731 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 4732 pmap_remove(pmap, sva, eva); 4733 return; 4734 } 4735 if (prot & VM_PROT_WRITE) 4736 return; 4737 info.pmap = pmap; 4738 info.sva = sva; 4739 info.eva = eva; 4740 info.func = pmap_protect_callback; 4741 info.arg = &prot; 4742 pmap_scan(&info, 1); 4743 } 4744 4745 static 4746 void 4747 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4748 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 4749 pv_entry_t pt_pv, int sharept, 4750 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4751 { 4752 pt_entry_t pbits; 4753 pt_entry_t cbits; 4754 pt_entry_t pte; 4755 vm_page_t m; 4756 4757 again: 4758 pbits = *ptep; 4759 cbits = pbits; 4760 if (pte_pv) { 4761 KKASSERT(pte_pv->pv_m != NULL); 4762 m = NULL; 4763 if (pbits & pmap->pmap_bits[PG_A_IDX]) { 4764 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4765 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4766 KKASSERT(m == pte_pv->pv_m); 4767 vm_page_flag_set(m, PG_REFERENCED); 4768 } 4769 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4770 } 4771 if (pbits & pmap->pmap_bits[PG_M_IDX]) { 4772 if (pmap_track_modified(pte_pv->pv_pindex)) { 4773 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4774 if (m == NULL) { 4775 m = PHYS_TO_VM_PAGE(pbits & 4776 PG_FRAME); 4777 } 4778 vm_page_dirty(m); 4779 } 4780 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4781 } 4782 } 4783 } else if (sharept) { 4784 /* 4785 * Unmanaged page table, pt_pv is actually the pd_pv 4786 * for our pmap (not the object's shared pmap). 4787 * 4788 * When asked to protect something in a shared page table 4789 * page we just unmap the page table page. We have to 4790 * invalidate the tlb in this situation. 4791 * 4792 * XXX Warning, shared page tables will not be used for 4793 * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings 4794 * so PHYS_TO_VM_PAGE() should be safe here. 4795 */ 4796 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 4797 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4798 panic("pmap_protect: pgtable1 pg bad wirecount"); 4799 if (vm_page_unwire_quick(pt_pv->pv_m)) 4800 panic("pmap_protect: pgtable2 pg bad wirecount"); 4801 ptep = NULL; 4802 } 4803 /* else unmanaged page, adjust bits, no wire changes */ 4804 4805 if (ptep) { 4806 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4807 #ifdef PMAP_DEBUG2 4808 if (pmap_enter_debug > 0) { 4809 --pmap_enter_debug; 4810 kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p " 4811 "pt_pv=%p cbits=%08lx\n", 4812 va, ptep, pte_pv, 4813 pt_pv, cbits 4814 ); 4815 } 4816 #endif 4817 if (pbits != cbits) { 4818 vm_offset_t xva; 4819 4820 xva = (sharept) ? (vm_offset_t)-1 : va; 4821 if (!pmap_inval_smp_cmpset(pmap, xva, 4822 ptep, pbits, cbits)) { 4823 goto again; 4824 } 4825 } 4826 } 4827 if (pte_pv) 4828 pv_put(pte_pv); 4829 else 4830 pv_placemarker_wakeup(pmap, pte_placemark); 4831 } 4832 4833 /* 4834 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4835 * mapping at that address. Set protection and wiring as requested. 4836 * 4837 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4838 * possible. 
If it is we enter the page into the appropriate shared pmap 4839 * hanging off the related VM object instead of the passed pmap, then we 4840 * share the page table page from the VM object's pmap into the current pmap. 4841 * 4842 * NOTE: This routine MUST insert the page into the pmap now, it cannot 4843 * lazy-evaluate. 4844 * 4845 * NOTE: If (m) is PG_UNMANAGED it may also be a temporary fake vm_page_t. 4846 * never record it. 4847 */ 4848 void 4849 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4850 boolean_t wired, vm_map_entry_t entry) 4851 { 4852 pv_entry_t pt_pv; /* page table */ 4853 pv_entry_t pte_pv; /* page table entry */ 4854 vm_pindex_t *pte_placemark; 4855 pt_entry_t *ptep; 4856 vm_paddr_t opa; 4857 pt_entry_t origpte, newpte; 4858 vm_paddr_t pa; 4859 4860 if (pmap == NULL) 4861 return; 4862 va = trunc_page(va); 4863 #ifdef PMAP_DIAGNOSTIC 4864 if (va >= KvaEnd) 4865 panic("pmap_enter: toobig"); 4866 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4867 panic("pmap_enter: invalid to pmap_enter page table " 4868 "pages (va: 0x%lx)", va); 4869 #endif 4870 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4871 kprintf("Warning: pmap_enter called on UVA with " 4872 "kernel_pmap\n"); 4873 #ifdef DDB 4874 db_print_backtrace(); 4875 #endif 4876 } 4877 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4878 kprintf("Warning: pmap_enter called on KVA without" 4879 "kernel_pmap\n"); 4880 #ifdef DDB 4881 db_print_backtrace(); 4882 #endif 4883 } 4884 4885 /* 4886 * Get locked PV entries for our new page table entry (pte_pv or 4887 * pte_placemark) and for its parent page table (pt_pv). We need 4888 * the parent so we can resolve the location of the ptep. 4889 * 4890 * Only hardware MMU actions can modify the ptep out from 4891 * under us. 4892 * 4893 * if (m) is fictitious or unmanaged we do not create a managing 4894 * pte_pv for it. Any pre-existing page's management state must 4895 * match (avoiding code complexity). 4896 * 4897 * If the pmap is still being initialized we assume existing 4898 * page tables. 4899 * 4900 * Kernel mapppings do not track page table pages (i.e. pt_pv). 4901 * 4902 * WARNING! If replacing a managed mapping with an unmanaged mapping 4903 * pte_pv will wind up being non-NULL and must be handled 4904 * below. 4905 */ 4906 if (pmap_initialized == FALSE) { 4907 pte_pv = NULL; 4908 pt_pv = NULL; 4909 pte_placemark = NULL; 4910 ptep = vtopte(va); 4911 origpte = *ptep; 4912 } else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */ 4913 pmap_softwait(pmap); 4914 pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark); 4915 KKASSERT(pte_pv == NULL); 4916 if (va >= VM_MAX_USER_ADDRESS) { 4917 pt_pv = NULL; 4918 ptep = vtopte(va); 4919 } else { 4920 pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), 4921 NULL, entry, va); 4922 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4923 } 4924 origpte = *ptep; 4925 cpu_ccfence(); 4926 KASSERT(origpte == 0 || 4927 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0, 4928 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4929 } else { 4930 pmap_softwait(pmap); 4931 if (va >= VM_MAX_USER_ADDRESS) { 4932 /* 4933 * Kernel map, pv_entry-tracked. 
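 * (Descriptive addition: on this branch a pte_pv is allocated via
 * pmap_allocpte() just below, but pt_pv stays NULL because kernel page
 * table pages are not tracked by pv_entry's.)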
4934 */ 4935 pt_pv = NULL; 4936 pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); 4937 ptep = vtopte(va); 4938 } else { 4939 /* 4940 * User map 4941 */ 4942 pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va), 4943 &pt_pv, entry, va); 4944 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4945 } 4946 pte_placemark = NULL; /* safety */ 4947 origpte = *ptep; 4948 cpu_ccfence(); 4949 KASSERT(origpte == 0 || 4950 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]), 4951 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4952 } 4953 4954 pa = VM_PAGE_TO_PHYS(m); 4955 opa = origpte & PG_FRAME; 4956 4957 /* 4958 * Calculate the new PTE. Note that pte_pv alone does not mean 4959 * the new pte_pv is managed, it could exist because the old pte 4960 * was managed even if the new one is not. 4961 */ 4962 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4963 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4964 if (wired) 4965 newpte |= pmap->pmap_bits[PG_W_IDX]; 4966 if (va < VM_MAX_USER_ADDRESS) 4967 newpte |= pmap->pmap_bits[PG_U_IDX]; 4968 if (pte_pv && (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) == 0) 4969 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4970 // if (pmap == &kernel_pmap) 4971 // newpte |= pgeflag; 4972 newpte |= pmap->pmap_cache_bits[m->pat_mode]; 4973 if (m->flags & PG_FICTITIOUS) 4974 newpte |= pmap->pmap_bits[PG_DEVICE_IDX]; 4975 4976 /* 4977 * It is possible for multiple faults to occur in threaded 4978 * environments, the existing pte might be correct. 4979 */ 4980 if (((origpte ^ newpte) & 4981 ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4982 pmap->pmap_bits[PG_A_IDX])) == 0) { 4983 goto done; 4984 } 4985 4986 /* 4987 * Ok, either the address changed or the protection or wiring 4988 * changed. 4989 * 4990 * Clear the current entry, interlocking the removal. For managed 4991 * pte's this will also flush the modified state to the vm_page. 4992 * Atomic ops are mandatory in order to ensure that PG_M events are 4993 * not lost during any transition. 4994 * 4995 * WARNING: The caller has busied the new page but not the original 4996 * vm_page which we are trying to replace. Because we hold 4997 * the pte_pv lock, but have not busied the page, PG bits 4998 * can be cleared out from under us. 4999 */ 5000 if (opa) { 5001 if (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 5002 /* 5003 * Old page was managed. Expect pte_pv to exist. 5004 * (it might also exist if the old page was unmanaged). 5005 * 5006 * NOTE: pt_pv won't exist for a kernel page 5007 * (managed or otherwise). 5008 * 5009 * NOTE: We may be reusing the pte_pv so we do not 5010 * destroy it in pmap_remove_pv_pte(). 5011 */ 5012 KKASSERT(pte_pv && pte_pv->pv_m); 5013 if (prot & VM_PROT_NOSYNC) { 5014 pmap_remove_pv_pte(pte_pv, pt_pv, NULL, 0); 5015 } else { 5016 pmap_inval_bulk_t bulk; 5017 5018 pmap_inval_bulk_init(&bulk, pmap); 5019 pmap_remove_pv_pte(pte_pv, pt_pv, &bulk, 0); 5020 pmap_inval_bulk_flush(&bulk); 5021 } 5022 pmap_remove_pv_page(pte_pv); 5023 /* will either set pte_pv->pv_m or pv_free() later */ 5024 } else { 5025 /* 5026 * Old page was not managed. If we have a pte_pv 5027 * it better not have a pv_m assigned to it. If the 5028 * new page is managed the pte_pv will be destroyed 5029 * near the end (we need its interlock). 5030 * 5031 * NOTE: We leave the wire count on the PT page 5032 * intact for the followup enter, but adjust 5033 * the wired-pages count on the pmap. 5034 */ 5035 KKASSERT(pte_pv == NULL); 5036 if (prot & VM_PROT_NOSYNC) { 5037 /* 5038 * NOSYNC (no mmu sync) requested. 
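 * (Descriptive addition: the pte is atomically load-and-cleared and only
 * the local cpu's TLB entry is invalidated; no cross-cpu interlock is
 * issued here, unlike the pmap_inval_smp() path taken for the nominal
 * SYNC case below.)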
5039 */ 5040 (void)pte_load_clear(ptep); 5041 cpu_invlpg((void *)va); 5042 } else { 5043 /* 5044 * Nominal SYNC 5045 */ 5046 pmap_inval_smp(pmap, va, 1, ptep, 0); 5047 } 5048 5049 /* 5050 * We must adjust pm_stats manually for unmanaged 5051 * pages. 5052 */ 5053 if (pt_pv) { 5054 atomic_add_long(&pmap->pm_stats. 5055 resident_count, -1); 5056 } 5057 if (origpte & pmap->pmap_bits[PG_W_IDX]) { 5058 atomic_add_long(&pmap->pm_stats. 5059 wired_count, -1); 5060 } 5061 } 5062 KKASSERT(*ptep == 0); 5063 } 5064 5065 #ifdef PMAP_DEBUG2 5066 if (pmap_enter_debug > 0) { 5067 --pmap_enter_debug; 5068 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 5069 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 5070 va, m, 5071 origpte, newpte, ptep, 5072 pte_pv, pt_pv, opa, prot); 5073 } 5074 #endif 5075 5076 if ((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 5077 /* 5078 * Entering an unmanaged page. We must wire the pt_pv unless 5079 * we retained the wiring from an unmanaged page we had 5080 * removed (if we retained it via pte_pv that will go away 5081 * soon). 5082 */ 5083 if (pt_pv && (opa == 0 || 5084 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]))) { 5085 vm_page_wire_quick(pt_pv->pv_m); 5086 } 5087 if (wired) 5088 atomic_add_long(&pmap->pm_stats.wired_count, 1); 5089 5090 /* 5091 * Unmanaged pages need manual resident_count tracking. 5092 */ 5093 if (pt_pv) { 5094 atomic_add_long(&pt_pv->pv_pmap->pm_stats. 5095 resident_count, 1); 5096 } 5097 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 5098 vm_page_flag_set(m, PG_WRITEABLE); 5099 } else { 5100 /* 5101 * Entering a managed page. Our pte_pv takes care of the 5102 * PT wiring, so if we had removed an unmanaged page before 5103 * we must adjust. 5104 * 5105 * We have to take care of the pmap wired count ourselves. 5106 * 5107 * Enter on the PV list if part of our managed memory. 5108 */ 5109 KKASSERT(pte_pv && (pte_pv->pv_m == NULL || pte_pv->pv_m == m)); 5110 vm_page_spin_lock(m); 5111 pte_pv->pv_m = m; 5112 pmap_page_stats_adding(m); 5113 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); 5114 vm_page_flag_set(m, PG_MAPPED); 5115 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 5116 vm_page_flag_set(m, PG_WRITEABLE); 5117 vm_page_spin_unlock(m); 5118 5119 if (pt_pv && opa && 5120 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 5121 vm_page_unwire_quick(pt_pv->pv_m); 5122 } 5123 5124 /* 5125 * Adjust pmap wired pages count for new entry. 5126 */ 5127 if (wired) { 5128 atomic_add_long(&pte_pv->pv_pmap->pm_stats. 5129 wired_count, 1); 5130 } 5131 } 5132 5133 /* 5134 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks. 5135 * 5136 * User VMAs do not because those will be zero->non-zero, so no 5137 * stale entries to worry about at this point. 5138 * 5139 * For KVM there appear to still be issues. Theoretically we 5140 * should be able to scrap the interlocks entirely but we 5141 * get crashes. 5142 */ 5143 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) { 5144 pmap_inval_smp(pmap, va, 1, ptep, newpte); 5145 } else { 5146 origpte = atomic_swap_long(ptep, newpte); 5147 if (origpte & pmap->pmap_bits[PG_M_IDX]) { 5148 kprintf("pmap [M] race @ %016jx\n", va); 5149 atomic_set_long(ptep, pmap->pmap_bits[PG_M_IDX]); 5150 } 5151 if (pt_pv == NULL) 5152 cpu_invlpg((void *)va); 5153 } 5154 5155 /* 5156 * Cleanup 5157 */ 5158 done: 5159 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 5160 (m->flags & PG_MAPPED)); 5161 5162 /* 5163 * Cleanup the pv entry, allowing other accessors. 
If the new page 5164 * is not managed but we have a pte_pv (which was locking our 5165 * operation), we can free it now. pte_pv->pv_m should be NULL. 5166 */ 5167 if (pte_pv && (newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 5168 pv_free(pte_pv, pt_pv); 5169 } else if (pte_pv) { 5170 pv_put(pte_pv); 5171 } else if (pte_placemark) { 5172 pv_placemarker_wakeup(pmap, pte_placemark); 5173 } 5174 if (pt_pv) 5175 pv_put(pt_pv); 5176 } 5177 5178 /* 5179 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 5180 * This code also assumes that the pmap has no pre-existing entry for this 5181 * VA. 5182 * 5183 * This code currently may only be used on user pmaps, not kernel_pmap. 5184 */ 5185 void 5186 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 5187 { 5188 pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL); 5189 } 5190 5191 /* 5192 * Make a temporary mapping for a physical address. This is only intended 5193 * to be used for panic dumps. 5194 * 5195 * The caller is responsible for calling smp_invltlb(). 5196 */ 5197 void * 5198 pmap_kenter_temporary(vm_paddr_t pa, long i) 5199 { 5200 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 5201 return ((void *)crashdumpmap); 5202 } 5203 5204 #define MAX_INIT_PT (96) 5205 5206 /* 5207 * This routine preloads the ptes for a given object into the specified pmap. 5208 * This eliminates the blast of soft faults on process startup and 5209 * immediately after an mmap. 5210 */ 5211 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 5212 5213 void 5214 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 5215 vm_object_t object, vm_pindex_t pindex, 5216 vm_size_t size, int limit) 5217 { 5218 struct rb_vm_page_scan_info info; 5219 struct lwp *lp; 5220 vm_size_t psize; 5221 5222 /* 5223 * We can't preinit if read access isn't set or there is no pmap 5224 * or object. 5225 */ 5226 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 5227 return; 5228 5229 /* 5230 * We can't preinit if the pmap is not the current pmap 5231 */ 5232 lp = curthread->td_lwp; 5233 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 5234 return; 5235 5236 /* 5237 * Misc additional checks 5238 */ 5239 psize = x86_64_btop(size); 5240 5241 if ((object->type != OBJT_VNODE) || 5242 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 5243 (object->resident_page_count > MAX_INIT_PT))) { 5244 return; 5245 } 5246 5247 if (pindex + psize > object->size) { 5248 if (object->size < pindex) 5249 return; 5250 psize = object->size - pindex; 5251 } 5252 5253 if (psize == 0) 5254 return; 5255 5256 /* 5257 * If everything is segment-aligned do not pre-init here. Instead 5258 * allow the normal vm_fault path to pass a segment hint to 5259 * pmap_enter() which will then use an object-referenced shared 5260 * page table page. 5261 */ 5262 if ((addr & SEG_MASK) == 0 && 5263 (ctob(psize) & SEG_MASK) == 0 && 5264 (ctob(pindex) & SEG_MASK) == 0) { 5265 return; 5266 } 5267 5268 /* 5269 * Use a red-black scan to traverse the requested range and load 5270 * any valid pages found into the pmap. 5271 * 5272 * We cannot safely scan the object's memq without holding the 5273 * object token. 
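 *
 * Hedged usage sketch (illustrative only; pmap, start_va, obj,
 * first_pindex and len are hypothetical caller variables):
 *
 *	pmap_object_init_pt(pmap, start_va, VM_PROT_READ, obj,
 *			    first_pindex, len, MAP_PREFAULT_PARTIAL);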
5274 */ 5275 info.start_pindex = pindex; 5276 info.end_pindex = pindex + psize - 1; 5277 info.limit = limit; 5278 info.mpte = NULL; 5279 info.addr = addr; 5280 info.pmap = pmap; 5281 info.object = object; 5282 5283 /* 5284 * By using the NOLK scan, the callback function must be sure 5285 * to return -1 if the VM page falls out of the object. 5286 */ 5287 vm_object_hold_shared(object); 5288 vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp, 5289 pmap_object_init_pt_callback, &info); 5290 vm_object_drop(object); 5291 } 5292 5293 static 5294 int 5295 pmap_object_init_pt_callback(vm_page_t p, void *data) 5296 { 5297 struct rb_vm_page_scan_info *info = data; 5298 vm_pindex_t rel_index; 5299 int hard_busy; 5300 5301 /* 5302 * don't allow an madvise to blow away our really 5303 * free pages allocating pv entries. 5304 */ 5305 if ((info->limit & MAP_PREFAULT_MADVISE) && 5306 vmstats.v_free_count < vmstats.v_free_reserved) { 5307 return(-1); 5308 } 5309 5310 /* 5311 * Ignore list markers and ignore pages we cannot instantly 5312 * busy (while holding the object token). 5313 */ 5314 if (p->flags & PG_MARKER) 5315 return 0; 5316 hard_busy = 0; 5317 again: 5318 if (hard_busy) { 5319 if (vm_page_busy_try(p, TRUE)) 5320 return 0; 5321 } else { 5322 if (vm_page_sbusy_try(p)) 5323 return 0; 5324 } 5325 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 5326 (p->flags & PG_FICTITIOUS) == 0) { 5327 if ((p->queue - p->pc) == PQ_CACHE) { 5328 if (hard_busy == 0) { 5329 vm_page_sbusy_drop(p); 5330 hard_busy = 1; 5331 goto again; 5332 } 5333 vm_page_deactivate(p); 5334 } 5335 rel_index = p->pindex - info->start_pindex; 5336 pmap_enter_quick(info->pmap, 5337 info->addr + x86_64_ptob(rel_index), p); 5338 } 5339 if (hard_busy) 5340 vm_page_wakeup(p); 5341 else 5342 vm_page_sbusy_drop(p); 5343 5344 /* 5345 * We are using an unlocked scan (that is, the scan expects its 5346 * current element to remain in the tree on return). So we have 5347 * to check here and abort the scan if it isn't. 5348 */ 5349 if (p->object != info->object) 5350 return -1; 5351 lwkt_yield(); 5352 return(0); 5353 } 5354 5355 /* 5356 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 5357 * address. 5358 * 5359 * Returns FALSE if it would be non-trivial or if a pte is already loaded 5360 * into the slot. 5361 * 5362 * XXX This is safe only because page table pages are not freed. 5363 */ 5364 int 5365 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 5366 { 5367 pt_entry_t *pte; 5368 5369 /*spin_lock(&pmap->pm_spin);*/ 5370 if ((pte = pmap_pte(pmap, addr)) != NULL) { 5371 if (*pte & pmap->pmap_bits[PG_V_IDX]) { 5372 /*spin_unlock(&pmap->pm_spin);*/ 5373 return FALSE; 5374 } 5375 } 5376 /*spin_unlock(&pmap->pm_spin);*/ 5377 return TRUE; 5378 } 5379 5380 /* 5381 * Change the wiring attribute for a pmap/va pair. The mapping must already 5382 * exist in the pmap. The mapping may or may not be managed. The wiring in 5383 * the page is not changed, the page is returned so the caller can adjust 5384 * its wiring (the page is not locked in any way). 5385 * 5386 * Wiring is not a hardware characteristic so there is no need to invalidate 5387 * TLB. However, in an SMP environment we must use a locked bus cycle to 5388 * update the pte (if we are not using the pmap_inval_*() API that is)... 5389 * it's ok to do this for simple wiring changes. 
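 *
 * Hedged usage sketch (illustrative only):
 *
 *	m = pmap_unwire(pmap, va);
 *	if (m != NULL) {
 *		... caller adjusts the page's own wire count here ...
 *	}
 *
 * The returned page is neither busied nor locked by this routine.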
5390 */ 5391 vm_page_t 5392 pmap_unwire(pmap_t pmap, vm_offset_t va) 5393 { 5394 pt_entry_t *ptep; 5395 pv_entry_t pt_pv; 5396 vm_paddr_t pa; 5397 vm_page_t m; 5398 5399 if (pmap == NULL) 5400 return NULL; 5401 5402 /* 5403 * Assume elements in the kernel pmap are stable 5404 */ 5405 if (pmap == &kernel_pmap) { 5406 if (pmap_pt(pmap, va) == 0) 5407 return NULL; 5408 ptep = pmap_pte_quick(pmap, va); 5409 if (pmap_pte_v(pmap, ptep)) { 5410 if (pmap_pte_w(pmap, ptep)) 5411 atomic_add_long(&pmap->pm_stats.wired_count,-1); 5412 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5413 pa = *ptep & PG_FRAME; 5414 m = PHYS_TO_VM_PAGE(pa); 5415 } else { 5416 m = NULL; 5417 } 5418 } else { 5419 /* 5420 * We can only [un]wire pmap-local pages (we cannot wire 5421 * shared pages) 5422 */ 5423 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 5424 if (pt_pv == NULL) 5425 return NULL; 5426 5427 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 5428 if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) { 5429 pv_put(pt_pv); 5430 return NULL; 5431 } 5432 5433 if (pmap_pte_w(pmap, ptep)) { 5434 atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count, 5435 -1); 5436 } 5437 /* XXX else return NULL so caller doesn't unwire m ? */ 5438 5439 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5440 5441 pa = *ptep & PG_FRAME; 5442 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 5443 pv_put(pt_pv); 5444 } 5445 return m; 5446 } 5447 5448 /* 5449 * Copy the range specified by src_addr/len from the source map to 5450 * the range dst_addr/len in the destination map. 5451 * 5452 * This routine is only advisory and need not do anything. 5453 */ 5454 void 5455 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 5456 vm_size_t len, vm_offset_t src_addr) 5457 { 5458 } 5459 5460 /* 5461 * pmap_zero_page: 5462 * 5463 * Zero the specified physical page. 5464 * 5465 * This function may be called from an interrupt and no locking is 5466 * required. 5467 */ 5468 void 5469 pmap_zero_page(vm_paddr_t phys) 5470 { 5471 vm_offset_t va = PHYS_TO_DMAP(phys); 5472 5473 pagezero((void *)va); 5474 } 5475 5476 /* 5477 * pmap_zero_page: 5478 * 5479 * Zero part of a physical page by mapping it into memory and clearing 5480 * its contents with bzero. 5481 * 5482 * off and size may not cover an area beyond a single hardware page. 5483 */ 5484 void 5485 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 5486 { 5487 vm_offset_t virt = PHYS_TO_DMAP(phys); 5488 5489 bzero((char *)virt + off, size); 5490 } 5491 5492 /* 5493 * pmap_copy_page: 5494 * 5495 * Copy the physical page from the source PA to the target PA. 5496 * This function may be called from an interrupt. No locking 5497 * is required. 5498 */ 5499 void 5500 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 5501 { 5502 vm_offset_t src_virt, dst_virt; 5503 5504 src_virt = PHYS_TO_DMAP(src); 5505 dst_virt = PHYS_TO_DMAP(dst); 5506 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 5507 } 5508 5509 /* 5510 * pmap_copy_page_frag: 5511 * 5512 * Copy the physical page from the source PA to the target PA. 5513 * This function may be called from an interrupt. No locking 5514 * is required. 
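 *
 * (Descriptive addition: unlike pmap_copy_page(), the low-order bits of
 * src and dst select the starting offset within each page, per the
 * PAGE_MASK arithmetic below, and only 'bytes' bytes are copied.)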
5515 */ 5516 void 5517 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 5518 { 5519 vm_offset_t src_virt, dst_virt; 5520 5521 src_virt = PHYS_TO_DMAP(src); 5522 dst_virt = PHYS_TO_DMAP(dst); 5523 5524 bcopy((char *)src_virt + (src & PAGE_MASK), 5525 (char *)dst_virt + (dst & PAGE_MASK), 5526 bytes); 5527 } 5528 5529 /* 5530 * Returns true if the pmap's pv is one of the first 16 pvs linked to from 5531 * this page. This count may be changed upwards or downwards in the future; 5532 * it is only necessary that true be returned for a small subset of pmaps 5533 * for proper page aging. 5534 */ 5535 boolean_t 5536 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5537 { 5538 pv_entry_t pv; 5539 int loops = 0; 5540 5541 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5542 return FALSE; 5543 5544 vm_page_spin_lock(m); 5545 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5546 if (pv->pv_pmap == pmap) { 5547 vm_page_spin_unlock(m); 5548 return TRUE; 5549 } 5550 loops++; 5551 if (loops >= 16) 5552 break; 5553 } 5554 vm_page_spin_unlock(m); 5555 return (FALSE); 5556 } 5557 5558 /* 5559 * Remove all pages from specified address space this aids process exit 5560 * speeds. Also, this code may be special cased for the current process 5561 * only. 5562 */ 5563 void 5564 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5565 { 5566 pmap_remove_noinval(pmap, sva, eva); 5567 cpu_invltlb(); 5568 } 5569 5570 /* 5571 * pmap_testbit tests bits in pte's note that the testbit/clearbit 5572 * routines are inline, and a lot of things compile-time evaluate. 5573 */ 5574 5575 static 5576 boolean_t 5577 pmap_testbit(vm_page_t m, int bit) 5578 { 5579 pv_entry_t pv; 5580 pt_entry_t *pte; 5581 pmap_t pmap; 5582 5583 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5584 return FALSE; 5585 5586 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 5587 return FALSE; 5588 vm_page_spin_lock(m); 5589 if (TAILQ_FIRST(&m->md.pv_list) == NULL) { 5590 vm_page_spin_unlock(m); 5591 return FALSE; 5592 } 5593 5594 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5595 #if defined(PMAP_DIAGNOSTIC) 5596 if (pv->pv_pmap == NULL) { 5597 kprintf("Null pmap (tb) at pindex: %"PRIu64"\n", 5598 pv->pv_pindex); 5599 continue; 5600 } 5601 #endif 5602 pmap = pv->pv_pmap; 5603 5604 /* 5605 * If the bit being tested is the modified bit, then 5606 * mark clean_map and ptes as never 5607 * modified. 5608 * 5609 * WARNING! Because we do not lock the pv, *pte can be in a 5610 * state of flux. Despite this the value of *pte 5611 * will still be related to the vm_page in some way 5612 * because the pv cannot be destroyed as long as we 5613 * hold the vm_page spin lock. 5614 */ 5615 if (bit == PG_A_IDX || bit == PG_M_IDX) { 5616 //& (pmap->pmap_bits[PG_A_IDX] | pmap->pmap_bits[PG_M_IDX])) { 5617 if (!pmap_track_modified(pv->pv_pindex)) 5618 continue; 5619 } 5620 5621 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5622 if (*pte & pmap->pmap_bits[bit]) { 5623 vm_page_spin_unlock(m); 5624 return TRUE; 5625 } 5626 } 5627 vm_page_spin_unlock(m); 5628 return (FALSE); 5629 } 5630 5631 /* 5632 * This routine is used to modify bits in ptes. Only one bit should be 5633 * specified. PG_RW requires special handling. 
5634 * 5635 * Caller must NOT hold any spin locks 5636 */ 5637 static __inline 5638 void 5639 pmap_clearbit(vm_page_t m, int bit_index) 5640 { 5641 pv_entry_t pv; 5642 pt_entry_t *pte; 5643 pt_entry_t pbits; 5644 pmap_t pmap; 5645 5646 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 5647 if (bit_index == PG_RW_IDX) 5648 vm_page_flag_clear(m, PG_WRITEABLE); 5649 return; 5650 } 5651 5652 /* 5653 * PG_M or PG_A case 5654 * 5655 * Loop over all current mappings setting/clearing as appropos If 5656 * setting RO do we need to clear the VAC? 5657 * 5658 * NOTE: When clearing PG_M we could also (not implemented) drop 5659 * through to the PG_RW code and clear PG_RW too, forcing 5660 * a fault on write to redetect PG_M for virtual kernels, but 5661 * it isn't necessary since virtual kernels invalidate the 5662 * pte when they clear the VPTE_M bit in their virtual page 5663 * tables. 5664 * 5665 * NOTE: Does not re-dirty the page when clearing only PG_M. 5666 * 5667 * NOTE: Because we do not lock the pv, *pte can be in a state of 5668 * flux. Despite this the value of *pte is still somewhat 5669 * related while we hold the vm_page spin lock. 5670 * 5671 * *pte can be zero due to this race. Since we are clearing 5672 * bits we basically do no harm when this race occurs. 5673 */ 5674 if (bit_index != PG_RW_IDX) { 5675 vm_page_spin_lock(m); 5676 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5677 #if defined(PMAP_DIAGNOSTIC) 5678 if (pv->pv_pmap == NULL) { 5679 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n", 5680 pv->pv_pindex); 5681 continue; 5682 } 5683 #endif 5684 pmap = pv->pv_pmap; 5685 pte = pmap_pte_quick(pv->pv_pmap, 5686 pv->pv_pindex << PAGE_SHIFT); 5687 pbits = *pte; 5688 if (pbits & pmap->pmap_bits[bit_index]) 5689 atomic_clear_long(pte, pmap->pmap_bits[bit_index]); 5690 } 5691 vm_page_spin_unlock(m); 5692 return; 5693 } 5694 5695 /* 5696 * Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M 5697 * was set. 5698 */ 5699 restart: 5700 vm_page_spin_lock(m); 5701 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5702 /* 5703 * don't write protect pager mappings 5704 */ 5705 if (!pmap_track_modified(pv->pv_pindex)) 5706 continue; 5707 5708 #if defined(PMAP_DIAGNOSTIC) 5709 if (pv->pv_pmap == NULL) { 5710 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n", 5711 pv->pv_pindex); 5712 continue; 5713 } 5714 #endif 5715 pmap = pv->pv_pmap; 5716 5717 /* 5718 * Skip pages which do not have PG_RW set. 5719 */ 5720 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5721 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0) 5722 continue; 5723 5724 /* 5725 * We must lock the PV to be able to safely test the pte. 5726 */ 5727 if (pv_hold_try(pv)) { 5728 vm_page_spin_unlock(m); 5729 } else { 5730 vm_page_spin_unlock(m); 5731 pv_lock(pv); /* held, now do a blocking lock */ 5732 pv_put(pv); 5733 goto restart; 5734 } 5735 5736 /* 5737 * Reload pte after acquiring pv. 
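 * (Descriptive addition: the earlier PG_RW test above was only an
 * unlocked filter; the authoritative read used by the cmpset loop below
 * is performed here, after the pv lock has been obtained.)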
5738 */ 5739 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5740 #if 0 5741 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0) { 5742 pv_put(pv); 5743 goto restart; 5744 } 5745 #endif 5746 5747 KKASSERT(pv->pv_pmap == pmap && pv->pv_m == m); 5748 for (;;) { 5749 pt_entry_t nbits; 5750 5751 pbits = *pte; 5752 cpu_ccfence(); 5753 nbits = pbits & ~(pmap->pmap_bits[PG_RW_IDX] | 5754 pmap->pmap_bits[PG_M_IDX]); 5755 if (pmap_inval_smp_cmpset(pmap, 5756 ((vm_offset_t)pv->pv_pindex << PAGE_SHIFT), 5757 pte, pbits, nbits)) { 5758 break; 5759 } 5760 cpu_pause(); 5761 } 5762 5763 /* 5764 * If PG_M was found to be set while we were clearing PG_RW 5765 * we also clear PG_M (done above) and mark the page dirty. 5766 * Callers expect this behavior. 5767 * 5768 * we lost pv so it cannot be used as an iterator. In fact, 5769 * because we couldn't necessarily lock it atomically it may 5770 * have moved within the list and ALSO cannot be used as an 5771 * iterator. 5772 */ 5773 vm_page_spin_lock(m); 5774 if (pbits & pmap->pmap_bits[PG_M_IDX]) 5775 vm_page_dirty(m); 5776 vm_page_spin_unlock(m); 5777 pv_put(pv); 5778 goto restart; 5779 } 5780 if (bit_index == PG_RW_IDX) 5781 vm_page_flag_clear(m, PG_WRITEABLE); 5782 vm_page_spin_unlock(m); 5783 } 5784 5785 /* 5786 * Lower the permission for all mappings to a given page. 5787 * 5788 * Page must be busied by caller. Because page is busied by caller this 5789 * should not be able to race a pmap_enter(). 5790 */ 5791 void 5792 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5793 { 5794 /* JG NX support? */ 5795 if ((prot & VM_PROT_WRITE) == 0) { 5796 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5797 /* 5798 * NOTE: pmap_clearbit(.. PG_RW) also clears 5799 * the PG_WRITEABLE flag in (m). 5800 */ 5801 pmap_clearbit(m, PG_RW_IDX); 5802 } else { 5803 pmap_remove_all(m); 5804 } 5805 } 5806 } 5807 5808 vm_paddr_t 5809 pmap_phys_address(vm_pindex_t ppn) 5810 { 5811 return (x86_64_ptob(ppn)); 5812 } 5813 5814 /* 5815 * Return a count of reference bits for a page, clearing those bits. 5816 * It is not necessary for every reference bit to be cleared, but it 5817 * is necessary that 0 only be returned when there are truly no 5818 * reference bits set. 5819 * 5820 * XXX: The exact number of bits to check and clear is a matter that 5821 * should be tested and standardized at some point in the future for 5822 * optimal aging of shared pages. 5823 * 5824 * This routine may not block. 5825 */ 5826 int 5827 pmap_ts_referenced(vm_page_t m) 5828 { 5829 pv_entry_t pv; 5830 pt_entry_t *pte; 5831 pmap_t pmap; 5832 int rtval = 0; 5833 5834 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5835 return (rtval); 5836 5837 vm_page_spin_lock(m); 5838 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5839 if (!pmap_track_modified(pv->pv_pindex)) 5840 continue; 5841 pmap = pv->pv_pmap; 5842 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5843 if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) { 5844 atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]); 5845 rtval++; 5846 if (rtval > 4) 5847 break; 5848 } 5849 } 5850 vm_page_spin_unlock(m); 5851 return (rtval); 5852 } 5853 5854 /* 5855 * pmap_is_modified: 5856 * 5857 * Return whether or not the specified physical page was modified 5858 * in any physical maps. 5859 */ 5860 boolean_t 5861 pmap_is_modified(vm_page_t m) 5862 { 5863 boolean_t res; 5864 5865 res = pmap_testbit(m, PG_M_IDX); 5866 return (res); 5867 } 5868 5869 /* 5870 * Clear the modify bits on the specified physical page. 
5871 */ 5872 void 5873 pmap_clear_modify(vm_page_t m) 5874 { 5875 pmap_clearbit(m, PG_M_IDX); 5876 } 5877 5878 /* 5879 * pmap_clear_reference: 5880 * 5881 * Clear the reference bit on the specified physical page. 5882 */ 5883 void 5884 pmap_clear_reference(vm_page_t m) 5885 { 5886 pmap_clearbit(m, PG_A_IDX); 5887 } 5888 5889 /* 5890 * Miscellaneous support routines follow 5891 */ 5892 5893 static 5894 void 5895 i386_protection_init(void) 5896 { 5897 uint64_t *kp; 5898 int prot; 5899 5900 /* 5901 * NX supported? (boot time loader.conf override only) 5902 */ 5903 TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable); 5904 if (pmap_nx_enable == 0 || (amd_feature & AMDID_NX) == 0) 5905 pmap_bits_default[PG_NX_IDX] = 0; 5906 5907 /* 5908 * 0 is basically read-only access, but also set the NX (no-execute) 5909 * bit when VM_PROT_EXECUTE is not specified. 5910 */ 5911 kp = protection_codes; 5912 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5913 switch (prot) { 5914 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5915 /* 5916 * This case handled elsewhere 5917 */ 5918 *kp++ = 0; 5919 break; 5920 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5921 /* 5922 * Read-only is 0|NX 5923 */ 5924 *kp++ = pmap_bits_default[PG_NX_IDX]; 5925 break; 5926 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5927 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5928 /* 5929 * Execute requires read access 5930 */ 5931 *kp++ = 0; 5932 break; 5933 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5934 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5935 /* 5936 * Write without execute is RW|NX 5937 */ 5938 *kp++ = pmap_bits_default[PG_RW_IDX] | 5939 pmap_bits_default[PG_NX_IDX]; 5940 break; 5941 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5942 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5943 /* 5944 * Write with execute is RW 5945 */ 5946 *kp++ = pmap_bits_default[PG_RW_IDX]; 5947 break; 5948 } 5949 } 5950 } 5951 5952 /* 5953 * Map a set of physical memory pages into the kernel virtual 5954 * address space. Return a pointer to where it is mapped. This 5955 * routine is intended to be used for mapping device memory, 5956 * NOT real memory. 5957 * 5958 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5959 * a time. 5960 * 5961 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5962 * work whether the cpu supports PAT or not. The remaining PAT 5963 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5964 * supports PAT. 5965 */ 5966 void * 5967 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5968 { 5969 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5970 } 5971 5972 void * 5973 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5974 { 5975 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5976 } 5977 5978 void * 5979 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5980 { 5981 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5982 } 5983 5984 /* 5985 * Map a set of physical memory pages into the kernel virtual 5986 * address space. Return a pointer to where it is mapped. This 5987 * routine is intended to be used for mapping device memory, 5988 * NOT real memory. 
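 *
 * Hedged usage sketch (illustrative only; bar_pa and bar_size are
 * hypothetical device register window values):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_UNCACHEABLE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);
 *
 * The returned pointer retains the sub-page offset of pa, and that same
 * offset-bearing address may be handed back to pmap_unmapdev().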
5989 */ 5990 void * 5991 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5992 { 5993 vm_offset_t va, tmpva, offset; 5994 pt_entry_t *pte; 5995 vm_size_t tmpsize; 5996 5997 offset = pa & PAGE_MASK; 5998 size = roundup(offset + size, PAGE_SIZE); 5999 6000 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 6001 if (va == 0) 6002 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 6003 6004 pa = pa & ~PAGE_MASK; 6005 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 6006 pte = vtopte(tmpva); 6007 *pte = pa | 6008 kernel_pmap.pmap_bits[PG_RW_IDX] | 6009 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 6010 kernel_pmap.pmap_cache_bits[mode]; 6011 tmpsize -= PAGE_SIZE; 6012 tmpva += PAGE_SIZE; 6013 pa += PAGE_SIZE; 6014 } 6015 pmap_invalidate_range(&kernel_pmap, va, va + size); 6016 pmap_invalidate_cache_range(va, va + size); 6017 6018 return ((void *)(va + offset)); 6019 } 6020 6021 void 6022 pmap_unmapdev(vm_offset_t va, vm_size_t size) 6023 { 6024 vm_offset_t base, offset; 6025 6026 base = va & ~PAGE_MASK; 6027 offset = va & PAGE_MASK; 6028 size = roundup(offset + size, PAGE_SIZE); 6029 pmap_qremove(va, size >> PAGE_SHIFT); 6030 kmem_free(&kernel_map, base, size); 6031 } 6032 6033 /* 6034 * Sets the memory attribute for the specified page. 6035 */ 6036 void 6037 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6038 { 6039 6040 m->pat_mode = ma; 6041 6042 /* 6043 * If "m" is a normal page, update its direct mapping. This update 6044 * can be relied upon to perform any cache operations that are 6045 * required for data coherence. 6046 */ 6047 if ((m->flags & PG_FICTITIOUS) == 0) 6048 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 6049 } 6050 6051 /* 6052 * Change the PAT attribute on an existing kernel memory map. Caller 6053 * must ensure that the virtual memory in question is not accessed 6054 * during the adjustment. 6055 */ 6056 void 6057 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 6058 { 6059 pt_entry_t *pte; 6060 vm_offset_t base; 6061 int changed = 0; 6062 6063 if (va == 0) 6064 panic("pmap_change_attr: va is NULL"); 6065 base = trunc_page(va); 6066 6067 while (count) { 6068 pte = vtopte(va); 6069 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) | 6070 kernel_pmap.pmap_cache_bits[mode]; 6071 --count; 6072 va += PAGE_SIZE; 6073 } 6074 6075 changed = 1; /* XXX: not optimal */ 6076 6077 /* 6078 * Flush CPU caches if required to make sure any data isn't cached that 6079 * shouldn't be, etc. 
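 *
 * Both the TLB entries covering [base, va) and the cpu data caches are
 * flushed below; 'changed' is currently set unconditionally, so the
 * flush always occurs.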
6080 */ 6081 if (changed) { 6082 pmap_invalidate_range(&kernel_pmap, base, va); 6083 pmap_invalidate_cache_range(base, va); 6084 } 6085 } 6086 6087 /* 6088 * perform the pmap work for mincore 6089 */ 6090 int 6091 pmap_mincore(pmap_t pmap, vm_offset_t addr) 6092 { 6093 pt_entry_t *ptep, pte; 6094 vm_page_t m; 6095 int val = 0; 6096 6097 ptep = pmap_pte(pmap, addr); 6098 6099 if (ptep && (pte = *ptep) != 0) { 6100 vm_offset_t pa; 6101 6102 val = MINCORE_INCORE; 6103 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) 6104 goto done; 6105 6106 pa = pte & PG_FRAME; 6107 6108 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) 6109 m = NULL; 6110 else 6111 m = PHYS_TO_VM_PAGE(pa); 6112 6113 /* 6114 * Modified by us 6115 */ 6116 if (pte & pmap->pmap_bits[PG_M_IDX]) 6117 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 6118 /* 6119 * Modified by someone 6120 */ 6121 else if (m && (m->dirty || pmap_is_modified(m))) 6122 val |= MINCORE_MODIFIED_OTHER; 6123 /* 6124 * Referenced by us 6125 */ 6126 if (pte & pmap->pmap_bits[PG_A_IDX]) 6127 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 6128 6129 /* 6130 * Referenced by someone 6131 */ 6132 else if (m && ((m->flags & PG_REFERENCED) || 6133 pmap_ts_referenced(m))) { 6134 val |= MINCORE_REFERENCED_OTHER; 6135 vm_page_flag_set(m, PG_REFERENCED); 6136 } 6137 } 6138 done: 6139 6140 return val; 6141 } 6142 6143 /* 6144 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 6145 * vmspace will be ref'd and the old one will be deref'd. 6146 * 6147 * The vmspace for all lwps associated with the process will be adjusted 6148 * and cr3 will be reloaded if any lwp is the current lwp. 6149 * 6150 * The process must hold the vmspace->vm_map.token for oldvm and newvm 6151 */ 6152 void 6153 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 6154 { 6155 struct vmspace *oldvm; 6156 struct lwp *lp; 6157 6158 oldvm = p->p_vmspace; 6159 if (oldvm != newvm) { 6160 if (adjrefs) 6161 vmspace_ref(newvm); 6162 p->p_vmspace = newvm; 6163 KKASSERT(p->p_nthreads == 1); 6164 lp = RB_ROOT(&p->p_lwp_tree); 6165 pmap_setlwpvm(lp, newvm); 6166 if (adjrefs) 6167 vmspace_rel(oldvm); 6168 } 6169 } 6170 6171 /* 6172 * Set the vmspace for a LWP. The vmspace is almost universally set the 6173 * same as the process vmspace, but virtual kernels need to swap out contexts 6174 * on a per-lwp basis. 6175 * 6176 * Caller does not necessarily hold any vmspace tokens. Caller must control 6177 * the lwp (typically be in the context of the lwp). We use a critical 6178 * section to protect against statclock and hardclock (statistics collection). 
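 *
 * When the lwp being adjusted is the current lwp the code below also
 * marks this cpu active in the new pmap, reloads %cr3 for the new
 * page table (KPML4phys for EPT pmaps), and deactivates this cpu in
 * the old pmap.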
6179 */ 6180 void 6181 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 6182 { 6183 struct vmspace *oldvm; 6184 struct pmap *pmap; 6185 6186 oldvm = lp->lwp_vmspace; 6187 6188 if (oldvm != newvm) { 6189 crit_enter(); 6190 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 6191 lp->lwp_vmspace = newvm; 6192 if (curthread->td_lwp == lp) { 6193 pmap = vmspace_pmap(newvm); 6194 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 6195 if (pmap->pm_active_lock & CPULOCK_EXCL) 6196 pmap_interlock_wait(newvm); 6197 #if defined(SWTCH_OPTIM_STATS) 6198 tlb_flush_count++; 6199 #endif 6200 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 6201 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 6202 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 6203 curthread->td_pcb->pcb_cr3 = KPML4phys; 6204 } else { 6205 panic("pmap_setlwpvm: unknown pmap type\n"); 6206 } 6207 load_cr3(curthread->td_pcb->pcb_cr3); 6208 pmap = vmspace_pmap(oldvm); 6209 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 6210 mycpu->gd_cpuid); 6211 } 6212 crit_exit(); 6213 } 6214 } 6215 6216 /* 6217 * Called when switching to a locked pmap, used to interlock against pmaps 6218 * undergoing modifications to prevent us from activating the MMU for the 6219 * target pmap until all such modifications have completed. We have to do 6220 * this because the thread making the modifications has already set up its 6221 * SMP synchronization mask. 6222 * 6223 * This function cannot sleep! 6224 * 6225 * No requirements. 6226 */ 6227 void 6228 pmap_interlock_wait(struct vmspace *vm) 6229 { 6230 struct pmap *pmap = &vm->vm_pmap; 6231 6232 if (pmap->pm_active_lock & CPULOCK_EXCL) { 6233 crit_enter(); 6234 KKASSERT(curthread->td_critcount >= 2); 6235 DEBUG_PUSH_INFO("pmap_interlock_wait"); 6236 while (pmap->pm_active_lock & CPULOCK_EXCL) { 6237 cpu_ccfence(); 6238 lwkt_process_ipiq(); 6239 } 6240 DEBUG_POP_INFO(); 6241 crit_exit(); 6242 } 6243 } 6244 6245 vm_offset_t 6246 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 6247 { 6248 6249 if ((obj == NULL) || (size < NBPDR) || 6250 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { 6251 return addr; 6252 } 6253 6254 addr = roundup2(addr, NBPDR); 6255 return addr; 6256 } 6257 6258 /* 6259 * Used by kmalloc/kfree, page already exists at va 6260 */ 6261 vm_page_t 6262 pmap_kvtom(vm_offset_t va) 6263 { 6264 pt_entry_t *ptep = vtopte(va); 6265 6266 KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0); 6267 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 6268 } 6269 6270 /* 6271 * Initialize machine-specific shared page directory support. This 6272 * is executed when a VM object is created. 6273 */ 6274 void 6275 pmap_object_init(vm_object_t object) 6276 { 6277 object->md.pmap_rw = NULL; 6278 object->md.pmap_ro = NULL; 6279 } 6280 6281 /* 6282 * Clean up machine-specific shared page directory support. This 6283 * is executed when a VM object is destroyed. 
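 *
 * Any pmap_rw or pmap_ro object pmap still attached has its user
 * mappings removed and is then released, uninitialized, and freed.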
6284 */ 6285 void 6286 pmap_object_free(vm_object_t object) 6287 { 6288 pmap_t pmap; 6289 6290 if ((pmap = object->md.pmap_rw) != NULL) { 6291 object->md.pmap_rw = NULL; 6292 pmap_remove_noinval(pmap, 6293 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); 6294 CPUMASK_ASSZERO(pmap->pm_active); 6295 pmap_release(pmap); 6296 pmap_puninit(pmap); 6297 kfree(pmap, M_OBJPMAP); 6298 } 6299 if ((pmap = object->md.pmap_ro) != NULL) { 6300 object->md.pmap_ro = NULL; 6301 pmap_remove_noinval(pmap, 6302 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); 6303 CPUMASK_ASSZERO(pmap->pm_active); 6304 pmap_release(pmap); 6305 pmap_puninit(pmap); 6306 kfree(pmap, M_OBJPMAP); 6307 } 6308 } 6309 6310 /* 6311 * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related 6312 * VM page and issue a pginfo->callback. 6313 * 6314 * We are expected to dispose of any non-NULL pte_pv. 6315 */ 6316 static 6317 void 6318 pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info, 6319 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 6320 pv_entry_t pt_pv, int sharept, 6321 vm_offset_t va, pt_entry_t *ptep, void *arg) 6322 { 6323 struct pmap_pgscan_info *pginfo = arg; 6324 vm_page_t m; 6325 6326 if (pte_pv) { 6327 /* 6328 * Try to busy the page while we hold the pte_pv locked. 6329 */ 6330 KKASSERT(pte_pv->pv_m); 6331 m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME); 6332 if (vm_page_busy_try(m, TRUE) == 0) { 6333 if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) { 6334 /* 6335 * The callback is issued with the pte_pv 6336 * unlocked and put away, and the pt_pv 6337 * unlocked. 6338 */ 6339 pv_put(pte_pv); 6340 if (pt_pv) { 6341 vm_page_wire_quick(pt_pv->pv_m); 6342 pv_unlock(pt_pv); 6343 } 6344 if (pginfo->callback(pginfo, va, m) < 0) 6345 info->stop = 1; 6346 if (pt_pv) { 6347 pv_lock(pt_pv); 6348 vm_page_unwire_quick(pt_pv->pv_m); 6349 } 6350 } else { 6351 vm_page_wakeup(m); 6352 pv_put(pte_pv); 6353 } 6354 } else { 6355 ++pginfo->busycount; 6356 pv_put(pte_pv); 6357 } 6358 } else { 6359 /* 6360 * Shared page table or unmanaged page (sharept or !sharept) 6361 */ 6362 pv_placemarker_wakeup(pmap, pte_placemark); 6363 } 6364 } 6365 6366 void 6367 pmap_pgscan(struct pmap_pgscan_info *pginfo) 6368 { 6369 struct pmap_scan_info info; 6370 6371 pginfo->offset = pginfo->beg_addr; 6372 info.pmap = pginfo->pmap; 6373 info.sva = pginfo->beg_addr; 6374 info.eva = pginfo->end_addr; 6375 info.func = pmap_pgscan_callback; 6376 info.arg = pginfo; 6377 pmap_scan(&info, 0); 6378 if (info.stop == 0) 6379 pginfo->offset = pginfo->end_addr; 6380 } 6381 6382 /* 6383 * Wait for a placemarker that we do not own to clear. The placemarker 6384 * in question is not necessarily set to the pindex we want, we may have 6385 * to wait on the element because we want to reserve it ourselves. 6386 * 6387 * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in 6388 * PM_NOPLACEMARK, so it does not interfere with placemarks 6389 * which have already been woken up. 6390 */ 6391 static 6392 void 6393 pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark) 6394 { 6395 if (*pmark != PM_NOPLACEMARK) { 6396 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 6397 tsleep_interlock(pmark, 0); 6398 if (*pmark != PM_NOPLACEMARK) 6399 tsleep(pmark, PINTERLOCKED, "pvplw", 0); 6400 } 6401 } 6402 6403 /* 6404 * Wakeup a placemarker that we own. Replace the entry with 6405 * PM_NOPLACEMARK and issue a wakeup() if necessary. 
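 *
 * The wakeup() is only needed when a waiter in pv_placemarker_wait()
 * has set PM_PLACEMARK_WAKEUP in the entry.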
6406 */ 6407 static 6408 void 6409 pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark) 6410 { 6411 vm_pindex_t pindex; 6412 6413 pindex = atomic_swap_long(pmark, PM_NOPLACEMARK); 6414 KKASSERT(pindex != PM_NOPLACEMARK); 6415 if (pindex & PM_PLACEMARK_WAKEUP) 6416 wakeup(pmark); 6417 } 6418