1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * Copyright (c) 2011-2017 Matthew Dillon 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 /* 45 * Manage physical address maps for x86-64 systems. 
46 */ 47 48 #if 0 /* JG */ 49 #include "opt_disable_pse.h" 50 #include "opt_pmap.h" 51 #endif 52 #include "opt_msgbuf.h" 53 54 #include <sys/param.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/msgbuf.h> 58 #include <sys/vmmeter.h> 59 #include <sys/mman.h> 60 #include <sys/systm.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/user.h> 76 #include <sys/thread2.h> 77 #include <sys/sysref2.h> 78 #include <sys/spinlock2.h> 79 #include <vm/vm_page2.h> 80 81 #include <machine/cputypes.h> 82 #include <machine/md_var.h> 83 #include <machine/specialreg.h> 84 #include <machine/smp.h> 85 #include <machine_base/apic/apicreg.h> 86 #include <machine/globaldata.h> 87 #include <machine/pmap.h> 88 #include <machine/pmap_inval.h> 89 #include <machine/inttypes.h> 90 91 #include <ddb/ddb.h> 92 93 #define PMAP_KEEP_PDIRS 94 #ifndef PMAP_SHPGPERPROC 95 #define PMAP_SHPGPERPROC 2000 96 #endif 97 98 #if defined(DIAGNOSTIC) 99 #define PMAP_DIAGNOSTIC 100 #endif 101 102 #define MINPV 2048 103 104 /* 105 * pmap debugging will report who owns a pv lock when blocking. 106 */ 107 #ifdef PMAP_DEBUG 108 109 #define PMAP_DEBUG_DECL ,const char *func, int lineno 110 #define PMAP_DEBUG_ARGS , __func__, __LINE__ 111 #define PMAP_DEBUG_COPY , func, lineno 112 113 #define pv_get(pmap, pindex, pmarkp) _pv_get(pmap, pindex, pmarkp \ 114 PMAP_DEBUG_ARGS) 115 #define pv_lock(pv) _pv_lock(pv \ 116 PMAP_DEBUG_ARGS) 117 #define pv_hold_try(pv) _pv_hold_try(pv \ 118 PMAP_DEBUG_ARGS) 119 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \ 120 PMAP_DEBUG_ARGS) 121 122 #define pv_free(pv, pvp) _pv_free(pv, pvp PMAP_DEBUG_ARGS) 123 124 #else 125 126 #define PMAP_DEBUG_DECL 127 #define PMAP_DEBUG_ARGS 128 #define PMAP_DEBUG_COPY 129 130 #define pv_get(pmap, pindex, pmarkp) _pv_get(pmap, pindex, pmarkp) 131 #define pv_lock(pv) _pv_lock(pv) 132 #define pv_hold_try(pv) _pv_hold_try(pv) 133 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp) 134 #define pv_free(pv, pvp) _pv_free(pv, pvp) 135 136 #endif 137 138 /* 139 * Get PDEs and PTEs for user/kernel address space 140 */ 141 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 142 143 #define pmap_pde_v(pmap, pte) ((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 144 #define pmap_pte_w(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0) 145 #define pmap_pte_m(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0) 146 #define pmap_pte_u(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0) 147 #define pmap_pte_v(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 148 149 /* 150 * Given a map and a machine independent protection code, 151 * convert to a vax protection code. 
152 */ 153 #define pte_prot(m, p) \ 154 (m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 155 static uint64_t protection_codes[PROTECTION_CODES_SIZE]; 156 157 struct pmap kernel_pmap; 158 159 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects"); 160 161 vm_paddr_t avail_start; /* PA of first available physical page */ 162 vm_paddr_t avail_end; /* PA of last available physical page */ 163 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 164 vm_offset_t virtual2_end; 165 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 166 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 167 vm_offset_t KvaStart; /* VA start of KVA space */ 168 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 169 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 170 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 171 //static int pgeflag; /* PG_G or-in */ 172 //static int pseflag; /* PG_PS or-in */ 173 uint64_t PatMsr; 174 175 static int ndmpdp; 176 static vm_paddr_t dmaplimit; 177 static int nkpt; 178 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 179 180 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 181 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */ 182 183 static uint64_t KPTbase; 184 static uint64_t KPTphys; 185 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 186 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 187 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 188 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 189 190 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 191 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 192 193 /* 194 * Data for the pv entry allocation mechanism 195 */ 196 static vm_zone_t pvzone; 197 static struct vm_zone pvzone_store; 198 static int pv_entry_max=0, pv_entry_high_water=0; 199 static int pmap_pagedaemon_waken = 0; 200 static struct pv_entry *pvinit; 201 202 /* 203 * All those kernel PT submaps that BSD is so fond of 204 */ 205 pt_entry_t *CMAP1 = NULL, *ptmmap; 206 caddr_t CADDR1 = NULL, ptvmmap = NULL; 207 static pt_entry_t *msgbufmap; 208 struct msgbuf *msgbufp=NULL; 209 210 /* 211 * PMAP default PG_* bits. Needed to be able to add 212 * EPT/NPT pagetable pmap_bits for the VMM module 213 */ 214 uint64_t pmap_bits_default[] = { 215 REGULAR_PMAP, /* TYPE_IDX 0 */ 216 X86_PG_V, /* PG_V_IDX 1 */ 217 X86_PG_RW, /* PG_RW_IDX 2 */ 218 X86_PG_U, /* PG_U_IDX 3 */ 219 X86_PG_A, /* PG_A_IDX 4 */ 220 X86_PG_M, /* PG_M_IDX 5 */ 221 X86_PG_PS, /* PG_PS_IDX3 6 */ 222 X86_PG_G, /* PG_G_IDX 7 */ 223 X86_PG_AVAIL1, /* PG_AVAIL1_IDX 8 */ 224 X86_PG_AVAIL2, /* PG_AVAIL2_IDX 9 */ 225 X86_PG_AVAIL3, /* PG_AVAIL3_IDX 10 */ 226 X86_PG_NC_PWT | X86_PG_NC_PCD, /* PG_N_IDX 11 */ 227 X86_PG_NX, /* PG_NX_IDX 12 */ 228 }; 229 /* 230 * Crashdump maps. 
231 */ 232 static pt_entry_t *pt_crashdumpmap; 233 static caddr_t crashdumpmap; 234 235 static int pmap_debug = 0; 236 SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW, 237 &pmap_debug, 0, "Debug pmap's"); 238 #ifdef PMAP_DEBUG2 239 static int pmap_enter_debug = 0; 240 SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW, 241 &pmap_enter_debug, 0, "Debug pmap_enter's"); 242 #endif 243 static int pmap_yield_count = 64; 244 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW, 245 &pmap_yield_count, 0, "Yield during init_pt/release"); 246 static int pmap_mmu_optimize = 0; 247 SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW, 248 &pmap_mmu_optimize, 0, "Share page table pages when possible"); 249 int pmap_fast_kernel_cpusync = 0; 250 SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW, 251 &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible"); 252 int pmap_dynamic_delete = 0; 253 SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW, 254 &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs"); 255 256 static int pmap_nx_enable = 0; 257 /* needs manual TUNABLE in early probe, see below */ 258 259 #define DISABLE_PSE 260 261 /* Standard user access funtions */ 262 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len, 263 size_t *lencopied); 264 extern int std_copyin (const void *udaddr, void *kaddr, size_t len); 265 extern int std_copyout (const void *kaddr, void *udaddr, size_t len); 266 extern int std_fubyte (const uint8_t *base); 267 extern int std_subyte (uint8_t *base, uint8_t byte); 268 extern int32_t std_fuword32 (const uint32_t *base); 269 extern int64_t std_fuword64 (const uint64_t *base); 270 extern int std_suword64 (uint64_t *base, uint64_t word); 271 extern int std_suword32 (uint32_t *base, int word); 272 extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v); 273 extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v); 274 275 static void pv_hold(pv_entry_t pv); 276 static int _pv_hold_try(pv_entry_t pv 277 PMAP_DEBUG_DECL); 278 static void pv_drop(pv_entry_t pv); 279 static void _pv_lock(pv_entry_t pv 280 PMAP_DEBUG_DECL); 281 static void pv_unlock(pv_entry_t pv); 282 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew 283 PMAP_DEBUG_DECL); 284 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp 285 PMAP_DEBUG_DECL); 286 static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL); 287 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, 288 vm_pindex_t **pmarkp, int *errorp); 289 static void pv_put(pv_entry_t pv); 290 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); 291 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 292 pv_entry_t *pvpp); 293 static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, 294 pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va); 295 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, 296 pmap_inval_bulk_t *bulk, int destroy); 297 static vm_page_t pmap_remove_pv_page(pv_entry_t pv); 298 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, 299 pmap_inval_bulk_t *bulk); 300 301 struct pmap_scan_info; 302 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 303 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 304 pv_entry_t pt_pv, int sharept, 305 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 306 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 307 pv_entry_t 
pte_pv, vm_pindex_t *pte_placemark, 308 pv_entry_t pt_pv, int sharept, 309 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 310 311 static void i386_protection_init (void); 312 static void create_pagetables(vm_paddr_t *firstaddr); 313 static void pmap_remove_all (vm_page_t m); 314 static boolean_t pmap_testbit (vm_page_t m, int bit); 315 316 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va); 317 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 318 319 static void pmap_pinit_defaults(struct pmap *pmap); 320 static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark); 321 static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark); 322 323 static unsigned pdir4mb; 324 325 static int 326 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 327 { 328 if (pv1->pv_pindex < pv2->pv_pindex) 329 return(-1); 330 if (pv1->pv_pindex > pv2->pv_pindex) 331 return(1); 332 return(0); 333 } 334 335 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 336 pv_entry_compare, vm_pindex_t, pv_pindex); 337 338 static __inline 339 void 340 pmap_page_stats_adding(vm_page_t m) 341 { 342 globaldata_t gd = mycpu; 343 344 if (TAILQ_EMPTY(&m->md.pv_list)) { 345 ++gd->gd_vmtotal.t_arm; 346 } else if (TAILQ_FIRST(&m->md.pv_list) == 347 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { 348 ++gd->gd_vmtotal.t_armshr; 349 ++gd->gd_vmtotal.t_avmshr; 350 } else { 351 ++gd->gd_vmtotal.t_avmshr; 352 } 353 } 354 355 static __inline 356 void 357 pmap_page_stats_deleting(vm_page_t m) 358 { 359 globaldata_t gd = mycpu; 360 361 if (TAILQ_EMPTY(&m->md.pv_list)) { 362 --gd->gd_vmtotal.t_arm; 363 } else if (TAILQ_FIRST(&m->md.pv_list) == 364 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { 365 --gd->gd_vmtotal.t_armshr; 366 --gd->gd_vmtotal.t_avmshr; 367 } else { 368 --gd->gd_vmtotal.t_avmshr; 369 } 370 } 371 372 /* 373 * Move the kernel virtual free pointer to the next 374 * 2MB. This is used to help improve performance 375 * by using a large (2MB) page for much of the kernel 376 * (.text, .data, .bss) 377 */ 378 static 379 vm_offset_t 380 pmap_kmem_choose(vm_offset_t addr) 381 { 382 vm_offset_t newaddr = addr; 383 384 newaddr = roundup2(addr, NBPDR); 385 return newaddr; 386 } 387 388 /* 389 * pmap_pte_quick: 390 * 391 * Super fast pmap_pte routine best used when scanning the pv lists. 392 * This eliminates many course-grained invltlb calls. Note that many of 393 * the pv list scans are across different pmaps and it is very wasteful 394 * to do an entire invltlb when checking a single mapping. 395 */ 396 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 397 398 static 399 pt_entry_t * 400 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 401 { 402 return pmap_pte(pmap, va); 403 } 404 405 /* 406 * Returns the pindex of a page table entry (representing a terminal page). 407 * There are NUPTE_TOTAL page table entries possible (a huge number) 408 * 409 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 410 * We want to properly translate negative KVAs. 411 */ 412 static __inline 413 vm_pindex_t 414 pmap_pte_pindex(vm_offset_t va) 415 { 416 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 417 } 418 419 /* 420 * Returns the pindex of a page table. 421 */ 422 static __inline 423 vm_pindex_t 424 pmap_pt_pindex(vm_offset_t va) 425 { 426 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 427 } 428 429 /* 430 * Returns the pindex of a page directory. 
431 */ 432 static __inline 433 vm_pindex_t 434 pmap_pd_pindex(vm_offset_t va) 435 { 436 return (NUPTE_TOTAL + NUPT_TOTAL + 437 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 438 } 439 440 static __inline 441 vm_pindex_t 442 pmap_pdp_pindex(vm_offset_t va) 443 { 444 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 445 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 446 } 447 448 static __inline 449 vm_pindex_t 450 pmap_pml4_pindex(void) 451 { 452 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 453 } 454 455 /* 456 * Return various clipped indexes for a given VA 457 * 458 * Returns the index of a pt in a page directory, representing a page 459 * table. 460 */ 461 static __inline 462 vm_pindex_t 463 pmap_pt_index(vm_offset_t va) 464 { 465 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 466 } 467 468 /* 469 * Returns the index of a pd in a page directory page, representing a page 470 * directory. 471 */ 472 static __inline 473 vm_pindex_t 474 pmap_pd_index(vm_offset_t va) 475 { 476 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 477 } 478 479 /* 480 * Returns the index of a pdp in the pml4 table, representing a page 481 * directory page. 482 */ 483 static __inline 484 vm_pindex_t 485 pmap_pdp_index(vm_offset_t va) 486 { 487 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 488 } 489 490 /* 491 * The placemarker hash must be broken up into four zones so lock 492 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp). 493 * 494 * Placemarkers are used to 'lock' page table indices that do not have 495 * a pv_entry. This allows the pmap to support managed and unmanaged 496 * pages and shared page tables. 497 */ 498 #define PM_PLACE_BASE (PM_PLACEMARKS >> 2) 499 500 static __inline 501 vm_pindex_t * 502 pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex) 503 { 504 int hi; 505 506 if (pindex < pmap_pt_pindex(0)) /* zone 0 - PTE */ 507 hi = 0; 508 else if (pindex < pmap_pd_pindex(0)) /* zone 1 - PT */ 509 hi = PM_PLACE_BASE; 510 else if (pindex < pmap_pdp_pindex(0)) /* zone 2 - PD */ 511 hi = PM_PLACE_BASE << 1; 512 else /* zone 3 - PDP (and PML4E) */ 513 hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1); 514 hi += pindex & (PM_PLACE_BASE - 1); 515 516 return (&pmap->pm_placemarks[hi]); 517 } 518 519 520 /* 521 * Generic procedure to index a pte from a pt, pd, or pdp. 522 * 523 * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT 524 * a page table page index but is instead of PV lookup index. 525 */ 526 static 527 void * 528 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) 529 { 530 pt_entry_t *pte; 531 532 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); 533 return(&pte[pindex]); 534 } 535 536 /* 537 * Return pointer to PDP slot in the PML4 538 */ 539 static __inline 540 pml4_entry_t * 541 pmap_pdp(pmap_t pmap, vm_offset_t va) 542 { 543 return (&pmap->pm_pml4[pmap_pdp_index(va)]); 544 } 545 546 /* 547 * Return pointer to PD slot in the PDP given a pointer to the PDP 548 */ 549 static __inline 550 pdp_entry_t * 551 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va) 552 { 553 pdp_entry_t *pd; 554 555 pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME); 556 return (&pd[pmap_pd_index(va)]); 557 } 558 559 /* 560 * Return pointer to PD slot in the PDP. 
561 */ 562 static __inline 563 pdp_entry_t * 564 pmap_pd(pmap_t pmap, vm_offset_t va) 565 { 566 pml4_entry_t *pdp; 567 568 pdp = pmap_pdp(pmap, va); 569 if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0) 570 return NULL; 571 return (pmap_pdp_to_pd(*pdp, va)); 572 } 573 574 /* 575 * Return pointer to PT slot in the PD given a pointer to the PD 576 */ 577 static __inline 578 pd_entry_t * 579 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 580 { 581 pd_entry_t *pt; 582 583 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 584 return (&pt[pmap_pt_index(va)]); 585 } 586 587 /* 588 * Return pointer to PT slot in the PD 589 * 590 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 591 * so we cannot lookup the PD via the PDP. Instead we 592 * must look it up via the pmap. 593 */ 594 static __inline 595 pd_entry_t * 596 pmap_pt(pmap_t pmap, vm_offset_t va) 597 { 598 pdp_entry_t *pd; 599 pv_entry_t pv; 600 vm_pindex_t pd_pindex; 601 602 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 603 pd_pindex = pmap_pd_pindex(va); 604 spin_lock(&pmap->pm_spin); 605 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 606 spin_unlock(&pmap->pm_spin); 607 if (pv == NULL || pv->pv_m == NULL) 608 return NULL; 609 return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va)); 610 } else { 611 pd = pmap_pd(pmap, va); 612 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0) 613 return NULL; 614 return (pmap_pd_to_pt(*pd, va)); 615 } 616 } 617 618 /* 619 * Return pointer to PTE slot in the PT given a pointer to the PT 620 */ 621 static __inline 622 pt_entry_t * 623 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 624 { 625 pt_entry_t *pte; 626 627 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 628 return (&pte[pmap_pte_index(va)]); 629 } 630 631 /* 632 * Return pointer to PTE slot in the PT 633 */ 634 static __inline 635 pt_entry_t * 636 pmap_pte(pmap_t pmap, vm_offset_t va) 637 { 638 pd_entry_t *pt; 639 640 pt = pmap_pt(pmap, va); 641 if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0) 642 return NULL; 643 if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0) 644 return ((pt_entry_t *)pt); 645 return (pmap_pt_to_pte(*pt, va)); 646 } 647 648 /* 649 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 650 * the PT layer. This will speed up core pmap operations considerably. 651 * 652 * NOTE: The pmap spinlock does not need to be held but the passed-in pv 653 * must be in a known associated state (typically by being locked when 654 * the pmap spinlock isn't held). We allow the race for that case. 655 * 656 * NOTE: pm_pvhint is only accessed (read) with the spin-lock held, using 657 * cpu_ccfence() to prevent compiler optimizations from reloading the 658 * field. 659 */ 660 static __inline 661 void 662 pv_cache(pv_entry_t pv, vm_pindex_t pindex) 663 { 664 if (pindex >= pmap_pt_pindex(0) && pindex < pmap_pd_pindex(0)) { 665 if (pv->pv_pmap) 666 pv->pv_pmap->pm_pvhint = pv; 667 } 668 } 669 670 671 /* 672 * Return address of PT slot in PD (KVM only) 673 * 674 * Cannot be used for user page tables because it might interfere with 675 * the shared page-table-page optimization (pmap_mmu_optimize). 
676 */ 677 static __inline 678 pd_entry_t * 679 vtopt(vm_offset_t va) 680 { 681 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 682 NPML4EPGSHIFT)) - 1); 683 684 return (PDmap + ((va >> PDRSHIFT) & mask)); 685 } 686 687 /* 688 * KVM - return address of PTE slot in PT 689 */ 690 static __inline 691 pt_entry_t * 692 vtopte(vm_offset_t va) 693 { 694 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 695 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 696 697 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 698 } 699 700 static uint64_t 701 allocpages(vm_paddr_t *firstaddr, long n) 702 { 703 uint64_t ret; 704 705 ret = *firstaddr; 706 bzero((void *)ret, n * PAGE_SIZE); 707 *firstaddr += n * PAGE_SIZE; 708 return (ret); 709 } 710 711 static 712 void 713 create_pagetables(vm_paddr_t *firstaddr) 714 { 715 long i; /* must be 64 bits */ 716 long nkpt_base; 717 long nkpt_phys; 718 int j; 719 720 /* 721 * We are running (mostly) V=P at this point 722 * 723 * Calculate NKPT - number of kernel page tables. We have to 724 * accomodoate prealloction of the vm_page_array, dump bitmap, 725 * MSGBUF_SIZE, and other stuff. Be generous. 726 * 727 * Maxmem is in pages. 728 * 729 * ndmpdp is the number of 1GB pages we wish to map. 730 */ 731 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 732 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 733 ndmpdp = 4; 734 KKASSERT(ndmpdp <= NKPDPE * NPDEPG); 735 736 /* 737 * Starting at the beginning of kvm (not KERNBASE). 738 */ 739 nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR; 740 nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR; 741 nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E + 742 ndmpdp) + 511) / 512; 743 nkpt_phys += 128; 744 745 /* 746 * Starting at KERNBASE - map 2G worth of page table pages. 747 * KERNBASE is offset -2G from the end of kvm. 748 */ 749 nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */ 750 751 /* 752 * Allocate pages 753 */ 754 KPTbase = allocpages(firstaddr, nkpt_base); 755 KPTphys = allocpages(firstaddr, nkpt_phys); 756 KPML4phys = allocpages(firstaddr, 1); 757 KPDPphys = allocpages(firstaddr, NKPML4E); 758 KPDphys = allocpages(firstaddr, NKPDPE); 759 760 /* 761 * Calculate the page directory base for KERNBASE, 762 * that is where we start populating the page table pages. 763 * Basically this is the end - 2. 764 */ 765 KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT); 766 767 DMPDPphys = allocpages(firstaddr, NDMPML4E); 768 if ((amd_feature & AMDID_PAGE1GB) == 0) 769 DMPDphys = allocpages(firstaddr, ndmpdp); 770 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 771 772 /* 773 * Fill in the underlying page table pages for the area around 774 * KERNBASE. This remaps low physical memory to KERNBASE. 775 * 776 * Read-only from zero to physfree 777 * XXX not fully used, underneath 2M pages 778 */ 779 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 780 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 781 ((pt_entry_t *)KPTbase)[i] |= 782 pmap_bits_default[PG_RW_IDX] | 783 pmap_bits_default[PG_V_IDX] | 784 pmap_bits_default[PG_G_IDX]; 785 } 786 787 /* 788 * Now map the initial kernel page tables. One block of page 789 * tables is placed at the beginning of kernel virtual memory, 790 * and another block is placed at KERNBASE to map the kernel binary, 791 * data, bss, and initial pre-allocations. 
792 */ 793 for (i = 0; i < nkpt_base; i++) { 794 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 795 ((pd_entry_t *)KPDbase)[i] |= 796 pmap_bits_default[PG_RW_IDX] | 797 pmap_bits_default[PG_V_IDX]; 798 } 799 for (i = 0; i < nkpt_phys; i++) { 800 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 801 ((pd_entry_t *)KPDphys)[i] |= 802 pmap_bits_default[PG_RW_IDX] | 803 pmap_bits_default[PG_V_IDX]; 804 } 805 806 /* 807 * Map from zero to end of allocations using 2M pages as an 808 * optimization. This will bypass some of the KPTBase pages 809 * above in the KERNBASE area. 810 */ 811 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 812 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 813 ((pd_entry_t *)KPDbase)[i] |= 814 pmap_bits_default[PG_RW_IDX] | 815 pmap_bits_default[PG_V_IDX] | 816 pmap_bits_default[PG_PS_IDX] | 817 pmap_bits_default[PG_G_IDX]; 818 } 819 820 /* 821 * And connect up the PD to the PDP. The kernel pmap is expected 822 * to pre-populate all of its PDs. See NKPDPE in vmparam.h. 823 */ 824 for (i = 0; i < NKPDPE; i++) { 825 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] = 826 KPDphys + (i << PAGE_SHIFT); 827 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |= 828 pmap_bits_default[PG_RW_IDX] | 829 pmap_bits_default[PG_V_IDX] | 830 pmap_bits_default[PG_U_IDX]; 831 } 832 833 /* 834 * Now set up the direct map space using either 2MB or 1GB pages 835 * Preset PG_M and PG_A because demotion expects it. 836 * 837 * When filling in entries in the PD pages make sure any excess 838 * entries are set to zero as we allocated enough PD pages 839 */ 840 if ((amd_feature & AMDID_PAGE1GB) == 0) { 841 for (i = 0; i < NPDEPG * ndmpdp; i++) { 842 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 843 ((pd_entry_t *)DMPDphys)[i] |= 844 pmap_bits_default[PG_RW_IDX] | 845 pmap_bits_default[PG_V_IDX] | 846 pmap_bits_default[PG_PS_IDX] | 847 pmap_bits_default[PG_G_IDX] | 848 pmap_bits_default[PG_M_IDX] | 849 pmap_bits_default[PG_A_IDX]; 850 } 851 852 /* 853 * And the direct map space's PDP 854 */ 855 for (i = 0; i < ndmpdp; i++) { 856 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 857 (i << PAGE_SHIFT); 858 ((pdp_entry_t *)DMPDPphys)[i] |= 859 pmap_bits_default[PG_RW_IDX] | 860 pmap_bits_default[PG_V_IDX] | 861 pmap_bits_default[PG_U_IDX]; 862 } 863 } else { 864 for (i = 0; i < ndmpdp; i++) { 865 ((pdp_entry_t *)DMPDPphys)[i] = 866 (vm_paddr_t)i << PDPSHIFT; 867 ((pdp_entry_t *)DMPDPphys)[i] |= 868 pmap_bits_default[PG_RW_IDX] | 869 pmap_bits_default[PG_V_IDX] | 870 pmap_bits_default[PG_PS_IDX] | 871 pmap_bits_default[PG_G_IDX] | 872 pmap_bits_default[PG_M_IDX] | 873 pmap_bits_default[PG_A_IDX]; 874 } 875 } 876 877 /* And recursively map PML4 to itself in order to get PTmap */ 878 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 879 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= 880 pmap_bits_default[PG_RW_IDX] | 881 pmap_bits_default[PG_V_IDX] | 882 pmap_bits_default[PG_U_IDX]; 883 884 /* 885 * Connect the Direct Map slots up to the PML4 886 */ 887 for (j = 0; j < NDMPML4E; ++j) { 888 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 889 (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) | 890 pmap_bits_default[PG_RW_IDX] | 891 pmap_bits_default[PG_V_IDX] | 892 pmap_bits_default[PG_U_IDX]; 893 } 894 895 /* 896 * Connect the KVA slot up to the PML4 897 */ 898 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 899 ((pdp_entry_t *)KPML4phys)[KPML4I] |= 900 pmap_bits_default[PG_RW_IDX] | 901 pmap_bits_default[PG_V_IDX] | 902 pmap_bits_default[PG_U_IDX]; 903 } 904 905 /* 906 * Bootstrap the system enough to run with 
virtual memory. 907 * 908 * On the i386 this is called after mapping has already been enabled 909 * and just syncs the pmap module with what has already been done. 910 * [We can't call it easily with mapping off since the kernel is not 911 * mapped with PA == VA, hence we would have to relocate every address 912 * from the linked base (virtual) address "KERNBASE" to the actual 913 * (physical) address starting relative to 0] 914 */ 915 void 916 pmap_bootstrap(vm_paddr_t *firstaddr) 917 { 918 vm_offset_t va; 919 pt_entry_t *pte; 920 int i; 921 922 KvaStart = VM_MIN_KERNEL_ADDRESS; 923 KvaEnd = VM_MAX_KERNEL_ADDRESS; 924 KvaSize = KvaEnd - KvaStart; 925 926 avail_start = *firstaddr; 927 928 /* 929 * Create an initial set of page tables to run the kernel in. 930 */ 931 create_pagetables(firstaddr); 932 933 virtual2_start = KvaStart; 934 virtual2_end = PTOV_OFFSET; 935 936 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 937 virtual_start = pmap_kmem_choose(virtual_start); 938 939 virtual_end = VM_MAX_KERNEL_ADDRESS; 940 941 /* XXX do %cr0 as well */ 942 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 943 load_cr3(KPML4phys); 944 945 /* 946 * Initialize protection array. 947 */ 948 i386_protection_init(); 949 950 /* 951 * The kernel's pmap is statically allocated so we don't have to use 952 * pmap_create, which is unlikely to work correctly at this part of 953 * the boot sequence (XXX and which no longer exists). 954 */ 955 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 956 kernel_pmap.pm_count = 1; 957 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 958 RB_INIT(&kernel_pmap.pm_pvroot); 959 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 960 for (i = 0; i < PM_PLACEMARKS; ++i) 961 kernel_pmap.pm_placemarks[i] = PM_NOPLACEMARK; 962 963 /* 964 * Reserve some special page table entries/VA space for temporary 965 * mapping of pages. 966 */ 967 #define SYSMAP(c, p, v, n) \ 968 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 969 970 va = virtual_start; 971 pte = vtopte(va); 972 973 /* 974 * CMAP1/CMAP2 are used for zeroing and copying pages. 975 */ 976 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 977 978 /* 979 * Crashdump maps. 980 */ 981 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 982 983 /* 984 * ptvmmap is used for reading arbitrary physical pages via 985 * /dev/mem. 986 */ 987 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 988 989 /* 990 * msgbufp is used to map the system message buffer. 991 * XXX msgbufmap is not used. 992 */ 993 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 994 atop(round_page(MSGBUF_SIZE))) 995 996 virtual_start = va; 997 virtual_start = pmap_kmem_choose(virtual_start); 998 999 *CMAP1 = 0; 1000 1001 /* 1002 * PG_G is terribly broken on SMP because we IPI invltlb's in some 1003 * cases rather then invl1pg. Actually, I don't even know why it 1004 * works under UP because self-referential page table mappings 1005 */ 1006 // pgeflag = 0; 1007 1008 /* 1009 * Initialize the 4MB page size flag 1010 */ 1011 // pseflag = 0; 1012 /* 1013 * The 4MB page version of the initial 1014 * kernel page mapping. 
1015 */ 1016 pdir4mb = 0; 1017 1018 #if !defined(DISABLE_PSE) 1019 if (cpu_feature & CPUID_PSE) { 1020 pt_entry_t ptditmp; 1021 /* 1022 * Note that we have enabled PSE mode 1023 */ 1024 // pseflag = kernel_pmap.pmap_bits[PG_PS_IDX]; 1025 ptditmp = *(PTmap + x86_64_btop(KERNBASE)); 1026 ptditmp &= ~(NBPDR - 1); 1027 ptditmp |= pmap_bits_default[PG_V_IDX] | 1028 pmap_bits_default[PG_RW_IDX] | 1029 pmap_bits_default[PG_PS_IDX] | 1030 pmap_bits_default[PG_U_IDX]; 1031 // pgeflag; 1032 pdir4mb = ptditmp; 1033 } 1034 #endif 1035 cpu_invltlb(); 1036 1037 /* Initialize the PAT MSR */ 1038 pmap_init_pat(); 1039 pmap_pinit_defaults(&kernel_pmap); 1040 1041 TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync", 1042 &pmap_fast_kernel_cpusync); 1043 1044 } 1045 1046 /* 1047 * Setup the PAT MSR. 1048 */ 1049 void 1050 pmap_init_pat(void) 1051 { 1052 uint64_t pat_msr; 1053 u_long cr0, cr4; 1054 1055 /* 1056 * Default values mapping PATi,PCD,PWT bits at system reset. 1057 * The default values effectively ignore the PATi bit by 1058 * repeating the encodings for 0-3 in 4-7, and map the PCD 1059 * and PWT bit combinations to the expected PAT types. 1060 */ 1061 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 1062 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 1063 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 1064 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 1065 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 1066 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 1067 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 1068 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 1069 pat_pte_index[PAT_WRITE_BACK] = 0; 1070 pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; 1071 pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; 1072 pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; 1073 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 1074 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 1075 1076 if (cpu_feature & CPUID_PAT) { 1077 /* 1078 * If we support the PAT then set-up entries for 1079 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 1080 * 4 and 5. 1081 */ 1082 pat_msr = (pat_msr & ~PAT_MASK(4)) | 1083 PAT_VALUE(4, PAT_WRITE_PROTECTED); 1084 pat_msr = (pat_msr & ~PAT_MASK(5)) | 1085 PAT_VALUE(5, PAT_WRITE_COMBINING); 1086 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0; 1087 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PWT; 1088 1089 /* 1090 * Then enable the PAT 1091 */ 1092 1093 /* Disable PGE. */ 1094 cr4 = rcr4(); 1095 load_cr4(cr4 & ~CR4_PGE); 1096 1097 /* Disable caches (CD = 1, NW = 0). */ 1098 cr0 = rcr0(); 1099 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1100 1101 /* Flushes caches and TLBs. */ 1102 wbinvd(); 1103 cpu_invltlb(); 1104 1105 /* Update PAT and index table. */ 1106 wrmsr(MSR_PAT, pat_msr); 1107 1108 /* Flush caches and TLBs again. */ 1109 wbinvd(); 1110 cpu_invltlb(); 1111 1112 /* Restore caches and PGE. */ 1113 load_cr0(cr0); 1114 load_cr4(cr4); 1115 PatMsr = pat_msr; 1116 } 1117 } 1118 1119 /* 1120 * Set 4mb pdir for mp startup 1121 */ 1122 void 1123 pmap_set_opt(void) 1124 { 1125 if (cpu_feature & CPUID_PSE) { 1126 load_cr4(rcr4() | CR4_PSE); 1127 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */ 1128 cpu_invltlb(); 1129 } 1130 } 1131 } 1132 1133 /* 1134 * Initialize the pmap module. 1135 * Called by vm_init, to initialize any structures that the pmap 1136 * system needs to map virtual memory. 1137 * pmap_init has been enhanced to support in a fairly consistant 1138 * way, discontiguous physical memory. 
1139 */ 1140 void 1141 pmap_init(void) 1142 { 1143 int i; 1144 int initial_pvs; 1145 1146 /* 1147 * Allocate memory for random pmap data structures. Includes the 1148 * pv_head_table. 1149 */ 1150 1151 for (i = 0; i < vm_page_array_size; i++) { 1152 vm_page_t m; 1153 1154 m = &vm_page_array[i]; 1155 TAILQ_INIT(&m->md.pv_list); 1156 } 1157 1158 /* 1159 * init the pv free list 1160 */ 1161 initial_pvs = vm_page_array_size; 1162 if (initial_pvs < MINPV) 1163 initial_pvs = MINPV; 1164 pvzone = &pvzone_store; 1165 pvinit = (void *)kmem_alloc(&kernel_map, 1166 initial_pvs * sizeof (struct pv_entry), 1167 VM_SUBSYS_PVENTRY); 1168 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 1169 pvinit, initial_pvs); 1170 1171 /* 1172 * Now it is safe to enable pv_table recording. 1173 */ 1174 pmap_initialized = TRUE; 1175 } 1176 1177 /* 1178 * Initialize the address space (zone) for the pv_entries. Set a 1179 * high water mark so that the system can recover from excessive 1180 * numbers of pv entries. 1181 */ 1182 void 1183 pmap_init2(void) 1184 { 1185 int shpgperproc = PMAP_SHPGPERPROC; 1186 int entry_max; 1187 1188 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1189 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 1190 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1191 pv_entry_high_water = 9 * (pv_entry_max / 10); 1192 1193 /* 1194 * Subtract out pages already installed in the zone (hack) 1195 */ 1196 entry_max = pv_entry_max - vm_page_array_size; 1197 if (entry_max <= 0) 1198 entry_max = 1; 1199 1200 zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT); 1201 1202 /* 1203 * Enable dynamic deletion of empty higher-level page table pages 1204 * by default only if system memory is < 8GB (use 7GB for slop). 1205 * This can save a little memory, but imposes significant 1206 * performance overhead for things like bulk builds, and for programs 1207 * which do a lot of memory mapping and memory unmapping. 1208 */ 1209 if (pmap_dynamic_delete < 0) { 1210 if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE) 1211 pmap_dynamic_delete = 1; 1212 else 1213 pmap_dynamic_delete = 0; 1214 } 1215 } 1216 1217 /* 1218 * Typically used to initialize a fictitious page by vm/device_pager.c 1219 */ 1220 void 1221 pmap_page_init(struct vm_page *m) 1222 { 1223 vm_page_init(m); 1224 TAILQ_INIT(&m->md.pv_list); 1225 } 1226 1227 /*************************************************** 1228 * Low level helper routines..... 1229 ***************************************************/ 1230 1231 /* 1232 * this routine defines the region(s) of memory that should 1233 * not be tested for the modified bit. 1234 */ 1235 static __inline 1236 int 1237 pmap_track_modified(vm_pindex_t pindex) 1238 { 1239 vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT; 1240 if ((va < clean_sva) || (va >= clean_eva)) 1241 return 1; 1242 else 1243 return 0; 1244 } 1245 1246 /* 1247 * Extract the physical page address associated with the map/VA pair. 1248 * The page must be wired for this to work reliably. 
1249 */ 1250 vm_paddr_t 1251 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) 1252 { 1253 vm_paddr_t rtval; 1254 pv_entry_t pt_pv; 1255 pt_entry_t *ptep; 1256 1257 rtval = 0; 1258 if (va >= VM_MAX_USER_ADDRESS) { 1259 /* 1260 * Kernel page directories might be direct-mapped and 1261 * there is typically no PV tracking of pte's 1262 */ 1263 pd_entry_t *pt; 1264 1265 pt = pmap_pt(pmap, va); 1266 if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) { 1267 if (*pt & pmap->pmap_bits[PG_PS_IDX]) { 1268 rtval = *pt & PG_PS_FRAME; 1269 rtval |= va & PDRMASK; 1270 } else { 1271 ptep = pmap_pt_to_pte(*pt, va); 1272 if (*pt & pmap->pmap_bits[PG_V_IDX]) { 1273 rtval = *ptep & PG_FRAME; 1274 rtval |= va & PAGE_MASK; 1275 } 1276 } 1277 } 1278 if (handlep) 1279 *handlep = NULL; 1280 } else { 1281 /* 1282 * User pages currently do not direct-map the page directory 1283 * and some pages might not used managed PVs. But all PT's 1284 * will have a PV. 1285 */ 1286 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1287 if (pt_pv) { 1288 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1289 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 1290 rtval = *ptep & PG_FRAME; 1291 rtval |= va & PAGE_MASK; 1292 } 1293 if (handlep) 1294 *handlep = pt_pv; /* locked until done */ 1295 else 1296 pv_put (pt_pv); 1297 } else if (handlep) { 1298 *handlep = NULL; 1299 } 1300 } 1301 return rtval; 1302 } 1303 1304 void 1305 pmap_extract_done(void *handle) 1306 { 1307 if (handle) 1308 pv_put((pv_entry_t)handle); 1309 } 1310 1311 /* 1312 * Similar to extract but checks protections, SMP-friendly short-cut for 1313 * vm_fault_page[_quick](). Can return NULL to cause the caller to 1314 * fall-through to the real fault code. Does not work with HVM page 1315 * tables. 1316 * 1317 * The returned page, if not NULL, is held (and not busied). 1318 * 1319 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING 1320 * OR WRITING AS-IS. 1321 */ 1322 vm_page_t 1323 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp) 1324 { 1325 if (pmap && 1326 va < VM_MAX_USER_ADDRESS && 1327 (pmap->pm_flags & PMAP_HVM) == 0) { 1328 pv_entry_t pt_pv; 1329 pv_entry_t pte_pv; 1330 pt_entry_t *ptep; 1331 pt_entry_t req; 1332 vm_page_t m; 1333 int error; 1334 1335 req = pmap->pmap_bits[PG_V_IDX] | 1336 pmap->pmap_bits[PG_U_IDX]; 1337 if (prot & VM_PROT_WRITE) 1338 req |= pmap->pmap_bits[PG_RW_IDX]; 1339 1340 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1341 if (pt_pv == NULL) 1342 return (NULL); 1343 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1344 if ((*ptep & req) != req) { 1345 pv_put(pt_pv); 1346 return (NULL); 1347 } 1348 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error); 1349 if (pte_pv && error == 0) { 1350 m = pte_pv->pv_m; 1351 if (prot & VM_PROT_WRITE) { 1352 /* interlocked by presence of pv_entry */ 1353 vm_page_dirty(m); 1354 } 1355 if (busyp) { 1356 if (prot & VM_PROT_WRITE) { 1357 if (vm_page_busy_try(m, TRUE)) 1358 m = NULL; 1359 *busyp = 1; 1360 } else { 1361 vm_page_hold(m); 1362 *busyp = 0; 1363 } 1364 } else { 1365 vm_page_hold(m); 1366 } 1367 pv_put(pte_pv); 1368 } else if (pte_pv) { 1369 pv_drop(pte_pv); 1370 m = NULL; 1371 } else { 1372 /* error, since we didn't request a placemarker */ 1373 m = NULL; 1374 } 1375 pv_put(pt_pv); 1376 return(m); 1377 } else { 1378 return(NULL); 1379 } 1380 } 1381 1382 /* 1383 * Extract the physical page address associated kernel virtual address. 
1384 */ 1385 vm_paddr_t 1386 pmap_kextract(vm_offset_t va) 1387 { 1388 pd_entry_t pt; /* pt entry in pd */ 1389 vm_paddr_t pa; 1390 1391 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1392 pa = DMAP_TO_PHYS(va); 1393 } else { 1394 pt = *vtopt(va); 1395 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) { 1396 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1397 } else { 1398 /* 1399 * Beware of a concurrent promotion that changes the 1400 * PDE at this point! For example, vtopte() must not 1401 * be used to access the PTE because it would use the 1402 * new PDE. It is, however, safe to use the old PDE 1403 * because the page table page is preserved by the 1404 * promotion. 1405 */ 1406 pa = *pmap_pt_to_pte(pt, va); 1407 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1408 } 1409 } 1410 return pa; 1411 } 1412 1413 /*************************************************** 1414 * Low level mapping routines..... 1415 ***************************************************/ 1416 1417 /* 1418 * Routine: pmap_kenter 1419 * Function: 1420 * Add a wired page to the KVA 1421 * NOTE! note that in order for the mapping to take effect -- you 1422 * should do an invltlb after doing the pmap_kenter(). 1423 */ 1424 void 1425 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1426 { 1427 pt_entry_t *ptep; 1428 pt_entry_t npte; 1429 1430 npte = pa | 1431 kernel_pmap.pmap_bits[PG_RW_IDX] | 1432 kernel_pmap.pmap_bits[PG_V_IDX]; 1433 // pgeflag; 1434 ptep = vtopte(va); 1435 #if 1 1436 pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte); 1437 #else 1438 /* FUTURE */ 1439 if (*ptep) 1440 pmap_inval_smp(&kernel_pmap, va, ptep, npte); 1441 else 1442 *ptep = npte; 1443 #endif 1444 } 1445 1446 /* 1447 * Similar to pmap_kenter(), except we only invalidate the mapping on the 1448 * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't 1449 * (caller can conditionalize calling smp_invltlb()). 1450 */ 1451 int 1452 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1453 { 1454 pt_entry_t *ptep; 1455 pt_entry_t npte; 1456 int res; 1457 1458 npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] | 1459 kernel_pmap.pmap_bits[PG_V_IDX]; 1460 // npte |= pgeflag; 1461 ptep = vtopte(va); 1462 #if 1 1463 res = 1; 1464 #else 1465 /* FUTURE */ 1466 res = (*ptep != 0); 1467 #endif 1468 atomic_swap_long(ptep, npte); 1469 cpu_invlpg((void *)va); 1470 1471 return res; 1472 } 1473 1474 /* 1475 * Enter addresses into the kernel pmap but don't bother 1476 * doing any tlb invalidations. Caller will do a rollup 1477 * invalidation via pmap_rollup_inval(). 1478 */ 1479 int 1480 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 1481 { 1482 pt_entry_t *ptep; 1483 pt_entry_t npte; 1484 int res; 1485 1486 npte = pa | 1487 kernel_pmap.pmap_bits[PG_RW_IDX] | 1488 kernel_pmap.pmap_bits[PG_V_IDX]; 1489 // pgeflag; 1490 ptep = vtopte(va); 1491 #if 1 1492 res = 1; 1493 #else 1494 /* FUTURE */ 1495 res = (*ptep != 0); 1496 #endif 1497 atomic_swap_long(ptep, npte); 1498 cpu_invlpg((void *)va); 1499 1500 return res; 1501 } 1502 1503 /* 1504 * remove a page from the kernel pagetables 1505 */ 1506 void 1507 pmap_kremove(vm_offset_t va) 1508 { 1509 pt_entry_t *ptep; 1510 1511 ptep = vtopte(va); 1512 pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0); 1513 } 1514 1515 void 1516 pmap_kremove_quick(vm_offset_t va) 1517 { 1518 pt_entry_t *ptep; 1519 1520 ptep = vtopte(va); 1521 (void)pte_load_clear(ptep); 1522 cpu_invlpg((void *)va); 1523 } 1524 1525 /* 1526 * Remove addresses from the kernel pmap but don't bother 1527 * doing any tlb invalidations. 
Caller will do a rollup 1528 * invalidation via pmap_rollup_inval(). 1529 */ 1530 void 1531 pmap_kremove_noinval(vm_offset_t va) 1532 { 1533 pt_entry_t *ptep; 1534 1535 ptep = vtopte(va); 1536 (void)pte_load_clear(ptep); 1537 } 1538 1539 /* 1540 * XXX these need to be recoded. They are not used in any critical path. 1541 */ 1542 void 1543 pmap_kmodify_rw(vm_offset_t va) 1544 { 1545 atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]); 1546 cpu_invlpg((void *)va); 1547 } 1548 1549 /* NOT USED 1550 void 1551 pmap_kmodify_nc(vm_offset_t va) 1552 { 1553 atomic_set_long(vtopte(va), PG_N); 1554 cpu_invlpg((void *)va); 1555 } 1556 */ 1557 1558 /* 1559 * Used to map a range of physical addresses into kernel virtual 1560 * address space during the low level boot, typically to map the 1561 * dump bitmap, message buffer, and vm_page_array. 1562 * 1563 * These mappings are typically made at some pointer after the end of the 1564 * kernel text+data. 1565 * 1566 * We could return PHYS_TO_DMAP(start) here and not allocate any 1567 * via (*virtp), but then kmem from userland and kernel dumps won't 1568 * have access to the related pointers. 1569 */ 1570 vm_offset_t 1571 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1572 { 1573 vm_offset_t va; 1574 vm_offset_t va_start; 1575 1576 /*return PHYS_TO_DMAP(start);*/ 1577 1578 va_start = *virtp; 1579 va = va_start; 1580 1581 while (start < end) { 1582 pmap_kenter_quick(va, start); 1583 va += PAGE_SIZE; 1584 start += PAGE_SIZE; 1585 } 1586 *virtp = va; 1587 return va_start; 1588 } 1589 1590 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1591 1592 /* 1593 * Remove the specified set of pages from the data and instruction caches. 1594 * 1595 * In contrast to pmap_invalidate_cache_range(), this function does not 1596 * rely on the CPU's self-snoop feature, because it is intended for use 1597 * when moving pages into a different cache domain. 1598 */ 1599 void 1600 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1601 { 1602 vm_offset_t daddr, eva; 1603 int i; 1604 1605 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1606 (cpu_feature & CPUID_CLFSH) == 0) 1607 wbinvd(); 1608 else { 1609 cpu_mfence(); 1610 for (i = 0; i < count; i++) { 1611 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1612 eva = daddr + PAGE_SIZE; 1613 for (; daddr < eva; daddr += cpu_clflush_line_size) 1614 clflush(daddr); 1615 } 1616 cpu_mfence(); 1617 } 1618 } 1619 1620 void 1621 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1622 { 1623 KASSERT((sva & PAGE_MASK) == 0, 1624 ("pmap_invalidate_cache_range: sva not page-aligned")); 1625 KASSERT((eva & PAGE_MASK) == 0, 1626 ("pmap_invalidate_cache_range: eva not page-aligned")); 1627 1628 if (cpu_feature & CPUID_SS) { 1629 ; /* If "Self Snoop" is supported, do nothing. */ 1630 } else { 1631 /* Globally invalidate caches */ 1632 cpu_wbinvd_on_all_cpus(); 1633 } 1634 } 1635 1636 /* 1637 * Invalidate the specified range of virtual memory on all cpus associated 1638 * with the pmap. 1639 */ 1640 void 1641 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1642 { 1643 pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); 1644 } 1645 1646 /* 1647 * Add a list of wired pages to the kva. This routine is used for temporary 1648 * kernel mappings such as those found in buffer cache buffer. Page 1649 * modifications and accesses are not tracked or recorded. 1650 * 1651 * NOTE! 
Old mappings are simply overwritten, and we cannot assume relaxed 1652 * semantics as previous mappings may have been zerod without any 1653 * invalidation. 1654 * 1655 * The page *must* be wired. 1656 */ 1657 void 1658 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 1659 { 1660 vm_offset_t end_va; 1661 vm_offset_t va; 1662 1663 end_va = beg_va + count * PAGE_SIZE; 1664 1665 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1666 pt_entry_t pte; 1667 pt_entry_t *ptep; 1668 1669 ptep = vtopte(va); 1670 pte = VM_PAGE_TO_PHYS(*m) | 1671 kernel_pmap.pmap_bits[PG_RW_IDX] | 1672 kernel_pmap.pmap_bits[PG_V_IDX] | 1673 kernel_pmap.pmap_cache_bits[(*m)->pat_mode]; 1674 // pgeflag; 1675 atomic_swap_long(ptep, pte); 1676 m++; 1677 } 1678 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1679 } 1680 1681 /* 1682 * This routine jerks page mappings from the kernel -- it is meant only 1683 * for temporary mappings such as those found in buffer cache buffers. 1684 * No recording modified or access status occurs. 1685 * 1686 * MPSAFE, INTERRUPT SAFE (cluster callback) 1687 */ 1688 void 1689 pmap_qremove(vm_offset_t beg_va, int count) 1690 { 1691 vm_offset_t end_va; 1692 vm_offset_t va; 1693 1694 end_va = beg_va + count * PAGE_SIZE; 1695 1696 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1697 pt_entry_t *pte; 1698 1699 pte = vtopte(va); 1700 (void)pte_load_clear(pte); 1701 cpu_invlpg((void *)va); 1702 } 1703 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1704 } 1705 1706 /* 1707 * This routine removes temporary kernel mappings, only invalidating them 1708 * on the current cpu. It should only be used under carefully controlled 1709 * conditions. 1710 */ 1711 void 1712 pmap_qremove_quick(vm_offset_t beg_va, int count) 1713 { 1714 vm_offset_t end_va; 1715 vm_offset_t va; 1716 1717 end_va = beg_va + count * PAGE_SIZE; 1718 1719 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1720 pt_entry_t *pte; 1721 1722 pte = vtopte(va); 1723 (void)pte_load_clear(pte); 1724 cpu_invlpg((void *)va); 1725 } 1726 } 1727 1728 /* 1729 * This routine removes temporary kernel mappings *without* invalidating 1730 * the TLB. It can only be used on permanent kva reservations such as those 1731 * found in buffer cache buffers, under carefully controlled circumstances. 1732 * 1733 * NOTE: Repopulating these KVAs requires unconditional invalidation. 1734 * (pmap_qenter() does unconditional invalidation). 1735 */ 1736 void 1737 pmap_qremove_noinval(vm_offset_t beg_va, int count) 1738 { 1739 vm_offset_t end_va; 1740 vm_offset_t va; 1741 1742 end_va = beg_va + count * PAGE_SIZE; 1743 1744 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1745 pt_entry_t *pte; 1746 1747 pte = vtopte(va); 1748 (void)pte_load_clear(pte); 1749 } 1750 } 1751 1752 /* 1753 * Create a new thread and optionally associate it with a (new) process. 1754 * NOTE! the new thread's cpu may not equal the current cpu. 1755 */ 1756 void 1757 pmap_init_thread(thread_t td) 1758 { 1759 /* enforce pcb placement & alignment */ 1760 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1761 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 1762 td->td_savefpu = &td->td_pcb->pcb_save; 1763 td->td_sp = (char *)td->td_pcb; /* no -16 */ 1764 } 1765 1766 /* 1767 * This routine directly affects the fork perf for a process. 
1768 */ 1769 void 1770 pmap_init_proc(struct proc *p) 1771 { 1772 } 1773 1774 static void 1775 pmap_pinit_defaults(struct pmap *pmap) 1776 { 1777 bcopy(pmap_bits_default, pmap->pmap_bits, 1778 sizeof(pmap_bits_default)); 1779 bcopy(protection_codes, pmap->protection_codes, 1780 sizeof(protection_codes)); 1781 bcopy(pat_pte_index, pmap->pmap_cache_bits, 1782 sizeof(pat_pte_index)); 1783 pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; 1784 pmap->copyinstr = std_copyinstr; 1785 pmap->copyin = std_copyin; 1786 pmap->copyout = std_copyout; 1787 pmap->fubyte = std_fubyte; 1788 pmap->subyte = std_subyte; 1789 pmap->fuword32 = std_fuword32; 1790 pmap->fuword64 = std_fuword64; 1791 pmap->suword32 = std_suword32; 1792 pmap->suword64 = std_suword64; 1793 pmap->swapu32 = std_swapu32; 1794 pmap->swapu64 = std_swapu64; 1795 } 1796 /* 1797 * Initialize pmap0/vmspace0. 1798 * 1799 * On architectures where the kernel pmap is not integrated into the user 1800 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1801 * kernel_pmap should be used to directly access the kernel_pmap. 1802 */ 1803 void 1804 pmap_pinit0(struct pmap *pmap) 1805 { 1806 int i; 1807 1808 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1809 pmap->pm_count = 1; 1810 CPUMASK_ASSZERO(pmap->pm_active); 1811 pmap->pm_pvhint = NULL; 1812 RB_INIT(&pmap->pm_pvroot); 1813 spin_init(&pmap->pm_spin, "pmapinit0"); 1814 for (i = 0; i < PM_PLACEMARKS; ++i) 1815 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 1816 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1817 pmap_pinit_defaults(pmap); 1818 } 1819 1820 /* 1821 * Initialize a preallocated and zeroed pmap structure, 1822 * such as one in a vmspace structure. 1823 */ 1824 static void 1825 pmap_pinit_simple(struct pmap *pmap) 1826 { 1827 int i; 1828 1829 /* 1830 * Misc initialization 1831 */ 1832 pmap->pm_count = 1; 1833 CPUMASK_ASSZERO(pmap->pm_active); 1834 pmap->pm_pvhint = NULL; 1835 pmap->pm_flags = PMAP_FLAG_SIMPLE; 1836 1837 pmap_pinit_defaults(pmap); 1838 1839 /* 1840 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 1841 * for this). 1842 */ 1843 if (pmap->pm_pmlpv == NULL) { 1844 RB_INIT(&pmap->pm_pvroot); 1845 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1846 spin_init(&pmap->pm_spin, "pmapinitsimple"); 1847 for (i = 0; i < PM_PLACEMARKS; ++i) 1848 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 1849 } 1850 } 1851 1852 void 1853 pmap_pinit(struct pmap *pmap) 1854 { 1855 pv_entry_t pv; 1856 int j; 1857 1858 if (pmap->pm_pmlpv) { 1859 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { 1860 pmap_puninit(pmap); 1861 } 1862 } 1863 1864 pmap_pinit_simple(pmap); 1865 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 1866 1867 /* 1868 * No need to allocate page table space yet but we do need a valid 1869 * page directory table. 1870 */ 1871 if (pmap->pm_pml4 == NULL) { 1872 pmap->pm_pml4 = 1873 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, 1874 PAGE_SIZE, 1875 VM_SUBSYS_PML4); 1876 } 1877 1878 /* 1879 * Allocate the page directory page, which wires it even though 1880 * it isn't being entered into some higher level page table (it 1881 * being the highest level). If one is already cached we don't 1882 * have to do anything. 1883 */ 1884 if ((pv = pmap->pm_pmlpv) == NULL) { 1885 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 1886 pmap->pm_pmlpv = pv; 1887 pmap_kenter((vm_offset_t)pmap->pm_pml4, 1888 VM_PAGE_TO_PHYS(pv->pv_m)); 1889 pv_put(pv); 1890 1891 /* 1892 * Install DMAP and KMAP. 
1893 */ 1894 for (j = 0; j < NDMPML4E; ++j) { 1895 pmap->pm_pml4[DMPML4I + j] = 1896 (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) | 1897 pmap->pmap_bits[PG_RW_IDX] | 1898 pmap->pmap_bits[PG_V_IDX] | 1899 pmap->pmap_bits[PG_U_IDX]; 1900 } 1901 pmap->pm_pml4[KPML4I] = KPDPphys | 1902 pmap->pmap_bits[PG_RW_IDX] | 1903 pmap->pmap_bits[PG_V_IDX] | 1904 pmap->pmap_bits[PG_U_IDX]; 1905 1906 /* 1907 * install self-referential address mapping entry 1908 */ 1909 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 1910 pmap->pmap_bits[PG_V_IDX] | 1911 pmap->pmap_bits[PG_RW_IDX] | 1912 pmap->pmap_bits[PG_A_IDX] | 1913 pmap->pmap_bits[PG_M_IDX]; 1914 } else { 1915 KKASSERT(pv->pv_m->flags & PG_MAPPED); 1916 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 1917 } 1918 KKASSERT(pmap->pm_pml4[255] == 0); 1919 KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv); 1920 KKASSERT(pv->pv_entry.rbe_left == NULL); 1921 KKASSERT(pv->pv_entry.rbe_right == NULL); 1922 } 1923 1924 /* 1925 * Clean up a pmap structure so it can be physically freed. This routine 1926 * is called by the vmspace dtor function. A great deal of pmap data is 1927 * left passively mapped to improve vmspace management so we have a bit 1928 * of cleanup work to do here. 1929 */ 1930 void 1931 pmap_puninit(pmap_t pmap) 1932 { 1933 pv_entry_t pv; 1934 vm_page_t p; 1935 1936 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 1937 if ((pv = pmap->pm_pmlpv) != NULL) { 1938 if (pv_hold_try(pv) == 0) 1939 pv_lock(pv); 1940 KKASSERT(pv == pmap->pm_pmlpv); 1941 p = pmap_remove_pv_page(pv); 1942 pv_free(pv, NULL); 1943 pv = NULL; /* safety */ 1944 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1945 vm_page_busy_wait(p, FALSE, "pgpun"); 1946 KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); 1947 vm_page_unwire(p, 0); 1948 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1949 1950 /* 1951 * XXX eventually clean out PML4 static entries and 1952 * use vm_page_free_zero() 1953 */ 1954 vm_page_free(p); 1955 pmap->pm_pmlpv = NULL; 1956 } 1957 if (pmap->pm_pml4) { 1958 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1959 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1960 pmap->pm_pml4 = NULL; 1961 } 1962 KKASSERT(pmap->pm_stats.resident_count == 0); 1963 KKASSERT(pmap->pm_stats.wired_count == 0); 1964 } 1965 1966 /* 1967 * This function is now unused (used to add the pmap to the pmap_list) 1968 */ 1969 void 1970 pmap_pinit2(struct pmap *pmap) 1971 { 1972 } 1973 1974 /* 1975 * This routine is called when various levels in the page table need to 1976 * be populated. This routine cannot fail. 1977 * 1978 * This function returns two locked pv_entry's, one representing the 1979 * requested pv and one representing the requested pv's parent pv. If 1980 * an intermediate page table does not exist it will be created, mapped, 1981 * wired, and the parent page table will be given an additional hold 1982 * count representing the presence of the child pv_entry. 1983 */ 1984 static 1985 pv_entry_t 1986 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 1987 { 1988 pt_entry_t *ptep; 1989 pv_entry_t pv; 1990 pv_entry_t pvp; 1991 pt_entry_t v; 1992 vm_pindex_t pt_pindex; 1993 vm_page_t m; 1994 int isnew; 1995 int ispt; 1996 1997 /* 1998 * If the pv already exists and we aren't being asked for the 1999 * parent page table page we can just return it. A locked+held pv 2000 * is returned. The pv will also have a second hold related to the 2001 * pmap association that we don't have to worry about. 
2002 */ 2003 ispt = 0; 2004 pv = pv_alloc(pmap, ptepindex, &isnew); 2005 if (isnew == 0 && pvpp == NULL) 2006 return(pv); 2007 2008 /* 2009 * Special case terminal PVs. These are not page table pages so 2010 * no vm_page is allocated (the caller supplied the vm_page). If 2011 * pvpp is non-NULL we are being asked to also removed the pt_pv 2012 * for this pv. 2013 * 2014 * Note that pt_pv's are only returned for user VAs. We assert that 2015 * a pt_pv is not being requested for kernel VAs. The kernel 2016 * pre-wires all higher-level page tables so don't overload managed 2017 * higher-level page tables on top of it! 2018 */ 2019 if (ptepindex < pmap_pt_pindex(0)) { 2020 if (ptepindex >= NUPTE_USER) { 2021 /* kernel manages this manually for KVM */ 2022 KKASSERT(pvpp == NULL); 2023 } else { 2024 KKASSERT(pvpp != NULL); 2025 pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); 2026 pvp = pmap_allocpte(pmap, pt_pindex, NULL); 2027 if (isnew) 2028 vm_page_wire_quick(pvp->pv_m); 2029 *pvpp = pvp; 2030 } 2031 return(pv); 2032 } 2033 2034 /* 2035 * The kernel never uses managed PT/PD/PDP pages. 2036 */ 2037 KKASSERT(pmap != &kernel_pmap); 2038 2039 /* 2040 * Non-terminal PVs allocate a VM page to represent the page table, 2041 * so we have to resolve pvp and calculate ptepindex for the pvp 2042 * and then for the page table entry index in the pvp for 2043 * fall-through. 2044 */ 2045 if (ptepindex < pmap_pd_pindex(0)) { 2046 /* 2047 * pv is PT, pvp is PD 2048 */ 2049 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 2050 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 2051 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2052 2053 /* 2054 * PT index in PD 2055 */ 2056 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 2057 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 2058 ispt = 1; 2059 } else if (ptepindex < pmap_pdp_pindex(0)) { 2060 /* 2061 * pv is PD, pvp is PDP 2062 * 2063 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 2064 * the PD. 2065 */ 2066 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 2067 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2068 2069 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 2070 KKASSERT(pvpp == NULL); 2071 pvp = NULL; 2072 } else { 2073 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2074 } 2075 2076 /* 2077 * PD index in PDP 2078 */ 2079 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 2080 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 2081 } else if (ptepindex < pmap_pml4_pindex()) { 2082 /* 2083 * pv is PDP, pvp is the root pml4 table 2084 */ 2085 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2086 2087 /* 2088 * PDP index in PML4 2089 */ 2090 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 2091 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 2092 } else { 2093 /* 2094 * pv represents the top-level PML4, there is no parent. 2095 */ 2096 pvp = NULL; 2097 } 2098 2099 if (isnew == 0) 2100 goto notnew; 2101 2102 /* 2103 * (isnew) is TRUE, pv is not terminal. 2104 * 2105 * (1) Add a wire count to the parent page table (pvp). 2106 * (2) Allocate a VM page for the page table. 2107 * (3) Enter the VM page into the parent page table. 2108 * 2109 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 
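 *
 * The new page is wired (the wire represents its entry in the
 * parent table) and taken out of normal paging management with
 * vm_page_unmanage(), since page table pages are never paged out.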
2110 */ 2111 if (pvp) 2112 vm_page_wire_quick(pvp->pv_m); 2113 2114 for (;;) { 2115 m = vm_page_alloc(NULL, pv->pv_pindex, 2116 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 2117 VM_ALLOC_INTERRUPT); 2118 if (m) 2119 break; 2120 vm_wait(0); 2121 } 2122 vm_page_wire(m); /* wire for mapping in parent */ 2123 vm_page_unmanage(m); /* m must be spinunlocked */ 2124 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 2125 m->valid = VM_PAGE_BITS_ALL; 2126 2127 vm_page_spin_lock(m); 2128 pmap_page_stats_adding(m); 2129 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2130 pv->pv_m = m; 2131 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 2132 vm_page_spin_unlock(m); 2133 2134 /* 2135 * (isnew) is TRUE, pv is not terminal. 2136 * 2137 * Wire the page into pvp. Bump the resident_count for the pmap. 2138 * There is no pvp for the top level, address the pm_pml4[] array 2139 * directly. 2140 * 2141 * If the caller wants the parent we return it, otherwise 2142 * we just put it away. 2143 * 2144 * No interlock is needed for pte 0 -> non-zero. 2145 * 2146 * In the situation where *ptep is valid we might have an unmanaged 2147 * page table page shared from another page table which we need to 2148 * unshare before installing our private page table page. 2149 */ 2150 if (pvp) { 2151 v = VM_PAGE_TO_PHYS(m) | 2152 (pmap->pmap_bits[PG_U_IDX] | 2153 pmap->pmap_bits[PG_RW_IDX] | 2154 pmap->pmap_bits[PG_V_IDX] | 2155 pmap->pmap_bits[PG_A_IDX] | 2156 pmap->pmap_bits[PG_M_IDX]); 2157 ptep = pv_pte_lookup(pvp, ptepindex); 2158 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2159 pt_entry_t pte; 2160 2161 if (ispt == 0) { 2162 panic("pmap_allocpte: unexpected pte %p/%d", 2163 pvp, (int)ptepindex); 2164 } 2165 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, v); 2166 if (vm_page_unwire_quick( 2167 PHYS_TO_VM_PAGE(pte & PG_FRAME))) { 2168 panic("pmap_allocpte: shared pgtable " 2169 "pg bad wirecount"); 2170 } 2171 } else { 2172 pt_entry_t pte; 2173 2174 pte = atomic_swap_long(ptep, v); 2175 if (pte != 0) { 2176 kprintf("install pgtbl mixup 0x%016jx " 2177 "old/new 0x%016jx/0x%016jx\n", 2178 (intmax_t)ptepindex, pte, v); 2179 } 2180 } 2181 } 2182 vm_page_wakeup(m); 2183 2184 /* 2185 * (isnew) may be TRUE or FALSE, pv may or may not be terminal. 2186 */ 2187 notnew: 2188 if (pvp) { 2189 KKASSERT(pvp->pv_m != NULL); 2190 ptep = pv_pte_lookup(pvp, ptepindex); 2191 v = VM_PAGE_TO_PHYS(pv->pv_m) | 2192 (pmap->pmap_bits[PG_U_IDX] | 2193 pmap->pmap_bits[PG_RW_IDX] | 2194 pmap->pmap_bits[PG_V_IDX] | 2195 pmap->pmap_bits[PG_A_IDX] | 2196 pmap->pmap_bits[PG_M_IDX]); 2197 if (*ptep != v) { 2198 kprintf("mismatched upper level pt %016jx/%016jx\n", 2199 *ptep, v); 2200 } 2201 } 2202 if (pvpp) 2203 *pvpp = pvp; 2204 else if (pvp) 2205 pv_put(pvp); 2206 return (pv); 2207 } 2208 2209 /* 2210 * This version of pmap_allocpte() checks for possible segment optimizations 2211 * that would allow page-table sharing. It can be called for terminal 2212 * page or page table page ptepindex's. 2213 * 2214 * The function is called with page table page ptepindex's for fictitious 2215 * and unmanaged terminal pages. That is, we don't want to allocate a 2216 * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL 2217 * for this case. 2218 * 2219 * This function can return a pv and *pvpp associated with the passed in pmap 2220 * OR a pv and *pvpp associated with the shared pmap. In the latter case 2221 * an unmanaged page table page will be entered into the pass in pmap. 
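 *
 * In short: for a suitably aligned VM_INHERIT_SHARE mapping the leaf
 * page table is allocated in a pmap hung off the backing VM object
 * (object->md.pmap_rw or pmap_ro), and that single PT page is then
 * wired into each process PD, so all processes mapping the segment
 * share one set of leaf PTEs.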
2222 */ 2223 static 2224 pv_entry_t 2225 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, 2226 vm_map_entry_t entry, vm_offset_t va) 2227 { 2228 vm_object_t object; 2229 pmap_t obpmap; 2230 pmap_t *obpmapp; 2231 vm_offset_t b; 2232 pv_entry_t pte_pv; /* in original or shared pmap */ 2233 pv_entry_t pt_pv; /* in original or shared pmap */ 2234 pv_entry_t proc_pd_pv; /* in original pmap */ 2235 pv_entry_t proc_pt_pv; /* in original pmap */ 2236 pv_entry_t xpv; /* PT in shared pmap */ 2237 pd_entry_t *pt; /* PT entry in PD of original pmap */ 2238 pd_entry_t opte; /* contents of *pt */ 2239 pd_entry_t npte; /* contents of *pt */ 2240 vm_page_t m; 2241 2242 /* 2243 * Basic tests, require a non-NULL vm_map_entry, require proper 2244 * alignment and type for the vm_map_entry, require that the 2245 * underlying object already be allocated. 2246 * 2247 * We allow almost any type of object to use this optimization. 2248 * The object itself does NOT have to be sized to a multiple of the 2249 * segment size, but the memory mapping does. 2250 * 2251 * XXX don't handle devices currently, because VM_PAGE_TO_PHYS() 2252 * won't work as expected. 2253 */ 2254 if (entry == NULL || 2255 pmap_mmu_optimize == 0 || /* not enabled */ 2256 (pmap->pm_flags & PMAP_HVM) || /* special pmap */ 2257 ptepindex >= pmap_pd_pindex(0) || /* not terminal or pt */ 2258 entry->inheritance != VM_INHERIT_SHARE || /* not shared */ 2259 entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ 2260 entry->object.vm_object == NULL || /* needs VM object */ 2261 entry->object.vm_object->type == OBJT_DEVICE || /* ick */ 2262 entry->object.vm_object->type == OBJT_MGTDEVICE || /* ick */ 2263 (entry->offset & SEG_MASK) || /* must be aligned */ 2264 (entry->start & SEG_MASK)) { 2265 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2266 } 2267 2268 /* 2269 * Make sure the full segment can be represented. 2270 */ 2271 b = va & ~(vm_offset_t)SEG_MASK; 2272 if (b < entry->start || b + SEG_SIZE > entry->end) 2273 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2274 2275 /* 2276 * If the full segment can be represented dive the VM object's 2277 * shared pmap, allocating as required. 2278 */ 2279 object = entry->object.vm_object; 2280 2281 if (entry->protection & VM_PROT_WRITE) 2282 obpmapp = &object->md.pmap_rw; 2283 else 2284 obpmapp = &object->md.pmap_ro; 2285 2286 #ifdef PMAP_DEBUG2 2287 if (pmap_enter_debug > 0) { 2288 --pmap_enter_debug; 2289 kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p " 2290 "obpmapp %p %p\n", 2291 va, entry->protection, object, 2292 obpmapp, *obpmapp); 2293 kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n", 2294 entry, entry->start, entry->end); 2295 } 2296 #endif 2297 2298 /* 2299 * We allocate what appears to be a normal pmap but because portions 2300 * of this pmap are shared with other unrelated pmaps we have to 2301 * set pm_active to point to all cpus. 2302 * 2303 * XXX Currently using pmap_spin to interlock the update, can't use 2304 * vm_object_hold/drop because the token might already be held 2305 * shared OR exclusive and we don't know. 
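 *
 * The loop below optimistically allocates and initializes a
 * candidate pmap outside the spinlock, then either installs it or,
 * if another thread raced us and installed one first, releases the
 * candidate and uses the winner.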
2306 */ 2307 while ((obpmap = *obpmapp) == NULL) { 2308 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); 2309 pmap_pinit_simple(obpmap); 2310 pmap_pinit2(obpmap); 2311 spin_lock(&pmap_spin); 2312 if (*obpmapp != NULL) { 2313 /* 2314 * Handle race 2315 */ 2316 spin_unlock(&pmap_spin); 2317 pmap_release(obpmap); 2318 pmap_puninit(obpmap); 2319 kfree(obpmap, M_OBJPMAP); 2320 obpmap = *obpmapp; /* safety */ 2321 } else { 2322 obpmap->pm_active = smp_active_mask; 2323 obpmap->pm_flags |= PMAP_SEGSHARED; 2324 *obpmapp = obpmap; 2325 spin_unlock(&pmap_spin); 2326 } 2327 } 2328 2329 /* 2330 * Layering is: PTE, PT, PD, PDP, PML4. We have to return the 2331 * pte/pt using the shared pmap from the object but also adjust 2332 * the process pmap's page table page as a side effect. 2333 */ 2334 2335 /* 2336 * Resolve the terminal PTE and PT in the shared pmap. This is what 2337 * we will return. This is true if ptepindex represents a terminal 2338 * page, otherwise pte_pv is actually the PT and pt_pv is actually 2339 * the PD. 2340 */ 2341 pt_pv = NULL; 2342 pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); 2343 retry: 2344 if (ptepindex >= pmap_pt_pindex(0)) 2345 xpv = pte_pv; 2346 else 2347 xpv = pt_pv; 2348 2349 /* 2350 * Resolve the PD in the process pmap so we can properly share the 2351 * page table page. Lock order is bottom-up (leaf first)! 2352 * 2353 * NOTE: proc_pt_pv can be NULL. 2354 */ 2355 proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b), NULL); 2356 proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); 2357 #ifdef PMAP_DEBUG2 2358 if (pmap_enter_debug > 0) { 2359 --pmap_enter_debug; 2360 kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n", 2361 proc_pt_pv, 2362 (proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1), 2363 proc_pd_pv, 2364 va); 2365 } 2366 #endif 2367 2368 /* 2369 * xpv is the page table page pv from the shared object 2370 * (for convenience), from above. 2371 * 2372 * Calculate the pte value for the PT to load into the process PD. 2373 * If we have to change it we must properly dispose of the previous 2374 * entry. 2375 */ 2376 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2377 npte = VM_PAGE_TO_PHYS(xpv->pv_m) | 2378 (pmap->pmap_bits[PG_U_IDX] | 2379 pmap->pmap_bits[PG_RW_IDX] | 2380 pmap->pmap_bits[PG_V_IDX] | 2381 pmap->pmap_bits[PG_A_IDX] | 2382 pmap->pmap_bits[PG_M_IDX]); 2383 2384 /* 2385 * Dispose of previous page table page if it was local to the 2386 * process pmap. If the old pt is not empty we cannot dispose of it 2387 * until we clean it out. This case should not arise very often so 2388 * it is not optimized. 2389 * 2390 * Leave pt_pv and pte_pv (in our object pmap) locked and intact 2391 * for the retry. 2392 */ 2393 if (proc_pt_pv) { 2394 pmap_inval_bulk_t bulk; 2395 2396 if (proc_pt_pv->pv_m->wire_count != 1) { 2397 pv_put(proc_pd_pv); 2398 pv_put(proc_pt_pv); 2399 pmap_remove(pmap, 2400 va & ~(vm_offset_t)SEG_MASK, 2401 (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK); 2402 goto retry; 2403 } 2404 2405 /* 2406 * The release call will indirectly clean out *pt 2407 */ 2408 pmap_inval_bulk_init(&bulk, proc_pt_pv->pv_pmap); 2409 pmap_release_pv(proc_pt_pv, proc_pd_pv, &bulk); 2410 pmap_inval_bulk_flush(&bulk); 2411 proc_pt_pv = NULL; 2412 /* relookup */ 2413 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2414 } 2415 2416 /* 2417 * Handle remaining cases. 
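 *
 * (1) *pt is empty: install the shared PT, wiring both the shared
 *     PT page and the process PD page.
 * (2) *pt is valid but different: replace it with an SMP
 *     invalidation and unwire the page it previously pointed at.
 * (3) *pt already equals npte: nothing to do.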
2418 */ 2419 if (*pt == 0) { 2420 atomic_swap_long(pt, npte); 2421 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2422 vm_page_wire_quick(proc_pd_pv->pv_m); /* proc pd for sh pt */ 2423 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2424 } else if (*pt != npte) { 2425 opte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, pt, npte); 2426 2427 #if 0 2428 opte = pte_load_clear(pt); 2429 KKASSERT(opte && opte != npte); 2430 2431 *pt = npte; 2432 #endif 2433 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2434 2435 /* 2436 * Clean up opte, bump the wire_count for the process 2437 * PD page representing the new entry if it was 2438 * previously empty. 2439 * 2440 * If the entry was not previously empty and we have 2441 * a PT in the proc pmap then opte must match that 2442 * pt. The proc pt must be retired (this is done 2443 * later on in this procedure). 2444 * 2445 * NOTE: replacing valid pte, wire_count on proc_pd_pv 2446 * stays the same. 2447 */ 2448 KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]); 2449 m = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2450 if (vm_page_unwire_quick(m)) { 2451 panic("pmap_allocpte_seg: " 2452 "bad wire count %p", 2453 m); 2454 } 2455 } 2456 2457 /* 2458 * The existing process page table was replaced and must be destroyed 2459 * here. 2460 */ 2461 if (proc_pd_pv) 2462 pv_put(proc_pd_pv); 2463 if (pvpp) 2464 *pvpp = pt_pv; 2465 else 2466 pv_put(pt_pv); 2467 2468 return (pte_pv); 2469 } 2470 2471 /* 2472 * Release any resources held by the given physical map. 2473 * 2474 * Called when a pmap initialized by pmap_pinit is being released. Should 2475 * only be called if the map contains no valid mappings. 2476 */ 2477 struct pmap_release_info { 2478 pmap_t pmap; 2479 int retry; 2480 pv_entry_t pvp; 2481 }; 2482 2483 static int pmap_release_callback(pv_entry_t pv, void *data); 2484 2485 void 2486 pmap_release(struct pmap *pmap) 2487 { 2488 struct pmap_release_info info; 2489 2490 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2491 ("pmap still active! %016jx", 2492 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2493 2494 /* 2495 * There is no longer a pmap_list, if there were we would remove the 2496 * pmap from it here. 2497 */ 2498 2499 /* 2500 * Pull pv's off the RB tree in order from low to high and release 2501 * each page. 2502 */ 2503 info.pmap = pmap; 2504 do { 2505 info.retry = 0; 2506 info.pvp = NULL; 2507 2508 spin_lock(&pmap->pm_spin); 2509 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2510 pmap_release_callback, &info); 2511 spin_unlock(&pmap->pm_spin); 2512 2513 if (info.pvp) 2514 pv_put(info.pvp); 2515 } while (info.retry); 2516 2517 2518 /* 2519 * One resident page (the pml4 page) should remain. 2520 * No wired pages should remain. 2521 */ 2522 #if 1 2523 if (pmap->pm_stats.resident_count != 2524 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1) || 2525 pmap->pm_stats.wired_count != 0) { 2526 kprintf("fatal pmap problem - pmap %p flags %08x " 2527 "rescnt=%jd wirecnt=%jd\n", 2528 pmap, 2529 pmap->pm_flags, 2530 pmap->pm_stats.resident_count, 2531 pmap->pm_stats.wired_count); 2532 tsleep(pmap, 0, "DEAD", 0); 2533 } 2534 #else 2535 KKASSERT(pmap->pm_stats.resident_count == 2536 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1)); 2537 KKASSERT(pmap->pm_stats.wired_count == 0); 2538 #endif 2539 } 2540 2541 /* 2542 * Called from low to high. We must cache the proper parent pv so we 2543 * can adjust its wired count. 
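 *
 * For each pv we derive the parent's pindex (mirroring the layout
 * used by pmap_allocpte()) and keep that parent held and locked in
 * info->pvp across successive callbacks so consecutive children
 * under the same parent can reuse it.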
2544 */ 2545 static int 2546 pmap_release_callback(pv_entry_t pv, void *data) 2547 { 2548 struct pmap_release_info *info = data; 2549 pmap_t pmap = info->pmap; 2550 vm_pindex_t pindex; 2551 int r; 2552 2553 /* 2554 * Acquire a held and locked pv, check for release race 2555 */ 2556 pindex = pv->pv_pindex; 2557 if (info->pvp == pv) { 2558 spin_unlock(&pmap->pm_spin); 2559 info->pvp = NULL; 2560 } else if (pv_hold_try(pv)) { 2561 spin_unlock(&pmap->pm_spin); 2562 } else { 2563 spin_unlock(&pmap->pm_spin); 2564 pv_lock(pv); 2565 pv_put(pv); 2566 info->retry = 1; 2567 spin_lock(&pmap->pm_spin); 2568 2569 return -1; 2570 } 2571 KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex); 2572 2573 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2574 /* 2575 * I am PTE, parent is PT 2576 */ 2577 pindex = pv->pv_pindex >> NPTEPGSHIFT; 2578 pindex += NUPTE_TOTAL; 2579 } else if (pv->pv_pindex < pmap_pd_pindex(0)) { 2580 /* 2581 * I am PT, parent is PD 2582 */ 2583 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; 2584 pindex += NUPTE_TOTAL + NUPT_TOTAL; 2585 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { 2586 /* 2587 * I am PD, parent is PDP 2588 */ 2589 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> 2590 NPDPEPGSHIFT; 2591 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2592 } else if (pv->pv_pindex < pmap_pml4_pindex()) { 2593 /* 2594 * I am PDP, parent is PML4 (there's only one) 2595 */ 2596 #if 0 2597 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL - 2598 NUPD_TOTAL) >> NPML4EPGSHIFT; 2599 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL; 2600 #endif 2601 pindex = pmap_pml4_pindex(); 2602 } else { 2603 /* 2604 * parent is NULL 2605 */ 2606 if (info->pvp) { 2607 pv_put(info->pvp); 2608 info->pvp = NULL; 2609 } 2610 pindex = 0; 2611 } 2612 if (pindex) { 2613 if (info->pvp && info->pvp->pv_pindex != pindex) { 2614 pv_put(info->pvp); 2615 info->pvp = NULL; 2616 } 2617 if (info->pvp == NULL) 2618 info->pvp = pv_get(pmap, pindex, NULL); 2619 } else { 2620 if (info->pvp) { 2621 pv_put(info->pvp); 2622 info->pvp = NULL; 2623 } 2624 } 2625 r = pmap_release_pv(pv, info->pvp, NULL); 2626 spin_lock(&pmap->pm_spin); 2627 2628 return(r); 2629 } 2630 2631 /* 2632 * Called with held (i.e. also locked) pv. This function will dispose of 2633 * the lock along with the pv. 2634 * 2635 * If the caller already holds the locked parent page table for pv it 2636 * must pass it as pvp, allowing us to avoid a deadlock, else it can 2637 * pass NULL for pvp. 2638 */ 2639 static int 2640 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2641 { 2642 vm_page_t p; 2643 2644 /* 2645 * The pmap is currently not spinlocked, pv is held+locked. 2646 * Remove the pv's page from its parent's page table. The 2647 * parent's page table page's wire_count will be decremented. 2648 * 2649 * This will clean out the pte at any level of the page table. 2650 * If smp != 0 all cpus are affected. 2651 * 2652 * Do not tear-down recursively, its faster to just let the 2653 * release run its course. 2654 */ 2655 pmap_remove_pv_pte(pv, pvp, bulk, 0); 2656 2657 /* 2658 * Terminal pvs are unhooked from their vm_pages. Because 2659 * terminal pages aren't page table pages they aren't wired 2660 * by us, so we have to be sure not to unwire them either. 
2661 */
2662 if (pv->pv_pindex < pmap_pt_pindex(0)) {
2663 pmap_remove_pv_page(pv);
2664 goto skip;
2665 }
2666
2667 /*
2668 * We leave the top-level page table page cached, wired, and
2669 * mapped in the pmap until the dtor function (pmap_puninit())
2670 * gets called.
2671 *
2672 * Since we are leaving the top-level pv intact we need
2673 * to break out of what would otherwise be an infinite loop.
2674 */
2675 if (pv->pv_pindex == pmap_pml4_pindex()) {
2676 pv_put(pv);
2677 return(-1);
2678 }
2679
2680 /*
2681 * For page table pages (other than the top-level page),
2682 * remove and free the vm_page. The representative mapping
2683 * removed above by pmap_remove_pv_pte() did not undo the
2684 * last wire_count so we have to do that as well.
2685 */
2686 p = pmap_remove_pv_page(pv);
2687 vm_page_busy_wait(p, FALSE, "pmaprl");
2688 if (p->wire_count != 1) {
2689 kprintf("p->wire_count was %016lx %d\n",
2690 pv->pv_pindex, p->wire_count);
2691 }
2692 KKASSERT(p->wire_count == 1);
2693 KKASSERT(p->flags & PG_UNMANAGED);
2694
2695 vm_page_unwire(p, 0);
2696 KKASSERT(p->wire_count == 0);
2697
2698 vm_page_free(p);
2699 skip:
2700 pv_free(pv, pvp);
2701
2702 return 0;
2703 }
2704
2705 /*
2706 * This function will remove the pte associated with a pv from its parent.
2707 * Terminal pv's are supported. All cpus specified by (bulk) are properly
2708 * invalidated.
2709 *
2710 * The wire count will be dropped on the parent page table. The wire
2711 * count on the page being removed (pv->pv_m) from the parent page table
2712 * is NOT touched. Note that terminal pages will not have any additional
2713 * wire counts while page table pages will have at least one representing
2714 * the mapping, plus others representing sub-mappings.
2715 *
2716 * NOTE: Cannot be called on kernel page table pages, only KVM terminal
2717 * pages and user page table and terminal pages.
2718 *
2719 * NOTE: The pte being removed might be unmanaged, and the pv supplied might
2720 * be freshly allocated and not imply that the pte is managed. In this
2721 * case pv->pv_m should be NULL.
2722 *
2723 * The pv must be locked. The pvp, if supplied, must be locked. All
2724 * supplied pv's will remain locked on return.
2725 *
2726 * XXX must lock parent pv's if they exist to remove pte XXX
2727 */
2728 static
2729 void
2730 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
2731 int destroy)
2732 {
2733 vm_pindex_t ptepindex = pv->pv_pindex;
2734 pmap_t pmap = pv->pv_pmap;
2735 vm_page_t p;
2736 int gotpvp = 0;
2737
2738 KKASSERT(pmap);
2739
2740 if (ptepindex == pmap_pml4_pindex()) {
2741 /*
2742 * We are the top level PML4E table, there is no parent.
2743 */
2744 p = pmap->pm_pmlpv->pv_m;
2745 KKASSERT(pv->pv_m == p); /* debugging */
2746 } else if (ptepindex >= pmap_pdp_pindex(0)) {
2747 /*
2748 * Remove a PDP page from the PML4E. This can only occur
2749 * with user page tables. We do not have to lock the
2750 * pml4 PV so just ignore pvp.
2751 */ 2752 vm_pindex_t pml4_pindex; 2753 vm_pindex_t pdp_index; 2754 pml4_entry_t *pdp; 2755 2756 pdp_index = ptepindex - pmap_pdp_pindex(0); 2757 if (pvp == NULL) { 2758 pml4_pindex = pmap_pml4_pindex(); 2759 pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL); 2760 KKASSERT(pvp); 2761 gotpvp = 1; 2762 } 2763 2764 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; 2765 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0); 2766 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2767 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0); 2768 KKASSERT(pv->pv_m == p); /* debugging */ 2769 } else if (ptepindex >= pmap_pd_pindex(0)) { 2770 /* 2771 * Remove a PD page from the PDP 2772 * 2773 * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case 2774 * of a simple pmap because it stops at 2775 * the PD page. 2776 */ 2777 vm_pindex_t pdp_pindex; 2778 vm_pindex_t pd_index; 2779 pdp_entry_t *pd; 2780 2781 pd_index = ptepindex - pmap_pd_pindex(0); 2782 2783 if (pvp == NULL) { 2784 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 2785 (pd_index >> NPML4EPGSHIFT); 2786 pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL); 2787 gotpvp = 1; 2788 } 2789 2790 if (pvp) { 2791 pd = pv_pte_lookup(pvp, pd_index & 2792 ((1ul << NPDPEPGSHIFT) - 1)); 2793 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); 2794 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2795 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); 2796 } else { 2797 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 2798 p = pv->pv_m; /* degenerate test later */ 2799 } 2800 KKASSERT(pv->pv_m == p); /* debugging */ 2801 } else if (ptepindex >= pmap_pt_pindex(0)) { 2802 /* 2803 * Remove a PT page from the PD 2804 */ 2805 vm_pindex_t pd_pindex; 2806 vm_pindex_t pt_index; 2807 pd_entry_t *pt; 2808 2809 pt_index = ptepindex - pmap_pt_pindex(0); 2810 2811 if (pvp == NULL) { 2812 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 2813 (pt_index >> NPDPEPGSHIFT); 2814 pvp = pv_get(pv->pv_pmap, pd_pindex, NULL); 2815 KKASSERT(pvp); 2816 gotpvp = 1; 2817 } 2818 2819 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 2820 #if 0 2821 KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0, 2822 ("*pt unexpectedly invalid %016jx " 2823 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p", 2824 *pt, gotpvp, ptepindex, pt_index, pv, pvp)); 2825 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 2826 #else 2827 if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) { 2828 kprintf("*pt unexpectedly invalid %016jx " 2829 "gotpvp=%d ptepindex=%ld ptindex=%ld " 2830 "pv=%p pvp=%p\n", 2831 *pt, gotpvp, ptepindex, pt_index, pv, pvp); 2832 tsleep(pt, 0, "DEAD", 0); 2833 p = pv->pv_m; 2834 } else { 2835 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 2836 } 2837 #endif 2838 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); 2839 KKASSERT(pv->pv_m == p); /* debugging */ 2840 } else { 2841 /* 2842 * Remove a PTE from the PT page. The PV might exist even if 2843 * the PTE is not managed, in whichcase pv->pv_m should be 2844 * NULL. 2845 * 2846 * NOTE: Userland pmaps manage the parent PT/PD/PDP page 2847 * table pages but the kernel_pmap does not. 2848 * 2849 * NOTE: pv's must be locked bottom-up to avoid deadlocking. 2850 * pv is a pte_pv so we can safely lock pt_pv. 2851 * 2852 * NOTE: FICTITIOUS pages may have multiple physical mappings 2853 * so PHYS_TO_VM_PAGE() will not necessarily work for 2854 * terminal ptes. 
2855 */ 2856 vm_pindex_t pt_pindex; 2857 pt_entry_t *ptep; 2858 pt_entry_t pte; 2859 vm_offset_t va; 2860 2861 pt_pindex = ptepindex >> NPTEPGSHIFT; 2862 va = (vm_offset_t)ptepindex << PAGE_SHIFT; 2863 2864 if (ptepindex >= NUPTE_USER) { 2865 ptep = vtopte(ptepindex << PAGE_SHIFT); 2866 KKASSERT(pvp == NULL); 2867 /* pvp remains NULL */ 2868 } else { 2869 if (pvp == NULL) { 2870 pt_pindex = NUPTE_TOTAL + 2871 (ptepindex >> NPDPEPGSHIFT); 2872 pvp = pv_get(pv->pv_pmap, pt_pindex, NULL); 2873 KKASSERT(pvp); 2874 gotpvp = 1; 2875 } 2876 ptep = pv_pte_lookup(pvp, ptepindex & 2877 ((1ul << NPDPEPGSHIFT) - 1)); 2878 } 2879 pte = pmap_inval_bulk(bulk, va, ptep, 0); 2880 if (bulk == NULL) /* XXX */ 2881 cpu_invlpg((void *)va); /* XXX */ 2882 2883 /* 2884 * Now update the vm_page_t 2885 */ 2886 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) && 2887 (pte & pmap->pmap_bits[PG_V_IDX])) { 2888 /* 2889 * Valid managed page, adjust (p). 2890 */ 2891 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) { 2892 p = pv->pv_m; 2893 } else { 2894 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2895 KKASSERT(pv->pv_m == p); 2896 } 2897 if (pte & pmap->pmap_bits[PG_M_IDX]) { 2898 if (pmap_track_modified(ptepindex)) 2899 vm_page_dirty(p); 2900 } 2901 if (pte & pmap->pmap_bits[PG_A_IDX]) { 2902 vm_page_flag_set(p, PG_REFERENCED); 2903 } 2904 } else { 2905 /* 2906 * Unmanaged page, do not try to adjust the vm_page_t. 2907 * pv could be freshly allocated for a pmap_enter(), 2908 * replacing an unmanaged page with a managed one. 2909 * 2910 * pv->pv_m might reflect the new page and not the 2911 * existing page. 2912 * 2913 * We could extract p from the physical address and 2914 * adjust it but we explicitly do not for unmanaged 2915 * pages. 2916 */ 2917 p = NULL; 2918 } 2919 if (pte & pmap->pmap_bits[PG_W_IDX]) 2920 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2921 if (pte & pmap->pmap_bits[PG_G_IDX]) 2922 cpu_invlpg((void *)va); 2923 } 2924 2925 /* 2926 * If requested, scrap the underlying pv->pv_m and the underlying 2927 * pv. If this is a page-table-page we must also free the page. 2928 * 2929 * pvp must be returned locked. 2930 */ 2931 if (destroy == 1) { 2932 /* 2933 * page table page (PT, PD, PDP, PML4), caller was responsible 2934 * for testing wired_count. 2935 */ 2936 KKASSERT(pv->pv_m->wire_count == 1); 2937 p = pmap_remove_pv_page(pv); 2938 pv_free(pv, pvp); 2939 pv = NULL; 2940 2941 vm_page_busy_wait(p, FALSE, "pgpun"); 2942 vm_page_unwire(p, 0); 2943 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2944 vm_page_free(p); 2945 } else if (destroy == 2) { 2946 /* 2947 * Normal page, remove from pmap and leave the underlying 2948 * page untouched. 2949 */ 2950 pmap_remove_pv_page(pv); 2951 pv_free(pv, pvp); 2952 pv = NULL; /* safety */ 2953 } 2954 2955 /* 2956 * If we acquired pvp ourselves then we are responsible for 2957 * recursively deleting it. 2958 */ 2959 if (pvp && gotpvp) { 2960 /* 2961 * Recursively destroy higher-level page tables. 2962 * 2963 * This is optional. If we do not, they will still 2964 * be destroyed when the process exits. 2965 * 2966 * NOTE: Do not destroy pv_entry's with extra hold refs, 2967 * a caller may have unlocked it and intends to 2968 * continue to use it. 
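 *
 * A pv_hold count of exactly 2 means only our hold plus the
 * implicit hold for the pmap association remain, i.e. no other
 * caller is referencing pvp and it is safe to tear down.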
2969 */ 2970 if (pmap_dynamic_delete && 2971 pvp->pv_m && 2972 pvp->pv_m->wire_count == 1 && 2973 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 2974 pvp->pv_pindex != pmap_pml4_pindex()) { 2975 if (pmap_dynamic_delete == 2) 2976 kprintf("A %jd %08x\n", pvp->pv_pindex, pvp->pv_hold); 2977 if (pmap != &kernel_pmap) { 2978 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 2979 pvp = NULL; /* safety */ 2980 } else { 2981 kprintf("Attempt to remove kernel_pmap pindex " 2982 "%jd\n", pvp->pv_pindex); 2983 pv_put(pvp); 2984 } 2985 } else { 2986 pv_put(pvp); 2987 } 2988 } 2989 } 2990 2991 /* 2992 * Remove the vm_page association to a pv. The pv must be locked. 2993 */ 2994 static 2995 vm_page_t 2996 pmap_remove_pv_page(pv_entry_t pv) 2997 { 2998 vm_page_t m; 2999 3000 m = pv->pv_m; 3001 vm_page_spin_lock(m); 3002 KKASSERT(m && m == pv->pv_m); 3003 pv->pv_m = NULL; 3004 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3005 pmap_page_stats_deleting(m); 3006 if (TAILQ_EMPTY(&m->md.pv_list)) 3007 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3008 vm_page_spin_unlock(m); 3009 3010 return(m); 3011 } 3012 3013 /* 3014 * Grow the number of kernel page table entries, if needed. 3015 * 3016 * This routine is always called to validate any address space 3017 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 3018 * space below KERNBASE. 3019 * 3020 * kernel_map must be locked exclusively by the caller. 3021 */ 3022 void 3023 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 3024 { 3025 vm_paddr_t paddr; 3026 vm_offset_t ptppaddr; 3027 vm_page_t nkpg; 3028 pd_entry_t *pt, newpt; 3029 pdp_entry_t newpd; 3030 int update_kernel_vm_end; 3031 3032 /* 3033 * bootstrap kernel_vm_end on first real VM use 3034 */ 3035 if (kernel_vm_end == 0) { 3036 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 3037 nkpt = 0; 3038 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3039 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 3040 ~(PAGE_SIZE * NPTEPG - 1); 3041 nkpt++; 3042 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 3043 kernel_vm_end = kernel_map.max_offset; 3044 break; 3045 } 3046 } 3047 } 3048 3049 /* 3050 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 3051 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 3052 * do not want to force-fill 128G worth of page tables. 
3053 */ 3054 if (kstart < KERNBASE) { 3055 if (kstart > kernel_vm_end) 3056 kstart = kernel_vm_end; 3057 KKASSERT(kend <= KERNBASE); 3058 update_kernel_vm_end = 1; 3059 } else { 3060 update_kernel_vm_end = 0; 3061 } 3062 3063 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 3064 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 3065 3066 if (kend - 1 >= kernel_map.max_offset) 3067 kend = kernel_map.max_offset; 3068 3069 while (kstart < kend) { 3070 pt = pmap_pt(&kernel_pmap, kstart); 3071 if (pt == NULL) { 3072 /* We need a new PD entry */ 3073 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3074 VM_ALLOC_NORMAL | 3075 VM_ALLOC_SYSTEM | 3076 VM_ALLOC_INTERRUPT); 3077 if (nkpg == NULL) { 3078 panic("pmap_growkernel: no memory to grow " 3079 "kernel"); 3080 } 3081 paddr = VM_PAGE_TO_PHYS(nkpg); 3082 pmap_zero_page(paddr); 3083 newpd = (pdp_entry_t) 3084 (paddr | 3085 kernel_pmap.pmap_bits[PG_V_IDX] | 3086 kernel_pmap.pmap_bits[PG_RW_IDX] | 3087 kernel_pmap.pmap_bits[PG_A_IDX] | 3088 kernel_pmap.pmap_bits[PG_M_IDX]); 3089 *pmap_pd(&kernel_pmap, kstart) = newpd; 3090 continue; /* try again */ 3091 } 3092 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3093 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3094 ~(PAGE_SIZE * NPTEPG - 1); 3095 if (kstart - 1 >= kernel_map.max_offset) { 3096 kstart = kernel_map.max_offset; 3097 break; 3098 } 3099 continue; 3100 } 3101 3102 /* 3103 * We need a new PT 3104 * 3105 * This index is bogus, but out of the way 3106 */ 3107 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3108 VM_ALLOC_NORMAL | 3109 VM_ALLOC_SYSTEM | 3110 VM_ALLOC_INTERRUPT); 3111 if (nkpg == NULL) 3112 panic("pmap_growkernel: no memory to grow kernel"); 3113 3114 vm_page_wire(nkpg); 3115 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 3116 pmap_zero_page(ptppaddr); 3117 newpt = (pd_entry_t)(ptppaddr | 3118 kernel_pmap.pmap_bits[PG_V_IDX] | 3119 kernel_pmap.pmap_bits[PG_RW_IDX] | 3120 kernel_pmap.pmap_bits[PG_A_IDX] | 3121 kernel_pmap.pmap_bits[PG_M_IDX]); 3122 atomic_swap_long(pmap_pt(&kernel_pmap, kstart), newpt); 3123 3124 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3125 ~(PAGE_SIZE * NPTEPG - 1); 3126 3127 if (kstart - 1 >= kernel_map.max_offset) { 3128 kstart = kernel_map.max_offset; 3129 break; 3130 } 3131 } 3132 3133 /* 3134 * Only update kernel_vm_end for areas below KERNBASE. 3135 */ 3136 if (update_kernel_vm_end && kernel_vm_end < kstart) 3137 kernel_vm_end = kstart; 3138 } 3139 3140 /* 3141 * Add a reference to the specified pmap. 3142 */ 3143 void 3144 pmap_reference(pmap_t pmap) 3145 { 3146 if (pmap != NULL) 3147 atomic_add_int(&pmap->pm_count, 1); 3148 } 3149 3150 /*************************************************** 3151 * page management routines. 3152 ***************************************************/ 3153 3154 /* 3155 * Hold a pv without locking it 3156 */ 3157 static void 3158 pv_hold(pv_entry_t pv) 3159 { 3160 atomic_add_int(&pv->pv_hold, 1); 3161 } 3162 3163 /* 3164 * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv 3165 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3166 * the pv properly. 3167 * 3168 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3169 * pv list via its page) must be held by the caller in order to stabilize 3170 * the pv. 3171 */ 3172 static int 3173 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3174 { 3175 u_int count; 3176 3177 /* 3178 * Critical path shortcut expects pv to already have one ref 3179 * (for the pv->pv_pmap). 
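 *
 * pv->pv_hold packs the hold count (PV_HOLD_MASK) together with the
 * PV_HOLD_LOCKED and PV_HOLD_WAITING flag bits, so the cmpset from
 * 1 to (PV_HOLD_LOCKED | 2) below both adds our hold and acquires
 * the lock in a single atomic operation.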
3180 */ 3181 if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) { 3182 #ifdef PMAP_DEBUG 3183 pv->pv_func = func; 3184 pv->pv_line = lineno; 3185 #endif 3186 return TRUE; 3187 } 3188 3189 for (;;) { 3190 count = pv->pv_hold; 3191 cpu_ccfence(); 3192 if ((count & PV_HOLD_LOCKED) == 0) { 3193 if (atomic_cmpset_int(&pv->pv_hold, count, 3194 (count + 1) | PV_HOLD_LOCKED)) { 3195 #ifdef PMAP_DEBUG 3196 pv->pv_func = func; 3197 pv->pv_line = lineno; 3198 #endif 3199 return TRUE; 3200 } 3201 } else { 3202 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) 3203 return FALSE; 3204 } 3205 /* retry */ 3206 } 3207 } 3208 3209 /* 3210 * Drop a previously held pv_entry which could not be locked, allowing its 3211 * destruction. 3212 * 3213 * Must not be called with a spinlock held as we might zfree() the pv if it 3214 * is no longer associated with a pmap and this was the last hold count. 3215 */ 3216 static void 3217 pv_drop(pv_entry_t pv) 3218 { 3219 u_int count; 3220 3221 for (;;) { 3222 count = pv->pv_hold; 3223 cpu_ccfence(); 3224 KKASSERT((count & PV_HOLD_MASK) > 0); 3225 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 3226 (PV_HOLD_LOCKED | 1)); 3227 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 3228 if ((count & PV_HOLD_MASK) == 1) { 3229 #ifdef PMAP_DEBUG2 3230 if (pmap_enter_debug > 0) { 3231 --pmap_enter_debug; 3232 kprintf("pv_drop: free pv %p\n", pv); 3233 } 3234 #endif 3235 KKASSERT(count == 1); 3236 KKASSERT(pv->pv_pmap == NULL); 3237 zfree(pvzone, pv); 3238 } 3239 return; 3240 } 3241 /* retry */ 3242 } 3243 } 3244 3245 /* 3246 * Find or allocate the requested PV entry, returning a locked, held pv. 3247 * 3248 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 3249 * for the caller and one representing the pmap and vm_page association. 3250 * 3251 * If (*isnew) is zero, the returned pv will have only one hold count. 3252 * 3253 * Since both associations can only be adjusted while the pv is locked, 3254 * together they represent just one additional hold. 3255 */ 3256 static 3257 pv_entry_t 3258 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 3259 { 3260 pv_entry_t pv; 3261 pv_entry_t pnew; 3262 struct mdglobaldata *md = mdcpu; 3263 3264 pnew = NULL; 3265 if (md->gd_newpv) { 3266 #if 0 3267 pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL); 3268 #else 3269 crit_enter(); 3270 pnew = md->gd_newpv; /* might race NULL */ 3271 md->gd_newpv = NULL; 3272 crit_exit(); 3273 #endif 3274 } 3275 if (pnew == NULL) 3276 pnew = zalloc(pvzone); 3277 3278 spin_lock(&pmap->pm_spin); 3279 for (;;) { 3280 /* 3281 * Shortcut cache 3282 */ 3283 pv = pmap->pm_pvhint; 3284 cpu_ccfence(); 3285 if (pv == NULL || 3286 pv->pv_pmap != pmap || 3287 pv->pv_pindex != pindex) { 3288 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 3289 pindex); 3290 } 3291 if (pv == NULL) { 3292 vm_pindex_t *pmark; 3293 3294 /* 3295 * We need to block if someone is holding our 3296 * placemarker. As long as we determine the 3297 * placemarker has not been aquired we do not 3298 * need to get it as acquision also requires 3299 * the pmap spin lock. 3300 * 3301 * However, we can race the wakeup. 
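 *
 * To close that race we set PM_PLACEMARK_WAKEUP, tsleep_interlock()
 * on the placemarker, and re-test that it still matches our pindex
 * before actually sleeping; a stale wakeup simply retries the
 * surrounding loop.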
3302 */ 3303 pmark = pmap_placemarker_hash(pmap, pindex); 3304 3305 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3306 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3307 tsleep_interlock(pmark, 0); 3308 if (((*pmark ^ pindex) & 3309 ~PM_PLACEMARK_WAKEUP) == 0) { 3310 spin_unlock(&pmap->pm_spin); 3311 tsleep(pmark, PINTERLOCKED, "pvplc", 0); 3312 spin_lock(&pmap->pm_spin); 3313 } 3314 continue; 3315 } 3316 3317 /* 3318 * Setup the new entry 3319 */ 3320 pnew->pv_pmap = pmap; 3321 pnew->pv_pindex = pindex; 3322 pnew->pv_hold = PV_HOLD_LOCKED | 2; 3323 #ifdef PMAP_DEBUG 3324 pnew->pv_func = func; 3325 pnew->pv_line = lineno; 3326 if (pnew->pv_line_lastfree > 0) { 3327 pnew->pv_line_lastfree = 3328 -pnew->pv_line_lastfree; 3329 } 3330 #endif 3331 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 3332 atomic_add_long(&pmap->pm_stats.resident_count, 1); 3333 spin_unlock(&pmap->pm_spin); 3334 *isnew = 1; 3335 3336 KKASSERT(pv == NULL); 3337 return(pnew); 3338 } 3339 3340 /* 3341 * We already have an entry, cleanup the staged pnew if 3342 * we can get the lock, otherwise block and retry. 3343 */ 3344 if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) { 3345 spin_unlock(&pmap->pm_spin); 3346 #if 0 3347 pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew); 3348 if (pnew) 3349 zfree(pvzone, pnew); 3350 #else 3351 crit_enter(); 3352 if (md->gd_newpv == NULL) 3353 md->gd_newpv = pnew; 3354 else 3355 zfree(pvzone, pnew); 3356 crit_exit(); 3357 #endif 3358 KKASSERT(pv->pv_pmap == pmap && 3359 pv->pv_pindex == pindex); 3360 *isnew = 0; 3361 return(pv); 3362 } 3363 spin_unlock(&pmap->pm_spin); 3364 _pv_lock(pv PMAP_DEBUG_COPY); 3365 pv_put(pv); 3366 spin_lock(&pmap->pm_spin); 3367 } 3368 /* NOT REACHED */ 3369 } 3370 3371 /* 3372 * Find the requested PV entry, returning a locked+held pv or NULL 3373 */ 3374 static 3375 pv_entry_t 3376 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL) 3377 { 3378 pv_entry_t pv; 3379 3380 spin_lock(&pmap->pm_spin); 3381 for (;;) { 3382 /* 3383 * Shortcut cache 3384 */ 3385 pv = pmap->pm_pvhint; 3386 cpu_ccfence(); 3387 if (pv == NULL || 3388 pv->pv_pmap != pmap || 3389 pv->pv_pindex != pindex) { 3390 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 3391 pindex); 3392 } 3393 if (pv == NULL) { 3394 /* 3395 * Block if there is ANY placemarker. If we are to 3396 * return it, we must also aquire the spot, so we 3397 * have to block even if the placemarker is held on 3398 * a different address. 3399 * 3400 * OPTIMIZATION: If pmarkp is passed as NULL the 3401 * caller is just probing (or looking for a real 3402 * pv_entry), and in this case we only need to check 3403 * to see if the placemarker matches pindex. 
3404 */ 3405 vm_pindex_t *pmark; 3406 3407 pmark = pmap_placemarker_hash(pmap, pindex); 3408 3409 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3410 ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3411 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3412 tsleep_interlock(pmark, 0); 3413 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3414 ((*pmark ^ pindex) & 3415 ~PM_PLACEMARK_WAKEUP) == 0) { 3416 spin_unlock(&pmap->pm_spin); 3417 tsleep(pmark, PINTERLOCKED, "pvpld", 0); 3418 spin_lock(&pmap->pm_spin); 3419 } 3420 continue; 3421 } 3422 if (pmarkp) { 3423 if (atomic_swap_long(pmark, pindex) != 3424 PM_NOPLACEMARK) { 3425 panic("_pv_get: pmark race"); 3426 } 3427 *pmarkp = pmark; 3428 } 3429 spin_unlock(&pmap->pm_spin); 3430 return NULL; 3431 } 3432 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3433 pv_cache(pv, pindex); 3434 spin_unlock(&pmap->pm_spin); 3435 KKASSERT(pv->pv_pmap == pmap && 3436 pv->pv_pindex == pindex); 3437 return(pv); 3438 } 3439 spin_unlock(&pmap->pm_spin); 3440 _pv_lock(pv PMAP_DEBUG_COPY); 3441 pv_put(pv); 3442 spin_lock(&pmap->pm_spin); 3443 } 3444 } 3445 3446 /* 3447 * Lookup, hold, and attempt to lock (pmap,pindex). 3448 * 3449 * If the entry does not exist NULL is returned and *errorp is set to 0 3450 * 3451 * If the entry exists and could be successfully locked it is returned and 3452 * errorp is set to 0. 3453 * 3454 * If the entry exists but could NOT be successfully locked it is returned 3455 * held and *errorp is set to 1. 3456 * 3457 * If the entry is placemarked by someone else NULL is returned and *errorp 3458 * is set to 1. 3459 */ 3460 static 3461 pv_entry_t 3462 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) 3463 { 3464 pv_entry_t pv; 3465 3466 spin_lock_shared(&pmap->pm_spin); 3467 3468 pv = pmap->pm_pvhint; 3469 cpu_ccfence(); 3470 if (pv == NULL || 3471 pv->pv_pmap != pmap || 3472 pv->pv_pindex != pindex) { 3473 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 3474 } 3475 3476 if (pv == NULL) { 3477 vm_pindex_t *pmark; 3478 3479 pmark = pmap_placemarker_hash(pmap, pindex); 3480 3481 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3482 *errorp = 1; 3483 } else if (pmarkp && 3484 atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { 3485 *errorp = 0; 3486 } else { 3487 /* 3488 * Can't set a placemark with a NULL pmarkp, or if 3489 * pmarkp is non-NULL but we failed to set our 3490 * placemark. 3491 */ 3492 *errorp = 1; 3493 } 3494 if (pmarkp) 3495 *pmarkp = pmark; 3496 spin_unlock_shared(&pmap->pm_spin); 3497 3498 return NULL; 3499 } 3500 3501 /* 3502 * XXX This has problems if the lock is shared, why? 
3503 */ 3504 if (pv_hold_try(pv)) { 3505 pv_cache(pv, pindex); /* overwrite ok (shared lock) */ 3506 spin_unlock_shared(&pmap->pm_spin); 3507 *errorp = 0; 3508 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3509 return(pv); /* lock succeeded */ 3510 } 3511 spin_unlock_shared(&pmap->pm_spin); 3512 *errorp = 1; 3513 3514 return (pv); /* lock failed */ 3515 } 3516 3517 /* 3518 * Lock a held pv, keeping the hold count 3519 */ 3520 static 3521 void 3522 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3523 { 3524 u_int count; 3525 3526 for (;;) { 3527 count = pv->pv_hold; 3528 cpu_ccfence(); 3529 if ((count & PV_HOLD_LOCKED) == 0) { 3530 if (atomic_cmpset_int(&pv->pv_hold, count, 3531 count | PV_HOLD_LOCKED)) { 3532 #ifdef PMAP_DEBUG 3533 pv->pv_func = func; 3534 pv->pv_line = lineno; 3535 #endif 3536 return; 3537 } 3538 continue; 3539 } 3540 tsleep_interlock(pv, 0); 3541 if (atomic_cmpset_int(&pv->pv_hold, count, 3542 count | PV_HOLD_WAITING)) { 3543 #ifdef PMAP_DEBUG2 3544 if (pmap_enter_debug > 0) { 3545 --pmap_enter_debug; 3546 kprintf("pv waiting on %s:%d\n", 3547 pv->pv_func, pv->pv_line); 3548 } 3549 #endif 3550 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3551 } 3552 /* retry */ 3553 } 3554 } 3555 3556 /* 3557 * Unlock a held and locked pv, keeping the hold count. 3558 */ 3559 static 3560 void 3561 pv_unlock(pv_entry_t pv) 3562 { 3563 u_int count; 3564 3565 for (;;) { 3566 count = pv->pv_hold; 3567 cpu_ccfence(); 3568 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3569 (PV_HOLD_LOCKED | 1)); 3570 if (atomic_cmpset_int(&pv->pv_hold, count, 3571 count & 3572 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3573 if (count & PV_HOLD_WAITING) 3574 wakeup(pv); 3575 break; 3576 } 3577 } 3578 } 3579 3580 /* 3581 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3582 * and the hold count drops to zero we will free it. 3583 * 3584 * Caller should not hold any spin locks. We are protected from hold races 3585 * by virtue of holds only occuring only with a pmap_spin or vm_page_spin 3586 * lock held. A pv cannot be located otherwise. 3587 */ 3588 static 3589 void 3590 pv_put(pv_entry_t pv) 3591 { 3592 #ifdef PMAP_DEBUG2 3593 if (pmap_enter_debug > 0) { 3594 --pmap_enter_debug; 3595 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3596 } 3597 #endif 3598 3599 /* 3600 * Normal put-aways must have a pv_m associated with the pv, 3601 * but allow the case where the pv has been destructed due 3602 * to pmap_dynamic_delete. 3603 */ 3604 KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL); 3605 3606 /* 3607 * Fast - shortcut most common condition 3608 */ 3609 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3610 return; 3611 3612 /* 3613 * Slow 3614 */ 3615 pv_unlock(pv); 3616 pv_drop(pv); 3617 } 3618 3619 /* 3620 * Remove the pmap association from a pv, require that pv_m already be removed, 3621 * then unlock and drop the pv. Any pte operations must have already been 3622 * completed. This call may result in a last-drop which will physically free 3623 * the pv. 3624 * 3625 * Removing the pmap association entails an additional drop. 3626 * 3627 * pv must be exclusively locked on call and will be disposed of on return. 
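 *
 * The common case is a single atomic: a pv_hold of
 * (PV_HOLD_LOCKED | 2) is collapsed straight to 0, dropping the
 * lock, the caller's hold, and the pmap-association hold in one
 * step so the pv can be zfree()d immediately.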
3628 */ 3629 static 3630 void 3631 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL) 3632 { 3633 pmap_t pmap; 3634 3635 #ifdef PMAP_DEBUG 3636 pv->pv_func_lastfree = func; 3637 pv->pv_line_lastfree = lineno; 3638 #endif 3639 KKASSERT(pv->pv_m == NULL); 3640 KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >= 3641 (PV_HOLD_LOCKED|1)); 3642 if ((pmap = pv->pv_pmap) != NULL) { 3643 spin_lock(&pmap->pm_spin); 3644 KKASSERT(pv->pv_pmap == pmap); 3645 if (pmap->pm_pvhint == pv) 3646 pmap->pm_pvhint = NULL; 3647 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 3648 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3649 pv->pv_pmap = NULL; 3650 pv->pv_pindex = 0; 3651 spin_unlock(&pmap->pm_spin); 3652 3653 /* 3654 * Try to shortcut three atomic ops, otherwise fall through 3655 * and do it normally. Drop two refs and the lock all in 3656 * one go. 3657 */ 3658 if (pvp) 3659 vm_page_unwire_quick(pvp->pv_m); 3660 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { 3661 #ifdef PMAP_DEBUG2 3662 if (pmap_enter_debug > 0) { 3663 --pmap_enter_debug; 3664 kprintf("pv_free: free pv %p\n", pv); 3665 } 3666 #endif 3667 zfree(pvzone, pv); 3668 return; 3669 } 3670 pv_drop(pv); /* ref for pv_pmap */ 3671 } 3672 pv_unlock(pv); 3673 pv_drop(pv); 3674 } 3675 3676 /* 3677 * This routine is very drastic, but can save the system 3678 * in a pinch. 3679 */ 3680 void 3681 pmap_collect(void) 3682 { 3683 int i; 3684 vm_page_t m; 3685 static int warningdone=0; 3686 3687 if (pmap_pagedaemon_waken == 0) 3688 return; 3689 pmap_pagedaemon_waken = 0; 3690 if (warningdone < 5) { 3691 kprintf("pmap_collect: collecting pv entries -- " 3692 "suggest increasing PMAP_SHPGPERPROC\n"); 3693 warningdone++; 3694 } 3695 3696 for (i = 0; i < vm_page_array_size; i++) { 3697 m = &vm_page_array[i]; 3698 if (m->wire_count || m->hold_count) 3699 continue; 3700 if (vm_page_busy_try(m, TRUE) == 0) { 3701 if (m->wire_count == 0 && m->hold_count == 0) { 3702 pmap_remove_all(m); 3703 } 3704 vm_page_wakeup(m); 3705 } 3706 } 3707 } 3708 3709 /* 3710 * Scan the pmap for active page table entries and issue a callback. 3711 * The callback must dispose of pte_pv, whos PTE entry is at *ptep in 3712 * its parent page table. 3713 * 3714 * pte_pv will be NULL if the page or page table is unmanaged. 3715 * pt_pv will point to the page table page containing the pte for the page. 3716 * 3717 * NOTE! If we come across an unmanaged page TABLE (verses an unmanaged page), 3718 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed 3719 * process pmap's PD and page to the callback function. This can be 3720 * confusing because the pt_pv is really a pd_pv, and the target page 3721 * table page is simply aliased by the pmap and not owned by it. 3722 * 3723 * It is assumed that the start and end are properly rounded to the page size. 3724 * 3725 * It is assumed that PD pages and above are managed and thus in the RB tree, 3726 * allowing us to use RB_SCAN from the PD pages down for ranged scans. 
3727 */ 3728 struct pmap_scan_info { 3729 struct pmap *pmap; 3730 vm_offset_t sva; 3731 vm_offset_t eva; 3732 vm_pindex_t sva_pd_pindex; 3733 vm_pindex_t eva_pd_pindex; 3734 void (*func)(pmap_t, struct pmap_scan_info *, 3735 pv_entry_t, vm_pindex_t *, pv_entry_t, 3736 int, vm_offset_t, 3737 pt_entry_t *, void *); 3738 void *arg; 3739 pmap_inval_bulk_t bulk_core; 3740 pmap_inval_bulk_t *bulk; 3741 int count; 3742 int stop; 3743 }; 3744 3745 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3746 static int pmap_scan_callback(pv_entry_t pv, void *data); 3747 3748 static void 3749 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3750 { 3751 struct pmap *pmap = info->pmap; 3752 pv_entry_t pd_pv; /* A page directory PV */ 3753 pv_entry_t pt_pv; /* A page table PV */ 3754 pv_entry_t pte_pv; /* A page table entry PV */ 3755 vm_pindex_t *pte_placemark; 3756 vm_pindex_t *pt_placemark; 3757 pt_entry_t *ptep; 3758 pt_entry_t oldpte; 3759 struct pv_entry dummy_pv; 3760 3761 info->stop = 0; 3762 if (pmap == NULL) 3763 return; 3764 if (info->sva == info->eva) 3765 return; 3766 if (smp_inval) { 3767 info->bulk = &info->bulk_core; 3768 pmap_inval_bulk_init(&info->bulk_core, pmap); 3769 } else { 3770 info->bulk = NULL; 3771 } 3772 3773 /* 3774 * Hold the token for stability; if the pmap is empty we have nothing 3775 * to do. 3776 */ 3777 #if 0 3778 if (pmap->pm_stats.resident_count == 0) { 3779 return; 3780 } 3781 #endif 3782 3783 info->count = 0; 3784 3785 /* 3786 * Special handling for scanning one page, which is a very common 3787 * operation (it is?). 3788 * 3789 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3790 */ 3791 if (info->sva + PAGE_SIZE == info->eva) { 3792 if (info->sva >= VM_MAX_USER_ADDRESS) { 3793 /* 3794 * Kernel mappings do not track wire counts on 3795 * page table pages and only maintain pd_pv and 3796 * pte_pv levels so pmap_scan() works. 3797 */ 3798 pt_pv = NULL; 3799 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3800 &pte_placemark); 3801 ptep = vtopte(info->sva); 3802 } else { 3803 /* 3804 * User pages which are unmanaged will not have a 3805 * pte_pv. User page table pages which are unmanaged 3806 * (shared from elsewhere) will also not have a pt_pv. 3807 * The func() callback will pass both pte_pv and pt_pv 3808 * as NULL in that case. 3809 * 3810 * We hold pte_placemark across the operation for 3811 * unmanaged pages. 3812 * 3813 * WARNING! We must hold pt_placemark across the 3814 * *ptep test to prevent misintepreting 3815 * a non-zero *ptep as a shared page 3816 * table page. Hold it across the function 3817 * callback as well for SMP safety. 
3818 */ 3819 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3820 &pte_placemark); 3821 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), 3822 &pt_placemark); 3823 if (pt_pv == NULL) { 3824 KKASSERT(pte_pv == NULL); 3825 pd_pv = pv_get(pmap, 3826 pmap_pd_pindex(info->sva), 3827 NULL); 3828 if (pd_pv) { 3829 ptep = pv_pte_lookup(pd_pv, 3830 pmap_pt_index(info->sva)); 3831 if (*ptep) { 3832 info->func(pmap, info, 3833 NULL, pt_placemark, 3834 pd_pv, 1, 3835 info->sva, ptep, 3836 info->arg); 3837 } else { 3838 pv_placemarker_wakeup(pmap, 3839 pt_placemark); 3840 } 3841 pv_put(pd_pv); 3842 } else { 3843 pv_placemarker_wakeup(pmap, 3844 pt_placemark); 3845 } 3846 pv_placemarker_wakeup(pmap, pte_placemark); 3847 goto fast_skip; 3848 } 3849 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 3850 } 3851 3852 /* 3853 * NOTE: *ptep can't be ripped out from under us if we hold 3854 * pte_pv (or pte_placemark) locked, but bits can 3855 * change. 3856 */ 3857 oldpte = *ptep; 3858 cpu_ccfence(); 3859 if (oldpte == 0) { 3860 KKASSERT(pte_pv == NULL); 3861 pv_placemarker_wakeup(pmap, pte_placemark); 3862 } else if (pte_pv) { 3863 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3864 pmap->pmap_bits[PG_V_IDX])) == 3865 (pmap->pmap_bits[PG_MANAGED_IDX] | 3866 pmap->pmap_bits[PG_V_IDX]), 3867 ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p", 3868 *ptep, oldpte, info->sva, pte_pv)); 3869 info->func(pmap, info, pte_pv, NULL, pt_pv, 0, 3870 info->sva, ptep, info->arg); 3871 } else { 3872 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3873 pmap->pmap_bits[PG_V_IDX])) == 3874 pmap->pmap_bits[PG_V_IDX], 3875 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", 3876 *ptep, oldpte, info->sva)); 3877 info->func(pmap, info, NULL, pte_placemark, pt_pv, 0, 3878 info->sva, ptep, info->arg); 3879 } 3880 if (pt_pv) 3881 pv_put(pt_pv); 3882 fast_skip: 3883 pmap_inval_bulk_flush(info->bulk); 3884 return; 3885 } 3886 3887 /* 3888 * Nominal scan case, RB_SCAN() for PD pages and iterate from 3889 * there. 3890 * 3891 * WARNING! eva can overflow our standard ((N + mask) >> bits) 3892 * bounds, resulting in a pd_pindex of 0. To solve the 3893 * problem we use an inclusive range. 3894 */ 3895 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 3896 info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); 3897 3898 if (info->sva >= VM_MAX_USER_ADDRESS) { 3899 /* 3900 * The kernel does not currently maintain any pv_entry's for 3901 * higher-level page tables. 3902 */ 3903 bzero(&dummy_pv, sizeof(dummy_pv)); 3904 dummy_pv.pv_pindex = info->sva_pd_pindex; 3905 spin_lock(&pmap->pm_spin); 3906 while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { 3907 pmap_scan_callback(&dummy_pv, info); 3908 ++dummy_pv.pv_pindex; 3909 if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ 3910 break; 3911 } 3912 spin_unlock(&pmap->pm_spin); 3913 } else { 3914 /* 3915 * User page tables maintain local PML4, PDP, and PD 3916 * pv_entry's at the very least. PT pv's might be 3917 * unmanaged and thus not exist. PTE pv's might be 3918 * unmanaged and thus not exist. 3919 */ 3920 spin_lock(&pmap->pm_spin); 3921 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, 3922 pmap_scan_callback, info); 3923 spin_unlock(&pmap->pm_spin); 3924 } 3925 pmap_inval_bulk_flush(info->bulk); 3926 } 3927 3928 /* 3929 * WARNING! pmap->pm_spin held 3930 * 3931 * WARNING! eva can overflow our standard ((N + mask) >> bits) 3932 * bounds, resulting in a pd_pindex of 0. To solve the 3933 * problem we use an inclusive range. 
3934 */ 3935 static int 3936 pmap_scan_cmp(pv_entry_t pv, void *data) 3937 { 3938 struct pmap_scan_info *info = data; 3939 if (pv->pv_pindex < info->sva_pd_pindex) 3940 return(-1); 3941 if (pv->pv_pindex > info->eva_pd_pindex) 3942 return(1); 3943 return(0); 3944 } 3945 3946 /* 3947 * pmap_scan() by PDs 3948 * 3949 * WARNING! pmap->pm_spin held 3950 */ 3951 static int 3952 pmap_scan_callback(pv_entry_t pv, void *data) 3953 { 3954 struct pmap_scan_info *info = data; 3955 struct pmap *pmap = info->pmap; 3956 pv_entry_t pd_pv; /* A page directory PV */ 3957 pv_entry_t pt_pv; /* A page table PV */ 3958 vm_pindex_t *pt_placemark; 3959 pt_entry_t *ptep; 3960 pt_entry_t oldpte; 3961 vm_offset_t sva; 3962 vm_offset_t eva; 3963 vm_offset_t va_next; 3964 vm_pindex_t pd_pindex; 3965 int error; 3966 3967 /* 3968 * Stop if requested 3969 */ 3970 if (info->stop) 3971 return -1; 3972 3973 /* 3974 * Pull the PD pindex from the pv before releasing the spinlock. 3975 * 3976 * WARNING: pv is faked for kernel pmap scans. 3977 */ 3978 pd_pindex = pv->pv_pindex; 3979 spin_unlock(&pmap->pm_spin); 3980 pv = NULL; /* invalid after spinlock unlocked */ 3981 3982 /* 3983 * Calculate the page range within the PD. SIMPLE pmaps are 3984 * direct-mapped for the entire 2^64 address space. Normal pmaps 3985 * reflect the user and kernel address space which requires 3986 * cannonicalization w/regards to converting pd_pindex's back 3987 * into addresses. 3988 */ 3989 sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT; 3990 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 3991 (sva & PML4_SIGNMASK)) { 3992 sva |= PML4_SIGNMASK; 3993 } 3994 eva = sva + NBPDP; /* can overflow */ 3995 if (sva < info->sva) 3996 sva = info->sva; 3997 if (eva < info->sva || eva > info->eva) 3998 eva = info->eva; 3999 4000 /* 4001 * NOTE: kernel mappings do not track page table pages, only 4002 * terminal pages. 4003 * 4004 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 4005 * However, for the scan to be efficient we try to 4006 * cache items top-down. 4007 */ 4008 pd_pv = NULL; 4009 pt_pv = NULL; 4010 4011 for (; sva < eva; sva = va_next) { 4012 if (info->stop) 4013 break; 4014 if (sva >= VM_MAX_USER_ADDRESS) { 4015 if (pt_pv) { 4016 pv_put(pt_pv); 4017 pt_pv = NULL; 4018 } 4019 goto kernel_skip; 4020 } 4021 4022 /* 4023 * PD cache, scan shortcut if it doesn't exist. 4024 */ 4025 if (pd_pv == NULL) { 4026 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4027 } else if (pd_pv->pv_pmap != pmap || 4028 pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 4029 pv_put(pd_pv); 4030 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4031 } 4032 if (pd_pv == NULL) { 4033 va_next = (sva + NBPDP) & ~PDPMASK; 4034 if (va_next < sva) 4035 va_next = eva; 4036 continue; 4037 } 4038 4039 /* 4040 * PT cache 4041 * 4042 * NOTE: The cached pt_pv can be removed from the pmap when 4043 * pmap_dynamic_delete is enabled. 
4044 */ 4045 if (pt_pv && (pt_pv->pv_pmap != pmap || 4046 pt_pv->pv_pindex != pmap_pt_pindex(sva))) { 4047 pv_put(pt_pv); 4048 pt_pv = NULL; 4049 } 4050 if (pt_pv == NULL) { 4051 pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva), 4052 &pt_placemark, &error); 4053 if (error) { 4054 pv_put(pd_pv); /* lock order */ 4055 pd_pv = NULL; 4056 if (pt_pv) { 4057 pv_lock(pt_pv); 4058 pv_put(pt_pv); 4059 pt_pv = NULL; 4060 } else { 4061 pv_placemarker_wait(pmap, pt_placemark); 4062 } 4063 va_next = sva; 4064 continue; 4065 } 4066 /* may have to re-check later if pt_pv is NULL here */ 4067 } 4068 4069 /* 4070 * If pt_pv is NULL we either have an shared page table 4071 * page and must issue a callback specific to that case, 4072 * or there is no page table page. 4073 * 4074 * Either way we can skip the page table page. 4075 * 4076 * WARNING! pt_pv can also be NULL due to a pv creation 4077 * race where we find it to be NULL and then 4078 * later see a pte_pv. But its possible the pt_pv 4079 * got created inbetween the two operations, so 4080 * we must check. 4081 */ 4082 if (pt_pv == NULL) { 4083 /* 4084 * Possible unmanaged (shared from another pmap) 4085 * page table page. 4086 * 4087 * WARNING! We must hold pt_placemark across the 4088 * *ptep test to prevent misintepreting 4089 * a non-zero *ptep as a shared page 4090 * table page. Hold it across the function 4091 * callback as well for SMP safety. 4092 */ 4093 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 4094 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 4095 info->func(pmap, info, NULL, pt_placemark, 4096 pd_pv, 1, 4097 sva, ptep, info->arg); 4098 } else { 4099 pv_placemarker_wakeup(pmap, pt_placemark); 4100 } 4101 4102 /* 4103 * Done, move to next page table page. 4104 */ 4105 va_next = (sva + NBPDR) & ~PDRMASK; 4106 if (va_next < sva) 4107 va_next = eva; 4108 continue; 4109 } 4110 4111 /* 4112 * From this point in the loop testing pt_pv for non-NULL 4113 * means we are in UVM, else if it is NULL we are in KVM. 4114 * 4115 * Limit our scan to either the end of the va represented 4116 * by the current page table page, or to the end of the 4117 * range being removed. 4118 */ 4119 kernel_skip: 4120 va_next = (sva + NBPDR) & ~PDRMASK; 4121 if (va_next < sva) 4122 va_next = eva; 4123 if (va_next > eva) 4124 va_next = eva; 4125 4126 /* 4127 * Scan the page table for pages. Some pages may not be 4128 * managed (might not have a pv_entry). 4129 * 4130 * There is no page table management for kernel pages so 4131 * pt_pv will be NULL in that case, but otherwise pt_pv 4132 * is non-NULL, locked, and referenced. 4133 */ 4134 4135 /* 4136 * At this point a non-NULL pt_pv means a UVA, and a NULL 4137 * pt_pv means a KVA. 4138 */ 4139 if (pt_pv) 4140 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 4141 else 4142 ptep = vtopte(sva); 4143 4144 while (sva < va_next) { 4145 pv_entry_t pte_pv; 4146 vm_pindex_t *pte_placemark; 4147 4148 /* 4149 * Yield every 64 pages, stop if requested. 4150 */ 4151 if ((++info->count & 63) == 0) 4152 lwkt_user_yield(); 4153 if (info->stop) 4154 break; 4155 4156 /* 4157 * We can shortcut our scan if *ptep == 0. This is 4158 * an unlocked check. 4159 */ 4160 if (*ptep == 0) { 4161 sva += PAGE_SIZE; 4162 ++ptep; 4163 continue; 4164 } 4165 cpu_ccfence(); 4166 4167 /* 4168 * Acquire the related pte_pv, if any. If *ptep == 0 4169 * the related pte_pv should not exist, but if *ptep 4170 * is not zero the pte_pv may or may not exist (e.g. 4171 * will not exist for an unmanaged page). 
4172 * 4173 * However a multitude of races are possible here 4174 * so if we cannot lock definite state we clean out 4175 * our cache and break the inner while() loop to 4176 * force a loop up to the top of the for(). 4177 * 4178 * XXX unlock/relock pd_pv, pt_pv, and re-test their 4179 * validity instead of looping up? 4180 */ 4181 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 4182 &pte_placemark, &error); 4183 if (error) { 4184 pv_put(pd_pv); /* lock order */ 4185 pd_pv = NULL; 4186 if (pt_pv) { 4187 pv_put(pt_pv); /* lock order */ 4188 pt_pv = NULL; 4189 } 4190 if (pte_pv) { /* block */ 4191 pv_lock(pte_pv); 4192 pv_put(pte_pv); 4193 pte_pv = NULL; 4194 } else { 4195 pv_placemarker_wait(pmap, 4196 pte_placemark); 4197 } 4198 va_next = sva; /* retry */ 4199 break; 4200 } 4201 4202 /* 4203 * Reload *ptep after successfully locking the 4204 * pindex. If *ptep == 0 we had better NOT have a 4205 * pte_pv. 4206 */ 4207 cpu_ccfence(); 4208 oldpte = *ptep; 4209 if (oldpte == 0) { 4210 if (pte_pv) { 4211 kprintf("Unexpected non-NULL pte_pv " 4212 "%p pt_pv %p " 4213 "*ptep = %016lx/%016lx\n", 4214 pte_pv, pt_pv, *ptep, oldpte); 4215 panic("Unexpected non-NULL pte_pv"); 4216 } else { 4217 pv_placemarker_wakeup(pmap, pte_placemark); 4218 } 4219 sva += PAGE_SIZE; 4220 ++ptep; 4221 continue; 4222 } 4223 4224 /* 4225 * We can't hold pd_pv across the callback (because 4226 * we don't pass it to the callback and the callback 4227 * might deadlock) 4228 */ 4229 if (pd_pv) { 4230 vm_page_wire_quick(pd_pv->pv_m); 4231 pv_unlock(pd_pv); 4232 } 4233 4234 /* 4235 * Ready for the callback. The locked pte_pv (if any) 4236 * is consumed by the callback. pte_pv will exist if 4237 * the page is managed, and will not exist if it 4238 * isn't. 4239 */ 4240 if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4241 /* 4242 * Managed pte 4243 */ 4244 KASSERT(pte_pv && 4245 (oldpte & pmap->pmap_bits[PG_V_IDX]), 4246 ("badC *ptep %016lx/%016lx sva %016lx " 4247 "pte_pv %p", 4248 *ptep, oldpte, sva, pte_pv)); 4249 /* 4250 * We must unlock pd_pv across the callback 4251 * to avoid deadlocks on any recursive 4252 * disposal. Re-check that it still exists 4253 * after re-locking. 4254 * 4255 * Call target disposes of pte_pv and may 4256 * destroy but will not dispose of pt_pv. 4257 */ 4258 info->func(pmap, info, pte_pv, NULL, 4259 pt_pv, 0, 4260 sva, ptep, info->arg); 4261 } else { 4262 /* 4263 * Unmanaged pte 4264 * 4265 * We must unlock pd_pv across the callback 4266 * to avoid deadlocks on any recursive 4267 * disposal. Re-check that it still exists 4268 * after re-locking. 4269 * 4270 * Call target disposes of pte_pv or 4271 * pte_placemark and may destroy but will 4272 * not dispose of pt_pv. 4273 */ 4274 KASSERT(pte_pv == NULL && 4275 (oldpte & pmap->pmap_bits[PG_V_IDX]), 4276 ("badD *ptep %016lx/%016lx sva %016lx " 4277 "pte_pv %p pte_pv->pv_m %p ", 4278 *ptep, oldpte, sva, 4279 pte_pv, (pte_pv ? pte_pv->pv_m : NULL))); 4280 if (pte_pv) 4281 kprintf("RaceD\n"); 4282 if (pte_pv) { 4283 info->func(pmap, info, 4284 pte_pv, NULL, 4285 pt_pv, 0, 4286 sva, ptep, info->arg); 4287 } else { 4288 info->func(pmap, info, 4289 NULL, pte_placemark, 4290 pt_pv, 0, 4291 sva, ptep, info->arg); 4292 } 4293 } 4294 if (pd_pv) { 4295 pv_lock(pd_pv); 4296 vm_page_unwire_quick(pd_pv->pv_m); 4297 if (pd_pv->pv_pmap == NULL) { 4298 va_next = sva; /* retry */ 4299 break; 4300 } 4301 } 4302 4303 /* 4304 * NOTE: The cached pt_pv can be removed from the 4305 * pmap when pmap_dynamic_delete is enabled, 4306 * which will cause ptep to become stale. 
4307 * 4308 * This also means that no pages remain under 4309 * the PT, so we can just break out of the inner 4310 * loop and let the outer loop clean everything 4311 * up. 4312 */ 4313 if (pt_pv && pt_pv->pv_pmap != pmap) 4314 break; 4315 pte_pv = NULL; 4316 sva += PAGE_SIZE; 4317 ++ptep; 4318 } 4319 } 4320 if (pd_pv) { 4321 pv_put(pd_pv); 4322 pd_pv = NULL; 4323 } 4324 if (pt_pv) { 4325 pv_put(pt_pv); 4326 pt_pv = NULL; 4327 } 4328 if ((++info->count & 7) == 0) 4329 lwkt_user_yield(); 4330 4331 /* 4332 * Relock before returning. 4333 */ 4334 spin_lock(&pmap->pm_spin); 4335 return (0); 4336 } 4337 4338 void 4339 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4340 { 4341 struct pmap_scan_info info; 4342 4343 info.pmap = pmap; 4344 info.sva = sva; 4345 info.eva = eva; 4346 info.func = pmap_remove_callback; 4347 info.arg = NULL; 4348 pmap_scan(&info, 1); 4349 #if 0 4350 cpu_invltlb(); 4351 if (eva - sva < 1024*1024) { 4352 while (sva < eva) { 4353 cpu_invlpg((void *)sva); 4354 sva += PAGE_SIZE; 4355 } 4356 } 4357 #endif 4358 } 4359 4360 static void 4361 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4362 { 4363 struct pmap_scan_info info; 4364 4365 info.pmap = pmap; 4366 info.sva = sva; 4367 info.eva = eva; 4368 info.func = pmap_remove_callback; 4369 info.arg = NULL; 4370 pmap_scan(&info, 0); 4371 } 4372 4373 static void 4374 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4375 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 4376 pv_entry_t pt_pv, int sharept, 4377 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4378 { 4379 pt_entry_t pte; 4380 4381 if (pte_pv) { 4382 /* 4383 * Managed entry 4384 * 4385 * This will also drop pt_pv's wire_count. Note that 4386 * terminal pages are not wired based on mmu presence. 4387 * 4388 * NOTE: If this is the kernel_pmap, pt_pv can be NULL. 4389 */ 4390 KKASSERT(pte_pv->pv_m != NULL); 4391 pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk, 2); 4392 pte_pv = NULL; /* safety */ 4393 4394 /* 4395 * Recursively destroy higher-level page tables. 4396 * 4397 * This is optional. If we do not, they will still 4398 * be destroyed when the process exits. 4399 * 4400 * NOTE: Do not destroy pv_entry's with extra hold refs, 4401 * a caller may have unlocked it and intends to 4402 * continue to use it. 4403 */ 4404 if (pmap_dynamic_delete && 4405 pt_pv && 4406 pt_pv->pv_m && 4407 pt_pv->pv_m->wire_count == 1 && 4408 (pt_pv->pv_hold & PV_HOLD_MASK) == 2 && 4409 pt_pv->pv_pindex != pmap_pml4_pindex()) { 4410 if (pmap_dynamic_delete == 2) 4411 kprintf("B %jd %08x\n", pt_pv->pv_pindex, pt_pv->pv_hold); 4412 pv_hold(pt_pv); /* extra hold */ 4413 pmap_remove_pv_pte(pt_pv, NULL, info->bulk, 1); 4414 pv_lock(pt_pv); /* prior extra hold + relock */ 4415 } 4416 } else if (sharept == 0) { 4417 /* 4418 * Unmanaged pte (pte_placemark is non-NULL) 4419 * 4420 * pt_pv's wire_count is still bumped by unmanaged pages 4421 * so we must decrement it manually. 4422 * 4423 * We have to unwire the target page table page. 4424 */ 4425 pte = pmap_inval_bulk(info->bulk, va, ptep, 0); 4426 if (pte & pmap->pmap_bits[PG_W_IDX]) 4427 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4428 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4429 if (vm_page_unwire_quick(pt_pv->pv_m)) 4430 panic("pmap_remove: insufficient wirecount"); 4431 pv_placemarker_wakeup(pmap, pte_placemark); 4432 } else { 4433 /* 4434 * Unmanaged page table (pt, pd, or pdp. Not pte) for 4435 * a shared page table. 
4436 * 4437 * pt_pv is actually the pd_pv for our pmap (not the shared 4438 * object pmap). 4439 * 4440 * We have to unwire the target page table page and we 4441 * have to unwire our page directory page. 4442 * 4443 * It is unclear how we can invalidate a segment so we 4444 * invalidate -1 which invlidates the tlb. 4445 */ 4446 pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0); 4447 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4448 KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0); 4449 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4450 panic("pmap_remove: shared pgtable1 bad wirecount"); 4451 if (vm_page_unwire_quick(pt_pv->pv_m)) 4452 panic("pmap_remove: shared pgtable2 bad wirecount"); 4453 pv_placemarker_wakeup(pmap, pte_placemark); 4454 } 4455 } 4456 4457 /* 4458 * Removes this physical page from all physical maps in which it resides. 4459 * Reflects back modify bits to the pager. 4460 * 4461 * This routine may not be called from an interrupt. 4462 */ 4463 static 4464 void 4465 pmap_remove_all(vm_page_t m) 4466 { 4467 pv_entry_t pv; 4468 pmap_inval_bulk_t bulk; 4469 4470 if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/) 4471 return; 4472 4473 vm_page_spin_lock(m); 4474 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4475 KKASSERT(pv->pv_m == m); 4476 if (pv_hold_try(pv)) { 4477 vm_page_spin_unlock(m); 4478 } else { 4479 vm_page_spin_unlock(m); 4480 pv_lock(pv); 4481 pv_put(pv); 4482 vm_page_spin_lock(m); 4483 continue; 4484 } 4485 KKASSERT(pv->pv_pmap && pv->pv_m == m); 4486 4487 /* 4488 * Holding no spinlocks, pv is locked. Once we scrap 4489 * pv we can no longer use it as a list iterator (but 4490 * we are doing a TAILQ_FIRST() so we are ok). 4491 */ 4492 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 4493 pmap_remove_pv_pte(pv, NULL, &bulk, 2); 4494 pv = NULL; /* safety */ 4495 pmap_inval_bulk_flush(&bulk); 4496 vm_page_spin_lock(m); 4497 } 4498 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 4499 vm_page_spin_unlock(m); 4500 } 4501 4502 /* 4503 * Removes the page from a particular pmap 4504 */ 4505 void 4506 pmap_remove_specific(pmap_t pmap, vm_page_t m) 4507 { 4508 pv_entry_t pv; 4509 pmap_inval_bulk_t bulk; 4510 4511 if (!pmap_initialized) 4512 return; 4513 4514 again: 4515 vm_page_spin_lock(m); 4516 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4517 if (pv->pv_pmap != pmap) 4518 continue; 4519 KKASSERT(pv->pv_m == m); 4520 if (pv_hold_try(pv)) { 4521 vm_page_spin_unlock(m); 4522 } else { 4523 vm_page_spin_unlock(m); 4524 pv_lock(pv); 4525 pv_put(pv); 4526 goto again; 4527 } 4528 KKASSERT(pv->pv_pmap == pmap && pv->pv_m == m); 4529 4530 /* 4531 * Holding no spinlocks, pv is locked. Once gone it can't 4532 * be used as an iterator. In fact, because we couldn't 4533 * necessarily lock it atomically it may have moved within 4534 * the list and ALSO cannot be used as an iterator. 4535 */ 4536 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 4537 pmap_remove_pv_pte(pv, NULL, &bulk, 2); 4538 pv = NULL; /* safety */ 4539 pmap_inval_bulk_flush(&bulk); 4540 goto again; 4541 } 4542 vm_page_spin_unlock(m); 4543 } 4544 4545 /* 4546 * Set the physical protection on the specified range of this map 4547 * as requested. This function is typically only used for debug watchpoints 4548 * and COW pages. 4549 * 4550 * This function may not be called from an interrupt if the map is 4551 * not the kernel_pmap. 4552 * 4553 * NOTE! For shared page table pages we just unmap the page. 
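 *
 * As a quick reference, the early checks in the body below reduce to:
 *
 *	prot has neither READ nor EXECUTE  -> pmap_remove() the range
 *	prot still includes WRITE          -> nothing to do (nothing to revoke)
 *	otherwise                          -> scan and clear PG_RW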
4554 */ 4555 void 4556 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4557 { 4558 struct pmap_scan_info info; 4559 /* JG review for NX */ 4560 4561 if (pmap == NULL) 4562 return; 4563 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 4564 pmap_remove(pmap, sva, eva); 4565 return; 4566 } 4567 if (prot & VM_PROT_WRITE) 4568 return; 4569 info.pmap = pmap; 4570 info.sva = sva; 4571 info.eva = eva; 4572 info.func = pmap_protect_callback; 4573 info.arg = &prot; 4574 pmap_scan(&info, 1); 4575 } 4576 4577 static 4578 void 4579 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4580 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 4581 pv_entry_t pt_pv, int sharept, 4582 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4583 { 4584 pt_entry_t pbits; 4585 pt_entry_t cbits; 4586 pt_entry_t pte; 4587 vm_page_t m; 4588 4589 again: 4590 pbits = *ptep; 4591 cbits = pbits; 4592 if (pte_pv) { 4593 KKASSERT(pte_pv->pv_m != NULL); 4594 m = NULL; 4595 if (pbits & pmap->pmap_bits[PG_A_IDX]) { 4596 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4597 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4598 KKASSERT(m == pte_pv->pv_m); 4599 vm_page_flag_set(m, PG_REFERENCED); 4600 } 4601 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4602 } 4603 if (pbits & pmap->pmap_bits[PG_M_IDX]) { 4604 if (pmap_track_modified(pte_pv->pv_pindex)) { 4605 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4606 if (m == NULL) { 4607 m = PHYS_TO_VM_PAGE(pbits & 4608 PG_FRAME); 4609 } 4610 vm_page_dirty(m); 4611 } 4612 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4613 } 4614 } 4615 } else if (sharept) { 4616 /* 4617 * Unmanaged page table, pt_pv is actually the pd_pv 4618 * for our pmap (not the object's shared pmap). 4619 * 4620 * When asked to protect something in a shared page table 4621 * page we just unmap the page table page. We have to 4622 * invalidate the tlb in this situation. 4623 * 4624 * XXX Warning, shared page tables will not be used for 4625 * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings 4626 * so PHYS_TO_VM_PAGE() should be safe here. 4627 */ 4628 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 4629 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4630 panic("pmap_protect: pgtable1 pg bad wirecount"); 4631 if (vm_page_unwire_quick(pt_pv->pv_m)) 4632 panic("pmap_protect: pgtable2 pg bad wirecount"); 4633 ptep = NULL; 4634 } 4635 /* else unmanaged page, adjust bits, no wire changes */ 4636 4637 if (ptep) { 4638 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4639 #ifdef PMAP_DEBUG2 4640 if (pmap_enter_debug > 0) { 4641 --pmap_enter_debug; 4642 kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p " 4643 "pt_pv=%p cbits=%08lx\n", 4644 va, ptep, pte_pv, 4645 pt_pv, cbits 4646 ); 4647 } 4648 #endif 4649 if (pbits != cbits) { 4650 vm_offset_t xva; 4651 4652 xva = (sharept) ? (vm_offset_t)-1 : va; 4653 if (!pmap_inval_smp_cmpset(pmap, xva, 4654 ptep, pbits, cbits)) { 4655 goto again; 4656 } 4657 } 4658 } 4659 if (pte_pv) 4660 pv_put(pte_pv); 4661 else 4662 pv_placemarker_wakeup(pmap, pte_placemark); 4663 } 4664 4665 /* 4666 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4667 * mapping at that address. Set protection and wiring as requested. 4668 * 4669 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4670 * possible. 
If it is we enter the page into the appropriate shared pmap 4671 * hanging off the related VM object instead of the passed pmap, then we 4672 * share the page table page from the VM object's pmap into the current pmap. 4673 * 4674 * NOTE: This routine MUST insert the page into the pmap now; it cannot 4675 * lazy-evaluate. 4676 * 4677 * NOTE: If (m) is PG_UNMANAGED it may also be a temporary fake vm_page_t; 4678 * never record it. 4679 */ 4680 void 4681 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4682 boolean_t wired, vm_map_entry_t entry) 4683 { 4684 pv_entry_t pt_pv; /* page table */ 4685 pv_entry_t pte_pv; /* page table entry */ 4686 vm_pindex_t *pte_placemark; 4687 pt_entry_t *ptep; 4688 vm_paddr_t opa; 4689 pt_entry_t origpte, newpte; 4690 vm_paddr_t pa; 4691 4692 if (pmap == NULL) 4693 return; 4694 va = trunc_page(va); 4695 #ifdef PMAP_DIAGNOSTIC 4696 if (va >= KvaEnd) 4697 panic("pmap_enter: toobig"); 4698 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4699 panic("pmap_enter: invalid to pmap_enter page table " 4700 "pages (va: 0x%lx)", va); 4701 #endif 4702 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4703 kprintf("Warning: pmap_enter called on UVA with " 4704 "kernel_pmap\n"); 4705 #ifdef DDB 4706 db_print_backtrace(); 4707 #endif 4708 } 4709 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4710 kprintf("Warning: pmap_enter called on KVA without " 4711 "kernel_pmap\n"); 4712 #ifdef DDB 4713 db_print_backtrace(); 4714 #endif 4715 } 4716 4717 /* 4718 * Get locked PV entries for our new page table entry (pte_pv or 4719 * pte_placemark) and for its parent page table (pt_pv). We need 4720 * the parent so we can resolve the location of the ptep. 4721 * 4722 * Only hardware MMU actions can modify the ptep out from 4723 * under us. 4724 * 4725 * If (m) is fictitious or unmanaged we do not create a managing 4726 * pte_pv for it. Any pre-existing page's management state must 4727 * match (avoiding code complexity). 4728 * 4729 * If the pmap is still being initialized we assume existing 4730 * page tables. 4731 * 4732 * Kernel mappings do not track page table pages (i.e. pt_pv). 4733 * 4734 * WARNING! If replacing a managed mapping with an unmanaged mapping 4735 * pte_pv will wind up being non-NULL and must be handled 4736 * below. 4737 */ 4738 if (pmap_initialized == FALSE) { 4739 pte_pv = NULL; 4740 pt_pv = NULL; 4741 pte_placemark = NULL; 4742 ptep = vtopte(va); 4743 origpte = *ptep; 4744 } else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */ 4745 pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark); 4746 KKASSERT(pte_pv == NULL); 4747 if (va >= VM_MAX_USER_ADDRESS) { 4748 pt_pv = NULL; 4749 ptep = vtopte(va); 4750 } else { 4751 pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), 4752 NULL, entry, va); 4753 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4754 } 4755 origpte = *ptep; 4756 cpu_ccfence(); 4757 KASSERT(origpte == 0 || 4758 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0, 4759 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4760 } else { 4761 if (va >= VM_MAX_USER_ADDRESS) { 4762 /* 4763 * Kernel map, pv_entry-tracked.
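			 * (A pte_pv is allocated via pmap_allocpte() below but
			 * pt_pv stays NULL; kernel page table pages are not
			 * tracked and ptep is taken from vtopte() instead.)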
4764 */ 4765 pt_pv = NULL; 4766 pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); 4767 ptep = vtopte(va); 4768 } else { 4769 /* 4770 * User map 4771 */ 4772 pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va), 4773 &pt_pv, entry, va); 4774 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4775 } 4776 pte_placemark = NULL; /* safety */ 4777 origpte = *ptep; 4778 cpu_ccfence(); 4779 KASSERT(origpte == 0 || 4780 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]), 4781 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4782 } 4783 4784 pa = VM_PAGE_TO_PHYS(m); 4785 opa = origpte & PG_FRAME; 4786 4787 /* 4788 * Calculate the new PTE. Note that pte_pv alone does not mean 4789 * the new pte_pv is managed, it could exist because the old pte 4790 * was managed even if the new one is not. 4791 */ 4792 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4793 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4794 if (wired) 4795 newpte |= pmap->pmap_bits[PG_W_IDX]; 4796 if (va < VM_MAX_USER_ADDRESS) 4797 newpte |= pmap->pmap_bits[PG_U_IDX]; 4798 if (pte_pv && (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) == 0) 4799 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4800 // if (pmap == &kernel_pmap) 4801 // newpte |= pgeflag; 4802 newpte |= pmap->pmap_cache_bits[m->pat_mode]; 4803 if (m->flags & PG_FICTITIOUS) 4804 newpte |= pmap->pmap_bits[PG_DEVICE_IDX]; 4805 4806 /* 4807 * It is possible for multiple faults to occur in threaded 4808 * environments, the existing pte might be correct. 4809 */ 4810 if (((origpte ^ newpte) & 4811 ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4812 pmap->pmap_bits[PG_A_IDX])) == 0) { 4813 goto done; 4814 } 4815 4816 /* 4817 * Ok, either the address changed or the protection or wiring 4818 * changed. 4819 * 4820 * Clear the current entry, interlocking the removal. For managed 4821 * pte's this will also flush the modified state to the vm_page. 4822 * Atomic ops are mandatory in order to ensure that PG_M events are 4823 * not lost during any transition. 4824 * 4825 * WARNING: The caller has busied the new page but not the original 4826 * vm_page which we are trying to replace. Because we hold 4827 * the pte_pv lock, but have not busied the page, PG bits 4828 * can be cleared out from under us. 4829 */ 4830 if (opa) { 4831 if (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4832 /* 4833 * Old page was managed. Expect pte_pv to exist. 4834 * (it might also exist if the old page was unmanaged). 4835 * 4836 * NOTE: pt_pv won't exist for a kernel page 4837 * (managed or otherwise). 4838 * 4839 * NOTE: We may be reusing the pte_pv so we do not 4840 * destroy it in pmap_remove_pv_pte(). 4841 */ 4842 KKASSERT(pte_pv && pte_pv->pv_m); 4843 if (prot & VM_PROT_NOSYNC) { 4844 pmap_remove_pv_pte(pte_pv, pt_pv, NULL, 0); 4845 } else { 4846 pmap_inval_bulk_t bulk; 4847 4848 pmap_inval_bulk_init(&bulk, pmap); 4849 pmap_remove_pv_pte(pte_pv, pt_pv, &bulk, 0); 4850 pmap_inval_bulk_flush(&bulk); 4851 } 4852 pmap_remove_pv_page(pte_pv); 4853 /* will either set pte_pv->pv_m or pv_free() later */ 4854 } else { 4855 /* 4856 * Old page was not managed. If we have a pte_pv 4857 * it better not have a pv_m assigned to it. If the 4858 * new page is managed the pte_pv will be destroyed 4859 * near the end (we need its interlock). 4860 * 4861 * NOTE: We leave the wire count on the PT page 4862 * intact for the followup enter, but adjust 4863 * the wired-pages count on the pmap. 4864 */ 4865 KKASSERT(pte_pv == NULL); 4866 if (prot & VM_PROT_NOSYNC) { 4867 /* 4868 * NOSYNC (no mmu sync) requested. 
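				 * Only the local TLB entry is flushed via
				 * cpu_invlpg(); the SMP invalidation done by
				 * pmap_inval_smp() in the else path is skipped.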
4869 */ 4870 (void)pte_load_clear(ptep); 4871 cpu_invlpg((void *)va); 4872 } else { 4873 /* 4874 * Nominal SYNC 4875 */ 4876 pmap_inval_smp(pmap, va, 1, ptep, 0); 4877 } 4878 4879 /* 4880 * We must adjust pm_stats manually for unmanaged 4881 * pages. 4882 */ 4883 if (pt_pv) { 4884 atomic_add_long(&pmap->pm_stats. 4885 resident_count, -1); 4886 } 4887 if (origpte & pmap->pmap_bits[PG_W_IDX]) { 4888 atomic_add_long(&pmap->pm_stats. 4889 wired_count, -1); 4890 } 4891 } 4892 KKASSERT(*ptep == 0); 4893 } 4894 4895 #ifdef PMAP_DEBUG2 4896 if (pmap_enter_debug > 0) { 4897 --pmap_enter_debug; 4898 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 4899 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 4900 va, m, 4901 origpte, newpte, ptep, 4902 pte_pv, pt_pv, opa, prot); 4903 } 4904 #endif 4905 4906 if ((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 4907 /* 4908 * Entering an unmanaged page. We must wire the pt_pv unless 4909 * we retained the wiring from an unmanaged page we had 4910 * removed (if we retained it via pte_pv that will go away 4911 * soon). 4912 */ 4913 if (pt_pv && (opa == 0 || 4914 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]))) { 4915 vm_page_wire_quick(pt_pv->pv_m); 4916 } 4917 if (wired) 4918 atomic_add_long(&pmap->pm_stats.wired_count, 1); 4919 4920 /* 4921 * Unmanaged pages need manual resident_count tracking. 4922 */ 4923 if (pt_pv) { 4924 atomic_add_long(&pt_pv->pv_pmap->pm_stats. 4925 resident_count, 1); 4926 } 4927 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4928 vm_page_flag_set(m, PG_WRITEABLE); 4929 } else { 4930 /* 4931 * Entering a managed page. Our pte_pv takes care of the 4932 * PT wiring, so if we had removed an unmanaged page before 4933 * we must adjust. 4934 * 4935 * We have to take care of the pmap wired count ourselves. 4936 * 4937 * Enter on the PV list if part of our managed memory. 4938 */ 4939 KKASSERT(pte_pv && (pte_pv->pv_m == NULL || pte_pv->pv_m == m)); 4940 vm_page_spin_lock(m); 4941 pte_pv->pv_m = m; 4942 pmap_page_stats_adding(m); 4943 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); 4944 vm_page_flag_set(m, PG_MAPPED); 4945 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4946 vm_page_flag_set(m, PG_WRITEABLE); 4947 vm_page_spin_unlock(m); 4948 4949 if (pt_pv && opa && 4950 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 4951 vm_page_unwire_quick(pt_pv->pv_m); 4952 } 4953 4954 /* 4955 * Adjust pmap wired pages count for new entry. 4956 */ 4957 if (wired) { 4958 atomic_add_long(&pte_pv->pv_pmap->pm_stats. 4959 wired_count, 1); 4960 } 4961 } 4962 4963 /* 4964 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks. 4965 * 4966 * User VMAs do not because those will be zero->non-zero, so no 4967 * stale entries to worry about at this point. 4968 * 4969 * For KVM there appear to still be issues. Theoretically we 4970 * should be able to scrap the interlocks entirely but we 4971 * get crashes. 4972 */ 4973 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) { 4974 pmap_inval_smp(pmap, va, 1, ptep, newpte); 4975 } else { 4976 origpte = atomic_swap_long(ptep, newpte); 4977 if (origpte & pmap->pmap_bits[PG_M_IDX]) { 4978 kprintf("pmap [M] race @ %016jx\n", va); 4979 atomic_set_long(ptep, pmap->pmap_bits[PG_M_IDX]); 4980 } 4981 if (pt_pv == NULL) 4982 cpu_invlpg((void *)va); 4983 } 4984 4985 /* 4986 * Cleanup 4987 */ 4988 done: 4989 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 4990 (m->flags & PG_MAPPED)); 4991 4992 /* 4993 * Cleanup the pv entry, allowing other accessors. 
If the new page 4994 * is not managed but we have a pte_pv (which was locking our 4995 * operation), we can free it now. pte_pv->pv_m should be NULL. 4996 */ 4997 if (pte_pv && (newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 4998 pv_free(pte_pv, pt_pv); 4999 } else if (pte_pv) { 5000 pv_put(pte_pv); 5001 } else if (pte_placemark) { 5002 pv_placemarker_wakeup(pmap, pte_placemark); 5003 } 5004 if (pt_pv) 5005 pv_put(pt_pv); 5006 } 5007 5008 /* 5009 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 5010 * This code also assumes that the pmap has no pre-existing entry for this 5011 * VA. 5012 * 5013 * This code currently may only be used on user pmaps, not kernel_pmap. 5014 */ 5015 void 5016 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 5017 { 5018 pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL); 5019 } 5020 5021 /* 5022 * Make a temporary mapping for a physical address. This is only intended 5023 * to be used for panic dumps. 5024 * 5025 * The caller is responsible for calling smp_invltlb(). 5026 */ 5027 void * 5028 pmap_kenter_temporary(vm_paddr_t pa, long i) 5029 { 5030 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 5031 return ((void *)crashdumpmap); 5032 } 5033 5034 #define MAX_INIT_PT (96) 5035 5036 /* 5037 * This routine preloads the ptes for a given object into the specified pmap. 5038 * This eliminates the blast of soft faults on process startup and 5039 * immediately after an mmap. 5040 */ 5041 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 5042 5043 void 5044 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 5045 vm_object_t object, vm_pindex_t pindex, 5046 vm_size_t size, int limit) 5047 { 5048 struct rb_vm_page_scan_info info; 5049 struct lwp *lp; 5050 vm_size_t psize; 5051 5052 /* 5053 * We can't preinit if read access isn't set or there is no pmap 5054 * or object. 5055 */ 5056 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 5057 return; 5058 5059 /* 5060 * We can't preinit if the pmap is not the current pmap 5061 */ 5062 lp = curthread->td_lwp; 5063 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 5064 return; 5065 5066 /* 5067 * Misc additional checks 5068 */ 5069 psize = x86_64_btop(size); 5070 5071 if ((object->type != OBJT_VNODE) || 5072 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 5073 (object->resident_page_count > MAX_INIT_PT))) { 5074 return; 5075 } 5076 5077 if (pindex + psize > object->size) { 5078 if (object->size < pindex) 5079 return; 5080 psize = object->size - pindex; 5081 } 5082 5083 if (psize == 0) 5084 return; 5085 5086 /* 5087 * If everything is segment-aligned do not pre-init here. Instead 5088 * allow the normal vm_fault path to pass a segment hint to 5089 * pmap_enter() which will then use an object-referenced shared 5090 * page table page. 5091 */ 5092 if ((addr & SEG_MASK) == 0 && 5093 (ctob(psize) & SEG_MASK) == 0 && 5094 (ctob(pindex) & SEG_MASK) == 0) { 5095 return; 5096 } 5097 5098 /* 5099 * Use a red-black scan to traverse the requested range and load 5100 * any valid pages found into the pmap. 5101 * 5102 * We cannot safely scan the object's memq without holding the 5103 * object token. 
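	 *
	 *	 The scan below takes only a shared hold; the callback busies
	 *	 pages with vm_page_busy_try() and simply skips any page it
	 *	 cannot busy instantly.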
5104 */ 5105 info.start_pindex = pindex; 5106 info.end_pindex = pindex + psize - 1; 5107 info.limit = limit; 5108 info.mpte = NULL; 5109 info.addr = addr; 5110 info.pmap = pmap; 5111 5112 vm_object_hold_shared(object); 5113 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 5114 pmap_object_init_pt_callback, &info); 5115 vm_object_drop(object); 5116 } 5117 5118 static 5119 int 5120 pmap_object_init_pt_callback(vm_page_t p, void *data) 5121 { 5122 struct rb_vm_page_scan_info *info = data; 5123 vm_pindex_t rel_index; 5124 5125 /* 5126 * don't allow an madvise to blow away our really 5127 * free pages allocating pv entries. 5128 */ 5129 if ((info->limit & MAP_PREFAULT_MADVISE) && 5130 vmstats.v_free_count < vmstats.v_free_reserved) { 5131 return(-1); 5132 } 5133 5134 /* 5135 * Ignore list markers and ignore pages we cannot instantly 5136 * busy (while holding the object token). 5137 */ 5138 if (p->flags & PG_MARKER) 5139 return 0; 5140 if (vm_page_busy_try(p, TRUE)) 5141 return 0; 5142 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 5143 (p->flags & PG_FICTITIOUS) == 0) { 5144 if ((p->queue - p->pc) == PQ_CACHE) 5145 vm_page_deactivate(p); 5146 rel_index = p->pindex - info->start_pindex; 5147 pmap_enter_quick(info->pmap, 5148 info->addr + x86_64_ptob(rel_index), p); 5149 } 5150 vm_page_wakeup(p); 5151 lwkt_yield(); 5152 return(0); 5153 } 5154 5155 /* 5156 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 5157 * address. 5158 * 5159 * Returns FALSE if it would be non-trivial or if a pte is already loaded 5160 * into the slot. 5161 * 5162 * XXX This is safe only because page table pages are not freed. 5163 */ 5164 int 5165 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 5166 { 5167 pt_entry_t *pte; 5168 5169 /*spin_lock(&pmap->pm_spin);*/ 5170 if ((pte = pmap_pte(pmap, addr)) != NULL) { 5171 if (*pte & pmap->pmap_bits[PG_V_IDX]) { 5172 /*spin_unlock(&pmap->pm_spin);*/ 5173 return FALSE; 5174 } 5175 } 5176 /*spin_unlock(&pmap->pm_spin);*/ 5177 return TRUE; 5178 } 5179 5180 /* 5181 * Change the wiring attribute for a pmap/va pair. The mapping must already 5182 * exist in the pmap. The mapping may or may not be managed. The wiring in 5183 * the page is not changed, the page is returned so the caller can adjust 5184 * its wiring (the page is not locked in any way). 5185 * 5186 * Wiring is not a hardware characteristic so there is no need to invalidate 5187 * TLB. However, in an SMP environment we must use a locked bus cycle to 5188 * update the pte (if we are not using the pmap_inval_*() API that is)... 5189 * it's ok to do this for simple wiring changes. 
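 *
 * A minimal caller sketch (hypothetical, for illustration only):
 *
 *	m = pmap_unwire(pmap, va);
 *	if (m) {
 *		/* adjust m's own wire count here; m is returned */
 *		/* neither busied nor locked by pmap_unwire()     */
 *	}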
5190 */ 5191 vm_page_t 5192 pmap_unwire(pmap_t pmap, vm_offset_t va) 5193 { 5194 pt_entry_t *ptep; 5195 pv_entry_t pt_pv; 5196 vm_paddr_t pa; 5197 vm_page_t m; 5198 5199 if (pmap == NULL) 5200 return NULL; 5201 5202 /* 5203 * Assume elements in the kernel pmap are stable 5204 */ 5205 if (pmap == &kernel_pmap) { 5206 if (pmap_pt(pmap, va) == 0) 5207 return NULL; 5208 ptep = pmap_pte_quick(pmap, va); 5209 if (pmap_pte_v(pmap, ptep)) { 5210 if (pmap_pte_w(pmap, ptep)) 5211 atomic_add_long(&pmap->pm_stats.wired_count,-1); 5212 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5213 pa = *ptep & PG_FRAME; 5214 m = PHYS_TO_VM_PAGE(pa); 5215 } else { 5216 m = NULL; 5217 } 5218 } else { 5219 /* 5220 * We can only [un]wire pmap-local pages (we cannot wire 5221 * shared pages) 5222 */ 5223 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 5224 if (pt_pv == NULL) 5225 return NULL; 5226 5227 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 5228 if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) { 5229 pv_put(pt_pv); 5230 return NULL; 5231 } 5232 5233 if (pmap_pte_w(pmap, ptep)) { 5234 atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count, 5235 -1); 5236 } 5237 /* XXX else return NULL so caller doesn't unwire m ? */ 5238 5239 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5240 5241 pa = *ptep & PG_FRAME; 5242 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 5243 pv_put(pt_pv); 5244 } 5245 return m; 5246 } 5247 5248 /* 5249 * Copy the range specified by src_addr/len from the source map to 5250 * the range dst_addr/len in the destination map. 5251 * 5252 * This routine is only advisory and need not do anything. 5253 */ 5254 void 5255 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 5256 vm_size_t len, vm_offset_t src_addr) 5257 { 5258 } 5259 5260 /* 5261 * pmap_zero_page: 5262 * 5263 * Zero the specified physical page. 5264 * 5265 * This function may be called from an interrupt and no locking is 5266 * required. 5267 */ 5268 void 5269 pmap_zero_page(vm_paddr_t phys) 5270 { 5271 vm_offset_t va = PHYS_TO_DMAP(phys); 5272 5273 pagezero((void *)va); 5274 } 5275 5276 /* 5277 * pmap_zero_page: 5278 * 5279 * Zero part of a physical page by mapping it into memory and clearing 5280 * its contents with bzero. 5281 * 5282 * off and size may not cover an area beyond a single hardware page. 5283 */ 5284 void 5285 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 5286 { 5287 vm_offset_t virt = PHYS_TO_DMAP(phys); 5288 5289 bzero((char *)virt + off, size); 5290 } 5291 5292 /* 5293 * pmap_copy_page: 5294 * 5295 * Copy the physical page from the source PA to the target PA. 5296 * This function may be called from an interrupt. No locking 5297 * is required. 5298 */ 5299 void 5300 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 5301 { 5302 vm_offset_t src_virt, dst_virt; 5303 5304 src_virt = PHYS_TO_DMAP(src); 5305 dst_virt = PHYS_TO_DMAP(dst); 5306 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 5307 } 5308 5309 /* 5310 * pmap_copy_page_frag: 5311 * 5312 * Copy the physical page from the source PA to the target PA. 5313 * This function may be called from an interrupt. No locking 5314 * is required. 
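 *
 *	NOTE: Unlike pmap_copy_page(), the low bits of src and dst are
 *	      honored as intra-page offsets and only 'bytes' bytes are
 *	      copied (see the PAGE_MASK arithmetic below).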
5315 */ 5316 void 5317 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 5318 { 5319 vm_offset_t src_virt, dst_virt; 5320 5321 src_virt = PHYS_TO_DMAP(src); 5322 dst_virt = PHYS_TO_DMAP(dst); 5323 5324 bcopy((char *)src_virt + (src & PAGE_MASK), 5325 (char *)dst_virt + (dst & PAGE_MASK), 5326 bytes); 5327 } 5328 5329 /* 5330 * Returns true if the pmap's pv is one of the first 16 pvs linked to from 5331 * this page. This count may be changed upwards or downwards in the future; 5332 * it is only necessary that true be returned for a small subset of pmaps 5333 * for proper page aging. 5334 */ 5335 boolean_t 5336 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5337 { 5338 pv_entry_t pv; 5339 int loops = 0; 5340 5341 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5342 return FALSE; 5343 5344 vm_page_spin_lock(m); 5345 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5346 if (pv->pv_pmap == pmap) { 5347 vm_page_spin_unlock(m); 5348 return TRUE; 5349 } 5350 loops++; 5351 if (loops >= 16) 5352 break; 5353 } 5354 vm_page_spin_unlock(m); 5355 return (FALSE); 5356 } 5357 5358 /* 5359 * Remove all pages from specified address space this aids process exit 5360 * speeds. Also, this code may be special cased for the current process 5361 * only. 5362 */ 5363 void 5364 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5365 { 5366 pmap_remove_noinval(pmap, sva, eva); 5367 cpu_invltlb(); 5368 } 5369 5370 /* 5371 * pmap_testbit tests bits in pte's note that the testbit/clearbit 5372 * routines are inline, and a lot of things compile-time evaluate. 5373 */ 5374 static 5375 boolean_t 5376 pmap_testbit(vm_page_t m, int bit) 5377 { 5378 pv_entry_t pv; 5379 pt_entry_t *pte; 5380 pmap_t pmap; 5381 5382 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5383 return FALSE; 5384 5385 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 5386 return FALSE; 5387 vm_page_spin_lock(m); 5388 if (TAILQ_FIRST(&m->md.pv_list) == NULL) { 5389 vm_page_spin_unlock(m); 5390 return FALSE; 5391 } 5392 5393 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5394 5395 #if defined(PMAP_DIAGNOSTIC) 5396 if (pv->pv_pmap == NULL) { 5397 kprintf("Null pmap (tb) at pindex: %"PRIu64"\n", 5398 pv->pv_pindex); 5399 continue; 5400 } 5401 #endif 5402 pmap = pv->pv_pmap; 5403 5404 /* 5405 * If the bit being tested is the modified bit, then 5406 * mark clean_map and ptes as never 5407 * modified. 5408 * 5409 * WARNING! Because we do not lock the pv, *pte can be in a 5410 * state of flux. Despite this the value of *pte 5411 * will still be related to the vm_page in some way 5412 * because the pv cannot be destroyed as long as we 5413 * hold the vm_page spin lock. 5414 */ 5415 if (bit == PG_A_IDX || bit == PG_M_IDX) { 5416 //& (pmap->pmap_bits[PG_A_IDX] | pmap->pmap_bits[PG_M_IDX])) { 5417 if (!pmap_track_modified(pv->pv_pindex)) 5418 continue; 5419 } 5420 5421 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5422 if (*pte & pmap->pmap_bits[bit]) { 5423 vm_page_spin_unlock(m); 5424 return TRUE; 5425 } 5426 } 5427 vm_page_spin_unlock(m); 5428 return (FALSE); 5429 } 5430 5431 /* 5432 * This routine is used to modify bits in ptes. Only one bit should be 5433 * specified. PG_RW requires special handling. 
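 *
 * PG_RW is special because clearing it must also atomically pick up any
 * pending PG_M state: each pv is locked and pmap_inval_smp_cmpset() clears
 * PG_RW and PG_M together, dirtying the vm_page if PG_M was set (see the
 * restart loop below).  The other bits are cleared with a simple atomic op
 * under the page spin lock.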
5434 * 5435 * Caller must NOT hold any spin locks 5436 */ 5437 static __inline 5438 void 5439 pmap_clearbit(vm_page_t m, int bit_index) 5440 { 5441 pv_entry_t pv; 5442 pt_entry_t *pte; 5443 pt_entry_t pbits; 5444 pmap_t pmap; 5445 5446 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 5447 if (bit_index == PG_RW_IDX) 5448 vm_page_flag_clear(m, PG_WRITEABLE); 5449 return; 5450 } 5451 5452 /* 5453 * PG_M or PG_A case 5454 * 5455 * Loop over all current mappings setting/clearing as appropos If 5456 * setting RO do we need to clear the VAC? 5457 * 5458 * NOTE: When clearing PG_M we could also (not implemented) drop 5459 * through to the PG_RW code and clear PG_RW too, forcing 5460 * a fault on write to redetect PG_M for virtual kernels, but 5461 * it isn't necessary since virtual kernels invalidate the 5462 * pte when they clear the VPTE_M bit in their virtual page 5463 * tables. 5464 * 5465 * NOTE: Does not re-dirty the page when clearing only PG_M. 5466 * 5467 * NOTE: Because we do not lock the pv, *pte can be in a state of 5468 * flux. Despite this the value of *pte is still somewhat 5469 * related while we hold the vm_page spin lock. 5470 * 5471 * *pte can be zero due to this race. Since we are clearing 5472 * bits we basically do no harm when this race occurs. 5473 */ 5474 if (bit_index != PG_RW_IDX) { 5475 vm_page_spin_lock(m); 5476 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5477 #if defined(PMAP_DIAGNOSTIC) 5478 if (pv->pv_pmap == NULL) { 5479 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n", 5480 pv->pv_pindex); 5481 continue; 5482 } 5483 #endif 5484 pmap = pv->pv_pmap; 5485 pte = pmap_pte_quick(pv->pv_pmap, 5486 pv->pv_pindex << PAGE_SHIFT); 5487 pbits = *pte; 5488 if (pbits & pmap->pmap_bits[bit_index]) 5489 atomic_clear_long(pte, pmap->pmap_bits[bit_index]); 5490 } 5491 vm_page_spin_unlock(m); 5492 return; 5493 } 5494 5495 /* 5496 * Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M 5497 * was set. 5498 */ 5499 restart: 5500 vm_page_spin_lock(m); 5501 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5502 /* 5503 * don't write protect pager mappings 5504 */ 5505 if (!pmap_track_modified(pv->pv_pindex)) 5506 continue; 5507 5508 #if defined(PMAP_DIAGNOSTIC) 5509 if (pv->pv_pmap == NULL) { 5510 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n", 5511 pv->pv_pindex); 5512 continue; 5513 } 5514 #endif 5515 pmap = pv->pv_pmap; 5516 5517 /* 5518 * Skip pages which do not have PG_RW set. 5519 */ 5520 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5521 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0) 5522 continue; 5523 5524 /* 5525 * We must lock the PV to be able to safely test the pte. 5526 */ 5527 if (pv_hold_try(pv)) { 5528 vm_page_spin_unlock(m); 5529 } else { 5530 vm_page_spin_unlock(m); 5531 pv_lock(pv); /* held, now do a blocking lock */ 5532 pv_put(pv); 5533 goto restart; 5534 } 5535 5536 /* 5537 * Reload pte after acquiring pv. 
5538 */ 5539 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5540 #if 0 5541 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0) { 5542 pv_put(pv); 5543 goto restart; 5544 } 5545 #endif 5546 5547 KKASSERT(pv->pv_pmap == pmap && pv->pv_m == m); 5548 for (;;) { 5549 pt_entry_t nbits; 5550 5551 pbits = *pte; 5552 cpu_ccfence(); 5553 nbits = pbits & ~(pmap->pmap_bits[PG_RW_IDX] | 5554 pmap->pmap_bits[PG_M_IDX]); 5555 if (pmap_inval_smp_cmpset(pmap, 5556 ((vm_offset_t)pv->pv_pindex << PAGE_SHIFT), 5557 pte, pbits, nbits)) { 5558 break; 5559 } 5560 cpu_pause(); 5561 } 5562 5563 /* 5564 * If PG_M was found to be set while we were clearing PG_RW 5565 * we also clear PG_M (done above) and mark the page dirty. 5566 * Callers expect this behavior. 5567 * 5568 * we lost pv so it cannot be used as an iterator. In fact, 5569 * because we couldn't necessarily lock it atomically it may 5570 * have moved within the list and ALSO cannot be used as an 5571 * iterator. 5572 */ 5573 vm_page_spin_lock(m); 5574 if (pbits & pmap->pmap_bits[PG_M_IDX]) 5575 vm_page_dirty(m); 5576 vm_page_spin_unlock(m); 5577 pv_put(pv); 5578 goto restart; 5579 } 5580 if (bit_index == PG_RW_IDX) 5581 vm_page_flag_clear(m, PG_WRITEABLE); 5582 vm_page_spin_unlock(m); 5583 } 5584 5585 /* 5586 * Lower the permission for all mappings to a given page. 5587 * 5588 * Page must be busied by caller. Because page is busied by caller this 5589 * should not be able to race a pmap_enter(). 5590 */ 5591 void 5592 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5593 { 5594 /* JG NX support? */ 5595 if ((prot & VM_PROT_WRITE) == 0) { 5596 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5597 /* 5598 * NOTE: pmap_clearbit(.. PG_RW) also clears 5599 * the PG_WRITEABLE flag in (m). 5600 */ 5601 pmap_clearbit(m, PG_RW_IDX); 5602 } else { 5603 pmap_remove_all(m); 5604 } 5605 } 5606 } 5607 5608 vm_paddr_t 5609 pmap_phys_address(vm_pindex_t ppn) 5610 { 5611 return (x86_64_ptob(ppn)); 5612 } 5613 5614 /* 5615 * Return a count of reference bits for a page, clearing those bits. 5616 * It is not necessary for every reference bit to be cleared, but it 5617 * is necessary that 0 only be returned when there are truly no 5618 * reference bits set. 5619 * 5620 * XXX: The exact number of bits to check and clear is a matter that 5621 * should be tested and standardized at some point in the future for 5622 * optimal aging of shared pages. 5623 * 5624 * This routine may not block. 5625 */ 5626 int 5627 pmap_ts_referenced(vm_page_t m) 5628 { 5629 pv_entry_t pv; 5630 pt_entry_t *pte; 5631 pmap_t pmap; 5632 int rtval = 0; 5633 5634 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5635 return (rtval); 5636 5637 vm_page_spin_lock(m); 5638 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5639 if (!pmap_track_modified(pv->pv_pindex)) 5640 continue; 5641 pmap = pv->pv_pmap; 5642 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5643 if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) { 5644 atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]); 5645 rtval++; 5646 if (rtval > 4) 5647 break; 5648 } 5649 } 5650 vm_page_spin_unlock(m); 5651 return (rtval); 5652 } 5653 5654 /* 5655 * pmap_is_modified: 5656 * 5657 * Return whether or not the specified physical page was modified 5658 * in any physical maps. 5659 */ 5660 boolean_t 5661 pmap_is_modified(vm_page_t m) 5662 { 5663 boolean_t res; 5664 5665 res = pmap_testbit(m, PG_M_IDX); 5666 return (res); 5667 } 5668 5669 /* 5670 * Clear the modify bits on the specified physical page. 
5671 */ 5672 void 5673 pmap_clear_modify(vm_page_t m) 5674 { 5675 pmap_clearbit(m, PG_M_IDX); 5676 } 5677 5678 /* 5679 * pmap_clear_reference: 5680 * 5681 * Clear the reference bit on the specified physical page. 5682 */ 5683 void 5684 pmap_clear_reference(vm_page_t m) 5685 { 5686 pmap_clearbit(m, PG_A_IDX); 5687 } 5688 5689 /* 5690 * Miscellaneous support routines follow 5691 */ 5692 5693 static 5694 void 5695 i386_protection_init(void) 5696 { 5697 uint64_t *kp; 5698 int prot; 5699 5700 /* 5701 * NX supported? (boot time loader.conf override only) 5702 */ 5703 TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable); 5704 if (pmap_nx_enable == 0 || (amd_feature & AMDID_NX) == 0) 5705 pmap_bits_default[PG_NX_IDX] = 0; 5706 5707 /* 5708 * 0 is basically read-only access, but also set the NX (no-execute) 5709 * bit when VM_PROT_EXECUTE is not specified. 5710 */ 5711 kp = protection_codes; 5712 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5713 switch (prot) { 5714 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5715 /* 5716 * This case handled elsewhere 5717 */ 5718 *kp++ = 0; 5719 break; 5720 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5721 /* 5722 * Read-only is 0|NX 5723 */ 5724 *kp++ = pmap_bits_default[PG_NX_IDX]; 5725 break; 5726 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5727 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5728 /* 5729 * Execute requires read access 5730 */ 5731 *kp++ = 0; 5732 break; 5733 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5734 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5735 /* 5736 * Write without execute is RW|NX 5737 */ 5738 *kp++ = pmap_bits_default[PG_RW_IDX] | 5739 pmap_bits_default[PG_NX_IDX]; 5740 break; 5741 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5742 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5743 /* 5744 * Write with execute is RW 5745 */ 5746 *kp++ = pmap_bits_default[PG_RW_IDX]; 5747 break; 5748 } 5749 } 5750 } 5751 5752 /* 5753 * Map a set of physical memory pages into the kernel virtual 5754 * address space. Return a pointer to where it is mapped. This 5755 * routine is intended to be used for mapping device memory, 5756 * NOT real memory. 5757 * 5758 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5759 * a time. 5760 * 5761 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5762 * work whether the cpu supports PAT or not. The remaining PAT 5763 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5764 * supports PAT. 5765 */ 5766 void * 5767 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5768 { 5769 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5770 } 5771 5772 void * 5773 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5774 { 5775 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5776 } 5777 5778 void * 5779 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5780 { 5781 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5782 } 5783 5784 /* 5785 * Map a set of physical memory pages into the kernel virtual 5786 * address space. Return a pointer to where it is mapped. This 5787 * routine is intended to be used for mapping device memory, 5788 * NOT real memory. 
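 *
 * Hypothetical usage sketch (names are illustrative only):
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_UNCACHEABLE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);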
5789 */ 5790 void * 5791 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5792 { 5793 vm_offset_t va, tmpva, offset; 5794 pt_entry_t *pte; 5795 vm_size_t tmpsize; 5796 5797 offset = pa & PAGE_MASK; 5798 size = roundup(offset + size, PAGE_SIZE); 5799 5800 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 5801 if (va == 0) 5802 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5803 5804 pa = pa & ~PAGE_MASK; 5805 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 5806 pte = vtopte(tmpva); 5807 *pte = pa | 5808 kernel_pmap.pmap_bits[PG_RW_IDX] | 5809 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 5810 kernel_pmap.pmap_cache_bits[mode]; 5811 tmpsize -= PAGE_SIZE; 5812 tmpva += PAGE_SIZE; 5813 pa += PAGE_SIZE; 5814 } 5815 pmap_invalidate_range(&kernel_pmap, va, va + size); 5816 pmap_invalidate_cache_range(va, va + size); 5817 5818 return ((void *)(va + offset)); 5819 } 5820 5821 void 5822 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5823 { 5824 vm_offset_t base, offset; 5825 5826 base = va & ~PAGE_MASK; 5827 offset = va & PAGE_MASK; 5828 size = roundup(offset + size, PAGE_SIZE); 5829 pmap_qremove(va, size >> PAGE_SHIFT); 5830 kmem_free(&kernel_map, base, size); 5831 } 5832 5833 /* 5834 * Sets the memory attribute for the specified page. 5835 */ 5836 void 5837 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5838 { 5839 5840 m->pat_mode = ma; 5841 5842 /* 5843 * If "m" is a normal page, update its direct mapping. This update 5844 * can be relied upon to perform any cache operations that are 5845 * required for data coherence. 5846 */ 5847 if ((m->flags & PG_FICTITIOUS) == 0) 5848 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 5849 } 5850 5851 /* 5852 * Change the PAT attribute on an existing kernel memory map. Caller 5853 * must ensure that the virtual memory in question is not accessed 5854 * during the adjustment. 5855 */ 5856 void 5857 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 5858 { 5859 pt_entry_t *pte; 5860 vm_offset_t base; 5861 int changed = 0; 5862 5863 if (va == 0) 5864 panic("pmap_change_attr: va is NULL"); 5865 base = trunc_page(va); 5866 5867 while (count) { 5868 pte = vtopte(va); 5869 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) | 5870 kernel_pmap.pmap_cache_bits[mode]; 5871 --count; 5872 va += PAGE_SIZE; 5873 } 5874 5875 changed = 1; /* XXX: not optimal */ 5876 5877 /* 5878 * Flush CPU caches if required to make sure any data isn't cached that 5879 * shouldn't be, etc. 
5880 */ 5881 if (changed) { 5882 pmap_invalidate_range(&kernel_pmap, base, va); 5883 pmap_invalidate_cache_range(base, va); 5884 } 5885 } 5886 5887 /* 5888 * perform the pmap work for mincore 5889 */ 5890 int 5891 pmap_mincore(pmap_t pmap, vm_offset_t addr) 5892 { 5893 pt_entry_t *ptep, pte; 5894 vm_page_t m; 5895 int val = 0; 5896 5897 ptep = pmap_pte(pmap, addr); 5898 5899 if (ptep && (pte = *ptep) != 0) { 5900 vm_offset_t pa; 5901 5902 val = MINCORE_INCORE; 5903 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) 5904 goto done; 5905 5906 pa = pte & PG_FRAME; 5907 5908 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) 5909 m = NULL; 5910 else 5911 m = PHYS_TO_VM_PAGE(pa); 5912 5913 /* 5914 * Modified by us 5915 */ 5916 if (pte & pmap->pmap_bits[PG_M_IDX]) 5917 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 5918 /* 5919 * Modified by someone 5920 */ 5921 else if (m && (m->dirty || pmap_is_modified(m))) 5922 val |= MINCORE_MODIFIED_OTHER; 5923 /* 5924 * Referenced by us 5925 */ 5926 if (pte & pmap->pmap_bits[PG_A_IDX]) 5927 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 5928 5929 /* 5930 * Referenced by someone 5931 */ 5932 else if (m && ((m->flags & PG_REFERENCED) || 5933 pmap_ts_referenced(m))) { 5934 val |= MINCORE_REFERENCED_OTHER; 5935 vm_page_flag_set(m, PG_REFERENCED); 5936 } 5937 } 5938 done: 5939 5940 return val; 5941 } 5942 5943 /* 5944 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 5945 * vmspace will be ref'd and the old one will be deref'd. 5946 * 5947 * The vmspace for all lwps associated with the process will be adjusted 5948 * and cr3 will be reloaded if any lwp is the current lwp. 5949 * 5950 * The process must hold the vmspace->vm_map.token for oldvm and newvm 5951 */ 5952 void 5953 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 5954 { 5955 struct vmspace *oldvm; 5956 struct lwp *lp; 5957 5958 oldvm = p->p_vmspace; 5959 if (oldvm != newvm) { 5960 if (adjrefs) 5961 vmspace_ref(newvm); 5962 p->p_vmspace = newvm; 5963 KKASSERT(p->p_nthreads == 1); 5964 lp = RB_ROOT(&p->p_lwp_tree); 5965 pmap_setlwpvm(lp, newvm); 5966 if (adjrefs) 5967 vmspace_rel(oldvm); 5968 } 5969 } 5970 5971 /* 5972 * Set the vmspace for a LWP. The vmspace is almost universally set the 5973 * same as the process vmspace, but virtual kernels need to swap out contexts 5974 * on a per-lwp basis. 5975 * 5976 * Caller does not necessarily hold any vmspace tokens. Caller must control 5977 * the lwp (typically be in the context of the lwp). We use a critical 5978 * section to protect against statclock and hardclock (statistics collection). 
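 *
 * For reference, pcb_cr3 below is selected from the pmap type:
 *
 *	REGULAR_PMAP -> vtophys(pmap->pm_pml4)
 *	EPT_PMAP     -> KPML4phys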
5979 */ 5980 void 5981 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 5982 { 5983 struct vmspace *oldvm; 5984 struct pmap *pmap; 5985 5986 oldvm = lp->lwp_vmspace; 5987 5988 if (oldvm != newvm) { 5989 crit_enter(); 5990 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 5991 lp->lwp_vmspace = newvm; 5992 if (curthread->td_lwp == lp) { 5993 pmap = vmspace_pmap(newvm); 5994 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 5995 if (pmap->pm_active_lock & CPULOCK_EXCL) 5996 pmap_interlock_wait(newvm); 5997 #if defined(SWTCH_OPTIM_STATS) 5998 tlb_flush_count++; 5999 #endif 6000 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 6001 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 6002 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 6003 curthread->td_pcb->pcb_cr3 = KPML4phys; 6004 } else { 6005 panic("pmap_setlwpvm: unknown pmap type\n"); 6006 } 6007 load_cr3(curthread->td_pcb->pcb_cr3); 6008 pmap = vmspace_pmap(oldvm); 6009 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 6010 mycpu->gd_cpuid); 6011 } 6012 crit_exit(); 6013 } 6014 } 6015 6016 /* 6017 * Called when switching to a locked pmap, used to interlock against pmaps 6018 * undergoing modifications to prevent us from activating the MMU for the 6019 * target pmap until all such modifications have completed. We have to do 6020 * this because the thread making the modifications has already set up its 6021 * SMP synchronization mask. 6022 * 6023 * This function cannot sleep! 6024 * 6025 * No requirements. 6026 */ 6027 void 6028 pmap_interlock_wait(struct vmspace *vm) 6029 { 6030 struct pmap *pmap = &vm->vm_pmap; 6031 6032 if (pmap->pm_active_lock & CPULOCK_EXCL) { 6033 crit_enter(); 6034 KKASSERT(curthread->td_critcount >= 2); 6035 DEBUG_PUSH_INFO("pmap_interlock_wait"); 6036 while (pmap->pm_active_lock & CPULOCK_EXCL) { 6037 cpu_ccfence(); 6038 lwkt_process_ipiq(); 6039 } 6040 DEBUG_POP_INFO(); 6041 crit_exit(); 6042 } 6043 } 6044 6045 vm_offset_t 6046 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 6047 { 6048 6049 if ((obj == NULL) || (size < NBPDR) || 6050 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { 6051 return addr; 6052 } 6053 6054 addr = roundup2(addr, NBPDR); 6055 return addr; 6056 } 6057 6058 /* 6059 * Used by kmalloc/kfree, page already exists at va 6060 */ 6061 vm_page_t 6062 pmap_kvtom(vm_offset_t va) 6063 { 6064 pt_entry_t *ptep = vtopte(va); 6065 6066 KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0); 6067 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 6068 } 6069 6070 /* 6071 * Initialize machine-specific shared page directory support. This 6072 * is executed when a VM object is created. 6073 */ 6074 void 6075 pmap_object_init(vm_object_t object) 6076 { 6077 object->md.pmap_rw = NULL; 6078 object->md.pmap_ro = NULL; 6079 } 6080 6081 /* 6082 * Clean up machine-specific shared page directory support. This 6083 * is executed when a VM object is destroyed. 

/*
 * Clean up machine-specific shared page directory support.  This
 * is executed when a VM object is destroyed.
 */
void
pmap_object_free(vm_object_t object)
{
	pmap_t pmap;

	if ((pmap = object->md.pmap_rw) != NULL) {
		object->md.pmap_rw = NULL;
		pmap_remove_noinval(pmap,
				    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
		CPUMASK_ASSZERO(pmap->pm_active);
		pmap_release(pmap);
		pmap_puninit(pmap);
		kfree(pmap, M_OBJPMAP);
	}
	if ((pmap = object->md.pmap_ro) != NULL) {
		object->md.pmap_ro = NULL;
		pmap_remove_noinval(pmap,
				    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
		CPUMASK_ASSZERO(pmap->pm_active);
		pmap_release(pmap);
		pmap_puninit(pmap);
		kfree(pmap, M_OBJPMAP);
	}
}

/*
 * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related
 *			  VM page and issue a pginfo->callback.
 *
 * We are expected to dispose of any non-NULL pte_pv.
 */
static
void
pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info,
		     pv_entry_t pte_pv, vm_pindex_t *pte_placemark,
		     pv_entry_t pt_pv, int sharept,
		     vm_offset_t va, pt_entry_t *ptep, void *arg)
{
	struct pmap_pgscan_info *pginfo = arg;
	vm_page_t m;

	if (pte_pv) {
		/*
		 * Try to busy the page while we hold the pte_pv locked.
		 */
		KKASSERT(pte_pv->pv_m);
		m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME);
		if (vm_page_busy_try(m, TRUE) == 0) {
			if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) {
				/*
				 * The callback is issued with the pte_pv
				 * unlocked and put away, and the pt_pv
				 * unlocked.
				 */
				pv_put(pte_pv);
				if (pt_pv) {
					vm_page_wire_quick(pt_pv->pv_m);
					pv_unlock(pt_pv);
				}
				if (pginfo->callback(pginfo, va, m) < 0)
					info->stop = 1;
				if (pt_pv) {
					pv_lock(pt_pv);
					vm_page_unwire_quick(pt_pv->pv_m);
				}
			} else {
				vm_page_wakeup(m);
				pv_put(pte_pv);
			}
		} else {
			++pginfo->busycount;
			pv_put(pte_pv);
		}
	} else {
		/*
		 * Shared page table or unmanaged page (sharept or !sharept)
		 */
		pv_placemarker_wakeup(pmap, pte_placemark);
	}
}

void
pmap_pgscan(struct pmap_pgscan_info *pginfo)
{
	struct pmap_scan_info info;

	pginfo->offset = pginfo->beg_addr;
	info.pmap = pginfo->pmap;
	info.sva = pginfo->beg_addr;
	info.eva = pginfo->end_addr;
	info.func = pmap_pgscan_callback;
	info.arg = pginfo;
	pmap_scan(&info, 0);
	if (info.stop == 0)
		pginfo->offset = pginfo->end_addr;
}

/*
 * Wait for a placemarker that we do not own to clear.  The placemarker
 * in question is not necessarily set to the pindex we want; we may have
 * to wait on the element because we want to reserve it ourselves.
 *
 * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in
 *	 PM_NOPLACEMARK, so it does not interfere with placemarks
 *	 which have already been woken up.
 */
static
void
pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark)
{
	if (*pmark != PM_NOPLACEMARK) {
		atomic_set_long(pmark, PM_PLACEMARK_WAKEUP);
		tsleep_interlock(pmark, 0);
		if (*pmark != PM_NOPLACEMARK)
			tsleep(pmark, PINTERLOCKED, "pvplw", 0);
	}
}
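
/*
 * Illustrative sketch only, kept out of the build with #if 0: the intended
 * pairing of pv_placemarker_wait() and pv_placemarker_wakeup().  A scanner
 * that loses the race for a pindex sleeps in pv_placemarker_wait(); the
 * owner clears the mark with pv_placemarker_wakeup() when it is done.
 * try_place_marker() stands in for the atomic compare-and-swap the real
 * callers use to claim the placemark and is purely hypothetical.
 */
#if 0
static void
example_placemark_contention(pmap_t pmap, vm_pindex_t *pmark, vm_pindex_t pindex)
{
	while (try_place_marker(pmark, pindex) == 0) {
		/* someone else owns the mark; wait for them to wake us */
		pv_placemarker_wait(pmap, pmark);
	}

	/* ... operate on pindex while holding the placemark ... */

	pv_placemarker_wakeup(pmap, pmark);
}
#endif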

/*
 * Wakeup a placemarker that we own.  Replace the entry with
 * PM_NOPLACEMARK and issue a wakeup() if necessary.
 */
static
void
pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark)
{
	vm_pindex_t pindex;

	pindex = atomic_swap_long(pmark, PM_NOPLACEMARK);
	KKASSERT(pindex != PM_NOPLACEMARK);
	if (pindex & PM_PLACEMARK_WAKEUP)
		wakeup(pmark);
}
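
/*
 * Illustrative sketch only, kept out of the build with #if 0: driving
 * pmap_pgscan() over a user address range with a trivial callback.  The
 * callback signature matches the pginfo->callback invocation in
 * pmap_pgscan_callback() above; example_page_count and the assumption that
 * the callback must wake up the busied page it is handed are illustrative
 * only, not a statement about existing in-tree callers.
 */
#if 0
static long example_page_count;

static int
example_pgscan_callback(struct pmap_pgscan_info *pginfo, vm_offset_t va,
			vm_page_t m)
{
	++example_page_count;
	vm_page_wakeup(m);	/* callback receives the page busied */
	return 0;		/* a negative return stops the scan */
}

static void
example_pgscan(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pgscan_info pginfo;

	bzero(&pginfo, sizeof(pginfo));
	pginfo.pmap = pmap;
	pginfo.beg_addr = sva;
	pginfo.end_addr = eva;
	pginfo.callback = example_pgscan_callback;
	pmap_pgscan(&pginfo);
}
#endif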