/*
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * Copyright (c) 2011-2012 Matthew Dillon
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Manage physical address maps for x86-64 systems.
 */
#if 0 /* JG */
#include "opt_disable_pse.h"
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/inttypes.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 2000
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * pmap debugging will report who owns a pv lock when blocking.
 */
#ifdef PMAP_DEBUG

#define PMAP_DEBUG_DECL		,const char *func, int lineno
#define PMAP_DEBUG_ARGS		, __func__, __LINE__
#define PMAP_DEBUG_COPY		, func, lineno

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp	\
							PMAP_DEBUG_ARGS)
#define pv_lock(pv)			_pv_lock(pv			\
							PMAP_DEBUG_ARGS)
#define pv_hold_try(pv)			_pv_hold_try(pv			\
							PMAP_DEBUG_ARGS)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
							PMAP_DEBUG_ARGS)

#define pv_free(pv, pvp)		_pv_free(pv, pvp PMAP_DEBUG_ARGS)

#else

#define PMAP_DEBUG_DECL
#define PMAP_DEBUG_ARGS
#define PMAP_DEBUG_COPY

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp)
#define pv_lock(pv)			_pv_lock(pv)
#define pv_hold_try(pv)			_pv_hold_try(pv)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)
#define pv_free(pv, pvp)		_pv_free(pv, pvp)

#endif

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pmap, pte)	\
	((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
#define pmap_pte_w(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
#define pmap_pte_m(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
#define pmap_pte_u(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
#define pmap_pte_v(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)

/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static int protection_codes[PROTECTION_CODES_SIZE];

struct pmap kernel_pmap;

MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects");

vm_paddr_t avail_start;		/* PA of first available physical page */
vm_paddr_t avail_end;		/* PA of last available physical page */
vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
vm_offset_t virtual2_end;
vm_offset_t virtual_start;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t KvaStart;		/* VA start of KVA space */
vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
vm_offset_t KvaSize;		/* max size of kernel virtual address space */
static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
//static int pgeflag;		/* PG_G or-in */
//static int pseflag;		/* PG_PS or-in */
uint64_t PatMsr;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */
/*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/	/* PAT -> PG_ bits */

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_max = 0, pv_entry_high_water = 0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL, *ptmmap;
caddr_t CADDR1 = NULL, ptvmmap = NULL;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp = NULL;

/*
 * PMAP default PG_* bits.  Needed to be able to add
 * EPT/NPT pagetable pmap_bits for the VMM module
 */
uint64_t pmap_bits_default[] = {
	REGULAR_PMAP,			/* TYPE_IDX		0 */
	X86_PG_V,			/* PG_V_IDX		1 */
	X86_PG_RW,			/* PG_RW_IDX		2 */
	X86_PG_U,			/* PG_U_IDX		3 */
	X86_PG_A,			/* PG_A_IDX		4 */
	X86_PG_M,			/* PG_M_IDX		5 */
	X86_PG_PS,			/* PG_PS_IDX3		6 */
	X86_PG_G,			/* PG_G_IDX		7 */
	X86_PG_AVAIL1,			/* PG_AVAIL1_IDX	8 */
	X86_PG_AVAIL2,			/* PG_AVAIL2_IDX	9 */
	X86_PG_AVAIL3,			/* PG_AVAIL3_IDX	10 */
	X86_PG_NC_PWT | X86_PG_NC_PCD,	/* PG_N_IDX		11 */
};

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

static int pmap_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW,
    &pmap_debug, 0, "Debug pmap's");
#ifdef PMAP_DEBUG2
static int pmap_enter_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
    &pmap_enter_debug, 0, "Debug pmap_enter's");
#endif
static int pmap_yield_count = 64;
SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
    &pmap_yield_count, 0, "Yield during init_pt/release");
static int pmap_mmu_optimize = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
    &pmap_mmu_optimize, 0, "Share page table pages when possible");
int pmap_fast_kernel_cpusync = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
    &pmap_fast_kernel_cpusync, 0, "Fast cpusync for kernel pmap updates");
int pmap_dynamic_delete = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
    &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs");

#define DISABLE_PSE

/* Standard user access functions */
extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
			  size_t *lencopied);
extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
extern int std_fubyte (const uint8_t *base);
extern int std_subyte (uint8_t *base, uint8_t byte);
extern int32_t std_fuword32 (const uint32_t *base);
extern int64_t std_fuword64 (const uint64_t *base);
extern int std_suword64 (uint64_t *base, uint64_t word);
extern int std_suword32 (uint32_t *base, int word);
extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v);
extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v);

static void pv_hold(pv_entry_t pv);
static int _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL);
static void pv_drop(pv_entry_t pv);
static void _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL);
static void pv_unlock(pv_entry_t pv);
static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
				PMAP_DEBUG_DECL);
static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp
				PMAP_DEBUG_DECL);
static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL);
static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex,
				vm_pindex_t **pmarkp, int *errorp);
static void pv_put(pv_entry_t pv);
static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
				pv_entry_t *pvpp);
static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
				pv_entry_t *pvpp, vm_map_entry_t entry,
				vm_offset_t va);
static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
				pmap_inval_bulk_t *bulk, int destroy);
static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp,
				pmap_inval_bulk_t *bulk);

struct pmap_scan_info;
static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
		      pv_entry_t pte_pv, vm_pindex_t *pte_placemark,
		      pv_entry_t pt_pv, int sharept,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
		      pv_entry_t pte_pv, vm_pindex_t *pte_placemark,
		      pv_entry_t pt_pv, int sharept,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static boolean_t pmap_testbit (vm_page_t m, int bit);

static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static void pmap_pinit_defaults(struct pmap *pmap);
static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark);
static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark);

static unsigned pdir4mb;

static int
pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
{
	if (pv1->pv_pindex < pv2->pv_pindex)
		return(-1);
	if (pv1->pv_pindex > pv2->pv_pindex)
		return(1);
	return(0);
}

RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
	     pv_entry_compare, vm_pindex_t, pv_pindex);

static __inline
void
pmap_page_stats_adding(vm_page_t m)
{
	globaldata_t gd = mycpu;

	if (TAILQ_EMPTY(&m->md.pv_list)) {
		++gd->gd_vmtotal.t_arm;
	} else if (TAILQ_FIRST(&m->md.pv_list) ==
		   TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
		++gd->gd_vmtotal.t_armshr;
		++gd->gd_vmtotal.t_avmshr;
	} else {
		++gd->gd_vmtotal.t_avmshr;
	}
}

static __inline
void
pmap_page_stats_deleting(vm_page_t m)
{
	globaldata_t gd = mycpu;

	if (TAILQ_EMPTY(&m->md.pv_list)) {
		--gd->gd_vmtotal.t_arm;
	} else if (TAILQ_FIRST(&m->md.pv_list) ==
		   TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
		--gd->gd_vmtotal.t_armshr;
		--gd->gd_vmtotal.t_avmshr;
	} else {
		--gd->gd_vmtotal.t_avmshr;
	}
}

/*
 * Move the kernel virtual free pointer to the next 2MB boundary.  This
 * is used to help improve performance by using a large (2MB) page for
 * much of the kernel (.text, .data, .bss).
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return newaddr;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/*
 * Returns the pindex of a page table entry (representing a terminal page).
 * There are NUPTE_TOTAL page table entries possible (a huge number).
 *
 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
 * We want to properly translate negative KVAs.
 */
static __inline
vm_pindex_t
pmap_pte_pindex(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
}

/*
 * Returns the pindex of a page table.
 */
static __inline
vm_pindex_t
pmap_pt_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
}
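
/*
 * Illustration of the linear pv_entry pindex space used by the
 * pmap_*_pindex() functions above and below (a worked example, not
 * authoritative; it assumes the usual 4-level x86-64 constants):
 *
 *	[0, NUPTE_TOTAL)		terminal PTEs, one per 4KB page
 *	[NUPTE_TOTAL, +NUPT_TOTAL)	page tables, one per 2MB
 *	[..., +NUPD_TOTAL)		page directories, one per 1GB
 *	[..., +NUPDP_TOTAL)		PDPs, one per 512GB
 *	final index			the PML4 page itself
 *
 * e.g. for va = 0x1000, pmap_pte_pindex(va) == 1 while pmap_pt_pindex(va)
 * == NUPTE_TOTAL + 0, so a terminal page's pv and its page table's pv can
 * never collide in the pv RB-tree.
 */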
/*
 * Returns the pindex of a page directory.
 */
static __inline
vm_pindex_t
pmap_pd_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + NUPT_TOTAL +
		((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
}

static __inline
vm_pindex_t
pmap_pdp_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
		((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
}

static __inline
vm_pindex_t
pmap_pml4_pindex(void)
{
	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
}

/*
 * Return various clipped indexes for a given VA.
 *
 * Returns the index of a pte in a page table, representing a terminal
 * page.
 */
static __inline
vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

/*
 * Returns the index of a pt in a page directory, representing a page
 * table.
 */
static __inline
vm_pindex_t
pmap_pt_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

/*
 * Returns the index of a pd in a page directory page, representing a page
 * directory.
 */
static __inline
vm_pindex_t
pmap_pd_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

/*
 * Returns the index of a pdp in the pml4 table, representing a page
 * directory page.
 */
static __inline
vm_pindex_t
pmap_pdp_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/*
 * The placemarker hash must be broken up into four zones so lock
 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp).
 *
 * Placemarkers are used to 'lock' page table indices that do not have
 * a pv_entry.  This allows the pmap to support managed and unmanaged
 * pages and shared page tables.
 */
#define PM_PLACE_BASE	(PM_PLACEMARKS >> 2)

static __inline
vm_pindex_t *
pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex)
{
	int hi;

	if (pindex < pmap_pt_pindex(0))		/* zone 0 - PTE */
		hi = 0;
	else if (pindex < pmap_pd_pindex(0))	/* zone 1 - PT */
		hi = PM_PLACE_BASE;
	else if (pindex < pmap_pdp_pindex(0))	/* zone 2 - PD */
		hi = PM_PLACE_BASE << 1;
	else					/* zone 3 - PDP (and PML4E) */
		hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1);
	hi += pindex & (PM_PLACE_BASE - 1);

	return (&pmap->pm_placemarks[hi]);
}

/*
 * Generic procedure to index a pte from a pt, pd, or pdp.
 *
 * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is
 *	 NOT a page table page index but is instead a PV lookup index.
 */
static
void *
pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
	return(&pte[pindex]);
}

/*
 * Return pointer to PDP slot in the PML4
 */
static __inline
pml4_entry_t *
pmap_pdp(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP given a pointer to the PDP
 */
static __inline
pdp_entry_t *
pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
{
	pdp_entry_t *pd;

	pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
	return (&pd[pmap_pd_index(va)]);
}
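
/*
 * Sketch of how the helpers above and below compose into a full VA walk
 * (illustrative only; the real walkers below add PG_V and PG_PS checks):
 *
 *	pml4e = pmap_pdp(pmap, va);		 PDP slot in the PML4
 *	pdpe  = pmap_pdp_to_pd(*pml4e, va);	 PD slot in the PDP
 *	pde   = pmap_pd_to_pt(*pdpe, va);	 PT slot in the PD
 *	pte   = pmap_pt_to_pte(*pde, va);	 PTE slot in the PT
 *
 * Each step masks the entry with PG_FRAME and indexes into the next
 * level's page via the DMAP, so no temporary mappings are needed.
 */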
/*
 * Return pointer to PD slot in the PDP.
 */
static __inline
pdp_entry_t *
pmap_pd(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pdp;

	pdp = pmap_pdp(pmap, va);
	if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
		return NULL;
	return (pmap_pdp_to_pd(*pdp, va));
}

/*
 * Return pointer to PT slot in the PD given a pointer to the PD
 */
static __inline
pd_entry_t *
pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va)
{
	pd_entry_t *pt;

	pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME);
	return (&pt[pmap_pt_index(va)]);
}

/*
 * Return pointer to PT slot in the PD
 *
 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs,
 *		     so we cannot lookup the PD via the PDP.  Instead we
 *		     must look it up via the pmap.
 */
static __inline
pd_entry_t *
pmap_pt(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pd;
	pv_entry_t pv;
	vm_pindex_t pd_pindex;

	if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
		pd_pindex = pmap_pd_pindex(va);
		spin_lock(&pmap->pm_spin);
		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex);
		spin_unlock(&pmap->pm_spin);
		if (pv == NULL || pv->pv_m == NULL)
			return NULL;
		return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va));
	} else {
		pd = pmap_pd(pmap, va);
		if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
			return NULL;
		return (pmap_pd_to_pt(*pd, va));
	}
}

/*
 * Return pointer to PTE slot in the PT given a pointer to the PT
 */
static __inline
pt_entry_t *
pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/*
 * Return pointer to PTE slot in the PT
 */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pt;

	pt = pmap_pt(pmap, va);
	if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
		return NULL;
	if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
		return ((pt_entry_t *)pt);
	return (pmap_pt_to_pte(*pt, va));
}

/*
 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
 * the PT layer.  This will speed up core pmap operations considerably.
 *
 * NOTE: The pmap spinlock does not need to be held but the passed-in pv
 *	 must be in a known associated state (typically by being locked
 *	 when the pmap spinlock isn't held).  We allow the race for that
 *	 case.
 *
 * NOTE: pm_pvhint is only accessed (read) with the spin-lock held, using
 *	 cpu_ccfence() to prevent compiler optimizations from reloading
 *	 the field.
 */
static __inline
void
pv_cache(pv_entry_t pv, vm_pindex_t pindex)
{
	if (pindex >= pmap_pt_pindex(0) && pindex < pmap_pd_pindex(0)) {
		if (pv->pv_pmap)
			pv->pv_pmap->pm_pvhint = pv;
	}
}

/*
 * Return address of PT slot in PD (KVM only).
 *
 * Cannot be used for user page tables because it might interfere with
 * the shared page-table-page optimization (pmap_mmu_optimize).
 */
static __inline
pd_entry_t *
vtopt(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

/*
 * KVM - return address of PTE slot in PT
 */
static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;
	int j;

	/*
	 * We are running (mostly) V=P at this point.
	 *
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 *
	 * ndmpdp is the number of 1GB pages we wish to map.
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	KKASSERT(ndmpdp <= NKPDPE * NPDEPG);

	/*
	 * Starting at the beginning of kvm (not KERNBASE).
	 */
	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
		       ndmpdp) + 511) / 512;
	nkpt_phys += 128;

	/*
	 * Starting at KERNBASE - map 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Allocate pages
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);
	KPTphys = allocpages(firstaddr, nkpt_phys);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);

	/*
	 * Calculate the page directory base for KERNBASE,
	 * that is where we start populating the page table pages.
	 * Basically this is the end - 2.
	 */
	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);

	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_G_IDX];
	}
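
	/*
	 * Worked example of the sizing above (illustrative figures using
	 * the typical constants NPDPEPG == 512, NPTEPG == 512, KPDPI == 510):
	 * nkpt_base = (512 - 510) * 512 = 1024 page table pages, each
	 * mapping 2MB, which exactly covers the 2GB KERNBASE window at the
	 * top of KVM.  Likewise, for a 16GB Maxmem the direct map needs
	 * ndmpdp = (16GB + 1GB - 1) >> 30 = 16 one-gigabyte slots.
	 */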
	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < nkpt_base; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX];
	}
	for (i = 0; i < nkpt_phys; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX];
	}

	/*
	 * Map from zero to end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTbase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_PS_IDX] |
		    pmap_bits_default[PG_G_IDX];
	}

	/*
	 * And connect up the PD to the PDP.  The kernel pmap is expected
	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
	 */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
		    KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_U_IDX];
	}

	/*
	 * Now set up the direct map space using either 2MB or 1GB pages.
	 * Preset PG_M and PG_A because demotion expects it.
	 *
	 * When filling in entries in the PD pages make sure any excess
	 * entries are set to zero as we allocated enough PD pages.
	 */
	if ((amd_feature & AMDID_PAGE1GB) == 0) {
		for (i = 0; i < NPDEPG * ndmpdp; i++) {
			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
			((pd_entry_t *)DMPDphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_PS_IDX] |
			    pmap_bits_default[PG_G_IDX] |
			    pmap_bits_default[PG_M_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}

		/*
		 * And the direct map space's PDP
		 */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
			    (i << PAGE_SHIFT);
			((pdp_entry_t *)DMPDPphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_U_IDX];
		}
	} else {
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] =
			    (vm_paddr_t)i << PDPSHIFT;
			((pdp_entry_t *)DMPDPphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_PS_IDX] |
			    pmap_bits_default[PG_G_IDX] |
			    pmap_bits_default[PG_M_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
	    pmap_bits_default[PG_RW_IDX] |
	    pmap_bits_default[PG_V_IDX] |
	    pmap_bits_default[PG_U_IDX];

	/*
	 * Connect the Direct Map slots up to the PML4
	 */
	for (j = 0; j < NDMPML4E; ++j) {
		((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
		    (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_U_IDX];
	}

	/*
	 * Connect the KVA slot up to the PML4
	 */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |=
	    pmap_bits_default[PG_RW_IDX] |
	    pmap_bits_default[PG_V_IDX] |
	    pmap_bits_default[PG_U_IDX];
}
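
/*
 * A short sketch of what the recursive PML4 entry installed above buys
 * us (assumed typical values; the real bases come from the pmap headers).
 * Because PML4 slot PML4PML4I points back at the PML4 itself, a lookup
 * whose top-level index is PML4PML4I resolves page-table pages instead
 * of data pages.  PTmap is the canonical (sign-extended) VA built from
 * that index:
 *
 *	PTmap      = sign-extend(PML4PML4I << PML4SHIFT)
 *	vtopte(va) = PTmap + ((va >> PAGE_SHIFT) & mask)
 *
 * which is exactly how the vtopte()/vtopt() helpers above reach any
 * kernel PTE or PDE without consulting pmap data structures.
 */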
/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	int i;

	KvaStart = VM_MIN_KERNEL_ADDRESS;
	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	KvaSize = KvaEnd - KvaStart;

	avail_start = *firstaddr;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual2_start = KvaStart;
	virtual2_end = PTOV_OFFSET;

	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	virtual_start = pmap_kmem_choose(virtual_start);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 */
	kernel_pmap.pm_pml4 = (pml4_entry_t *) (PTOV_OFFSET + KPML4phys);
	kernel_pmap.pm_count = 1;
	CPUMASK_ASSALLONES(kernel_pmap.pm_active);
	RB_INIT(&kernel_pmap.pm_pvroot);
	spin_init(&kernel_pmap.pm_spin, "pmapbootstrap");
	for (i = 0; i < PM_PLACEMARKS; ++i)
		kernel_pmap.pm_placemarks[i] = PM_NOPLACEMARK;

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
	pte = vtopte(va);

	/*
	 * CMAP1 is used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))

	virtual_start = va;
	virtual_start = pmap_kmem_choose(virtual_start);

	*CMAP1 = 0;

	/*
	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	 * cases rather than invlpg.  Actually, I don't even know why it
	 * works under UP because self-referential page table mappings
	 */
	// pgeflag = 0;

	/*
	 * Initialize the 4MB page size flag
	 */
	// pseflag = 0;
	/*
	 * The 4MB page version of the initial kernel page mapping.
	 */
	pdir4mb = 0;

#if !defined(DISABLE_PSE)
	if (cpu_feature & CPUID_PSE) {
		pt_entry_t ptditmp;
		/*
		 * Note that we have enabled PSE mode
		 */
		// pseflag = kernel_pmap.pmap_bits[PG_PS_IDX];
		ptditmp = *(PTmap + x86_64_btop(KERNBASE));
		ptditmp &= ~(NBPDR - 1);
		ptditmp |= pmap_bits_default[PG_V_IDX] |
			   pmap_bits_default[PG_RW_IDX] |
			   pmap_bits_default[PG_PS_IDX] |
			   pmap_bits_default[PG_U_IDX];
		// pgeflag;
		pdir4mb = ptditmp;
	}
#endif
	cpu_invltlb();

	/* Initialize the PAT MSR */
	pmap_init_pat();
	pmap_pinit_defaults(&kernel_pmap);

	TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync",
			  &pmap_fast_kernel_cpusync);
}

/*
 * Set up the PAT MSR.
 */
void
pmap_init_pat(void)
{
	uint64_t pat_msr;
	u_long cr0, cr4;

	/*
	 * Default values mapping PATi,PCD,PWT bits at system reset.
	 * The default values effectively ignore the PATi bit by
	 * repeating the encodings for 0-3 in 4-7, and map the PCD
	 * and PWT bit combinations to the expected PAT types.
	 */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |	/* 000 */
		  PAT_VALUE(1, PAT_WRITE_THROUGH) |	/* 001 */
		  PAT_VALUE(2, PAT_UNCACHED) |		/* 010 */
		  PAT_VALUE(3, PAT_UNCACHEABLE) |	/* 011 */
		  PAT_VALUE(4, PAT_WRITE_BACK) |	/* 100 */
		  PAT_VALUE(5, PAT_WRITE_THROUGH) |	/* 101 */
		  PAT_VALUE(6, PAT_UNCACHED) |		/* 110 */
		  PAT_VALUE(7, PAT_UNCACHEABLE);	/* 111 */
	pat_pte_index[PAT_WRITE_BACK]	= 0;
	pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT;
	pat_pte_index[PAT_UNCACHED]	= X86_PG_NC_PCD;
	pat_pte_index[PAT_UNCACHEABLE]	= X86_PG_NC_PCD | X86_PG_NC_PWT;
	pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
	pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];

	if (cpu_feature & CPUID_PAT) {
		/*
		 * If we support the PAT then set-up entries for
		 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns
		 * 4 and 5.
		 */
		pat_msr = (pat_msr & ~PAT_MASK(4)) |
			  PAT_VALUE(4, PAT_WRITE_PROTECTED);
		pat_msr = (pat_msr & ~PAT_MASK(5)) |
			  PAT_VALUE(5, PAT_WRITE_COMBINING);
		pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0;
		pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT |
						     X86_PG_NC_PWT;

		/*
		 * Then enable the PAT
		 */

		/* Disable PGE. */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);

		/* Disable caches (CD = 1, NW = 0). */
		cr0 = rcr0();
		load_cr0((cr0 & ~CR0_NW) | CR0_CD);

		/* Flushes caches and TLBs. */
		wbinvd();
		cpu_invltlb();

		/* Update PAT and index table. */
		wrmsr(MSR_PAT, pat_msr);

		/* Flush caches and TLBs again. */
		wbinvd();
		cpu_invltlb();

		/* Restore caches and PGE. */
		load_cr0(cr0);
		load_cr4(cr4);
		PatMsr = pat_msr;
	}
}

/*
 * Set 4mb pdir for mp startup
 */
void
pmap_set_opt(void)
{
	if (cpu_feature & CPUID_PSE) {
		load_cr4(rcr4() | CR4_PSE);
		if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
			cpu_invltlb();
		}
	}
}
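
/*
 * Worked example of the PAT indexing set up in pmap_init_pat() above.
 * A pte selects a PAT entry via the 3-bit {PAT,PCD,PWT} pattern.  With
 * a working PAT, write-combining is programmed into entry 5, so
 *
 *	pat_pte_index[PAT_WRITE_COMBINING] == X86_PG_PTE_PAT | X86_PG_NC_PWT
 *
 * encodes binary 101 = PAT entry 5.  Without CPUID_PAT, WC silently
 * degrades to the uncacheable encoding (PCD|PWT = binary 011 = entry 3).
 */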
/*
 * Initialize the pmap module, called by vm_init() to set up any structures
 * that the pmap system needs in order to map virtual memory.  pmap_init
 * has been enhanced to support discontiguous physical memory in a fairly
 * consistent way.
 */
void
pmap_init(void)
{
	int i;
	int initial_pvs;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	for (i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (void *)kmem_alloc(&kernel_map,
				    initial_pvs * sizeof (struct pv_entry),
				    VM_SUBSYS_PVENTRY);
	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
		  pvinit, initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;
	int entry_max;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * Subtract out pages already installed in the zone (hack)
	 */
	entry_max = pv_entry_max - vm_page_array_size;
	if (entry_max <= 0)
		entry_max = 1;

	zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT);

	/*
	 * Enable dynamic deletion of empty higher-level page table pages
	 * by default only if system memory is < 8GB (use 7GB for slop).
	 * This can save a little memory, but imposes significant
	 * performance overhead for things like bulk builds, and for programs
	 * which do a lot of memory mapping and memory unmapping.
	 */
	if (pmap_dynamic_delete < 0) {
		if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE)
			pmap_dynamic_delete = 1;
		else
			pmap_dynamic_delete = 0;
	}
}

/*
 * Typically used to initialize a fictitious page by vm/device_pager.c
 */
void
pmap_page_init(struct vm_page *m)
{
	vm_page_init(m);
	TAILQ_INIT(&m->md.pv_list);
}

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * this routine defines the region(s) of memory that should
 * not be tested for the modified bit.
 */
static __inline
int
pmap_track_modified(vm_pindex_t pindex)
{
	vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;

	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}
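
/*
 * Worked example of the pv_entry sizing performed in pmap_init2() above
 * (illustrative figures only; maxproc and the page count are assumed):
 * with the default PMAP_SHPGPERPROC of 2000, an assumed maxproc of 4000
 * and ~4M vm_page's (16GB of ram), pv_entry_max = 2000 * 4000 + 4194304,
 * roughly 12.2M entries, and pv_entry_high_water is 90% of that.  The
 * vm.pmap.pv_entries tunable overrides the computed value.
 */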
/*
 * Extract the physical page address associated with the map/VA pair.
 * The page must be wired for this to work reliably.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep)
{
	vm_paddr_t rtval;
	pv_entry_t pt_pv;
	pt_entry_t *ptep;

	rtval = 0;
	if (va >= VM_MAX_USER_ADDRESS) {
		/*
		 * Kernel page directories might be direct-mapped and
		 * there is typically no PV tracking of pte's
		 */
		pd_entry_t *pt;

		pt = pmap_pt(pmap, va);
		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
				rtval = *pt & PG_PS_FRAME;
				rtval |= va & PDRMASK;
			} else {
				ptep = pmap_pt_to_pte(*pt, va);
				if (*pt & pmap->pmap_bits[PG_V_IDX]) {
					rtval = *ptep & PG_FRAME;
					rtval |= va & PAGE_MASK;
				}
			}
		}
		if (handlep)
			*handlep = NULL;
	} else {
		/*
		 * User pages currently do not direct-map the page directory
		 * and some pages might not use managed PVs.  But all PT's
		 * will have a PV.
		 */
		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
		if (pt_pv) {
			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
				rtval = *ptep & PG_FRAME;
				rtval |= va & PAGE_MASK;
			}
			if (handlep)
				*handlep = pt_pv;	/* locked until done */
			else
				pv_put(pt_pv);
		} else if (handlep) {
			*handlep = NULL;
		}
	}
	return rtval;
}

void
pmap_extract_done(void *handle)
{
	if (handle)
		pv_put((pv_entry_t)handle);
}

/*
 * Similar to extract but checks protections, SMP-friendly short-cut for
 * vm_fault_page[_quick]().  Can return NULL to cause the caller to
 * fall-through to the real fault code.  Does not work with HVM page
 * tables.
 *
 * The returned page, if not NULL, is held (and not busied).
 *
 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING
 *	    OR WRITING AS-IS.
 */
vm_page_t
pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp)
{
	if (pmap &&
	    va < VM_MAX_USER_ADDRESS &&
	    (pmap->pm_flags & PMAP_HVM) == 0) {
		pv_entry_t pt_pv;
		pv_entry_t pte_pv;
		pt_entry_t *ptep;
		pt_entry_t req;
		vm_page_t m;
		int error;

		req = pmap->pmap_bits[PG_V_IDX] |
		      pmap->pmap_bits[PG_U_IDX];
		if (prot & VM_PROT_WRITE)
			req |= pmap->pmap_bits[PG_RW_IDX];

		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
		if (pt_pv == NULL)
			return (NULL);
		ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
		if ((*ptep & req) != req) {
			pv_put(pt_pv);
			return (NULL);
		}
		pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error);
		if (pte_pv && error == 0) {
			m = pte_pv->pv_m;
			if (prot & VM_PROT_WRITE) {
				/* interlocked by presence of pv_entry */
				vm_page_dirty(m);
			}
			if (busyp) {
				if (prot & VM_PROT_WRITE) {
					if (vm_page_busy_try(m, TRUE))
						m = NULL;
					*busyp = 1;
				} else {
					vm_page_hold(m);
					*busyp = 0;
				}
			} else {
				vm_page_hold(m);
			}
			pv_put(pte_pv);
		} else if (pte_pv) {
			pv_drop(pte_pv);
			m = NULL;
		} else {
			/* error, since we didn't request a placemarker */
			m = NULL;
		}
		pv_put(pt_pv);
		return(m);
	} else {
		return(NULL);
	}
}
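
/*
 * Worked example of the 2MB-page extraction done in pmap_extract() above
 * and pmap_kextract() below: for a PDE with PG_PS set,
 *
 *	pa = (pde & PG_PS_FRAME) | (va & PDRMASK)
 *
 * The PDE supplies physical bits 63..21 and the low 21 bits come straight
 * from the VA, so an offset of 0x12345 into a 2MB page yields
 * frame + 0x12345 without consulting any PTE.
 */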
/*
 * Extract the physical page address associated with a kernel virtual
 * address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pt;		/* pt entry in pd */
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		pt = *vtopt(va);
		if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) {
			pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pt_to_pte(pt, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return pa;
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Routine: pmap_kenter
 * Function:
 *	Add a wired page to the KVA.
 *	NOTE! In order for the mapping to take effect you should do an
 *	invltlb after doing the pmap_kenter().
 */
void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *ptep;
	pt_entry_t npte;

	npte = pa |
	       kernel_pmap.pmap_bits[PG_RW_IDX] |
	       kernel_pmap.pmap_bits[PG_V_IDX];
//	       pgeflag;
	ptep = vtopte(va);
#if 1
	pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte);
#else
	/* FUTURE */
	if (*ptep)
		pmap_inval_smp(&kernel_pmap, va, ptep, npte);
	else
		*ptep = npte;
#endif
}

/*
 * Similar to pmap_kenter(), except we only invalidate the mapping on the
 * current CPU.  Returns 0 if the previous pte was 0, 1 if it wasn't
 * (caller can conditionalize calling smp_invltlb()).
 */
int
pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *ptep;
	pt_entry_t npte;
	int res;

	npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] |
		    kernel_pmap.pmap_bits[PG_V_IDX];
//	npte |= pgeflag;
	ptep = vtopte(va);
#if 1
	res = 1;
#else
	/* FUTURE */
	res = (*ptep != 0);
#endif
	atomic_swap_long(ptep, npte);
	cpu_invlpg((void *)va);

	return res;
}

/*
 * Enter addresses into the kernel pmap but don't bother
 * doing any tlb invalidations.  Caller will do a rollup
 * invalidation via pmap_rollup_inval().
 */
int
pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *ptep;
	pt_entry_t npte;
	int res;

	npte = pa |
	       kernel_pmap.pmap_bits[PG_RW_IDX] |
	       kernel_pmap.pmap_bits[PG_V_IDX];
//	       pgeflag;
	ptep = vtopte(va);
#if 1
	res = 1;
#else
	/* FUTURE */
	res = (*ptep != 0);
#endif
	atomic_swap_long(ptep, npte);	/* no invalidation here, see above */

	return res;
}

/*
 * remove a page from the kernel pagetables
 */
void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *ptep;

	ptep = vtopte(va);
	pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0);
}

void
pmap_kremove_quick(vm_offset_t va)
{
	pt_entry_t *ptep;

	ptep = vtopte(va);
	(void)pte_load_clear(ptep);
	cpu_invlpg((void *)va);
}
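
/*
 * Usage sketch for the kenter/kremove family above (hypothetical caller;
 * 'scratch_va' is an illustrative KVA reservation, not a real symbol):
 *
 *	pmap_kenter(scratch_va, VM_PAGE_TO_PHYS(m));	SMP-invalidated
 *	... use the mapping ...
 *	pmap_kremove(scratch_va);
 *
 * The _quick forms only invalidate the local cpu (the caller decides
 * whether an smp_invltlb() is needed) and the _noinval forms defer all
 * invalidation to a later pmap_rollup_inval() rollup.
 */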
/*
 * Remove addresses from the kernel pmap but don't bother
 * doing any tlb invalidations.  Caller will do a rollup
 * invalidation via pmap_rollup_inval().
 */
void
pmap_kremove_noinval(vm_offset_t va)
{
	pt_entry_t *ptep;

	ptep = vtopte(va);
	(void)pte_load_clear(ptep);
}

/*
 * XXX these need to be recoded.  They are not used in any critical path.
 */
void
pmap_kmodify_rw(vm_offset_t va)
{
	atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]);
	cpu_invlpg((void *)va);
}

/* NOT USED
void
pmap_kmodify_nc(vm_offset_t va)
{
	atomic_set_long(vtopte(va), PG_N);
	cpu_invlpg((void *)va);
}
*/

/*
 * Used to map a range of physical addresses into kernel virtual
 * address space during the low level boot, typically to map the
 * dump bitmap, message buffer, and vm_page_array.
 *
 * These mappings are typically made at some point after the end of the
 * kernel text+data.
 *
 * We could return PHYS_TO_DMAP(start) here and not allocate any
 * via (*virtp), but then kmem from userland and kernel dumps won't
 * have access to the related pointers.
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va;
	vm_offset_t va_start;

	/*return PHYS_TO_DMAP(start);*/

	va_start = *virtp;
	va = va_start;

	while (start < end) {
		pmap_kenter_quick(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	*virtp = va;
	return va_start;
}

#define PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

/*
 * Remove the specified set of pages from the data and instruction caches.
 *
 * In contrast to pmap_invalidate_cache_range(), this function does not
 * rely on the CPU's self-snoop feature, because it is intended for use
 * when moving pages into a different cache domain.
 */
void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	vm_offset_t daddr, eva;
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0)
		wbinvd();
	else {
		cpu_mfence();
		for (i = 0; i < count; i++) {
			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
			eva = daddr + PAGE_SIZE;
			for (; daddr < eva; daddr += cpu_clflush_line_size)
				clflush(daddr);
		}
		cpu_mfence();
	}
}

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{
	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS) {
		; /* If "Self Snoop" is supported, do nothing. */
	} else {
		/* Globally invalidate caches */
		cpu_wbinvd_on_all_cpus();
	}
}

/*
 * Invalidate the specified range of virtual memory on all cpus associated
 * with the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0);
}
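
/*
 * Usage sketch for the pmap_qenter()/pmap_qremove() pair defined below
 * (hypothetical buffer-cache style caller; 'bp_kva' and 'pages' are
 * illustrative names):
 *
 *	pmap_qenter(bp_kva, pages, npages);	map wired pages into kva
 *	... perform I/O through bp_kva ...
 *	pmap_qremove(bp_kva, npages);		tear the mappings down
 *
 * Both sides invalidate the whole range on all cpus; the _quick and
 * _noinval variants below relax that for carefully controlled callers.
 */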
/*
 * Add a list of wired pages to the kva.  This routine is used for temporary
 * kernel mappings such as those found in buffer cache buffers.  Page
 * modifications and accesses are not tracked or recorded.
 *
 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed
 *	 semantics as previous mappings may have been zeroed without any
 *	 invalidation.
 *
 * The pages *must* be wired.
 */
void
pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t pte;
		pt_entry_t *ptep;

		ptep = vtopte(va);
		pte = VM_PAGE_TO_PHYS(*m) |
		      kernel_pmap.pmap_bits[PG_RW_IDX] |
		      kernel_pmap.pmap_bits[PG_V_IDX] |
		      kernel_pmap.pmap_cache_bits[(*m)->pat_mode];
//		      pgeflag;
		atomic_swap_long(ptep, pte);
		m++;
	}
	pmap_invalidate_range(&kernel_pmap, beg_va, end_va);
}

/*
 * This routine jerks page mappings from the kernel -- it is meant only
 * for temporary mappings such as those found in buffer cache buffers.
 * No modified or access status is recorded.
 *
 * MPSAFE, INTERRUPT SAFE (cluster callback)
 */
void
pmap_qremove(vm_offset_t beg_va, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		(void)pte_load_clear(pte);
		cpu_invlpg((void *)va);
	}
	pmap_invalidate_range(&kernel_pmap, beg_va, end_va);
}

/*
 * This routine removes temporary kernel mappings, only invalidating them
 * on the current cpu.  It should only be used under carefully controlled
 * conditions.
 */
void
pmap_qremove_quick(vm_offset_t beg_va, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		(void)pte_load_clear(pte);
		cpu_invlpg((void *)va);
	}
}

/*
 * This routine removes temporary kernel mappings *without* invalidating
 * the TLB.  It can only be used on permanent kva reservations such as those
 * found in buffer cache buffers, under carefully controlled circumstances.
 *
 * NOTE: Repopulating these KVAs requires unconditional invalidation.
 *	 (pmap_qenter() does unconditional invalidation).
 */
void
pmap_qremove_noinval(vm_offset_t beg_va, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		(void)pte_load_clear(pte);
	}
}

/*
 * Create a new thread and optionally associate it with a (new) process.
 * NOTE! the new thread's cpu may not equal the current cpu.
 */
void
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement & alignment */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb;	/* no -16 */
}
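
/*
 * Illustration of the pcb placement computed in pmap_init_thread() above
 * (assumed example values): with td_kstack = 0xffff800000010000 and
 * td_kstack_size = 16KB the stack top is 0xffff800000014000; the pcb is
 * placed immediately below it, rounded down to a 16-byte boundary, and
 * td_sp starts at the pcb, growing downward.
 */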
/*
 * This routine directly affects the fork perf for a process.
 */
void
pmap_init_proc(struct proc *p)
{
}

static void
pmap_pinit_defaults(struct pmap *pmap)
{
	bcopy(pmap_bits_default, pmap->pmap_bits,
	      sizeof(pmap_bits_default));
	bcopy(protection_codes, pmap->protection_codes,
	      sizeof(protection_codes));
	bcopy(pat_pte_index, pmap->pmap_cache_bits,
	      sizeof(pat_pte_index));
	pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT;
	pmap->copyinstr = std_copyinstr;
	pmap->copyin = std_copyin;
	pmap->copyout = std_copyout;
	pmap->fubyte = std_fubyte;
	pmap->subyte = std_subyte;
	pmap->fuword32 = std_fuword32;
	pmap->fuword64 = std_fuword64;
	pmap->suword32 = std_suword32;
	pmap->suword64 = std_suword64;
	pmap->swapu32 = std_swapu32;
	pmap->swapu64 = std_swapu64;
}

/*
 * Initialize pmap0/vmspace0.
 *
 * On architectures where the kernel pmap is not integrated into the user
 * process pmap, this pmap represents the process pmap, not the kernel pmap.
 * kernel_pmap should be used to directly access the kernel_pmap.
 */
void
pmap_pinit0(struct pmap *pmap)
{
	int i;

	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
	pmap->pm_count = 1;
	CPUMASK_ASSZERO(pmap->pm_active);
	pmap->pm_pvhint = NULL;
	RB_INIT(&pmap->pm_pvroot);
	spin_init(&pmap->pm_spin, "pmapinit0");
	for (i = 0; i < PM_PLACEMARKS; ++i)
		pmap->pm_placemarks[i] = PM_NOPLACEMARK;
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap_pinit_defaults(pmap);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
static void
pmap_pinit_simple(struct pmap *pmap)
{
	int i;

	/*
	 * Misc initialization
	 */
	pmap->pm_count = 1;
	CPUMASK_ASSZERO(pmap->pm_active);
	pmap->pm_pvhint = NULL;
	pmap->pm_flags = PMAP_FLAG_SIMPLE;

	pmap_pinit_defaults(pmap);

	/*
	 * Don't blow up locks/tokens on re-use (XXX fix/use drop code
	 * for this).
	 */
	if (pmap->pm_pmlpv == NULL) {
		RB_INIT(&pmap->pm_pvroot);
		bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
		spin_init(&pmap->pm_spin, "pmapinitsimple");
		for (i = 0; i < PM_PLACEMARKS; ++i)
			pmap->pm_placemarks[i] = PM_NOPLACEMARK;
	}
}

void
pmap_pinit(struct pmap *pmap)
{
	pv_entry_t pv;
	int j;

	if (pmap->pm_pmlpv) {
		if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) {
			pmap_puninit(pmap);
		}
	}

	pmap_pinit_simple(pmap);
	pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pml4 == NULL) {
		pmap->pm_pml4 =
		    (pml4_entry_t *)kmem_alloc_pageable(&kernel_map,
							PAGE_SIZE,
							VM_SUBSYS_PML4);
	}

	/*
	 * Allocate the page directory page, which wires it even though
	 * it isn't being entered into some higher level page table (it
	 * being the highest level).  If one is already cached we don't
	 * have to do anything.
	 */
	if ((pv = pmap->pm_pmlpv) == NULL) {
		pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
		pmap->pm_pmlpv = pv;
		pmap_kenter((vm_offset_t)pmap->pm_pml4,
			    VM_PAGE_TO_PHYS(pv->pv_m));
		pv_put(pv);

		/*
		 * Install DMAP and KMAP.
		 */
		for (j = 0; j < NDMPML4E; ++j) {
			pmap->pm_pml4[DMPML4I + j] =
			    (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
			    pmap->pmap_bits[PG_RW_IDX] |
			    pmap->pmap_bits[PG_V_IDX] |
			    pmap->pmap_bits[PG_U_IDX];
		}
		pmap->pm_pml4[KPML4I] = KPDPphys |
		    pmap->pmap_bits[PG_RW_IDX] |
		    pmap->pmap_bits[PG_V_IDX] |
		    pmap->pmap_bits[PG_U_IDX];

		/*
		 * install self-referential address mapping entry
		 */
		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
		    pmap->pmap_bits[PG_V_IDX] |
		    pmap->pmap_bits[PG_RW_IDX] |
		    pmap->pmap_bits[PG_A_IDX] |
		    pmap->pmap_bits[PG_M_IDX];
	} else {
		KKASSERT(pv->pv_m->flags & PG_MAPPED);
		KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
	}
	KKASSERT(pmap->pm_pml4[255] == 0);
	KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv);
	KKASSERT(pv->pv_entry.rbe_left == NULL);
	KKASSERT(pv->pv_entry.rbe_right == NULL);
}

/*
 * Clean up a pmap structure so it can be physically freed.  This routine
 * is called by the vmspace dtor function.  A great deal of pmap data is
 * left passively mapped to improve vmspace management so we have a bit
 * of cleanup work to do here.
 */
void
pmap_puninit(pmap_t pmap)
{
	pv_entry_t pv;
	vm_page_t p;

	KKASSERT(CPUMASK_TESTZERO(pmap->pm_active));
	if ((pv = pmap->pm_pmlpv) != NULL) {
		if (pv_hold_try(pv) == 0)
			pv_lock(pv);
		KKASSERT(pv == pmap->pm_pmlpv);
		p = pmap_remove_pv_page(pv);
		pv_free(pv, NULL);
		pv = NULL;	/* safety */
		pmap_kremove((vm_offset_t)pmap->pm_pml4);
		vm_page_busy_wait(p, FALSE, "pgpun");
		KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
		vm_page_unwire(p, 0);
		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);

		/*
		 * XXX eventually clean out PML4 static entries and
		 * use vm_page_free_zero()
		 */
		vm_page_free(p);
		pmap->pm_pmlpv = NULL;
	}
	if (pmap->pm_pml4) {
		KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
		kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
		pmap->pm_pml4 = NULL;
	}
	KKASSERT(pmap->pm_stats.resident_count == 0);
	KKASSERT(pmap->pm_stats.wired_count == 0);
}

/*
 * This function is now unused (it used to add the pmap to the pmap_list).
 */
void
pmap_pinit2(struct pmap *pmap)
{
}

/*
 * This routine is called when various levels in the page table need to
 * be populated.  This routine cannot fail.
 *
 * This function returns two locked pv_entry's, one representing the
 * requested pv and one representing the requested pv's parent pv.  If
 * an intermediate page table does not exist it will be created, mapped,
 * wired, and the parent page table will be given an additional hold
 * count representing the presence of the child pv_entry.
 */
static
pv_entry_t
pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
{
	pt_entry_t *ptep;
	pv_entry_t pv;
	pv_entry_t pvp;
	pt_entry_t v;
	vm_pindex_t pt_pindex;
	vm_page_t m;
	int isnew;
	int ispt;

	/*
	 * If the pv already exists and we aren't being asked for the
	 * parent page table page we can just return it.  A locked+held pv
	 * is returned.  The pv will also have a second hold related to the
	 * pmap association that we don't have to worry about.
	 */
1999 */
2000 ispt = 0;
2001 pv = pv_alloc(pmap, ptepindex, &isnew);
2002 if (isnew == 0 && pvpp == NULL)
2003 return(pv);
2004
2005 /*
2006 * Special case terminal PVs. These are not page table pages so
2007 * no vm_page is allocated (the caller supplied the vm_page). If
2008 * pvpp is non-NULL we are being asked to also remove the pt_pv
2009 * for this pv.
2010 *
2011 * Note that pt_pv's are only returned for user VAs. We assert that
2012 * a pt_pv is not being requested for kernel VAs. The kernel
2013 * pre-wires all higher-level page tables so don't overload managed
2014 * higher-level page tables on top of it!
2015 */
2016 if (ptepindex < pmap_pt_pindex(0)) {
2017 if (ptepindex >= NUPTE_USER) {
2018 /* kernel manages this manually for KVM */
2019 KKASSERT(pvpp == NULL);
2020 } else {
2021 KKASSERT(pvpp != NULL);
2022 pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
2023 pvp = pmap_allocpte(pmap, pt_pindex, NULL);
2024 if (isnew)
2025 vm_page_wire_quick(pvp->pv_m);
2026 *pvpp = pvp;
2027 }
2028 return(pv);
2029 }
2030
2031 /*
2032 * The kernel never uses managed PT/PD/PDP pages.
2033 */
2034 KKASSERT(pmap != &kernel_pmap);
2035
2036 /*
2037 * Non-terminal PVs allocate a VM page to represent the page table,
2038 * so we have to resolve pvp and calculate ptepindex for the pvp
2039 * and then for the page table entry index in the pvp for
2040 * fall-through.
2041 */
2042 if (ptepindex < pmap_pd_pindex(0)) {
2043 /*
2044 * pv is PT, pvp is PD
2045 */
2046 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
2047 ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
2048 pvp = pmap_allocpte(pmap, ptepindex, NULL);
2049
2050 /*
2051 * PT index in PD
2052 */
2053 ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
2054 ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
2055 ispt = 1;
2056 } else if (ptepindex < pmap_pdp_pindex(0)) {
2057 /*
2058 * pv is PD, pvp is PDP
2059 *
2060 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above
2061 * the PD.
2062 */
2063 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
2064 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
2065
2066 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
2067 KKASSERT(pvpp == NULL);
2068 pvp = NULL;
2069 } else {
2070 pvp = pmap_allocpte(pmap, ptepindex, NULL);
2071 }
2072
2073 /*
2074 * PD index in PDP
2075 */
2076 ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
2077 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
2078 } else if (ptepindex < pmap_pml4_pindex()) {
2079 /*
2080 * pv is PDP, pvp is the root pml4 table
2081 */
2082 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
2083
2084 /*
2085 * PDP index in PML4
2086 */
2087 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
2088 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
2089 } else {
2090 /*
2091 * pv represents the top-level PML4, there is no parent.
2092 */
2093 pvp = NULL;
2094 }
2095
2096 if (isnew == 0)
2097 goto notnew;
2098
2099 /*
2100 * (isnew) is TRUE, pv is not terminal.
2101 *
2102 * (1) Add a wire count to the parent page table (pvp).
2103 * (2) Allocate a VM page for the page table.
2104 * (3) Enter the VM page into the parent page table.
2105 *
2106 * page table pages are marked PG_WRITEABLE and PG_MAPPED.
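 *
 * The allocation loop below spins on vm_page_alloc()/vm_wait()
 * rather than returning failure; combined with the VM_ALLOC_SYSTEM
 * and VM_ALLOC_INTERRUPT reserves this is what allows
 * pmap_allocpte() to guarantee that it cannot fail.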
2107 */
2108 if (pvp)
2109 vm_page_wire_quick(pvp->pv_m);
2110
2111 for (;;) {
2112 m = vm_page_alloc(NULL, pv->pv_pindex,
2113 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2114 VM_ALLOC_INTERRUPT);
2115 if (m)
2116 break;
2117 vm_wait(0);
2118 }
2119 vm_page_wire(m); /* wire for mapping in parent */
2120 vm_page_unmanage(m); /* m must be spinunlocked */
2121 pmap_zero_page(VM_PAGE_TO_PHYS(m));
2122 m->valid = VM_PAGE_BITS_ALL;
2123
2124 vm_page_spin_lock(m);
2125 pmap_page_stats_adding(m);
2126 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2127 pv->pv_m = m;
2128 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
2129 vm_page_spin_unlock(m);
2130
2131 /*
2132 * (isnew) is TRUE, pv is not terminal.
2133 *
2134 * Wire the page into pvp. Bump the resident_count for the pmap.
2135 * There is no pvp for the top level, address the pm_pml4[] array
2136 * directly.
2137 *
2138 * If the caller wants the parent we return it, otherwise
2139 * we just put it away.
2140 *
2141 * No interlock is needed for pte 0 -> non-zero.
2142 *
2143 * In the situation where *ptep is valid we might have an unmanaged
2144 * page table page shared from another page table which we need to
2145 * unshare before installing our private page table page.
2146 */
2147 if (pvp) {
2148 v = VM_PAGE_TO_PHYS(m) |
2149 (pmap->pmap_bits[PG_U_IDX] |
2150 pmap->pmap_bits[PG_RW_IDX] |
2151 pmap->pmap_bits[PG_V_IDX] |
2152 pmap->pmap_bits[PG_A_IDX] |
2153 pmap->pmap_bits[PG_M_IDX]);
2154 ptep = pv_pte_lookup(pvp, ptepindex);
2155 if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
2156 pt_entry_t pte;
2157
2158 if (ispt == 0) {
2159 panic("pmap_allocpte: unexpected pte %p/%d",
2160 pvp, (int)ptepindex);
2161 }
2162 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, v);
2163 if (vm_page_unwire_quick(
2164 PHYS_TO_VM_PAGE(pte & PG_FRAME))) {
2165 panic("pmap_allocpte: shared pgtable "
2166 "pg bad wirecount");
2167 }
2168 } else {
2169 pt_entry_t pte;
2170
2171 pte = atomic_swap_long(ptep, v);
2172 if (pte != 0) {
2173 kprintf("install pgtbl mixup 0x%016jx "
2174 "old/new 0x%016jx/0x%016jx\n",
2175 (intmax_t)ptepindex, pte, v);
2176 }
2177 }
2178 }
2179 vm_page_wakeup(m);
2180
2181 /*
2182 * (isnew) may be TRUE or FALSE, pv may or may not be terminal.
2183 */
2184 notnew:
2185 if (pvp) {
2186 KKASSERT(pvp->pv_m != NULL);
2187 ptep = pv_pte_lookup(pvp, ptepindex);
2188 v = VM_PAGE_TO_PHYS(pv->pv_m) |
2189 (pmap->pmap_bits[PG_U_IDX] |
2190 pmap->pmap_bits[PG_RW_IDX] |
2191 pmap->pmap_bits[PG_V_IDX] |
2192 pmap->pmap_bits[PG_A_IDX] |
2193 pmap->pmap_bits[PG_M_IDX]);
2194 if (*ptep != v) {
2195 kprintf("mismatched upper level pt %016jx/%016jx\n",
2196 *ptep, v);
2197 }
2198 }
2199 if (pvpp)
2200 *pvpp = pvp;
2201 else if (pvp)
2202 pv_put(pvp);
2203 return (pv);
2204 }
2205
2206 /*
2207 * This version of pmap_allocpte() checks for possible segment optimizations
2208 * that would allow page-table sharing. It can be called for terminal
2209 * page or page table page ptepindex's.
2210 *
2211 * The function is called with page table page ptepindex's for fictitious
2212 * and unmanaged terminal pages. That is, we don't want to allocate a
2213 * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL
2214 * for this case.
2215 *
2216 * This function can return a pv and *pvpp associated with the passed-in pmap
2217 * OR a pv and *pvpp associated with the shared pmap. In the latter case
2218 * an unmanaged page table page will be entered into the passed-in pmap.
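 *
 * For example, assuming the usual 2MB segment (one page-table
 * page's worth of address space), a mapping is only eligible when
 * entry->start and entry->offset are both 2MB aligned and the
 * segment containing va, b = va & ~SEG_MASK, lies entirely inside
 * the map entry.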
2219 */ 2220 static 2221 pv_entry_t 2222 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, 2223 vm_map_entry_t entry, vm_offset_t va) 2224 { 2225 vm_object_t object; 2226 pmap_t obpmap; 2227 pmap_t *obpmapp; 2228 vm_offset_t b; 2229 pv_entry_t pte_pv; /* in original or shared pmap */ 2230 pv_entry_t pt_pv; /* in original or shared pmap */ 2231 pv_entry_t proc_pd_pv; /* in original pmap */ 2232 pv_entry_t proc_pt_pv; /* in original pmap */ 2233 pv_entry_t xpv; /* PT in shared pmap */ 2234 pd_entry_t *pt; /* PT entry in PD of original pmap */ 2235 pd_entry_t opte; /* contents of *pt */ 2236 pd_entry_t npte; /* contents of *pt */ 2237 vm_page_t m; 2238 2239 /* 2240 * Basic tests, require a non-NULL vm_map_entry, require proper 2241 * alignment and type for the vm_map_entry, require that the 2242 * underlying object already be allocated. 2243 * 2244 * We allow almost any type of object to use this optimization. 2245 * The object itself does NOT have to be sized to a multiple of the 2246 * segment size, but the memory mapping does. 2247 * 2248 * XXX don't handle devices currently, because VM_PAGE_TO_PHYS() 2249 * won't work as expected. 2250 */ 2251 if (entry == NULL || 2252 pmap_mmu_optimize == 0 || /* not enabled */ 2253 (pmap->pm_flags & PMAP_HVM) || /* special pmap */ 2254 ptepindex >= pmap_pd_pindex(0) || /* not terminal or pt */ 2255 entry->inheritance != VM_INHERIT_SHARE || /* not shared */ 2256 entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ 2257 entry->object.vm_object == NULL || /* needs VM object */ 2258 entry->object.vm_object->type == OBJT_DEVICE || /* ick */ 2259 entry->object.vm_object->type == OBJT_MGTDEVICE || /* ick */ 2260 (entry->offset & SEG_MASK) || /* must be aligned */ 2261 (entry->start & SEG_MASK)) { 2262 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2263 } 2264 2265 /* 2266 * Make sure the full segment can be represented. 2267 */ 2268 b = va & ~(vm_offset_t)SEG_MASK; 2269 if (b < entry->start || b + SEG_SIZE > entry->end) 2270 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2271 2272 /* 2273 * If the full segment can be represented dive the VM object's 2274 * shared pmap, allocating as required. 2275 */ 2276 object = entry->object.vm_object; 2277 2278 if (entry->protection & VM_PROT_WRITE) 2279 obpmapp = &object->md.pmap_rw; 2280 else 2281 obpmapp = &object->md.pmap_ro; 2282 2283 #ifdef PMAP_DEBUG2 2284 if (pmap_enter_debug > 0) { 2285 --pmap_enter_debug; 2286 kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p " 2287 "obpmapp %p %p\n", 2288 va, entry->protection, object, 2289 obpmapp, *obpmapp); 2290 kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n", 2291 entry, entry->start, entry->end); 2292 } 2293 #endif 2294 2295 /* 2296 * We allocate what appears to be a normal pmap but because portions 2297 * of this pmap are shared with other unrelated pmaps we have to 2298 * set pm_active to point to all cpus. 2299 * 2300 * XXX Currently using pmap_spin to interlock the update, can't use 2301 * vm_object_hold/drop because the token might already be held 2302 * shared OR exclusive and we don't know. 
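 *
 * The loop below is a standard optimistic-publish pattern:
 * allocate and initialize a candidate pmap without holding any
 * lock, then take pmap_spin and either install the candidate or,
 * if another thread won the race, destroy ours and use the
 * winner's.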
2303 */ 2304 while ((obpmap = *obpmapp) == NULL) { 2305 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); 2306 pmap_pinit_simple(obpmap); 2307 pmap_pinit2(obpmap); 2308 spin_lock(&pmap_spin); 2309 if (*obpmapp != NULL) { 2310 /* 2311 * Handle race 2312 */ 2313 spin_unlock(&pmap_spin); 2314 pmap_release(obpmap); 2315 pmap_puninit(obpmap); 2316 kfree(obpmap, M_OBJPMAP); 2317 obpmap = *obpmapp; /* safety */ 2318 } else { 2319 obpmap->pm_active = smp_active_mask; 2320 obpmap->pm_flags |= PMAP_SEGSHARED; 2321 *obpmapp = obpmap; 2322 spin_unlock(&pmap_spin); 2323 } 2324 } 2325 2326 /* 2327 * Layering is: PTE, PT, PD, PDP, PML4. We have to return the 2328 * pte/pt using the shared pmap from the object but also adjust 2329 * the process pmap's page table page as a side effect. 2330 */ 2331 2332 /* 2333 * Resolve the terminal PTE and PT in the shared pmap. This is what 2334 * we will return. This is true if ptepindex represents a terminal 2335 * page, otherwise pte_pv is actually the PT and pt_pv is actually 2336 * the PD. 2337 */ 2338 pt_pv = NULL; 2339 pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); 2340 retry: 2341 if (ptepindex >= pmap_pt_pindex(0)) 2342 xpv = pte_pv; 2343 else 2344 xpv = pt_pv; 2345 2346 /* 2347 * Resolve the PD in the process pmap so we can properly share the 2348 * page table page. Lock order is bottom-up (leaf first)! 2349 * 2350 * NOTE: proc_pt_pv can be NULL. 2351 */ 2352 proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b), NULL); 2353 proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); 2354 #ifdef PMAP_DEBUG2 2355 if (pmap_enter_debug > 0) { 2356 --pmap_enter_debug; 2357 kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n", 2358 proc_pt_pv, 2359 (proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1), 2360 proc_pd_pv, 2361 va); 2362 } 2363 #endif 2364 2365 /* 2366 * xpv is the page table page pv from the shared object 2367 * (for convenience), from above. 2368 * 2369 * Calculate the pte value for the PT to load into the process PD. 2370 * If we have to change it we must properly dispose of the previous 2371 * entry. 2372 */ 2373 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2374 npte = VM_PAGE_TO_PHYS(xpv->pv_m) | 2375 (pmap->pmap_bits[PG_U_IDX] | 2376 pmap->pmap_bits[PG_RW_IDX] | 2377 pmap->pmap_bits[PG_V_IDX] | 2378 pmap->pmap_bits[PG_A_IDX] | 2379 pmap->pmap_bits[PG_M_IDX]); 2380 2381 /* 2382 * Dispose of previous page table page if it was local to the 2383 * process pmap. If the old pt is not empty we cannot dispose of it 2384 * until we clean it out. This case should not arise very often so 2385 * it is not optimized. 2386 * 2387 * Leave pt_pv and pte_pv (in our object pmap) locked and intact 2388 * for the retry. 2389 */ 2390 if (proc_pt_pv) { 2391 pmap_inval_bulk_t bulk; 2392 2393 if (proc_pt_pv->pv_m->wire_count != 1) { 2394 pv_put(proc_pd_pv); 2395 pv_put(proc_pt_pv); 2396 pmap_remove(pmap, 2397 va & ~(vm_offset_t)SEG_MASK, 2398 (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK); 2399 goto retry; 2400 } 2401 2402 /* 2403 * The release call will indirectly clean out *pt 2404 */ 2405 pmap_inval_bulk_init(&bulk, proc_pt_pv->pv_pmap); 2406 pmap_release_pv(proc_pt_pv, proc_pd_pv, &bulk); 2407 pmap_inval_bulk_flush(&bulk); 2408 proc_pt_pv = NULL; 2409 /* relookup */ 2410 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2411 } 2412 2413 /* 2414 * Handle remaining cases. 
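 *
 * Specifically: if *pt is zero we can simply install npte and
 * account for the new wirings; if *pt already equals npte there is
 * nothing to do; otherwise the old entry must be invalidated and
 * the wire count on the page it pointed at dropped.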
2415 */ 2416 if (*pt == 0) { 2417 atomic_swap_long(pt, npte); 2418 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2419 vm_page_wire_quick(proc_pd_pv->pv_m); /* proc pd for sh pt */ 2420 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2421 } else if (*pt != npte) { 2422 opte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, pt, npte); 2423 2424 #if 0 2425 opte = pte_load_clear(pt); 2426 KKASSERT(opte && opte != npte); 2427 2428 *pt = npte; 2429 #endif 2430 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2431 2432 /* 2433 * Clean up opte, bump the wire_count for the process 2434 * PD page representing the new entry if it was 2435 * previously empty. 2436 * 2437 * If the entry was not previously empty and we have 2438 * a PT in the proc pmap then opte must match that 2439 * pt. The proc pt must be retired (this is done 2440 * later on in this procedure). 2441 * 2442 * NOTE: replacing valid pte, wire_count on proc_pd_pv 2443 * stays the same. 2444 */ 2445 KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]); 2446 m = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2447 if (vm_page_unwire_quick(m)) { 2448 panic("pmap_allocpte_seg: " 2449 "bad wire count %p", 2450 m); 2451 } 2452 } 2453 2454 /* 2455 * The existing process page table was replaced and must be destroyed 2456 * here. 2457 */ 2458 if (proc_pd_pv) 2459 pv_put(proc_pd_pv); 2460 if (pvpp) 2461 *pvpp = pt_pv; 2462 else 2463 pv_put(pt_pv); 2464 2465 return (pte_pv); 2466 } 2467 2468 /* 2469 * Release any resources held by the given physical map. 2470 * 2471 * Called when a pmap initialized by pmap_pinit is being released. Should 2472 * only be called if the map contains no valid mappings. 2473 */ 2474 struct pmap_release_info { 2475 pmap_t pmap; 2476 int retry; 2477 pv_entry_t pvp; 2478 }; 2479 2480 static int pmap_release_callback(pv_entry_t pv, void *data); 2481 2482 void 2483 pmap_release(struct pmap *pmap) 2484 { 2485 struct pmap_release_info info; 2486 2487 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2488 ("pmap still active! %016jx", 2489 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2490 2491 /* 2492 * There is no longer a pmap_list, if there were we would remove the 2493 * pmap from it here. 2494 */ 2495 2496 /* 2497 * Pull pv's off the RB tree in order from low to high and release 2498 * each page. 2499 */ 2500 info.pmap = pmap; 2501 do { 2502 info.retry = 0; 2503 info.pvp = NULL; 2504 2505 spin_lock(&pmap->pm_spin); 2506 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2507 pmap_release_callback, &info); 2508 spin_unlock(&pmap->pm_spin); 2509 2510 if (info.pvp) 2511 pv_put(info.pvp); 2512 } while (info.retry); 2513 2514 2515 /* 2516 * One resident page (the pml4 page) should remain. 2517 * No wired pages should remain. 2518 */ 2519 #if 1 2520 if (pmap->pm_stats.resident_count != 2521 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1) || 2522 pmap->pm_stats.wired_count != 0) { 2523 kprintf("fatal pmap problem - pmap %p flags %08x " 2524 "rescnt=%jd wirecnt=%jd\n", 2525 pmap, 2526 pmap->pm_flags, 2527 pmap->pm_stats.resident_count, 2528 pmap->pm_stats.wired_count); 2529 tsleep(pmap, 0, "DEAD", 0); 2530 } 2531 #else 2532 KKASSERT(pmap->pm_stats.resident_count == 2533 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1)); 2534 KKASSERT(pmap->pm_stats.wired_count == 0); 2535 #endif 2536 } 2537 2538 /* 2539 * Called from low to high. We must cache the proper parent pv so we 2540 * can adjust its wired count. 
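 *
 * The callback recomputes the parent's pindex from the child's,
 * e.g. for a PT page the parent PD's pindex is:
 *
 *	pindex = NUPTE_TOTAL + NUPT_TOTAL +
 *		 ((pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT);
 *
 * and the parent pv is cached in info->pvp so a run of siblings
 * does not re-lock the parent for every child.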
2541 */
2542 static int
2543 pmap_release_callback(pv_entry_t pv, void *data)
2544 {
2545 struct pmap_release_info *info = data;
2546 pmap_t pmap = info->pmap;
2547 vm_pindex_t pindex;
2548 int r;
2549
2550 /*
2551 * Acquire a held and locked pv, check for release race
2552 */
2553 pindex = pv->pv_pindex;
2554 if (info->pvp == pv) {
2555 spin_unlock(&pmap->pm_spin);
2556 info->pvp = NULL;
2557 } else if (pv_hold_try(pv)) {
2558 spin_unlock(&pmap->pm_spin);
2559 } else {
2560 spin_unlock(&pmap->pm_spin);
2561 pv_lock(pv);
2562 pv_put(pv);
2563 info->retry = 1;
2564 return -1;
2565 }
2566 KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex);
2567
2568 if (pv->pv_pindex < pmap_pt_pindex(0)) {
2569 /*
2570 * I am PTE, parent is PT
2571 */
2572 pindex = pv->pv_pindex >> NPTEPGSHIFT;
2573 pindex += NUPTE_TOTAL;
2574 } else if (pv->pv_pindex < pmap_pd_pindex(0)) {
2575 /*
2576 * I am PT, parent is PD
2577 */
2578 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT;
2579 pindex += NUPTE_TOTAL + NUPT_TOTAL;
2580 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) {
2581 /*
2582 * I am PD, parent is PDP
2583 */
2584 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >>
2585 NPDPEPGSHIFT;
2586 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
2587 } else if (pv->pv_pindex < pmap_pml4_pindex()) {
2588 /*
2589 * I am PDP, parent is PML4 (there's only one)
2590 */
2591 #if 0
2592 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL -
2593 NUPD_TOTAL) >> NPML4EPGSHIFT;
2594 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL;
2595 #endif
2596 pindex = pmap_pml4_pindex();
2597 } else {
2598 /*
2599 * parent is NULL
2600 */
2601 if (info->pvp) {
2602 pv_put(info->pvp);
2603 info->pvp = NULL;
2604 }
2605 pindex = 0;
2606 }
2607 if (pindex) {
2608 if (info->pvp && info->pvp->pv_pindex != pindex) {
2609 pv_put(info->pvp);
2610 info->pvp = NULL;
2611 }
2612 if (info->pvp == NULL)
2613 info->pvp = pv_get(pmap, pindex, NULL);
2614 } else {
2615 if (info->pvp) {
2616 pv_put(info->pvp);
2617 info->pvp = NULL;
2618 }
2619 }
2620 r = pmap_release_pv(pv, info->pvp, NULL);
2621 spin_lock(&pmap->pm_spin);
2622
2623 return(r);
2624 }
2625
2626 /*
2627 * Called with held (i.e. also locked) pv. This function will dispose of
2628 * the lock along with the pv.
2629 *
2630 * If the caller already holds the locked parent page table for pv it
2631 * must pass it as pvp, allowing us to avoid a deadlock, else it can
2632 * pass NULL for pvp.
2633 */
2634 static int
2635 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk)
2636 {
2637 vm_page_t p;
2638
2639 /*
2640 * The pmap is currently not spinlocked, pv is held+locked.
2641 * Remove the pv's page from its parent's page table. The
2642 * parent's page table page's wire_count will be decremented.
2643 *
2644 * This will clean out the pte at any level of the page table.
2645 * If smp != 0 all cpus are affected.
2646 *
2647 * Do not tear-down recursively, it's faster to just let the
2648 * release run its course.
2649 */
2650 pmap_remove_pv_pte(pv, pvp, bulk, 0);
2651
2652 /*
2653 * Terminal pvs are unhooked from their vm_pages. Because
2654 * terminal pages aren't page table pages they aren't wired
2655 * by us, so we have to be sure not to unwire them either.
2656 */
2657 if (pv->pv_pindex < pmap_pt_pindex(0)) {
2658 pmap_remove_pv_page(pv);
2659 goto skip;
2660 }
2661
2662 /*
2663 * We leave the top-level page table page cached, wired, and
2664 * mapped in the pmap until the dtor function (pmap_puninit())
2665 * gets called.
2666 *
2667 * Since we are leaving the top-level pv intact we need
2668 * to break out of what would otherwise be an infinite loop.
2669 */
2670 if (pv->pv_pindex == pmap_pml4_pindex()) {
2671 pv_put(pv);
2672 return(-1);
2673 }
2674
2675 /*
2676 * For page table pages (other than the top-level page),
2677 * remove and free the vm_page. The representative mapping
2678 * removed above by pmap_remove_pv_pte() did not undo the
2679 * last wire_count so we have to do that as well.
2680 */
2681 p = pmap_remove_pv_page(pv);
2682 vm_page_busy_wait(p, FALSE, "pmaprl");
2683 if (p->wire_count != 1) {
2684 kprintf("p->wire_count was %016lx %d\n",
2685 pv->pv_pindex, p->wire_count);
2686 }
2687 KKASSERT(p->wire_count == 1);
2688 KKASSERT(p->flags & PG_UNMANAGED);
2689
2690 vm_page_unwire(p, 0);
2691 KKASSERT(p->wire_count == 0);
2692
2693 vm_page_free(p);
2694 skip:
2695 pv_free(pv, pvp);
2696
2697 return 0;
2698 }
2699
2700 /*
2701 * This function will remove the pte associated with a pv from its parent.
2702 * Terminal pv's are supported. All cpus specified by (bulk) are properly
2703 * invalidated.
2704 *
2705 * The wire count will be dropped on the parent page table. The wire
2706 * count on the page being removed (pv->pv_m) from the parent page table
2707 * is NOT touched. Note that terminal pages will not have any additional
2708 * wire counts while page table pages will have at least one representing
2709 * the mapping, plus others representing sub-mappings.
2710 *
2711 * NOTE: Cannot be called on kernel page table pages, only KVM terminal
2712 * pages and user page table and terminal pages.
2713 *
2714 * NOTE: The pte being removed might be unmanaged, and the pv supplied might
2715 * be freshly allocated and not imply that the pte is managed. In this
2716 * case pv->pv_m should be NULL.
2717 *
2718 * The pv must be locked. The pvp, if supplied, must be locked. All
2719 * supplied pv's will remain locked on return.
2720 *
2721 * XXX must lock parent pv's if they exist to remove pte XXX
2722 */
2723 static
2724 void
2725 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
2726 int destroy)
2727 {
2728 vm_pindex_t ptepindex = pv->pv_pindex;
2729 pmap_t pmap = pv->pv_pmap;
2730 vm_page_t p;
2731 int gotpvp = 0;
2732
2733 KKASSERT(pmap);
2734
2735 if (ptepindex == pmap_pml4_pindex()) {
2736 /*
2737 * We are the top level PML4E table, there is no parent.
2738 */
2739 p = pmap->pm_pmlpv->pv_m;
2740 KKASSERT(pv->pv_m == p); /* debugging */
2741 } else if (ptepindex >= pmap_pdp_pindex(0)) {
2742 /*
2743 * Remove a PDP page from the PML4E. This can only occur
2744 * with user page tables. We do not have to lock the
2745 * pml4 PV so just ignore pvp.
2746 */
2747 vm_pindex_t pml4_pindex;
2748 vm_pindex_t pdp_index;
2749 pml4_entry_t *pdp;
2750
2751 pdp_index = ptepindex - pmap_pdp_pindex(0);
2752 if (pvp == NULL) {
2753 pml4_pindex = pmap_pml4_pindex();
2754 pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL);
2755 KKASSERT(pvp);
2756 gotpvp = 1;
2757 }
2758
2759 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
2760 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
2761 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2762 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0);
2763 KKASSERT(pv->pv_m == p); /* debugging */
2764 } else if (ptepindex >= pmap_pd_pindex(0)) {
2765 /*
2766 * Remove a PD page from the PDP
2767 *
2768 * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
2769 * of a simple pmap because it stops at
2770 * the PD page.
2771 */
2772 vm_pindex_t pdp_pindex;
2773 vm_pindex_t pd_index;
2774 pdp_entry_t *pd;
2775
2776 pd_index = ptepindex - pmap_pd_pindex(0);
2777
2778 if (pvp == NULL) {
2779 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
2780 (pd_index >> NPML4EPGSHIFT);
2781 pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL);
2782 gotpvp = 1;
2783 }
2784
2785 if (pvp) {
2786 pd = pv_pte_lookup(pvp, pd_index &
2787 ((1ul << NPDPEPGSHIFT) - 1));
2788 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0);
2789 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2790 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0);
2791 } else {
2792 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE);
2793 p = pv->pv_m; /* degenerate test later */
2794 }
2795 KKASSERT(pv->pv_m == p); /* debugging */
2796 } else if (ptepindex >= pmap_pt_pindex(0)) {
2797 /*
2798 * Remove a PT page from the PD
2799 */
2800 vm_pindex_t pd_pindex;
2801 vm_pindex_t pt_index;
2802 pd_entry_t *pt;
2803
2804 pt_index = ptepindex - pmap_pt_pindex(0);
2805
2806 if (pvp == NULL) {
2807 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
2808 (pt_index >> NPDPEPGSHIFT);
2809 pvp = pv_get(pv->pv_pmap, pd_pindex, NULL);
2810 KKASSERT(pvp);
2811 gotpvp = 1;
2812 }
2813
2814 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
2815 #if 0
2816 KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0,
2817 ("*pt unexpectedly invalid %016jx "
2818 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p",
2819 *pt, gotpvp, ptepindex, pt_index, pv, pvp));
2820 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
2821 #else
2822 if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) {
2823 kprintf("*pt unexpectedly invalid %016jx "
2824 "gotpvp=%d ptepindex=%ld ptindex=%ld "
2825 "pv=%p pvp=%p\n",
2826 *pt, gotpvp, ptepindex, pt_index, pv, pvp);
2827 tsleep(pt, 0, "DEAD", 0);
2828 p = pv->pv_m;
2829 } else {
2830 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
2831 }
2832 #endif
2833 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0);
2834 KKASSERT(pv->pv_m == p); /* debugging */
2835 } else {
2836 /*
2837 * Remove a PTE from the PT page. The PV might exist even if
2838 * the PTE is not managed, in which case pv->pv_m should be
2839 * NULL.
2840 *
2841 * NOTE: Userland pmaps manage the parent PT/PD/PDP page
2842 * table pages but the kernel_pmap does not.
2843 *
2844 * NOTE: pv's must be locked bottom-up to avoid deadlocking.
2845 * pv is a pte_pv so we can safely lock pt_pv.
2846 *
2847 * NOTE: FICTITIOUS pages may have multiple physical mappings
2848 * so PHYS_TO_VM_PAGE() will not necessarily work for
2849 * terminal ptes.
2850 */
2851 vm_pindex_t pt_pindex;
2852 pt_entry_t *ptep;
2853 pt_entry_t pte;
2854 vm_offset_t va;
2855
2856 pt_pindex = ptepindex >> NPTEPGSHIFT;
2857 va = (vm_offset_t)ptepindex << PAGE_SHIFT;
2858
2859 if (ptepindex >= NUPTE_USER) {
2860 ptep = vtopte(ptepindex << PAGE_SHIFT);
2861 KKASSERT(pvp == NULL);
2862 /* pvp remains NULL */
2863 } else {
2864 if (pvp == NULL) {
2865 pt_pindex = NUPTE_TOTAL +
2866 (ptepindex >> NPDPEPGSHIFT);
2867 pvp = pv_get(pv->pv_pmap, pt_pindex, NULL);
2868 KKASSERT(pvp);
2869 gotpvp = 1;
2870 }
2871 ptep = pv_pte_lookup(pvp, ptepindex &
2872 ((1ul << NPDPEPGSHIFT) - 1));
2873 }
2874 pte = pmap_inval_bulk(bulk, va, ptep, 0);
2875 if (bulk == NULL) /* XXX */
2876 cpu_invlpg((void *)va); /* XXX */
2877
2878 /*
2879 * Now update the vm_page_t
2880 */
2881 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) &&
2882 (pte & pmap->pmap_bits[PG_V_IDX])) {
2883 /*
2884 * Valid managed page, adjust (p).
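 *
 * Note that the modified bit is propagated to the vm_page via
 * vm_page_dirty() only when pmap_track_modified() allows it,
 * while the accessed bit simply sets PG_REFERENCED.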
2885 */ 2886 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) { 2887 p = pv->pv_m; 2888 } else { 2889 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2890 KKASSERT(pv->pv_m == p); 2891 } 2892 if (pte & pmap->pmap_bits[PG_M_IDX]) { 2893 if (pmap_track_modified(ptepindex)) 2894 vm_page_dirty(p); 2895 } 2896 if (pte & pmap->pmap_bits[PG_A_IDX]) { 2897 vm_page_flag_set(p, PG_REFERENCED); 2898 } 2899 } else { 2900 /* 2901 * Unmanaged page, do not try to adjust the vm_page_t. 2902 * pv could be freshly allocated for a pmap_enter(), 2903 * replacing an unmanaged page with a managed one. 2904 * 2905 * pv->pv_m might reflect the new page and not the 2906 * existing page. 2907 * 2908 * We could extract p from the physical address and 2909 * adjust it but we explicitly do not for unmanaged 2910 * pages. 2911 */ 2912 p = NULL; 2913 } 2914 if (pte & pmap->pmap_bits[PG_W_IDX]) 2915 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2916 if (pte & pmap->pmap_bits[PG_G_IDX]) 2917 cpu_invlpg((void *)va); 2918 } 2919 2920 /* 2921 * If requested, scrap the underlying pv->pv_m and the underlying 2922 * pv. If this is a page-table-page we must also free the page. 2923 * 2924 * pvp must be returned locked. 2925 */ 2926 if (destroy == 1) { 2927 /* 2928 * page table page (PT, PD, PDP, PML4), caller was responsible 2929 * for testing wired_count. 2930 */ 2931 KKASSERT(pv->pv_m->wire_count == 1); 2932 p = pmap_remove_pv_page(pv); 2933 pv_free(pv, pvp); 2934 pv = NULL; 2935 2936 vm_page_busy_wait(p, FALSE, "pgpun"); 2937 vm_page_unwire(p, 0); 2938 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2939 vm_page_free(p); 2940 } else if (destroy == 2) { 2941 /* 2942 * Normal page, remove from pmap and leave the underlying 2943 * page untouched. 2944 */ 2945 pmap_remove_pv_page(pv); 2946 pv_free(pv, pvp); 2947 pv = NULL; /* safety */ 2948 } 2949 2950 /* 2951 * If we acquired pvp ourselves then we are responsible for 2952 * recursively deleting it. 2953 */ 2954 if (pvp && gotpvp) { 2955 /* 2956 * Recursively destroy higher-level page tables. 2957 * 2958 * This is optional. If we do not, they will still 2959 * be destroyed when the process exits. 2960 * 2961 * NOTE: Do not destroy pv_entry's with extra hold refs, 2962 * a caller may have unlocked it and intends to 2963 * continue to use it. 2964 */ 2965 if (pmap_dynamic_delete && 2966 pvp->pv_m && 2967 pvp->pv_m->wire_count == 1 && 2968 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 2969 pvp->pv_pindex != pmap_pml4_pindex()) { 2970 if (pmap_dynamic_delete == 2) 2971 kprintf("A %jd %08x\n", pvp->pv_pindex, pvp->pv_hold); 2972 if (pmap != &kernel_pmap) { 2973 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 2974 pvp = NULL; /* safety */ 2975 } else { 2976 kprintf("Attempt to remove kernel_pmap pindex " 2977 "%jd\n", pvp->pv_pindex); 2978 pv_put(pvp); 2979 } 2980 } else { 2981 pv_put(pvp); 2982 } 2983 } 2984 } 2985 2986 /* 2987 * Remove the vm_page association to a pv. The pv must be locked. 2988 */ 2989 static 2990 vm_page_t 2991 pmap_remove_pv_page(pv_entry_t pv) 2992 { 2993 vm_page_t m; 2994 2995 m = pv->pv_m; 2996 vm_page_spin_lock(m); 2997 KKASSERT(m && m == pv->pv_m); 2998 pv->pv_m = NULL; 2999 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3000 pmap_page_stats_deleting(m); 3001 if (TAILQ_EMPTY(&m->md.pv_list)) 3002 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3003 vm_page_spin_unlock(m); 3004 3005 return(m); 3006 } 3007 3008 /* 3009 * Grow the number of kernel page table entries, if needed. 3010 * 3011 * This routine is always called to validate any address space 3012 * beyond KERNBASE (for kldloads). 
kernel_vm_end only governs the address 3013 * space below KERNBASE. 3014 * 3015 * kernel_map must be locked exclusively by the caller. 3016 */ 3017 void 3018 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 3019 { 3020 vm_paddr_t paddr; 3021 vm_offset_t ptppaddr; 3022 vm_page_t nkpg; 3023 pd_entry_t *pt, newpt; 3024 pdp_entry_t newpd; 3025 int update_kernel_vm_end; 3026 3027 /* 3028 * bootstrap kernel_vm_end on first real VM use 3029 */ 3030 if (kernel_vm_end == 0) { 3031 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 3032 nkpt = 0; 3033 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3034 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 3035 ~(PAGE_SIZE * NPTEPG - 1); 3036 nkpt++; 3037 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 3038 kernel_vm_end = kernel_map.max_offset; 3039 break; 3040 } 3041 } 3042 } 3043 3044 /* 3045 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 3046 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 3047 * do not want to force-fill 128G worth of page tables. 3048 */ 3049 if (kstart < KERNBASE) { 3050 if (kstart > kernel_vm_end) 3051 kstart = kernel_vm_end; 3052 KKASSERT(kend <= KERNBASE); 3053 update_kernel_vm_end = 1; 3054 } else { 3055 update_kernel_vm_end = 0; 3056 } 3057 3058 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 3059 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 3060 3061 if (kend - 1 >= kernel_map.max_offset) 3062 kend = kernel_map.max_offset; 3063 3064 while (kstart < kend) { 3065 pt = pmap_pt(&kernel_pmap, kstart); 3066 if (pt == NULL) { 3067 /* We need a new PD entry */ 3068 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3069 VM_ALLOC_NORMAL | 3070 VM_ALLOC_SYSTEM | 3071 VM_ALLOC_INTERRUPT); 3072 if (nkpg == NULL) { 3073 panic("pmap_growkernel: no memory to grow " 3074 "kernel"); 3075 } 3076 paddr = VM_PAGE_TO_PHYS(nkpg); 3077 pmap_zero_page(paddr); 3078 newpd = (pdp_entry_t) 3079 (paddr | 3080 kernel_pmap.pmap_bits[PG_V_IDX] | 3081 kernel_pmap.pmap_bits[PG_RW_IDX] | 3082 kernel_pmap.pmap_bits[PG_A_IDX] | 3083 kernel_pmap.pmap_bits[PG_M_IDX]); 3084 *pmap_pd(&kernel_pmap, kstart) = newpd; 3085 continue; /* try again */ 3086 } 3087 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3088 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3089 ~(PAGE_SIZE * NPTEPG - 1); 3090 if (kstart - 1 >= kernel_map.max_offset) { 3091 kstart = kernel_map.max_offset; 3092 break; 3093 } 3094 continue; 3095 } 3096 3097 /* 3098 * We need a new PT 3099 * 3100 * This index is bogus, but out of the way 3101 */ 3102 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3103 VM_ALLOC_NORMAL | 3104 VM_ALLOC_SYSTEM | 3105 VM_ALLOC_INTERRUPT); 3106 if (nkpg == NULL) 3107 panic("pmap_growkernel: no memory to grow kernel"); 3108 3109 vm_page_wire(nkpg); 3110 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 3111 pmap_zero_page(ptppaddr); 3112 newpt = (pd_entry_t)(ptppaddr | 3113 kernel_pmap.pmap_bits[PG_V_IDX] | 3114 kernel_pmap.pmap_bits[PG_RW_IDX] | 3115 kernel_pmap.pmap_bits[PG_A_IDX] | 3116 kernel_pmap.pmap_bits[PG_M_IDX]); 3117 atomic_swap_long(pmap_pt(&kernel_pmap, kstart), newpt); 3118 3119 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3120 ~(PAGE_SIZE * NPTEPG - 1); 3121 3122 if (kstart - 1 >= kernel_map.max_offset) { 3123 kstart = kernel_map.max_offset; 3124 break; 3125 } 3126 } 3127 3128 /* 3129 * Only update kernel_vm_end for areas below KERNBASE. 3130 */ 3131 if (update_kernel_vm_end && kernel_vm_end < kstart) 3132 kernel_vm_end = kstart; 3133 } 3134 3135 /* 3136 * Add a reference to the specified pmap. 
3137 */ 3138 void 3139 pmap_reference(pmap_t pmap) 3140 { 3141 if (pmap != NULL) 3142 atomic_add_int(&pmap->pm_count, 1); 3143 } 3144 3145 /*************************************************** 3146 * page management routines. 3147 ***************************************************/ 3148 3149 /* 3150 * Hold a pv without locking it 3151 */ 3152 static void 3153 pv_hold(pv_entry_t pv) 3154 { 3155 atomic_add_int(&pv->pv_hold, 1); 3156 } 3157 3158 /* 3159 * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv 3160 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3161 * the pv properly. 3162 * 3163 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3164 * pv list via its page) must be held by the caller in order to stabilize 3165 * the pv. 3166 */ 3167 static int 3168 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3169 { 3170 u_int count; 3171 3172 /* 3173 * Critical path shortcut expects pv to already have one ref 3174 * (for the pv->pv_pmap). 3175 */ 3176 if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) { 3177 #ifdef PMAP_DEBUG 3178 pv->pv_func = func; 3179 pv->pv_line = lineno; 3180 #endif 3181 return TRUE; 3182 } 3183 3184 for (;;) { 3185 count = pv->pv_hold; 3186 cpu_ccfence(); 3187 if ((count & PV_HOLD_LOCKED) == 0) { 3188 if (atomic_cmpset_int(&pv->pv_hold, count, 3189 (count + 1) | PV_HOLD_LOCKED)) { 3190 #ifdef PMAP_DEBUG 3191 pv->pv_func = func; 3192 pv->pv_line = lineno; 3193 #endif 3194 return TRUE; 3195 } 3196 } else { 3197 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) 3198 return FALSE; 3199 } 3200 /* retry */ 3201 } 3202 } 3203 3204 /* 3205 * Drop a previously held pv_entry which could not be locked, allowing its 3206 * destruction. 3207 * 3208 * Must not be called with a spinlock held as we might zfree() the pv if it 3209 * is no longer associated with a pmap and this was the last hold count. 3210 */ 3211 static void 3212 pv_drop(pv_entry_t pv) 3213 { 3214 u_int count; 3215 3216 for (;;) { 3217 count = pv->pv_hold; 3218 cpu_ccfence(); 3219 KKASSERT((count & PV_HOLD_MASK) > 0); 3220 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 3221 (PV_HOLD_LOCKED | 1)); 3222 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 3223 if ((count & PV_HOLD_MASK) == 1) { 3224 #ifdef PMAP_DEBUG2 3225 if (pmap_enter_debug > 0) { 3226 --pmap_enter_debug; 3227 kprintf("pv_drop: free pv %p\n", pv); 3228 } 3229 #endif 3230 KKASSERT(count == 1); 3231 KKASSERT(pv->pv_pmap == NULL); 3232 zfree(pvzone, pv); 3233 } 3234 return; 3235 } 3236 /* retry */ 3237 } 3238 } 3239 3240 /* 3241 * Find or allocate the requested PV entry, returning a locked, held pv. 3242 * 3243 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 3244 * for the caller and one representing the pmap and vm_page association. 3245 * 3246 * If (*isnew) is zero, the returned pv will have only one hold count. 3247 * 3248 * Since both associations can only be adjusted while the pv is locked, 3249 * together they represent just one additional hold. 
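 *
 * Reminder on the pv_hold encoding used throughout: the low bits
 * (PV_HOLD_MASK) form the hold count and PV_HOLD_LOCKED /
 * PV_HOLD_WAITING are flag bits, so a value of
 * (PV_HOLD_LOCKED | 2) means 'locked, with two holds'.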
3250 */
3251 static
3252 pv_entry_t
3253 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
3254 {
3255 pv_entry_t pv;
3256 pv_entry_t pnew = NULL;
3257
3258 spin_lock(&pmap->pm_spin);
3259 for (;;) {
3260 /*
3261 * Shortcut cache
3262 */
3263 pv = pmap->pm_pvhint;
3264 cpu_ccfence();
3265 if (pv == NULL ||
3266 pv->pv_pmap != pmap ||
3267 pv->pv_pindex != pindex) {
3268 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
3269 pindex);
3270 }
3271 if (pv == NULL) {
3272 vm_pindex_t *pmark;
3273
3274 /*
3275 * We need to stage a new pv entry
3276 */
3277 if (pnew == NULL) {
3278 spin_unlock(&pmap->pm_spin);
3279 pnew = zalloc(pvzone);
3280 spin_lock(&pmap->pm_spin);
3281 continue;
3282 }
3283
3284 /*
3285 * We need to block if someone is holding a
3286 * placemarker. The exclusive spinlock is a
3287 * sufficient interlock; as long as we determine
3288 * the placemarker has not been acquired we do not
3289 * need to get it.
3290 */
3291 pmark = pmap_placemarker_hash(pmap, pindex);
3292
3293 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) {
3294 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP);
3295 ssleep(pmark, &pmap->pm_spin, 0, "pvplc", 0);
3296 continue;
3297 }
3298
3299 /*
3300 * Setup the new entry
3301 */
3302 pnew->pv_pmap = pmap;
3303 pnew->pv_pindex = pindex;
3304 pnew->pv_hold = PV_HOLD_LOCKED | 2;
3305 #ifdef PMAP_DEBUG
3306 pnew->pv_func = func;
3307 pnew->pv_line = lineno;
3308 if (pnew->pv_line_lastfree > 0) {
3309 pnew->pv_line_lastfree =
3310 -pnew->pv_line_lastfree;
3311 }
3312 #endif
3313 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
3314 atomic_add_long(&pmap->pm_stats.resident_count, 1);
3315 spin_unlock(&pmap->pm_spin);
3316 *isnew = 1;
3317
3318 KKASSERT(pv == NULL);
3319 return(pnew);
3320 }
3321
3322 /*
3323 * We have an entry, clean up any staged pv we had allocated,
3324 * then block until we can lock the entry.
3325 */
3326 if (pnew) {
3327 spin_unlock(&pmap->pm_spin);
3328 zfree(pvzone, pnew);
3329 pnew = NULL;
3330 spin_lock(&pmap->pm_spin);
3331 continue;
3332 }
3333 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
3334 spin_unlock(&pmap->pm_spin);
3335 KKASSERT(pv->pv_pmap == pmap &&
3336 pv->pv_pindex == pindex);
3337 *isnew = 0;
3338 return(pv);
3339 }
3340 spin_unlock(&pmap->pm_spin);
3341 _pv_lock(pv PMAP_DEBUG_COPY);
3342 pv_put(pv);
3343 spin_lock(&pmap->pm_spin);
3344 }
3345 }
3346
3347 /*
3348 * Find the requested PV entry, returning a locked+held pv or NULL
3349 */
3350 static
3351 pv_entry_t
3352 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL)
3353 {
3354 pv_entry_t pv;
3355
3356 spin_lock(&pmap->pm_spin);
3357 for (;;) {
3358 /*
3359 * Shortcut cache
3360 */
3361 pv = pmap->pm_pvhint;
3362 cpu_ccfence();
3363 if (pv == NULL ||
3364 pv->pv_pmap != pmap ||
3365 pv->pv_pindex != pindex) {
3366 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
3367 pindex);
3368 }
3369 if (pv == NULL) {
3370 /*
3371 * Block if there is a placemarker. If we are to
3372 * return it, we must also acquire the spot.
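 *
 * A placemarker is simply the pindex parked in one of the pmap's
 * pm_placemarks[] hash slots (see pmap_placemarker_hash()). It
 * reserves a pindex for which no pv_entry exists yet, and
 * PM_PLACEMARK_WAKEUP is or'd into the slot when a thread is
 * sleeping on it.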
3373 */ 3374 vm_pindex_t *pmark; 3375 3376 pmark = pmap_placemarker_hash(pmap, pindex); 3377 3378 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3379 ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3380 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3381 ssleep(pmark, &pmap->pm_spin, 0, "pvpld", 0); 3382 continue; 3383 } 3384 if (pmarkp) { 3385 if (atomic_swap_long(pmark, pindex) != 3386 PM_NOPLACEMARK) { 3387 panic("_pv_get: pmark race"); 3388 } 3389 *pmarkp = pmark; 3390 } 3391 spin_unlock(&pmap->pm_spin); 3392 return NULL; 3393 } 3394 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3395 pv_cache(pv, pindex); 3396 spin_unlock(&pmap->pm_spin); 3397 KKASSERT(pv->pv_pmap == pmap && 3398 pv->pv_pindex == pindex); 3399 return(pv); 3400 } 3401 spin_unlock(&pmap->pm_spin); 3402 _pv_lock(pv PMAP_DEBUG_COPY); 3403 pv_put(pv); 3404 spin_lock(&pmap->pm_spin); 3405 } 3406 } 3407 3408 /* 3409 * Lookup, hold, and attempt to lock (pmap,pindex). 3410 * 3411 * If the entry does not exist NULL is returned and *errorp is set to 0 3412 * 3413 * If the entry exists and could be successfully locked it is returned and 3414 * errorp is set to 0. 3415 * 3416 * If the entry exists but could NOT be successfully locked it is returned 3417 * held and *errorp is set to 1. 3418 * 3419 * If the entry is placemarked by someone else NULL is returned and *errorp 3420 * is set to 1. 3421 */ 3422 static 3423 pv_entry_t 3424 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) 3425 { 3426 pv_entry_t pv; 3427 3428 spin_lock_shared(&pmap->pm_spin); 3429 3430 pv = pmap->pm_pvhint; 3431 cpu_ccfence(); 3432 if (pv == NULL || 3433 pv->pv_pmap != pmap || 3434 pv->pv_pindex != pindex) { 3435 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 3436 } 3437 3438 if (pv == NULL) { 3439 vm_pindex_t *pmark; 3440 3441 pmark = pmap_placemarker_hash(pmap, pindex); 3442 3443 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3444 *errorp = 1; 3445 } else if (pmarkp && 3446 atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { 3447 *errorp = 0; 3448 } else { 3449 /* 3450 * Can't set a placemark with a NULL pmarkp, or if 3451 * pmarkp is non-NULL but we failed to set our 3452 * placemark. 3453 */ 3454 *errorp = 1; 3455 } 3456 if (pmarkp) 3457 *pmarkp = pmark; 3458 spin_unlock_shared(&pmap->pm_spin); 3459 3460 return NULL; 3461 } 3462 3463 /* 3464 * XXX This has problems if the lock is shared, why? 
3465 */
3466 if (pv_hold_try(pv)) {
3467 pv_cache(pv, pindex); /* overwrite ok (shared lock) */
3468 spin_unlock_shared(&pmap->pm_spin);
3469 *errorp = 0;
3470 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex);
3471 return(pv); /* lock succeeded */
3472 }
3473 spin_unlock_shared(&pmap->pm_spin);
3474 *errorp = 1;
3475
3476 return (pv); /* lock failed */
3477 }
3478
3479 /*
3480 * Lock a held pv, keeping the hold count
3481 */
3482 static
3483 void
3484 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
3485 {
3486 u_int count;
3487
3488 for (;;) {
3489 count = pv->pv_hold;
3490 cpu_ccfence();
3491 if ((count & PV_HOLD_LOCKED) == 0) {
3492 if (atomic_cmpset_int(&pv->pv_hold, count,
3493 count | PV_HOLD_LOCKED)) {
3494 #ifdef PMAP_DEBUG
3495 pv->pv_func = func;
3496 pv->pv_line = lineno;
3497 #endif
3498 return;
3499 }
3500 continue;
3501 }
3502 tsleep_interlock(pv, 0);
3503 if (atomic_cmpset_int(&pv->pv_hold, count,
3504 count | PV_HOLD_WAITING)) {
3505 #ifdef PMAP_DEBUG2
3506 if (pmap_enter_debug > 0) {
3507 --pmap_enter_debug;
3508 kprintf("pv waiting on %s:%d\n",
3509 pv->pv_func, pv->pv_line);
3510 }
3511 #endif
3512 tsleep(pv, PINTERLOCKED, "pvwait", hz);
3513 }
3514 /* retry */
3515 }
3516 }
3517
3518 /*
3519 * Unlock a held and locked pv, keeping the hold count.
3520 */
3521 static
3522 void
3523 pv_unlock(pv_entry_t pv)
3524 {
3525 u_int count;
3526
3527 for (;;) {
3528 count = pv->pv_hold;
3529 cpu_ccfence();
3530 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >=
3531 (PV_HOLD_LOCKED | 1));
3532 if (atomic_cmpset_int(&pv->pv_hold, count,
3533 count &
3534 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) {
3535 if (count & PV_HOLD_WAITING)
3536 wakeup(pv);
3537 break;
3538 }
3539 }
3540 }
3541
3542 /*
3543 * Unlock and drop a pv. If the pv is no longer associated with a pmap
3544 * and the hold count drops to zero we will free it.
3545 *
3546 * Caller should not hold any spin locks. We are protected from hold races
3547 * by virtue of holds occurring only with a pmap_spin or vm_page_spin
3548 * lock held. A pv cannot be located otherwise.
3549 */
3550 static
3551 void
3552 pv_put(pv_entry_t pv)
3553 {
3554 #ifdef PMAP_DEBUG2
3555 if (pmap_enter_debug > 0) {
3556 --pmap_enter_debug;
3557 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold);
3558 }
3559 #endif
3560
3561 /*
3562 * Normal put-aways must have a pv_m associated with the pv,
3563 * but allow the case where the pv has been destructed due
3564 * to pmap_dynamic_delete.
3565 */
3566 KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL);
3567
3568 /*
3569 * Fast - shortcut most common condition
3570 */
3571 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1))
3572 return;
3573
3574 /*
3575 * Slow
3576 */
3577 pv_unlock(pv);
3578 pv_drop(pv);
3579 }
3580
3581 /*
3582 * Remove the pmap association from a pv, require that pv_m already be removed,
3583 * then unlock and drop the pv. Any pte operations must have already been
3584 * completed. This call may result in a last-drop which will physically free
3585 * the pv.
3586 *
3587 * Removing the pmap association entails an additional drop.
3588 *
3589 * pv must be exclusively locked on call and will be disposed of on return.
3590 */
3591 static
3592 void
3593 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL)
3594 {
3595 pmap_t pmap;
3596
3597 #ifdef PMAP_DEBUG
3598 pv->pv_func_lastfree = func;
3599 pv->pv_line_lastfree = lineno;
3600 #endif
3601 KKASSERT(pv->pv_m == NULL);
3602 KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >=
3603 (PV_HOLD_LOCKED|1));
3604 if ((pmap = pv->pv_pmap) != NULL) {
3605 spin_lock(&pmap->pm_spin);
3606 KKASSERT(pv->pv_pmap == pmap);
3607 if (pmap->pm_pvhint == pv)
3608 pmap->pm_pvhint = NULL;
3609 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3610 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3611 pv->pv_pmap = NULL;
3612 pv->pv_pindex = 0;
3613 spin_unlock(&pmap->pm_spin);
3614
3615 /*
3616 * Try to shortcut three atomic ops, otherwise fall through
3617 * and do it normally. Drop two refs and the lock all in
3618 * one go.
3619 */
3620 if (pvp)
3621 vm_page_unwire_quick(pvp->pv_m);
3622 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3623 #ifdef PMAP_DEBUG2
3624 if (pmap_enter_debug > 0) {
3625 --pmap_enter_debug;
3626 kprintf("pv_free: free pv %p\n", pv);
3627 }
3628 #endif
3629 zfree(pvzone, pv);
3630 return;
3631 }
3632 pv_drop(pv); /* ref for pv_pmap */
3633 }
3634 pv_unlock(pv);
3635 pv_drop(pv);
3636 }
3637
3638 /*
3639 * This routine is very drastic, but can save the system
3640 * in a pinch.
3641 */
3642 void
3643 pmap_collect(void)
3644 {
3645 int i;
3646 vm_page_t m;
3647 static int warningdone=0;
3648
3649 if (pmap_pagedaemon_waken == 0)
3650 return;
3651 pmap_pagedaemon_waken = 0;
3652 if (warningdone < 5) {
3653 kprintf("pmap_collect: collecting pv entries -- "
3654 "suggest increasing PMAP_SHPGPERPROC\n");
3655 warningdone++;
3656 }
3657
3658 for (i = 0; i < vm_page_array_size; i++) {
3659 m = &vm_page_array[i];
3660 if (m->wire_count || m->hold_count)
3661 continue;
3662 if (vm_page_busy_try(m, TRUE) == 0) {
3663 if (m->wire_count == 0 && m->hold_count == 0) {
3664 pmap_remove_all(m);
3665 }
3666 vm_page_wakeup(m);
3667 }
3668 }
3669 }
3670
3671 /*
3672 * Scan the pmap for active page table entries and issue a callback.
3673 * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
3674 * its parent page table.
3675 *
3676 * pte_pv will be NULL if the page or page table is unmanaged.
3677 * pt_pv will point to the page table page containing the pte for the page.
3678 *
3679 * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
3680 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3681 * process pmap's PD and page to the callback function. This can be
3682 * confusing because the pt_pv is really a pd_pv, and the target page
3683 * table page is simply aliased by the pmap and not owned by it.
3684 *
3685 * It is assumed that the start and end are properly rounded to the page size.
3686 *
3687 * It is assumed that PD pages and above are managed and thus in the RB tree,
3688 * allowing us to use RB_SCAN from the PD pages down for ranged scans.
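 *
 * Typical use, sketched: a caller such as pmap_remove() fills in a
 * pmap_scan_info with the pmap, the [sva, eva) range, a callback in
 * info->func and its cookie in info->arg, then calls
 * pmap_scan(&info, 1) to run the callback on every active pte with
 * SMP invalidation enabled.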
3689 */
3690 struct pmap_scan_info {
3691 struct pmap *pmap;
3692 vm_offset_t sva;
3693 vm_offset_t eva;
3694 vm_pindex_t sva_pd_pindex;
3695 vm_pindex_t eva_pd_pindex;
3696 void (*func)(pmap_t, struct pmap_scan_info *,
3697 pv_entry_t, vm_pindex_t *, pv_entry_t,
3698 int, vm_offset_t,
3699 pt_entry_t *, void *);
3700 void *arg;
3701 pmap_inval_bulk_t bulk_core;
3702 pmap_inval_bulk_t *bulk;
3703 int count;
3704 int stop;
3705 };
3706
3707 static int pmap_scan_cmp(pv_entry_t pv, void *data);
3708 static int pmap_scan_callback(pv_entry_t pv, void *data);
3709
3710 static void
3711 pmap_scan(struct pmap_scan_info *info, int smp_inval)
3712 {
3713 struct pmap *pmap = info->pmap;
3714 pv_entry_t pd_pv; /* A page directory PV */
3715 pv_entry_t pt_pv; /* A page table PV */
3716 pv_entry_t pte_pv; /* A page table entry PV */
3717 vm_pindex_t *pte_placemark;
3718 vm_pindex_t *pt_placemark;
3719 pt_entry_t *ptep;
3720 pt_entry_t oldpte;
3721 struct pv_entry dummy_pv;
3722
3723 info->stop = 0;
3724 if (pmap == NULL)
3725 return;
3726 if (info->sva == info->eva)
3727 return;
3728 if (smp_inval) {
3729 info->bulk = &info->bulk_core;
3730 pmap_inval_bulk_init(&info->bulk_core, pmap);
3731 } else {
3732 info->bulk = NULL;
3733 }
3734
3735 /*
3736 * Hold the token for stability; if the pmap is empty we have nothing
3737 * to do.
3738 */
3739 #if 0
3740 if (pmap->pm_stats.resident_count == 0) {
3741 return;
3742 }
3743 #endif
3744
3745 info->count = 0;
3746
3747 /*
3748 * Special handling for scanning one page, which is a very common
3749 * operation (it is?).
3750 *
3751 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
3752 */
3753 if (info->sva + PAGE_SIZE == info->eva) {
3754 if (info->sva >= VM_MAX_USER_ADDRESS) {
3755 /*
3756 * Kernel mappings do not track wire counts on
3757 * page table pages and only maintain pd_pv and
3758 * pte_pv levels so pmap_scan() works.
3759 */
3760 pt_pv = NULL;
3761 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva),
3762 &pte_placemark);
3763 ptep = vtopte(info->sva);
3764 } else {
3765 /*
3766 * User pages which are unmanaged will not have a
3767 * pte_pv. User page table pages which are unmanaged
3768 * (shared from elsewhere) will also not have a pt_pv.
3769 * The func() callback will pass both pte_pv and pt_pv
3770 * as NULL in that case.
3771 *
3772 * We hold pte_placemark across the operation for
3773 * unmanaged pages.
3774 *
3775 * WARNING! We must hold pt_placemark across the
3776 * *ptep test to prevent misinterpreting
3777 * a non-zero *ptep as a shared page
3778 * table page. Hold it across the function
3779 * callback as well for SMP safety.
3780 */ 3781 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3782 &pte_placemark); 3783 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), 3784 &pt_placemark); 3785 if (pt_pv == NULL) { 3786 KKASSERT(pte_pv == NULL); 3787 pd_pv = pv_get(pmap, 3788 pmap_pd_pindex(info->sva), 3789 NULL); 3790 if (pd_pv) { 3791 ptep = pv_pte_lookup(pd_pv, 3792 pmap_pt_index(info->sva)); 3793 if (*ptep) { 3794 info->func(pmap, info, 3795 NULL, pt_placemark, 3796 pd_pv, 1, 3797 info->sva, ptep, 3798 info->arg); 3799 } else { 3800 pv_placemarker_wakeup(pmap, 3801 pt_placemark); 3802 } 3803 pv_put(pd_pv); 3804 } else { 3805 pv_placemarker_wakeup(pmap, 3806 pt_placemark); 3807 } 3808 pv_placemarker_wakeup(pmap, pte_placemark); 3809 goto fast_skip; 3810 } 3811 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 3812 } 3813 3814 /* 3815 * NOTE: *ptep can't be ripped out from under us if we hold 3816 * pte_pv (or pte_placemark) locked, but bits can 3817 * change. 3818 */ 3819 oldpte = *ptep; 3820 cpu_ccfence(); 3821 if (oldpte == 0) { 3822 KKASSERT(pte_pv == NULL); 3823 pv_placemarker_wakeup(pmap, pte_placemark); 3824 } else if (pte_pv) { 3825 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3826 pmap->pmap_bits[PG_V_IDX])) == 3827 (pmap->pmap_bits[PG_MANAGED_IDX] | 3828 pmap->pmap_bits[PG_V_IDX]), 3829 ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p", 3830 *ptep, oldpte, info->sva, pte_pv)); 3831 info->func(pmap, info, pte_pv, NULL, pt_pv, 0, 3832 info->sva, ptep, info->arg); 3833 } else { 3834 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3835 pmap->pmap_bits[PG_V_IDX])) == 3836 pmap->pmap_bits[PG_V_IDX], 3837 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", 3838 *ptep, oldpte, info->sva)); 3839 info->func(pmap, info, NULL, pte_placemark, pt_pv, 0, 3840 info->sva, ptep, info->arg); 3841 } 3842 if (pt_pv) 3843 pv_put(pt_pv); 3844 fast_skip: 3845 pmap_inval_bulk_flush(info->bulk); 3846 return; 3847 } 3848 3849 /* 3850 * Nominal scan case, RB_SCAN() for PD pages and iterate from 3851 * there. 3852 * 3853 * WARNING! eva can overflow our standard ((N + mask) >> bits) 3854 * bounds, resulting in a pd_pindex of 0. To solve the 3855 * problem we use an inclusive range. 3856 */ 3857 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 3858 info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); 3859 3860 if (info->sva >= VM_MAX_USER_ADDRESS) { 3861 /* 3862 * The kernel does not currently maintain any pv_entry's for 3863 * higher-level page tables. 3864 */ 3865 bzero(&dummy_pv, sizeof(dummy_pv)); 3866 dummy_pv.pv_pindex = info->sva_pd_pindex; 3867 spin_lock(&pmap->pm_spin); 3868 while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { 3869 pmap_scan_callback(&dummy_pv, info); 3870 ++dummy_pv.pv_pindex; 3871 if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ 3872 break; 3873 } 3874 spin_unlock(&pmap->pm_spin); 3875 } else { 3876 /* 3877 * User page tables maintain local PML4, PDP, and PD 3878 * pv_entry's at the very least. PT pv's might be 3879 * unmanaged and thus not exist. PTE pv's might be 3880 * unmanaged and thus not exist. 3881 */ 3882 spin_lock(&pmap->pm_spin); 3883 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, 3884 pmap_scan_callback, info); 3885 spin_unlock(&pmap->pm_spin); 3886 } 3887 pmap_inval_bulk_flush(info->bulk); 3888 } 3889 3890 /* 3891 * WARNING! pmap->pm_spin held 3892 * 3893 * WARNING! eva can overflow our standard ((N + mask) >> bits) 3894 * bounds, resulting in a pd_pindex of 0. To solve the 3895 * problem we use an inclusive range. 
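 *
 * Concretely, an eva at the very top of the address space would
 * bias-and-shift to a pd_pindex of 0, which is useless as an
 * exclusive bound; computing pmap_pd_pindex(eva - PAGE_SIZE) and
 * scanning the inclusive range [sva_pd_pindex, eva_pd_pindex]
 * avoids the wrap.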
3896 */
3897 static int
3898 pmap_scan_cmp(pv_entry_t pv, void *data)
3899 {
3900 struct pmap_scan_info *info = data;
3901 if (pv->pv_pindex < info->sva_pd_pindex)
3902 return(-1);
3903 if (pv->pv_pindex > info->eva_pd_pindex)
3904 return(1);
3905 return(0);
3906 }
3907
3908 /*
3909 * pmap_scan() by PDs
3910 *
3911 * WARNING! pmap->pm_spin held
3912 */
3913 static int
3914 pmap_scan_callback(pv_entry_t pv, void *data)
3915 {
3916 struct pmap_scan_info *info = data;
3917 struct pmap *pmap = info->pmap;
3918 pv_entry_t pd_pv; /* A page directory PV */
3919 pv_entry_t pt_pv; /* A page table PV */
3920 vm_pindex_t *pt_placemark;
3921 pt_entry_t *ptep;
3922 pt_entry_t oldpte;
3923 vm_offset_t sva;
3924 vm_offset_t eva;
3925 vm_offset_t va_next;
3926 vm_pindex_t pd_pindex;
3927 int error;
3928
3929 /*
3930 * Stop if requested
3931 */
3932 if (info->stop)
3933 return -1;
3934
3935 /*
3936 * Pull the PD pindex from the pv before releasing the spinlock.
3937 *
3938 * WARNING: pv is faked for kernel pmap scans.
3939 */
3940 pd_pindex = pv->pv_pindex;
3941 spin_unlock(&pmap->pm_spin);
3942 pv = NULL; /* invalid after spinlock unlocked */
3943
3944 /*
3945 * Calculate the page range within the PD. SIMPLE pmaps are
3946 * direct-mapped for the entire 2^64 address space. Normal pmaps
3947 * reflect the user and kernel address space which requires
3948 * canonicalization with regard to converting pd_pindex's back
3949 * into addresses.
3950 */
3951 sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT;
3952 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 &&
3953 (sva & PML4_SIGNMASK)) {
3954 sva |= PML4_SIGNMASK;
3955 }
3956 eva = sva + NBPDP; /* can overflow */
3957 if (sva < info->sva)
3958 sva = info->sva;
3959 if (eva < info->sva || eva > info->eva)
3960 eva = info->eva;
3961
3962 /*
3963 * NOTE: kernel mappings do not track page table pages, only
3964 * terminal pages.
3965 *
3966 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
3967 * However, for the scan to be efficient we try to
3968 * cache items top-down.
3969 */
3970 pd_pv = NULL;
3971 pt_pv = NULL;
3972
3973 for (; sva < eva; sva = va_next) {
3974 if (info->stop)
3975 break;
3976 if (sva >= VM_MAX_USER_ADDRESS) {
3977 if (pt_pv) {
3978 pv_put(pt_pv);
3979 pt_pv = NULL;
3980 }
3981 goto kernel_skip;
3982 }
3983
3984 /*
3985 * PD cache, scan shortcut if it doesn't exist.
3986 */
3987 if (pd_pv == NULL) {
3988 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL);
3989 } else if (pd_pv->pv_pmap != pmap ||
3990 pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
3991 pv_put(pd_pv);
3992 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL);
3993 }
3994 if (pd_pv == NULL) {
3995 va_next = (sva + NBPDP) & ~PDPMASK;
3996 if (va_next < sva)
3997 va_next = eva;
3998 continue;
3999 }
4000
4001 /*
4002 * PT cache
4003 *
4004 * NOTE: The cached pt_pv can be removed from the pmap when
4005 * pmap_dynamic_delete is enabled.
4006 */
4007 		if (pt_pv && (pt_pv->pv_pmap != pmap ||
4008 			      pt_pv->pv_pindex != pmap_pt_pindex(sva))) {
4009 			pv_put(pt_pv);
4010 			pt_pv = NULL;
4011 		}
4012 		if (pt_pv == NULL) {
4013 			pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva),
4014 					   &pt_placemark, &error);
4015 			if (error) {
4016 				pv_put(pd_pv);	/* lock order */
4017 				pd_pv = NULL;
4018 				if (pt_pv) {
4019 					pv_lock(pt_pv);
4020 					pv_put(pt_pv);
4021 					pt_pv = NULL;
4022 				} else {
4023 					pv_placemarker_wait(pmap, pt_placemark);
4024 				}
4025 				va_next = sva;
4026 				continue;
4027 			}
4028 			/* may have to re-check later if pt_pv is NULL here */
4029 		}
4030 
4031 		/*
4032 		 * If pt_pv is NULL we either have a shared page table
4033 		 * page and must issue a callback specific to that case,
4034 		 * or there is no page table page.
4035 		 *
4036 		 * Either way we can skip the page table page.
4037 		 *
4038 		 * WARNING! pt_pv can also be NULL due to a pv creation
4039 		 *	    race where we find it to be NULL and then
4040 		 *	    later see a pte_pv. But it's possible the pt_pv
4041 		 *	    got created in between the two operations, so
4042 		 *	    we must check.
4043 		 */
4044 		if (pt_pv == NULL) {
4045 			/*
4046 			 * Possible unmanaged (shared from another pmap)
4047 			 * page table page.
4048 			 *
4049 			 * WARNING! We must hold pt_placemark across the
4050 			 *	    *ptep test to prevent misinterpreting
4051 			 *	    a non-zero *ptep as a shared page
4052 			 *	    table page. Hold it across the function
4053 			 *	    callback as well for SMP safety.
4054 			 */
4055 			ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva));
4056 			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
4057 				info->func(pmap, info, NULL, pt_placemark,
4058 					   pd_pv, 1,
4059 					   sva, ptep, info->arg);
4060 			} else {
4061 				pv_placemarker_wakeup(pmap, pt_placemark);
4062 			}
4063 
4064 			/*
4065 			 * Done, move to next page table page.
4066 			 */
4067 			va_next = (sva + NBPDR) & ~PDRMASK;
4068 			if (va_next < sva)
4069 				va_next = eva;
4070 			continue;
4071 		}
4072 
4073 		/*
4074 		 * From this point in the loop testing pt_pv for non-NULL
4075 		 * means we are in UVM, else if it is NULL we are in KVM.
4076 		 *
4077 		 * Limit our scan to either the end of the va represented
4078 		 * by the current page table page, or to the end of the
4079 		 * range being removed.
4080 		 */
4081 kernel_skip:
4082 		va_next = (sva + NBPDR) & ~PDRMASK;
4083 		if (va_next < sva)
4084 			va_next = eva;
4085 		if (va_next > eva)
4086 			va_next = eva;
4087 
4088 		/*
4089 		 * Scan the page table for pages. Some pages may not be
4090 		 * managed (might not have a pv_entry).
4091 		 *
4092 		 * There is no page table management for kernel pages so
4093 		 * pt_pv will be NULL in that case, but otherwise pt_pv
4094 		 * is non-NULL, locked, and referenced.
4095 		 */
4096 
4097 		/*
4098 		 * At this point a non-NULL pt_pv means a UVA, and a NULL
4099 		 * pt_pv means a KVA.
4100 		 */
4101 		if (pt_pv)
4102 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
4103 		else
4104 			ptep = vtopte(sva);
4105 
4106 		while (sva < va_next) {
4107 			pv_entry_t pte_pv;
4108 			vm_pindex_t *pte_placemark;
4109 
4110 			/*
4111 			 * Yield every 64 pages, stop if requested.
4112 			 */
4113 			if ((++info->count & 63) == 0)
4114 				lwkt_user_yield();
4115 			if (info->stop)
4116 				break;
4117 
4118 			/*
4119 			 * We can shortcut our scan if *ptep == 0. This is
4120 			 * an unlocked check.
4121 			 */
4122 			if (*ptep == 0) {
4123 				sva += PAGE_SIZE;
4124 				++ptep;
4125 				continue;
4126 			}
4127 			cpu_ccfence();
4128 
4129 			/*
4130 			 * Acquire the related pte_pv, if any. If *ptep == 0
4131 			 * the related pte_pv should not exist, but if *ptep
4132 			 * is not zero the pte_pv may or may not exist (e.g.
4133 			 * will not exist for an unmanaged page).
4134 * 4135 * However a multitude of races are possible here 4136 * so if we cannot lock definite state we clean out 4137 * our cache and break the inner while() loop to 4138 * force a loop up to the top of the for(). 4139 * 4140 * XXX unlock/relock pd_pv, pt_pv, and re-test their 4141 * validity instead of looping up? 4142 */ 4143 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 4144 &pte_placemark, &error); 4145 if (error) { 4146 pv_put(pd_pv); /* lock order */ 4147 pd_pv = NULL; 4148 if (pt_pv) { 4149 pv_put(pt_pv); /* lock order */ 4150 pt_pv = NULL; 4151 } 4152 if (pte_pv) { /* block */ 4153 pv_lock(pte_pv); 4154 pv_put(pte_pv); 4155 pte_pv = NULL; 4156 } else { 4157 pv_placemarker_wait(pmap, 4158 pte_placemark); 4159 } 4160 va_next = sva; /* retry */ 4161 break; 4162 } 4163 4164 /* 4165 * Reload *ptep after successfully locking the 4166 * pindex. If *ptep == 0 we had better NOT have a 4167 * pte_pv. 4168 */ 4169 cpu_ccfence(); 4170 oldpte = *ptep; 4171 if (oldpte == 0) { 4172 if (pte_pv) { 4173 kprintf("Unexpected non-NULL pte_pv " 4174 "%p pt_pv %p " 4175 "*ptep = %016lx/%016lx\n", 4176 pte_pv, pt_pv, *ptep, oldpte); 4177 panic("Unexpected non-NULL pte_pv"); 4178 } else { 4179 pv_placemarker_wakeup(pmap, pte_placemark); 4180 } 4181 sva += PAGE_SIZE; 4182 ++ptep; 4183 continue; 4184 } 4185 4186 /* 4187 * We can't hold pd_pv across the callback (because 4188 * we don't pass it to the callback and the callback 4189 * might deadlock) 4190 */ 4191 if (pd_pv) { 4192 vm_page_wire_quick(pd_pv->pv_m); 4193 pv_unlock(pd_pv); 4194 } 4195 4196 /* 4197 * Ready for the callback. The locked pte_pv (if any) 4198 * is consumed by the callback. pte_pv will exist if 4199 * the page is managed, and will not exist if it 4200 * isn't. 4201 */ 4202 if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4203 /* 4204 * Managed pte 4205 */ 4206 KASSERT(pte_pv && 4207 (oldpte & pmap->pmap_bits[PG_V_IDX]), 4208 ("badC *ptep %016lx/%016lx sva %016lx " 4209 "pte_pv %p", 4210 *ptep, oldpte, sva, pte_pv)); 4211 /* 4212 * We must unlock pd_pv across the callback 4213 * to avoid deadlocks on any recursive 4214 * disposal. Re-check that it still exists 4215 * after re-locking. 4216 * 4217 * Call target disposes of pte_pv and may 4218 * destroy but will not dispose of pt_pv. 4219 */ 4220 info->func(pmap, info, pte_pv, NULL, 4221 pt_pv, 0, 4222 sva, ptep, info->arg); 4223 } else { 4224 /* 4225 * Unmanaged pte 4226 * 4227 * We must unlock pd_pv across the callback 4228 * to avoid deadlocks on any recursive 4229 * disposal. Re-check that it still exists 4230 * after re-locking. 4231 * 4232 * Call target disposes of pte_pv or 4233 * pte_placemark and may destroy but will 4234 * not dispose of pt_pv. 4235 */ 4236 KASSERT(pte_pv == NULL && 4237 (oldpte & pmap->pmap_bits[PG_V_IDX]), 4238 ("badD *ptep %016lx/%016lx sva %016lx " 4239 "pte_pv %p pte_pv->pv_m %p ", 4240 *ptep, oldpte, sva, 4241 pte_pv, (pte_pv ? pte_pv->pv_m : NULL))); 4242 if (pte_pv) 4243 kprintf("RaceD\n"); 4244 if (pte_pv) { 4245 info->func(pmap, info, 4246 pte_pv, NULL, 4247 pt_pv, 0, 4248 sva, ptep, info->arg); 4249 } else { 4250 info->func(pmap, info, 4251 NULL, pte_placemark, 4252 pt_pv, 0, 4253 sva, ptep, info->arg); 4254 } 4255 } 4256 if (pd_pv) { 4257 pv_lock(pd_pv); 4258 vm_page_unwire_quick(pd_pv->pv_m); 4259 if (pd_pv->pv_pmap == NULL) { 4260 va_next = sva; /* retry */ 4261 break; 4262 } 4263 } 4264 4265 /* 4266 * NOTE: The cached pt_pv can be removed from the 4267 * pmap when pmap_dynamic_delete is enabled, 4268 * which will cause ptep to become stale. 
4269 * 4270 * This also means that no pages remain under 4271 * the PT, so we can just break out of the inner 4272 * loop and let the outer loop clean everything 4273 * up. 4274 */ 4275 if (pt_pv && pt_pv->pv_pmap != pmap) 4276 break; 4277 pte_pv = NULL; 4278 sva += PAGE_SIZE; 4279 ++ptep; 4280 } 4281 } 4282 if (pd_pv) { 4283 pv_put(pd_pv); 4284 pd_pv = NULL; 4285 } 4286 if (pt_pv) { 4287 pv_put(pt_pv); 4288 pt_pv = NULL; 4289 } 4290 if ((++info->count & 7) == 0) 4291 lwkt_user_yield(); 4292 4293 /* 4294 * Relock before returning. 4295 */ 4296 spin_lock(&pmap->pm_spin); 4297 return (0); 4298 } 4299 4300 void 4301 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4302 { 4303 struct pmap_scan_info info; 4304 4305 info.pmap = pmap; 4306 info.sva = sva; 4307 info.eva = eva; 4308 info.func = pmap_remove_callback; 4309 info.arg = NULL; 4310 pmap_scan(&info, 1); 4311 #if 0 4312 cpu_invltlb(); 4313 if (eva - sva < 1024*1024) { 4314 while (sva < eva) { 4315 cpu_invlpg((void *)sva); 4316 sva += PAGE_SIZE; 4317 } 4318 } 4319 #endif 4320 } 4321 4322 static void 4323 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4324 { 4325 struct pmap_scan_info info; 4326 4327 info.pmap = pmap; 4328 info.sva = sva; 4329 info.eva = eva; 4330 info.func = pmap_remove_callback; 4331 info.arg = NULL; 4332 pmap_scan(&info, 0); 4333 } 4334 4335 static void 4336 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4337 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 4338 pv_entry_t pt_pv, int sharept, 4339 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4340 { 4341 pt_entry_t pte; 4342 4343 if (pte_pv) { 4344 /* 4345 * Managed entry 4346 * 4347 * This will also drop pt_pv's wire_count. Note that 4348 * terminal pages are not wired based on mmu presence. 4349 * 4350 * NOTE: If this is the kernel_pmap, pt_pv can be NULL. 4351 */ 4352 KKASSERT(pte_pv->pv_m != NULL); 4353 pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk, 2); 4354 pte_pv = NULL; /* safety */ 4355 4356 /* 4357 * Recursively destroy higher-level page tables. 4358 * 4359 * This is optional. If we do not, they will still 4360 * be destroyed when the process exits. 4361 * 4362 * NOTE: Do not destroy pv_entry's with extra hold refs, 4363 * a caller may have unlocked it and intends to 4364 * continue to use it. 4365 */ 4366 if (pmap_dynamic_delete && 4367 pt_pv && 4368 pt_pv->pv_m && 4369 pt_pv->pv_m->wire_count == 1 && 4370 (pt_pv->pv_hold & PV_HOLD_MASK) == 2 && 4371 pt_pv->pv_pindex != pmap_pml4_pindex()) { 4372 if (pmap_dynamic_delete == 2) 4373 kprintf("B %jd %08x\n", pt_pv->pv_pindex, pt_pv->pv_hold); 4374 pv_hold(pt_pv); /* extra hold */ 4375 pmap_remove_pv_pte(pt_pv, NULL, info->bulk, 1); 4376 pv_lock(pt_pv); /* prior extra hold + relock */ 4377 } 4378 } else if (sharept == 0) { 4379 /* 4380 * Unmanaged pte (pte_placemark is non-NULL) 4381 * 4382 * pt_pv's wire_count is still bumped by unmanaged pages 4383 * so we must decrement it manually. 4384 * 4385 * We have to unwire the target page table page. 4386 */ 4387 pte = pmap_inval_bulk(info->bulk, va, ptep, 0); 4388 if (pte & pmap->pmap_bits[PG_W_IDX]) 4389 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4390 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4391 if (vm_page_unwire_quick(pt_pv->pv_m)) 4392 panic("pmap_remove: insufficient wirecount"); 4393 pv_placemarker_wakeup(pmap, pte_placemark); 4394 } else { 4395 /* 4396 * Unmanaged page table (pt, pd, or pdp. Not pte) for 4397 * a shared page table. 
4398  *
4399  * pt_pv is actually the pd_pv for our pmap (not the shared
4400  * object pmap).
4401  *
4402  * We have to unwire the target page table page and we
4403  * have to unwire our page directory page.
4404  *
4405  * It is unclear how we could invalidate just a segment, so we
4406  * pass -1, which invalidates the entire TLB.
4407  */
4408 		pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0);
4409 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
4410 		KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0);
4411 		if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
4412 			panic("pmap_remove: shared pgtable1 bad wirecount");
4413 		if (vm_page_unwire_quick(pt_pv->pv_m))
4414 			panic("pmap_remove: shared pgtable2 bad wirecount");
4415 		pv_placemarker_wakeup(pmap, pte_placemark);
4416 	}
4417 }
4418 
4419 /*
4420  * Removes this physical page from all physical maps in which it resides.
4421  * Reflects back modify bits to the pager.
4422  *
4423  * This routine may not be called from an interrupt.
4424  */
4425 static
4426 void
4427 pmap_remove_all(vm_page_t m)
4428 {
4429 	pv_entry_t pv;
4430 	pmap_inval_bulk_t bulk;
4431 
4432 	if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/)
4433 		return;
4434 
4435 	vm_page_spin_lock(m);
4436 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4437 		KKASSERT(pv->pv_m == m);
4438 		if (pv_hold_try(pv)) {
4439 			vm_page_spin_unlock(m);
4440 		} else {
4441 			vm_page_spin_unlock(m);
4442 			pv_lock(pv);
4443 			pv_put(pv);
4444 			vm_page_spin_lock(m);
4445 			continue;
4446 		}
4447 		KKASSERT(pv->pv_pmap && pv->pv_m == m);
4448 
4449 		/*
4450 		 * Holding no spinlocks, pv is locked. Once we scrap
4451 		 * pv we can no longer use it as a list iterator (but
4452 		 * we are doing a TAILQ_FIRST() so we are ok).
4453 		 */
4454 		pmap_inval_bulk_init(&bulk, pv->pv_pmap);
4455 		pmap_remove_pv_pte(pv, NULL, &bulk, 2);
4456 		pv = NULL;	/* safety */
4457 		pmap_inval_bulk_flush(&bulk);
4458 		vm_page_spin_lock(m);
4459 	}
4460 	KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
4461 	vm_page_spin_unlock(m);
4462 }
4463 
4464 /*
4465  * Removes the page from a particular pmap
4466  */
4467 void
4468 pmap_remove_specific(pmap_t pmap, vm_page_t m)
4469 {
4470 	pv_entry_t pv;
4471 	pmap_inval_bulk_t bulk;
4472 
4473 	if (!pmap_initialized)
4474 		return;
4475 
4476 again:
4477 	vm_page_spin_lock(m);
4478 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4479 		if (pv->pv_pmap != pmap)
4480 			continue;
4481 		KKASSERT(pv->pv_m == m);
4482 		if (pv_hold_try(pv)) {
4483 			vm_page_spin_unlock(m);
4484 		} else {
4485 			vm_page_spin_unlock(m);
4486 			pv_lock(pv);
4487 			pv_put(pv);
4488 			goto again;
4489 		}
4490 		KKASSERT(pv->pv_pmap == pmap && pv->pv_m == m);
4491 
4492 		/*
4493 		 * Holding no spinlocks, pv is locked. Once gone it can't
4494 		 * be used as an iterator. In fact, because we couldn't
4495 		 * necessarily lock it atomically it may have moved within
4496 		 * the list and ALSO cannot be used as an iterator.
4497 		 */
4498 		pmap_inval_bulk_init(&bulk, pv->pv_pmap);
4499 		pmap_remove_pv_pte(pv, NULL, &bulk, 2);
4500 		pv = NULL;	/* safety */
4501 		pmap_inval_bulk_flush(&bulk);
4502 		goto again;
4503 	}
4504 	vm_page_spin_unlock(m);
4505 }
4506 
4507 /*
4508  * Set the physical protection on the specified range of this map
4509  * as requested. This function is typically only used for debug watchpoints
4510  * and COW pages.
4511  *
4512  * This function may not be called from an interrupt if the map is
4513  * not the kernel_pmap.
4514  *
4515  * NOTE! For shared page table pages we just unmap the page.
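 *
 * Typical uses (sketch; the pmap and range values are illustrative):
 */
#if 0
	/* Downgrade [sva, eva) to read-only, e.g. for COW. PG_RW is
	 * cleared and, for tracked mappings, any PG_M state is first
	 * reflected back to the vm_page via vm_page_dirty(). */
	pmap_protect(pmap, sva, eva, VM_PROT_READ);

	/* Removing read access removes the mappings entirely. */
	pmap_protect(pmap, sva, eva, VM_PROT_NONE);
#endif
/*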
4516 */ 4517 void 4518 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4519 { 4520 struct pmap_scan_info info; 4521 /* JG review for NX */ 4522 4523 if (pmap == NULL) 4524 return; 4525 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 4526 pmap_remove(pmap, sva, eva); 4527 return; 4528 } 4529 if (prot & VM_PROT_WRITE) 4530 return; 4531 info.pmap = pmap; 4532 info.sva = sva; 4533 info.eva = eva; 4534 info.func = pmap_protect_callback; 4535 info.arg = &prot; 4536 pmap_scan(&info, 1); 4537 } 4538 4539 static 4540 void 4541 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4542 pv_entry_t pte_pv, vm_pindex_t *pte_placemark, 4543 pv_entry_t pt_pv, int sharept, 4544 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4545 { 4546 pt_entry_t pbits; 4547 pt_entry_t cbits; 4548 pt_entry_t pte; 4549 vm_page_t m; 4550 4551 again: 4552 pbits = *ptep; 4553 cbits = pbits; 4554 if (pte_pv) { 4555 KKASSERT(pte_pv->pv_m != NULL); 4556 m = NULL; 4557 if (pbits & pmap->pmap_bits[PG_A_IDX]) { 4558 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4559 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4560 KKASSERT(m == pte_pv->pv_m); 4561 vm_page_flag_set(m, PG_REFERENCED); 4562 } 4563 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4564 } 4565 if (pbits & pmap->pmap_bits[PG_M_IDX]) { 4566 if (pmap_track_modified(pte_pv->pv_pindex)) { 4567 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4568 if (m == NULL) { 4569 m = PHYS_TO_VM_PAGE(pbits & 4570 PG_FRAME); 4571 } 4572 vm_page_dirty(m); 4573 } 4574 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4575 } 4576 } 4577 } else if (sharept) { 4578 /* 4579 * Unmanaged page table, pt_pv is actually the pd_pv 4580 * for our pmap (not the object's shared pmap). 4581 * 4582 * When asked to protect something in a shared page table 4583 * page we just unmap the page table page. We have to 4584 * invalidate the tlb in this situation. 4585 * 4586 * XXX Warning, shared page tables will not be used for 4587 * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings 4588 * so PHYS_TO_VM_PAGE() should be safe here. 4589 */ 4590 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 4591 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4592 panic("pmap_protect: pgtable1 pg bad wirecount"); 4593 if (vm_page_unwire_quick(pt_pv->pv_m)) 4594 panic("pmap_protect: pgtable2 pg bad wirecount"); 4595 ptep = NULL; 4596 } 4597 /* else unmanaged page, adjust bits, no wire changes */ 4598 4599 if (ptep) { 4600 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4601 #ifdef PMAP_DEBUG2 4602 if (pmap_enter_debug > 0) { 4603 --pmap_enter_debug; 4604 kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p " 4605 "pt_pv=%p cbits=%08lx\n", 4606 va, ptep, pte_pv, 4607 pt_pv, cbits 4608 ); 4609 } 4610 #endif 4611 if (pbits != cbits) { 4612 vm_offset_t xva; 4613 4614 xva = (sharept) ? (vm_offset_t)-1 : va; 4615 if (!pmap_inval_smp_cmpset(pmap, xva, 4616 ptep, pbits, cbits)) { 4617 goto again; 4618 } 4619 } 4620 } 4621 if (pte_pv) 4622 pv_put(pte_pv); 4623 else 4624 pv_placemarker_wakeup(pmap, pte_placemark); 4625 } 4626 4627 /* 4628 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4629 * mapping at that address. Set protection and wiring as requested. 4630 * 4631 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4632 * possible. If it is we enter the page into the appropriate shared pmap 4633 * hanging off the related VM object instead of the passed pmap, then we 4634 * share the page table page from the VM object's pmap into the current pmap. 
4635  *
4636  * NOTE: This routine MUST insert the page into the pmap now, it cannot
4637  *	 lazy-evaluate.
4638  *
4639  * NOTE: If (m) is PG_UNMANAGED it may also be a temporary fake vm_page_t;
4640  *	 never record it.
4641  */
4642 void
4643 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4644 	   boolean_t wired, vm_map_entry_t entry)
4645 {
4646 	pv_entry_t pt_pv;	/* page table */
4647 	pv_entry_t pte_pv;	/* page table entry */
4648 	vm_pindex_t *pte_placemark;
4649 	pt_entry_t *ptep;
4650 	vm_paddr_t opa;
4651 	pt_entry_t origpte, newpte;
4652 	vm_paddr_t pa;
4653 
4654 	if (pmap == NULL)
4655 		return;
4656 	va = trunc_page(va);
4657 #ifdef PMAP_DIAGNOSTIC
4658 	if (va >= KvaEnd)
4659 		panic("pmap_enter: toobig");
4660 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
4661 		panic("pmap_enter: invalid to pmap_enter page table "
4662 		      "pages (va: 0x%lx)", va);
4663 #endif
4664 	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
4665 		kprintf("Warning: pmap_enter called on UVA with "
4666 			"kernel_pmap\n");
4667 #ifdef DDB
4668 		db_print_backtrace();
4669 #endif
4670 	}
4671 	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
4672 		kprintf("Warning: pmap_enter called on KVA without "
4673 			"kernel_pmap\n");
4674 #ifdef DDB
4675 		db_print_backtrace();
4676 #endif
4677 	}
4678 
4679 	/*
4680 	 * Get locked PV entries for our new page table entry (pte_pv or
4681 	 * pte_placemark) and for its parent page table (pt_pv). We need
4682 	 * the parent so we can resolve the location of the ptep.
4683 	 *
4684 	 * Only hardware MMU actions can modify the ptep out from
4685 	 * under us.
4686 	 *
4687 	 * if (m) is fictitious or unmanaged we do not create a managing
4688 	 * pte_pv for it. Any pre-existing page's management state must
4689 	 * match (avoiding code complexity).
4690 	 *
4691 	 * If the pmap is still being initialized we assume existing
4692 	 * page tables.
4693 	 *
4694 	 * Kernel mappings do not track page table pages (i.e. pt_pv).
4695 	 *
4696 	 * WARNING! If replacing a managed mapping with an unmanaged mapping
4697 	 *	    pte_pv will wind up being non-NULL and must be handled
4698 	 *	    below.
4699 	 */
4700 	if (pmap_initialized == FALSE) {
4701 		pte_pv = NULL;
4702 		pt_pv = NULL;
4703 		pte_placemark = NULL;
4704 		ptep = vtopte(va);
4705 		origpte = *ptep;
4706 	} else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */
4707 		pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark);
4708 		KKASSERT(pte_pv == NULL);
4709 		if (va >= VM_MAX_USER_ADDRESS) {
4710 			pt_pv = NULL;
4711 			ptep = vtopte(va);
4712 		} else {
4713 			pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va),
4714 						  NULL, entry, va);
4715 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4716 		}
4717 		origpte = *ptep;
4718 		cpu_ccfence();
4719 		KASSERT(origpte == 0 ||
4720 			(origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0,
4721 			("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va));
4722 	} else {
4723 		if (va >= VM_MAX_USER_ADDRESS) {
4724 			/*
4725 			 * Kernel map, pv_entry-tracked.
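			 *
			 * (Sketch of the split made here and in the else
			 * branch: kernel PTEs are located directly via
			 * vtopte(), user PTEs via the page table page held
			 * by pt_pv. No new API is assumed.)
			 */
#if 0
			ptep = (va >= VM_MAX_USER_ADDRESS) ?
			       vtopte(va) :
			       pv_pte_lookup(pt_pv, pmap_pte_index(va));
#endif
			/*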
4726 */ 4727 pt_pv = NULL; 4728 pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); 4729 ptep = vtopte(va); 4730 } else { 4731 /* 4732 * User map 4733 */ 4734 pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va), 4735 &pt_pv, entry, va); 4736 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4737 } 4738 pte_placemark = NULL; /* safety */ 4739 origpte = *ptep; 4740 cpu_ccfence(); 4741 KASSERT(origpte == 0 || 4742 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]), 4743 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4744 } 4745 4746 pa = VM_PAGE_TO_PHYS(m); 4747 opa = origpte & PG_FRAME; 4748 4749 /* 4750 * Calculate the new PTE. Note that pte_pv alone does not mean 4751 * the new pte_pv is managed, it could exist because the old pte 4752 * was managed even if the new one is not. 4753 */ 4754 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4755 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4756 if (wired) 4757 newpte |= pmap->pmap_bits[PG_W_IDX]; 4758 if (va < VM_MAX_USER_ADDRESS) 4759 newpte |= pmap->pmap_bits[PG_U_IDX]; 4760 if (pte_pv && (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) == 0) 4761 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4762 // if (pmap == &kernel_pmap) 4763 // newpte |= pgeflag; 4764 newpte |= pmap->pmap_cache_bits[m->pat_mode]; 4765 if (m->flags & PG_FICTITIOUS) 4766 newpte |= pmap->pmap_bits[PG_DEVICE_IDX]; 4767 4768 /* 4769 * It is possible for multiple faults to occur in threaded 4770 * environments, the existing pte might be correct. 4771 */ 4772 if (((origpte ^ newpte) & 4773 ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4774 pmap->pmap_bits[PG_A_IDX])) == 0) { 4775 goto done; 4776 } 4777 4778 /* 4779 * Ok, either the address changed or the protection or wiring 4780 * changed. 4781 * 4782 * Clear the current entry, interlocking the removal. For managed 4783 * pte's this will also flush the modified state to the vm_page. 4784 * Atomic ops are mandatory in order to ensure that PG_M events are 4785 * not lost during any transition. 4786 * 4787 * WARNING: The caller has busied the new page but not the original 4788 * vm_page which we are trying to replace. Because we hold 4789 * the pte_pv lock, but have not busied the page, PG bits 4790 * can be cleared out from under us. 4791 */ 4792 if (opa) { 4793 if (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4794 /* 4795 * Old page was managed. Expect pte_pv to exist. 4796 * (it might also exist if the old page was unmanaged). 4797 * 4798 * NOTE: pt_pv won't exist for a kernel page 4799 * (managed or otherwise). 4800 * 4801 * NOTE: We may be reusing the pte_pv so we do not 4802 * destroy it in pmap_remove_pv_pte(). 4803 */ 4804 KKASSERT(pte_pv && pte_pv->pv_m); 4805 if (prot & VM_PROT_NOSYNC) { 4806 pmap_remove_pv_pte(pte_pv, pt_pv, NULL, 0); 4807 } else { 4808 pmap_inval_bulk_t bulk; 4809 4810 pmap_inval_bulk_init(&bulk, pmap); 4811 pmap_remove_pv_pte(pte_pv, pt_pv, &bulk, 0); 4812 pmap_inval_bulk_flush(&bulk); 4813 } 4814 pmap_remove_pv_page(pte_pv); 4815 /* will either set pte_pv->pv_m or pv_free() later */ 4816 } else { 4817 /* 4818 * Old page was not managed. If we have a pte_pv 4819 * it better not have a pv_m assigned to it. If the 4820 * new page is managed the pte_pv will be destroyed 4821 * near the end (we need its interlock). 4822 * 4823 * NOTE: We leave the wire count on the PT page 4824 * intact for the followup enter, but adjust 4825 * the wired-pages count on the pmap. 4826 */ 4827 KKASSERT(pte_pv == NULL); 4828 if (prot & VM_PROT_NOSYNC) { 4829 /* 4830 * NOSYNC (no mmu sync) requested. 
4831 */ 4832 (void)pte_load_clear(ptep); 4833 cpu_invlpg((void *)va); 4834 } else { 4835 /* 4836 * Nominal SYNC 4837 */ 4838 pmap_inval_smp(pmap, va, 1, ptep, 0); 4839 } 4840 4841 /* 4842 * We must adjust pm_stats manually for unmanaged 4843 * pages. 4844 */ 4845 if (pt_pv) { 4846 atomic_add_long(&pmap->pm_stats. 4847 resident_count, -1); 4848 } 4849 if (origpte & pmap->pmap_bits[PG_W_IDX]) { 4850 atomic_add_long(&pmap->pm_stats. 4851 wired_count, -1); 4852 } 4853 } 4854 KKASSERT(*ptep == 0); 4855 } 4856 4857 #ifdef PMAP_DEBUG2 4858 if (pmap_enter_debug > 0) { 4859 --pmap_enter_debug; 4860 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 4861 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 4862 va, m, 4863 origpte, newpte, ptep, 4864 pte_pv, pt_pv, opa, prot); 4865 } 4866 #endif 4867 4868 if ((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 4869 /* 4870 * Entering an unmanaged page. We must wire the pt_pv unless 4871 * we retained the wiring from an unmanaged page we had 4872 * removed (if we retained it via pte_pv that will go away 4873 * soon). 4874 */ 4875 if (pt_pv && (opa == 0 || 4876 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]))) { 4877 vm_page_wire_quick(pt_pv->pv_m); 4878 } 4879 if (wired) 4880 atomic_add_long(&pmap->pm_stats.wired_count, 1); 4881 4882 /* 4883 * Unmanaged pages need manual resident_count tracking. 4884 */ 4885 if (pt_pv) { 4886 atomic_add_long(&pt_pv->pv_pmap->pm_stats. 4887 resident_count, 1); 4888 } 4889 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4890 vm_page_flag_set(m, PG_WRITEABLE); 4891 } else { 4892 /* 4893 * Entering a managed page. Our pte_pv takes care of the 4894 * PT wiring, so if we had removed an unmanaged page before 4895 * we must adjust. 4896 * 4897 * We have to take care of the pmap wired count ourselves. 4898 * 4899 * Enter on the PV list if part of our managed memory. 4900 */ 4901 KKASSERT(pte_pv && (pte_pv->pv_m == NULL || pte_pv->pv_m == m)); 4902 vm_page_spin_lock(m); 4903 pte_pv->pv_m = m; 4904 pmap_page_stats_adding(m); 4905 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); 4906 vm_page_flag_set(m, PG_MAPPED); 4907 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4908 vm_page_flag_set(m, PG_WRITEABLE); 4909 vm_page_spin_unlock(m); 4910 4911 if (pt_pv && opa && 4912 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 4913 vm_page_unwire_quick(pt_pv->pv_m); 4914 } 4915 4916 /* 4917 * Adjust pmap wired pages count for new entry. 4918 */ 4919 if (wired) { 4920 atomic_add_long(&pte_pv->pv_pmap->pm_stats. 4921 wired_count, 1); 4922 } 4923 } 4924 4925 /* 4926 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks. 4927 * 4928 * User VMAs do not because those will be zero->non-zero, so no 4929 * stale entries to worry about at this point. 4930 * 4931 * For KVM there appear to still be issues. Theoretically we 4932 * should be able to scrap the interlocks entirely but we 4933 * get crashes. 4934 */ 4935 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) { 4936 pmap_inval_smp(pmap, va, 1, ptep, newpte); 4937 } else { 4938 origpte = atomic_swap_long(ptep, newpte); 4939 if (origpte & pmap->pmap_bits[PG_M_IDX]) { 4940 kprintf("pmap [M] race @ %016jx\n", va); 4941 atomic_set_long(ptep, pmap->pmap_bits[PG_M_IDX]); 4942 } 4943 if (pt_pv == NULL) 4944 cpu_invlpg((void *)va); 4945 } 4946 4947 /* 4948 * Cleanup 4949 */ 4950 done: 4951 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 4952 (m->flags & PG_MAPPED)); 4953 4954 /* 4955 * Cleanup the pv entry, allowing other accessors. 
If the new page 4956 * is not managed but we have a pte_pv (which was locking our 4957 * operation), we can free it now. pte_pv->pv_m should be NULL. 4958 */ 4959 if (pte_pv && (newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) { 4960 pv_free(pte_pv, pt_pv); 4961 } else if (pte_pv) { 4962 pv_put(pte_pv); 4963 } else if (pte_placemark) { 4964 pv_placemarker_wakeup(pmap, pte_placemark); 4965 } 4966 if (pt_pv) 4967 pv_put(pt_pv); 4968 } 4969 4970 /* 4971 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 4972 * This code also assumes that the pmap has no pre-existing entry for this 4973 * VA. 4974 * 4975 * This code currently may only be used on user pmaps, not kernel_pmap. 4976 */ 4977 void 4978 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 4979 { 4980 pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL); 4981 } 4982 4983 /* 4984 * Make a temporary mapping for a physical address. This is only intended 4985 * to be used for panic dumps. 4986 * 4987 * The caller is responsible for calling smp_invltlb(). 4988 */ 4989 void * 4990 pmap_kenter_temporary(vm_paddr_t pa, long i) 4991 { 4992 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 4993 return ((void *)crashdumpmap); 4994 } 4995 4996 #define MAX_INIT_PT (96) 4997 4998 /* 4999 * This routine preloads the ptes for a given object into the specified pmap. 5000 * This eliminates the blast of soft faults on process startup and 5001 * immediately after an mmap. 5002 */ 5003 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 5004 5005 void 5006 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 5007 vm_object_t object, vm_pindex_t pindex, 5008 vm_size_t size, int limit) 5009 { 5010 struct rb_vm_page_scan_info info; 5011 struct lwp *lp; 5012 vm_size_t psize; 5013 5014 /* 5015 * We can't preinit if read access isn't set or there is no pmap 5016 * or object. 5017 */ 5018 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 5019 return; 5020 5021 /* 5022 * We can't preinit if the pmap is not the current pmap 5023 */ 5024 lp = curthread->td_lwp; 5025 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 5026 return; 5027 5028 /* 5029 * Misc additional checks 5030 */ 5031 psize = x86_64_btop(size); 5032 5033 if ((object->type != OBJT_VNODE) || 5034 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 5035 (object->resident_page_count > MAX_INIT_PT))) { 5036 return; 5037 } 5038 5039 if (pindex + psize > object->size) { 5040 if (object->size < pindex) 5041 return; 5042 psize = object->size - pindex; 5043 } 5044 5045 if (psize == 0) 5046 return; 5047 5048 /* 5049 * If everything is segment-aligned do not pre-init here. Instead 5050 * allow the normal vm_fault path to pass a segment hint to 5051 * pmap_enter() which will then use an object-referenced shared 5052 * page table page. 5053 */ 5054 if ((addr & SEG_MASK) == 0 && 5055 (ctob(psize) & SEG_MASK) == 0 && 5056 (ctob(pindex) & SEG_MASK) == 0) { 5057 return; 5058 } 5059 5060 /* 5061 * Use a red-black scan to traverse the requested range and load 5062 * any valid pages found into the pmap. 5063 * 5064 * We cannot safely scan the object's memq without holding the 5065 * object token. 
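	 *
	 * The scan is therefore bracketed by an object hold, as sketched
	 * here (this is the same sequence the code below performs; the
	 * callback returns 0 to continue and a negative value to abort):
	 */
#if 0
	vm_object_hold_shared(object);		/* object token */
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				pmap_object_init_pt_callback, &info);
	vm_object_drop(object);			/* release the token */
#endif
	/*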
5066 */
5067 	info.start_pindex = pindex;
5068 	info.end_pindex = pindex + psize - 1;
5069 	info.limit = limit;
5070 	info.mpte = NULL;
5071 	info.addr = addr;
5072 	info.pmap = pmap;
5073 
5074 	vm_object_hold_shared(object);
5075 	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
5076 				pmap_object_init_pt_callback, &info);
5077 	vm_object_drop(object);
5078 }
5079 
5080 static
5081 int
5082 pmap_object_init_pt_callback(vm_page_t p, void *data)
5083 {
5084 	struct rb_vm_page_scan_info *info = data;
5085 	vm_pindex_t rel_index;
5086 
5087 	/*
5088 	 * Don't allow a madvise to blow away our really
5089 	 * free pages by allocating pv entries.
5090 	 */
5091 	if ((info->limit & MAP_PREFAULT_MADVISE) &&
5092 	    vmstats.v_free_count < vmstats.v_free_reserved) {
5093 		return(-1);
5094 	}
5095 
5096 	/*
5097 	 * Ignore list markers and ignore pages we cannot instantly
5098 	 * busy (while holding the object token).
5099 	 */
5100 	if (p->flags & PG_MARKER)
5101 		return 0;
5102 	if (vm_page_busy_try(p, TRUE))
5103 		return 0;
5104 	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
5105 	    (p->flags & PG_FICTITIOUS) == 0) {
5106 		if ((p->queue - p->pc) == PQ_CACHE)
5107 			vm_page_deactivate(p);
5108 		rel_index = p->pindex - info->start_pindex;
5109 		pmap_enter_quick(info->pmap,
5110 				 info->addr + x86_64_ptob(rel_index), p);
5111 	}
5112 	vm_page_wakeup(p);
5113 	lwkt_yield();
5114 	return(0);
5115 }
5116 
5117 /*
5118  * Return TRUE if the pmap is in shape to trivially pre-fault the specified
5119  * address.
5120  *
5121  * Returns FALSE if it would be non-trivial or if a pte is already loaded
5122  * into the slot.
5123  *
5124  * XXX This is safe only because page table pages are not freed.
5125  */
5126 int
5127 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
5128 {
5129 	pt_entry_t *pte;
5130 
5131 	/*spin_lock(&pmap->pm_spin);*/
5132 	if ((pte = pmap_pte(pmap, addr)) != NULL) {
5133 		if (*pte & pmap->pmap_bits[PG_V_IDX]) {
5134 			/*spin_unlock(&pmap->pm_spin);*/
5135 			return FALSE;
5136 		}
5137 	}
5138 	/*spin_unlock(&pmap->pm_spin);*/
5139 	return TRUE;
5140 }
5141 
5142 /*
5143  * Change the wiring attribute for a pmap/va pair. The mapping must already
5144  * exist in the pmap. The mapping may or may not be managed. The wiring in
5145  * the page is not changed; the page is returned so the caller can adjust
5146  * its wiring (the page is not locked in any way).
5147  *
5148  * Wiring is not a hardware characteristic so there is no need to invalidate
5149  * the TLB. However, in an SMP environment we must use a locked bus cycle to
5150  * update the pte (if we are not using the pmap_inval_*() API that is)...
5151  * it's ok to do this for simple wiring changes.
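 *
 * Expected caller pattern (sketch; the assumption here is that the
 * caller adjusts the page's own wire count, e.g. via vm_page_unwire(),
 * which is not this function's job):
 */
#if 0
	m = pmap_unwire(pmap, va);	/* clears PG_W, adjusts pm_stats */
	if (m)
		vm_page_unwire(m, 0);	/* drop the page's wire count */
#endif
/*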
5152 */
5153 vm_page_t
5154 pmap_unwire(pmap_t pmap, vm_offset_t va)
5155 {
5156 	pt_entry_t *ptep;
5157 	pv_entry_t pt_pv;
5158 	vm_paddr_t pa;
5159 	vm_page_t m;
5160 
5161 	if (pmap == NULL)
5162 		return NULL;
5163 
5164 	/*
5165 	 * Assume elements in the kernel pmap are stable
5166 	 */
5167 	if (pmap == &kernel_pmap) {
5168 		if (pmap_pt(pmap, va) == 0)
5169 			return NULL;
5170 		ptep = pmap_pte_quick(pmap, va);
5171 		if (pmap_pte_v(pmap, ptep)) {
5172 			if (pmap_pte_w(pmap, ptep))
5173 				atomic_add_long(&pmap->pm_stats.wired_count,-1);
5174 			atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
5175 			pa = *ptep & PG_FRAME;
5176 			m = PHYS_TO_VM_PAGE(pa);
5177 		} else {
5178 			m = NULL;
5179 		}
5180 	} else {
5181 		/*
5182 		 * We can only [un]wire pmap-local pages (we cannot wire
5183 		 * shared pages)
5184 		 */
5185 		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
5186 		if (pt_pv == NULL)
5187 			return NULL;
5188 
5189 		ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
5190 		if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) {
5191 			pv_put(pt_pv);
5192 			return NULL;
5193 		}
5194 
5195 		if (pmap_pte_w(pmap, ptep)) {
5196 			atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count,
5197 					-1);
5198 		}
5199 		/* XXX else return NULL so caller doesn't unwire m ? */
5200 
5201 		atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
5202 
5203 		pa = *ptep & PG_FRAME;
5204 		m = PHYS_TO_VM_PAGE(pa);	/* held by wired count */
5205 		pv_put(pt_pv);
5206 	}
5207 	return m;
5208 }
5209 
5210 /*
5211  * Copy the range specified by src_addr/len from the source map to
5212  * the range dst_addr/len in the destination map.
5213  *
5214  * This routine is only advisory and need not do anything.
5215  */
5216 void
5217 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
5218 	  vm_size_t len, vm_offset_t src_addr)
5219 {
5220 }
5221 
5222 /*
5223  * pmap_zero_page:
5224  *
5225  *	Zero the specified physical page.
5226  *
5227  *	This function may be called from an interrupt and no locking is
5228  *	required.
5229  */
5230 void
5231 pmap_zero_page(vm_paddr_t phys)
5232 {
5233 	vm_offset_t va = PHYS_TO_DMAP(phys);
5234 
5235 	pagezero((void *)va);
5236 }
5237 
5238 /*
5239  * pmap_zero_page_area:
5240  *
5241  *	Zero part of a physical page by mapping it into memory and clearing
5242  *	its contents with bzero.
5243  *
5244  *	off and size may not cover an area beyond a single hardware page.
5245  */
5246 void
5247 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
5248 {
5249 	vm_offset_t virt = PHYS_TO_DMAP(phys);
5250 
5251 	bzero((char *)virt + off, size);
5252 }
5253 
5254 /*
5255  * pmap_copy_page:
5256  *
5257  *	Copy the physical page from the source PA to the target PA.
5258  *	This function may be called from an interrupt. No locking
5259  *	is required.
5260  */
5261 void
5262 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
5263 {
5264 	vm_offset_t src_virt, dst_virt;
5265 
5266 	src_virt = PHYS_TO_DMAP(src);
5267 	dst_virt = PHYS_TO_DMAP(dst);
5268 	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
5269 }
5270 
5271 /*
5272  * pmap_copy_page_frag:
5273  *
5274  *	Copy a fragment of a physical page from the source PA to the
5275  *	target PA, honoring the intra-page offsets of src and dst. This
5276  *	function may be called from an interrupt. No locking is required.
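 *
 *	Example (sketch; src_pa, dst_pa, and boff are illustrative):
 *	copying one 512-byte sector between two physical pages at
 *	matching intra-page offsets. The offsets ride in the low bits
 *	of the physical addresses:
 */
#if 0
	pmap_copy_page_frag(src_pa + boff, dst_pa + boff, 512);
#endif
/*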
5277 */
5278 void
5279 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
5280 {
5281 	vm_offset_t src_virt, dst_virt;
5282 
5283 	src_virt = PHYS_TO_DMAP(src);
5284 	dst_virt = PHYS_TO_DMAP(dst);
5285 
5286 	bcopy((char *)src_virt + (src & PAGE_MASK),
5287 	      (char *)dst_virt + (dst & PAGE_MASK),
5288 	      bytes);
5289 }
5290 
5291 /*
5292  * Returns true if the pmap's pv is one of the first 16 pvs linked to from
5293  * this page. This count may be changed upwards or downwards in the future;
5294  * it is only necessary that true be returned for a small subset of pmaps
5295  * for proper page aging.
5296  */
5297 boolean_t
5298 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5299 {
5300 	pv_entry_t pv;
5301 	int loops = 0;
5302 
5303 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
5304 		return FALSE;
5305 
5306 	vm_page_spin_lock(m);
5307 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5308 		if (pv->pv_pmap == pmap) {
5309 			vm_page_spin_unlock(m);
5310 			return TRUE;
5311 		}
5312 		loops++;
5313 		if (loops >= 16)
5314 			break;
5315 	}
5316 	vm_page_spin_unlock(m);
5317 	return (FALSE);
5318 }
5319 
5320 /*
5321  * Remove all pages from the specified address space; this aids process
5322  * exit speeds. Also, this code may be special cased for the current
5323  * process only.
5324  */
5325 void
5326 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5327 {
5328 	pmap_remove_noinval(pmap, sva, eva);
5329 	cpu_invltlb();
5330 }
5331 
5332 /*
5333  * pmap_testbit tests bits in ptes. Note that the testbit/clearbit
5334  * routines are inline, so a lot of things compile-time evaluate.
5335  */
5336 static
5337 boolean_t
5338 pmap_testbit(vm_page_t m, int bit)
5339 {
5340 	pv_entry_t pv;
5341 	pt_entry_t *pte;
5342 	pmap_t pmap;
5343 
5344 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
5345 		return FALSE;
5346 
5347 	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
5348 		return FALSE;
5349 	vm_page_spin_lock(m);
5350 	if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
5351 		vm_page_spin_unlock(m);
5352 		return FALSE;
5353 	}
5354 
5355 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5356 
5357 #if defined(PMAP_DIAGNOSTIC)
5358 		if (pv->pv_pmap == NULL) {
5359 			kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
5360 			    pv->pv_pindex);
5361 			continue;
5362 		}
5363 #endif
5364 		pmap = pv->pv_pmap;
5365 
5366 		/*
5367 		 * If the bit being tested is the modified bit, then
5368 		 * mark clean_map and ptes as never
5369 		 * modified.
5370 		 *
5371 		 * WARNING! Because we do not lock the pv, *pte can be in a
5372 		 *	    state of flux. Despite this the value of *pte
5373 		 *	    will still be related to the vm_page in some way
5374 		 *	    because the pv cannot be destroyed as long as we
5375 		 *	    hold the vm_page spin lock.
5376 		 */
5377 		if (bit == PG_A_IDX || bit == PG_M_IDX) {
5378 			//& (pmap->pmap_bits[PG_A_IDX] | pmap->pmap_bits[PG_M_IDX])) {
5379 			if (!pmap_track_modified(pv->pv_pindex))
5380 				continue;
5381 		}
5382 
5383 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
5384 		if (*pte & pmap->pmap_bits[bit]) {
5385 			vm_page_spin_unlock(m);
5386 			return TRUE;
5387 		}
5388 	}
5389 	vm_page_spin_unlock(m);
5390 	return (FALSE);
5391 }
5392 
5393 /*
5394  * This routine is used to modify bits in ptes. Only one bit should be
5395  * specified. PG_RW requires special handling.
5396  *
5397  * Caller must NOT hold any spin locks
5398  */
5399 static __inline
5400 void
5401 pmap_clearbit(vm_page_t m, int bit_index)
5402 {
5403 	pv_entry_t pv;
5404 	pt_entry_t *pte;
5405 	pt_entry_t pbits;
5406 	pmap_t pmap;
5407 
5408 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
5409 		if (bit_index == PG_RW_IDX)
5410 			vm_page_flag_clear(m, PG_WRITEABLE);
5411 		return;
5412 	}
5413 
5414 	/*
5415 	 * PG_M or PG_A case
5416 	 *
5417 	 * Loop over all current mappings, setting/clearing as appropriate.
5418 	 * If setting RO, do we need to clear the VAC?
5419 	 *
5420 	 * NOTE: When clearing PG_M we could also (not implemented) drop
5421 	 *	 through to the PG_RW code and clear PG_RW too, forcing
5422 	 *	 a fault on write to redetect PG_M for virtual kernels, but
5423 	 *	 it isn't necessary since virtual kernels invalidate the
5424 	 *	 pte when they clear the VPTE_M bit in their virtual page
5425 	 *	 tables.
5426 	 *
5427 	 * NOTE: Does not re-dirty the page when clearing only PG_M.
5428 	 *
5429 	 * NOTE: Because we do not lock the pv, *pte can be in a state of
5430 	 *	 flux. Despite this the value of *pte is still somewhat
5431 	 *	 related while we hold the vm_page spin lock.
5432 	 *
5433 	 *	 *pte can be zero due to this race. Since we are clearing
5434 	 *	 bits we basically do no harm when this race occurs.
5435 	 */
5436 	if (bit_index != PG_RW_IDX) {
5437 		vm_page_spin_lock(m);
5438 		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5439 #if defined(PMAP_DIAGNOSTIC)
5440 			if (pv->pv_pmap == NULL) {
5441 				kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
5442 				    pv->pv_pindex);
5443 				continue;
5444 			}
5445 #endif
5446 			pmap = pv->pv_pmap;
5447 			pte = pmap_pte_quick(pv->pv_pmap,
5448 					     pv->pv_pindex << PAGE_SHIFT);
5449 			pbits = *pte;
5450 			if (pbits & pmap->pmap_bits[bit_index])
5451 				atomic_clear_long(pte, pmap->pmap_bits[bit_index]);
5452 		}
5453 		vm_page_spin_unlock(m);
5454 		return;
5455 	}
5456 
5457 	/*
5458 	 * Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M
5459 	 * was set.
5460 	 */
5461 restart:
5462 	vm_page_spin_lock(m);
5463 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5464 		/*
5465 		 * don't write protect pager mappings
5466 		 */
5467 		if (!pmap_track_modified(pv->pv_pindex))
5468 			continue;
5469 
5470 #if defined(PMAP_DIAGNOSTIC)
5471 		if (pv->pv_pmap == NULL) {
5472 			kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
5473 			    pv->pv_pindex);
5474 			continue;
5475 		}
5476 #endif
5477 		pmap = pv->pv_pmap;
5478 
5479 		/*
5480 		 * Skip pages which do not have PG_RW set.
5481 		 */
5482 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
5483 		if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0)
5484 			continue;
5485 
5486 		/*
5487 		 * We must lock the PV to be able to safely test the pte.
5488 		 */
5489 		if (pv_hold_try(pv)) {
5490 			vm_page_spin_unlock(m);
5491 		} else {
5492 			vm_page_spin_unlock(m);
5493 			pv_lock(pv);	/* held, now do a blocking lock */
5494 			pv_put(pv);
5495 			goto restart;
5496 		}
5497 
5498 		/*
5499 		 * Reload pte after acquiring pv.
5500 */ 5501 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5502 #if 0 5503 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0) { 5504 pv_put(pv); 5505 goto restart; 5506 } 5507 #endif 5508 5509 KKASSERT(pv->pv_pmap == pmap && pv->pv_m == m); 5510 for (;;) { 5511 pt_entry_t nbits; 5512 5513 pbits = *pte; 5514 cpu_ccfence(); 5515 nbits = pbits & ~(pmap->pmap_bits[PG_RW_IDX] | 5516 pmap->pmap_bits[PG_M_IDX]); 5517 if (pmap_inval_smp_cmpset(pmap, 5518 ((vm_offset_t)pv->pv_pindex << PAGE_SHIFT), 5519 pte, pbits, nbits)) { 5520 break; 5521 } 5522 cpu_pause(); 5523 } 5524 5525 /* 5526 * If PG_M was found to be set while we were clearing PG_RW 5527 * we also clear PG_M (done above) and mark the page dirty. 5528 * Callers expect this behavior. 5529 * 5530 * we lost pv so it cannot be used as an iterator. In fact, 5531 * because we couldn't necessarily lock it atomically it may 5532 * have moved within the list and ALSO cannot be used as an 5533 * iterator. 5534 */ 5535 vm_page_spin_lock(m); 5536 if (pbits & pmap->pmap_bits[PG_M_IDX]) 5537 vm_page_dirty(m); 5538 vm_page_spin_unlock(m); 5539 pv_put(pv); 5540 goto restart; 5541 } 5542 if (bit_index == PG_RW_IDX) 5543 vm_page_flag_clear(m, PG_WRITEABLE); 5544 vm_page_spin_unlock(m); 5545 } 5546 5547 /* 5548 * Lower the permission for all mappings to a given page. 5549 * 5550 * Page must be busied by caller. Because page is busied by caller this 5551 * should not be able to race a pmap_enter(). 5552 */ 5553 void 5554 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5555 { 5556 /* JG NX support? */ 5557 if ((prot & VM_PROT_WRITE) == 0) { 5558 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5559 /* 5560 * NOTE: pmap_clearbit(.. PG_RW) also clears 5561 * the PG_WRITEABLE flag in (m). 5562 */ 5563 pmap_clearbit(m, PG_RW_IDX); 5564 } else { 5565 pmap_remove_all(m); 5566 } 5567 } 5568 } 5569 5570 vm_paddr_t 5571 pmap_phys_address(vm_pindex_t ppn) 5572 { 5573 return (x86_64_ptob(ppn)); 5574 } 5575 5576 /* 5577 * Return a count of reference bits for a page, clearing those bits. 5578 * It is not necessary for every reference bit to be cleared, but it 5579 * is necessary that 0 only be returned when there are truly no 5580 * reference bits set. 5581 * 5582 * XXX: The exact number of bits to check and clear is a matter that 5583 * should be tested and standardized at some point in the future for 5584 * optimal aging of shared pages. 5585 * 5586 * This routine may not block. 5587 */ 5588 int 5589 pmap_ts_referenced(vm_page_t m) 5590 { 5591 pv_entry_t pv; 5592 pt_entry_t *pte; 5593 pmap_t pmap; 5594 int rtval = 0; 5595 5596 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5597 return (rtval); 5598 5599 vm_page_spin_lock(m); 5600 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5601 if (!pmap_track_modified(pv->pv_pindex)) 5602 continue; 5603 pmap = pv->pv_pmap; 5604 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5605 if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) { 5606 atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]); 5607 rtval++; 5608 if (rtval > 4) 5609 break; 5610 } 5611 } 5612 vm_page_spin_unlock(m); 5613 return (rtval); 5614 } 5615 5616 /* 5617 * pmap_is_modified: 5618 * 5619 * Return whether or not the specified physical page was modified 5620 * in any physical maps. 5621 */ 5622 boolean_t 5623 pmap_is_modified(vm_page_t m) 5624 { 5625 boolean_t res; 5626 5627 res = pmap_testbit(m, PG_M_IDX); 5628 return (res); 5629 } 5630 5631 /* 5632 * Clear the modify bits on the specified physical page. 
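 *
 * Typical use (sketch): harvest the dirty state into the vm_page and
 * then rearm PG_M detection:
 */
#if 0
	if (pmap_is_modified(m)) {
		vm_page_dirty(m);	/* record dirtiness in the page */
		pmap_clear_modify(m);	/* clear PG_M in all mappings */
	}
#endif
/*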
5633 */ 5634 void 5635 pmap_clear_modify(vm_page_t m) 5636 { 5637 pmap_clearbit(m, PG_M_IDX); 5638 } 5639 5640 /* 5641 * pmap_clear_reference: 5642 * 5643 * Clear the reference bit on the specified physical page. 5644 */ 5645 void 5646 pmap_clear_reference(vm_page_t m) 5647 { 5648 pmap_clearbit(m, PG_A_IDX); 5649 } 5650 5651 /* 5652 * Miscellaneous support routines follow 5653 */ 5654 5655 static 5656 void 5657 i386_protection_init(void) 5658 { 5659 int *kp, prot; 5660 5661 /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */ 5662 kp = protection_codes; 5663 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5664 switch (prot) { 5665 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5666 /* 5667 * Read access is also 0. There isn't any execute bit, 5668 * so just make it readable. 5669 */ 5670 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5671 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5672 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5673 *kp++ = 0; 5674 break; 5675 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5676 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5677 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5678 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5679 *kp++ = pmap_bits_default[PG_RW_IDX]; 5680 break; 5681 } 5682 } 5683 } 5684 5685 /* 5686 * Map a set of physical memory pages into the kernel virtual 5687 * address space. Return a pointer to where it is mapped. This 5688 * routine is intended to be used for mapping device memory, 5689 * NOT real memory. 5690 * 5691 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5692 * a time. 5693 * 5694 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5695 * work whether the cpu supports PAT or not. The remaining PAT 5696 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5697 * supports PAT. 5698 */ 5699 void * 5700 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5701 { 5702 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5703 } 5704 5705 void * 5706 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5707 { 5708 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5709 } 5710 5711 void * 5712 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5713 { 5714 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5715 } 5716 5717 /* 5718 * Map a set of physical memory pages into the kernel virtual 5719 * address space. Return a pointer to where it is mapped. This 5720 * routine is intended to be used for mapping device memory, 5721 * NOT real memory. 
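 *
 * Example (sketch; bar_pa and bar_size stand in for a device's
 * register window):
 */
#if 0
	void *regs;

	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_UNCACHEABLE);
	/* ... program the device ... */
	pmap_unmapdev((vm_offset_t)regs, bar_size);
#endif
/*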
5722 */ 5723 void * 5724 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5725 { 5726 vm_offset_t va, tmpva, offset; 5727 pt_entry_t *pte; 5728 vm_size_t tmpsize; 5729 5730 offset = pa & PAGE_MASK; 5731 size = roundup(offset + size, PAGE_SIZE); 5732 5733 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 5734 if (va == 0) 5735 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5736 5737 pa = pa & ~PAGE_MASK; 5738 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 5739 pte = vtopte(tmpva); 5740 *pte = pa | 5741 kernel_pmap.pmap_bits[PG_RW_IDX] | 5742 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 5743 kernel_pmap.pmap_cache_bits[mode]; 5744 tmpsize -= PAGE_SIZE; 5745 tmpva += PAGE_SIZE; 5746 pa += PAGE_SIZE; 5747 } 5748 pmap_invalidate_range(&kernel_pmap, va, va + size); 5749 pmap_invalidate_cache_range(va, va + size); 5750 5751 return ((void *)(va + offset)); 5752 } 5753 5754 void 5755 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5756 { 5757 vm_offset_t base, offset; 5758 5759 base = va & ~PAGE_MASK; 5760 offset = va & PAGE_MASK; 5761 size = roundup(offset + size, PAGE_SIZE); 5762 pmap_qremove(va, size >> PAGE_SHIFT); 5763 kmem_free(&kernel_map, base, size); 5764 } 5765 5766 /* 5767 * Sets the memory attribute for the specified page. 5768 */ 5769 void 5770 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5771 { 5772 5773 m->pat_mode = ma; 5774 5775 /* 5776 * If "m" is a normal page, update its direct mapping. This update 5777 * can be relied upon to perform any cache operations that are 5778 * required for data coherence. 5779 */ 5780 if ((m->flags & PG_FICTITIOUS) == 0) 5781 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 5782 } 5783 5784 /* 5785 * Change the PAT attribute on an existing kernel memory map. Caller 5786 * must ensure that the virtual memory in question is not accessed 5787 * during the adjustment. 5788 */ 5789 void 5790 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 5791 { 5792 pt_entry_t *pte; 5793 vm_offset_t base; 5794 int changed = 0; 5795 5796 if (va == 0) 5797 panic("pmap_change_attr: va is NULL"); 5798 base = trunc_page(va); 5799 5800 while (count) { 5801 pte = vtopte(va); 5802 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) | 5803 kernel_pmap.pmap_cache_bits[mode]; 5804 --count; 5805 va += PAGE_SIZE; 5806 } 5807 5808 changed = 1; /* XXX: not optimal */ 5809 5810 /* 5811 * Flush CPU caches if required to make sure any data isn't cached that 5812 * shouldn't be, etc. 
5813 */ 5814 if (changed) { 5815 pmap_invalidate_range(&kernel_pmap, base, va); 5816 pmap_invalidate_cache_range(base, va); 5817 } 5818 } 5819 5820 /* 5821 * perform the pmap work for mincore 5822 */ 5823 int 5824 pmap_mincore(pmap_t pmap, vm_offset_t addr) 5825 { 5826 pt_entry_t *ptep, pte; 5827 vm_page_t m; 5828 int val = 0; 5829 5830 ptep = pmap_pte(pmap, addr); 5831 5832 if (ptep && (pte = *ptep) != 0) { 5833 vm_offset_t pa; 5834 5835 val = MINCORE_INCORE; 5836 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) 5837 goto done; 5838 5839 pa = pte & PG_FRAME; 5840 5841 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) 5842 m = NULL; 5843 else 5844 m = PHYS_TO_VM_PAGE(pa); 5845 5846 /* 5847 * Modified by us 5848 */ 5849 if (pte & pmap->pmap_bits[PG_M_IDX]) 5850 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 5851 /* 5852 * Modified by someone 5853 */ 5854 else if (m && (m->dirty || pmap_is_modified(m))) 5855 val |= MINCORE_MODIFIED_OTHER; 5856 /* 5857 * Referenced by us 5858 */ 5859 if (pte & pmap->pmap_bits[PG_A_IDX]) 5860 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 5861 5862 /* 5863 * Referenced by someone 5864 */ 5865 else if (m && ((m->flags & PG_REFERENCED) || 5866 pmap_ts_referenced(m))) { 5867 val |= MINCORE_REFERENCED_OTHER; 5868 vm_page_flag_set(m, PG_REFERENCED); 5869 } 5870 } 5871 done: 5872 5873 return val; 5874 } 5875 5876 /* 5877 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 5878 * vmspace will be ref'd and the old one will be deref'd. 5879 * 5880 * The vmspace for all lwps associated with the process will be adjusted 5881 * and cr3 will be reloaded if any lwp is the current lwp. 5882 * 5883 * The process must hold the vmspace->vm_map.token for oldvm and newvm 5884 */ 5885 void 5886 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 5887 { 5888 struct vmspace *oldvm; 5889 struct lwp *lp; 5890 5891 oldvm = p->p_vmspace; 5892 if (oldvm != newvm) { 5893 if (adjrefs) 5894 vmspace_ref(newvm); 5895 p->p_vmspace = newvm; 5896 KKASSERT(p->p_nthreads == 1); 5897 lp = RB_ROOT(&p->p_lwp_tree); 5898 pmap_setlwpvm(lp, newvm); 5899 if (adjrefs) 5900 vmspace_rel(oldvm); 5901 } 5902 } 5903 5904 /* 5905 * Set the vmspace for a LWP. The vmspace is almost universally set the 5906 * same as the process vmspace, but virtual kernels need to swap out contexts 5907 * on a per-lwp basis. 5908 * 5909 * Caller does not necessarily hold any vmspace tokens. Caller must control 5910 * the lwp (typically be in the context of the lwp). We use a critical 5911 * section to protect against statclock and hardclock (statistics collection). 
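 *
 * The ordering below matters and can be summarized as follows (sketch;
 * newpmap/oldpmap stand for vmspace_pmap(newvm)/vmspace_pmap(oldvm)):
 */
#if 0
	lp->lwp_vmspace = newvm;		/* 1. switch the lwp */
	ATOMIC_CPUMASK_ORBIT(newpmap->pm_active, mycpu->gd_cpuid);
	if (newpmap->pm_active_lock & CPULOCK_EXCL)
		pmap_interlock_wait(newvm);	/* 2. wait out modifiers */
	load_cr3(curthread->td_pcb->pcb_cr3);	/* 3. activate the MMU */
	ATOMIC_CPUMASK_NANDBIT(oldpmap->pm_active, mycpu->gd_cpuid);
#endif
/*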
5912 */ 5913 void 5914 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 5915 { 5916 struct vmspace *oldvm; 5917 struct pmap *pmap; 5918 5919 oldvm = lp->lwp_vmspace; 5920 5921 if (oldvm != newvm) { 5922 crit_enter(); 5923 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 5924 lp->lwp_vmspace = newvm; 5925 if (curthread->td_lwp == lp) { 5926 pmap = vmspace_pmap(newvm); 5927 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 5928 if (pmap->pm_active_lock & CPULOCK_EXCL) 5929 pmap_interlock_wait(newvm); 5930 #if defined(SWTCH_OPTIM_STATS) 5931 tlb_flush_count++; 5932 #endif 5933 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 5934 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 5935 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 5936 curthread->td_pcb->pcb_cr3 = KPML4phys; 5937 } else { 5938 panic("pmap_setlwpvm: unknown pmap type\n"); 5939 } 5940 load_cr3(curthread->td_pcb->pcb_cr3); 5941 pmap = vmspace_pmap(oldvm); 5942 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 5943 mycpu->gd_cpuid); 5944 } 5945 crit_exit(); 5946 } 5947 } 5948 5949 /* 5950 * Called when switching to a locked pmap, used to interlock against pmaps 5951 * undergoing modifications to prevent us from activating the MMU for the 5952 * target pmap until all such modifications have completed. We have to do 5953 * this because the thread making the modifications has already set up its 5954 * SMP synchronization mask. 5955 * 5956 * This function cannot sleep! 5957 * 5958 * No requirements. 5959 */ 5960 void 5961 pmap_interlock_wait(struct vmspace *vm) 5962 { 5963 struct pmap *pmap = &vm->vm_pmap; 5964 5965 if (pmap->pm_active_lock & CPULOCK_EXCL) { 5966 crit_enter(); 5967 KKASSERT(curthread->td_critcount >= 2); 5968 DEBUG_PUSH_INFO("pmap_interlock_wait"); 5969 while (pmap->pm_active_lock & CPULOCK_EXCL) { 5970 cpu_ccfence(); 5971 lwkt_process_ipiq(); 5972 } 5973 DEBUG_POP_INFO(); 5974 crit_exit(); 5975 } 5976 } 5977 5978 vm_offset_t 5979 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 5980 { 5981 5982 if ((obj == NULL) || (size < NBPDR) || 5983 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { 5984 return addr; 5985 } 5986 5987 addr = roundup2(addr, NBPDR); 5988 return addr; 5989 } 5990 5991 /* 5992 * Used by kmalloc/kfree, page already exists at va 5993 */ 5994 vm_page_t 5995 pmap_kvtom(vm_offset_t va) 5996 { 5997 pt_entry_t *ptep = vtopte(va); 5998 5999 KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0); 6000 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 6001 } 6002 6003 /* 6004 * Initialize machine-specific shared page directory support. This 6005 * is executed when a VM object is created. 6006 */ 6007 void 6008 pmap_object_init(vm_object_t object) 6009 { 6010 object->md.pmap_rw = NULL; 6011 object->md.pmap_ro = NULL; 6012 } 6013 6014 /* 6015 * Clean up machine-specific shared page directory support. This 6016 * is executed when a VM object is destroyed. 
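 *
 * Both the read-write and read-only object pmaps are torn down with the
 * same sequence, sketched here (the code below performs it twice):
 */
#if 0
	pmap_remove_noinval(pmap, VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	CPUMASK_ASSZERO(pmap->pm_active);	/* no cpu has it loaded */
	pmap_release(pmap);
	pmap_puninit(pmap);
	kfree(pmap, M_OBJPMAP);
#endif
/*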
6017 */
6018 void
6019 pmap_object_free(vm_object_t object)
6020 {
6021 	pmap_t pmap;
6022 
6023 	if ((pmap = object->md.pmap_rw) != NULL) {
6024 		object->md.pmap_rw = NULL;
6025 		pmap_remove_noinval(pmap,
6026 				    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
6027 		CPUMASK_ASSZERO(pmap->pm_active);
6028 		pmap_release(pmap);
6029 		pmap_puninit(pmap);
6030 		kfree(pmap, M_OBJPMAP);
6031 	}
6032 	if ((pmap = object->md.pmap_ro) != NULL) {
6033 		object->md.pmap_ro = NULL;
6034 		pmap_remove_noinval(pmap,
6035 				    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
6036 		CPUMASK_ASSZERO(pmap->pm_active);
6037 		pmap_release(pmap);
6038 		pmap_puninit(pmap);
6039 		kfree(pmap, M_OBJPMAP);
6040 	}
6041 }
6042 
6043 /*
6044  * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related
6045  * VM page and issue a pginfo->callback.
6046  *
6047  * We are expected to dispose of any non-NULL pte_pv.
6048  */
6049 static
6050 void
6051 pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info,
6052 		     pv_entry_t pte_pv, vm_pindex_t *pte_placemark,
6053 		     pv_entry_t pt_pv, int sharept,
6054 		     vm_offset_t va, pt_entry_t *ptep, void *arg)
6055 {
6056 	struct pmap_pgscan_info *pginfo = arg;
6057 	vm_page_t m;
6058 
6059 	if (pte_pv) {
6060 		/*
6061 		 * Try to busy the page while we hold the pte_pv locked.
6062 		 */
6063 		KKASSERT(pte_pv->pv_m);
6064 		m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME);
6065 		if (vm_page_busy_try(m, TRUE) == 0) {
6066 			if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) {
6067 				/*
6068 				 * The callback is issued with the pte_pv
6069 				 * unlocked and put away, and the pt_pv
6070 				 * unlocked.
6071 				 */
6072 				pv_put(pte_pv);
6073 				if (pt_pv) {
6074 					vm_page_wire_quick(pt_pv->pv_m);
6075 					pv_unlock(pt_pv);
6076 				}
6077 				if (pginfo->callback(pginfo, va, m) < 0)
6078 					info->stop = 1;
6079 				if (pt_pv) {
6080 					pv_lock(pt_pv);
6081 					vm_page_unwire_quick(pt_pv->pv_m);
6082 				}
6083 			} else {
6084 				vm_page_wakeup(m);
6085 				pv_put(pte_pv);
6086 			}
6087 		} else {
6088 			++pginfo->busycount;
6089 			pv_put(pte_pv);
6090 		}
6091 	} else {
6092 		/*
6093 		 * Shared page table or unmanaged page (sharept or !sharept)
6094 		 */
6095 		pv_placemarker_wakeup(pmap, pte_placemark);
6096 	}
6097 }
6098 
6099 void
6100 pmap_pgscan(struct pmap_pgscan_info *pginfo)
6101 {
6102 	struct pmap_scan_info info;
6103 
6104 	pginfo->offset = pginfo->beg_addr;
6105 	info.pmap = pginfo->pmap;
6106 	info.sva = pginfo->beg_addr;
6107 	info.eva = pginfo->end_addr;
6108 	info.func = pmap_pgscan_callback;
6109 	info.arg = pginfo;
6110 	pmap_scan(&info, 0);
6111 	if (info.stop == 0)
6112 		pginfo->offset = pginfo->end_addr;
6113 }
6114 
6115 /*
6116  * Wait for a placemarker that we do not own to clear. The placemarker
6117  * in question is not necessarily set to the pindex we want; we may have
6118  * to wait on the element because we want to reserve it ourselves.
6119  */
6120 static
6121 void
6122 pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark)
6123 {
6124 	spin_lock(&pmap->pm_spin);
6125 	if (*pmark != PM_NOPLACEMARK) {
6126 		atomic_set_long(pmark, PM_PLACEMARK_WAKEUP);
6127 		ssleep(pmark, &pmap->pm_spin, 0, "pvplw", 0);
6128 	}
6129 	spin_unlock(&pmap->pm_spin);
6130 }
6131 
6132 /*
6133  * Wakeup a placemarker that we own. Replace the entry with
6134  * PM_NOPLACEMARK and issue a wakeup() if necessary.
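 *
 * The waiter/owner handshake, in brief (this mirrors the code in
 * pv_placemarker_wait() above and in the function below):
 */
#if 0
	/* waiter side, under pm_spin */
	atomic_set_long(pmark, PM_PLACEMARK_WAKEUP);
	ssleep(pmark, &pmap->pm_spin, 0, "pvplw", 0);

	/* owner side */
	pindex = atomic_swap_long(pmark, PM_NOPLACEMARK);
	if (pindex & PM_PLACEMARK_WAKEUP)
		wakeup(pmark);
#endif
/*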
6135 */ 6136 static 6137 void 6138 pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark) 6139 { 6140 vm_pindex_t pindex; 6141 6142 spin_lock(&pmap->pm_spin); 6143 pindex = atomic_swap_long(pmark, PM_NOPLACEMARK); 6144 spin_unlock(&pmap->pm_spin); 6145 KKASSERT(pindex != PM_NOPLACEMARK); 6146 if (pindex & PM_PLACEMARK_WAKEUP) 6147 wakeup(pmark); 6148 } 6149
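
/*
 * Example (sketch, kept out of compilation): driving pmap_pgscan() over
 * a user address range. The function names are illustrative. Based on
 * pmap_pgscan_callback() above, the callback receives the page busied
 * and is assumed responsible for waking it; pginfo->offset reaches
 * end_addr only if the scan was not stopped by a negative return.
 */
#if 0
static int
example_pgscan_callback(struct pmap_pgscan_info *pginfo,
			vm_offset_t va, vm_page_t m)
{
	kprintf("va %016jx pa %016jx\n",
		(intmax_t)va, (intmax_t)VM_PAGE_TO_PHYS(m));
	vm_page_wakeup(m);	/* assumption: callback consumes the busy */
	return 0;		/* a negative return would stop the scan */
}

static void
example_scan(pmap_t pmap, vm_offset_t beg, vm_offset_t end)
{
	struct pmap_pgscan_info pginfo;

	bzero(&pginfo, sizeof(pginfo));
	pginfo.pmap = pmap;
	pginfo.beg_addr = beg;
	pginfo.end_addr = end;
	pginfo.callback = example_pgscan_callback;
	pmap_pgscan(&pginfo);
	if (pginfo.offset < pginfo.end_addr)
		kprintf("pgscan stopped early at %016jx\n",
			(intmax_t)pginfo.offset);
}
#endif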