/*
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * Copyright (c) 2011-2012 Matthew Dillon
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Manage physical address maps for x86-64 systems.
 */
#if 0 /* JG */
#include "opt_disable_pse.h"
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/inttypes.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 2000
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * pmap debugging will report who owns a pv lock when blocking.
 */
#ifdef PMAP_DEBUG

#define PMAP_DEBUG_DECL		,const char *func, int lineno
#define PMAP_DEBUG_ARGS		, __func__, __LINE__
#define PMAP_DEBUG_COPY		, func, lineno

#define pv_get(pmap, pindex)		_pv_get(pmap, pindex		\
							PMAP_DEBUG_ARGS)
#define pv_lock(pv)			_pv_lock(pv			\
							PMAP_DEBUG_ARGS)
#define pv_hold_try(pv)			_pv_hold_try(pv			\
							PMAP_DEBUG_ARGS)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
							PMAP_DEBUG_ARGS)

#else

#define PMAP_DEBUG_DECL
#define PMAP_DEBUG_ARGS
#define PMAP_DEBUG_COPY

#define pv_get(pmap, pindex)		_pv_get(pmap, pindex)
#define pv_lock(pv)			_pv_lock(pv)
#define pv_hold_try(pv)			_pv_hold_try(pv)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)

#endif

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pmap, pte)	\
	((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
#define pmap_pte_w(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
#define pmap_pte_m(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
#define pmap_pte_u(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
#define pmap_pte_v(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
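/*
 * Illustrative sketch (not part of the build): with PMAP_DEBUG defined,
 * the wrapper macros above splice the call site into the argument list,
 * so a call such as pv_get(pmap, pindex) expands roughly as shown below.
 * The _pv_get() prototype further down carries the matching
 * PMAP_DEBUG_DECL parameters.
 */
#if 0
	pv = _pv_get(pmap, pindex, __func__, __LINE__);	/* PMAP_DEBUG */
	pv = _pv_get(pmap, pindex);			/* !PMAP_DEBUG */
#endif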
/*
 * Given a map and a machine independent protection code,
 * convert to an x86 protection code.
 */
#define pte_prot(m, p)		\
	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static int protection_codes[PROTECTION_CODES_SIZE];

struct pmap kernel_pmap;
static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);

MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects");

vm_paddr_t avail_start;		/* PA of first available physical page */
vm_paddr_t avail_end;		/* PA of last available physical page */
vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
vm_offset_t virtual2_end;
vm_offset_t virtual_start;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t KvaStart;		/* VA start of KVA space */
vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
vm_offset_t KvaSize;		/* max size of kernel virtual address space */
static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
//static int pgeflag;		/* PG_G or-in */
//static int pseflag;		/* PG_PS or-in */
uint64_t PatMsr;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */
/*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/	/* PAT -> PG_ bits */

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_max=0, pv_entry_high_water=0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL, *ptmmap;
caddr_t CADDR1 = NULL, ptvmmap = NULL;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp=NULL;

/*
 * PMAP default PG_* bits.  Needed to be able to add
 * EPT/NPT pagetable pmap_bits for the VMM module
 */
uint64_t pmap_bits_default[] = {
		REGULAR_PMAP,			/* TYPE_IDX		0 */
		X86_PG_V,			/* PG_V_IDX		1 */
		X86_PG_RW,			/* PG_RW_IDX		2 */
		X86_PG_U,			/* PG_U_IDX		3 */
		X86_PG_A,			/* PG_A_IDX		4 */
		X86_PG_M,			/* PG_M_IDX		5 */
		X86_PG_PS,			/* PG_PS_IDX		6 */
		X86_PG_G,			/* PG_G_IDX		7 */
		X86_PG_AVAIL1,			/* PG_AVAIL1_IDX	8 */
		X86_PG_AVAIL2,			/* PG_AVAIL2_IDX	9 */
		X86_PG_AVAIL3,			/* PG_AVAIL3_IDX	10 */
		X86_PG_NC_PWT | X86_PG_NC_PCD,	/* PG_N_IDX		11 */
};
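/*
 * Illustrative sketch (not part of the build): because PG_* bits are
 * looked up through pmap->pmap_bits[] instead of being hardcoded, the
 * same test works whether the pmap uses the native x86 bit layout or
 * an EPT/NPT layout installed by the VMM module:
 */
#if 0
	if (*ptep & pmap->pmap_bits[PG_V_IDX])	/* layout-independent */
		...;
	if (*ptep & X86_PG_V)			/* x86-only, avoid */
		...;
#endif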
/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

#ifdef PMAP_DEBUG2
static int pmap_enter_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
    &pmap_enter_debug, 0, "Debug pmap_enter's");
#endif
static int pmap_yield_count = 64;
SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
    &pmap_yield_count, 0, "Yield during init_pt/release");
static int pmap_mmu_optimize = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
    &pmap_mmu_optimize, 0, "Share page table pages when possible");
int pmap_fast_kernel_cpusync = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
    &pmap_fast_kernel_cpusync, 0, "Quick (local-only) kernel pmap cpu sync");

#define DISABLE_PSE

/* Standard user access functions */
extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
    size_t *lencopied);
extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
extern int std_fubyte (const void *base);
extern int std_subyte (void *base, int byte);
extern long std_fuword (const void *base);
extern int std_suword (void *base, long word);
extern int std_suword32 (void *base, int word);

static void pv_hold(pv_entry_t pv);
static int _pv_hold_try(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_drop(pv_entry_t pv);
static void _pv_lock(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_unlock(pv_entry_t pv);
static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
				PMAP_DEBUG_DECL);
static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
				PMAP_DEBUG_DECL);
static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
static void pv_put(pv_entry_t pv);
static void pv_free(pv_entry_t pv);
static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		      pv_entry_t *pvpp);
static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
		      pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va);
static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
		      pmap_inval_bulk_t *bulk);
static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp,
		      pmap_inval_bulk_t *bulk);

struct pmap_scan_info;
static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);

static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static boolean_t pmap_testbit (vm_page_t m, int bit);

static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static void pmap_pinit_defaults(struct pmap *pmap);

static unsigned pdir4mb;

static int
pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
{
	if (pv1->pv_pindex < pv2->pv_pindex)
		return(-1);
	if (pv1->pv_pindex > pv2->pv_pindex)
		return(1);
	return(0);
}

RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
	     pv_entry_compare, vm_pindex_t, pv_pindex);
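/*
 * Illustrative sketch (not part of the build): RB_GENERATE2 emits a
 * lookup function keyed directly on pv_pindex, so a pv can be found
 * under the pmap spinlock without constructing a dummy node:
 */
#if 0
	spin_lock(&pmap->pm_spin);
	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
	spin_unlock(&pmap->pm_spin);
#endif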
static __inline
void
pmap_page_stats_adding(vm_page_t m)
{
	globaldata_t gd = mycpu;

	if (TAILQ_EMPTY(&m->md.pv_list)) {
		++gd->gd_vmtotal.t_arm;
	} else if (TAILQ_FIRST(&m->md.pv_list) ==
		   TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
		++gd->gd_vmtotal.t_armshr;
		++gd->gd_vmtotal.t_avmshr;
	} else {
		++gd->gd_vmtotal.t_avmshr;
	}
}

static __inline
void
pmap_page_stats_deleting(vm_page_t m)
{
	globaldata_t gd = mycpu;

	if (TAILQ_EMPTY(&m->md.pv_list)) {
		--gd->gd_vmtotal.t_arm;
	} else if (TAILQ_FIRST(&m->md.pv_list) ==
		   TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
		--gd->gd_vmtotal.t_armshr;
		--gd->gd_vmtotal.t_avmshr;
	} else {
		--gd->gd_vmtotal.t_avmshr;
	}
}

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return newaddr;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/*
 * Returns the pindex of a page table entry (representing a terminal page).
 * There are NUPTE_TOTAL page table entries possible (a huge number)
 *
 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
 * We want to properly translate negative KVAs.
 */
static __inline
vm_pindex_t
pmap_pte_pindex(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
}

/*
 * Returns the pindex of a page table.
 */
static __inline
vm_pindex_t
pmap_pt_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
}

/*
 * Returns the pindex of a page directory.
 */
static __inline
vm_pindex_t
pmap_pd_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + NUPT_TOTAL +
		((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
}

static __inline
vm_pindex_t
pmap_pdp_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
		((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
}

static __inline
vm_pindex_t
pmap_pml4_pindex(void)
{
	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
}
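/*
 * Illustrative note: the pv pindex space is partitioned bottom-up, so
 * for any va the pindexes returned above are strictly ordered:
 *
 *	[0, NUPTE_TOTAL)		terminal page table entries
 *	[+, NUPT_TOTAL more)		page table pages
 *	[+, NUPD_TOTAL more)		page directory pages
 *	[+, NUPDP_TOTAL more)		pdp pages
 *	one final pindex		the pml4 page
 */
#if 0
	KKASSERT(pmap_pte_pindex(va) < pmap_pt_pindex(va));
	KKASSERT(pmap_pt_pindex(va) < pmap_pd_pindex(va));
	KKASSERT(pmap_pd_pindex(va) < pmap_pdp_pindex(va));
	KKASSERT(pmap_pdp_pindex(va) < pmap_pml4_pindex());
#endif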
/*
 * Return various clipped indexes for a given VA
 *
 * Returns the index of a pt in a page directory, representing a page
 * table.
 */
static __inline
vm_pindex_t
pmap_pt_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

/*
 * Returns the index of a pd in a page directory page, representing a page
 * directory.
 */
static __inline
vm_pindex_t
pmap_pd_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

/*
 * Returns the index of a pdp in the pml4 table, representing a page
 * directory page.
 */
static __inline
vm_pindex_t
pmap_pdp_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/*
 * Generic procedure to index a pte from a pt, pd, or pdp.
 *
 * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is NOT
 *	 a page table page index but is instead a PV lookup index.
 */
static
void *
pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
	return(&pte[pindex]);
}

/*
 * Return pointer to PDP slot in the PML4
 */
static __inline
pml4_entry_t *
pmap_pdp(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP given the contents of the PML4 entry
 */
static __inline
pdp_entry_t *
pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
{
	pdp_entry_t *pd;

	pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
	return (&pd[pmap_pd_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP.
 */
static __inline
pdp_entry_t *
pmap_pd(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pdp;

	pdp = pmap_pdp(pmap, va);
	if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
		return NULL;
	return (pmap_pdp_to_pd(*pdp, va));
}

/*
 * Return pointer to PT slot in the PD given the contents of the PD entry
 */
static __inline
pd_entry_t *
pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va)
{
	pd_entry_t *pt;

	pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME);
	return (&pt[pmap_pt_index(va)]);
}
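/*
 * Illustrative sketch (not part of the build): given a locked pv
 * representing a page table page, the terminal pte for a va is reached
 * by combining pv_pte_lookup() with the clipped index, as
 * pmap_extract() does further down:
 */
#if 0
	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	if (*ptep & pmap->pmap_bits[PG_V_IDX])
		pa = *ptep & PG_FRAME;
#endif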
/*
 * Return pointer to PT slot in the PD
 *
 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs,
 *		     so we cannot lookup the PD via the PDP.  Instead we
 *		     must look it up via the pmap.
 */
static __inline
pd_entry_t *
pmap_pt(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pd;
	pv_entry_t pv;
	vm_pindex_t pd_pindex;

	if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
		pd_pindex = pmap_pd_pindex(va);
		spin_lock(&pmap->pm_spin);
		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex);
		spin_unlock(&pmap->pm_spin);
		if (pv == NULL || pv->pv_m == NULL)
			return NULL;
		return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va));
	} else {
		pd = pmap_pd(pmap, va);
		if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
			return NULL;
		return (pmap_pd_to_pt(*pd, va));
	}
}

/*
 * Return pointer to PTE slot in the PT given the contents of the PT entry
 */
static __inline
pt_entry_t *
pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/*
 * Return pointer to PTE slot in the PT
 */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pt;

	pt = pmap_pt(pmap, va);
	if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
		return NULL;
	if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
		return ((pt_entry_t *)pt);
	return (pmap_pt_to_pte(*pt, va));
}

/*
 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
 * the PT layer.  This will speed up core pmap operations considerably.
 *
 * NOTE: The pmap spinlock does not need to be held but the passed-in pv
 *	 must be in a known associated state (typically by being locked when
 *	 the pmap spinlock isn't held).  We allow the race for that case.
 */
static __inline
void
pv_cache(pv_entry_t pv, vm_pindex_t pindex)
{
	if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0))
		pv->pv_pmap->pm_pvhint = pv;
}


/*
 * Return address of PT slot in PD (KVM only)
 *
 * Cannot be used for user page tables because it might interfere with
 * the shared page-table-page optimization (pmap_mmu_optimize).
 */
static __inline
pd_entry_t *
vtopt(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

/*
 * KVM - return address of PTE slot in PT
 */
static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}
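/*
 * Illustrative sketch (not part of the build) of the full four-level
 * walk that pmap_pte() performs via the helpers above; each step must
 * be checked for NULL / invalid entries, and a PS (2MB) entry in the
 * PD terminates the walk early:
 */
#if 0
	pml4_entry_t *pdp = pmap_pdp(pmap, va);		/* PML4 slot */
	pdp_entry_t *pd = pmap_pdp_to_pd(*pdp, va);	/* PDP slot */
	pd_entry_t *pt = pmap_pd_to_pt(*pd, va);	/* PD slot */
	pt_entry_t *pte = pmap_pt_to_pte(*pt, va);	/* PT slot */
#endif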
static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;
	int j;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 *
	 * ndmpdp is the number of 1GB pages we wish to map.
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	KKASSERT(ndmpdp <= NKPDPE * NPDEPG);

	/*
	 * Starting at the beginning of kvm (not KERNBASE).
	 */
	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
		       ndmpdp) + 511) / 512;
	nkpt_phys += 128;

	/*
	 * Starting at KERNBASE - map 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Allocate pages
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);
	KPTphys = allocpages(firstaddr, nkpt_phys);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);

	/*
	 * Calculate the page directory base for KERNBASE,
	 * that is where we start populating the page table pages.
	 * Basically this is the end - 2GB.
	 */
	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);

	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_G_IDX];
	}

	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < nkpt_base; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX];
	}
	for (i = 0; i < nkpt_phys; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX];
	}

	/*
	 * Map from zero to end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTBase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_PS_IDX] |
		    pmap_bits_default[PG_G_IDX];
	}

	/*
	 * And connect up the PD to the PDP.  The kernel pmap is expected
	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
	 */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
		    KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_U_IDX];
	}

	/*
	 * Now set up the direct map space using either 2MB or 1GB pages
	 * Preset PG_M and PG_A because demotion expects it.
	 *
	 * When filling in entries in the PD pages make sure any excess
	 * entries are set to zero as we allocated enough PD pages
	 */
	if ((amd_feature & AMDID_PAGE1GB) == 0) {
		for (i = 0; i < NPDEPG * ndmpdp; i++) {
			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
			((pd_entry_t *)DMPDphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_PS_IDX] |
			    pmap_bits_default[PG_G_IDX] |
			    pmap_bits_default[PG_M_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}

		/*
		 * And the direct map space's PDP
		 */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
			    (i << PAGE_SHIFT);
			((pdp_entry_t *)DMPDPphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_U_IDX];
		}
	} else {
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] =
			    (vm_paddr_t)i << PDPSHIFT;
			((pdp_entry_t *)DMPDPphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_PS_IDX] |
			    pmap_bits_default[PG_G_IDX] |
			    pmap_bits_default[PG_M_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
	    pmap_bits_default[PG_RW_IDX] |
	    pmap_bits_default[PG_V_IDX] |
	    pmap_bits_default[PG_U_IDX];

	/*
	 * Connect the Direct Map slots up to the PML4
	 */
	for (j = 0; j < NDMPML4E; ++j) {
		((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
		    (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_U_IDX];
	}

	/*
	 * Connect the KVA slot up to the PML4
	 */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |=
	    pmap_bits_default[PG_RW_IDX] |
	    pmap_bits_default[PG_V_IDX] |
	    pmap_bits_default[PG_U_IDX];
}

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On x86-64 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;

	KvaStart = VM_MIN_KERNEL_ADDRESS;
	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	KvaSize = KvaEnd - KvaStart;

	avail_start = *firstaddr;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual2_start = KvaStart;
	virtual2_end = PTOV_OFFSET;

	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	virtual_start = pmap_kmem_choose(virtual_start);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 */
	kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
	kernel_pmap.pm_count = 1;
	CPUMASK_ASSALLONES(kernel_pmap.pm_active);
	RB_INIT(&kernel_pmap.pm_pvroot);
	spin_init(&kernel_pmap.pm_spin, "pmapbootstrap");
	lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
	pte = vtopte(va);

	/*
	 * CMAP1 is used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))

	virtual_start = va;
	virtual_start = pmap_kmem_choose(virtual_start);

	*CMAP1 = 0;

	/*
	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	 * cases rather than invlpg.  Actually, I don't even know why it
	 * works under UP with the self-referential page table mappings.
	 */
	// pgeflag = 0;

	/*
	 * Initialize the 2MB page size flag
	 */
	// pseflag = 0;
	/*
	 * The 2MB page version of the initial
	 * kernel page mapping.
	 */
	pdir4mb = 0;

#if !defined(DISABLE_PSE)
	if (cpu_feature & CPUID_PSE) {
		pt_entry_t ptditmp;
		/*
		 * Note that we have enabled PSE mode
		 */
		// pseflag = kernel_pmap.pmap_bits[PG_PS_IDX];
		ptditmp = *(PTmap + x86_64_btop(KERNBASE));
		ptditmp &= ~(NBPDR - 1);
		ptditmp |= pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_PS_IDX] |
		    pmap_bits_default[PG_U_IDX];
//		    pgeflag;
		pdir4mb = ptditmp;
	}
#endif
	cpu_invltlb();

	/* Initialize the PAT MSR */
	pmap_init_pat();
	pmap_pinit_defaults(&kernel_pmap);

	TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync",
			  &pmap_fast_kernel_cpusync);

}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	uint64_t pat_msr;
	u_long cr0, cr4;

	/*
	 * Default values mapping PATi,PCD,PWT bits at system reset.
	 * The default values effectively ignore the PATi bit by
	 * repeating the encodings for 0-3 in 4-7, and map the PCD
	 * and PWT bit combinations to the expected PAT types.
	 */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |	/* 000 */
		  PAT_VALUE(1, PAT_WRITE_THROUGH) |	/* 001 */
		  PAT_VALUE(2, PAT_UNCACHED) |		/* 010 */
		  PAT_VALUE(3, PAT_UNCACHEABLE) |	/* 011 */
		  PAT_VALUE(4, PAT_WRITE_BACK) |	/* 100 */
		  PAT_VALUE(5, PAT_WRITE_THROUGH) |	/* 101 */
		  PAT_VALUE(6, PAT_UNCACHED) |		/* 110 */
		  PAT_VALUE(7, PAT_UNCACHEABLE);	/* 111 */
	pat_pte_index[PAT_WRITE_BACK]	= 0;
	pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT;
	pat_pte_index[PAT_UNCACHED]	= X86_PG_NC_PCD;
	pat_pte_index[PAT_UNCACHEABLE]	= X86_PG_NC_PCD | X86_PG_NC_PWT;
	pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
	pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];

	if (cpu_feature & CPUID_PAT) {
		/*
		 * If we support the PAT then set-up entries for
		 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns
		 * 4 and 5.
		 */
		pat_msr = (pat_msr & ~PAT_MASK(4)) |
			  PAT_VALUE(4, PAT_WRITE_PROTECTED);
		pat_msr = (pat_msr & ~PAT_MASK(5)) |
			  PAT_VALUE(5, PAT_WRITE_COMBINING);
		pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0;
		pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT |
						     X86_PG_NC_PWT;

		/*
		 * Then enable the PAT
		 */

		/* Disable PGE. */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);

		/* Disable caches (CD = 1, NW = 0). */
		cr0 = rcr0();
		load_cr0((cr0 & ~CR0_NW) | CR0_CD);

		/* Flushes caches and TLBs. */
		wbinvd();
		cpu_invltlb();

		/* Update PAT and index table. */
		wrmsr(MSR_PAT, pat_msr);

		/* Flush caches and TLBs again. */
		wbinvd();
		cpu_invltlb();

		/* Restore caches and PGE. */
		load_cr0(cr0);
		load_cr4(cr4);
		PatMsr = pat_msr;
	}
}

/*
 * Set 2MB pdir for mp startup
 */
void
pmap_set_opt(void)
{
	if (cpu_feature & CPUID_PSE) {
		load_cr4(rcr4() | CR4_PSE);
		if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
			cpu_invltlb();
		}
	}
}

/*
 * Initialize the pmap module.
 *
 * Called by vm_init, to initialize any structures that the pmap system
 * needs to map virtual memory.  pmap_init has been enhanced to support
 * discontiguous physical memory in a fairly consistent way.
 */
void
pmap_init(void)
{
	int i;
	int initial_pvs;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	for (i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (void *)kmem_alloc(&kernel_map,
				    initial_pvs * sizeof (struct pv_entry));
	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
		  pvinit, initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}
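/*
 * Illustrative note: the pv zone is brought up in two stages.
 * zbootinit() above seeds it with the statically kmem_alloc'd pvinit
 * array so that pv allocations work early in boot; pmap_init2() below
 * finishes the job with zinitna(), sizing the zone from the tunables
 * it fetches, e.g. (hypothetical values) in /boot/loader.conf:
 *
 *	vm.pmap.shpgperproc="2000"
 *	vm.pmap.pv_entries="4000000"
 */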
/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;
	int entry_max;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * Subtract out pages already installed in the zone (hack)
	 */
	entry_max = pv_entry_max - vm_page_array_size;
	if (entry_max <= 0)
		entry_max = 1;

	zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
}

/*
 * Typically used to initialize a fictitious page by vm/device_pager.c
 */
void
pmap_page_init(struct vm_page *m)
{
	vm_page_init(m);
	TAILQ_INIT(&m->md.pv_list);
}

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * This routine defines the region(s) of memory that should
 * not be tested for the modified bit.
 */
static __inline
int
pmap_track_modified(vm_pindex_t pindex)
{
	vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}

/*
 * Extract the physical page address associated with the map/VA pair.
 * The page must be wired for this to work reliably.
 *
 * XXX for the moment we're using pv_find() instead of pv_get(), as
 *     callers might be expecting non-blocking operation.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pv_entry_t pt_pv;
	pt_entry_t *ptep;

	rtval = 0;
	if (va >= VM_MAX_USER_ADDRESS) {
		/*
		 * Kernel page directories might be direct-mapped and
		 * there is typically no PV tracking of pte's
		 */
		pd_entry_t *pt;

		pt = pmap_pt(pmap, va);
		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
				rtval = *pt & PG_PS_FRAME;
				rtval |= va & PDRMASK;
			} else {
				ptep = pmap_pt_to_pte(*pt, va);
				if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
					rtval = *ptep & PG_FRAME;
					rtval |= va & PAGE_MASK;
				}
			}
		}
	} else {
		/*
		 * User pages currently do not direct-map the page directory
		 * and some pages might not use managed PVs.  But all PT's
		 * will have a PV.
		 */
		pt_pv = pv_find(pmap, pmap_pt_pindex(va));
		if (pt_pv) {
			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
				rtval = *ptep & PG_FRAME;
				rtval |= va & PAGE_MASK;
			}
			pv_drop(pt_pv);
		}
	}
	return rtval;
}
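/*
 * Illustrative usage sketch (not part of the build): pmap_extract()
 * is the general wired va->pa lookup for any pmap, while
 * pmap_kextract() below is the cheaper kernel-only variant that also
 * resolves DMAP addresses directly:
 */
#if 0
	pa = pmap_extract(&kernel_pmap, va);	/* any pmap, pv/pt based */
	pa = pmap_kextract(va);			/* kernel va only */
#endif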
/*
 * Similar to extract but checks protections, SMP-friendly short-cut for
 * vm_fault_page[_quick]().  Can return NULL to cause the caller to
 * fall-through to the real fault code.
 *
 * The returned page, if not NULL, is held (and not busied).
 */
vm_page_t
pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	if (pmap && va < VM_MAX_USER_ADDRESS) {
		pv_entry_t pt_pv;
		pv_entry_t pte_pv;
		pt_entry_t *ptep;
		pt_entry_t req;
		vm_page_t m;
		int error;

		req = pmap->pmap_bits[PG_V_IDX] |
		      pmap->pmap_bits[PG_U_IDX];
		if (prot & VM_PROT_WRITE)
			req |= pmap->pmap_bits[PG_RW_IDX];

		pt_pv = pv_find(pmap, pmap_pt_pindex(va));
		if (pt_pv == NULL)
			return (NULL);
		ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
		if ((*ptep & req) != req) {
			pv_drop(pt_pv);
			return (NULL);
		}
		pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), &error);
		if (pte_pv && error == 0) {
			m = pte_pv->pv_m;
			vm_page_hold(m);
			if (prot & VM_PROT_WRITE)
				vm_page_dirty(m);
			pv_put(pte_pv);
		} else if (pte_pv) {
			pv_drop(pte_pv);
			m = NULL;
		} else {
			m = NULL;
		}
		pv_drop(pt_pv);
		return(m);
	} else {
		return(NULL);
	}
}

/*
 * Extract the physical page address associated with a kernel virtual
 * address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pt;	/* pt entry in pd */
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		pt = *vtopt(va);
		if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) {
			pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pt_to_pte(pt, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return pa;
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Routine: pmap_kenter
 * Function:
 *	Add a wired page to the KVA.  NOTE! The mapping only takes effect
 *	once the TLB entry is invalidated; this version performs the
 *	invalidation itself via pmap_inval_smp().
 */
void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *ptep;
	pt_entry_t npte;

	npte = pa |
	       kernel_pmap.pmap_bits[PG_RW_IDX] |
	       kernel_pmap.pmap_bits[PG_V_IDX];
//	       pgeflag;
	ptep = vtopte(va);
#if 1
	pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte);
#else
	/* FUTURE */
	if (*ptep)
		pmap_inval_smp(&kernel_pmap, va, ptep, npte);
	else
		*ptep = npte;
#endif
}
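/*
 * Illustrative sketch (not part of the build) of the pmap_kenter*()
 * family: the version above invalidates on all cpus via
 * pmap_inval_smp(), the _quick variant below only invalidates on the
 * current cpu, and the _noinval variant defers invalidation to a later
 * rollup via pmap_rollup_inval():
 */
#if 0
	pmap_kenter(va, pa);		/* SMP invalidation */
	pmap_kenter_quick(va, pa);	/* local cpu invalidation only */
	pmap_kenter_noinval(va, pa);	/* caller invalidates later */
#endif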
/*
 * Similar to pmap_kenter(), except we only invalidate the mapping on the
 * current CPU.  Returns 0 if the previous pte was 0, 1 if it wasn't
 * (caller can conditionalize calling smp_invltlb()).
 */
int
pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *ptep;
	pt_entry_t npte;
	int res;

	npte = pa |
	       kernel_pmap.pmap_bits[PG_RW_IDX] |
	       kernel_pmap.pmap_bits[PG_V_IDX];
//	       pgeflag;
	ptep = vtopte(va);
#if 1
	res = 1;
#else
	/* FUTURE */
	res = (*ptep != 0);
#endif
	*ptep = npte;
	cpu_invlpg((void *)va);

	return res;
}

/*
 * Enter addresses into the kernel pmap but don't bother
 * doing any tlb invalidations.  Caller will do a rollup
 * invalidation via pmap_rollup_inval().
 */
int
pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *ptep;
	pt_entry_t npte;
	int res;

	npte = pa |
	       kernel_pmap.pmap_bits[PG_RW_IDX] |
	       kernel_pmap.pmap_bits[PG_V_IDX];
//	       pgeflag;
	ptep = vtopte(va);
#if 1
	res = 1;
#else
	/* FUTURE */
	res = (*ptep != 0);
#endif
	*ptep = npte;

	return res;
}

/*
 * Remove a page from the kernel pagetables
 */
void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *ptep;

	ptep = vtopte(va);
	pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0);
}

void
pmap_kremove_quick(vm_offset_t va)
{
	pt_entry_t *ptep;

	ptep = vtopte(va);
	(void)pte_load_clear(ptep);
	cpu_invlpg((void *)va);
}

/*
 * Remove addresses from the kernel pmap but don't bother
 * doing any tlb invalidations.  Caller will do a rollup
 * invalidation via pmap_rollup_inval().
 */
void
pmap_kremove_noinval(vm_offset_t va)
{
	pt_entry_t *ptep;

	ptep = vtopte(va);
	(void)pte_load_clear(ptep);
}

/*
 * XXX these need to be recoded.  They are not used in any critical path.
 */
void
pmap_kmodify_rw(vm_offset_t va)
{
	atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]);
	cpu_invlpg((void *)va);
}

/* NOT USED
void
pmap_kmodify_nc(vm_offset_t va)
{
	atomic_set_long(vtopte(va), PG_N);
	cpu_invlpg((void *)va);
}
*/

/*
 * Used to map a range of physical addresses into kernel virtual
 * address space during the low level boot, typically to map the
 * dump bitmap, message buffer, and vm_page_array.
 *
 * These mappings are typically made at some point after the end of the
 * kernel text+data.
 *
 * We could return PHYS_TO_DMAP(start) here and not allocate any
 * kva via (*virtp), but then kmem from userland and kernel dumps won't
 * have access to the related pointers.
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va;
	vm_offset_t va_start;

	/*return PHYS_TO_DMAP(start);*/

	va_start = *virtp;
	va = va_start;

	while (start < end) {
		pmap_kenter_quick(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	*virtp = va;
	return va_start;
}

#define PMAP_CLFLUSH_THRESHOLD  (2 * 1024 * 1024)
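/*
 * Illustrative note on the threshold above: with a 4K page and a
 * 64-byte cache line, one page costs PAGE_SIZE / 64 = 64 clflush
 * operations, so past 2MB (512 pages) a single wbinvd() of the whole
 * cache is assumed to be cheaper than line-by-line flushing.
 */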
/*
 * Remove the specified set of pages from the data and instruction caches.
 *
 * In contrast to pmap_invalidate_cache_range(), this function does not
 * rely on the CPU's self-snoop feature, because it is intended for use
 * when moving pages into a different cache domain.
 */
void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	vm_offset_t daddr, eva;
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0)
		wbinvd();
	else {
		cpu_mfence();
		for (i = 0; i < count; i++) {
			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
			eva = daddr + PAGE_SIZE;
			for (; daddr < eva; daddr += cpu_clflush_line_size)
				clflush(daddr);
		}
		cpu_mfence();
	}
}

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{
	KASSERT((sva & PAGE_MASK) == 0,
		("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
		("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS) {
		; /* If "Self Snoop" is supported, do nothing. */
	} else {
		/* Globally invalidate caches */
		cpu_wbinvd_on_all_cpus();
	}
}

/*
 * Invalidate the specified range of virtual memory on all cpus associated
 * with the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0);
}

/*
 * Add a list of wired pages to the kva.  This routine is used for temporary
 * kernel mappings such as those found in buffer cache buffers.  Page
 * modifications and accesses are not tracked or recorded.
 *
 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed
 *	 semantics as previous mappings may have been zeroed without any
 *	 invalidation.
 *
 * The page *must* be wired.
 */
void
pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = VM_PAGE_TO_PHYS(*m) |
		       kernel_pmap.pmap_bits[PG_RW_IDX] |
		       kernel_pmap.pmap_bits[PG_V_IDX] |
		       kernel_pmap.pmap_cache_bits[(*m)->pat_mode];
//		       pgeflag;
		m++;
	}
	pmap_invalidate_range(&kernel_pmap, beg_va, end_va);
}

/*
 * This routine jerks page mappings from the kernel -- it is meant only
 * for temporary mappings such as those found in buffer cache buffers.
 * No modified or access status is recorded.
 *
 * MPSAFE, INTERRUPT SAFE (cluster callback)
 */
void
pmap_qremove(vm_offset_t beg_va, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		(void)pte_load_clear(pte);
		cpu_invlpg((void *)va);
	}
	pmap_invalidate_range(&kernel_pmap, beg_va, end_va);
}
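/*
 * Illustrative usage sketch (not part of the build, hypothetical
 * names): pmap_qenter() and pmap_qremove() pair up for temporary
 * buffer mappings, e.g. mapping a buffer's pages into a reserved kva
 * range for the duration of an i/o:
 */
#if 0
	pmap_qenter(buf_kva, buf_pages, buf_npages);
	/* ... perform i/o on buf_kva ... */
	pmap_qremove(buf_kva, buf_npages);
#endif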
/*
 * This routine removes temporary kernel mappings, only invalidating them
 * on the current cpu.  It should only be used under carefully controlled
 * conditions.
 */
void
pmap_qremove_quick(vm_offset_t beg_va, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		(void)pte_load_clear(pte);
		cpu_invlpg((void *)va);
	}
}

/*
 * This routine removes temporary kernel mappings *without* invalidating
 * the TLB.  It can only be used on permanent kva reservations such as those
 * found in buffer cache buffers, under carefully controlled circumstances.
 *
 * NOTE: Repopulating these KVAs requires unconditional invalidation.
 *	 (pmap_qenter() does unconditional invalidation).
 */
void
pmap_qremove_noinval(vm_offset_t beg_va, int count)
{
	vm_offset_t end_va;
	vm_offset_t va;

	end_va = beg_va + count * PAGE_SIZE;

	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
		pt_entry_t *pte;

		pte = vtopte(va);
		(void)pte_load_clear(pte);
	}
}

/*
 * Create a new thread and optionally associate it with a (new) process.
 * NOTE! the new thread's cpu may not equal the current cpu.
 */
void
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement & alignment */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb;	/* no -16 */
}

/*
 * This routine directly affects the fork perf for a process.
 */
void
pmap_init_proc(struct proc *p)
{
}

static void
pmap_pinit_defaults(struct pmap *pmap)
{
	bcopy(pmap_bits_default, pmap->pmap_bits,
	      sizeof(pmap_bits_default));
	bcopy(protection_codes, pmap->protection_codes,
	      sizeof(protection_codes));
	bcopy(pat_pte_index, pmap->pmap_cache_bits,
	      sizeof(pat_pte_index));
	pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT;
	pmap->copyinstr = std_copyinstr;
	pmap->copyin = std_copyin;
	pmap->copyout = std_copyout;
	pmap->fubyte = std_fubyte;
	pmap->subyte = std_subyte;
	pmap->fuword = std_fuword;
	pmap->suword = std_suword;
	pmap->suword32 = std_suword32;
}

/*
 * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
 * it, and IdlePTD, represents the template used to update all other pmaps.
 *
 * On architectures where the kernel pmap is not integrated into the user
 * process pmap, this pmap represents the process pmap, not the kernel pmap.
 * kernel_pmap should be used to directly access the kernel_pmap.
 */
void
pmap_pinit0(struct pmap *pmap)
{
	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
	pmap->pm_count = 1;
	CPUMASK_ASSZERO(pmap->pm_active);
	pmap->pm_pvhint = NULL;
	RB_INIT(&pmap->pm_pvroot);
	spin_init(&pmap->pm_spin, "pmapinit0");
	lwkt_token_init(&pmap->pm_token, "pmap_tok");
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap_pinit_defaults(pmap);
}
/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
static void
pmap_pinit_simple(struct pmap *pmap)
{
	/*
	 * Misc initialization
	 */
	pmap->pm_count = 1;
	CPUMASK_ASSZERO(pmap->pm_active);
	pmap->pm_pvhint = NULL;
	pmap->pm_flags = PMAP_FLAG_SIMPLE;

	pmap_pinit_defaults(pmap);

	/*
	 * Don't blow up locks/tokens on re-use (XXX fix/use drop code
	 * for this).
	 */
	if (pmap->pm_pmlpv == NULL) {
		RB_INIT(&pmap->pm_pvroot);
		bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
		spin_init(&pmap->pm_spin, "pmapinitsimple");
		lwkt_token_init(&pmap->pm_token, "pmap_tok");
	}
}

void
pmap_pinit(struct pmap *pmap)
{
	pv_entry_t pv;
	int j;

	if (pmap->pm_pmlpv) {
		if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) {
			pmap_puninit(pmap);
		}
	}

	pmap_pinit_simple(pmap);
	pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pml4 == NULL) {
		pmap->pm_pml4 =
		    (pml4_entry_t *)kmem_alloc_pageable(&kernel_map,
							PAGE_SIZE);
	}

	/*
	 * Allocate the page directory page, which wires it even though
	 * it isn't being entered into some higher level page table (it
	 * being the highest level).  If one is already cached we don't
	 * have to do anything.
	 */
	if ((pv = pmap->pm_pmlpv) == NULL) {
		pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
		pmap->pm_pmlpv = pv;
		pmap_kenter((vm_offset_t)pmap->pm_pml4,
			    VM_PAGE_TO_PHYS(pv->pv_m));
		pv_put(pv);

		/*
		 * Install DMAP and KMAP.
		 */
		for (j = 0; j < NDMPML4E; ++j) {
			pmap->pm_pml4[DMPML4I + j] =
			    (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
			    pmap->pmap_bits[PG_RW_IDX] |
			    pmap->pmap_bits[PG_V_IDX] |
			    pmap->pmap_bits[PG_U_IDX];
		}
		pmap->pm_pml4[KPML4I] = KPDPphys |
		    pmap->pmap_bits[PG_RW_IDX] |
		    pmap->pmap_bits[PG_V_IDX] |
		    pmap->pmap_bits[PG_U_IDX];

		/*
		 * install self-referential address mapping entry
		 */
		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
		    pmap->pmap_bits[PG_V_IDX] |
		    pmap->pmap_bits[PG_RW_IDX] |
		    pmap->pmap_bits[PG_A_IDX] |
		    pmap->pmap_bits[PG_M_IDX];
	} else {
		KKASSERT(pv->pv_m->flags & PG_MAPPED);
		KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
	}
	KKASSERT(pmap->pm_pml4[255] == 0);
	KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv);
	KKASSERT(pv->pv_entry.rbe_left == NULL);
	KKASSERT(pv->pv_entry.rbe_right == NULL);
}
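/*
 * Illustrative lifecycle sketch (not part of the build): a dynamically
 * created pmap goes through pmap_pinit(), then pmap_pinit2() below to
 * join the growkernel list, and is torn down with pmap_release()
 * followed by pmap_puninit(); pmap_allocpte_seg() further down uses
 * this same teardown sequence on its race-loser object pmaps:
 */
#if 0
	pmap_pinit(pmap);
	pmap_pinit2(pmap);
	/* ... use ... */
	pmap_release(pmap);
	pmap_puninit(pmap);
#endif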
/*
 * Clean up a pmap structure so it can be physically freed.  This routine
 * is called by the vmspace dtor function.  A great deal of pmap data is
 * left passively mapped to improve vmspace management so we have a bit
 * of cleanup work to do here.
 */
void
pmap_puninit(pmap_t pmap)
{
	pv_entry_t pv;
	vm_page_t p;

	KKASSERT(CPUMASK_TESTZERO(pmap->pm_active));
	if ((pv = pmap->pm_pmlpv) != NULL) {
		if (pv_hold_try(pv) == 0)
			pv_lock(pv);
		KKASSERT(pv == pmap->pm_pmlpv);
		p = pmap_remove_pv_page(pv);
		pv_free(pv);
		pmap_kremove((vm_offset_t)pmap->pm_pml4);
		vm_page_busy_wait(p, FALSE, "pgpun");
		KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
		vm_page_unwire(p, 0);
		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);

		/*
		 * XXX eventually clean out PML4 static entries and
		 * use vm_page_free_zero()
		 */
		vm_page_free(p);
		pmap->pm_pmlpv = NULL;
	}
	if (pmap->pm_pml4) {
		KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
		kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
		pmap->pm_pml4 = NULL;
	}
	KKASSERT(pmap->pm_stats.resident_count == 0);
	KKASSERT(pmap->pm_stats.wired_count == 0);
}

/*
 * Wire in kernel global address entries.  To avoid a race condition
 * between pmap initialization and pmap_growkernel, this procedure
 * adds the pmap to the master list (which growkernel scans to update),
 * then copies the template.
 */
void
pmap_pinit2(struct pmap *pmap)
{
	spin_lock(&pmap_spin);
	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
	spin_unlock(&pmap_spin);
}

/*
 * This routine is called when various levels in the page table need to
 * be populated.  This routine cannot fail.
 *
 * This function returns two locked pv_entry's, one representing the
 * requested pv and one representing the requested pv's parent pv.  If
 * the pv did not previously exist it will be mapped into its parent
 * and wired, otherwise no additional wire count will be added.
 */
static
pv_entry_t
pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
{
	pt_entry_t *ptep;
	pv_entry_t pv;
	pv_entry_t pvp;
	vm_pindex_t pt_pindex;
	vm_page_t m;
	int isnew;
	int ispt;

	/*
	 * If the pv already exists and we aren't being asked for the
	 * parent page table page we can just return it.  A locked+held pv
	 * is returned.  The pv will also have a second hold related to the
	 * pmap association that we don't have to worry about.
	 */
	ispt = 0;
	pv = pv_alloc(pmap, ptepindex, &isnew);
	if (isnew == 0 && pvpp == NULL)
		return(pv);

	/*
	 * Special case terminal PVs.  These are not page table pages so
	 * no vm_page is allocated (the caller supplied the vm_page).  If
	 * pvpp is non-NULL we are being asked to also allocate and return
	 * the pt_pv for this pv.
	 *
	 * Note that pt_pv's are only returned for user VAs. We assert that
	 * a pt_pv is not being requested for kernel VAs.
	 */
	if (ptepindex < pmap_pt_pindex(0)) {
		if (ptepindex >= NUPTE_USER)
			KKASSERT(pvpp == NULL);
		else
			KKASSERT(pvpp != NULL);
		if (pvpp) {
			pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
			pvp = pmap_allocpte(pmap, pt_pindex, NULL);
			if (isnew)
				vm_page_wire_quick(pvp->pv_m);
			*pvpp = pvp;
		} else {
			pvp = NULL;
		}
		return(pv);
	}

	/*
	 * Non-terminal PVs allocate a VM page to represent the page table,
	 * so we have to resolve pvp and calculate ptepindex for the pvp
	 * and then for the page table entry index in the pvp for
	 * fall-through.
	 */
	if (ptepindex < pmap_pd_pindex(0)) {
		/*
		 * pv is PT, pvp is PD
		 */
		ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
		ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
		pvp = pmap_allocpte(pmap, ptepindex, NULL);
		if (!isnew)
			goto notnew;

		/*
		 * PT index in PD
		 */
		ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
		ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
		ispt = 1;
	} else if (ptepindex < pmap_pdp_pindex(0)) {
		/*
		 * pv is PD, pvp is PDP
		 *
		 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above
		 *		     the PD.
		 */
		ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
		ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;

		if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
			KKASSERT(pvpp == NULL);
			pvp = NULL;
		} else {
			pvp = pmap_allocpte(pmap, ptepindex, NULL);
		}
		if (!isnew)
			goto notnew;

		/*
		 * PD index in PDP
		 */
		ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
		ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
	} else if (ptepindex < pmap_pml4_pindex()) {
		/*
		 * pv is PDP, pvp is the root pml4 table
		 */
		pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
		if (!isnew)
			goto notnew;

		/*
		 * PDP index in PML4
		 */
		ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
		ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
	} else {
		/*
		 * pv represents the top-level PML4, there is no parent.
		 */
		pvp = NULL;
		if (!isnew)
			goto notnew;
	}

	/*
	 * This code is only reached if isnew is TRUE and this is not a
	 * terminal PV.  We need to allocate a vm_page for the page table
	 * at this level and enter it into the parent page table.
	 *
	 * page table pages are marked PG_WRITEABLE and PG_MAPPED.
	 */
	for (;;) {
		m = vm_page_alloc(NULL, pv->pv_pindex,
				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
				  VM_ALLOC_INTERRUPT);
		if (m)
			break;
		vm_wait(0);
	}
	vm_page_spin_lock(m);
	pmap_page_stats_adding(m);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	pv->pv_m = m;
	vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
	vm_page_spin_unlock(m);
	vm_page_unmanage(m);	/* m must be spinunlocked */

	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_wire(m);	/* wire for mapping in parent */

	/*
	 * Wire the page into pvp, bump the wire-count for pvp's page table
	 * page.  Bump the resident_count for the pmap.  There is no pvp
	 * for the top level, address the pm_pml4[] array directly.
	 *
	 * If the caller wants the parent we return it, otherwise
	 * we just put it away.
	 *
	 * No interlock is needed for pte 0 -> non-zero.
2028 * 2029 * In the situation where *ptep is valid we might have an unmanaged 2030 * page table page shared from another page table which we need to 2031 * unshare before installing our private page table page. 2032 */ 2033 if (pvp) { 2034 ptep = pv_pte_lookup(pvp, ptepindex); 2035 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2036 pt_entry_t pte; 2037 2038 if (ispt == 0) { 2039 panic("pmap_allocpte: unexpected pte %p/%d", 2040 pvp, (int)ptepindex); 2041 } 2042 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 2043 if (vm_page_unwire_quick( 2044 PHYS_TO_VM_PAGE(pte & PG_FRAME))) { 2045 panic("pmap_allocpte: shared pgtable " 2046 "pg bad wirecount"); 2047 } 2048 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2049 } else { 2050 vm_page_wire_quick(pvp->pv_m); 2051 } 2052 *ptep = VM_PAGE_TO_PHYS(m) | 2053 (pmap->pmap_bits[PG_U_IDX] | 2054 pmap->pmap_bits[PG_RW_IDX] | 2055 pmap->pmap_bits[PG_V_IDX] | 2056 pmap->pmap_bits[PG_A_IDX] | 2057 pmap->pmap_bits[PG_M_IDX]); 2058 } 2059 vm_page_wakeup(m); 2060 notnew: 2061 if (pvpp) 2062 *pvpp = pvp; 2063 else if (pvp) 2064 pv_put(pvp); 2065 return (pv); 2066 } 2067 2068 /* 2069 * This version of pmap_allocpte() checks for possible segment optimizations 2070 * that would allow page-table sharing. It can be called for terminal 2071 * page or page table page ptepindex's. 2072 * 2073 * The function is called with page table page ptepindex's for fictitious 2074 * and unmanaged terminal pages. That is, we don't want to allocate a 2075 * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL 2076 * for this case. 2077 * 2078 * This function can return a pv and *pvpp associated with the passed-in pmap 2079 * OR a pv and *pvpp associated with the shared pmap. In the latter case 2080 * an unmanaged page table page will be entered into the passed-in pmap. 2081 */ 2082 static 2083 pv_entry_t 2084 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, 2085 vm_map_entry_t entry, vm_offset_t va) 2086 { 2087 vm_object_t object; 2088 pmap_t obpmap; 2089 pmap_t *obpmapp; 2090 vm_offset_t b; 2091 pv_entry_t pte_pv; /* in original or shared pmap */ 2092 pv_entry_t pt_pv; /* in original or shared pmap */ 2093 pv_entry_t proc_pd_pv; /* in original pmap */ 2094 pv_entry_t proc_pt_pv; /* in original pmap */ 2095 pv_entry_t xpv; /* PT in shared pmap */ 2096 pd_entry_t *pt; /* PT entry in PD of original pmap */ 2097 pd_entry_t opte; /* contents of *pt */ 2098 pd_entry_t npte; /* contents of *pt */ 2099 vm_page_t m; 2100 2101 retry: 2102 /* 2103 * Basic tests, require a non-NULL vm_map_entry, require proper 2104 * alignment and type for the vm_map_entry, require that the 2105 * underlying object already be allocated. 2106 * 2107 * We allow almost any type of object to use this optimization. 2108 * The object itself does NOT have to be sized to a multiple of the 2109 * segment size, but the memory mapping does. 2110 * 2111 * XXX don't handle devices currently, because VM_PAGE_TO_PHYS() 2112 * won't work as expected.
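 *
 * Editor's example (illustrative): a MAP_SHARED mmap() whose
 * entry->start and entry->offset are both SEG_SIZE aligned and
 * which spans at least one full segment passes these tests, and
 * the page table page covering that segment can then be shared
 * by every pmap mapping the object.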
2113 */ 2114 if (entry == NULL || 2115 pmap_mmu_optimize == 0 || /* not enabled */ 2116 (pmap->pm_flags & PMAP_HVM) || /* special pmap */ 2117 ptepindex >= pmap_pd_pindex(0) || /* not terminal or pt */ 2118 entry->inheritance != VM_INHERIT_SHARE || /* not shared */ 2119 entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ 2120 entry->object.vm_object == NULL || /* needs VM object */ 2121 entry->object.vm_object->type == OBJT_DEVICE || /* ick */ 2122 entry->object.vm_object->type == OBJT_MGTDEVICE || /* ick */ 2123 (entry->offset & SEG_MASK) || /* must be aligned */ 2124 (entry->start & SEG_MASK)) { 2125 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2126 } 2127 2128 /* 2129 * Make sure the full segment can be represented. 2130 */ 2131 b = va & ~(vm_offset_t)SEG_MASK; 2132 if (b < entry->start || b + SEG_SIZE > entry->end) 2133 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2134 2135 /* 2136 * If the full segment can be represented dive the VM object's 2137 * shared pmap, allocating as required. 2138 */ 2139 object = entry->object.vm_object; 2140 2141 if (entry->protection & VM_PROT_WRITE) 2142 obpmapp = &object->md.pmap_rw; 2143 else 2144 obpmapp = &object->md.pmap_ro; 2145 2146 #ifdef PMAP_DEBUG2 2147 if (pmap_enter_debug > 0) { 2148 --pmap_enter_debug; 2149 kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p " 2150 "obpmapp %p %p\n", 2151 va, entry->protection, object, 2152 obpmapp, *obpmapp); 2153 kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n", 2154 entry, entry->start, entry->end); 2155 } 2156 #endif 2157 2158 /* 2159 * We allocate what appears to be a normal pmap but because portions 2160 * of this pmap are shared with other unrelated pmaps we have to 2161 * set pm_active to point to all cpus. 2162 * 2163 * XXX Currently using pmap_spin to interlock the update, can't use 2164 * vm_object_hold/drop because the token might already be held 2165 * shared OR exclusive and we don't know. 2166 */ 2167 while ((obpmap = *obpmapp) == NULL) { 2168 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); 2169 pmap_pinit_simple(obpmap); 2170 pmap_pinit2(obpmap); 2171 spin_lock(&pmap_spin); 2172 if (*obpmapp != NULL) { 2173 /* 2174 * Handle race 2175 */ 2176 spin_unlock(&pmap_spin); 2177 pmap_release(obpmap); 2178 pmap_puninit(obpmap); 2179 kfree(obpmap, M_OBJPMAP); 2180 obpmap = *obpmapp; /* safety */ 2181 } else { 2182 obpmap->pm_active = smp_active_mask; 2183 obpmap->pm_flags |= PMAP_SEGSHARED; 2184 *obpmapp = obpmap; 2185 spin_unlock(&pmap_spin); 2186 } 2187 } 2188 2189 /* 2190 * Layering is: PTE, PT, PD, PDP, PML4. We have to return the 2191 * pte/pt using the shared pmap from the object but also adjust 2192 * the process pmap's page table page as a side effect. 2193 */ 2194 2195 /* 2196 * Resolve the terminal PTE and PT in the shared pmap. This is what 2197 * we will return. This is true if ptepindex represents a terminal 2198 * page, otherwise pte_pv is actually the PT and pt_pv is actually 2199 * the PD. 2200 */ 2201 pt_pv = NULL; 2202 pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); 2203 if (ptepindex >= pmap_pt_pindex(0)) 2204 xpv = pte_pv; 2205 else 2206 xpv = pt_pv; 2207 2208 /* 2209 * Resolve the PD in the process pmap so we can properly share the 2210 * page table page. Lock order is bottom-up (leaf first)! 2211 * 2212 * NOTE: proc_pt_pv can be NULL. 
2213 */ 2214 proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b)); 2215 proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); 2216 #ifdef PMAP_DEBUG2 2217 if (pmap_enter_debug > 0) { 2218 --pmap_enter_debug; 2219 kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n", 2220 proc_pt_pv, 2221 (proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1), 2222 proc_pd_pv, 2223 va); 2224 } 2225 #endif 2226 2227 /* 2228 * xpv is the page table page pv from the shared object 2229 * (for convenience), from above. 2230 * 2231 * Calculate the pte value for the PT to load into the process PD. 2232 * If we have to change it we must properly dispose of the previous 2233 * entry. 2234 */ 2235 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2236 npte = VM_PAGE_TO_PHYS(xpv->pv_m) | 2237 (pmap->pmap_bits[PG_U_IDX] | 2238 pmap->pmap_bits[PG_RW_IDX] | 2239 pmap->pmap_bits[PG_V_IDX] | 2240 pmap->pmap_bits[PG_A_IDX] | 2241 pmap->pmap_bits[PG_M_IDX]); 2242 2243 /* 2244 * Dispose of previous page table page if it was local to the 2245 * process pmap. If the old pt is not empty we cannot dispose of it 2246 * until we clean it out. This case should not arise very often so 2247 * it is not optimized. 2248 */ 2249 if (proc_pt_pv) { 2250 pmap_inval_bulk_t bulk; 2251 2252 if (proc_pt_pv->pv_m->wire_count != 1) { 2253 pv_put(proc_pd_pv); 2254 pv_put(proc_pt_pv); 2255 pv_put(pt_pv); 2256 pv_put(pte_pv); 2257 pmap_remove(pmap, 2258 va & ~(vm_offset_t)SEG_MASK, 2259 (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK); 2260 goto retry; 2261 } 2262 2263 /* 2264 * The release call will indirectly clean out *pt 2265 */ 2266 pmap_inval_bulk_init(&bulk, proc_pt_pv->pv_pmap); 2267 pmap_release_pv(proc_pt_pv, proc_pd_pv, &bulk); 2268 pmap_inval_bulk_flush(&bulk); 2269 proc_pt_pv = NULL; 2270 /* relookup */ 2271 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2272 } 2273 2274 /* 2275 * Handle remaining cases. 2276 */ 2277 if (*pt == 0) { 2278 *pt = npte; 2279 vm_page_wire_quick(xpv->pv_m); 2280 vm_page_wire_quick(proc_pd_pv->pv_m); 2281 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2282 } else if (*pt != npte) { 2283 opte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, pt, npte); 2284 2285 #if 0 2286 opte = pte_load_clear(pt); 2287 KKASSERT(opte && opte != npte); 2288 2289 *pt = npte; 2290 #endif 2291 vm_page_wire_quick(xpv->pv_m); /* pgtable pg that is npte */ 2292 2293 /* 2294 * Clean up opte, bump the wire_count for the process 2295 * PD page representing the new entry if it was 2296 * previously empty. 2297 * 2298 * If the entry was not previously empty and we have 2299 * a PT in the proc pmap then opte must match that 2300 * pt. The proc pt must be retired (this is done 2301 * later on in this procedure). 2302 * 2303 * NOTE: replacing valid pte, wire_count on proc_pd_pv 2304 * stays the same. 2305 */ 2306 KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]); 2307 m = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2308 if (vm_page_unwire_quick(m)) { 2309 panic("pmap_allocpte_seg: " 2310 "bad wire count %p", 2311 m); 2312 } 2313 } 2314 2315 /* 2316 * The existing process page table was replaced and must be destroyed 2317 * here. 2318 */ 2319 if (proc_pd_pv) 2320 pv_put(proc_pd_pv); 2321 if (pvpp) 2322 *pvpp = pt_pv; 2323 else 2324 pv_put(pt_pv); 2325 2326 return (pte_pv); 2327 } 2328 2329 /* 2330 * Release any resources held by the given physical map. 2331 * 2332 * Called when a pmap initialized by pmap_pinit is being released. Should 2333 * only be called if the map contains no valid mappings. 
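 *
 * Editor's sketch below (compiled out): the release/uninit/free
 * ordering used by the obpmap race handling in pmap_allocpte_seg()
 * above; illustrative only, not part of the original code.
 */
#if 0
static void
example_objpmap_destroy(pmap_t obpmap)
{
	pmap_release(obpmap);		/* empty the pmap of all pvs */
	pmap_puninit(obpmap);		/* free the pml4 page and its pv */
	kfree(obpmap, M_OBJPMAP);	/* storage came from kmalloc() */
}
#endif
/*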
2334 * 2335 * Caller must hold pmap->pm_token 2336 */ 2337 struct pmap_release_info { 2338 pmap_t pmap; 2339 int retry; 2340 }; 2341 2342 static int pmap_release_callback(pv_entry_t pv, void *data); 2343 2344 void 2345 pmap_release(struct pmap *pmap) 2346 { 2347 struct pmap_release_info info; 2348 2349 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2350 ("pmap still active! %016jx", 2351 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2352 2353 spin_lock(&pmap_spin); 2354 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); 2355 spin_unlock(&pmap_spin); 2356 2357 /* 2358 * Pull pv's off the RB tree in order from low to high and release 2359 * each page. 2360 */ 2361 info.pmap = pmap; 2362 do { 2363 info.retry = 0; 2364 spin_lock(&pmap->pm_spin); 2365 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2366 pmap_release_callback, &info); 2367 spin_unlock(&pmap->pm_spin); 2368 } while (info.retry); 2369 2370 2371 /* 2372 * One resident page (the pml4 page) should remain. 2373 * No wired pages should remain. 2374 */ 2375 KKASSERT(pmap->pm_stats.resident_count == 2376 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1)); 2377 2378 KKASSERT(pmap->pm_stats.wired_count == 0); 2379 } 2380 2381 static int 2382 pmap_release_callback(pv_entry_t pv, void *data) 2383 { 2384 struct pmap_release_info *info = data; 2385 pmap_t pmap = info->pmap; 2386 int r; 2387 2388 if (pv_hold_try(pv)) { 2389 spin_unlock(&pmap->pm_spin); 2390 } else { 2391 spin_unlock(&pmap->pm_spin); 2392 pv_lock(pv); 2393 } 2394 if (pv->pv_pmap != pmap) { 2395 pv_put(pv); 2396 spin_lock(&pmap->pm_spin); 2397 info->retry = 1; 2398 return(-1); 2399 } 2400 r = pmap_release_pv(pv, NULL, NULL); 2401 spin_lock(&pmap->pm_spin); 2402 return(r); 2403 } 2404 2405 /* 2406 * Called with held (i.e. also locked) pv. This function will dispose of 2407 * the lock along with the pv. 2408 * 2409 * If the caller already holds the locked parent page table for pv it 2410 * must pass it as pvp, allowing us to avoid a deadlock, else it can 2411 * pass NULL for pvp. 2412 */ 2413 static int 2414 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2415 { 2416 vm_page_t p; 2417 2418 /* 2419 * The pmap is currently not spinlocked, pv is held+locked. 2420 * Remove the pv's page from its parent's page table. The 2421 * parent's page table page's wire_count will be decremented. 2422 * 2423 * This will clean out the pte at any level of the page table. 2424 * If smp != 0 all cpus are affected. 2425 */ 2426 pmap_remove_pv_pte(pv, pvp, bulk); 2427 2428 /* 2429 * Terminal pvs are unhooked from their vm_pages. Because 2430 * terminal pages aren't page table pages they aren't wired 2431 * by us, so we have to be sure not to unwire them either. 2432 */ 2433 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2434 pmap_remove_pv_page(pv); 2435 goto skip; 2436 } 2437 2438 /* 2439 * We leave the top-level page table page cached, wired, and 2440 * mapped in the pmap until the dtor function (pmap_puninit()) 2441 * gets called. 2442 * 2443 * Since we are leaving the top-level pv intact we need 2444 * to break out of what would otherwise be an infinite loop. 2445 */ 2446 if (pv->pv_pindex == pmap_pml4_pindex()) { 2447 pv_put(pv); 2448 return(-1); 2449 } 2450 2451 /* 2452 * For page table pages (other than the top-level page), 2453 * remove and free the vm_page. The representative mapping 2454 * removed above by pmap_remove_pv_pte() did not undo the 2455 * last wire_count so we have to do that as well.
2456 */ 2457 p = pmap_remove_pv_page(pv); 2458 vm_page_busy_wait(p, FALSE, "pmaprl"); 2459 if (p->wire_count != 1) { 2460 kprintf("p->wire_count was %016lx %d\n", 2461 pv->pv_pindex, p->wire_count); 2462 } 2463 KKASSERT(p->wire_count == 1); 2464 KKASSERT(p->flags & PG_UNMANAGED); 2465 2466 vm_page_unwire(p, 0); 2467 KKASSERT(p->wire_count == 0); 2468 2469 vm_page_free(p); 2470 skip: 2471 pv_free(pv); 2472 return 0; 2473 } 2474 2475 /* 2476 * This function will remove the pte associated with a pv from its parent. 2477 * Terminal pv's are supported. All cpus are affected if smp != 0. 2478 * 2479 * The wire count will be dropped on the parent page table. The wire 2480 * count on the page being removed (pv->pv_m) from the parent page table 2481 * is NOT touched. Note that terminal pages will not have any additional 2482 * wire counts while page table pages will have at least one representing 2483 * the mapping, plus others representing sub-mappings. 2484 * 2485 * NOTE: Cannot be called on kernel page table pages, only KVM terminal 2486 * pages and user page table and terminal pages. 2487 * 2488 * The pv must be locked. 2489 * 2490 * XXX must lock parent pv's if they exist to remove pte XXX 2491 */ 2492 static 2493 void 2494 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2495 { 2496 vm_pindex_t ptepindex = pv->pv_pindex; 2497 pmap_t pmap = pv->pv_pmap; 2498 vm_page_t p; 2499 int gotpvp = 0; 2500 2501 KKASSERT(pmap); 2502 2503 if (ptepindex == pmap_pml4_pindex()) { 2504 /* 2505 * We are the top level pml4 table, there is no parent. 2506 */ 2507 p = pmap->pm_pmlpv->pv_m; 2508 } else if (ptepindex >= pmap_pdp_pindex(0)) { 2509 /* 2510 * Remove a PDP page from the pml4e. This can only occur 2511 * with user page tables. We do not have to lock the 2512 * pml4 PV so just ignore pvp. 2513 */ 2514 vm_pindex_t pml4_pindex; 2515 vm_pindex_t pdp_index; 2516 pml4_entry_t *pdp; 2517 2518 pdp_index = ptepindex - pmap_pdp_pindex(0); 2519 if (pvp == NULL) { 2520 pml4_pindex = pmap_pml4_pindex(); 2521 pvp = pv_get(pv->pv_pmap, pml4_pindex); 2522 KKASSERT(pvp); 2523 gotpvp = 1; 2524 } 2525 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; 2526 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0); 2527 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2528 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0); 2529 } else if (ptepindex >= pmap_pd_pindex(0)) { 2530 /* 2531 * Remove a PD page from the pdp 2532 * 2533 * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case 2534 * of a simple pmap because it stops at 2535 * the PD page.
2536 */ 2537 vm_pindex_t pdp_pindex; 2538 vm_pindex_t pd_index; 2539 pdp_entry_t *pd; 2540 2541 pd_index = ptepindex - pmap_pd_pindex(0); 2542 2543 if (pvp == NULL) { 2544 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 2545 (pd_index >> NPML4EPGSHIFT); 2546 pvp = pv_get(pv->pv_pmap, pdp_pindex); 2547 if (pvp) 2548 gotpvp = 1; 2549 } 2550 if (pvp) { 2551 pd = pv_pte_lookup(pvp, pd_index & 2552 ((1ul << NPDPEPGSHIFT) - 1)); 2553 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); 2554 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2555 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); 2556 } else { 2557 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 2558 p = pv->pv_m; /* degenerate test later */ 2559 } 2560 } else if (ptepindex >= pmap_pt_pindex(0)) { 2561 /* 2562 * Remove a PT page from the pd 2563 */ 2564 vm_pindex_t pd_pindex; 2565 vm_pindex_t pt_index; 2566 pd_entry_t *pt; 2567 2568 pt_index = ptepindex - pmap_pt_pindex(0); 2569 2570 if (pvp == NULL) { 2571 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 2572 (pt_index >> NPDPEPGSHIFT); 2573 pvp = pv_get(pv->pv_pmap, pd_pindex); 2574 KKASSERT(pvp); 2575 gotpvp = 1; 2576 } 2577 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 2578 KKASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0); 2579 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 2580 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); 2581 } else { 2582 /* 2583 * Remove a PTE from the PT page 2584 * 2585 * NOTE: pv's must be locked bottom-up to avoid deadlocking. 2586 * pv is a pte_pv so we can safely lock pt_pv. 2587 * 2588 * NOTE: FICTITIOUS pages may have multiple physical mappings 2589 * so PHYS_TO_VM_PAGE() will not necessarily work for 2590 * terminal ptes. 2591 */ 2592 vm_pindex_t pt_pindex; 2593 pt_entry_t *ptep; 2594 pt_entry_t pte; 2595 vm_offset_t va; 2596 2597 pt_pindex = ptepindex >> NPTEPGSHIFT; 2598 va = (vm_offset_t)ptepindex << PAGE_SHIFT; 2599 2600 if (ptepindex >= NUPTE_USER) { 2601 ptep = vtopte(ptepindex << PAGE_SHIFT); 2602 KKASSERT(pvp == NULL); 2603 } else { 2604 if (pvp == NULL) { 2605 pt_pindex = NUPTE_TOTAL + 2606 (ptepindex >> NPDPEPGSHIFT); 2607 pvp = pv_get(pv->pv_pmap, pt_pindex); 2608 KKASSERT(pvp); 2609 gotpvp = 1; 2610 } 2611 ptep = pv_pte_lookup(pvp, ptepindex & 2612 ((1ul << NPDPEPGSHIFT) - 1)); 2613 } 2614 pte = pmap_inval_bulk(bulk, va, ptep, 0); 2615 if (bulk == NULL) /* XXX */ 2616 cpu_invlpg((void *)va); /* XXX */ 2617 2618 /* 2619 * Now update the vm_page_t 2620 */ 2621 if ((pte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) != 2622 (pmap->pmap_bits[PG_MANAGED_IDX]|pmap->pmap_bits[PG_V_IDX])) { 2623 kprintf("remove_pte badpte %016lx %016lx %d\n", 2624 pte, pv->pv_pindex, 2625 pv->pv_pindex < pmap_pt_pindex(0)); 2626 } 2627 /* PHYS_TO_VM_PAGE() will not work for FICTITIOUS pages */ 2628 /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/ 2629 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) 2630 p = pv->pv_m; 2631 else 2632 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2633 /* p = pv->pv_m; */ 2634 2635 if (pte & pmap->pmap_bits[PG_M_IDX]) { 2636 if (pmap_track_modified(ptepindex)) 2637 vm_page_dirty(p); 2638 } 2639 if (pte & pmap->pmap_bits[PG_A_IDX]) { 2640 vm_page_flag_set(p, PG_REFERENCED); 2641 } 2642 if (pte & pmap->pmap_bits[PG_W_IDX]) 2643 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2644 if (pte & pmap->pmap_bits[PG_G_IDX]) 2645 cpu_invlpg((void *)va); 2646 } 2647 2648 /* 2649 * Unwire the parent page table page. The wire_count cannot go below 2650 * 1 here because the parent page table page is itself still mapped. 
2651 * 2652 * XXX remove the assertions later. 2653 */ 2654 KKASSERT(pv->pv_m == p); 2655 if (pvp && vm_page_unwire_quick(pvp->pv_m)) 2656 panic("pmap_remove_pv_pte: Insufficient wire_count"); 2657 2658 if (gotpvp) 2659 pv_put(pvp); 2660 } 2661 2662 /* 2663 * Remove the vm_page association to a pv. The pv must be locked. 2664 */ 2665 static 2666 vm_page_t 2667 pmap_remove_pv_page(pv_entry_t pv) 2668 { 2669 vm_page_t m; 2670 2671 m = pv->pv_m; 2672 KKASSERT(m); 2673 vm_page_spin_lock(m); 2674 pv->pv_m = NULL; 2675 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2676 pmap_page_stats_deleting(m); 2677 /* 2678 if (m->object) 2679 atomic_add_int(&m->object->agg_pv_list_count, -1); 2680 */ 2681 if (TAILQ_EMPTY(&m->md.pv_list)) 2682 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2683 vm_page_spin_unlock(m); 2684 return(m); 2685 } 2686 2687 /* 2688 * Grow the number of kernel page table entries, if needed. 2689 * 2690 * This routine is always called to validate any address space 2691 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 2692 * space below KERNBASE. 2693 */ 2694 void 2695 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 2696 { 2697 vm_paddr_t paddr; 2698 vm_offset_t ptppaddr; 2699 vm_page_t nkpg; 2700 pd_entry_t *pt, newpt; 2701 pdp_entry_t newpd; 2702 int update_kernel_vm_end; 2703 2704 /* 2705 * bootstrap kernel_vm_end on first real VM use 2706 */ 2707 if (kernel_vm_end == 0) { 2708 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 2709 nkpt = 0; 2710 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 2711 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 2712 ~(PAGE_SIZE * NPTEPG - 1); 2713 nkpt++; 2714 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 2715 kernel_vm_end = kernel_map.max_offset; 2716 break; 2717 } 2718 } 2719 } 2720 2721 /* 2722 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 2723 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 2724 * do not want to force-fill 128G worth of page tables. 
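 *
 * Editor's worked example (illustrative): on x86-64, PAGE_SIZE *
 * NPTEPG is 4096 * 512 = 2MB, the reach of a single PT page, so
 * the rounddown2()/roundup2() below widen [kstart, kend) to whole
 * 2MB units and each loop iteration below populates one PT page.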
2725 */ 2726 if (kstart < KERNBASE) { 2727 if (kstart > kernel_vm_end) 2728 kstart = kernel_vm_end; 2729 KKASSERT(kend <= KERNBASE); 2730 update_kernel_vm_end = 1; 2731 } else { 2732 update_kernel_vm_end = 0; 2733 } 2734 2735 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 2736 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 2737 2738 if (kend - 1 >= kernel_map.max_offset) 2739 kend = kernel_map.max_offset; 2740 2741 while (kstart < kend) { 2742 pt = pmap_pt(&kernel_pmap, kstart); 2743 if (pt == NULL) { 2744 /* We need a new PDP entry */ 2745 nkpg = vm_page_alloc(NULL, nkpt, 2746 VM_ALLOC_NORMAL | 2747 VM_ALLOC_SYSTEM | 2748 VM_ALLOC_INTERRUPT); 2749 if (nkpg == NULL) { 2750 panic("pmap_growkernel: no memory to grow " 2751 "kernel"); 2752 } 2753 paddr = VM_PAGE_TO_PHYS(nkpg); 2754 pmap_zero_page(paddr); 2755 newpd = (pdp_entry_t) 2756 (paddr | 2757 kernel_pmap.pmap_bits[PG_V_IDX] | 2758 kernel_pmap.pmap_bits[PG_RW_IDX] | 2759 kernel_pmap.pmap_bits[PG_A_IDX] | 2760 kernel_pmap.pmap_bits[PG_M_IDX]); 2761 *pmap_pd(&kernel_pmap, kstart) = newpd; 2762 nkpt++; 2763 continue; /* try again */ 2764 } 2765 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 2766 kstart = (kstart + PAGE_SIZE * NPTEPG) & 2767 ~(PAGE_SIZE * NPTEPG - 1); 2768 if (kstart - 1 >= kernel_map.max_offset) { 2769 kstart = kernel_map.max_offset; 2770 break; 2771 } 2772 continue; 2773 } 2774 2775 /* 2776 * This index is bogus, but out of the way 2777 */ 2778 nkpg = vm_page_alloc(NULL, nkpt, 2779 VM_ALLOC_NORMAL | 2780 VM_ALLOC_SYSTEM | 2781 VM_ALLOC_INTERRUPT); 2782 if (nkpg == NULL) 2783 panic("pmap_growkernel: no memory to grow kernel"); 2784 2785 vm_page_wire(nkpg); 2786 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2787 pmap_zero_page(ptppaddr); 2788 newpt = (pd_entry_t) (ptppaddr | 2789 kernel_pmap.pmap_bits[PG_V_IDX] | 2790 kernel_pmap.pmap_bits[PG_RW_IDX] | 2791 kernel_pmap.pmap_bits[PG_A_IDX] | 2792 kernel_pmap.pmap_bits[PG_M_IDX]); 2793 *pmap_pt(&kernel_pmap, kstart) = newpt; 2794 nkpt++; 2795 2796 kstart = (kstart + PAGE_SIZE * NPTEPG) & 2797 ~(PAGE_SIZE * NPTEPG - 1); 2798 2799 if (kstart - 1 >= kernel_map.max_offset) { 2800 kstart = kernel_map.max_offset; 2801 break; 2802 } 2803 } 2804 2805 /* 2806 * Only update kernel_vm_end for areas below KERNBASE. 2807 */ 2808 if (update_kernel_vm_end && kernel_vm_end < kstart) 2809 kernel_vm_end = kstart; 2810 } 2811 2812 /* 2813 * Add a reference to the specified pmap. 2814 */ 2815 void 2816 pmap_reference(pmap_t pmap) 2817 { 2818 if (pmap != NULL) { 2819 lwkt_gettoken(&pmap->pm_token); 2820 ++pmap->pm_count; 2821 lwkt_reltoken(&pmap->pm_token); 2822 } 2823 } 2824 2825 /*************************************************** 2826 * page management routines. 2827 ***************************************************/ 2828 2829 /* 2830 * Hold a pv without locking it 2831 */ 2832 static void 2833 pv_hold(pv_entry_t pv) 2834 { 2835 atomic_add_int(&pv->pv_hold, 1); 2836 } 2837 2838 /* 2839 * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv 2840 * was successfully locked, FALSE if it wasn't. The caller must dispose of 2841 * the pv properly. 2842 * 2843 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 2844 * pv list via its page) must be held by the caller. 2845 */ 2846 static int 2847 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 2848 { 2849 u_int count; 2850 2851 /* 2852 * Critical path shortcut expects pv to already have one ref 2853 * (for the pv->pv_pmap). 
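 *
 * Editor's note (illustrative): pv_hold packs a ref count in
 * PV_HOLD_MASK together with the PV_HOLD_LOCKED/PV_HOLD_WAITING
 * flag bits, so the uncontended transition below is
 * 1 -> (PV_HOLD_LOCKED | 2): one ref for the pmap association,
 * one for the caller, with the lock bit set.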
2854 */ 2855 if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) { 2856 #ifdef PMAP_DEBUG 2857 pv->pv_func = func; 2858 pv->pv_line = lineno; 2859 #endif 2860 return TRUE; 2861 } 2862 2863 for (;;) { 2864 count = pv->pv_hold; 2865 cpu_ccfence(); 2866 if ((count & PV_HOLD_LOCKED) == 0) { 2867 if (atomic_cmpset_int(&pv->pv_hold, count, 2868 (count + 1) | PV_HOLD_LOCKED)) { 2869 #ifdef PMAP_DEBUG 2870 pv->pv_func = func; 2871 pv->pv_line = lineno; 2872 #endif 2873 return TRUE; 2874 } 2875 } else { 2876 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) 2877 return FALSE; 2878 } 2879 /* retry */ 2880 } 2881 } 2882 2883 /* 2884 * Drop a previously held pv_entry which could not be locked, allowing its 2885 * destruction. 2886 * 2887 * Must not be called with a spinlock held as we might zfree() the pv if it 2888 * is no longer associated with a pmap and this was the last hold count. 2889 */ 2890 static void 2891 pv_drop(pv_entry_t pv) 2892 { 2893 u_int count; 2894 2895 for (;;) { 2896 count = pv->pv_hold; 2897 cpu_ccfence(); 2898 KKASSERT((count & PV_HOLD_MASK) > 0); 2899 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 2900 (PV_HOLD_LOCKED | 1)); 2901 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 2902 if ((count & PV_HOLD_MASK) == 1) { 2903 #ifdef PMAP_DEBUG2 2904 if (pmap_enter_debug > 0) { 2905 --pmap_enter_debug; 2906 kprintf("pv_drop: free pv %p\n", pv); 2907 } 2908 #endif 2909 KKASSERT(count == 1); 2910 KKASSERT(pv->pv_pmap == NULL); 2911 zfree(pvzone, pv); 2912 } 2913 return; 2914 } 2915 /* retry */ 2916 } 2917 } 2918 2919 /* 2920 * Find or allocate the requested PV entry, returning a locked, held pv. 2921 * 2922 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 2923 * for the caller and one representing the pmap and vm_page association. 2924 * 2925 * If (*isnew) is zero, the returned pv will have only one hold count. 2926 * 2927 * Since both associations can only be adjusted while the pv is locked, 2928 * together they represent just one additional hold. 
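 *
 * Editor's sketch below (compiled out): the caller-side pattern,
 * mirroring pmap_allocpte() above; illustrative only.
 */
#if 0
static void
example_pv_alloc_usage(pmap_t pmap, vm_pindex_t pindex)
{
	pv_entry_t pv;
	int isnew;

	pv = pv_alloc(pmap, pindex, &isnew);	/* returns locked+held */
	if (isnew) {
		/* first allocation: set up pv->pv_m, wire parent, etc. */
	}
	pv_put(pv);				/* unlock and drop */
}
#endif
/*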
2929 */ 2930 static 2931 pv_entry_t 2932 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 2933 { 2934 pv_entry_t pv; 2935 pv_entry_t pnew = NULL; 2936 2937 spin_lock(&pmap->pm_spin); 2938 for (;;) { 2939 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { 2940 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 2941 pindex); 2942 } 2943 if (pv == NULL) { 2944 if (pnew == NULL) { 2945 spin_unlock(&pmap->pm_spin); 2946 pnew = zalloc(pvzone); 2947 spin_lock(&pmap->pm_spin); 2948 continue; 2949 } 2950 pnew->pv_pmap = pmap; 2951 pnew->pv_pindex = pindex; 2952 pnew->pv_hold = PV_HOLD_LOCKED | 2; 2953 #ifdef PMAP_DEBUG 2954 pnew->pv_func = func; 2955 pnew->pv_line = lineno; 2956 #endif 2957 pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 2958 ++pmap->pm_generation; 2959 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2960 spin_unlock(&pmap->pm_spin); 2961 *isnew = 1; 2962 return(pnew); 2963 } 2964 if (pnew) { 2965 spin_unlock(&pmap->pm_spin); 2966 zfree(pvzone, pnew); 2967 pnew = NULL; 2968 spin_lock(&pmap->pm_spin); 2969 continue; 2970 } 2971 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 2972 spin_unlock(&pmap->pm_spin); 2973 } else { 2974 spin_unlock(&pmap->pm_spin); 2975 _pv_lock(pv PMAP_DEBUG_COPY); 2976 } 2977 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) { 2978 *isnew = 0; 2979 return(pv); 2980 } 2981 pv_put(pv); 2982 spin_lock(&pmap->pm_spin); 2983 } 2984 } 2985 2986 /* 2987 * Find the requested PV entry, returning a locked+held pv or NULL 2988 */ 2989 static 2990 pv_entry_t 2991 _pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL) 2992 { 2993 pv_entry_t pv; 2994 2995 spin_lock(&pmap->pm_spin); 2996 for (;;) { 2997 /* 2998 * Shortcut cache 2999 */ 3000 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { 3001 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 3002 pindex); 3003 } 3004 if (pv == NULL) { 3005 spin_unlock(&pmap->pm_spin); 3006 return NULL; 3007 } 3008 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3009 spin_unlock(&pmap->pm_spin); 3010 } else { 3011 spin_unlock(&pmap->pm_spin); 3012 _pv_lock(pv PMAP_DEBUG_COPY); 3013 } 3014 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) { 3015 pv_cache(pv, pindex); 3016 return(pv); 3017 } 3018 pv_put(pv); 3019 spin_lock(&pmap->pm_spin); 3020 } 3021 } 3022 3023 /* 3024 * Lookup, hold, and attempt to lock (pmap,pindex). 3025 * 3026 * If the entry does not exist NULL is returned and *errorp is set to 0 3027 * 3028 * If the entry exists and could be successfully locked it is returned and 3029 * errorp is set to 0. 3030 * 3031 * If the entry exists but could NOT be successfully locked it is returned 3032 * held and *errorp is set to 1. 
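 *
 * Editor's sketch (illustrative): when *errorp is 1 callers back out
 * of their higher-level locks before blocking, e.g.:
 *
 *	pte_pv = pv_get_try(pmap, pindex, &error);
 *	if (error) {
 *		pv_put(pt_pv);		(drop conflicting locks first)
 *		pv_lock(pte_pv);	(now safe to block)
 *		pv_put(pte_pv);
 *		(relookup and retry)
 *	}
 *
 * as done in pmap_scan_callback() below.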
3033 */ 3034 static 3035 pv_entry_t 3036 pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp) 3037 { 3038 pv_entry_t pv; 3039 3040 spin_lock_shared(&pmap->pm_spin); 3041 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) 3042 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 3043 if (pv == NULL) { 3044 spin_unlock_shared(&pmap->pm_spin); 3045 *errorp = 0; 3046 return NULL; 3047 } 3048 if (pv_hold_try(pv)) { 3049 pv_cache(pv, pindex); 3050 spin_unlock_shared(&pmap->pm_spin); 3051 *errorp = 0; 3052 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3053 return(pv); /* lock succeeded */ 3054 } 3055 spin_unlock_shared(&pmap->pm_spin); 3056 *errorp = 1; 3057 return (pv); /* lock failed */ 3058 } 3059 3060 /* 3061 * Find the requested PV entry, returning a held pv or NULL 3062 */ 3063 static 3064 pv_entry_t 3065 pv_find(pmap_t pmap, vm_pindex_t pindex) 3066 { 3067 pv_entry_t pv; 3068 3069 spin_lock_shared(&pmap->pm_spin); 3070 3071 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) 3072 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 3073 if (pv == NULL) { 3074 spin_unlock_shared(&pmap->pm_spin); 3075 return NULL; 3076 } 3077 pv_hold(pv); 3078 pv_cache(pv, pindex); 3079 spin_unlock_shared(&pmap->pm_spin); 3080 return(pv); 3081 } 3082 3083 /* 3084 * Lock a held pv, keeping the hold count 3085 */ 3086 static 3087 void 3088 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3089 { 3090 u_int count; 3091 3092 for (;;) { 3093 count = pv->pv_hold; 3094 cpu_ccfence(); 3095 if ((count & PV_HOLD_LOCKED) == 0) { 3096 if (atomic_cmpset_int(&pv->pv_hold, count, 3097 count | PV_HOLD_LOCKED)) { 3098 #ifdef PMAP_DEBUG 3099 pv->pv_func = func; 3100 pv->pv_line = lineno; 3101 #endif 3102 return; 3103 } 3104 continue; 3105 } 3106 tsleep_interlock(pv, 0); 3107 if (atomic_cmpset_int(&pv->pv_hold, count, 3108 count | PV_HOLD_WAITING)) { 3109 #ifdef PMAP_DEBUG 3110 kprintf("pv waiting on %s:%d\n", 3111 pv->pv_func, pv->pv_line); 3112 #endif 3113 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3114 } 3115 /* retry */ 3116 } 3117 } 3118 3119 /* 3120 * Unlock a held and locked pv, keeping the hold count. 3121 */ 3122 static 3123 void 3124 pv_unlock(pv_entry_t pv) 3125 { 3126 u_int count; 3127 3128 for (;;) { 3129 count = pv->pv_hold; 3130 cpu_ccfence(); 3131 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3132 (PV_HOLD_LOCKED | 1)); 3133 if (atomic_cmpset_int(&pv->pv_hold, count, 3134 count & 3135 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3136 if (count & PV_HOLD_WAITING) 3137 wakeup(pv); 3138 break; 3139 } 3140 } 3141 } 3142 3143 /* 3144 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3145 * and the hold count drops to zero we will free it. 3146 * 3147 * Caller should not hold any spin locks. We are protected from hold races 3148 * by virtue of holds occurring only with a pmap_spin or vm_page_spin 3149 * lock held. A pv cannot be located otherwise. 3150 */ 3151 static 3152 void 3153 pv_put(pv_entry_t pv) 3154 { 3155 #ifdef PMAP_DEBUG2 3156 if (pmap_enter_debug > 0) { 3157 --pmap_enter_debug; 3158 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3159 } 3160 #endif 3161 3162 /* 3163 * Fast - shortcut most common condition 3164 */ 3165 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3166 return; 3167 3168 /* 3169 * Slow 3170 */ 3171 pv_unlock(pv); 3172 pv_drop(pv); 3173 } 3174 3175 /* 3176 * Remove the pmap association from a pv, require that pv_m already be removed, 3177 * then unlock and drop the pv.
Any pte operations must have already been 3178 * completed. This call may result in a last-drop which will physically free 3179 * the pv. 3180 * 3181 * Removing the pmap association entails an additional drop. 3182 * 3183 * pv must be exclusively locked on call and will be disposed of on return. 3184 */ 3185 static 3186 void 3187 pv_free(pv_entry_t pv) 3188 { 3189 pmap_t pmap; 3190 3191 KKASSERT(pv->pv_m == NULL); 3192 KKASSERT((pv->pv_hold & PV_HOLD_MASK) >= 2); 3193 if ((pmap = pv->pv_pmap) != NULL) { 3194 spin_lock(&pmap->pm_spin); 3195 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 3196 ++pmap->pm_generation; 3197 if (pmap->pm_pvhint == pv) 3198 pmap->pm_pvhint = NULL; 3199 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3200 pv->pv_pmap = NULL; 3201 pv->pv_pindex = 0; 3202 spin_unlock(&pmap->pm_spin); 3203 3204 /* 3205 * Try to shortcut three atomic ops, otherwise fall through 3206 * and do it normally. Drop two refs and the lock all in 3207 * one go. 3208 */ 3209 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { 3210 #ifdef PMAP_DEBUG2 3211 if (pmap_enter_debug > 0) { 3212 --pmap_enter_debug; 3213 kprintf("pv_free: free pv %p\n", pv); 3214 } 3215 #endif 3216 zfree(pvzone, pv); 3217 return; 3218 } 3219 pv_drop(pv); /* ref for pv_pmap */ 3220 } 3221 pv_put(pv); 3222 } 3223 3224 /* 3225 * This routine is very drastic, but can save the system 3226 * in a pinch. 3227 */ 3228 void 3229 pmap_collect(void) 3230 { 3231 int i; 3232 vm_page_t m; 3233 static int warningdone=0; 3234 3235 if (pmap_pagedaemon_waken == 0) 3236 return; 3237 pmap_pagedaemon_waken = 0; 3238 if (warningdone < 5) { 3239 kprintf("pmap_collect: collecting pv entries -- " 3240 "suggest increasing PMAP_SHPGPERPROC\n"); 3241 warningdone++; 3242 } 3243 3244 for (i = 0; i < vm_page_array_size; i++) { 3245 m = &vm_page_array[i]; 3246 if (m->wire_count || m->hold_count) 3247 continue; 3248 if (vm_page_busy_try(m, TRUE) == 0) { 3249 if (m->wire_count == 0 && m->hold_count == 0) { 3250 pmap_remove_all(m); 3251 } 3252 vm_page_wakeup(m); 3253 } 3254 } 3255 } 3256 3257 /* 3258 * Scan the pmap for active page table entries and issue a callback. 3259 * The callback must dispose of pte_pv, whose PTE entry is at *ptep in 3260 * its parent page table. 3261 * 3262 * pte_pv will be NULL if the page or page table is unmanaged. 3263 * pt_pv will point to the page table page containing the pte for the page. 3264 * 3265 * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page), 3266 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed 3267 * process pmap's PD and page to the callback function. This can be 3268 * confusing because the pt_pv is really a pd_pv, and the target page 3269 * table page is simply aliased by the pmap and not owned by it. 3270 * 3271 * It is assumed that the start and end are properly rounded to the page size. 3272 * 3273 * It is assumed that PD pages and above are managed and thus in the RB tree, 3274 * allowing us to use RB_SCAN from the PD pages down for ranged scans.
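 *
 * Editor's summary (illustrative) of the (pte_pv, pt_pv, sharept)
 * combinations handed to the callback:
 *
 *	pte_pv != NULL			managed page; pt_pv is its PT
 *	pte_pv == NULL, sharept == 0	unmanaged page; pt_pv is its PT
 *	pte_pv == NULL, sharept == 1	unmanaged/shared page TABLE;
 *					"pt_pv" is really the pd_pv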
3275 */ 3276 struct pmap_scan_info { 3277 struct pmap *pmap; 3278 vm_offset_t sva; 3279 vm_offset_t eva; 3280 vm_pindex_t sva_pd_pindex; 3281 vm_pindex_t eva_pd_pindex; 3282 void (*func)(pmap_t, struct pmap_scan_info *, 3283 pv_entry_t, pv_entry_t, int, vm_offset_t, 3284 pt_entry_t *, void *); 3285 void *arg; 3286 pmap_inval_bulk_t bulk_core; 3287 pmap_inval_bulk_t *bulk; 3288 int count; 3289 }; 3290 3291 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3292 static int pmap_scan_callback(pv_entry_t pv, void *data); 3293 3294 static void 3295 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3296 { 3297 struct pmap *pmap = info->pmap; 3298 pv_entry_t pd_pv; /* A page directory PV */ 3299 pv_entry_t pt_pv; /* A page table PV */ 3300 pv_entry_t pte_pv; /* A page table entry PV */ 3301 pt_entry_t *ptep; 3302 pt_entry_t oldpte; 3303 struct pv_entry dummy_pv; 3304 int generation; 3305 3306 if (pmap == NULL) 3307 return; 3308 if (smp_inval) { 3309 info->bulk = &info->bulk_core; 3310 pmap_inval_bulk_init(&info->bulk_core, pmap); 3311 } else { 3312 info->bulk = NULL; 3313 } 3314 3315 /* 3316 * Hold the token for stability; if the pmap is empty we have nothing 3317 * to do. 3318 */ 3319 lwkt_gettoken(&pmap->pm_token); 3320 #if 0 3321 if (pmap->pm_stats.resident_count == 0) { 3322 lwkt_reltoken(&pmap->pm_token); 3323 return; 3324 } 3325 #endif 3326 3327 info->count = 0; 3328 3329 again: 3330 /* 3331 * Special handling for scanning one page, which is a very common 3332 * operation (it is?). 3333 * 3334 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3335 */ 3336 if (info->sva + PAGE_SIZE == info->eva) { 3337 generation = pmap->pm_generation; 3338 if (info->sva >= VM_MAX_USER_ADDRESS) { 3339 /* 3340 * Kernel mappings do not track wire counts on 3341 * page table pages and only maintain pd_pv and 3342 * pte_pv levels so pmap_scan() works. 3343 */ 3344 pt_pv = NULL; 3345 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva)); 3346 ptep = vtopte(info->sva); 3347 } else { 3348 /* 3349 * User pages which are unmanaged will not have a 3350 * pte_pv. User page table pages which are unmanaged 3351 * (shared from elsewhere) will also not have a pt_pv. 3352 * The func() callback will pass both pte_pv and pt_pv 3353 * as NULL in that case. 3354 */ 3355 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva)); 3356 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva)); 3357 if (pt_pv == NULL) { 3358 KKASSERT(pte_pv == NULL); 3359 pd_pv = pv_get(pmap, pmap_pd_pindex(info->sva)); 3360 if (pd_pv) { 3361 ptep = pv_pte_lookup(pd_pv, 3362 pmap_pt_index(info->sva)); 3363 if (*ptep) { 3364 info->func(pmap, info, 3365 NULL, pd_pv, 1, 3366 info->sva, ptep, 3367 info->arg); 3368 } 3369 pv_put(pd_pv); 3370 } 3371 goto fast_skip; 3372 } 3373 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 3374 } 3375 3376 /* 3377 * NOTE: *ptep can't be ripped out from under us if we hold 3378 * pte_pv locked, but bits can change. However, there is 3379 * a race where another thread may be inserting pte_pv 3380 * and setting *ptep just after our pte_pv lookup fails. 3381 * 3382 * In this situation we can end up with a NULL pte_pv 3383 * but find that we have a managed *ptep. We explicitly 3384 * check for this race. 3385 */ 3386 oldpte = *ptep; 3387 cpu_ccfence(); 3388 if (oldpte == 0) { 3389 /* 3390 * Unlike the pv_find() case below we actually 3391 * acquired a locked pv in this case so any 3392 * race should have been resolved. It is expected 3393 * to not exist. 
3394 */ 3395 KKASSERT(pte_pv == NULL); 3396 } else if (pte_pv) { 3397 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3398 pmap->pmap_bits[PG_V_IDX])) == 3399 (pmap->pmap_bits[PG_MANAGED_IDX] | 3400 pmap->pmap_bits[PG_V_IDX]), 3401 ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p" 3402 "generation %d/%d", 3403 *ptep, oldpte, info->sva, pte_pv, 3404 generation, pmap->pm_generation)); 3405 info->func(pmap, info, pte_pv, pt_pv, 0, 3406 info->sva, ptep, info->arg); 3407 } else { 3408 /* 3409 * Check for insertion race 3410 */ 3411 if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) && 3412 pt_pv) { 3413 pte_pv = pv_find(pmap, 3414 pmap_pte_pindex(info->sva)); 3415 if (pte_pv) { 3416 pv_drop(pte_pv); 3417 pv_put(pt_pv); 3418 kprintf("pmap_scan: RACE1 " 3419 "%016jx, %016lx\n", 3420 info->sva, oldpte); 3421 goto again; 3422 } 3423 } 3424 3425 /* 3426 * Didn't race 3427 */ 3428 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3429 pmap->pmap_bits[PG_V_IDX])) == 3430 pmap->pmap_bits[PG_V_IDX], 3431 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL" 3432 "generation %d/%d", 3433 *ptep, oldpte, info->sva, 3434 generation, pmap->pm_generation)); 3435 info->func(pmap, info, NULL, pt_pv, 0, 3436 info->sva, ptep, info->arg); 3437 } 3438 if (pt_pv) 3439 pv_put(pt_pv); 3440 fast_skip: 3441 pmap_inval_bulk_flush(info->bulk); 3442 lwkt_reltoken(&pmap->pm_token); 3443 return; 3444 } 3445 3446 /* 3447 * Nominal scan case, RB_SCAN() for PD pages and iterate from 3448 * there. 3449 */ 3450 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 3451 info->eva_pd_pindex = pmap_pd_pindex(info->eva + NBPDP - 1); 3452 3453 if (info->sva >= VM_MAX_USER_ADDRESS) { 3454 /* 3455 * The kernel does not currently maintain any pv_entry's for 3456 * higher-level page tables. 3457 */ 3458 bzero(&dummy_pv, sizeof(dummy_pv)); 3459 dummy_pv.pv_pindex = info->sva_pd_pindex; 3460 spin_lock(&pmap->pm_spin); 3461 while (dummy_pv.pv_pindex < info->eva_pd_pindex) { 3462 pmap_scan_callback(&dummy_pv, info); 3463 ++dummy_pv.pv_pindex; 3464 } 3465 spin_unlock(&pmap->pm_spin); 3466 } else { 3467 /* 3468 * User page tables maintain local PML4, PDP, and PD 3469 * pv_entry's at the very least. PT pv's might be 3470 * unmanaged and thus not exist. PTE pv's might be 3471 * unmanaged and thus not exist. 3472 */ 3473 spin_lock(&pmap->pm_spin); 3474 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, 3475 pmap_scan_cmp, pmap_scan_callback, info); 3476 spin_unlock(&pmap->pm_spin); 3477 } 3478 pmap_inval_bulk_flush(info->bulk); 3479 lwkt_reltoken(&pmap->pm_token); 3480 } 3481 3482 /* 3483 * WARNING! pmap->pm_spin held 3484 */ 3485 static int 3486 pmap_scan_cmp(pv_entry_t pv, void *data) 3487 { 3488 struct pmap_scan_info *info = data; 3489 if (pv->pv_pindex < info->sva_pd_pindex) 3490 return(-1); 3491 if (pv->pv_pindex >= info->eva_pd_pindex) 3492 return(1); 3493 return(0); 3494 } 3495 3496 /* 3497 * WARNING! pmap->pm_spin held 3498 */ 3499 static int 3500 pmap_scan_callback(pv_entry_t pv, void *data) 3501 { 3502 struct pmap_scan_info *info = data; 3503 struct pmap *pmap = info->pmap; 3504 pv_entry_t pd_pv; /* A page directory PV */ 3505 pv_entry_t pt_pv; /* A page table PV */ 3506 pv_entry_t pte_pv; /* A page table entry PV */ 3507 pt_entry_t *ptep; 3508 pt_entry_t oldpte; 3509 vm_offset_t sva; 3510 vm_offset_t eva; 3511 vm_offset_t va_next; 3512 vm_pindex_t pd_pindex; 3513 int error; 3514 int generation; 3515 3516 /* 3517 * Pull the PD pindex from the pv before releasing the spinlock. 3518 * 3519 * WARNING: pv is faked for kernel pmap scans. 
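 *
 * Editor's worked example (illustrative, assuming PML4_SIGNMASK
 * covers bits 47-63): a raw sva of 0x0000800000000000 computed
 * from a kernel pd_pindex tests non-zero against the mask and is
 * OR'd up to the canonical address 0xffff800000000000 below.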
3520 */ 3521 pd_pindex = pv->pv_pindex; 3522 spin_unlock(&pmap->pm_spin); 3523 pv = NULL; /* invalid after spinlock unlocked */ 3524 3525 /* 3526 * Calculate the page range within the PD. SIMPLE pmaps are 3527 * direct-mapped for the entire 2^64 address space. Normal pmaps 3528 * reflect the user and kernel address space which requires 3529 * canonicalization w/regards to converting pd_pindex's back 3530 * into addresses. 3531 */ 3532 sva = (pd_pindex - NUPTE_TOTAL - NUPT_TOTAL) << PDPSHIFT; 3533 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 3534 (sva & PML4_SIGNMASK)) { 3535 sva |= PML4_SIGNMASK; 3536 } 3537 eva = sva + NBPDP; /* can overflow */ 3538 if (sva < info->sva) 3539 sva = info->sva; 3540 if (eva < info->sva || eva > info->eva) 3541 eva = info->eva; 3542 3543 /* 3544 * NOTE: kernel mappings do not track page table pages, only 3545 * terminal pages. 3546 * 3547 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 3548 * However, for the scan to be efficient we try to 3549 * cache items top-down. 3550 */ 3551 pd_pv = NULL; 3552 pt_pv = NULL; 3553 3554 for (; sva < eva; sva = va_next) { 3555 if (sva >= VM_MAX_USER_ADDRESS) { 3556 if (pt_pv) { 3557 pv_put(pt_pv); 3558 pt_pv = NULL; 3559 } 3560 goto kernel_skip; 3561 } 3562 3563 /* 3564 * PD cache (degenerate case if we skip). It is possible 3565 * for the PD to not exist due to races. This is ok. 3566 */ 3567 if (pd_pv == NULL) { 3568 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3569 } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 3570 pv_put(pd_pv); 3571 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3572 } 3573 if (pd_pv == NULL) { 3574 va_next = (sva + NBPDP) & ~PDPMASK; 3575 if (va_next < sva) 3576 va_next = eva; 3577 continue; 3578 } 3579 3580 /* 3581 * PT cache 3582 */ 3583 if (pt_pv == NULL) { 3584 if (pd_pv) { 3585 pv_put(pd_pv); 3586 pd_pv = NULL; 3587 } 3588 pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); 3589 } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) { 3590 if (pd_pv) { 3591 pv_put(pd_pv); 3592 pd_pv = NULL; 3593 } 3594 pv_put(pt_pv); 3595 pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); 3596 } 3597 3598 /* 3599 * If pt_pv is NULL we either have a shared page table 3600 * page and must issue a callback specific to that case, 3601 * or there is no page table page. 3602 * 3603 * Either way we can skip the page table page. 3604 */ 3605 if (pt_pv == NULL) { 3606 /* 3607 * Possible unmanaged (shared from another pmap) 3608 * page table page. 3609 */ 3610 if (pd_pv == NULL) 3611 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3612 KKASSERT(pd_pv != NULL); 3613 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 3614 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 3615 info->func(pmap, info, NULL, pd_pv, 1, 3616 sva, ptep, info->arg); 3617 } 3618 3619 /* 3620 * Done, move to next page table page. 3621 */ 3622 va_next = (sva + NBPDR) & ~PDRMASK; 3623 if (va_next < sva) 3624 va_next = eva; 3625 continue; 3626 } 3627 3628 /* 3629 * From this point in the loop testing pt_pv for non-NULL 3630 * means we are in UVM, else if it is NULL we are in KVM. 3631 * 3632 * Limit our scan to either the end of the va represented 3633 * by the current page table page, or to the end of the 3634 * range being removed. 3635 */ 3636 kernel_skip: 3637 va_next = (sva + NBPDR) & ~PDRMASK; 3638 if (va_next < sva) 3639 va_next = eva; 3640 if (va_next > eva) 3641 va_next = eva; 3642 3643 /* 3644 * Scan the page table for pages. Some pages may not be 3645 * managed (might not have a pv_entry).
3646 * 3647 * There is no page table management for kernel pages so 3648 * pt_pv will be NULL in that case, but otherwise pt_pv 3649 * is non-NULL, locked, and referenced. 3650 */ 3651 3652 /* 3653 * At this point a non-NULL pt_pv means a UVA, and a NULL 3654 * pt_pv means a KVA. 3655 */ 3656 if (pt_pv) 3657 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 3658 else 3659 ptep = vtopte(sva); 3660 3661 while (sva < va_next) { 3662 /* 3663 * Yield every 64 pages. 3664 */ 3665 if ((++info->count & 63) == 0) 3666 lwkt_user_yield(); 3667 3668 /* 3669 * Acquire the related pte_pv, if any. If *ptep == 0 3670 * the related pte_pv should not exist, but if *ptep 3671 * is not zero the pte_pv may or may not exist (e.g. 3672 * will not exist for an unmanaged page). 3673 * 3674 * However a multitude of races are possible here. 3675 * 3676 * In addition, the (pt_pv, pte_pv) lock order is 3677 * backwards, so we have to be careful in acquiring 3678 * a properly locked pte_pv. 3679 */ 3680 generation = pmap->pm_generation; 3681 if (pt_pv) { 3682 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 3683 &error); 3684 if (error) { 3685 if (pd_pv) { 3686 pv_put(pd_pv); 3687 pd_pv = NULL; 3688 } 3689 pv_put(pt_pv); /* must be non-NULL */ 3690 pt_pv = NULL; 3691 pv_lock(pte_pv); /* safe to block now */ 3692 pv_put(pte_pv); 3693 pte_pv = NULL; 3694 pt_pv = pv_get(pmap, 3695 pmap_pt_pindex(sva)); 3696 /* 3697 * pt_pv reloaded, need new ptep 3698 */ 3699 KKASSERT(pt_pv != NULL); 3700 ptep = pv_pte_lookup(pt_pv, 3701 pmap_pte_index(sva)); 3702 continue; 3703 } 3704 } else { 3705 pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); 3706 } 3707 3708 /* 3709 * Ok, if *ptep == 0 we had better NOT have a pte_pv. 3710 */ 3711 oldpte = *ptep; 3712 if (oldpte == 0) { 3713 if (pte_pv) { 3714 kprintf("Unexpected non-NULL pte_pv " 3715 "%p pt_pv %p " 3716 "*ptep = %016lx/%016lx\n", 3717 pte_pv, pt_pv, *ptep, oldpte); 3718 panic("Unexpected non-NULL pte_pv"); 3719 } 3720 sva += PAGE_SIZE; 3721 ++ptep; 3722 continue; 3723 } 3724 3725 /* 3726 * Ready for the callback. The locked pte_pv (if any) 3727 * is consumed by the callback. pte_pv will exist if 3728 * the page is managed, and will not exist if it 3729 * isn't. 3730 */ 3731 if (pte_pv) { 3732 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) == 3733 (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX]), 3734 ("badC *ptep %016lx/%016lx sva %016lx " 3735 "pte_pv %p pm_generation %d/%d", 3736 *ptep, oldpte, sva, pte_pv, 3737 generation, pmap->pm_generation)); 3738 info->func(pmap, info, pte_pv, pt_pv, 0, 3739 sva, ptep, info->arg); 3740 } else { 3741 /* 3742 * Check for insertion race. Since there is no 3743 * pte_pv to guard us it is possible for us 3744 * to race another thread doing an insertion. 3745 * Our lookup misses the pte_pv but our *ptep 3746 * check sees the inserted pte. 3747 * 3748 * XXX panic case seems to occur within a 3749 * vm_fork() of /bin/sh, which frankly 3750 * shouldn't happen since no other threads 3751 * should be inserting to our pmap in that 3752 * situation. Removing, possibly. Inserting, 3753 * shouldn't happen.
3754 */ 3755 if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) && 3756 pt_pv) { 3757 pte_pv = pv_find(pmap, 3758 pmap_pte_pindex(sva)); 3759 if (pte_pv) { 3760 pv_drop(pte_pv); 3761 kprintf("pmap_scan: RACE2 " 3762 "%016jx, %016lx\n", 3763 sva, oldpte); 3764 continue; 3765 } 3766 } 3767 3768 /* 3769 * Didn't race 3770 */ 3771 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) == 3772 pmap->pmap_bits[PG_V_IDX], 3773 ("badD *ptep %016lx/%016lx sva %016lx " 3774 "pte_pv NULL pm_generation %d/%d", 3775 *ptep, oldpte, sva, 3776 generation, pmap->pm_generation)); 3777 info->func(pmap, info, NULL, pt_pv, 0, 3778 sva, ptep, info->arg); 3779 } 3780 pte_pv = NULL; 3781 sva += PAGE_SIZE; 3782 ++ptep; 3783 } 3784 } 3785 if (pd_pv) { 3786 pv_put(pd_pv); 3787 pd_pv = NULL; 3788 } 3789 if (pt_pv) { 3790 pv_put(pt_pv); 3791 pt_pv = NULL; 3792 } 3793 if ((++info->count & 7) == 0) 3794 lwkt_user_yield(); 3795 3796 /* 3797 * Relock before returning. 3798 */ 3799 spin_lock(&pmap->pm_spin); 3800 return (0); 3801 } 3802 3803 void 3804 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 3805 { 3806 struct pmap_scan_info info; 3807 3808 info.pmap = pmap; 3809 info.sva = sva; 3810 info.eva = eva; 3811 info.func = pmap_remove_callback; 3812 info.arg = NULL; 3813 pmap_scan(&info, 1); 3814 } 3815 3816 static void 3817 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 3818 { 3819 struct pmap_scan_info info; 3820 3821 info.pmap = pmap; 3822 info.sva = sva; 3823 info.eva = eva; 3824 info.func = pmap_remove_callback; 3825 info.arg = NULL; 3826 pmap_scan(&info, 0); 3827 } 3828 3829 static void 3830 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 3831 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 3832 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 3833 { 3834 pt_entry_t pte; 3835 3836 if (pte_pv) { 3837 /* 3838 * This will also drop pt_pv's wire_count. Note that 3839 * terminal pages are not wired based on mmu presence. 3840 */ 3841 pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk); 3842 pmap_remove_pv_page(pte_pv); 3843 pv_free(pte_pv); 3844 } else if (sharept == 0) { 3845 /* 3846 * Unmanaged page table (pt, pd, or pdp. Not pte). 3847 * 3848 * pt_pv's wire_count is still bumped by unmanaged pages 3849 * so we must decrement it manually. 3850 * 3851 * We have to unwire the target page table page. 3852 * 3853 * It is unclear how we can invalidate a segment so we 3854 * invalidate -1 which invalidates the tlb. 3855 */ 3856 pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0); 3857 if (pte & pmap->pmap_bits[PG_W_IDX]) 3858 atomic_add_long(&pmap->pm_stats.wired_count, -1); 3859 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3860 if (vm_page_unwire_quick(pt_pv->pv_m)) 3861 panic("pmap_remove: insufficient wirecount"); 3862 } else { 3863 /* 3864 * Unmanaged page table (pt, pd, or pdp. Not pte) for 3865 * a shared page table. 3866 * 3867 * pt_pv is actually the pd_pv for our pmap (not the shared 3868 * object pmap). 3869 * 3870 * We have to unwire the target page table page and we 3871 * have to unwire our page directory page. 3872 * 3873 * It is unclear how we can invalidate a segment so we 3874 * invalidate -1 which invalidates the tlb.
3875 */ 3876 pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0); 3877 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3878 KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0); 3879 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 3880 panic("pmap_remove: shared pgtable1 bad wirecount"); 3881 if (vm_page_unwire_quick(pt_pv->pv_m)) 3882 panic("pmap_remove: shared pgtable2 bad wirecount"); 3883 } 3884 } 3885 3886 /* 3887 * Removes this physical page from all physical maps in which it resides. 3888 * Reflects back modify bits to the pager. 3889 * 3890 * This routine may not be called from an interrupt. 3891 */ 3892 static 3893 void 3894 pmap_remove_all(vm_page_t m) 3895 { 3896 pv_entry_t pv; 3897 pmap_inval_bulk_t bulk; 3898 3899 if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/) 3900 return; 3901 3902 vm_page_spin_lock(m); 3903 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3904 KKASSERT(pv->pv_m == m); 3905 if (pv_hold_try(pv)) { 3906 vm_page_spin_unlock(m); 3907 } else { 3908 vm_page_spin_unlock(m); 3909 pv_lock(pv); 3910 } 3911 if (pv->pv_m != m) { 3912 pv_put(pv); 3913 vm_page_spin_lock(m); 3914 continue; 3915 } 3916 3917 /* 3918 * Holding no spinlocks, pv is locked. 3919 */ 3920 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 3921 pmap_remove_pv_pte(pv, NULL, &bulk); 3922 pmap_inval_bulk_flush(&bulk); 3923 pmap_remove_pv_page(pv); 3924 pv_free(pv); 3925 vm_page_spin_lock(m); 3926 } 3927 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 3928 vm_page_spin_unlock(m); 3929 } 3930 3931 /* 3932 * Set the physical protection on the specified range of this map 3933 * as requested. This function is typically only used for debug watchpoints 3934 * and COW pages. 3935 * 3936 * This function may not be called from an interrupt if the map is 3937 * not the kernel_pmap. 3938 * 3939 * NOTE! For shared page table pages we just unmap the page. 3940 */ 3941 void 3942 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3943 { 3944 struct pmap_scan_info info; 3945 /* JG review for NX */ 3946 3947 if (pmap == NULL) 3948 return; 3949 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 3950 pmap_remove(pmap, sva, eva); 3951 return; 3952 } 3953 if (prot & VM_PROT_WRITE) 3954 return; 3955 info.pmap = pmap; 3956 info.sva = sva; 3957 info.eva = eva; 3958 info.func = pmap_protect_callback; 3959 info.arg = &prot; 3960 pmap_scan(&info, 1); 3961 } 3962 3963 static 3964 void 3965 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 3966 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 3967 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 3968 { 3969 pt_entry_t pbits; 3970 pt_entry_t cbits; 3971 pt_entry_t pte; 3972 vm_page_t m; 3973 3974 again: 3975 pbits = *ptep; 3976 cbits = pbits; 3977 if (pte_pv) { 3978 m = NULL; 3979 if (pbits & pmap->pmap_bits[PG_A_IDX]) { 3980 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 3981 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3982 KKASSERT(m == pte_pv->pv_m); 3983 vm_page_flag_set(m, PG_REFERENCED); 3984 } 3985 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 3986 } 3987 if (pbits & pmap->pmap_bits[PG_M_IDX]) { 3988 if (pmap_track_modified(pte_pv->pv_pindex)) { 3989 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 3990 if (m == NULL) { 3991 m = PHYS_TO_VM_PAGE(pbits & 3992 PG_FRAME); 3993 } 3994 vm_page_dirty(m); 3995 } 3996 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 3997 } 3998 } 3999 } else if (sharept) { 4000 /* 4001 * Unmanaged page table, pt_pv is actually the pd_pv 4002 * for our pmap (not the object's shared pmap). 
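 *
 * Editor's aside (illustrative): unmapping instead of write-
 * protecting is sufficient because a later fault re-resolves the
 * segment through pmap_allocpte_seg(), which then selects the
 * object's read-only shared pmap (md.pmap_ro) for the new entry.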
4003 * 4004 * When asked to protect something in a shared page table 4005 * page we just unmap the page table page. We have to 4006 * invalidate the tlb in this situation. 4007 * 4008 * XXX Warning, shared page tables will not be used for 4009 * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings 4010 * so PHYS_TO_VM_PAGE() should be safe here. 4011 */ 4012 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 4013 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4014 panic("pmap_protect: pgtable1 pg bad wirecount"); 4015 if (vm_page_unwire_quick(pt_pv->pv_m)) 4016 panic("pmap_protect: pgtable2 pg bad wirecount"); 4017 ptep = NULL; 4018 } 4019 /* else unmanaged page, adjust bits, no wire changes */ 4020 4021 if (ptep) { 4022 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4023 #ifdef PMAP_DEBUG2 4024 if (pmap_enter_debug > 0) { 4025 --pmap_enter_debug; 4026 kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p " 4027 "pt_pv=%p cbits=%08lx\n", 4028 va, ptep, pte_pv, 4029 pt_pv, cbits 4030 ); 4031 } 4032 #endif 4033 if (pbits != cbits) { 4034 if (!pmap_inval_smp_cmpset(pmap, (vm_offset_t)-1, 4035 ptep, pbits, cbits)) { 4036 goto again; 4037 } 4038 } 4039 } 4040 if (pte_pv) 4041 pv_put(pte_pv); 4042 } 4043 4044 /* 4045 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4046 * mapping at that address. Set protection and wiring as requested. 4047 * 4048 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4049 * possible. If it is we enter the page into the appropriate shared pmap 4050 * hanging off the related VM object instead of the passed pmap, then we 4051 * share the page table page from the VM object's pmap into the current pmap. 4052 * 4053 * NOTE: This routine MUST insert the page into the pmap now, it cannot 4054 * lazy-evaluate. 4055 */ 4056 void 4057 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4058 boolean_t wired, vm_map_entry_t entry) 4059 { 4060 pv_entry_t pt_pv; /* page table */ 4061 pv_entry_t pte_pv; /* page table entry */ 4062 pt_entry_t *ptep; 4063 vm_paddr_t opa; 4064 pt_entry_t origpte, newpte; 4065 vm_paddr_t pa; 4066 4067 if (pmap == NULL) 4068 return; 4069 va = trunc_page(va); 4070 #ifdef PMAP_DIAGNOSTIC 4071 if (va >= KvaEnd) 4072 panic("pmap_enter: toobig"); 4073 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4074 panic("pmap_enter: invalid to pmap_enter page table " 4075 "pages (va: 0x%lx)", va); 4076 #endif 4077 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4078 kprintf("Warning: pmap_enter called on UVA with " 4079 "kernel_pmap\n"); 4080 #ifdef DDB 4081 db_print_backtrace(); 4082 #endif 4083 } 4084 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4085 kprintf("Warning: pmap_enter called on KVA without " 4086 "kernel_pmap\n"); 4087 #ifdef DDB 4088 db_print_backtrace(); 4089 #endif 4090 } 4091 4092 /* 4093 * Get locked PV entries for our new page table entry (pte_pv) 4094 * and for its parent page table (pt_pv). We need the parent 4095 * so we can resolve the location of the ptep. 4096 * 4097 * Only hardware MMU actions can modify the ptep out from 4098 * under us. 4099 * 4100 * if (m) is fictitious or unmanaged we do not create a managing 4101 * pte_pv for it. Any pre-existing page's management state must 4102 * match (avoiding code complexity). 4103 * 4104 * If the pmap is still being initialized we assume existing 4105 * page tables. 4106 * 4107 * Kernel mappings do not track page table pages (i.e. pt_pv).
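 *
 * Editor's summary (illustrative) of the three cases resolved
 * below:
 *
 *	!pmap_initialized	no pvs at all, ptep via vtopte()
 *	unmanaged (m)		no pte_pv; pt_pv only for user VAs
 *	managed (m)		pte_pv always; pt_pv for user VAs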
4108 */
4109 if (pmap_initialized == FALSE) {
4110 pte_pv = NULL;
4111 pt_pv = NULL;
4112 ptep = vtopte(va);
4113 origpte = *ptep;
4114 } else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */
4115 pte_pv = NULL;
4116 if (va >= VM_MAX_USER_ADDRESS) {
4117 pt_pv = NULL;
4118 ptep = vtopte(va);
4119 } else {
4120 pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va),
4121 NULL, entry, va);
4122 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4123 }
4124 origpte = *ptep;
4125 cpu_ccfence();
4126 KASSERT(origpte == 0 ||
4127 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0,
4128 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va));
4129 } else {
4130 if (va >= VM_MAX_USER_ADDRESS) {
4131 /*
4132 * Kernel map, pv_entry-tracked.
4133 */
4134 pt_pv = NULL;
4135 pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
4136 ptep = vtopte(va);
4137 } else {
4138 /*
4139 * User map
4140 */
4141 pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va),
4142 &pt_pv, entry, va);
4143 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4144 }
4145 origpte = *ptep;
4146 cpu_ccfence();
4147 KASSERT(origpte == 0 ||
4148 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]),
4149 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va));
4150 }
4151
4152 pa = VM_PAGE_TO_PHYS(m);
4153 opa = origpte & PG_FRAME;
4154
4155 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) |
4156 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]);
4157 if (wired)
4158 newpte |= pmap->pmap_bits[PG_W_IDX];
4159 if (va < VM_MAX_USER_ADDRESS)
4160 newpte |= pmap->pmap_bits[PG_U_IDX];
4161 if (pte_pv)
4162 newpte |= pmap->pmap_bits[PG_MANAGED_IDX];
4163 // if (pmap == &kernel_pmap)
4164 // newpte |= pgeflag;
4165 newpte |= pmap->pmap_cache_bits[m->pat_mode];
4166 if (m->flags & PG_FICTITIOUS)
4167 newpte |= pmap->pmap_bits[PG_DEVICE_IDX];
4168
4169 /*
4170 * It is possible for multiple faults to occur in threaded
4171 * environments; the existing pte might already be correct.
4172 */
4173 if (((origpte ^ newpte) & ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] |
4174 pmap->pmap_bits[PG_A_IDX])) == 0)
4175 goto done;
4176
4177 /*
4178 * Ok, either the address changed or the protection or wiring
4179 * changed.
4180 *
4181 * Clear the current entry, interlocking the removal. For managed
4182 * pte's this will also flush the modified state to the vm_page.
4183 * Atomic ops are mandatory in order to ensure that PG_M events are
4184 * not lost during any transition.
4185 *
4186 * WARNING: The caller has busied the new page but not the original
4187 * vm_page which we are trying to replace. Because we hold
4188 * the pte_pv lock, but have not busied the page, PG bits
4189 * can be cleared out from under us.
4190 */
4191 if (opa) {
4192 if (pte_pv) {
4193 /*
4194 * pmap_remove_pv_pte() unwires pt_pv and assumes
4195 * we will free pte_pv, but since we are reusing
4196 * pte_pv we want to retain the wire count.
4197 *
4198 * pt_pv won't exist for a kernel page (managed or
4199 * otherwise).
4200 */
4201 if (pt_pv)
4202 vm_page_wire_quick(pt_pv->pv_m);
4203 if (prot & VM_PROT_NOSYNC) {
4204 pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
4205 } else {
4206 pmap_inval_bulk_t bulk;
4207
4208 pmap_inval_bulk_init(&bulk, pmap);
4209 pmap_remove_pv_pte(pte_pv, pt_pv, &bulk);
4210 pmap_inval_bulk_flush(&bulk);
4211 }
4212 if (pte_pv->pv_m)
4213 pmap_remove_pv_page(pte_pv);
4214 } else if (prot & VM_PROT_NOSYNC) {
4215 /*
4216 * Unmanaged page, NOSYNC (no mmu sync) requested.
4217 *
4218 * Leave wire count on PT page intact.
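 * (The wire count is retained because the new entry installed
 * below reuses it; see the (pt_pv && opa == 0) case further down.)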
4219 */
4220 (void)pte_load_clear(ptep);
4221 cpu_invlpg((void *)va);
4222 atomic_add_long(&pmap->pm_stats.resident_count, -1);
4223 } else {
4224 /*
4225 * Unmanaged page, normal enter.
4226 *
4227 * Leave wire count on PT page intact.
4228 */
4229 pmap_inval_smp(pmap, va, 1, ptep, 0);
4230 atomic_add_long(&pmap->pm_stats.resident_count, -1);
4231 }
4232 KKASSERT(*ptep == 0);
4233 }
4234
4235 #ifdef PMAP_DEBUG2
4236 if (pmap_enter_debug > 0) {
4237 --pmap_enter_debug;
4238 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p"
4239 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n",
4240 va, m,
4241 origpte, newpte, ptep,
4242 pte_pv, pt_pv, opa, prot);
4243 }
4244 #endif
4245
4246 if (pte_pv) {
4247 /*
4248 * Enter on the PV list if part of our managed memory.
4249 * Wiring of the PT page is already handled.
4250 */
4251 KKASSERT(pte_pv->pv_m == NULL);
4252 vm_page_spin_lock(m);
4253 pte_pv->pv_m = m;
4254 pmap_page_stats_adding(m);
4255 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
4256 vm_page_flag_set(m, PG_MAPPED);
4257 vm_page_spin_unlock(m);
4258 } else if (pt_pv && opa == 0) {
4259 /*
4260 * We have to adjust the wire count on the PT page ourselves
4261 * for unmanaged entries. If opa was non-zero we retained
4262 * the existing wire count from the removal.
4263 */
4264 vm_page_wire_quick(pt_pv->pv_m);
4265 }
4266
4267 /*
4268 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks.
4269 *
4270 * User VMAs do not because those will be zero->non-zero, so no
4271 * stale entries to worry about at this point.
4272 *
4273 * For KVM there appear to still be issues. Theoretically we
4274 * should be able to scrap the interlocks entirely but we
4275 * get crashes.
4276 */
4277 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) {
4278 pmap_inval_smp(pmap, va, 1, ptep, newpte);
4279 } else {
4280 *(volatile pt_entry_t *)ptep = newpte;
4281 if (pt_pv == NULL)
4282 cpu_invlpg((void *)va);
4283 }
4284
4285 if (wired) {
4286 if (pte_pv) {
4287 atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count,
4288 1);
4289 } else {
4290 atomic_add_long(&pmap->pm_stats.wired_count, 1);
4291 }
4292 }
4293 if (newpte & pmap->pmap_bits[PG_RW_IDX])
4294 vm_page_flag_set(m, PG_WRITEABLE);
4295
4296 /*
4297 * Unmanaged pages need manual resident_count tracking.
4298 */
4299 if (pte_pv == NULL && pt_pv)
4300 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1);
4301
4302 /*
4303 * Cleanup
4304 */
4305 done:
4306 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 ||
4307 (m->flags & PG_MAPPED));
4308
4309 /*
4310 * Cleanup the pv entry, allowing other accessors.
4311 */
4312 if (pte_pv)
4313 pv_put(pte_pv);
4314 if (pt_pv)
4315 pv_put(pt_pv);
4316 }
4317
4318 /*
4319 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
4320 * This code also assumes that the pmap has no pre-existing entry for this
4321 * VA.
4322 *
4323 * This code currently may only be used on user pmaps, not kernel_pmap.
4324 */
4325 void
4326 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
4327 {
4328 pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL);
4329 }
4330
4331 /*
4332 * Make a temporary mapping for a physical address. This is only intended
4333 * to be used for panic dumps.
4334 *
4335 * The caller is responsible for calling smp_invltlb().
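 *
 * Illustrative usage sketch (hypothetical names, not code from this
 * file):
 *
 *	va = pmap_kenter_temporary(dump_pa, 0);
 *	pmap_kenter_temporary(dump_pa + PAGE_SIZE, 1);
 *	smp_invltlb();
 *	(the two pages may now be accessed starting at va)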
4336 */
4337 void *
4338 pmap_kenter_temporary(vm_paddr_t pa, long i)
4339 {
4340 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
4341 return ((void *)crashdumpmap);
4342 }
4343
4344 #define MAX_INIT_PT (96)
4345
4346 /*
4347 * This routine preloads the ptes for a given object into the specified pmap.
4348 * This eliminates the blast of soft faults on process startup and
4349 * immediately after an mmap.
4350 */
4351 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
4352
4353 void
4354 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
4355 vm_object_t object, vm_pindex_t pindex,
4356 vm_size_t size, int limit)
4357 {
4358 struct rb_vm_page_scan_info info;
4359 struct lwp *lp;
4360 vm_size_t psize;
4361
4362 /*
4363 * We can't preinit if read access isn't set or there is no pmap
4364 * or object.
4365 */
4366 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
4367 return;
4368
4369 /*
4370 * We can't preinit if the pmap is not the current pmap
4371 */
4372 lp = curthread->td_lwp;
4373 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
4374 return;
4375
4376 /*
4377 * Misc additional checks
4378 */
4379 psize = x86_64_btop(size);
4380
4381 if ((object->type != OBJT_VNODE) ||
4382 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
4383 (object->resident_page_count > MAX_INIT_PT))) {
4384 return;
4385 }
4386
4387 if (pindex + psize > object->size) {
4388 if (object->size < pindex)
4389 return;
4390 psize = object->size - pindex;
4391 }
4392
4393 if (psize == 0)
4394 return;
4395
4396 /*
4397 * If everything is segment-aligned do not pre-init here. Instead
4398 * allow the normal vm_fault path to pass a segment hint to
4399 * pmap_enter() which will then use an object-referenced shared
4400 * page table page.
4401 */
4402 if ((addr & SEG_MASK) == 0 &&
4403 (ctob(psize) & SEG_MASK) == 0 &&
4404 (ctob(pindex) & SEG_MASK) == 0) {
4405 return;
4406 }
4407
4408 /*
4409 * Use a red-black scan to traverse the requested range and load
4410 * any valid pages found into the pmap.
4411 *
4412 * We cannot safely scan the object's memq without holding the
4413 * object token.
4414 */
4415 info.start_pindex = pindex;
4416 info.end_pindex = pindex + psize - 1;
4417 info.limit = limit;
4418 info.mpte = NULL;
4419 info.addr = addr;
4420 info.pmap = pmap;
4421
4422 vm_object_hold_shared(object);
4423 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
4424 pmap_object_init_pt_callback, &info);
4425 vm_object_drop(object);
4426 }
4427
4428 static
4429 int
4430 pmap_object_init_pt_callback(vm_page_t p, void *data)
4431 {
4432 struct rb_vm_page_scan_info *info = data;
4433 vm_pindex_t rel_index;
4434
4435 /*
4436 * Don't allow an madvise to blow away our really
4437 * free pages by allocating pv entries.
4438 */
4439 if ((info->limit & MAP_PREFAULT_MADVISE) &&
4440 vmstats.v_free_count < vmstats.v_free_reserved) {
4441 return(-1);
4442 }
4443
4444 /*
4445 * Ignore list markers and ignore pages we cannot instantly
4446 * busy (while holding the object token).
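 * (PG_MARKER pages are scan placeholders inserted by concurrent
 * scans; they never represent real memory.)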
4447 */
4448 if (p->flags & PG_MARKER)
4449 return 0;
4450 if (vm_page_busy_try(p, TRUE))
4451 return 0;
4452 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
4453 (p->flags & PG_FICTITIOUS) == 0) {
4454 if ((p->queue - p->pc) == PQ_CACHE)
4455 vm_page_deactivate(p);
4456 rel_index = p->pindex - info->start_pindex;
4457 pmap_enter_quick(info->pmap,
4458 info->addr + x86_64_ptob(rel_index), p);
4459 }
4460 vm_page_wakeup(p);
4461 lwkt_yield();
4462 return(0);
4463 }
4464
4465 /*
4466 * Return TRUE if the pmap is in shape to trivially pre-fault the specified
4467 * address.
4468 *
4469 * Returns FALSE if it would be non-trivial or if a pte is already loaded
4470 * into the slot.
4471 *
4472 * XXX This is safe only because page table pages are not freed.
4473 */
4474 int
4475 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
4476 {
4477 pt_entry_t *pte;
4478
4479 /*spin_lock(&pmap->pm_spin);*/
4480 if ((pte = pmap_pte(pmap, addr)) != NULL) {
4481 if (*pte & pmap->pmap_bits[PG_V_IDX]) {
4482 /*spin_unlock(&pmap->pm_spin);*/
4483 return FALSE;
4484 }
4485 }
4486 /*spin_unlock(&pmap->pm_spin);*/
4487 return TRUE;
4488 }
4489
4490 /*
4491 * Change the wiring attribute for a pmap/va pair. The mapping must already
4492 * exist in the pmap. The mapping may or may not be managed.
4493 */
4494 void
4495 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
4496 vm_map_entry_t entry)
4497 {
4498 pt_entry_t *ptep;
4499 pv_entry_t pv;
4500
4501 if (pmap == NULL)
4502 return;
4503 lwkt_gettoken(&pmap->pm_token);
4504 pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va);
4505 ptep = pv_pte_lookup(pv, pmap_pte_index(va));
4506
4507 if (wired && !pmap_pte_w(pmap, ptep))
4508 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1);
4509 else if (!wired && pmap_pte_w(pmap, ptep))
4510 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1);
4511
4512 /*
4513 * Wiring is not a hardware characteristic so there is no need to
4514 * invalidate TLB. However, in an SMP environment we must use
4515 * a locked bus cycle to update the pte (if we are not using
4516 * the pmap_inval_*() API that is)... it's ok to do this for simple
4517 * wiring changes.
4518 */
4519 if (wired)
4520 atomic_set_long(ptep, pmap->pmap_bits[PG_W_IDX]);
4521 else
4522 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
4523 pv_put(pv);
4524 lwkt_reltoken(&pmap->pm_token);
4525 }
4526
4527
4528
4529 /*
4530 * Copy the range specified by src_addr/len from the source map to
4531 * the range dst_addr/len in the destination map.
4532 *
4533 * This routine is only advisory and need not do anything.
4534 */
4535 void
4536 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
4537 vm_size_t len, vm_offset_t src_addr)
4538 {
4539 }
4540
4541 /*
4542 * pmap_zero_page:
4543 *
4544 * Zero the specified physical page.
4545 *
4546 * This function may be called from an interrupt and no locking is
4547 * required.
4548 */
4549 void
4550 pmap_zero_page(vm_paddr_t phys)
4551 {
4552 vm_offset_t va = PHYS_TO_DMAP(phys);
4553
4554 pagezero((void *)va);
4555 }
4556
4557 /*
4558 * pmap_zero_page_area:
4559 *
4560 * Zero part of a physical page by mapping it into memory and clearing
4561 * its contents with bzero.
4562 *
4563 * off and size may not cover an area beyond a single hardware page.
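 *
 * For example, pmap_zero_page_area(pa, 512, 1024) clears bytes 512
 * through 1535 of the page at physical address pa.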
4564 */
4565 void
4566 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
4567 {
4568 vm_offset_t virt = PHYS_TO_DMAP(phys);
4569
4570 bzero((char *)virt + off, size);
4571 }
4572
4573 /*
4574 * pmap_copy_page:
4575 *
4576 * Copy the physical page from the source PA to the target PA.
4577 * This function may be called from an interrupt. No locking
4578 * is required.
4579 */
4580 void
4581 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
4582 {
4583 vm_offset_t src_virt, dst_virt;
4584
4585 src_virt = PHYS_TO_DMAP(src);
4586 dst_virt = PHYS_TO_DMAP(dst);
4587 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
4588 }
4589
4590 /*
4591 * pmap_copy_page_frag:
4592 *
4593 * Copy a fragment of the physical page from the source PA to the
4594 * target PA. This function may be called from an interrupt. No
4595 * locking is required.
4596 */
4597 void
4598 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
4599 {
4600 vm_offset_t src_virt, dst_virt;
4601
4602 src_virt = PHYS_TO_DMAP(src);
4603 dst_virt = PHYS_TO_DMAP(dst);
4604
4605 bcopy((char *)src_virt + (src & PAGE_MASK),
4606 (char *)dst_virt + (dst & PAGE_MASK),
4607 bytes);
4608 }
4609
4610 /*
4611 * Returns true if the pmap's pv is one of the first 16 pvs linked to from
4612 * this page. This count may be changed upwards or downwards in the future;
4613 * it is only necessary that true be returned for a small subset of pmaps
4614 * for proper page aging.
4615 */
4616 boolean_t
4617 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4618 {
4619 pv_entry_t pv;
4620 int loops = 0;
4621
4622 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4623 return FALSE;
4624
4625 vm_page_spin_lock(m);
4626 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4627 if (pv->pv_pmap == pmap) {
4628 vm_page_spin_unlock(m);
4629 return TRUE;
4630 }
4631 loops++;
4632 if (loops >= 16)
4633 break;
4634 }
4635 vm_page_spin_unlock(m);
4636 return (FALSE);
4637 }
4638
4639 /*
4640 * Remove all pages from the specified address space; this aids process
4641 * exit speeds. Also, this code may be special cased for the current
4642 * process only.
4643 */
4644 void
4645 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4646 {
4647 pmap_remove_noinval(pmap, sva, eva);
4648 cpu_invltlb();
4649 }
4650
4651 /*
4652 * pmap_testbit tests bits in ptes. Note that the testbit/clearbit
4653 * routines are inline, and a lot of things compile-time evaluate.
4654 */
4655 static
4656 boolean_t
4657 pmap_testbit(vm_page_t m, int bit)
4658 {
4659 pv_entry_t pv;
4660 pt_entry_t *pte;
4661 pmap_t pmap;
4662
4663 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4664 return FALSE;
4665
4666 if (TAILQ_FIRST(&m->md.pv_list) == NULL)
4667 return FALSE;
4668 vm_page_spin_lock(m);
4669 if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
4670 vm_page_spin_unlock(m);
4671 return FALSE;
4672 }
4673
4674 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4675
4676 #if defined(PMAP_DIAGNOSTIC)
4677 if (pv->pv_pmap == NULL) {
4678 kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
4679 pv->pv_pindex);
4680 continue;
4681 }
4682 #endif
4683 pmap = pv->pv_pmap;
4684
4685 /*
4686 * If the bit being tested is the modified or accessed bit,
4687 * then skip mappings whose modified/accessed state is not
4688 * tracked (e.g. mappings within the clean submap).
4689 *
4690 * WARNING! Because we do not lock the pv, *pte can be in a
4691 * state of flux. Despite this the value of *pte
4692 * will still be related to the vm_page in some way
4693 * because the pv cannot be destroyed as long as we
4694 * hold the vm_page spin lock.
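 *
 * For example, *pte may be ripped out concurrently and read back
 * as 0; a zero pte simply fails the bit test below, which is
 * harmless.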
4695 */
4696 if (bit == PG_A_IDX || bit == PG_M_IDX) {
4697 //& (pmap->pmap_bits[PG_A_IDX] | pmap->pmap_bits[PG_M_IDX])) {
4698 if (!pmap_track_modified(pv->pv_pindex))
4699 continue;
4700 }
4701
4702 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4703 if (*pte & pmap->pmap_bits[bit]) {
4704 vm_page_spin_unlock(m);
4705 return TRUE;
4706 }
4707 }
4708 vm_page_spin_unlock(m);
4709 return (FALSE);
4710 }
4711
4712 /*
4713 * This routine is used to modify bits in ptes. Only one bit should be
4714 * specified. PG_RW requires special handling.
4715 *
4716 * Caller must NOT hold any spin locks.
4717 */
4718 static __inline
4719 void
4720 pmap_clearbit(vm_page_t m, int bit_index)
4721 {
4722 pv_entry_t pv;
4723 pt_entry_t *pte;
4724 pt_entry_t pbits;
4725 pmap_t pmap;
4726
4727 if (bit_index == PG_RW_IDX)
4728 vm_page_flag_clear(m, PG_WRITEABLE);
4729 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
4730 return;
4731 }
4732
4733 /*
4734 * PG_M or PG_A case
4735 *
4736 * Loop over all current mappings setting/clearing as appropriate.
4737 * If setting RO do we need to clear the VAC?
4738 *
4739 * NOTE: When clearing PG_M we could also (not implemented) drop
4740 * through to the PG_RW code and clear PG_RW too, forcing
4741 * a fault on write to redetect PG_M for virtual kernels, but
4742 * it isn't necessary since virtual kernels invalidate the
4743 * pte when they clear the VPTE_M bit in their virtual page
4744 * tables.
4745 *
4746 * NOTE: Does not re-dirty the page when clearing only PG_M.
4747 *
4748 * NOTE: Because we do not lock the pv, *pte can be in a state of
4749 * flux. Despite this the value of *pte is still somewhat
4750 * related while we hold the vm_page spin lock.
4751 *
4752 * *pte can be zero due to this race. Since we are clearing
4753 * bits we basically do no harm when this race occurs.
4754 */
4755 if (bit_index != PG_RW_IDX) {
4756 vm_page_spin_lock(m);
4757 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4758 #if defined(PMAP_DIAGNOSTIC)
4759 if (pv->pv_pmap == NULL) {
4760 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
4761 pv->pv_pindex);
4762 continue;
4763 }
4764 #endif
4765 pmap = pv->pv_pmap;
4766 pte = pmap_pte_quick(pv->pv_pmap,
4767 pv->pv_pindex << PAGE_SHIFT);
4768 pbits = *pte;
4769 if (pbits & pmap->pmap_bits[bit_index])
4770 atomic_clear_long(pte, pmap->pmap_bits[bit_index]);
4771 }
4772 vm_page_spin_unlock(m);
4773 return;
4774 }
4775
4776 /*
4777 * Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M
4778 * was set.
4779 */
4780 restart:
4781 vm_page_spin_lock(m);
4782 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4783 /*
4784 * Don't write protect pager mappings.
4785 */
4786 if (!pmap_track_modified(pv->pv_pindex))
4787 continue;
4788
4789 #if defined(PMAP_DIAGNOSTIC)
4790 if (pv->pv_pmap == NULL) {
4791 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
4792 pv->pv_pindex);
4793 continue;
4794 }
4795 #endif
4796 pmap = pv->pv_pmap;
4797 /*
4798 * Skip pages which do not have PG_RW set.
4799 */
4800 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4801 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0)
4802 continue;
4803
4804 /*
4805 * Lock the PV
4806 */
4807 if (pv_hold_try(pv)) {
4808 vm_page_spin_unlock(m);
4809 } else {
4810 vm_page_spin_unlock(m);
4811 pv_lock(pv); /* held, now do a blocking lock */
4812 }
4813 if (pv->pv_pmap != pmap || pv->pv_m != m) {
4814 pv_put(pv); /* and release */
4815 goto restart; /* anything could have happened */
4816 }
4817 KKASSERT(pv->pv_pmap == pmap);
4818 for (;;) {
4819 pt_entry_t nbits;
4820
4821 pbits = *pte;
4822 cpu_ccfence();
4823 nbits = pbits & ~(pmap->pmap_bits[PG_RW_IDX] |
4824 pmap->pmap_bits[PG_M_IDX]);
4825 if (pmap_inval_smp_cmpset(pmap,
4826 ((vm_offset_t)pv->pv_pindex << PAGE_SHIFT),
4827 pte, pbits, nbits)) {
4828 break;
4829 }
4830 cpu_pause();
4831 }
4832 vm_page_spin_lock(m);
4833
4834 /*
4835 * If PG_M was found to be set while we were clearing PG_RW
4836 * we also clear PG_M (done above) and mark the page dirty.
4837 * Callers expect this behavior.
4838 */
4839 if (pbits & pmap->pmap_bits[PG_M_IDX])
4840 vm_page_dirty(m);
4841 pv_put(pv);
4842 }
4843 vm_page_spin_unlock(m);
4844 }
4845
4846 /*
4847 * Lower the permission for all mappings to a given page.
4848 *
4849 * Page must be busied by caller. Because page is busied by caller this
4850 * should not be able to race a pmap_enter().
4851 */
4852 void
4853 pmap_page_protect(vm_page_t m, vm_prot_t prot)
4854 {
4855 /* JG NX support? */
4856 if ((prot & VM_PROT_WRITE) == 0) {
4857 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
4858 /*
4859 * NOTE: pmap_clearbit(.. PG_RW) also clears
4860 * the PG_WRITEABLE flag in (m).
4861 */
4862 pmap_clearbit(m, PG_RW_IDX);
4863 } else {
4864 pmap_remove_all(m);
4865 }
4866 }
4867 }
4868
4869 vm_paddr_t
4870 pmap_phys_address(vm_pindex_t ppn)
4871 {
4872 return (x86_64_ptob(ppn));
4873 }
4874
4875 /*
4876 * Return a count of reference bits for a page, clearing those bits.
4877 * It is not necessary for every reference bit to be cleared, but it
4878 * is necessary that 0 only be returned when there are truly no
4879 * reference bits set.
4880 *
4881 * XXX: The exact number of bits to check and clear is a matter that
4882 * should be tested and standardized at some point in the future for
4883 * optimal aging of shared pages.
4884 *
4885 * This routine may not block.
4886 */
4887 int
4888 pmap_ts_referenced(vm_page_t m)
4889 {
4890 pv_entry_t pv;
4891 pt_entry_t *pte;
4892 pmap_t pmap;
4893 int rtval = 0;
4894
4895 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4896 return (rtval);
4897
4898 vm_page_spin_lock(m);
4899 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4900 if (!pmap_track_modified(pv->pv_pindex))
4901 continue;
4902 pmap = pv->pv_pmap;
4903 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4904 if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) {
4905 atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]);
4906 rtval++;
4907 if (rtval > 4)
4908 break;
4909 }
4910 }
4911 vm_page_spin_unlock(m);
4912 return (rtval);
4913 }
4914
4915 /*
4916 * pmap_is_modified:
4917 *
4918 * Return whether or not the specified physical page was modified
4919 * in any physical maps.
4920 */
4921 boolean_t
4922 pmap_is_modified(vm_page_t m)
4923 {
4924 boolean_t res;
4925
4926 res = pmap_testbit(m, PG_M_IDX);
4927 return (res);
4928 }
4929
4930 /*
4931 * Clear the modify bits on the specified physical page.
4932 */
4933 void
4934 pmap_clear_modify(vm_page_t m)
4935 {
4936 pmap_clearbit(m, PG_M_IDX);
4937 }
4938
4939 /*
4940 * pmap_clear_reference:
4941 *
4942 * Clear the reference bit on the specified physical page.
4943 */
4944 void
4945 pmap_clear_reference(vm_page_t m)
4946 {
4947 pmap_clearbit(m, PG_A_IDX);
4948 }
4949
4950 /*
4951 * Miscellaneous support routines follow
4952 */
4953
4954 static
4955 void
4956 i386_protection_init(void)
4957 {
4958 int *kp, prot;
4959
4960 /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */
4961 kp = protection_codes;
4962 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) {
4963 switch (prot) {
4964 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
4965 /*
4966 * Read access is also 0. There isn't any execute bit,
4967 * so just make it readable.
4968 */
4969 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
4970 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
4971 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
4972 *kp++ = 0;
4973 break;
4974 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
4975 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
4976 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
4977 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
4978 *kp++ = pmap_bits_default[PG_RW_IDX];
4979 break;
4980 }
4981 }
4982 }
4983
4984 /*
4985 * Map a set of physical memory pages into the kernel virtual
4986 * address space. Return a pointer to where it is mapped. This
4987 * routine is intended to be used for mapping device memory,
4988 * NOT real memory.
4989 *
4990 * NOTE: We can't use pgeflag unless we invalidate the pages one at
4991 * a time.
4992 *
4993 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE}
4994 * work whether the cpu supports PAT or not. The remaining PAT
4995 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu
4996 * supports PAT.
4997 */
4998 void *
4999 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5000 {
5001 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5002 }
5003
5004 void *
5005 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
5006 {
5007 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5008 }
5009
5010 void *
5011 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5012 {
5013 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5014 }
5015
5016 /*
5017 * Map a set of physical memory pages into the kernel virtual
5018 * address space. Return a pointer to where it is mapped. This
5019 * routine is intended to be used for mapping device memory,
5020 * NOT real memory.
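 *
 * Illustrative usage sketch (hypothetical device address and size,
 * not code from this file):
 *
 *	regs = pmap_mapdev_attr(dev_pa, dev_size, PAT_UNCACHEABLE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, dev_size);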
5021 */
5022 void *
5023 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5024 {
5025 vm_offset_t va, tmpva, offset;
5026 pt_entry_t *pte;
5027 vm_size_t tmpsize;
5028
5029 offset = pa & PAGE_MASK;
5030 size = roundup(offset + size, PAGE_SIZE);
5031
5032 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
5033 if (va == 0)
5034 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5035
5036 pa = pa & ~PAGE_MASK;
5037 for (tmpva = va, tmpsize = size; tmpsize > 0;) {
5038 pte = vtopte(tmpva);
5039 *pte = pa |
5040 kernel_pmap.pmap_bits[PG_RW_IDX] |
5041 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */
5042 kernel_pmap.pmap_cache_bits[mode];
5043 tmpsize -= PAGE_SIZE;
5044 tmpva += PAGE_SIZE;
5045 pa += PAGE_SIZE;
5046 }
5047 pmap_invalidate_range(&kernel_pmap, va, va + size);
5048 pmap_invalidate_cache_range(va, va + size);
5049
5050 return ((void *)(va + offset));
5051 }
5052
5053 void
5054 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5055 {
5056 vm_offset_t base, offset;
5057
5058 base = va & ~PAGE_MASK;
5059 offset = va & PAGE_MASK;
5060 size = roundup(offset + size, PAGE_SIZE);
5061 pmap_qremove(va, size >> PAGE_SHIFT);
5062 kmem_free(&kernel_map, base, size);
5063 }
5064
5065 /*
5066 * Sets the memory attribute for the specified page.
5067 */
5068 void
5069 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5070 {
5071
5072 m->pat_mode = ma;
5073
5074 /*
5075 * If "m" is a normal page, update its direct mapping. This update
5076 * can be relied upon to perform any cache operations that are
5077 * required for data coherence.
5078 */
5079 if ((m->flags & PG_FICTITIOUS) == 0)
5080 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode);
5081 }
5082
5083 /*
5084 * Change the PAT attribute on an existing kernel memory map. Caller
5085 * must ensure that the virtual memory in question is not accessed
5086 * during the adjustment.
5087 */
5088 void
5089 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
5090 {
5091 pt_entry_t *pte;
5092 vm_offset_t base;
5093 int changed = 0;
5094
5095 if (va == 0)
5096 panic("pmap_change_attr: va is NULL");
5097 base = trunc_page(va);
5098
5099 while (count) {
5100 pte = vtopte(va);
5101 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) |
5102 kernel_pmap.pmap_cache_bits[mode];
5103 --count;
5104 va += PAGE_SIZE;
5105 }
5106
5107 changed = 1; /* XXX: not optimal */
5108
5109 /*
5110 * Flush the CPU caches if required, to ensure that no stale data
5111 * remains cached for the adjusted range.
5112 */
5113 if (changed) {
5114 pmap_invalidate_range(&kernel_pmap, base, va);
5115 pmap_invalidate_cache_range(base, va);
5116 }
5117 }
5118
5119 /*
5120 * Perform the pmap work for mincore().
5121 */
5122 int
5123 pmap_mincore(pmap_t pmap, vm_offset_t addr)
5124 {
5125 pt_entry_t *ptep, pte;
5126 vm_page_t m;
5127 int val = 0;
5128
5129 lwkt_gettoken(&pmap->pm_token);
5130 ptep = pmap_pte(pmap, addr);
5131
5132 if (ptep && (pte = *ptep) != 0) {
5133 vm_offset_t pa;
5134
5135 val = MINCORE_INCORE;
5136 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0)
5137 goto done;
5138
5139 pa = pte & PG_FRAME;
5140
5141 if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
5142 m = NULL;
5143 else
5144 m = PHYS_TO_VM_PAGE(pa);
5145
5146 /*
5147 * Modified by us
5148 */
5149 if (pte & pmap->pmap_bits[PG_M_IDX])
5150 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
5151 /*
5152 * Modified by someone
5153 */
5154 else if (m && (m->dirty || pmap_is_modified(m)))
5155 val |= MINCORE_MODIFIED_OTHER;
5156 /*
5157 * Referenced by us
5158 */
5159 if (pte & pmap->pmap_bits[PG_A_IDX])
5160 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
5161
5162 /*
5163 * Referenced by someone
5164 */
5165 else if (m && ((m->flags & PG_REFERENCED) ||
5166 pmap_ts_referenced(m))) {
5167 val |= MINCORE_REFERENCED_OTHER;
5168 vm_page_flag_set(m, PG_REFERENCED);
5169 }
5170 }
5171 done:
5172 lwkt_reltoken(&pmap->pm_token);
5173
5174 return val;
5175 }
5176
5177 /*
5178 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new
5179 * vmspace will be ref'd and the old one will be deref'd.
5180 *
5181 * The vmspace for all lwps associated with the process will be adjusted
5182 * and cr3 will be reloaded if any lwp is the current lwp.
5183 *
5184 * The process must hold the vmspace->vm_map.token for oldvm and newvm.
5185 */
5186 void
5187 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
5188 {
5189 struct vmspace *oldvm;
5190 struct lwp *lp;
5191
5192 oldvm = p->p_vmspace;
5193 if (oldvm != newvm) {
5194 if (adjrefs)
5195 vmspace_ref(newvm);
5196 p->p_vmspace = newvm;
5197 KKASSERT(p->p_nthreads == 1);
5198 lp = RB_ROOT(&p->p_lwp_tree);
5199 pmap_setlwpvm(lp, newvm);
5200 if (adjrefs)
5201 vmspace_rel(oldvm);
5202 }
5203 }
5204
5205 /*
5206 * Set the vmspace for a LWP. The vmspace is almost universally set the
5207 * same as the process vmspace, but virtual kernels need to swap out contexts
5208 * on a per-lwp basis.
5209 *
5210 * Caller does not necessarily hold any vmspace tokens. Caller must control
5211 * the lwp (typically be in the context of the lwp). We use a critical
5212 * section to protect against statclock and hardclock (statistics collection).
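 *
 * Note that for EPT-backed (virtual kernel) pmaps %cr3 is loaded
 * with the host kernel page table (KPML4phys); the guest's own
 * mappings are translated by the EPT hardware rather than via %cr3.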
5213 */
5214 void
5215 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
5216 {
5217 struct vmspace *oldvm;
5218 struct pmap *pmap;
5219
5220 oldvm = lp->lwp_vmspace;
5221
5222 if (oldvm != newvm) {
5223 crit_enter();
5224 lp->lwp_vmspace = newvm;
5225 if (curthread->td_lwp == lp) {
5226 pmap = vmspace_pmap(newvm);
5227 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid);
5228 if (pmap->pm_active_lock & CPULOCK_EXCL)
5229 pmap_interlock_wait(newvm);
5230 #if defined(SWTCH_OPTIM_STATS)
5231 tlb_flush_count++;
5232 #endif
5233 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) {
5234 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
5235 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) {
5236 curthread->td_pcb->pcb_cr3 = KPML4phys;
5237 } else {
5238 panic("pmap_setlwpvm: unknown pmap type\n");
5239 }
5240 load_cr3(curthread->td_pcb->pcb_cr3);
5241 pmap = vmspace_pmap(oldvm);
5242 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active,
5243 mycpu->gd_cpuid);
5244 }
5245 crit_exit();
5246 }
5247 }
5248
5249 /*
5250 * Called when switching to a locked pmap, used to interlock against pmaps
5251 * undergoing modifications to prevent us from activating the MMU for the
5252 * target pmap until all such modifications have completed. We have to do
5253 * this because the thread making the modifications has already set up its
5254 * SMP synchronization mask.
5255 *
5256 * This function cannot sleep!
5257 *
5258 * No requirements.
5259 */
5260 void
5261 pmap_interlock_wait(struct vmspace *vm)
5262 {
5263 struct pmap *pmap = &vm->vm_pmap;
5264
5265 if (pmap->pm_active_lock & CPULOCK_EXCL) {
5266 crit_enter();
5267 KKASSERT(curthread->td_critcount >= 2);
5268 DEBUG_PUSH_INFO("pmap_interlock_wait");
5269 while (pmap->pm_active_lock & CPULOCK_EXCL) {
5270 cpu_ccfence();
5271 lwkt_process_ipiq();
5272 }
5273 DEBUG_POP_INFO();
5274 crit_exit();
5275 }
5276 }
5277
5278 vm_offset_t
5279 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
5280 {
5281
5282 if ((obj == NULL) || (size < NBPDR) ||
5283 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
5284 return addr;
5285 }
5286
5287 addr = roundup2(addr, NBPDR);
5288 return addr;
5289 }
5290
5291 /*
5292 * Used by kmalloc/kfree, page already exists at va
5293 */
5294 vm_page_t
5295 pmap_kvtom(vm_offset_t va)
5296 {
5297 pt_entry_t *ptep = vtopte(va);
5298
5299 KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0);
5300 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
5301 }
5302
5303 /*
5304 * Initialize machine-specific shared page directory support. This
5305 * is executed when a VM object is created.
5306 */
5307 void
5308 pmap_object_init(vm_object_t object)
5309 {
5310 object->md.pmap_rw = NULL;
5311 object->md.pmap_ro = NULL;
5312 }
5313
5314 /*
5315 * Clean up machine-specific shared page directory support. This
5316 * is executed when a VM object is destroyed.
5317 */
5318 void
5319 pmap_object_free(vm_object_t object)
5320 {
5321 pmap_t pmap;
5322
5323 if ((pmap = object->md.pmap_rw) != NULL) {
5324 object->md.pmap_rw = NULL;
5325 pmap_remove_noinval(pmap,
5326 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
5327 CPUMASK_ASSZERO(pmap->pm_active);
5328 pmap_release(pmap);
5329 pmap_puninit(pmap);
5330 kfree(pmap, M_OBJPMAP);
5331 }
5332 if ((pmap = object->md.pmap_ro) != NULL) {
5333 object->md.pmap_ro = NULL;
5334 pmap_remove_noinval(pmap,
5335 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
5336 CPUMASK_ASSZERO(pmap->pm_active);
5337 pmap_release(pmap);
5338 pmap_puninit(pmap);
5339 kfree(pmap, M_OBJPMAP);
5340 }
5341 }
5342