1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * Copyright (c) 2011-2012 Matthew Dillon 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 /* 45 * Manage physical address maps for x86-64 systems. 
46 */ 47 48 #if JG 49 #include "opt_disable_pse.h" 50 #include "opt_pmap.h" 51 #endif 52 #include "opt_msgbuf.h" 53 54 #include <sys/param.h> 55 #include <sys/systm.h> 56 #include <sys/kernel.h> 57 #include <sys/proc.h> 58 #include <sys/msgbuf.h> 59 #include <sys/vmmeter.h> 60 #include <sys/mman.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/user.h> 76 #include <sys/thread2.h> 77 #include <sys/sysref2.h> 78 #include <sys/spinlock2.h> 79 #include <vm/vm_page2.h> 80 81 #include <machine/cputypes.h> 82 #include <machine/md_var.h> 83 #include <machine/specialreg.h> 84 #include <machine/smp.h> 85 #include <machine_base/apic/apicreg.h> 86 #include <machine/globaldata.h> 87 #include <machine/pmap.h> 88 #include <machine/pmap_inval.h> 89 #include <machine/inttypes.h> 90 91 #include <ddb/ddb.h> 92 93 #define PMAP_KEEP_PDIRS 94 #ifndef PMAP_SHPGPERPROC 95 #define PMAP_SHPGPERPROC 2000 96 #endif 97 98 #if defined(DIAGNOSTIC) 99 #define PMAP_DIAGNOSTIC 100 #endif 101 102 #define MINPV 2048 103 104 /* 105 * pmap debugging will report who owns a pv lock when blocking. 106 */ 107 #ifdef PMAP_DEBUG 108 109 #define PMAP_DEBUG_DECL ,const char *func, int lineno 110 #define PMAP_DEBUG_ARGS , __func__, __LINE__ 111 #define PMAP_DEBUG_COPY , func, lineno 112 113 #define pv_get(pmap, pindex) _pv_get(pmap, pindex \ 114 PMAP_DEBUG_ARGS) 115 #define pv_lock(pv) _pv_lock(pv \ 116 PMAP_DEBUG_ARGS) 117 #define pv_hold_try(pv) _pv_hold_try(pv \ 118 PMAP_DEBUG_ARGS) 119 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \ 120 PMAP_DEBUG_ARGS) 121 122 #else 123 124 #define PMAP_DEBUG_DECL 125 #define PMAP_DEBUG_ARGS 126 #define PMAP_DEBUG_COPY 127 128 #define pv_get(pmap, pindex) _pv_get(pmap, pindex) 129 #define pv_lock(pv) _pv_lock(pv) 130 #define pv_hold_try(pv) _pv_hold_try(pv) 131 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp) 132 133 #endif 134 135 /* 136 * Get PDEs and PTEs for user/kernel address space 137 */ 138 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 139 140 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & PG_V) != 0) 141 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & PG_W) != 0) 142 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & PG_M) != 0) 143 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & PG_A) != 0) 144 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & PG_V) != 0) 145 146 /* 147 * Given a map and a machine independent protection code, 148 * convert to a vax protection code. 
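 *
 * protection_codes[] is filled in by i386_protection_init() and is
 * indexed by the low three VM_PROT_* bits only.  A hypothetical use
 * (sketch, not taken verbatim from this file) would look like:
 *
 *	newpte = pa | pte_prot(pmap, VM_PROT_READ | VM_PROT_WRITE) | PG_V;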
149 */ 150 #define pte_prot(m, p) \ 151 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 152 static int protection_codes[8]; 153 154 struct pmap kernel_pmap; 155 static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list); 156 157 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects"); 158 159 vm_paddr_t avail_start; /* PA of first available physical page */ 160 vm_paddr_t avail_end; /* PA of last available physical page */ 161 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 162 vm_offset_t virtual2_end; 163 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 164 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 165 vm_offset_t KvaStart; /* VA start of KVA space */ 166 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 167 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 168 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 169 static int pgeflag; /* PG_G or-in */ 170 static int pseflag; /* PG_PS or-in */ 171 uint64_t PatMsr; 172 173 static int ndmpdp; 174 static vm_paddr_t dmaplimit; 175 static int nkpt; 176 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 177 178 #define PAT_INDEX_SIZE 8 179 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 180 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */ 181 182 static uint64_t KPTbase; 183 static uint64_t KPTphys; 184 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 185 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 186 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 187 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 188 189 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 190 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 191 192 /* 193 * Data for the pv entry allocation mechanism 194 */ 195 static vm_zone_t pvzone; 196 static struct vm_zone pvzone_store; 197 static struct vm_object pvzone_obj; 198 static int pv_entry_max=0, pv_entry_high_water=0; 199 static int pmap_pagedaemon_waken = 0; 200 static struct pv_entry *pvinit; 201 202 /* 203 * All those kernel PT submaps that BSD is so fond of 204 */ 205 pt_entry_t *CMAP1 = NULL, *ptmmap; 206 caddr_t CADDR1 = NULL, ptvmmap = NULL; 207 static pt_entry_t *msgbufmap; 208 struct msgbuf *msgbufp=NULL; 209 210 /* 211 * Crashdump maps. 
212 */ 213 static pt_entry_t *pt_crashdumpmap; 214 static caddr_t crashdumpmap; 215 216 static int pmap_yield_count = 64; 217 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW, 218 &pmap_yield_count, 0, "Yield during init_pt/release"); 219 static int pmap_mmu_optimize = 0; 220 SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW, 221 &pmap_mmu_optimize, 0, "Share page table pages when possible"); 222 223 #define DISABLE_PSE 224 225 static void pv_hold(pv_entry_t pv); 226 static int _pv_hold_try(pv_entry_t pv 227 PMAP_DEBUG_DECL); 228 static void pv_drop(pv_entry_t pv); 229 static void _pv_lock(pv_entry_t pv 230 PMAP_DEBUG_DECL); 231 static void pv_unlock(pv_entry_t pv); 232 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew 233 PMAP_DEBUG_DECL); 234 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex 235 PMAP_DEBUG_DECL); 236 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp); 237 static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex); 238 static void pv_put(pv_entry_t pv); 239 static void pv_free(pv_entry_t pv); 240 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); 241 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 242 pv_entry_t *pvpp); 243 static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, 244 pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va); 245 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, 246 struct pmap_inval_info *info); 247 static vm_page_t pmap_remove_pv_page(pv_entry_t pv); 248 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp); 249 250 struct pmap_scan_info; 251 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 252 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 253 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 254 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 255 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 256 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 257 258 static void i386_protection_init (void); 259 static void create_pagetables(vm_paddr_t *firstaddr); 260 static void pmap_remove_all (vm_page_t m); 261 static boolean_t pmap_testbit (vm_page_t m, int bit); 262 263 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va); 264 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 265 266 static unsigned pdir4mb; 267 268 static int 269 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 270 { 271 if (pv1->pv_pindex < pv2->pv_pindex) 272 return(-1); 273 if (pv1->pv_pindex > pv2->pv_pindex) 274 return(1); 275 return(0); 276 } 277 278 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 279 pv_entry_compare, vm_pindex_t, pv_pindex); 280 281 /* 282 * Move the kernel virtual free pointer to the next 283 * 2MB. This is used to help improve performance 284 * by using a large (2MB) page for much of the kernel 285 * (.text, .data, .bss) 286 */ 287 static 288 vm_offset_t 289 pmap_kmem_choose(vm_offset_t addr) 290 { 291 vm_offset_t newaddr = addr; 292 293 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 294 return newaddr; 295 } 296 297 /* 298 * pmap_pte_quick: 299 * 300 * Super fast pmap_pte routine best used when scanning the pv lists. 301 * This eliminates many course-grained invltlb calls. Note that many of 302 * the pv list scans are across different pmaps and it is very wasteful 303 * to do an entire invltlb when checking a single mapping. 
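 *
 * In this x86-64 implementation the "quick" variant simply forwards to
 * pmap_pte(); page table pages are reached through the direct map
 * (PHYS_TO_DMAP), so no temporary per-cpu mapping window is needed and
 * the two routines are equivalent here.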
304 */ 305 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 306 307 static 308 pt_entry_t * 309 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 310 { 311 return pmap_pte(pmap, va); 312 } 313 314 /* 315 * Returns the pindex of a page table entry (representing a terminal page). 316 * There are NUPTE_TOTAL page table entries possible (a huge number) 317 * 318 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 319 * We want to properly translate negative KVAs. 320 */ 321 static __inline 322 vm_pindex_t 323 pmap_pte_pindex(vm_offset_t va) 324 { 325 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 326 } 327 328 /* 329 * Returns the pindex of a page table. 330 */ 331 static __inline 332 vm_pindex_t 333 pmap_pt_pindex(vm_offset_t va) 334 { 335 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 336 } 337 338 /* 339 * Returns the pindex of a page directory. 340 */ 341 static __inline 342 vm_pindex_t 343 pmap_pd_pindex(vm_offset_t va) 344 { 345 return (NUPTE_TOTAL + NUPT_TOTAL + 346 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 347 } 348 349 static __inline 350 vm_pindex_t 351 pmap_pdp_pindex(vm_offset_t va) 352 { 353 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 354 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 355 } 356 357 static __inline 358 vm_pindex_t 359 pmap_pml4_pindex(void) 360 { 361 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 362 } 363 364 /* 365 * Return various clipped indexes for a given VA 366 * 367 * Returns the index of a pte in a page table, representing a terminal 368 * page. 369 */ 370 static __inline 371 vm_pindex_t 372 pmap_pte_index(vm_offset_t va) 373 { 374 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 375 } 376 377 /* 378 * Returns the index of a pt in a page directory, representing a page 379 * table. 380 */ 381 static __inline 382 vm_pindex_t 383 pmap_pt_index(vm_offset_t va) 384 { 385 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 386 } 387 388 /* 389 * Returns the index of a pd in a page directory page, representing a page 390 * directory. 391 */ 392 static __inline 393 vm_pindex_t 394 pmap_pd_index(vm_offset_t va) 395 { 396 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 397 } 398 399 /* 400 * Returns the index of a pdp in the pml4 table, representing a page 401 * directory page. 402 */ 403 static __inline 404 vm_pindex_t 405 pmap_pdp_index(vm_offset_t va) 406 { 407 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 408 } 409 410 /* 411 * Generic procedure to index a pte from a pt, pd, or pdp. 412 * 413 * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT 414 * a page table page index but is instead of PV lookup index. 415 */ 416 static 417 void * 418 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) 419 { 420 pt_entry_t *pte; 421 422 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); 423 return(&pte[pindex]); 424 } 425 426 /* 427 * Return pointer to PDP slot in the PML4 428 */ 429 static __inline 430 pml4_entry_t * 431 pmap_pdp(pmap_t pmap, vm_offset_t va) 432 { 433 return (&pmap->pm_pml4[pmap_pdp_index(va)]); 434 } 435 436 /* 437 * Return pointer to PD slot in the PDP given a pointer to the PDP 438 */ 439 static __inline 440 pdp_entry_t * 441 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va) 442 { 443 pdp_entry_t *pd; 444 445 pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME); 446 return (&pd[pmap_pd_index(va)]); 447 } 448 449 /* 450 * Return pointer to PD slot in the PDP. 
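 *
 * pmap_pd(), pmap_pt() and pmap_pte() walk successive levels of the
 * PML4/PDP/PD/PT hierarchy and return NULL as soon as the entry one
 * level up is not valid (PG_V clear), so a caller can probe a va
 * roughly like this (sketch):
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte == NULL || (*pte & PG_V) == 0)
 *		-- no valid mapping at va --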
451 */ 452 static __inline 453 pdp_entry_t * 454 pmap_pd(pmap_t pmap, vm_offset_t va) 455 { 456 pml4_entry_t *pdp; 457 458 pdp = pmap_pdp(pmap, va); 459 if ((*pdp & PG_V) == 0) 460 return NULL; 461 return (pmap_pdp_to_pd(*pdp, va)); 462 } 463 464 /* 465 * Return pointer to PT slot in the PD given a pointer to the PD 466 */ 467 static __inline 468 pd_entry_t * 469 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 470 { 471 pd_entry_t *pt; 472 473 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 474 return (&pt[pmap_pt_index(va)]); 475 } 476 477 /* 478 * Return pointer to PT slot in the PD 479 * 480 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 481 * so we cannot lookup the PD via the PDP. Instead we 482 * must look it up via the pmap. 483 */ 484 static __inline 485 pd_entry_t * 486 pmap_pt(pmap_t pmap, vm_offset_t va) 487 { 488 pdp_entry_t *pd; 489 pv_entry_t pv; 490 vm_pindex_t pd_pindex; 491 492 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 493 pd_pindex = pmap_pd_pindex(va); 494 spin_lock(&pmap->pm_spin); 495 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 496 spin_unlock(&pmap->pm_spin); 497 if (pv == NULL || pv->pv_m == NULL) 498 return NULL; 499 return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va)); 500 } else { 501 pd = pmap_pd(pmap, va); 502 if (pd == NULL || (*pd & PG_V) == 0) 503 return NULL; 504 return (pmap_pd_to_pt(*pd, va)); 505 } 506 } 507 508 /* 509 * Return pointer to PTE slot in the PT given a pointer to the PT 510 */ 511 static __inline 512 pt_entry_t * 513 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 514 { 515 pt_entry_t *pte; 516 517 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 518 return (&pte[pmap_pte_index(va)]); 519 } 520 521 /* 522 * Return pointer to PTE slot in the PT 523 */ 524 static __inline 525 pt_entry_t * 526 pmap_pte(pmap_t pmap, vm_offset_t va) 527 { 528 pd_entry_t *pt; 529 530 pt = pmap_pt(pmap, va); 531 if (pt == NULL || (*pt & PG_V) == 0) 532 return NULL; 533 if ((*pt & PG_PS) != 0) 534 return ((pt_entry_t *)pt); 535 return (pmap_pt_to_pte(*pt, va)); 536 } 537 538 /* 539 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 540 * the PT layer. This will speed up core pmap operations considerably. 541 */ 542 static __inline 543 void 544 pv_cache(pv_entry_t pv, vm_pindex_t pindex) 545 { 546 if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0)) 547 pv->pv_pmap->pm_pvhint = pv; 548 } 549 550 551 /* 552 * KVM - return address of PT slot in PD 553 */ 554 static __inline 555 pd_entry_t * 556 vtopt(vm_offset_t va) 557 { 558 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 559 NPML4EPGSHIFT)) - 1); 560 561 return (PDmap + ((va >> PDRSHIFT) & mask)); 562 } 563 564 /* 565 * KVM - return address of PTE slot in PT 566 */ 567 static __inline 568 pt_entry_t * 569 vtopte(vm_offset_t va) 570 { 571 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 572 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 573 574 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 575 } 576 577 static uint64_t 578 allocpages(vm_paddr_t *firstaddr, long n) 579 { 580 uint64_t ret; 581 582 ret = *firstaddr; 583 bzero((void *)ret, n * PAGE_SIZE); 584 *firstaddr += n * PAGE_SIZE; 585 return (ret); 586 } 587 588 static 589 void 590 create_pagetables(vm_paddr_t *firstaddr) 591 { 592 long i; /* must be 64 bits */ 593 long nkpt_base; 594 long nkpt_phys; 595 int j; 596 597 /* 598 * We are running (mostly) V=P at this point 599 * 600 * Calculate NKPT - number of kernel page tables. 
We have to 601 * accomodoate prealloction of the vm_page_array, dump bitmap, 602 * MSGBUF_SIZE, and other stuff. Be generous. 603 * 604 * Maxmem is in pages. 605 * 606 * ndmpdp is the number of 1GB pages we wish to map. 607 */ 608 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 609 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 610 ndmpdp = 4; 611 KKASSERT(ndmpdp <= NKPDPE * NPDEPG); 612 613 /* 614 * Starting at the beginning of kvm (not KERNBASE). 615 */ 616 nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR; 617 nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR; 618 nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E + 619 ndmpdp) + 511) / 512; 620 nkpt_phys += 128; 621 622 /* 623 * Starting at KERNBASE - map 2G worth of page table pages. 624 * KERNBASE is offset -2G from the end of kvm. 625 */ 626 nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */ 627 628 /* 629 * Allocate pages 630 */ 631 KPTbase = allocpages(firstaddr, nkpt_base); 632 KPTphys = allocpages(firstaddr, nkpt_phys); 633 KPML4phys = allocpages(firstaddr, 1); 634 KPDPphys = allocpages(firstaddr, NKPML4E); 635 KPDphys = allocpages(firstaddr, NKPDPE); 636 637 /* 638 * Calculate the page directory base for KERNBASE, 639 * that is where we start populating the page table pages. 640 * Basically this is the end - 2. 641 */ 642 KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT); 643 644 DMPDPphys = allocpages(firstaddr, NDMPML4E); 645 if ((amd_feature & AMDID_PAGE1GB) == 0) 646 DMPDphys = allocpages(firstaddr, ndmpdp); 647 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 648 649 /* 650 * Fill in the underlying page table pages for the area around 651 * KERNBASE. This remaps low physical memory to KERNBASE. 652 * 653 * Read-only from zero to physfree 654 * XXX not fully used, underneath 2M pages 655 */ 656 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 657 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 658 ((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G; 659 } 660 661 /* 662 * Now map the initial kernel page tables. One block of page 663 * tables is placed at the beginning of kernel virtual memory, 664 * and another block is placed at KERNBASE to map the kernel binary, 665 * data, bss, and initial pre-allocations. 666 */ 667 for (i = 0; i < nkpt_base; i++) { 668 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 669 ((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V; 670 } 671 for (i = 0; i < nkpt_phys; i++) { 672 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 673 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; 674 } 675 676 /* 677 * Map from zero to end of allocations using 2M pages as an 678 * optimization. This will bypass some of the KPTBase pages 679 * above in the KERNBASE area. 680 */ 681 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 682 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 683 ((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G; 684 } 685 686 /* 687 * And connect up the PD to the PDP. The kernel pmap is expected 688 * to pre-populate all of its PDs. See NKPDPE in vmparam.h. 689 */ 690 for (i = 0; i < NKPDPE; i++) { 691 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] = 692 KPDphys + (i << PAGE_SHIFT); 693 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |= 694 PG_RW | PG_V | PG_U; 695 } 696 697 /* 698 * Now set up the direct map space using either 2MB or 1GB pages 699 * Preset PG_M and PG_A because demotion expects it. 
700 * 701 * When filling in entries in the PD pages make sure any excess 702 * entries are set to zero as we allocated enough PD pages 703 */ 704 if ((amd_feature & AMDID_PAGE1GB) == 0) { 705 for (i = 0; i < NPDEPG * ndmpdp; i++) { 706 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 707 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | 708 PG_G | PG_M | PG_A; 709 } 710 711 /* 712 * And the direct map space's PDP 713 */ 714 for (i = 0; i < ndmpdp; i++) { 715 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 716 (i << PAGE_SHIFT); 717 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; 718 } 719 } else { 720 for (i = 0; i < ndmpdp; i++) { 721 ((pdp_entry_t *)DMPDPphys)[i] = 722 (vm_paddr_t)i << PDPSHIFT; 723 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | 724 PG_G | PG_M | PG_A; 725 } 726 } 727 728 /* And recursively map PML4 to itself in order to get PTmap */ 729 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 730 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; 731 732 /* 733 * Connect the Direct Map slots up to the PML4 734 */ 735 for (j = 0; j < NDMPML4E; ++j) { 736 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 737 (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) | 738 PG_RW | PG_V | PG_U; 739 } 740 741 /* 742 * Connect the KVA slot up to the PML4 743 */ 744 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 745 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; 746 } 747 748 /* 749 * Bootstrap the system enough to run with virtual memory. 750 * 751 * On the i386 this is called after mapping has already been enabled 752 * and just syncs the pmap module with what has already been done. 753 * [We can't call it easily with mapping off since the kernel is not 754 * mapped with PA == VA, hence we would have to relocate every address 755 * from the linked base (virtual) address "KERNBASE" to the actual 756 * (physical) address starting relative to 0] 757 */ 758 void 759 pmap_bootstrap(vm_paddr_t *firstaddr) 760 { 761 vm_offset_t va; 762 pt_entry_t *pte; 763 764 KvaStart = VM_MIN_KERNEL_ADDRESS; 765 KvaEnd = VM_MAX_KERNEL_ADDRESS; 766 KvaSize = KvaEnd - KvaStart; 767 768 avail_start = *firstaddr; 769 770 /* 771 * Create an initial set of page tables to run the kernel in. 772 */ 773 create_pagetables(firstaddr); 774 775 virtual2_start = KvaStart; 776 virtual2_end = PTOV_OFFSET; 777 778 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 779 virtual_start = pmap_kmem_choose(virtual_start); 780 781 virtual_end = VM_MAX_KERNEL_ADDRESS; 782 783 /* XXX do %cr0 as well */ 784 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 785 load_cr3(KPML4phys); 786 787 /* 788 * Initialize protection array. 789 */ 790 i386_protection_init(); 791 792 /* 793 * The kernel's pmap is statically allocated so we don't have to use 794 * pmap_create, which is unlikely to work correctly at this part of 795 * the boot sequence (XXX and which no longer exists). 796 */ 797 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 798 kernel_pmap.pm_count = 1; 799 kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK; 800 RB_INIT(&kernel_pmap.pm_pvroot); 801 spin_init(&kernel_pmap.pm_spin); 802 lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok"); 803 804 /* 805 * Reserve some special page table entries/VA space for temporary 806 * mapping of pages. 807 */ 808 #define SYSMAP(c, p, v, n) \ 809 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 810 811 va = virtual_start; 812 pte = vtopte(va); 813 814 /* 815 * CMAP1/CMAP2 are used for zeroing and copying pages. 
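 *
 * Each SYSMAP() below consumes (n) pages of KVA starting at the running
 * 'va' cursor and records the matching PTE pointer.  Expanded by hand,
 * the CMAP1 case is roughly:
 *
 *	CADDR1 = (caddr_t)va;	va += 1 * PAGE_SIZE;
 *	CMAP1  = pte;		pte += 1;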
816 */ 817 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 818 819 /* 820 * Crashdump maps. 821 */ 822 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 823 824 /* 825 * ptvmmap is used for reading arbitrary physical pages via 826 * /dev/mem. 827 */ 828 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 829 830 /* 831 * msgbufp is used to map the system message buffer. 832 * XXX msgbufmap is not used. 833 */ 834 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 835 atop(round_page(MSGBUF_SIZE))) 836 837 virtual_start = va; 838 839 *CMAP1 = 0; 840 841 /* 842 * PG_G is terribly broken on SMP because we IPI invltlb's in some 843 * cases rather then invl1pg. Actually, I don't even know why it 844 * works under UP because self-referential page table mappings 845 */ 846 pgeflag = 0; 847 848 /* 849 * Initialize the 4MB page size flag 850 */ 851 pseflag = 0; 852 /* 853 * The 4MB page version of the initial 854 * kernel page mapping. 855 */ 856 pdir4mb = 0; 857 858 #if !defined(DISABLE_PSE) 859 if (cpu_feature & CPUID_PSE) { 860 pt_entry_t ptditmp; 861 /* 862 * Note that we have enabled PSE mode 863 */ 864 pseflag = PG_PS; 865 ptditmp = *(PTmap + x86_64_btop(KERNBASE)); 866 ptditmp &= ~(NBPDR - 1); 867 ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; 868 pdir4mb = ptditmp; 869 } 870 #endif 871 cpu_invltlb(); 872 873 /* Initialize the PAT MSR */ 874 pmap_init_pat(); 875 } 876 877 /* 878 * Setup the PAT MSR. 879 */ 880 void 881 pmap_init_pat(void) 882 { 883 uint64_t pat_msr; 884 u_long cr0, cr4; 885 886 /* 887 * Default values mapping PATi,PCD,PWT bits at system reset. 888 * The default values effectively ignore the PATi bit by 889 * repeating the encodings for 0-3 in 4-7, and map the PCD 890 * and PWT bit combinations to the expected PAT types. 891 */ 892 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 893 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 894 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 895 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 896 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 897 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 898 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 899 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 900 pat_pte_index[PAT_WRITE_BACK] = 0; 901 pat_pte_index[PAT_WRITE_THROUGH]= 0 | PG_NC_PWT; 902 pat_pte_index[PAT_UNCACHED] = PG_NC_PCD; 903 pat_pte_index[PAT_UNCACHEABLE] = PG_NC_PCD | PG_NC_PWT; 904 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 905 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 906 907 if (cpu_feature & CPUID_PAT) { 908 /* 909 * If we support the PAT then set-up entries for 910 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 911 * 4 and 5. 912 */ 913 pat_msr = (pat_msr & ~PAT_MASK(4)) | 914 PAT_VALUE(4, PAT_WRITE_PROTECTED); 915 pat_msr = (pat_msr & ~PAT_MASK(5)) | 916 PAT_VALUE(5, PAT_WRITE_COMBINING); 917 pat_pte_index[PAT_WRITE_PROTECTED] = PG_PTE_PAT | 0; 918 pat_pte_index[PAT_WRITE_COMBINING] = PG_PTE_PAT | PG_NC_PWT; 919 920 /* 921 * Then enable the PAT 922 */ 923 924 /* Disable PGE. */ 925 cr4 = rcr4(); 926 load_cr4(cr4 & ~CR4_PGE); 927 928 /* Disable caches (CD = 1, NW = 0). */ 929 cr0 = rcr0(); 930 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 931 932 /* Flushes caches and TLBs. */ 933 wbinvd(); 934 cpu_invltlb(); 935 936 /* Update PAT and index table. */ 937 wrmsr(MSR_PAT, pat_msr); 938 939 /* Flush caches and TLBs again. */ 940 wbinvd(); 941 cpu_invltlb(); 942 943 /* Restore caches and PGE. 
*/ 944 load_cr0(cr0); 945 load_cr4(cr4); 946 PatMsr = pat_msr; 947 } 948 } 949 950 /* 951 * Set 4mb pdir for mp startup 952 */ 953 void 954 pmap_set_opt(void) 955 { 956 if (pseflag && (cpu_feature & CPUID_PSE)) { 957 load_cr4(rcr4() | CR4_PSE); 958 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */ 959 cpu_invltlb(); 960 } 961 } 962 } 963 964 /* 965 * Initialize the pmap module. 966 * Called by vm_init, to initialize any structures that the pmap 967 * system needs to map virtual memory. 968 * pmap_init has been enhanced to support in a fairly consistant 969 * way, discontiguous physical memory. 970 */ 971 void 972 pmap_init(void) 973 { 974 int i; 975 int initial_pvs; 976 977 /* 978 * Allocate memory for random pmap data structures. Includes the 979 * pv_head_table. 980 */ 981 982 for (i = 0; i < vm_page_array_size; i++) { 983 vm_page_t m; 984 985 m = &vm_page_array[i]; 986 TAILQ_INIT(&m->md.pv_list); 987 } 988 989 /* 990 * init the pv free list 991 */ 992 initial_pvs = vm_page_array_size; 993 if (initial_pvs < MINPV) 994 initial_pvs = MINPV; 995 pvzone = &pvzone_store; 996 pvinit = (void *)kmem_alloc(&kernel_map, 997 initial_pvs * sizeof (struct pv_entry)); 998 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 999 pvinit, initial_pvs); 1000 1001 /* 1002 * Now it is safe to enable pv_table recording. 1003 */ 1004 pmap_initialized = TRUE; 1005 } 1006 1007 /* 1008 * Initialize the address space (zone) for the pv_entries. Set a 1009 * high water mark so that the system can recover from excessive 1010 * numbers of pv entries. 1011 */ 1012 void 1013 pmap_init2(void) 1014 { 1015 int shpgperproc = PMAP_SHPGPERPROC; 1016 int entry_max; 1017 1018 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1019 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 1020 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1021 pv_entry_high_water = 9 * (pv_entry_max / 10); 1022 1023 /* 1024 * Subtract out pages already installed in the zone (hack) 1025 */ 1026 entry_max = pv_entry_max - vm_page_array_size; 1027 if (entry_max <= 0) 1028 entry_max = 1; 1029 1030 zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1); 1031 } 1032 1033 1034 /*************************************************** 1035 * Low level helper routines..... 1036 ***************************************************/ 1037 1038 /* 1039 * this routine defines the region(s) of memory that should 1040 * not be tested for the modified bit. 1041 */ 1042 static __inline 1043 int 1044 pmap_track_modified(vm_pindex_t pindex) 1045 { 1046 vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT; 1047 if ((va < clean_sva) || (va >= clean_eva)) 1048 return 1; 1049 else 1050 return 0; 1051 } 1052 1053 /* 1054 * Extract the physical page address associated with the map/VA pair. 1055 * The page must be wired for this to work reliably. 1056 * 1057 * XXX for the moment we're using pv_find() instead of pv_get(), as 1058 * callers might be expecting non-blocking operation. 
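 *
 * A return value of 0 indicates that no valid mapping was found; a
 * typical caller does something like (sketch):
 *
 *	pa = pmap_extract(pmap, va);
 *	if (pa == 0)
 *		-- handle the unmapped case --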
1059 */ 1060 vm_paddr_t 1061 pmap_extract(pmap_t pmap, vm_offset_t va) 1062 { 1063 vm_paddr_t rtval; 1064 pv_entry_t pt_pv; 1065 pt_entry_t *ptep; 1066 1067 rtval = 0; 1068 if (va >= VM_MAX_USER_ADDRESS) { 1069 /* 1070 * Kernel page directories might be direct-mapped and 1071 * there is typically no PV tracking of pte's 1072 */ 1073 pd_entry_t *pt; 1074 1075 pt = pmap_pt(pmap, va); 1076 if (pt && (*pt & PG_V)) { 1077 if (*pt & PG_PS) { 1078 rtval = *pt & PG_PS_FRAME; 1079 rtval |= va & PDRMASK; 1080 } else { 1081 ptep = pmap_pt_to_pte(*pt, va); 1082 if (*pt & PG_V) { 1083 rtval = *ptep & PG_FRAME; 1084 rtval |= va & PAGE_MASK; 1085 } 1086 } 1087 } 1088 } else { 1089 /* 1090 * User pages currently do not direct-map the page directory 1091 * and some pages might not used managed PVs. But all PT's 1092 * will have a PV. 1093 */ 1094 pt_pv = pv_find(pmap, pmap_pt_pindex(va)); 1095 if (pt_pv) { 1096 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1097 if (*ptep & PG_V) { 1098 rtval = *ptep & PG_FRAME; 1099 rtval |= va & PAGE_MASK; 1100 } 1101 pv_drop(pt_pv); 1102 } 1103 } 1104 return rtval; 1105 } 1106 1107 /* 1108 * Extract the physical page address associated kernel virtual address. 1109 */ 1110 vm_paddr_t 1111 pmap_kextract(vm_offset_t va) 1112 { 1113 pd_entry_t pt; /* pt entry in pd */ 1114 vm_paddr_t pa; 1115 1116 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1117 pa = DMAP_TO_PHYS(va); 1118 } else { 1119 pt = *vtopt(va); 1120 if (pt & PG_PS) { 1121 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1122 } else { 1123 /* 1124 * Beware of a concurrent promotion that changes the 1125 * PDE at this point! For example, vtopte() must not 1126 * be used to access the PTE because it would use the 1127 * new PDE. It is, however, safe to use the old PDE 1128 * because the page table page is preserved by the 1129 * promotion. 1130 */ 1131 pa = *pmap_pt_to_pte(pt, va); 1132 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1133 } 1134 } 1135 return pa; 1136 } 1137 1138 /*************************************************** 1139 * Low level mapping routines..... 1140 ***************************************************/ 1141 1142 /* 1143 * Routine: pmap_kenter 1144 * Function: 1145 * Add a wired page to the KVA 1146 * NOTE! note that in order for the mapping to take effect -- you 1147 * should do an invltlb after doing the pmap_kenter(). 1148 */ 1149 void 1150 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1151 { 1152 pt_entry_t *pte; 1153 pt_entry_t npte; 1154 pmap_inval_info info; 1155 1156 pmap_inval_init(&info); /* XXX remove */ 1157 npte = pa | PG_RW | PG_V | pgeflag; 1158 pte = vtopte(va); 1159 pmap_inval_interlock(&info, &kernel_pmap, va); /* XXX remove */ 1160 *pte = npte; 1161 pmap_inval_deinterlock(&info, &kernel_pmap); /* XXX remove */ 1162 pmap_inval_done(&info); /* XXX remove */ 1163 } 1164 1165 /* 1166 * Routine: pmap_kenter_quick 1167 * Function: 1168 * Similar to pmap_kenter(), except we only invalidate the 1169 * mapping on the current CPU. 
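 *
 * If the mapping must later become visible to other cpus the caller can
 * follow up with pmap_kenter_sync(), which runs the SMP invalidation
 * interlock for the address, e.g. (sketch):
 *
 *	pmap_kenter_quick(va, pa);	-- invalidates on this cpu only
 *	pmap_kenter_sync(va);		-- interlocks/invalidates everywhere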
1170 */ 1171 void 1172 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1173 { 1174 pt_entry_t *pte; 1175 pt_entry_t npte; 1176 1177 npte = pa | PG_RW | PG_V | pgeflag; 1178 pte = vtopte(va); 1179 *pte = npte; 1180 cpu_invlpg((void *)va); 1181 } 1182 1183 void 1184 pmap_kenter_sync(vm_offset_t va) 1185 { 1186 pmap_inval_info info; 1187 1188 pmap_inval_init(&info); 1189 pmap_inval_interlock(&info, &kernel_pmap, va); 1190 pmap_inval_deinterlock(&info, &kernel_pmap); 1191 pmap_inval_done(&info); 1192 } 1193 1194 void 1195 pmap_kenter_sync_quick(vm_offset_t va) 1196 { 1197 cpu_invlpg((void *)va); 1198 } 1199 1200 /* 1201 * remove a page from the kernel pagetables 1202 */ 1203 void 1204 pmap_kremove(vm_offset_t va) 1205 { 1206 pt_entry_t *pte; 1207 pmap_inval_info info; 1208 1209 pmap_inval_init(&info); 1210 pte = vtopte(va); 1211 pmap_inval_interlock(&info, &kernel_pmap, va); 1212 (void)pte_load_clear(pte); 1213 pmap_inval_deinterlock(&info, &kernel_pmap); 1214 pmap_inval_done(&info); 1215 } 1216 1217 void 1218 pmap_kremove_quick(vm_offset_t va) 1219 { 1220 pt_entry_t *pte; 1221 pte = vtopte(va); 1222 (void)pte_load_clear(pte); 1223 cpu_invlpg((void *)va); 1224 } 1225 1226 /* 1227 * XXX these need to be recoded. They are not used in any critical path. 1228 */ 1229 void 1230 pmap_kmodify_rw(vm_offset_t va) 1231 { 1232 atomic_set_long(vtopte(va), PG_RW); 1233 cpu_invlpg((void *)va); 1234 } 1235 1236 void 1237 pmap_kmodify_nc(vm_offset_t va) 1238 { 1239 atomic_set_long(vtopte(va), PG_N); 1240 cpu_invlpg((void *)va); 1241 } 1242 1243 /* 1244 * Used to map a range of physical addresses into kernel virtual 1245 * address space during the low level boot, typically to map the 1246 * dump bitmap, message buffer, and vm_page_array. 1247 * 1248 * These mappings are typically made at some pointer after the end of the 1249 * kernel text+data. 1250 * 1251 * We could return PHYS_TO_DMAP(start) here and not allocate any 1252 * via (*virtp), but then kmem from userland and kernel dumps won't 1253 * have access to the related pointers. 1254 */ 1255 vm_offset_t 1256 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1257 { 1258 vm_offset_t va; 1259 vm_offset_t va_start; 1260 1261 /*return PHYS_TO_DMAP(start);*/ 1262 1263 va_start = *virtp; 1264 va = va_start; 1265 1266 while (start < end) { 1267 pmap_kenter_quick(va, start); 1268 va += PAGE_SIZE; 1269 start += PAGE_SIZE; 1270 } 1271 *virtp = va; 1272 return va_start; 1273 } 1274 1275 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1276 1277 /* 1278 * Remove the specified set of pages from the data and instruction caches. 1279 * 1280 * In contrast to pmap_invalidate_cache_range(), this function does not 1281 * rely on the CPU's self-snoop feature, because it is intended for use 1282 * when moving pages into a different cache domain. 
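 *
 * Runs smaller than PMAP_CLFLUSH_THRESHOLD are flushed line by line
 * with clflush, bracketed by memory fences; larger runs, or cpus that
 * lack CLFLUSH, fall back to a full wbinvd().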
1283 */ 1284 void 1285 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1286 { 1287 vm_offset_t daddr, eva; 1288 int i; 1289 1290 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1291 (cpu_feature & CPUID_CLFSH) == 0) 1292 wbinvd(); 1293 else { 1294 cpu_mfence(); 1295 for (i = 0; i < count; i++) { 1296 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1297 eva = daddr + PAGE_SIZE; 1298 for (; daddr < eva; daddr += cpu_clflush_line_size) 1299 clflush(daddr); 1300 } 1301 cpu_mfence(); 1302 } 1303 } 1304 1305 void 1306 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1307 { 1308 KASSERT((sva & PAGE_MASK) == 0, 1309 ("pmap_invalidate_cache_range: sva not page-aligned")); 1310 KASSERT((eva & PAGE_MASK) == 0, 1311 ("pmap_invalidate_cache_range: eva not page-aligned")); 1312 1313 if (cpu_feature & CPUID_SS) { 1314 ; /* If "Self Snoop" is supported, do nothing. */ 1315 } else { 1316 /* Globally invalidate caches */ 1317 cpu_wbinvd_on_all_cpus(); 1318 } 1319 } 1320 void 1321 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1322 { 1323 smp_invlpg_range(pmap->pm_active, sva, eva); 1324 } 1325 1326 /* 1327 * Add a list of wired pages to the kva 1328 * this routine is only used for temporary 1329 * kernel mappings that do not need to have 1330 * page modification or references recorded. 1331 * Note that old mappings are simply written 1332 * over. The page *must* be wired. 1333 */ 1334 void 1335 pmap_qenter(vm_offset_t va, vm_page_t *m, int count) 1336 { 1337 vm_offset_t end_va; 1338 1339 end_va = va + count * PAGE_SIZE; 1340 1341 while (va < end_va) { 1342 pt_entry_t *pte; 1343 1344 pte = vtopte(va); 1345 *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | 1346 pat_pte_index[(*m)->pat_mode] | pgeflag; 1347 cpu_invlpg((void *)va); 1348 va += PAGE_SIZE; 1349 m++; 1350 } 1351 smp_invltlb(); 1352 } 1353 1354 /* 1355 * This routine jerks page mappings from the 1356 * kernel -- it is meant only for temporary mappings. 1357 * 1358 * MPSAFE, INTERRUPT SAFE (cluster callback) 1359 */ 1360 void 1361 pmap_qremove(vm_offset_t va, int count) 1362 { 1363 vm_offset_t end_va; 1364 1365 end_va = va + count * PAGE_SIZE; 1366 1367 while (va < end_va) { 1368 pt_entry_t *pte; 1369 1370 pte = vtopte(va); 1371 (void)pte_load_clear(pte); 1372 cpu_invlpg((void *)va); 1373 va += PAGE_SIZE; 1374 } 1375 smp_invltlb(); 1376 } 1377 1378 /* 1379 * Create a new thread and optionally associate it with a (new) process. 1380 * NOTE! the new thread's cpu may not equal the current cpu. 1381 */ 1382 void 1383 pmap_init_thread(thread_t td) 1384 { 1385 /* enforce pcb placement & alignment */ 1386 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1387 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 1388 td->td_savefpu = &td->td_pcb->pcb_save; 1389 td->td_sp = (char *)td->td_pcb; /* no -16 */ 1390 } 1391 1392 /* 1393 * This routine directly affects the fork perf for a process. 1394 */ 1395 void 1396 pmap_init_proc(struct proc *p) 1397 { 1398 } 1399 1400 /* 1401 * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because 1402 * it, and IdlePTD, represents the template used to update all other pmaps. 1403 * 1404 * On architectures where the kernel pmap is not integrated into the user 1405 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1406 * kernel_pmap should be used to directly access the kernel_pmap. 
1407 */ 1408 void 1409 pmap_pinit0(struct pmap *pmap) 1410 { 1411 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1412 pmap->pm_count = 1; 1413 pmap->pm_active = 0; 1414 pmap->pm_pvhint = NULL; 1415 RB_INIT(&pmap->pm_pvroot); 1416 spin_init(&pmap->pm_spin); 1417 lwkt_token_init(&pmap->pm_token, "pmap_tok"); 1418 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1419 } 1420 1421 /* 1422 * Initialize a preallocated and zeroed pmap structure, 1423 * such as one in a vmspace structure. 1424 */ 1425 static void 1426 pmap_pinit_simple(struct pmap *pmap) 1427 { 1428 /* 1429 * Misc initialization 1430 */ 1431 pmap->pm_count = 1; 1432 pmap->pm_active = 0; 1433 pmap->pm_pvhint = NULL; 1434 pmap->pm_flags = PMAP_FLAG_SIMPLE; 1435 1436 /* 1437 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 1438 * for this). 1439 */ 1440 if (pmap->pm_pmlpv == NULL) { 1441 RB_INIT(&pmap->pm_pvroot); 1442 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1443 spin_init(&pmap->pm_spin); 1444 lwkt_token_init(&pmap->pm_token, "pmap_tok"); 1445 } 1446 } 1447 1448 void 1449 pmap_pinit(struct pmap *pmap) 1450 { 1451 pv_entry_t pv; 1452 int j; 1453 1454 pmap_pinit_simple(pmap); 1455 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 1456 1457 /* 1458 * No need to allocate page table space yet but we do need a valid 1459 * page directory table. 1460 */ 1461 if (pmap->pm_pml4 == NULL) { 1462 pmap->pm_pml4 = 1463 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 1464 } 1465 1466 /* 1467 * Allocate the page directory page, which wires it even though 1468 * it isn't being entered into some higher level page table (it 1469 * being the highest level). If one is already cached we don't 1470 * have to do anything. 1471 */ 1472 if ((pv = pmap->pm_pmlpv) == NULL) { 1473 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 1474 pmap->pm_pmlpv = pv; 1475 pmap_kenter((vm_offset_t)pmap->pm_pml4, 1476 VM_PAGE_TO_PHYS(pv->pv_m)); 1477 pv_put(pv); 1478 1479 /* 1480 * Install DMAP and KMAP. 1481 */ 1482 for (j = 0; j < NDMPML4E; ++j) { 1483 pmap->pm_pml4[DMPML4I + j] = 1484 (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) | 1485 PG_RW | PG_V | PG_U; 1486 } 1487 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1488 1489 /* 1490 * install self-referential address mapping entry 1491 */ 1492 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 1493 PG_V | PG_RW | PG_A | PG_M; 1494 } else { 1495 KKASSERT(pv->pv_m->flags & PG_MAPPED); 1496 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 1497 } 1498 KKASSERT(pmap->pm_pml4[255] == 0); 1499 KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv); 1500 KKASSERT(pv->pv_entry.rbe_left == NULL); 1501 KKASSERT(pv->pv_entry.rbe_right == NULL); 1502 } 1503 1504 /* 1505 * Clean up a pmap structure so it can be physically freed. This routine 1506 * is called by the vmspace dtor function. A great deal of pmap data is 1507 * left passively mapped to improve vmspace management so we have a bit 1508 * of cleanup work to do here. 
1509 */ 1510 void 1511 pmap_puninit(pmap_t pmap) 1512 { 1513 pv_entry_t pv; 1514 vm_page_t p; 1515 1516 KKASSERT(pmap->pm_active == 0); 1517 if ((pv = pmap->pm_pmlpv) != NULL) { 1518 if (pv_hold_try(pv) == 0) 1519 pv_lock(pv); 1520 p = pmap_remove_pv_page(pv); 1521 pv_free(pv); 1522 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1523 vm_page_busy_wait(p, FALSE, "pgpun"); 1524 KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); 1525 vm_page_unwire(p, 0); 1526 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1527 1528 /* 1529 * XXX eventually clean out PML4 static entries and 1530 * use vm_page_free_zero() 1531 */ 1532 vm_page_free(p); 1533 pmap->pm_pmlpv = NULL; 1534 } 1535 if (pmap->pm_pml4) { 1536 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1537 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1538 pmap->pm_pml4 = NULL; 1539 } 1540 KKASSERT(pmap->pm_stats.resident_count == 0); 1541 KKASSERT(pmap->pm_stats.wired_count == 0); 1542 } 1543 1544 /* 1545 * Wire in kernel global address entries. To avoid a race condition 1546 * between pmap initialization and pmap_growkernel, this procedure 1547 * adds the pmap to the master list (which growkernel scans to update), 1548 * then copies the template. 1549 */ 1550 void 1551 pmap_pinit2(struct pmap *pmap) 1552 { 1553 spin_lock(&pmap_spin); 1554 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1555 spin_unlock(&pmap_spin); 1556 } 1557 1558 /* 1559 * This routine is called when various levels in the page table need to 1560 * be populated. This routine cannot fail. 1561 * 1562 * This function returns two locked pv_entry's, one representing the 1563 * requested pv and one representing the requested pv's parent pv. If 1564 * the pv did not previously exist it will be mapped into its parent 1565 * and wired, otherwise no additional wire count will be added. 1566 */ 1567 static 1568 pv_entry_t 1569 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 1570 { 1571 pt_entry_t *ptep; 1572 pv_entry_t pv; 1573 pv_entry_t pvp; 1574 vm_pindex_t pt_pindex; 1575 vm_page_t m; 1576 int isnew; 1577 int ispt; 1578 1579 /* 1580 * If the pv already exists and we aren't being asked for the 1581 * parent page table page we can just return it. A locked+held pv 1582 * is returned. 1583 */ 1584 ispt = 0; 1585 pv = pv_alloc(pmap, ptepindex, &isnew); 1586 if (isnew == 0 && pvpp == NULL) 1587 return(pv); 1588 1589 /* 1590 * This is a new PV, we have to resolve its parent page table and 1591 * add an additional wiring to the page if necessary. 1592 */ 1593 1594 /* 1595 * Special case terminal PVs. These are not page table pages so 1596 * no vm_page is allocated (the caller supplied the vm_page). If 1597 * pvpp is non-NULL we are being asked to also removed the pt_pv 1598 * for this pv. 1599 * 1600 * Note that pt_pv's are only returned for user VAs. We assert that 1601 * a pt_pv is not being requested for kernel VAs. 
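 *
 * For a terminal pv the parent page table's pindex is computed from the
 * pte pindex as shown below; e.g. for user va 0 (illustration only):
 *
 *	ptepindex = pmap_pte_pindex(0) = 0
 *	pt_pindex = NUPTE_TOTAL + (0 >> NPTEPGSHIFT) = NUPTE_TOTAL
 *		  = pmap_pt_pindex(0)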
1602 */ 1603 if (ptepindex < pmap_pt_pindex(0)) { 1604 if (ptepindex >= NUPTE_USER) 1605 KKASSERT(pvpp == NULL); 1606 else 1607 KKASSERT(pvpp != NULL); 1608 if (pvpp) { 1609 pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); 1610 pvp = pmap_allocpte(pmap, pt_pindex, NULL); 1611 if (isnew) 1612 vm_page_wire_quick(pvp->pv_m); 1613 *pvpp = pvp; 1614 } else { 1615 pvp = NULL; 1616 } 1617 return(pv); 1618 } 1619 1620 /* 1621 * Non-terminal PVs allocate a VM page to represent the page table, 1622 * so we have to resolve pvp and calculate ptepindex for the pvp 1623 * and then for the page table entry index in the pvp for 1624 * fall-through. 1625 */ 1626 if (ptepindex < pmap_pd_pindex(0)) { 1627 /* 1628 * pv is PT, pvp is PD 1629 */ 1630 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 1631 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 1632 pvp = pmap_allocpte(pmap, ptepindex, NULL); 1633 if (!isnew) 1634 goto notnew; 1635 1636 /* 1637 * PT index in PD 1638 */ 1639 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 1640 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 1641 ispt = 1; 1642 } else if (ptepindex < pmap_pdp_pindex(0)) { 1643 /* 1644 * pv is PD, pvp is PDP 1645 * 1646 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 1647 * the PD. 1648 */ 1649 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 1650 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 1651 1652 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 1653 KKASSERT(pvpp == NULL); 1654 pvp = NULL; 1655 } else { 1656 pvp = pmap_allocpte(pmap, ptepindex, NULL); 1657 } 1658 if (!isnew) 1659 goto notnew; 1660 1661 /* 1662 * PD index in PDP 1663 */ 1664 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 1665 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 1666 } else if (ptepindex < pmap_pml4_pindex()) { 1667 /* 1668 * pv is PDP, pvp is the root pml4 table 1669 */ 1670 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 1671 if (!isnew) 1672 goto notnew; 1673 1674 /* 1675 * PDP index in PML4 1676 */ 1677 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 1678 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 1679 } else { 1680 /* 1681 * pv represents the top-level PML4, there is no parent. 1682 */ 1683 pvp = NULL; 1684 if (!isnew) 1685 goto notnew; 1686 } 1687 1688 /* 1689 * This code is only reached if isnew is TRUE and this is not a 1690 * terminal PV. We need to allocate a vm_page for the page table 1691 * at this level and enter it into the parent page table. 1692 * 1693 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 1694 */ 1695 for (;;) { 1696 m = vm_page_alloc(NULL, pv->pv_pindex, 1697 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 1698 VM_ALLOC_INTERRUPT); 1699 if (m) 1700 break; 1701 vm_wait(0); 1702 } 1703 vm_page_spin_lock(m); 1704 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1705 pv->pv_m = m; 1706 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 1707 vm_page_spin_unlock(m); 1708 vm_page_unmanage(m); /* m must be spinunlocked */ 1709 1710 if ((m->flags & PG_ZERO) == 0) { 1711 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 1712 } 1713 #ifdef PMAP_DEBUG 1714 else { 1715 pmap_page_assertzero(VM_PAGE_TO_PHYS(m)); 1716 } 1717 #endif 1718 m->valid = VM_PAGE_BITS_ALL; 1719 vm_page_flag_clear(m, PG_ZERO); 1720 vm_page_wire(m); /* wire for mapping in parent */ 1721 1722 /* 1723 * Wire the page into pvp, bump the wire-count for pvp's page table 1724 * page. Bump the resident_count for the pmap. There is no pvp 1725 * for the top level, address the pm_pml4[] array directly. 
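 *
 * The net effect of the wiring below is that a page table page's
 * wire_count is 1 for its own installation in its parent plus 1 for
 * each child page (or pte, for a PT page) installed beneath it; the
 * release path (pmap_release_pv) relies on an empty page table page
 * dropping back to a wire_count of 1.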
1726 * 1727 * If the caller wants the parent we return it, otherwise 1728 * we just put it away. 1729 * 1730 * No interlock is needed for pte 0 -> non-zero. 1731 * 1732 * In the situation where *ptep is valid we might have an unmanaged 1733 * page table page shared from another page table which we need to 1734 * unshare before installing our private page table page. 1735 */ 1736 if (pvp) { 1737 ptep = pv_pte_lookup(pvp, ptepindex); 1738 if (*ptep & PG_V) { 1739 pt_entry_t pte; 1740 pmap_inval_info info; 1741 1742 if (ispt == 0) { 1743 panic("pmap_allocpte: unexpected pte %p/%d", 1744 pvp, (int)ptepindex); 1745 } 1746 pmap_inval_init(&info); 1747 pmap_inval_interlock(&info, pmap, (vm_offset_t)-1); 1748 pte = pte_load_clear(ptep); 1749 pmap_inval_deinterlock(&info, pmap); 1750 pmap_inval_done(&info); 1751 if (vm_page_unwire_quick( 1752 PHYS_TO_VM_PAGE(pte & PG_FRAME))) { 1753 panic("pmap_allocpte: shared pgtable " 1754 "pg bad wirecount"); 1755 } 1756 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1757 } else { 1758 vm_page_wire_quick(pvp->pv_m); 1759 } 1760 *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V | 1761 PG_A | PG_M); 1762 } 1763 vm_page_wakeup(m); 1764 notnew: 1765 if (pvpp) 1766 *pvpp = pvp; 1767 else if (pvp) 1768 pv_put(pvp); 1769 return (pv); 1770 } 1771 1772 /* 1773 * This version of pmap_allocpte() checks for possible segment optimizations 1774 * that would allow page-table sharing. It can be called for terminal 1775 * page or page table page ptepindex's. 1776 * 1777 * The function is called with page table page ptepindex's for fictitious 1778 * and unmanaged terminal pages. That is, we don't want to allocate a 1779 * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL 1780 * for this case. 1781 * 1782 * This function can return a pv and *pvpp associated with the passed in pmap 1783 * OR a pv and *pvpp associated with the shared pmap. In the latter case 1784 * an unmanaged page table page will be entered into the pass in pmap. 1785 */ 1786 static 1787 pv_entry_t 1788 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, 1789 vm_map_entry_t entry, vm_offset_t va) 1790 { 1791 struct pmap_inval_info info; 1792 vm_object_t object; 1793 pmap_t obpmap; 1794 pmap_t *obpmapp; 1795 vm_offset_t b; 1796 pv_entry_t pte_pv; /* in original or shared pmap */ 1797 pv_entry_t pt_pv; /* in original or shared pmap */ 1798 pv_entry_t proc_pd_pv; /* in original pmap */ 1799 pv_entry_t proc_pt_pv; /* in original pmap */ 1800 pv_entry_t xpv; /* PT in shared pmap */ 1801 pd_entry_t *pt; /* PT entry in PD of original pmap */ 1802 pd_entry_t opte; /* contents of *pt */ 1803 pd_entry_t npte; /* contents of *pt */ 1804 vm_page_t m; 1805 1806 retry: 1807 /* 1808 * Basic tests, require a non-NULL vm_map_entry, require proper 1809 * alignment and type for the vm_map_entry, require that the 1810 * underlying object already be allocated. 1811 * 1812 * We currently allow any type of object to use this optimization. 1813 * The object itself does NOT have to be sized to a multiple of the 1814 * segment size, but the memory mapping does. 
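 *
 * Both entry->start and entry->offset must be SEG_SIZE aligned: sharing
 * works in whole segment-sized chunks, so each shared chunk must begin
 * at the same relative position in the object and in every pmap that
 * shares it.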
1815 */ 1816 if (entry == NULL || 1817 pmap_mmu_optimize == 0 || /* not enabled */ 1818 ptepindex >= pmap_pd_pindex(0) || /* not terminal */ 1819 entry->inheritance != VM_INHERIT_SHARE || /* not shared */ 1820 entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ 1821 entry->object.vm_object == NULL || /* needs VM object */ 1822 (entry->offset & SEG_MASK) || /* must be aligned */ 1823 (entry->start & SEG_MASK)) { 1824 return(pmap_allocpte(pmap, ptepindex, pvpp)); 1825 } 1826 1827 /* 1828 * Make sure the full segment can be represented. 1829 */ 1830 b = va & ~(vm_offset_t)SEG_MASK; 1831 if (b < entry->start && b + SEG_SIZE > entry->end) 1832 return(pmap_allocpte(pmap, ptepindex, pvpp)); 1833 1834 /* 1835 * If the full segment can be represented dive the VM object's 1836 * shared pmap, allocating as required. 1837 */ 1838 object = entry->object.vm_object; 1839 1840 if (entry->protection & VM_PROT_WRITE) 1841 obpmapp = &object->md.pmap_rw; 1842 else 1843 obpmapp = &object->md.pmap_ro; 1844 1845 /* 1846 * We allocate what appears to be a normal pmap but because portions 1847 * of this pmap are shared with other unrelated pmaps we have to 1848 * set pm_active to point to all cpus. 1849 * 1850 * XXX Currently using pmap_spin to interlock the update, can't use 1851 * vm_object_hold/drop because the token might already be held 1852 * shared OR exclusive and we don't know. 1853 */ 1854 while ((obpmap = *obpmapp) == NULL) { 1855 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); 1856 pmap_pinit_simple(obpmap); 1857 pmap_pinit2(obpmap); 1858 spin_lock(&pmap_spin); 1859 if (*obpmapp != NULL) { 1860 /* 1861 * Handle race 1862 */ 1863 spin_unlock(&pmap_spin); 1864 pmap_release(obpmap); 1865 pmap_puninit(obpmap); 1866 kfree(obpmap, M_OBJPMAP); 1867 } else { 1868 obpmap->pm_active = smp_active_mask; 1869 *obpmapp = obpmap; 1870 spin_unlock(&pmap_spin); 1871 } 1872 } 1873 1874 /* 1875 * Layering is: PTE, PT, PD, PDP, PML4. We have to return the 1876 * pte/pt using the shared pmap from the object but also adjust 1877 * the process pmap's page table page as a side effect. 1878 */ 1879 1880 /* 1881 * Resolve the terminal PTE and PT in the shared pmap. This is what 1882 * we will return. This is true if ptepindex represents a terminal 1883 * page, otherwise pte_pv is actually the PT and pt_pv is actually 1884 * the PD. 1885 */ 1886 pt_pv = NULL; 1887 pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); 1888 if (ptepindex >= pmap_pt_pindex(0)) 1889 xpv = pte_pv; 1890 else 1891 xpv = pt_pv; 1892 1893 /* 1894 * Resolve the PD in the process pmap so we can properly share the 1895 * page table page. Lock order is bottom-up (leaf first)! 1896 * 1897 * NOTE: proc_pt_pv can be NULL. 1898 */ 1899 proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b)); 1900 proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); 1901 1902 /* 1903 * xpv is the page table page pv from the shared object 1904 * (for convenience). 1905 * 1906 * Calculate the pte value for the PT to load into the process PD. 1907 * If we have to change it we must properly dispose of the previous 1908 * entry. 1909 */ 1910 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 1911 npte = VM_PAGE_TO_PHYS(xpv->pv_m) | 1912 (PG_U | PG_RW | PG_V | PG_A | PG_M); 1913 1914 /* 1915 * Dispose of previous page table page if it was local to the 1916 * process pmap. If the old pt is not empty we cannot dispose of it 1917 * until we clean it out. This case should not arise very often so 1918 * it is not optimized. 
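 *
 * When the old PT is not empty all pv locks are dropped, the entire
 * segment is removed from the process pmap with pmap_remove(), and the
 * whole operation is retried from the top.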
1919 */ 1920 if (proc_pt_pv) { 1921 if (proc_pt_pv->pv_m->wire_count != 1) { 1922 pv_put(proc_pd_pv); 1923 pv_put(proc_pt_pv); 1924 pv_put(pt_pv); 1925 pv_put(pte_pv); 1926 pmap_remove(pmap, 1927 va & ~(vm_offset_t)SEG_MASK, 1928 (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK); 1929 goto retry; 1930 } 1931 pmap_release_pv(proc_pt_pv, proc_pd_pv); 1932 proc_pt_pv = NULL; 1933 /* relookup */ 1934 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 1935 } 1936 1937 /* 1938 * Handle remaining cases. 1939 */ 1940 if (*pt == 0) { 1941 *pt = npte; 1942 vm_page_wire_quick(xpv->pv_m); 1943 vm_page_wire_quick(proc_pd_pv->pv_m); 1944 atomic_add_long(&pmap->pm_stats.resident_count, 1); 1945 } else if (*pt != npte) { 1946 pmap_inval_init(&info); 1947 pmap_inval_interlock(&info, pmap, (vm_offset_t)-1); 1948 1949 opte = pte_load_clear(pt); 1950 KKASSERT(opte && opte != npte); 1951 1952 *pt = npte; 1953 vm_page_wire_quick(xpv->pv_m); /* pgtable pg that is npte */ 1954 1955 /* 1956 * Clean up opte, bump the wire_count for the process 1957 * PD page representing the new entry if it was 1958 * previously empty. 1959 * 1960 * If the entry was not previously empty and we have 1961 * a PT in the proc pmap then opte must match that 1962 * pt. The proc pt must be retired (this is done 1963 * later on in this procedure). 1964 * 1965 * NOTE: replacing valid pte, wire_count on proc_pd_pv 1966 * stays the same. 1967 */ 1968 KKASSERT(opte & PG_V); 1969 m = PHYS_TO_VM_PAGE(opte & PG_FRAME); 1970 if (vm_page_unwire_quick(m)) { 1971 panic("pmap_allocpte_seg: " 1972 "bad wire count %p", 1973 m); 1974 } 1975 1976 pmap_inval_deinterlock(&info, pmap); 1977 pmap_inval_done(&info); 1978 } 1979 1980 /* 1981 * The existing process page table was replaced and must be destroyed 1982 * here. 1983 */ 1984 if (proc_pd_pv) 1985 pv_put(proc_pd_pv); 1986 if (pvpp) 1987 *pvpp = pt_pv; 1988 else 1989 pv_put(pt_pv); 1990 1991 return (pte_pv); 1992 } 1993 1994 /* 1995 * Release any resources held by the given physical map. 1996 * 1997 * Called when a pmap initialized by pmap_pinit is being released. Should 1998 * only be called if the map contains no valid mappings. 1999 * 2000 * Caller must hold pmap->pm_token 2001 */ 2002 struct pmap_release_info { 2003 pmap_t pmap; 2004 int retry; 2005 }; 2006 2007 static int pmap_release_callback(pv_entry_t pv, void *data); 2008 2009 void 2010 pmap_release(struct pmap *pmap) 2011 { 2012 struct pmap_release_info info; 2013 2014 KASSERT(pmap->pm_active == 0, 2015 ("pmap still active! %016jx", (uintmax_t)pmap->pm_active)); 2016 2017 spin_lock(&pmap_spin); 2018 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); 2019 spin_unlock(&pmap_spin); 2020 2021 /* 2022 * Pull pv's off the RB tree in order from low to high and release 2023 * each page. 2024 */ 2025 info.pmap = pmap; 2026 do { 2027 info.retry = 0; 2028 spin_lock(&pmap->pm_spin); 2029 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2030 pmap_release_callback, &info); 2031 spin_unlock(&pmap->pm_spin); 2032 } while (info.retry); 2033 2034 2035 /* 2036 * One resident page (the pml4 page) should remain. 2037 * No wired pages should remain. 2038 */ 2039 KKASSERT(pmap->pm_stats.resident_count == 2040 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 
0 : 1));
2041
2042 	KKASSERT(pmap->pm_stats.wired_count == 0);
2043 }
2044
2045 static int
2046 pmap_release_callback(pv_entry_t pv, void *data)
2047 {
2048 	struct pmap_release_info *info = data;
2049 	pmap_t pmap = info->pmap;
2050 	int r;
2051
2052 	if (pv_hold_try(pv)) {
2053 		spin_unlock(&pmap->pm_spin);
2054 	} else {
2055 		spin_unlock(&pmap->pm_spin);
2056 		pv_lock(pv);
2057 		if (pv->pv_pmap != pmap) {
2058 			pv_put(pv);
2059 			spin_lock(&pmap->pm_spin);
2060 			info->retry = 1;
2061 			return(-1);
2062 		}
2063 	}
2064 	r = pmap_release_pv(pv, NULL);
2065 	spin_lock(&pmap->pm_spin);
2066 	return(r);
2067 }
2068
2069 /*
2070  * Called with a held (i.e. also locked) pv.  This function will dispose of
2071  * the lock along with the pv.
2072  *
2073  * If the caller already holds the locked parent page table for pv it
2074  * must pass it as pvp, allowing us to avoid a deadlock, else it can
2075  * pass NULL for pvp.
2076  */
2077 static int
2078 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp)
2079 {
2080 	vm_page_t p;
2081
2082 	/*
2083 	 * The pmap is currently not spinlocked, pv is held+locked.
2084 	 * Remove the pv's page from its parent's page table.  The
2085 	 * parent's page table page's wire_count will be decremented.
2086 	 */
2087 	pmap_remove_pv_pte(pv, pvp, NULL);
2088
2089 	/*
2090 	 * Terminal pvs are unhooked from their vm_pages.  Because
2091 	 * terminal pages aren't page table pages they aren't wired
2092 	 * by us, so we have to be sure not to unwire them either.
2093 	 */
2094 	if (pv->pv_pindex < pmap_pt_pindex(0)) {
2095 		pmap_remove_pv_page(pv);
2096 		goto skip;
2097 	}
2098
2099 	/*
2100 	 * We leave the top-level page table page cached, wired, and
2101 	 * mapped in the pmap until the dtor function (pmap_puninit())
2102 	 * gets called.
2103 	 *
2104 	 * Since we are leaving the top-level pv intact we need
2105 	 * to break out of what would otherwise be an infinite loop.
2106 	 */
2107 	if (pv->pv_pindex == pmap_pml4_pindex()) {
2108 		pv_put(pv);
2109 		return(-1);
2110 	}
2111
2112 	/*
2113 	 * For page table pages (other than the top-level page),
2114 	 * remove and free the vm_page.  The representative mapping
2115 	 * removed above by pmap_remove_pv_pte() did not undo the
2116 	 * last wire_count so we have to do that as well.
2117 	 */
2118 	p = pmap_remove_pv_page(pv);
2119 	vm_page_busy_wait(p, FALSE, "pmaprl");
2120 	if (p->wire_count != 1) {
2121 		kprintf("pmap_release_pv: pindex %016lx bad wire_count %d\n",
2122 			pv->pv_pindex, p->wire_count);
2123 	}
2124 	KKASSERT(p->wire_count == 1);
2125 	KKASSERT(p->flags & PG_UNMANAGED);
2126
2127 	vm_page_unwire(p, 0);
2128 	KKASSERT(p->wire_count == 0);
2129
2130 	/*
2131 	 * Theoretically this page, if not the pml4 page, should contain
2132 	 * all-zeros.  But it's just too dangerous to mark it PG_ZERO.  Free
2133 	 * normally.
2134 	 */
2135 	vm_page_free(p);
2136 skip:
2137 	pv_free(pv);
2138 	return 0;
2139 }
2140
2141 /*
2142  * This function will remove the pte associated with a pv from its parent.
2143  * Terminal pv's are supported.  The removal will be interlocked if info
2144  * is non-NULL.  The caller must dispose of pv instead of just unlocking
2145  * it.
2146  *
2147  * The wire count will be dropped on the parent page table.  The wire
2148  * count on the page being removed (pv->pv_m) from the parent page table
2149  * is NOT touched.  Note that terminal pages will not have any additional
2150  * wire counts while page table pages will have at least one representing
2151  * the mapping, plus others representing sub-mappings.
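 *
 * (Rough illustration of the expected accounting: a page table page that
 *  is mapped into its parent and currently backs N valid sub-mappings
 *  should carry a wire_count of 1 + N -- one for the parent entry mapping
 *  it plus one per sub-mapping -- whereas a terminal data page carries no
 *  such page-table wire counts at all.)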
2152 * 2153 * NOTE: Cannot be called on kernel page table pages, only KVM terminal 2154 * pages and user page table and terminal pages. 2155 * 2156 * The pv must be locked. 2157 * 2158 * XXX must lock parent pv's if they exist to remove pte XXX 2159 */ 2160 static 2161 void 2162 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) 2163 { 2164 vm_pindex_t ptepindex = pv->pv_pindex; 2165 pmap_t pmap = pv->pv_pmap; 2166 vm_page_t p; 2167 int gotpvp = 0; 2168 2169 KKASSERT(pmap); 2170 2171 if (ptepindex == pmap_pml4_pindex()) { 2172 /* 2173 * We are the top level pml4 table, there is no parent. 2174 */ 2175 p = pmap->pm_pmlpv->pv_m; 2176 } else if (ptepindex >= pmap_pdp_pindex(0)) { 2177 /* 2178 * Remove a PDP page from the pml4e. This can only occur 2179 * with user page tables. We do not have to lock the 2180 * pml4 PV so just ignore pvp. 2181 */ 2182 vm_pindex_t pml4_pindex; 2183 vm_pindex_t pdp_index; 2184 pml4_entry_t *pdp; 2185 2186 pdp_index = ptepindex - pmap_pdp_pindex(0); 2187 if (pvp == NULL) { 2188 pml4_pindex = pmap_pml4_pindex(); 2189 pvp = pv_get(pv->pv_pmap, pml4_pindex); 2190 KKASSERT(pvp); 2191 gotpvp = 1; 2192 } 2193 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; 2194 KKASSERT((*pdp & PG_V) != 0); 2195 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2196 *pdp = 0; 2197 KKASSERT(info == NULL); 2198 } else if (ptepindex >= pmap_pd_pindex(0)) { 2199 /* 2200 * Remove a PD page from the pdp 2201 * 2202 * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case 2203 * of a simple pmap because it stops at 2204 * the PD page. 2205 */ 2206 vm_pindex_t pdp_pindex; 2207 vm_pindex_t pd_index; 2208 pdp_entry_t *pd; 2209 2210 pd_index = ptepindex - pmap_pd_pindex(0); 2211 2212 if (pvp == NULL) { 2213 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 2214 (pd_index >> NPML4EPGSHIFT); 2215 pvp = pv_get(pv->pv_pmap, pdp_pindex); 2216 if (pvp) 2217 gotpvp = 1; 2218 } 2219 if (pvp) { 2220 pd = pv_pte_lookup(pvp, pd_index & 2221 ((1ul << NPDPEPGSHIFT) - 1)); 2222 KKASSERT((*pd & PG_V) != 0); 2223 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2224 *pd = 0; 2225 } else { 2226 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 2227 p = pv->pv_m; /* degenerate test later */ 2228 } 2229 KKASSERT(info == NULL); 2230 } else if (ptepindex >= pmap_pt_pindex(0)) { 2231 /* 2232 * Remove a PT page from the pd 2233 */ 2234 vm_pindex_t pd_pindex; 2235 vm_pindex_t pt_index; 2236 pd_entry_t *pt; 2237 2238 pt_index = ptepindex - pmap_pt_pindex(0); 2239 2240 if (pvp == NULL) { 2241 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 2242 (pt_index >> NPDPEPGSHIFT); 2243 pvp = pv_get(pv->pv_pmap, pd_pindex); 2244 KKASSERT(pvp); 2245 gotpvp = 1; 2246 } 2247 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 2248 KKASSERT((*pt & PG_V) != 0); 2249 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 2250 *pt = 0; 2251 KKASSERT(info == NULL); 2252 } else { 2253 /* 2254 * Remove a PTE from the PT page 2255 * 2256 * NOTE: pv's must be locked bottom-up to avoid deadlocking. 2257 * pv is a pte_pv so we can safely lock pt_pv. 
2258 */ 2259 vm_pindex_t pt_pindex; 2260 pt_entry_t *ptep; 2261 pt_entry_t pte; 2262 vm_offset_t va; 2263 2264 pt_pindex = ptepindex >> NPTEPGSHIFT; 2265 va = (vm_offset_t)ptepindex << PAGE_SHIFT; 2266 2267 if (ptepindex >= NUPTE_USER) { 2268 ptep = vtopte(ptepindex << PAGE_SHIFT); 2269 KKASSERT(pvp == NULL); 2270 } else { 2271 if (pvp == NULL) { 2272 pt_pindex = NUPTE_TOTAL + 2273 (ptepindex >> NPDPEPGSHIFT); 2274 pvp = pv_get(pv->pv_pmap, pt_pindex); 2275 KKASSERT(pvp); 2276 gotpvp = 1; 2277 } 2278 ptep = pv_pte_lookup(pvp, ptepindex & 2279 ((1ul << NPDPEPGSHIFT) - 1)); 2280 } 2281 2282 if (info) 2283 pmap_inval_interlock(info, pmap, va); 2284 pte = pte_load_clear(ptep); 2285 if (info) 2286 pmap_inval_deinterlock(info, pmap); 2287 else 2288 cpu_invlpg((void *)va); 2289 2290 /* 2291 * Now update the vm_page_t 2292 */ 2293 if ((pte & (PG_MANAGED|PG_V)) != (PG_MANAGED|PG_V)) { 2294 kprintf("remove_pte badpte %016lx %016lx %d\n", 2295 pte, pv->pv_pindex, 2296 pv->pv_pindex < pmap_pt_pindex(0)); 2297 } 2298 /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/ 2299 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2300 2301 if (pte & PG_M) { 2302 if (pmap_track_modified(ptepindex)) 2303 vm_page_dirty(p); 2304 } 2305 if (pte & PG_A) { 2306 vm_page_flag_set(p, PG_REFERENCED); 2307 } 2308 if (pte & PG_W) 2309 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2310 if (pte & PG_G) 2311 cpu_invlpg((void *)va); 2312 } 2313 2314 /* 2315 * Unwire the parent page table page. The wire_count cannot go below 2316 * 1 here because the parent page table page is itself still mapped. 2317 * 2318 * XXX remove the assertions later. 2319 */ 2320 KKASSERT(pv->pv_m == p); 2321 if (pvp && vm_page_unwire_quick(pvp->pv_m)) 2322 panic("pmap_remove_pv_pte: Insufficient wire_count"); 2323 2324 if (gotpvp) 2325 pv_put(pvp); 2326 } 2327 2328 static 2329 vm_page_t 2330 pmap_remove_pv_page(pv_entry_t pv) 2331 { 2332 vm_page_t m; 2333 2334 m = pv->pv_m; 2335 KKASSERT(m); 2336 vm_page_spin_lock(m); 2337 pv->pv_m = NULL; 2338 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2339 /* 2340 if (m->object) 2341 atomic_add_int(&m->object->agg_pv_list_count, -1); 2342 */ 2343 if (TAILQ_EMPTY(&m->md.pv_list)) 2344 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2345 vm_page_spin_unlock(m); 2346 return(m); 2347 } 2348 2349 /* 2350 * Grow the number of kernel page table entries, if needed. 2351 * 2352 * This routine is always called to validate any address space 2353 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 2354 * space below KERNBASE. 2355 */ 2356 void 2357 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 2358 { 2359 vm_paddr_t paddr; 2360 vm_offset_t ptppaddr; 2361 vm_page_t nkpg; 2362 pd_entry_t *pt, newpt; 2363 pdp_entry_t newpd; 2364 int update_kernel_vm_end; 2365 2366 /* 2367 * bootstrap kernel_vm_end on first real VM use 2368 */ 2369 if (kernel_vm_end == 0) { 2370 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 2371 nkpt = 0; 2372 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) { 2373 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 2374 ~(PAGE_SIZE * NPTEPG - 1); 2375 nkpt++; 2376 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 2377 kernel_vm_end = kernel_map.max_offset; 2378 break; 2379 } 2380 } 2381 } 2382 2383 /* 2384 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 2385 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 2386 * do not want to force-fill 128G worth of page tables. 
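 *
 * (For illustration, the growth granularity used below widens the request
 *  to whole page-table-page ranges before anything is allocated:
 *
 *	kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG);
 *	kend = roundup2(kend, PAGE_SIZE * NPTEPG);
 *
 *  i.e. one PT page worth of KVA at a time.)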
2387 */ 2388 if (kstart < KERNBASE) { 2389 if (kstart > kernel_vm_end) 2390 kstart = kernel_vm_end; 2391 KKASSERT(kend <= KERNBASE); 2392 update_kernel_vm_end = 1; 2393 } else { 2394 update_kernel_vm_end = 0; 2395 } 2396 2397 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 2398 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 2399 2400 if (kend - 1 >= kernel_map.max_offset) 2401 kend = kernel_map.max_offset; 2402 2403 while (kstart < kend) { 2404 pt = pmap_pt(&kernel_pmap, kstart); 2405 if (pt == NULL) { 2406 /* We need a new PDP entry */ 2407 nkpg = vm_page_alloc(NULL, nkpt, 2408 VM_ALLOC_NORMAL | 2409 VM_ALLOC_SYSTEM | 2410 VM_ALLOC_INTERRUPT); 2411 if (nkpg == NULL) { 2412 panic("pmap_growkernel: no memory to grow " 2413 "kernel"); 2414 } 2415 paddr = VM_PAGE_TO_PHYS(nkpg); 2416 if ((nkpg->flags & PG_ZERO) == 0) 2417 pmap_zero_page(paddr); 2418 vm_page_flag_clear(nkpg, PG_ZERO); 2419 newpd = (pdp_entry_t) 2420 (paddr | PG_V | PG_RW | PG_A | PG_M); 2421 *pmap_pd(&kernel_pmap, kstart) = newpd; 2422 nkpt++; 2423 continue; /* try again */ 2424 } 2425 if ((*pt & PG_V) != 0) { 2426 kstart = (kstart + PAGE_SIZE * NPTEPG) & 2427 ~(PAGE_SIZE * NPTEPG - 1); 2428 if (kstart - 1 >= kernel_map.max_offset) { 2429 kstart = kernel_map.max_offset; 2430 break; 2431 } 2432 continue; 2433 } 2434 2435 /* 2436 * This index is bogus, but out of the way 2437 */ 2438 nkpg = vm_page_alloc(NULL, nkpt, 2439 VM_ALLOC_NORMAL | 2440 VM_ALLOC_SYSTEM | 2441 VM_ALLOC_INTERRUPT); 2442 if (nkpg == NULL) 2443 panic("pmap_growkernel: no memory to grow kernel"); 2444 2445 vm_page_wire(nkpg); 2446 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2447 pmap_zero_page(ptppaddr); 2448 vm_page_flag_clear(nkpg, PG_ZERO); 2449 newpt = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2450 *pmap_pt(&kernel_pmap, kstart) = newpt; 2451 nkpt++; 2452 2453 kstart = (kstart + PAGE_SIZE * NPTEPG) & 2454 ~(PAGE_SIZE * NPTEPG - 1); 2455 2456 if (kstart - 1 >= kernel_map.max_offset) { 2457 kstart = kernel_map.max_offset; 2458 break; 2459 } 2460 } 2461 2462 /* 2463 * Only update kernel_vm_end for areas below KERNBASE. 2464 */ 2465 if (update_kernel_vm_end && kernel_vm_end < kstart) 2466 kernel_vm_end = kstart; 2467 } 2468 2469 /* 2470 * Add a reference to the specified pmap. 2471 */ 2472 void 2473 pmap_reference(pmap_t pmap) 2474 { 2475 if (pmap != NULL) { 2476 lwkt_gettoken(&pmap->pm_token); 2477 ++pmap->pm_count; 2478 lwkt_reltoken(&pmap->pm_token); 2479 } 2480 } 2481 2482 /*************************************************** 2483 * page management routines. 2484 ***************************************************/ 2485 2486 /* 2487 * Hold a pv without locking it 2488 */ 2489 static void 2490 pv_hold(pv_entry_t pv) 2491 { 2492 u_int count; 2493 2494 if (atomic_cmpset_int(&pv->pv_hold, 0, 1)) 2495 return; 2496 2497 for (;;) { 2498 count = pv->pv_hold; 2499 cpu_ccfence(); 2500 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) 2501 return; 2502 /* retry */ 2503 } 2504 } 2505 2506 /* 2507 * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv 2508 * was successfully locked, FALSE if it wasn't. The caller must dispose of 2509 * the pv properly. 2510 * 2511 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 2512 * pv list via its page) must be held by the caller. 
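 *
 * (Sketch of the pv_hold encoding, as inferred from the code below: the
 *  low bits (PV_HOLD_MASK) are the hold count, PV_HOLD_LOCKED marks the
 *  lock, and PV_HOLD_WAITING marks a blocked waiter.  A successful
 *  try-lock moves pv_hold from 0 to (PV_HOLD_LOCKED | 1), or adds
 *  PV_HOLD_LOCKED while bumping the count if other holders already
 *  exist.)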
2513 */ 2514 static int 2515 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 2516 { 2517 u_int count; 2518 2519 if (atomic_cmpset_int(&pv->pv_hold, 0, PV_HOLD_LOCKED | 1)) { 2520 #ifdef PMAP_DEBUG 2521 pv->pv_func = func; 2522 pv->pv_line = lineno; 2523 #endif 2524 return TRUE; 2525 } 2526 2527 for (;;) { 2528 count = pv->pv_hold; 2529 cpu_ccfence(); 2530 if ((count & PV_HOLD_LOCKED) == 0) { 2531 if (atomic_cmpset_int(&pv->pv_hold, count, 2532 (count + 1) | PV_HOLD_LOCKED)) { 2533 #ifdef PMAP_DEBUG 2534 pv->pv_func = func; 2535 pv->pv_line = lineno; 2536 #endif 2537 return TRUE; 2538 } 2539 } else { 2540 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) 2541 return FALSE; 2542 } 2543 /* retry */ 2544 } 2545 } 2546 2547 /* 2548 * Drop a previously held pv_entry which could not be locked, allowing its 2549 * destruction. 2550 * 2551 * Must not be called with a spinlock held as we might zfree() the pv if it 2552 * is no longer associated with a pmap and this was the last hold count. 2553 */ 2554 static void 2555 pv_drop(pv_entry_t pv) 2556 { 2557 u_int count; 2558 2559 if (atomic_cmpset_int(&pv->pv_hold, 1, 0)) { 2560 if (pv->pv_pmap == NULL) 2561 zfree(pvzone, pv); 2562 return; 2563 } 2564 2565 for (;;) { 2566 count = pv->pv_hold; 2567 cpu_ccfence(); 2568 KKASSERT((count & PV_HOLD_MASK) > 0); 2569 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 2570 (PV_HOLD_LOCKED | 1)); 2571 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 2572 if (count == 1 && pv->pv_pmap == NULL) 2573 zfree(pvzone, pv); 2574 return; 2575 } 2576 /* retry */ 2577 } 2578 } 2579 2580 /* 2581 * Find or allocate the requested PV entry, returning a locked pv 2582 */ 2583 static 2584 pv_entry_t 2585 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 2586 { 2587 pv_entry_t pv; 2588 pv_entry_t pnew = NULL; 2589 2590 spin_lock(&pmap->pm_spin); 2591 for (;;) { 2592 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { 2593 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 2594 pindex); 2595 } 2596 if (pv == NULL) { 2597 if (pnew == NULL) { 2598 spin_unlock(&pmap->pm_spin); 2599 pnew = zalloc(pvzone); 2600 spin_lock(&pmap->pm_spin); 2601 continue; 2602 } 2603 pnew->pv_pmap = pmap; 2604 pnew->pv_pindex = pindex; 2605 pnew->pv_hold = PV_HOLD_LOCKED | 1; 2606 #ifdef PMAP_DEBUG 2607 pnew->pv_func = func; 2608 pnew->pv_line = lineno; 2609 #endif 2610 pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 2611 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2612 spin_unlock(&pmap->pm_spin); 2613 *isnew = 1; 2614 return(pnew); 2615 } 2616 if (pnew) { 2617 spin_unlock(&pmap->pm_spin); 2618 zfree(pvzone, pnew); 2619 pnew = NULL; 2620 spin_lock(&pmap->pm_spin); 2621 continue; 2622 } 2623 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 2624 spin_unlock(&pmap->pm_spin); 2625 *isnew = 0; 2626 return(pv); 2627 } 2628 spin_unlock(&pmap->pm_spin); 2629 _pv_lock(pv PMAP_DEBUG_COPY); 2630 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) { 2631 *isnew = 0; 2632 return(pv); 2633 } 2634 pv_put(pv); 2635 spin_lock(&pmap->pm_spin); 2636 } 2637 2638 2639 } 2640 2641 /* 2642 * Find the requested PV entry, returning a locked+held pv or NULL 2643 */ 2644 static 2645 pv_entry_t 2646 _pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL) 2647 { 2648 pv_entry_t pv; 2649 2650 spin_lock(&pmap->pm_spin); 2651 for (;;) { 2652 /* 2653 * Shortcut cache 2654 */ 2655 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { 2656 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 2657 pindex); 2658 } 2659 if (pv == NULL) 
{ 2660 spin_unlock(&pmap->pm_spin); 2661 return NULL; 2662 } 2663 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 2664 pv_cache(pv, pindex); 2665 spin_unlock(&pmap->pm_spin); 2666 return(pv); 2667 } 2668 spin_unlock(&pmap->pm_spin); 2669 _pv_lock(pv PMAP_DEBUG_COPY); 2670 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) 2671 return(pv); 2672 pv_put(pv); 2673 spin_lock(&pmap->pm_spin); 2674 } 2675 } 2676 2677 /* 2678 * Lookup, hold, and attempt to lock (pmap,pindex). 2679 * 2680 * If the entry does not exist NULL is returned and *errorp is set to 0 2681 * 2682 * If the entry exists and could be successfully locked it is returned and 2683 * errorp is set to 0. 2684 * 2685 * If the entry exists but could NOT be successfully locked it is returned 2686 * held and *errorp is set to 1. 2687 */ 2688 static 2689 pv_entry_t 2690 pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp) 2691 { 2692 pv_entry_t pv; 2693 2694 spin_lock(&pmap->pm_spin); 2695 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) 2696 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 2697 if (pv == NULL) { 2698 spin_unlock(&pmap->pm_spin); 2699 *errorp = 0; 2700 return NULL; 2701 } 2702 if (pv_hold_try(pv)) { 2703 pv_cache(pv, pindex); 2704 spin_unlock(&pmap->pm_spin); 2705 *errorp = 0; 2706 return(pv); /* lock succeeded */ 2707 } 2708 spin_unlock(&pmap->pm_spin); 2709 *errorp = 1; 2710 return (pv); /* lock failed */ 2711 } 2712 2713 /* 2714 * Find the requested PV entry, returning a held pv or NULL 2715 */ 2716 static 2717 pv_entry_t 2718 pv_find(pmap_t pmap, vm_pindex_t pindex) 2719 { 2720 pv_entry_t pv; 2721 2722 spin_lock(&pmap->pm_spin); 2723 2724 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) 2725 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 2726 if (pv == NULL) { 2727 spin_unlock(&pmap->pm_spin); 2728 return NULL; 2729 } 2730 pv_hold(pv); 2731 pv_cache(pv, pindex); 2732 spin_unlock(&pmap->pm_spin); 2733 return(pv); 2734 } 2735 2736 /* 2737 * Lock a held pv, keeping the hold count 2738 */ 2739 static 2740 void 2741 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 2742 { 2743 u_int count; 2744 2745 for (;;) { 2746 count = pv->pv_hold; 2747 cpu_ccfence(); 2748 if ((count & PV_HOLD_LOCKED) == 0) { 2749 if (atomic_cmpset_int(&pv->pv_hold, count, 2750 count | PV_HOLD_LOCKED)) { 2751 #ifdef PMAP_DEBUG 2752 pv->pv_func = func; 2753 pv->pv_line = lineno; 2754 #endif 2755 return; 2756 } 2757 continue; 2758 } 2759 tsleep_interlock(pv, 0); 2760 if (atomic_cmpset_int(&pv->pv_hold, count, 2761 count | PV_HOLD_WAITING)) { 2762 #ifdef PMAP_DEBUG 2763 kprintf("pv waiting on %s:%d\n", 2764 pv->pv_func, pv->pv_line); 2765 #endif 2766 tsleep(pv, PINTERLOCKED, "pvwait", hz); 2767 } 2768 /* retry */ 2769 } 2770 } 2771 2772 /* 2773 * Unlock a held and locked pv, keeping the hold count. 2774 */ 2775 static 2776 void 2777 pv_unlock(pv_entry_t pv) 2778 { 2779 u_int count; 2780 2781 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 1)) 2782 return; 2783 2784 for (;;) { 2785 count = pv->pv_hold; 2786 cpu_ccfence(); 2787 KKASSERT((count & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >= 2788 (PV_HOLD_LOCKED | 1)); 2789 if (atomic_cmpset_int(&pv->pv_hold, count, 2790 count & 2791 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 2792 if (count & PV_HOLD_WAITING) 2793 wakeup(pv); 2794 break; 2795 } 2796 } 2797 } 2798 2799 /* 2800 * Unlock and drop a pv. If the pv is no longer associated with a pmap 2801 * and the hold count drops to zero we will free it. 2802 * 2803 * Caller should not hold any spin locks. 
We are protected from hold races
2804  * by virtue of holds occurring only with a pmap_spin or vm_page_spin
2805  * lock held.  A pv cannot be located otherwise.
2806  */
2807 static
2808 void
2809 pv_put(pv_entry_t pv)
2810 {
2811 	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 0)) {
2812 		if (pv->pv_pmap == NULL)
2813 			zfree(pvzone, pv);
2814 		return;
2815 	}
2816 	pv_unlock(pv);
2817 	pv_drop(pv);
2818 }
2819
2820 /*
2821  * Unlock, drop, and free a pv, destroying it.  The pv is removed from its
2822  * pmap.  Any pte operations must have already been completed.
2823  */
2824 static
2825 void
2826 pv_free(pv_entry_t pv)
2827 {
2828 	pmap_t pmap;
2829
2830 	KKASSERT(pv->pv_m == NULL);
2831 	if ((pmap = pv->pv_pmap) != NULL) {
2832 		spin_lock(&pmap->pm_spin);
2833 		pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
2834 		if (pmap->pm_pvhint == pv)
2835 			pmap->pm_pvhint = NULL;
2836 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
2837 		pv->pv_pmap = NULL;
2838 		pv->pv_pindex = 0;
2839 		spin_unlock(&pmap->pm_spin);
2840 	}
2841 	pv_put(pv);
2842 }
2843
2844 /*
2845  * This routine is very drastic, but can save the system
2846  * in a pinch.
2847  */
2848 void
2849 pmap_collect(void)
2850 {
2851 	int i;
2852 	vm_page_t m;
2853 	static int warningdone=0;
2854
2855 	if (pmap_pagedaemon_waken == 0)
2856 		return;
2857 	pmap_pagedaemon_waken = 0;
2858 	if (warningdone < 5) {
2859 		kprintf("pmap_collect: collecting pv entries -- "
2860 			"suggest increasing PMAP_SHPGPERPROC\n");
2861 		warningdone++;
2862 	}
2863
2864 	for (i = 0; i < vm_page_array_size; i++) {
2865 		m = &vm_page_array[i];
2866 		if (m->wire_count || m->hold_count)
2867 			continue;
2868 		if (vm_page_busy_try(m, TRUE) == 0) {
2869 			if (m->wire_count == 0 && m->hold_count == 0) {
2870 				pmap_remove_all(m);
2871 			}
2872 			vm_page_wakeup(m);
2873 		}
2874 	}
2875 }
2876
2877 /*
2878  * Scan the pmap for active page table entries and issue a callback.
2879  * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
2880  * its parent page table.
2881  *
2882  * pte_pv will be NULL if the page or page table is unmanaged.
2883  * pt_pv will point to the page table page containing the pte for the page.
2884  *
2885  * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
2886  *	 we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
2887  *	 process pmap's PD page to the callback function.  This can be
2888  *	 confusing because the pt_pv is really a pd_pv, and the target page
2889  *	 table page is simply aliased by the pmap and not owned by it.
2890  *
2891  * It is assumed that the start and end are properly rounded to the page size.
2892  *
2893  * It is assumed that PD pages and above are managed and thus in the RB tree,
2894  * allowing us to use RB_SCAN from the PD pages down for ranged scans.
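 *
 * (For illustration, a typical caller -- modeled on pmap_remove()
 *  further below -- drives the scan as:
 *
 *	struct pmap_scan_info info;
 *
 *	info.pmap = pmap;
 *	info.sva = sva;
 *	info.eva = eva;
 *	info.func = pmap_remove_callback;
 *	info.arg = NULL;
 *	info.doinval = 1;
 *	pmap_scan(&info);
 *
 *  The callback is handed (pmap, info, pte_pv, pt_pv, sharept, va, ptep,
 *  arg) and must dispose of any locked pte_pv it receives.)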
2895 */ 2896 struct pmap_scan_info { 2897 struct pmap *pmap; 2898 vm_offset_t sva; 2899 vm_offset_t eva; 2900 vm_pindex_t sva_pd_pindex; 2901 vm_pindex_t eva_pd_pindex; 2902 void (*func)(pmap_t, struct pmap_scan_info *, 2903 pv_entry_t, pv_entry_t, int, vm_offset_t, 2904 pt_entry_t *, void *); 2905 void *arg; 2906 int doinval; 2907 struct pmap_inval_info inval; 2908 }; 2909 2910 static int pmap_scan_cmp(pv_entry_t pv, void *data); 2911 static int pmap_scan_callback(pv_entry_t pv, void *data); 2912 2913 static void 2914 pmap_scan(struct pmap_scan_info *info) 2915 { 2916 struct pmap *pmap = info->pmap; 2917 pv_entry_t pd_pv; /* A page directory PV */ 2918 pv_entry_t pt_pv; /* A page table PV */ 2919 pv_entry_t pte_pv; /* A page table entry PV */ 2920 pt_entry_t *ptep; 2921 struct pv_entry dummy_pv; 2922 2923 if (pmap == NULL) 2924 return; 2925 2926 /* 2927 * Hold the token for stability; if the pmap is empty we have nothing 2928 * to do. 2929 */ 2930 lwkt_gettoken(&pmap->pm_token); 2931 #if 0 2932 if (pmap->pm_stats.resident_count == 0) { 2933 lwkt_reltoken(&pmap->pm_token); 2934 return; 2935 } 2936 #endif 2937 2938 pmap_inval_init(&info->inval); 2939 2940 /* 2941 * Special handling for scanning one page, which is a very common 2942 * operation (it is?). 2943 * 2944 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 2945 */ 2946 if (info->sva + PAGE_SIZE == info->eva) { 2947 if (info->sva >= VM_MAX_USER_ADDRESS) { 2948 /* 2949 * Kernel mappings do not track wire counts on 2950 * page table pages and only maintain pd_pv and 2951 * pte_pv levels so pmap_scan() works. 2952 */ 2953 pt_pv = NULL; 2954 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva)); 2955 ptep = vtopte(info->sva); 2956 } else { 2957 /* 2958 * User pages which are unmanaged will not have a 2959 * pte_pv. User page table pages which are unmanaged 2960 * (shared from elsewhere) will also not have a pt_pv. 2961 * The func() callback will pass both pte_pv and pt_pv 2962 * as NULL in that case. 2963 */ 2964 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva)); 2965 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva)); 2966 if (pt_pv == NULL) { 2967 KKASSERT(pte_pv == NULL); 2968 pd_pv = pv_get(pmap, pmap_pd_pindex(info->sva)); 2969 if (pd_pv) { 2970 ptep = pv_pte_lookup(pd_pv, 2971 pmap_pt_index(info->sva)); 2972 if (*ptep) { 2973 info->func(pmap, info, 2974 NULL, pd_pv, 1, 2975 info->sva, ptep, 2976 info->arg); 2977 } 2978 pv_put(pd_pv); 2979 } 2980 goto fast_skip; 2981 } 2982 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 2983 } 2984 if (*ptep == 0) { 2985 /* 2986 * Unlike the pv_find() case below we actually 2987 * acquired a locked pv in this case so any 2988 * race should have been resolved. It is expected 2989 * to not exist. 2990 */ 2991 KKASSERT(pte_pv == NULL); 2992 } else if (pte_pv) { 2993 KASSERT((*ptep & (PG_MANAGED|PG_V)) == (PG_MANAGED| 2994 PG_V), 2995 ("bad *ptep %016lx sva %016lx pte_pv %p", 2996 *ptep, info->sva, pte_pv)); 2997 info->func(pmap, info, pte_pv, pt_pv, 0, 2998 info->sva, ptep, info->arg); 2999 } else { 3000 KASSERT((*ptep & (PG_MANAGED|PG_V)) == PG_V, 3001 ("bad *ptep %016lx sva %016lx pte_pv NULL", 3002 *ptep, info->sva)); 3003 info->func(pmap, info, NULL, pt_pv, 0, 3004 info->sva, ptep, info->arg); 3005 } 3006 if (pt_pv) 3007 pv_put(pt_pv); 3008 fast_skip: 3009 pmap_inval_done(&info->inval); 3010 lwkt_reltoken(&pmap->pm_token); 3011 return; 3012 } 3013 3014 /* 3015 * Nominal scan case, RB_SCAN() for PD pages and iterate from 3016 * there. 
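	 *
	 * (For illustration: [sva, eva) is first converted into a PD pindex
	 *  range, [sva_pd_pindex, eva_pd_pindex), which pmap_scan_cmp()
	 *  uses to bound the RB_SCAN.  Kernel addresses have no PD
	 *  pv_entry's, so that case steps a zeroed dummy_pv through the
	 *  same pindex range and calls pmap_scan_callback() on it
	 *  directly.)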
3017 */ 3018 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 3019 info->eva_pd_pindex = pmap_pd_pindex(info->eva + NBPDP - 1); 3020 3021 if (info->sva >= VM_MAX_USER_ADDRESS) { 3022 /* 3023 * The kernel does not currently maintain any pv_entry's for 3024 * higher-level page tables. 3025 */ 3026 bzero(&dummy_pv, sizeof(dummy_pv)); 3027 dummy_pv.pv_pindex = info->sva_pd_pindex; 3028 spin_lock(&pmap->pm_spin); 3029 while (dummy_pv.pv_pindex < info->eva_pd_pindex) { 3030 pmap_scan_callback(&dummy_pv, info); 3031 ++dummy_pv.pv_pindex; 3032 } 3033 spin_unlock(&pmap->pm_spin); 3034 } else { 3035 /* 3036 * User page tables maintain local PML4, PDP, and PD 3037 * pv_entry's at the very least. PT pv's might be 3038 * unmanaged and thus not exist. PTE pv's might be 3039 * unmanaged and thus not exist. 3040 */ 3041 spin_lock(&pmap->pm_spin); 3042 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, 3043 pmap_scan_cmp, pmap_scan_callback, info); 3044 spin_unlock(&pmap->pm_spin); 3045 } 3046 pmap_inval_done(&info->inval); 3047 lwkt_reltoken(&pmap->pm_token); 3048 } 3049 3050 /* 3051 * WARNING! pmap->pm_spin held 3052 */ 3053 static int 3054 pmap_scan_cmp(pv_entry_t pv, void *data) 3055 { 3056 struct pmap_scan_info *info = data; 3057 if (pv->pv_pindex < info->sva_pd_pindex) 3058 return(-1); 3059 if (pv->pv_pindex >= info->eva_pd_pindex) 3060 return(1); 3061 return(0); 3062 } 3063 3064 /* 3065 * WARNING! pmap->pm_spin held 3066 */ 3067 static int 3068 pmap_scan_callback(pv_entry_t pv, void *data) 3069 { 3070 struct pmap_scan_info *info = data; 3071 struct pmap *pmap = info->pmap; 3072 pv_entry_t pd_pv; /* A page directory PV */ 3073 pv_entry_t pt_pv; /* A page table PV */ 3074 pv_entry_t pte_pv; /* A page table entry PV */ 3075 pt_entry_t *ptep; 3076 vm_offset_t sva; 3077 vm_offset_t eva; 3078 vm_offset_t va_next; 3079 vm_pindex_t pd_pindex; 3080 int error; 3081 3082 /* 3083 * Pull the PD pindex from the pv before releasing the spinlock. 3084 * 3085 * WARNING: pv is faked for kernel pmap scans. 3086 */ 3087 pd_pindex = pv->pv_pindex; 3088 spin_unlock(&pmap->pm_spin); 3089 pv = NULL; /* invalid after spinlock unlocked */ 3090 3091 /* 3092 * Calculate the page range within the PD. SIMPLE pmaps are 3093 * direct-mapped for the entire 2^64 address space. Normal pmaps 3094 * reflect the user and kernel address space which requires 3095 * cannonicalization w/regards to converting pd_pindex's back 3096 * into addresses. 3097 */ 3098 sva = (pd_pindex - NUPTE_TOTAL - NUPT_TOTAL) << PDPSHIFT; 3099 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 3100 (sva & PML4_SIGNMASK)) { 3101 sva |= PML4_SIGNMASK; 3102 } 3103 eva = sva + NBPDP; /* can overflow */ 3104 if (sva < info->sva) 3105 sva = info->sva; 3106 if (eva < info->sva || eva > info->eva) 3107 eva = info->eva; 3108 3109 /* 3110 * NOTE: kernel mappings do not track page table pages, only 3111 * terminal pages. 3112 * 3113 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 3114 * However, for the scan to be efficient we try to 3115 * cache items top-down. 3116 */ 3117 pd_pv = NULL; 3118 pt_pv = NULL; 3119 3120 for (; sva < eva; sva = va_next) { 3121 if (sva >= VM_MAX_USER_ADDRESS) { 3122 if (pt_pv) { 3123 pv_put(pt_pv); 3124 pt_pv = NULL; 3125 } 3126 goto kernel_skip; 3127 } 3128 3129 /* 3130 * PD cache (degenerate case if we skip). It is possible 3131 * for the PD to not exist due to races. This is ok. 
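		 *
		 * (pd_pv and pt_pv effectively act as a small cache across
		 *  loop iterations here -- they are only put and re-fetched
		 *  when sva crosses into a different PD or PT, per the
		 *  top-down caching note above.)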
3132 */ 3133 if (pd_pv == NULL) { 3134 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3135 } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 3136 pv_put(pd_pv); 3137 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3138 } 3139 if (pd_pv == NULL) { 3140 va_next = (sva + NBPDP) & ~PDPMASK; 3141 if (va_next < sva) 3142 va_next = eva; 3143 continue; 3144 } 3145 3146 /* 3147 * PT cache 3148 */ 3149 if (pt_pv == NULL) { 3150 if (pd_pv) { 3151 pv_put(pd_pv); 3152 pd_pv = NULL; 3153 } 3154 pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); 3155 } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) { 3156 if (pd_pv) { 3157 pv_put(pd_pv); 3158 pd_pv = NULL; 3159 } 3160 pv_put(pt_pv); 3161 pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); 3162 } 3163 3164 /* 3165 * If pt_pv is NULL we either have an shared page table 3166 * page and must issue a callback specific to that case, 3167 * or there is no page table page. 3168 * 3169 * Either way we can skip the page table page. 3170 */ 3171 if (pt_pv == NULL) { 3172 /* 3173 * Possible unmanaged (shared from another pmap) 3174 * page table page. 3175 */ 3176 if (pd_pv == NULL) 3177 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3178 KKASSERT(pd_pv != NULL); 3179 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 3180 if (*ptep & PG_V) { 3181 info->func(pmap, info, NULL, pd_pv, 1, 3182 sva, ptep, info->arg); 3183 } 3184 3185 /* 3186 * Done, move to next page table page. 3187 */ 3188 va_next = (sva + NBPDR) & ~PDRMASK; 3189 if (va_next < sva) 3190 va_next = eva; 3191 continue; 3192 } 3193 3194 /* 3195 * From this point in the loop testing pt_pv for non-NULL 3196 * means we are in UVM, else if it is NULL we are in KVM. 3197 * 3198 * Limit our scan to either the end of the va represented 3199 * by the current page table page, or to the end of the 3200 * range being removed. 3201 */ 3202 kernel_skip: 3203 va_next = (sva + NBPDR) & ~PDRMASK; 3204 if (va_next < sva) 3205 va_next = eva; 3206 if (va_next > eva) 3207 va_next = eva; 3208 3209 /* 3210 * Scan the page table for pages. Some pages may not be 3211 * managed (might not have a pv_entry). 3212 * 3213 * There is no page table management for kernel pages so 3214 * pt_pv will be NULL in that case, but otherwise pt_pv 3215 * is non-NULL, locked, and referenced. 3216 */ 3217 3218 /* 3219 * At this point a non-NULL pt_pv means a UVA, and a NULL 3220 * pt_pv means a KVA. 3221 */ 3222 if (pt_pv) 3223 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 3224 else 3225 ptep = vtopte(sva); 3226 3227 while (sva < va_next) { 3228 /* 3229 * Acquire the related pte_pv, if any. If *ptep == 0 3230 * the related pte_pv should not exist, but if *ptep 3231 * is not zero the pte_pv may or may not exist (e.g. 3232 * will not exist for an unmanaged page). 3233 * 3234 * However a multitude of races are possible here. 3235 * 3236 * In addition, the (pt_pv, pte_pv) lock order is 3237 * backwards, so we have to be careful in aquiring 3238 * a properly locked pte_pv. 
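			 *
			 * (Sketch of the recovery path below: when
			 *  pv_get_try() reports an error the already-held
			 *  pd_pv/pt_pv locks are dropped first, the code
			 *  then blocks on the pte_pv lock and immediately
			 *  releases it, and finally pt_pv and ptep are
			 *  re-acquired before the inner loop continues,
			 *  preserving the bottom-up lock order.)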
3239 */ 3240 if (pt_pv) { 3241 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 3242 &error); 3243 if (error) { 3244 if (pd_pv) { 3245 pv_put(pd_pv); 3246 pd_pv = NULL; 3247 } 3248 pv_put(pt_pv); /* must be non-NULL */ 3249 pt_pv = NULL; 3250 pv_lock(pte_pv); /* safe to block now */ 3251 pv_put(pte_pv); 3252 pte_pv = NULL; 3253 pt_pv = pv_get(pmap, 3254 pmap_pt_pindex(sva)); 3255 /* 3256 * pt_pv reloaded, need new ptep 3257 */ 3258 KKASSERT(pt_pv != NULL); 3259 ptep = pv_pte_lookup(pt_pv, 3260 pmap_pte_index(sva)); 3261 continue; 3262 } 3263 } else { 3264 pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); 3265 } 3266 3267 /* 3268 * Ok, if *ptep == 0 we had better NOT have a pte_pv. 3269 */ 3270 if (*ptep == 0) { 3271 if (pte_pv) { 3272 kprintf("Unexpected non-NULL pte_pv " 3273 "%p pt_pv %p *ptep = %016lx\n", 3274 pte_pv, pt_pv, *ptep); 3275 panic("Unexpected non-NULL pte_pv"); 3276 } 3277 sva += PAGE_SIZE; 3278 ++ptep; 3279 continue; 3280 } 3281 3282 /* 3283 * Ready for the callback. The locked pte_pv (if any) 3284 * is consumed by the callback. pte_pv will exist if 3285 * the page is managed, and will not exist if it 3286 * isn't. 3287 */ 3288 if (pte_pv) { 3289 KASSERT((*ptep & (PG_MANAGED|PG_V)) == 3290 (PG_MANAGED|PG_V), 3291 ("bad *ptep %016lx sva %016lx " 3292 "pte_pv %p", 3293 *ptep, sva, pte_pv)); 3294 info->func(pmap, info, pte_pv, pt_pv, 0, 3295 sva, ptep, info->arg); 3296 } else { 3297 KASSERT((*ptep & (PG_MANAGED|PG_V)) == 3298 PG_V, 3299 ("bad *ptep %016lx sva %016lx " 3300 "pte_pv NULL", 3301 *ptep, sva)); 3302 info->func(pmap, info, NULL, pt_pv, 0, 3303 sva, ptep, info->arg); 3304 } 3305 pte_pv = NULL; 3306 sva += PAGE_SIZE; 3307 ++ptep; 3308 } 3309 lwkt_yield(); 3310 } 3311 if (pd_pv) { 3312 pv_put(pd_pv); 3313 pd_pv = NULL; 3314 } 3315 if (pt_pv) { 3316 pv_put(pt_pv); 3317 pt_pv = NULL; 3318 } 3319 lwkt_yield(); 3320 3321 /* 3322 * Relock before returning. 3323 */ 3324 spin_lock(&pmap->pm_spin); 3325 return (0); 3326 } 3327 3328 void 3329 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 3330 { 3331 struct pmap_scan_info info; 3332 3333 info.pmap = pmap; 3334 info.sva = sva; 3335 info.eva = eva; 3336 info.func = pmap_remove_callback; 3337 info.arg = NULL; 3338 info.doinval = 1; /* normal remove requires pmap inval */ 3339 pmap_scan(&info); 3340 } 3341 3342 static void 3343 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 3344 { 3345 struct pmap_scan_info info; 3346 3347 info.pmap = pmap; 3348 info.sva = sva; 3349 info.eva = eva; 3350 info.func = pmap_remove_callback; 3351 info.arg = NULL; 3352 info.doinval = 0; /* normal remove requires pmap inval */ 3353 pmap_scan(&info); 3354 } 3355 3356 static void 3357 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 3358 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 3359 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 3360 { 3361 pt_entry_t pte; 3362 3363 if (pte_pv) { 3364 /* 3365 * This will also drop pt_pv's wire_count. Note that 3366 * terminal pages are not wired based on mmu presence. 3367 */ 3368 if (info->doinval) 3369 pmap_remove_pv_pte(pte_pv, pt_pv, &info->inval); 3370 else 3371 pmap_remove_pv_pte(pte_pv, pt_pv, NULL); 3372 pmap_remove_pv_page(pte_pv); 3373 pv_free(pte_pv); 3374 } else if (sharept == 0) { 3375 /* 3376 * Unmanaged page 3377 * 3378 * pt_pv's wire_count is still bumped by unmanaged pages 3379 * so we must decrement it manually. 
3380 */ 3381 if (info->doinval) 3382 pmap_inval_interlock(&info->inval, pmap, va); 3383 pte = pte_load_clear(ptep); 3384 if (info->doinval) 3385 pmap_inval_deinterlock(&info->inval, pmap); 3386 if (pte & PG_W) 3387 atomic_add_long(&pmap->pm_stats.wired_count, -1); 3388 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3389 if (vm_page_unwire_quick(pt_pv->pv_m)) 3390 panic("pmap_remove: insufficient wirecount"); 3391 } else { 3392 /* 3393 * Unmanaged page table, pt_pv is actually the pd_pv 3394 * for our pmap (not the share object pmap). 3395 * 3396 * We have to unwire the target page table page and we 3397 * have to unwire our page directory page. 3398 */ 3399 if (info->doinval) 3400 pmap_inval_interlock(&info->inval, pmap, va); 3401 pte = pte_load_clear(ptep); 3402 if (info->doinval) 3403 pmap_inval_deinterlock(&info->inval, pmap); 3404 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3405 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 3406 panic("pmap_remove: shared pgtable1 bad wirecount"); 3407 if (vm_page_unwire_quick(pt_pv->pv_m)) 3408 panic("pmap_remove: shared pgtable2 bad wirecount"); 3409 } 3410 } 3411 3412 /* 3413 * Removes this physical page from all physical maps in which it resides. 3414 * Reflects back modify bits to the pager. 3415 * 3416 * This routine may not be called from an interrupt. 3417 */ 3418 static 3419 void 3420 pmap_remove_all(vm_page_t m) 3421 { 3422 struct pmap_inval_info info; 3423 pv_entry_t pv; 3424 3425 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3426 return; 3427 3428 pmap_inval_init(&info); 3429 vm_page_spin_lock(m); 3430 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3431 KKASSERT(pv->pv_m == m); 3432 if (pv_hold_try(pv)) { 3433 vm_page_spin_unlock(m); 3434 } else { 3435 vm_page_spin_unlock(m); 3436 pv_lock(pv); 3437 if (pv->pv_m != m) { 3438 pv_put(pv); 3439 vm_page_spin_lock(m); 3440 continue; 3441 } 3442 } 3443 /* 3444 * Holding no spinlocks, pv is locked. 3445 */ 3446 pmap_remove_pv_pte(pv, NULL, &info); 3447 pmap_remove_pv_page(pv); 3448 pv_free(pv); 3449 vm_page_spin_lock(m); 3450 } 3451 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 3452 vm_page_spin_unlock(m); 3453 pmap_inval_done(&info); 3454 } 3455 3456 /* 3457 * Set the physical protection on the specified range of this map 3458 * as requested. This function is typically only used for debug watchpoints 3459 * and COW pages. 3460 * 3461 * This function may not be called from an interrupt if the map is 3462 * not the kernel_pmap. 3463 * 3464 * NOTE! For shared page table pages we just unmap the page. 3465 */ 3466 void 3467 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3468 { 3469 struct pmap_scan_info info; 3470 /* JG review for NX */ 3471 3472 if (pmap == NULL) 3473 return; 3474 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 3475 pmap_remove(pmap, sva, eva); 3476 return; 3477 } 3478 if (prot & VM_PROT_WRITE) 3479 return; 3480 info.pmap = pmap; 3481 info.sva = sva; 3482 info.eva = eva; 3483 info.func = pmap_protect_callback; 3484 info.arg = &prot; 3485 info.doinval = 1; 3486 pmap_scan(&info); 3487 } 3488 3489 static 3490 void 3491 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 3492 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 3493 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 3494 { 3495 pt_entry_t pbits; 3496 pt_entry_t cbits; 3497 pt_entry_t pte; 3498 vm_page_t m; 3499 3500 /* 3501 * XXX non-optimal. 
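	 *
	 * (For illustration, the callback below harvests PG_A into
	 *  PG_REFERENCED and PG_M into vm_page_dirty() for managed pages,
	 *  then strips PG_RW with an atomic_cmpset_long() retry loop; a
	 *  shared page table page is instead unmapped entirely, per the
	 *  NOTE above pmap_protect().)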
3502 	 */
3503 	pmap_inval_interlock(&info->inval, pmap, va);
3504 again:
3505 	pbits = *ptep;
3506 	cbits = pbits;
3507 	if (pte_pv) {
3508 		m = NULL;
3509 		if (pbits & PG_A) {
3510 			m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3511 			KKASSERT(m == pte_pv->pv_m);
3512 			vm_page_flag_set(m, PG_REFERENCED);
3513 			cbits &= ~PG_A;
3514 		}
3515 		if (pbits & PG_M) {
3516 			if (pmap_track_modified(pte_pv->pv_pindex)) {
3517 				if (m == NULL)
3518 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3519 				vm_page_dirty(m);
3520 				cbits &= ~PG_M;
3521 			}
3522 		}
3523 	} else if (sharept) {
3524 		/*
3525 		 * Unmanaged page table, pt_pv is actually the pd_pv
3526 		 * for our pmap (not the share object pmap).
3527 		 *
3528 		 * When asked to protect something in a shared page table
3529 		 * page we just unmap the page table page.  We have to
3530 		 * invalidate the tlb in this situation.
3531 		 */
3532 		pte = pte_load_clear(ptep);
3533 		pmap_inval_invltlb(&info->inval);
3534 		if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
3535 			panic("pmap_protect: pgtable1 pg bad wirecount");
3536 		if (vm_page_unwire_quick(pt_pv->pv_m))
3537 			panic("pmap_protect: pgtable2 pg bad wirecount");
3538 		ptep = NULL;
3539 	}
3540 	/* else unmanaged page, adjust bits, no wire changes */
3541
3542 	if (ptep) {
3543 		cbits &= ~PG_RW;
3544 		if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
3545 			goto again;
3546 		}
3547 	}
3548 	pmap_inval_deinterlock(&info->inval, pmap);
3549 	if (pte_pv)
3550 		pv_put(pte_pv);
3551 }
3552
3553 /*
3554  * Insert the vm_page (m) at the virtual address (va), replacing any prior
3555  * mapping at that address.  Set protection and wiring as requested.
3556  *
3557  * If entry is non-NULL we check to see if the SEG_SIZE optimization is
3558  * possible.  If it is we enter the page into the appropriate shared pmap
3559  * hanging off the related VM object instead of the passed pmap, then we
3560  * share the page table page from the VM object's pmap into the current pmap.
3561  *
3562  * NOTE: This routine MUST insert the page into the pmap now; it cannot
3563  *	 lazy-evaluate.
3564  */
3565 void
3566 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3567 	   boolean_t wired, vm_map_entry_t entry)
3568 {
3569 	pmap_inval_info info;
3570 	pv_entry_t pt_pv;	/* page table */
3571 	pv_entry_t pte_pv;	/* page table entry */
3572 	pt_entry_t *ptep;
3573 	vm_paddr_t opa;
3574 	pt_entry_t origpte, newpte;
3575 	vm_paddr_t pa;
3576
3577 	if (pmap == NULL)
3578 		return;
3579 	va = trunc_page(va);
3580 #ifdef PMAP_DIAGNOSTIC
3581 	if (va >= KvaEnd)
3582 		panic("pmap_enter: toobig");
3583 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
3584 		panic("pmap_enter: invalid to pmap_enter page table "
3585 		      "pages (va: 0x%lx)", va);
3586 #endif
3587 	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
3588 		kprintf("Warning: pmap_enter called on UVA with "
3589 			"kernel_pmap\n");
3590 #ifdef DDB
3591 		db_print_backtrace();
3592 #endif
3593 	}
3594 	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
3595 		kprintf("Warning: pmap_enter called on KVA without "
3596 			"kernel_pmap\n");
3597 #ifdef DDB
3598 		db_print_backtrace();
3599 #endif
3600 	}
3601
3602 	/*
3603 	 * Get locked PV entries for our new page table entry (pte_pv)
3604 	 * and for its parent page table (pt_pv).  We need the parent
3605 	 * so we can resolve the location of the ptep.
3606 	 *
3607 	 * Only hardware MMU actions can modify the ptep out from
3608 	 * under us.
3609 	 *
3610 	 * if (m) is fictitious or unmanaged we do not create a managing
3611 	 * pte_pv for it.
Any pre-existing page's management state must 3612 * match (avoiding code complexity). 3613 * 3614 * If the pmap is still being initialized we assume existing 3615 * page tables. 3616 * 3617 * Kernel mapppings do not track page table pages (i.e. pt_pv). 3618 * pmap_allocpte() checks the 3619 */ 3620 if (pmap_initialized == FALSE) { 3621 pte_pv = NULL; 3622 pt_pv = NULL; 3623 ptep = vtopte(va); 3624 } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) { /* XXX */ 3625 pte_pv = NULL; 3626 if (va >= VM_MAX_USER_ADDRESS) { 3627 pt_pv = NULL; 3628 ptep = vtopte(va); 3629 } else { 3630 pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), 3631 NULL, entry, va); 3632 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 3633 } 3634 KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0); 3635 } else { 3636 if (va >= VM_MAX_USER_ADDRESS) { 3637 /* 3638 * Kernel map, pv_entry-tracked. 3639 */ 3640 pt_pv = NULL; 3641 pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); 3642 ptep = vtopte(va); 3643 } else { 3644 /* 3645 * User map 3646 */ 3647 pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va), 3648 &pt_pv, entry, va); 3649 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 3650 } 3651 KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED)); 3652 } 3653 3654 pa = VM_PAGE_TO_PHYS(m); 3655 origpte = *ptep; 3656 opa = origpte & PG_FRAME; 3657 3658 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V | PG_A); 3659 if (wired) 3660 newpte |= PG_W; 3661 if (va < VM_MAX_USER_ADDRESS) 3662 newpte |= PG_U; 3663 if (pte_pv) 3664 newpte |= PG_MANAGED; 3665 if (pmap == &kernel_pmap) 3666 newpte |= pgeflag; 3667 newpte |= pat_pte_index[m->pat_mode]; 3668 3669 /* 3670 * It is possible for multiple faults to occur in threaded 3671 * environments, the existing pte might be correct. 3672 */ 3673 if (((origpte ^ newpte) & ~(pt_entry_t)(PG_M|PG_A)) == 0) 3674 goto done; 3675 3676 if ((prot & VM_PROT_NOSYNC) == 0) 3677 pmap_inval_init(&info); 3678 3679 /* 3680 * Ok, either the address changed or the protection or wiring 3681 * changed. 3682 * 3683 * Clear the current entry, interlocking the removal. For managed 3684 * pte's this will also flush the modified state to the vm_page. 3685 * Atomic ops are mandatory in order to ensure that PG_M events are 3686 * not lost during any transition. 3687 */ 3688 if (opa) { 3689 if (pte_pv) { 3690 /* 3691 * pmap_remove_pv_pte() unwires pt_pv and assumes 3692 * we will free pte_pv, but since we are reusing 3693 * pte_pv we want to retain the wire count. 3694 * 3695 * pt_pv won't exist for a kernel page (managed or 3696 * otherwise). 3697 */ 3698 if (pt_pv) 3699 vm_page_wire_quick(pt_pv->pv_m); 3700 if (prot & VM_PROT_NOSYNC) 3701 pmap_remove_pv_pte(pte_pv, pt_pv, NULL); 3702 else 3703 pmap_remove_pv_pte(pte_pv, pt_pv, &info); 3704 if (pte_pv->pv_m) 3705 pmap_remove_pv_page(pte_pv); 3706 } else if (prot & VM_PROT_NOSYNC) { 3707 /* 3708 * Unmanaged page, NOSYNC (no mmu sync) requested. 3709 * 3710 * Leave wire count on PT page intact. 3711 */ 3712 (void)pte_load_clear(ptep); 3713 cpu_invlpg((void *)va); 3714 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3715 } else { 3716 /* 3717 * Unmanaged page, normal enter. 3718 * 3719 * Leave wire count on PT page intact. 3720 */ 3721 pmap_inval_interlock(&info, pmap, va); 3722 (void)pte_load_clear(ptep); 3723 pmap_inval_deinterlock(&info, pmap); 3724 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3725 } 3726 KKASSERT(*ptep == 0); 3727 } 3728 3729 if (pte_pv) { 3730 /* 3731 * Enter on the PV list if part of our managed memory. 
3732 * Wiring of the PT page is already handled. 3733 */ 3734 KKASSERT(pte_pv->pv_m == NULL); 3735 vm_page_spin_lock(m); 3736 pte_pv->pv_m = m; 3737 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); 3738 /* 3739 if (m->object) 3740 atomic_add_int(&m->object->agg_pv_list_count, 1); 3741 */ 3742 vm_page_flag_set(m, PG_MAPPED); 3743 vm_page_spin_unlock(m); 3744 } else if (pt_pv && opa == 0) { 3745 /* 3746 * We have to adjust the wire count on the PT page ourselves 3747 * for unmanaged entries. If opa was non-zero we retained 3748 * the existing wire count from the removal. 3749 */ 3750 vm_page_wire_quick(pt_pv->pv_m); 3751 } 3752 3753 /* 3754 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks. 3755 * 3756 * User VMAs do not because those will be zero->non-zero, so no 3757 * stale entries to worry about at this point. 3758 * 3759 * For KVM there appear to still be issues. Theoretically we 3760 * should be able to scrap the interlocks entirely but we 3761 * get crashes. 3762 */ 3763 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) 3764 pmap_inval_interlock(&info, pmap, va); 3765 3766 /* 3767 * Set the pte 3768 */ 3769 *(volatile pt_entry_t *)ptep = newpte; 3770 3771 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) 3772 pmap_inval_deinterlock(&info, pmap); 3773 else if (pt_pv == NULL) 3774 cpu_invlpg((void *)va); 3775 3776 if (wired) { 3777 if (pte_pv) { 3778 atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count, 3779 1); 3780 } else { 3781 atomic_add_long(&pmap->pm_stats.wired_count, 1); 3782 } 3783 } 3784 if (newpte & PG_RW) 3785 vm_page_flag_set(m, PG_WRITEABLE); 3786 3787 /* 3788 * Unmanaged pages need manual resident_count tracking. 3789 */ 3790 if (pte_pv == NULL && pt_pv) 3791 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); 3792 3793 /* 3794 * Cleanup 3795 */ 3796 if ((prot & VM_PROT_NOSYNC) == 0 || pte_pv == NULL) 3797 pmap_inval_done(&info); 3798 done: 3799 KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED)); 3800 3801 /* 3802 * Cleanup the pv entry, allowing other accessors. 3803 */ 3804 if (pte_pv) 3805 pv_put(pte_pv); 3806 if (pt_pv) 3807 pv_put(pt_pv); 3808 } 3809 3810 /* 3811 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 3812 * This code also assumes that the pmap has no pre-existing entry for this 3813 * VA. 3814 * 3815 * This code currently may only be used on user pmaps, not kernel_pmap. 3816 */ 3817 void 3818 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 3819 { 3820 pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL); 3821 } 3822 3823 /* 3824 * Make a temporary mapping for a physical address. This is only intended 3825 * to be used for panic dumps. 3826 * 3827 * The caller is responsible for calling smp_invltlb(). 3828 */ 3829 void * 3830 pmap_kenter_temporary(vm_paddr_t pa, long i) 3831 { 3832 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 3833 return ((void *)crashdumpmap); 3834 } 3835 3836 #define MAX_INIT_PT (96) 3837 3838 /* 3839 * This routine preloads the ptes for a given object into the specified pmap. 3840 * This eliminates the blast of soft faults on process startup and 3841 * immediately after an mmap. 
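 *
 * (For illustration, the guards applied below: only OBJT_VNODE objects
 *  are preloaded, MAP_PREFAULT_PARTIAL requests larger than MAX_INIT_PT
 *  (96) pages are skipped when the object already has many resident
 *  pages, and fully segment-aligned ranges are left alone so that
 *  vm_fault/pmap_enter can use a shared page table page instead.)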
3842 */ 3843 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 3844 3845 void 3846 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 3847 vm_object_t object, vm_pindex_t pindex, 3848 vm_size_t size, int limit) 3849 { 3850 struct rb_vm_page_scan_info info; 3851 struct lwp *lp; 3852 vm_size_t psize; 3853 3854 /* 3855 * We can't preinit if read access isn't set or there is no pmap 3856 * or object. 3857 */ 3858 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 3859 return; 3860 3861 /* 3862 * We can't preinit if the pmap is not the current pmap 3863 */ 3864 lp = curthread->td_lwp; 3865 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 3866 return; 3867 3868 /* 3869 * Misc additional checks 3870 */ 3871 psize = x86_64_btop(size); 3872 3873 if ((object->type != OBJT_VNODE) || 3874 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 3875 (object->resident_page_count > MAX_INIT_PT))) { 3876 return; 3877 } 3878 3879 if (pindex + psize > object->size) { 3880 if (object->size < pindex) 3881 return; 3882 psize = object->size - pindex; 3883 } 3884 3885 if (psize == 0) 3886 return; 3887 3888 /* 3889 * If everything is segment-aligned do not pre-init here. Instead 3890 * allow the normal vm_fault path to pass a segment hint to 3891 * pmap_enter() which will then use an object-referenced shared 3892 * page table page. 3893 */ 3894 if ((addr & SEG_MASK) == 0 && 3895 (ctob(psize) & SEG_MASK) == 0 && 3896 (ctob(pindex) & SEG_MASK) == 0) { 3897 return; 3898 } 3899 3900 /* 3901 * Use a red-black scan to traverse the requested range and load 3902 * any valid pages found into the pmap. 3903 * 3904 * We cannot safely scan the object's memq without holding the 3905 * object token. 3906 */ 3907 info.start_pindex = pindex; 3908 info.end_pindex = pindex + psize - 1; 3909 info.limit = limit; 3910 info.mpte = NULL; 3911 info.addr = addr; 3912 info.pmap = pmap; 3913 3914 vm_object_hold_shared(object); 3915 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 3916 pmap_object_init_pt_callback, &info); 3917 vm_object_drop(object); 3918 } 3919 3920 static 3921 int 3922 pmap_object_init_pt_callback(vm_page_t p, void *data) 3923 { 3924 struct rb_vm_page_scan_info *info = data; 3925 vm_pindex_t rel_index; 3926 3927 /* 3928 * don't allow an madvise to blow away our really 3929 * free pages allocating pv entries. 3930 */ 3931 if ((info->limit & MAP_PREFAULT_MADVISE) && 3932 vmstats.v_free_count < vmstats.v_free_reserved) { 3933 return(-1); 3934 } 3935 3936 /* 3937 * Ignore list markers and ignore pages we cannot instantly 3938 * busy (while holding the object token). 3939 */ 3940 if (p->flags & PG_MARKER) 3941 return 0; 3942 if (vm_page_busy_try(p, TRUE)) 3943 return 0; 3944 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 3945 (p->flags & PG_FICTITIOUS) == 0) { 3946 if ((p->queue - p->pc) == PQ_CACHE) 3947 vm_page_deactivate(p); 3948 rel_index = p->pindex - info->start_pindex; 3949 pmap_enter_quick(info->pmap, 3950 info->addr + x86_64_ptob(rel_index), p); 3951 } 3952 vm_page_wakeup(p); 3953 lwkt_yield(); 3954 return(0); 3955 } 3956 3957 /* 3958 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 3959 * address. 3960 * 3961 * Returns FALSE if it would be non-trivial or if a pte is already loaded 3962 * into the slot. 3963 * 3964 * XXX This is safe only because page table pages are not freed. 
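 *
 * (Note that the probe below is intentionally lockless -- the pm_spin
 *  acquisition is commented out -- and simply tests *pte for PG_V; the
 *  XXX above records why that is tolerable.)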
3965 */ 3966 int 3967 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 3968 { 3969 pt_entry_t *pte; 3970 3971 /*spin_lock(&pmap->pm_spin);*/ 3972 if ((pte = pmap_pte(pmap, addr)) != NULL) { 3973 if (*pte & PG_V) { 3974 /*spin_unlock(&pmap->pm_spin);*/ 3975 return FALSE; 3976 } 3977 } 3978 /*spin_unlock(&pmap->pm_spin);*/ 3979 return TRUE; 3980 } 3981 3982 /* 3983 * Change the wiring attribute for a pmap/va pair. The mapping must already 3984 * exist in the pmap. The mapping may or may not be managed. 3985 */ 3986 void 3987 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired, 3988 vm_map_entry_t entry) 3989 { 3990 pt_entry_t *ptep; 3991 pv_entry_t pv; 3992 3993 if (pmap == NULL) 3994 return; 3995 lwkt_gettoken(&pmap->pm_token); 3996 pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va); 3997 ptep = pv_pte_lookup(pv, pmap_pte_index(va)); 3998 3999 if (wired && !pmap_pte_w(ptep)) 4000 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1); 4001 else if (!wired && pmap_pte_w(ptep)) 4002 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1); 4003 4004 /* 4005 * Wiring is not a hardware characteristic so there is no need to 4006 * invalidate TLB. However, in an SMP environment we must use 4007 * a locked bus cycle to update the pte (if we are not using 4008 * the pmap_inval_*() API that is)... it's ok to do this for simple 4009 * wiring changes. 4010 */ 4011 if (wired) 4012 atomic_set_long(ptep, PG_W); 4013 else 4014 atomic_clear_long(ptep, PG_W); 4015 pv_put(pv); 4016 lwkt_reltoken(&pmap->pm_token); 4017 } 4018 4019 4020 4021 /* 4022 * Copy the range specified by src_addr/len from the source map to 4023 * the range dst_addr/len in the destination map. 4024 * 4025 * This routine is only advisory and need not do anything. 4026 */ 4027 void 4028 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 4029 vm_size_t len, vm_offset_t src_addr) 4030 { 4031 } 4032 4033 /* 4034 * pmap_zero_page: 4035 * 4036 * Zero the specified physical page. 4037 * 4038 * This function may be called from an interrupt and no locking is 4039 * required. 4040 */ 4041 void 4042 pmap_zero_page(vm_paddr_t phys) 4043 { 4044 vm_offset_t va = PHYS_TO_DMAP(phys); 4045 4046 pagezero((void *)va); 4047 } 4048 4049 /* 4050 * pmap_page_assertzero: 4051 * 4052 * Assert that a page is empty, panic if it isn't. 4053 */ 4054 void 4055 pmap_page_assertzero(vm_paddr_t phys) 4056 { 4057 vm_offset_t va = PHYS_TO_DMAP(phys); 4058 size_t i; 4059 4060 for (i = 0; i < PAGE_SIZE; i += sizeof(long)) { 4061 if (*(long *)((char *)va + i) != 0) { 4062 panic("pmap_page_assertzero() @ %p not zero!", 4063 (void *)(intptr_t)va); 4064 } 4065 } 4066 } 4067 4068 /* 4069 * pmap_zero_page: 4070 * 4071 * Zero part of a physical page by mapping it into memory and clearing 4072 * its contents with bzero. 4073 * 4074 * off and size may not cover an area beyond a single hardware page. 4075 */ 4076 void 4077 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 4078 { 4079 vm_offset_t virt = PHYS_TO_DMAP(phys); 4080 4081 bzero((char *)virt + off, size); 4082 } 4083 4084 /* 4085 * pmap_copy_page: 4086 * 4087 * Copy the physical page from the source PA to the target PA. 4088 * This function may be called from an interrupt. No locking 4089 * is required. 
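 *
 *	(As with pmap_zero_page() above, the copy goes through the direct
 *	 map: both physical addresses are translated with PHYS_TO_DMAP()
 *	 and the page is copied with bcopy(), so no temporary KVA mapping
 *	 or locking is needed.)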
4090 */ 4091 void 4092 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 4093 { 4094 vm_offset_t src_virt, dst_virt; 4095 4096 src_virt = PHYS_TO_DMAP(src); 4097 dst_virt = PHYS_TO_DMAP(dst); 4098 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 4099 } 4100 4101 /* 4102 * pmap_copy_page_frag: 4103 * 4104 * Copy the physical page from the source PA to the target PA. 4105 * This function may be called from an interrupt. No locking 4106 * is required. 4107 */ 4108 void 4109 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 4110 { 4111 vm_offset_t src_virt, dst_virt; 4112 4113 src_virt = PHYS_TO_DMAP(src); 4114 dst_virt = PHYS_TO_DMAP(dst); 4115 4116 bcopy((char *)src_virt + (src & PAGE_MASK), 4117 (char *)dst_virt + (dst & PAGE_MASK), 4118 bytes); 4119 } 4120 4121 /* 4122 * Returns true if the pmap's pv is one of the first 16 pvs linked to from 4123 * this page. This count may be changed upwards or downwards in the future; 4124 * it is only necessary that true be returned for a small subset of pmaps 4125 * for proper page aging. 4126 */ 4127 boolean_t 4128 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4129 { 4130 pv_entry_t pv; 4131 int loops = 0; 4132 4133 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 4134 return FALSE; 4135 4136 vm_page_spin_lock(m); 4137 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4138 if (pv->pv_pmap == pmap) { 4139 vm_page_spin_unlock(m); 4140 return TRUE; 4141 } 4142 loops++; 4143 if (loops >= 16) 4144 break; 4145 } 4146 vm_page_spin_unlock(m); 4147 return (FALSE); 4148 } 4149 4150 /* 4151 * Remove all pages from specified address space this aids process exit 4152 * speeds. Also, this code may be special cased for the current process 4153 * only. 4154 */ 4155 void 4156 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4157 { 4158 pmap_remove_noinval(pmap, sva, eva); 4159 cpu_invltlb(); 4160 } 4161 4162 /* 4163 * pmap_testbit tests bits in pte's note that the testbit/clearbit 4164 * routines are inline, and a lot of things compile-time evaluate. 4165 */ 4166 static 4167 boolean_t 4168 pmap_testbit(vm_page_t m, int bit) 4169 { 4170 pv_entry_t pv; 4171 pt_entry_t *pte; 4172 4173 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 4174 return FALSE; 4175 4176 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 4177 return FALSE; 4178 vm_page_spin_lock(m); 4179 if (TAILQ_FIRST(&m->md.pv_list) == NULL) { 4180 vm_page_spin_unlock(m); 4181 return FALSE; 4182 } 4183 4184 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4185 /* 4186 * if the bit being tested is the modified bit, then 4187 * mark clean_map and ptes as never 4188 * modified. 4189 */ 4190 if (bit & (PG_A|PG_M)) { 4191 if (!pmap_track_modified(pv->pv_pindex)) 4192 continue; 4193 } 4194 4195 #if defined(PMAP_DIAGNOSTIC) 4196 if (pv->pv_pmap == NULL) { 4197 kprintf("Null pmap (tb) at pindex: %"PRIu64"\n", 4198 pv->pv_pindex); 4199 continue; 4200 } 4201 #endif 4202 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 4203 if (*pte & bit) { 4204 vm_page_spin_unlock(m); 4205 return TRUE; 4206 } 4207 } 4208 vm_page_spin_unlock(m); 4209 return (FALSE); 4210 } 4211 4212 /* 4213 * This routine is used to modify bits in ptes. Only one bit should be 4214 * specified. PG_RW requires special handling. 
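 *
 * (For illustration, the callers further below use it as:
 *
 *	pmap_clearbit(m, PG_M);		pmap_clear_modify()
 *	pmap_clearbit(m, PG_A);		pmap_clear_reference()
 *	pmap_clearbit(m, PG_RW);	pmap_page_protect(), read-only case
 *
 *  which is why PG_RW takes the heavier pmap_inval-interlocked path.)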

/*
 * This routine is used to modify bits in ptes.  Only one bit should be
 * specified.  PG_RW requires special handling.
 *
 * Caller must NOT hold any spin locks.
 */
static __inline
void
pmap_clearbit(vm_page_t m, int bit)
{
        struct pmap_inval_info info;
        pv_entry_t pv;
        pt_entry_t *pte;
        pt_entry_t pbits;
        pmap_t save_pmap;

        if (bit == PG_RW)
                vm_page_flag_clear(m, PG_WRITEABLE);
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
                return;
        }

        /*
         * PG_M or PG_A case
         *
         * Loop over all current mappings, setting/clearing as appropriate.
         * If setting RO, do we need to clear the VAC?
         *
         * NOTE: When clearing PG_M we could also (not implemented) drop
         *       through to the PG_RW code and clear PG_RW too, forcing
         *       a fault on write to redetect PG_M for virtual kernels, but
         *       it isn't necessary since virtual kernels invalidate the
         *       pte when they clear the VPTE_M bit in their virtual page
         *       tables.
         *
         * NOTE: Does not re-dirty the page when clearing only PG_M.
         */
        if ((bit & PG_RW) == 0) {
                vm_page_spin_lock(m);
                TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
#if defined(PMAP_DIAGNOSTIC)
                        if (pv->pv_pmap == NULL) {
                                kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
                                        pv->pv_pindex);
                                continue;
                        }
#endif
                        pte = pmap_pte_quick(pv->pv_pmap,
                                             pv->pv_pindex << PAGE_SHIFT);
                        pbits = *pte;
                        if (pbits & bit)
                                atomic_clear_long(pte, bit);
                }
                vm_page_spin_unlock(m);
                return;
        }

        /*
         * Clear PG_RW.  Also clears PG_M and marks the page dirty if PG_M
         * was set.
         */
        pmap_inval_init(&info);

restart:
        vm_page_spin_lock(m);
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                /*
                 * Don't write-protect pager mappings.
                 */
                if (!pmap_track_modified(pv->pv_pindex))
                        continue;

#if defined(PMAP_DIAGNOSTIC)
                if (pv->pv_pmap == NULL) {
                        kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
                                pv->pv_pindex);
                        continue;
                }
#endif
                /*
                 * Skip pages which do not have PG_RW set.
                 */
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
                if ((*pte & PG_RW) == 0)
                        continue;

                /*
                 * Lock the PV
                 */
                if (pv_hold_try(pv) == 0) {
                        vm_page_spin_unlock(m);
                        pv_lock(pv);    /* held, now do a blocking lock */
                        pv_put(pv);     /* and release */
                        goto restart;   /* anything could have happened */
                }

                save_pmap = pv->pv_pmap;
                vm_page_spin_unlock(m);
                pmap_inval_interlock(&info, save_pmap,
                                     (vm_offset_t)pv->pv_pindex << PAGE_SHIFT);
                KKASSERT(pv->pv_pmap == save_pmap);
                for (;;) {
                        pbits = *pte;
                        cpu_ccfence();
                        if (atomic_cmpset_long(pte, pbits,
                                               pbits & ~(PG_RW|PG_M))) {
                                break;
                        }
                }
                pmap_inval_deinterlock(&info, save_pmap);
                vm_page_spin_lock(m);

                /*
                 * If PG_M was found to be set while we were clearing PG_RW
                 * we also clear PG_M (done above) and mark the page dirty.
                 * Callers expect this behavior.
                 */
                if (pbits & PG_M)
                        vm_page_dirty(m);
                pv_put(pv);
        }
        vm_page_spin_unlock(m);
        pmap_inval_done(&info);
}
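
#if 0
/*
 * Illustrative sketch only, not part of the original source: a
 * hypothetical helper (pmap_example_write_protect) showing the typical
 * way pmap_clearbit() above is used to write-protect a page, as the
 * wrappers further below do.  Clearing PG_RW also clears PG_M and
 * transfers any modified state to the vm_page, so dirty information is
 * not lost.  The caller must not hold any spin locks.
 */
static void
pmap_example_write_protect(vm_page_t m)
{
        /* roughly what pmap_page_protect(m, VM_PROT_READ) does */
        pmap_clearbit(m, PG_RW);
}
#endif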

/*
 * Lower the permission for all mappings to a given page.
 *
 * Page must be busied by caller.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
        /* JG NX support? */
        if ((prot & VM_PROT_WRITE) == 0) {
                if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
                        /*
                         * NOTE: pmap_clearbit(.. PG_RW) also clears
                         *       the PG_WRITEABLE flag in (m).
                         */
                        pmap_clearbit(m, PG_RW);
                } else {
                        pmap_remove_all(m);
                }
        }
}

vm_paddr_t
pmap_phys_address(vm_pindex_t ppn)
{
        return (x86_64_ptob(ppn));
}

/*
 * Return a count of reference bits for a page, clearing those bits.
 * It is not necessary for every reference bit to be cleared, but it
 * is necessary that 0 only be returned when there are truly no
 * reference bits set.
 *
 * XXX: The exact number of bits to check and clear is a matter that
 * should be tested and standardized at some point in the future for
 * optimal aging of shared pages.
 *
 * This routine may not block.
 */
int
pmap_ts_referenced(vm_page_t m)
{
        pv_entry_t pv;
        pt_entry_t *pte;
        int rtval = 0;

        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return (rtval);

        vm_page_spin_lock(m);
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                if (!pmap_track_modified(pv->pv_pindex))
                        continue;
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
                if (pte && (*pte & PG_A)) {
                        atomic_clear_long(pte, PG_A);
                        rtval++;
                        if (rtval > 4)
                                break;
                }
        }
        vm_page_spin_unlock(m);
        return (rtval);
}

/*
 * pmap_is_modified:
 *
 *      Return whether or not the specified physical page was modified
 *      in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
        boolean_t res;

        res = pmap_testbit(m, PG_M);
        return (res);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
        pmap_clearbit(m, PG_M);
}

/*
 * pmap_clear_reference:
 *
 *      Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
        pmap_clearbit(m, PG_A);
}

/*
 * Miscellaneous support routines follow
 */

static
void
i386_protection_init(void)
{
        int *kp, prot;

        /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */
        kp = protection_codes;
        for (prot = 0; prot < 8; prot++) {
                switch (prot) {
                case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
                        /*
                         * Read access is also 0.  There isn't any execute
                         * bit, so just make it readable.
                         */
                case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
                case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
                case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
                        *kp++ = 0;
                        break;
                case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
                case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
                case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
                case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
                        *kp++ = PG_RW;
                        break;
                }
        }
}
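
#if 0
/*
 * Illustrative sketch only, not part of the original source: a
 * hypothetical helper (pmap_example_prot_to_ptebits) showing how the
 * protection_codes[] table initialized above is meant to be consumed.
 * The low three VM_PROT_* bits of a machine-independent protection
 * value index the table to yield the pte permission bits.
 */
static int
pmap_example_prot_to_ptebits(vm_prot_t prot)
{
        return (protection_codes[prot & (VM_PROT_READ | VM_PROT_WRITE |
                                         VM_PROT_EXECUTE)]);
}
#endif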

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 *
 * NOTE: We can't use pgeflag unless we invalidate the pages one at
 *       a time.
 *
 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE}
 *       work whether the cpu supports PAT or not.  The remaining PAT
 *       attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu
 *       supports PAT.
 */
void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{
        return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}

void *
pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
{
        return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
        return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space using the requested PAT mode.  Return a pointer to
 * where it is mapped.  This routine is intended to be used for mapping
 * device memory, NOT real memory.
 */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
        vm_offset_t va, tmpva, offset;
        pt_entry_t *pte;
        vm_size_t tmpsize;

        offset = pa & PAGE_MASK;
        size = roundup(offset + size, PAGE_SIZE);

        va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
        if (va == 0)
                panic("pmap_mapdev: Couldn't alloc kernel virtual memory");

        pa = pa & ~PAGE_MASK;
        for (tmpva = va, tmpsize = size; tmpsize > 0;) {
                pte = vtopte(tmpva);
                *pte = pa | PG_RW | PG_V | /* pgeflag | */
                       pat_pte_index[mode];
                tmpsize -= PAGE_SIZE;
                tmpva += PAGE_SIZE;
                pa += PAGE_SIZE;
        }
        pmap_invalidate_range(&kernel_pmap, va, va + size);
        pmap_invalidate_cache_range(va, va + size);

        return ((void *)(va + offset));
}

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
        vm_offset_t base, offset;

        base = va & ~PAGE_MASK;
        offset = va & PAGE_MASK;
        size = roundup(offset + size, PAGE_SIZE);
        pmap_qremove(va, size >> PAGE_SHIFT);
        kmem_free(&kernel_map, base, size);
}
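
#if 0
/*
 * Illustrative sketch only, not part of the original source: a
 * hypothetical helper (pmap_example_read_devreg) showing the usual
 * mapdev/unmapdev pairing above.  A one-page device register window at
 * the made-up physical address 'devpa' is mapped uncacheable, one
 * 32-bit register is read, and the mapping is torn down again.
 */
static uint32_t
pmap_example_read_devreg(vm_paddr_t devpa)
{
        volatile uint32_t *regs;
        uint32_t val;

        regs = pmap_mapdev_uncacheable(devpa, PAGE_SIZE);
        val = regs[0];
        pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
        return (val);
}
#endif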

/*
 * Change the PAT attribute on an existing kernel memory map.  Caller
 * must ensure that the virtual memory in question is not accessed
 * during the adjustment.
 */
void
pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
{
        pt_entry_t *pte;
        vm_offset_t base;
        int changed = 0;

        if (va == 0)
                panic("pmap_change_attr: va is NULL");
        base = trunc_page(va);

        while (count) {
                pte = vtopte(va);
                *pte = (*pte & ~(pt_entry_t)(PG_PTE_PAT | PG_NC_PCD |
                                             PG_NC_PWT)) |
                       pat_pte_index[mode];
                --count;
                va += PAGE_SIZE;
        }

        changed = 1;    /* XXX: not optimal */

        /*
         * Flush CPU caches if required to make sure any data isn't cached
         * that shouldn't be, etc.
         */
        if (changed) {
                pmap_invalidate_range(&kernel_pmap, base, va);
                pmap_invalidate_cache_range(base, va);
        }
}

/*
 * Perform the pmap work for mincore().
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
        pt_entry_t *ptep, pte;
        vm_page_t m;
        int val = 0;

        lwkt_gettoken(&pmap->pm_token);
        ptep = pmap_pte(pmap, addr);

        if (ptep && (pte = *ptep) != 0) {
                vm_offset_t pa;

                val = MINCORE_INCORE;
                if ((pte & PG_MANAGED) == 0)
                        goto done;

                pa = pte & PG_FRAME;

                m = PHYS_TO_VM_PAGE(pa);

                /*
                 * Modified by us
                 */
                if (pte & PG_M)
                        val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
                /*
                 * Modified by someone
                 */
                else if (m->dirty || pmap_is_modified(m))
                        val |= MINCORE_MODIFIED_OTHER;
                /*
                 * Referenced by us
                 */
                if (pte & PG_A)
                        val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
                /*
                 * Referenced by someone
                 */
                else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
                        val |= MINCORE_REFERENCED_OTHER;
                        vm_page_flag_set(m, PG_REFERENCED);
                }
        }
done:
        lwkt_reltoken(&pmap->pm_token);

        return val;
}

/*
 * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
 * vmspace will be ref'd and the old one will be deref'd.
 *
 * The vmspace for all lwps associated with the process will be adjusted
 * and cr3 will be reloaded if any lwp is the current lwp.
 *
 * The process must hold the vmspace->vm_map.token for oldvm and newvm.
 */
void
pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
{
        struct vmspace *oldvm;
        struct lwp *lp;

        oldvm = p->p_vmspace;
        if (oldvm != newvm) {
                if (adjrefs)
                        sysref_get(&newvm->vm_sysref);
                p->p_vmspace = newvm;
                KKASSERT(p->p_nthreads == 1);
                lp = RB_ROOT(&p->p_lwp_tree);
                pmap_setlwpvm(lp, newvm);
                if (adjrefs)
                        sysref_put(&oldvm->vm_sysref);
        }
}

/*
 * Set the vmspace for a LWP.  The vmspace is almost universally set the
 * same as the process vmspace, but virtual kernels need to swap out contexts
 * on a per-lwp basis.
 *
 * Caller does not necessarily hold any vmspace tokens.  Caller must control
 * the lwp (typically be in the context of the lwp).  We use a critical
 * section to protect against statclock and hardclock (statistics collection).
 */
void
pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
{
        struct vmspace *oldvm;
        struct pmap *pmap;

        oldvm = lp->lwp_vmspace;

        if (oldvm != newvm) {
                crit_enter();
                lp->lwp_vmspace = newvm;
                if (curthread->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
                        atomic_set_cpumask(&pmap->pm_active,
                                           mycpu->gd_cpumask);
                        if (pmap->pm_active & CPUMASK_LOCK)
                                pmap_interlock_wait(newvm);
#if defined(SWTCH_OPTIM_STATS)
                        tlb_flush_count++;
#endif
                        curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
                        load_cr3(curthread->td_pcb->pcb_cr3);
                        pmap = vmspace_pmap(oldvm);
                        atomic_clear_cpumask(&pmap->pm_active,
                                             mycpu->gd_cpumask);
                }
                crit_exit();
        }
}
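
#if 0
/*
 * Illustrative sketch only, not part of the original source: a
 * hypothetical helper (pmap_example_is_resident) showing how the
 * pmap_mincore() routine earlier in this file can be used to test
 * whether a user virtual address is resident in a vmspace.
 */
static int
pmap_example_is_resident(struct vmspace *vm, vm_offset_t addr)
{
        return ((pmap_mincore(vmspace_pmap(vm), addr) & MINCORE_INCORE) != 0);
}
#endif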

/*
 * Called when switching to a locked pmap, used to interlock against pmaps
 * undergoing modifications to prevent us from activating the MMU for the
 * target pmap until all such modifications have completed.  We have to do
 * this because the thread making the modifications has already set up its
 * SMP synchronization mask.
 *
 * This function cannot sleep!
 *
 * No requirements.
 */
void
pmap_interlock_wait(struct vmspace *vm)
{
        struct pmap *pmap = &vm->vm_pmap;

        if (pmap->pm_active & CPUMASK_LOCK) {
                crit_enter();
                KKASSERT(curthread->td_critcount >= 2);
                DEBUG_PUSH_INFO("pmap_interlock_wait");
                while (pmap->pm_active & CPUMASK_LOCK) {
                        cpu_ccfence();
                        lwkt_process_ipiq();
                }
                DEBUG_POP_INFO();
                crit_exit();
        }
}

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
        if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
                return addr;
        }

        addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
        return addr;
}

/*
 * Used by kmalloc/kfree, page already exists at va
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
        return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
}

/*
 * Initialize machine-specific shared page directory support.  This
 * is executed when a VM object is created.
 */
void
pmap_object_init(vm_object_t object)
{
        object->md.pmap_rw = NULL;
        object->md.pmap_ro = NULL;
}

/*
 * Clean up machine-specific shared page directory support.  This
 * is executed when a VM object is destroyed.
 */
void
pmap_object_free(vm_object_t object)
{
        pmap_t pmap;

        if ((pmap = object->md.pmap_rw) != NULL) {
                object->md.pmap_rw = NULL;
                pmap_remove_noinval(pmap,
                                    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
                pmap->pm_active = 0;
                pmap_release(pmap);
                pmap_puninit(pmap);
                kfree(pmap, M_OBJPMAP);
        }
        if ((pmap = object->md.pmap_ro) != NULL) {
                object->md.pmap_ro = NULL;
                pmap_remove_noinval(pmap,
                                    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
                pmap->pm_active = 0;
                pmap_release(pmap);
                pmap_puninit(pmap);
                kfree(pmap, M_OBJPMAP);
        }
}
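
#if 0
/*
 * Illustrative sketch only, not part of the original source: a
 * hypothetical helper (pmap_example_kva_to_phys) showing how
 * pmap_kvtom() above can be combined with VM_PAGE_TO_PHYS() to recover
 * the physical address backing a kernel virtual address (e.g. a
 * kmalloc'd buffer), including the intra-page offset.
 */
static vm_paddr_t
pmap_example_kva_to_phys(vm_offset_t va)
{
        return (VM_PAGE_TO_PHYS(pmap_kvtom(va)) | (va & PAGE_MASK));
}
#endif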