/*
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * Copyright (c) 2011-2012 Matthew Dillon
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Manage physical address maps for x86-64 systems.
46 */ 47 48 #if 0 /* JG */ 49 #include "opt_disable_pse.h" 50 #include "opt_pmap.h" 51 #endif 52 #include "opt_msgbuf.h" 53 54 #include <sys/param.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/msgbuf.h> 58 #include <sys/vmmeter.h> 59 #include <sys/mman.h> 60 #include <sys/systm.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/user.h> 76 #include <sys/thread2.h> 77 #include <sys/sysref2.h> 78 #include <sys/spinlock2.h> 79 #include <vm/vm_page2.h> 80 81 #include <machine/cputypes.h> 82 #include <machine/md_var.h> 83 #include <machine/specialreg.h> 84 #include <machine/smp.h> 85 #include <machine_base/apic/apicreg.h> 86 #include <machine/globaldata.h> 87 #include <machine/pmap.h> 88 #include <machine/pmap_inval.h> 89 #include <machine/inttypes.h> 90 91 #include <ddb/ddb.h> 92 93 #define PMAP_KEEP_PDIRS 94 #ifndef PMAP_SHPGPERPROC 95 #define PMAP_SHPGPERPROC 2000 96 #endif 97 98 #if defined(DIAGNOSTIC) 99 #define PMAP_DIAGNOSTIC 100 #endif 101 102 #define MINPV 2048 103 104 /* 105 * pmap debugging will report who owns a pv lock when blocking. 106 */ 107 #ifdef PMAP_DEBUG 108 109 #define PMAP_DEBUG_DECL ,const char *func, int lineno 110 #define PMAP_DEBUG_ARGS , __func__, __LINE__ 111 #define PMAP_DEBUG_COPY , func, lineno 112 113 #define pv_get(pmap, pindex) _pv_get(pmap, pindex \ 114 PMAP_DEBUG_ARGS) 115 #define pv_lock(pv) _pv_lock(pv \ 116 PMAP_DEBUG_ARGS) 117 #define pv_hold_try(pv) _pv_hold_try(pv \ 118 PMAP_DEBUG_ARGS) 119 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \ 120 PMAP_DEBUG_ARGS) 121 122 #else 123 124 #define PMAP_DEBUG_DECL 125 #define PMAP_DEBUG_ARGS 126 #define PMAP_DEBUG_COPY 127 128 #define pv_get(pmap, pindex) _pv_get(pmap, pindex) 129 #define pv_lock(pv) _pv_lock(pv) 130 #define pv_hold_try(pv) _pv_hold_try(pv) 131 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp) 132 133 #endif 134 135 /* 136 * Get PDEs and PTEs for user/kernel address space 137 */ 138 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 139 140 #define pmap_pde_v(pmap, pte) ((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 141 #define pmap_pte_w(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0) 142 #define pmap_pte_m(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0) 143 #define pmap_pte_u(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0) 144 #define pmap_pte_v(pmap, pte) ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 145 146 /* 147 * Given a map and a machine independent protection code, 148 * convert to a vax protection code. 
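 *
 * For illustration (a sketch, not a caller from this file): the conversion
 * is a table lookup against protection_codes[], which is filled in by
 * i386_protection_init() and copied into each pmap by pmap_pinit_defaults().
 * 'pa' and 'newpte' below are hypothetical locals.
 *
 *	pt_entry_t prot_bits;
 *
 *	prot_bits = pte_prot(pmap, VM_PROT_READ | VM_PROT_WRITE);
 *	newpte = pa | prot_bits | pmap->pmap_bits[PG_V_IDX];
 *
 * Only the VM_PROT_READ, VM_PROT_WRITE and VM_PROT_EXECUTE bits
 * participate in the lookup; other protection bits are masked off.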
149 */ 150 #define pte_prot(m, p) \ 151 (m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 152 static int protection_codes[PROTECTION_CODES_SIZE]; 153 154 struct pmap kernel_pmap; 155 156 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects"); 157 158 vm_paddr_t avail_start; /* PA of first available physical page */ 159 vm_paddr_t avail_end; /* PA of last available physical page */ 160 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 161 vm_offset_t virtual2_end; 162 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 163 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 164 vm_offset_t KvaStart; /* VA start of KVA space */ 165 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 166 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 167 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 168 //static int pgeflag; /* PG_G or-in */ 169 //static int pseflag; /* PG_PS or-in */ 170 uint64_t PatMsr; 171 172 static int ndmpdp; 173 static vm_paddr_t dmaplimit; 174 static int nkpt; 175 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 176 177 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 178 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */ 179 180 static uint64_t KPTbase; 181 static uint64_t KPTphys; 182 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 183 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 184 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 185 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 186 187 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 188 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 189 190 /* 191 * Data for the pv entry allocation mechanism 192 */ 193 static vm_zone_t pvzone; 194 static struct vm_zone pvzone_store; 195 static struct vm_object pvzone_obj; 196 static int pv_entry_max=0, pv_entry_high_water=0; 197 static int pmap_pagedaemon_waken = 0; 198 static struct pv_entry *pvinit; 199 200 /* 201 * All those kernel PT submaps that BSD is so fond of 202 */ 203 pt_entry_t *CMAP1 = NULL, *ptmmap; 204 caddr_t CADDR1 = NULL, ptvmmap = NULL; 205 static pt_entry_t *msgbufmap; 206 struct msgbuf *msgbufp=NULL; 207 208 /* 209 * PMAP default PG_* bits. Needed to be able to add 210 * EPT/NPT pagetable pmap_bits for the VMM module 211 */ 212 uint64_t pmap_bits_default[] = { 213 REGULAR_PMAP, /* TYPE_IDX 0 */ 214 X86_PG_V, /* PG_V_IDX 1 */ 215 X86_PG_RW, /* PG_RW_IDX 2 */ 216 X86_PG_U, /* PG_U_IDX 3 */ 217 X86_PG_A, /* PG_A_IDX 4 */ 218 X86_PG_M, /* PG_M_IDX 5 */ 219 X86_PG_PS, /* PG_PS_IDX3 6 */ 220 X86_PG_G, /* PG_G_IDX 7 */ 221 X86_PG_AVAIL1, /* PG_AVAIL1_IDX 8 */ 222 X86_PG_AVAIL2, /* PG_AVAIL2_IDX 9 */ 223 X86_PG_AVAIL3, /* PG_AVAIL3_IDX 10 */ 224 X86_PG_NC_PWT | X86_PG_NC_PCD, /* PG_N_IDX 11 */ 225 }; 226 /* 227 * Crashdump maps. 
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

static int pmap_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW,
    &pmap_debug, 0, "Debug pmap's");
#ifdef PMAP_DEBUG2
static int pmap_enter_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
    &pmap_enter_debug, 0, "Debug pmap_enter's");
#endif
static int pmap_yield_count = 64;
SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
    &pmap_yield_count, 0, "Yield during init_pt/release");
static int pmap_mmu_optimize = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
    &pmap_mmu_optimize, 0, "Share page table pages when possible");
int pmap_fast_kernel_cpusync = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
    &pmap_fast_kernel_cpusync, 0, "Allow fast kernel pmap TLB synchronization");
int pmap_dynamic_delete = -1;
SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
    &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs");

#define DISABLE_PSE

/* Standard user access functions */
extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
    size_t *lencopied);
extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
extern int std_fubyte (const void *base);
extern int std_subyte (void *base, int byte);
extern long std_fuword (const void *base);
extern int std_suword (void *base, long word);
extern int std_suword32 (void *base, int word);

static void pv_hold(pv_entry_t pv);
static int _pv_hold_try(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_drop(pv_entry_t pv);
static void _pv_lock(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_unlock(pv_entry_t pv);
static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
				PMAP_DEBUG_DECL);
static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
				PMAP_DEBUG_DECL);
static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
static void pv_put(pv_entry_t pv);
static void pv_free(pv_entry_t pv, pv_entry_t pvp, int putaway);
static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
		      pv_entry_t *pvpp);
static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
		      pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va);
static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
			pmap_inval_bulk_t *bulk, int destroy);
static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp,
			pmap_inval_bulk_t *bulk);

struct pmap_scan_info;
static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);

static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static boolean_t pmap_testbit (vm_page_t m, int bit);

static pt_entry_t *
pmap_pte_quick (pmap_t pmap, vm_offset_t va); 306 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 307 308 static void pmap_pinit_defaults(struct pmap *pmap); 309 310 static unsigned pdir4mb; 311 312 static int 313 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 314 { 315 if (pv1->pv_pindex < pv2->pv_pindex) 316 return(-1); 317 if (pv1->pv_pindex > pv2->pv_pindex) 318 return(1); 319 return(0); 320 } 321 322 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 323 pv_entry_compare, vm_pindex_t, pv_pindex); 324 325 static __inline 326 void 327 pmap_page_stats_adding(vm_page_t m) 328 { 329 globaldata_t gd = mycpu; 330 331 if (TAILQ_EMPTY(&m->md.pv_list)) { 332 ++gd->gd_vmtotal.t_arm; 333 } else if (TAILQ_FIRST(&m->md.pv_list) == 334 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { 335 ++gd->gd_vmtotal.t_armshr; 336 ++gd->gd_vmtotal.t_avmshr; 337 } else { 338 ++gd->gd_vmtotal.t_avmshr; 339 } 340 } 341 342 static __inline 343 void 344 pmap_page_stats_deleting(vm_page_t m) 345 { 346 globaldata_t gd = mycpu; 347 348 if (TAILQ_EMPTY(&m->md.pv_list)) { 349 --gd->gd_vmtotal.t_arm; 350 } else if (TAILQ_FIRST(&m->md.pv_list) == 351 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { 352 --gd->gd_vmtotal.t_armshr; 353 --gd->gd_vmtotal.t_avmshr; 354 } else { 355 --gd->gd_vmtotal.t_avmshr; 356 } 357 } 358 359 /* 360 * Move the kernel virtual free pointer to the next 361 * 2MB. This is used to help improve performance 362 * by using a large (2MB) page for much of the kernel 363 * (.text, .data, .bss) 364 */ 365 static 366 vm_offset_t 367 pmap_kmem_choose(vm_offset_t addr) 368 { 369 vm_offset_t newaddr = addr; 370 371 newaddr = roundup2(addr, NBPDR); 372 return newaddr; 373 } 374 375 /* 376 * pmap_pte_quick: 377 * 378 * Super fast pmap_pte routine best used when scanning the pv lists. 379 * This eliminates many course-grained invltlb calls. Note that many of 380 * the pv list scans are across different pmaps and it is very wasteful 381 * to do an entire invltlb when checking a single mapping. 382 */ 383 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 384 385 static 386 pt_entry_t * 387 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 388 { 389 return pmap_pte(pmap, va); 390 } 391 392 /* 393 * Returns the pindex of a page table entry (representing a terminal page). 394 * There are NUPTE_TOTAL page table entries possible (a huge number) 395 * 396 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 397 * We want to properly translate negative KVAs. 398 */ 399 static __inline 400 vm_pindex_t 401 pmap_pte_pindex(vm_offset_t va) 402 { 403 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 404 } 405 406 /* 407 * Returns the pindex of a page table. 408 */ 409 static __inline 410 vm_pindex_t 411 pmap_pt_pindex(vm_offset_t va) 412 { 413 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 414 } 415 416 /* 417 * Returns the pindex of a page directory. 
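 *
 * Taken together, the pindex values used for the pv_entry RB tree form
 * consecutive, non-overlapping ranges, one per paging level (a summary
 * derived from the functions above and below):
 *
 *	terminal PTEs:	[0, NUPTE_TOTAL)
 *	PT pages:	[NUPTE_TOTAL, NUPTE_TOTAL + NUPT_TOTAL)
 *	PD pages:	the next NUPD_TOTAL indexes
 *	PDP pages:	the next NUPDP_TOTAL indexes
 *	PML4 page:	the single index returned by pmap_pml4_pindex()
 *
 * This lets one RB tree per pmap index every level of the paging
 * hierarchy.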
418 */ 419 static __inline 420 vm_pindex_t 421 pmap_pd_pindex(vm_offset_t va) 422 { 423 return (NUPTE_TOTAL + NUPT_TOTAL + 424 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 425 } 426 427 static __inline 428 vm_pindex_t 429 pmap_pdp_pindex(vm_offset_t va) 430 { 431 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 432 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 433 } 434 435 static __inline 436 vm_pindex_t 437 pmap_pml4_pindex(void) 438 { 439 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 440 } 441 442 /* 443 * Return various clipped indexes for a given VA 444 * 445 * Returns the index of a pt in a page directory, representing a page 446 * table. 447 */ 448 static __inline 449 vm_pindex_t 450 pmap_pt_index(vm_offset_t va) 451 { 452 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 453 } 454 455 /* 456 * Returns the index of a pd in a page directory page, representing a page 457 * directory. 458 */ 459 static __inline 460 vm_pindex_t 461 pmap_pd_index(vm_offset_t va) 462 { 463 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 464 } 465 466 /* 467 * Returns the index of a pdp in the pml4 table, representing a page 468 * directory page. 469 */ 470 static __inline 471 vm_pindex_t 472 pmap_pdp_index(vm_offset_t va) 473 { 474 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 475 } 476 477 /* 478 * Generic procedure to index a pte from a pt, pd, or pdp. 479 * 480 * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT 481 * a page table page index but is instead of PV lookup index. 482 */ 483 static 484 void * 485 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) 486 { 487 pt_entry_t *pte; 488 489 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); 490 return(&pte[pindex]); 491 } 492 493 /* 494 * Return pointer to PDP slot in the PML4 495 */ 496 static __inline 497 pml4_entry_t * 498 pmap_pdp(pmap_t pmap, vm_offset_t va) 499 { 500 return (&pmap->pm_pml4[pmap_pdp_index(va)]); 501 } 502 503 /* 504 * Return pointer to PD slot in the PDP given a pointer to the PDP 505 */ 506 static __inline 507 pdp_entry_t * 508 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va) 509 { 510 pdp_entry_t *pd; 511 512 pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME); 513 return (&pd[pmap_pd_index(va)]); 514 } 515 516 /* 517 * Return pointer to PD slot in the PDP. 518 */ 519 static __inline 520 pdp_entry_t * 521 pmap_pd(pmap_t pmap, vm_offset_t va) 522 { 523 pml4_entry_t *pdp; 524 525 pdp = pmap_pdp(pmap, va); 526 if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0) 527 return NULL; 528 return (pmap_pdp_to_pd(*pdp, va)); 529 } 530 531 /* 532 * Return pointer to PT slot in the PD given a pointer to the PD 533 */ 534 static __inline 535 pd_entry_t * 536 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 537 { 538 pd_entry_t *pt; 539 540 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 541 return (&pt[pmap_pt_index(va)]); 542 } 543 544 /* 545 * Return pointer to PT slot in the PD 546 * 547 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 548 * so we cannot lookup the PD via the PDP. Instead we 549 * must look it up via the pmap. 
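 *
 * For a normal (non-simple) pmap the lookup is a plain top-down walk
 * using the DMAP-based helpers above; a sketch of resolving a PTE
 * pointer for 'va' (each step must check PG_V before dereferencing,
 * as the wrappers below do):
 *
 *	pml4_entry_t *pdp = pmap_pdp(pmap, va);
 *	pdp_entry_t *pd = pmap_pdp_to_pd(*pdp, va);
 *	pd_entry_t *pt = pmap_pd_to_pt(*pd, va);
 *	pt_entry_t *pte = pmap_pt_to_pte(*pt, va);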
550 */ 551 static __inline 552 pd_entry_t * 553 pmap_pt(pmap_t pmap, vm_offset_t va) 554 { 555 pdp_entry_t *pd; 556 pv_entry_t pv; 557 vm_pindex_t pd_pindex; 558 559 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 560 pd_pindex = pmap_pd_pindex(va); 561 spin_lock(&pmap->pm_spin); 562 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 563 spin_unlock(&pmap->pm_spin); 564 if (pv == NULL || pv->pv_m == NULL) 565 return NULL; 566 return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va)); 567 } else { 568 pd = pmap_pd(pmap, va); 569 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0) 570 return NULL; 571 return (pmap_pd_to_pt(*pd, va)); 572 } 573 } 574 575 /* 576 * Return pointer to PTE slot in the PT given a pointer to the PT 577 */ 578 static __inline 579 pt_entry_t * 580 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 581 { 582 pt_entry_t *pte; 583 584 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 585 return (&pte[pmap_pte_index(va)]); 586 } 587 588 /* 589 * Return pointer to PTE slot in the PT 590 */ 591 static __inline 592 pt_entry_t * 593 pmap_pte(pmap_t pmap, vm_offset_t va) 594 { 595 pd_entry_t *pt; 596 597 pt = pmap_pt(pmap, va); 598 if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0) 599 return NULL; 600 if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0) 601 return ((pt_entry_t *)pt); 602 return (pmap_pt_to_pte(*pt, va)); 603 } 604 605 /* 606 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 607 * the PT layer. This will speed up core pmap operations considerably. 608 * 609 * NOTE: The pmap spinlock does not need to be held but the passed-in pv 610 * must be in a known associated state (typically by being locked when 611 * the pmap spinlock isn't held). We allow the race for that case. 612 */ 613 static __inline 614 void 615 pv_cache(pv_entry_t pv, vm_pindex_t pindex) 616 { 617 if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0)) 618 pv->pv_pmap->pm_pvhint = pv; 619 } 620 621 622 /* 623 * Return address of PT slot in PD (KVM only) 624 * 625 * Cannot be used for user page tables because it might interfere with 626 * the shared page-table-page optimization (pmap_mmu_optimize). 627 */ 628 static __inline 629 pd_entry_t * 630 vtopt(vm_offset_t va) 631 { 632 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 633 NPML4EPGSHIFT)) - 1); 634 635 return (PDmap + ((va >> PDRSHIFT) & mask)); 636 } 637 638 /* 639 * KVM - return address of PTE slot in PT 640 */ 641 static __inline 642 pt_entry_t * 643 vtopte(vm_offset_t va) 644 { 645 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 646 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 647 648 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 649 } 650 651 static uint64_t 652 allocpages(vm_paddr_t *firstaddr, long n) 653 { 654 uint64_t ret; 655 656 ret = *firstaddr; 657 bzero((void *)ret, n * PAGE_SIZE); 658 *firstaddr += n * PAGE_SIZE; 659 return (ret); 660 } 661 662 static 663 void 664 create_pagetables(vm_paddr_t *firstaddr) 665 { 666 long i; /* must be 64 bits */ 667 long nkpt_base; 668 long nkpt_phys; 669 int j; 670 671 /* 672 * We are running (mostly) V=P at this point 673 * 674 * Calculate NKPT - number of kernel page tables. We have to 675 * accomodoate prealloction of the vm_page_array, dump bitmap, 676 * MSGBUF_SIZE, and other stuff. Be generous. 677 * 678 * Maxmem is in pages. 679 * 680 * ndmpdp is the number of 1GB pages we wish to map. 
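 *
 * As a worked example (illustrative numbers only): with 16GB of
 * physical memory, ptoa(Maxmem) is 16GB, so
 *
 *	ndmpdp = (16GB + NBPDP - 1) >> PDPSHIFT = 16	(1GB units)
 *
 * and dmaplimit ends up at 16GB.  Machines with less memory still get
 * the 4GB minimum direct map enforced just below.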
681 */ 682 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 683 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 684 ndmpdp = 4; 685 KKASSERT(ndmpdp <= NKPDPE * NPDEPG); 686 687 /* 688 * Starting at the beginning of kvm (not KERNBASE). 689 */ 690 nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR; 691 nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR; 692 nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E + 693 ndmpdp) + 511) / 512; 694 nkpt_phys += 128; 695 696 /* 697 * Starting at KERNBASE - map 2G worth of page table pages. 698 * KERNBASE is offset -2G from the end of kvm. 699 */ 700 nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */ 701 702 /* 703 * Allocate pages 704 */ 705 KPTbase = allocpages(firstaddr, nkpt_base); 706 KPTphys = allocpages(firstaddr, nkpt_phys); 707 KPML4phys = allocpages(firstaddr, 1); 708 KPDPphys = allocpages(firstaddr, NKPML4E); 709 KPDphys = allocpages(firstaddr, NKPDPE); 710 711 /* 712 * Calculate the page directory base for KERNBASE, 713 * that is where we start populating the page table pages. 714 * Basically this is the end - 2. 715 */ 716 KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT); 717 718 DMPDPphys = allocpages(firstaddr, NDMPML4E); 719 if ((amd_feature & AMDID_PAGE1GB) == 0) 720 DMPDphys = allocpages(firstaddr, ndmpdp); 721 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 722 723 /* 724 * Fill in the underlying page table pages for the area around 725 * KERNBASE. This remaps low physical memory to KERNBASE. 726 * 727 * Read-only from zero to physfree 728 * XXX not fully used, underneath 2M pages 729 */ 730 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 731 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 732 ((pt_entry_t *)KPTbase)[i] |= 733 pmap_bits_default[PG_RW_IDX] | 734 pmap_bits_default[PG_V_IDX] | 735 pmap_bits_default[PG_G_IDX]; 736 } 737 738 /* 739 * Now map the initial kernel page tables. One block of page 740 * tables is placed at the beginning of kernel virtual memory, 741 * and another block is placed at KERNBASE to map the kernel binary, 742 * data, bss, and initial pre-allocations. 743 */ 744 for (i = 0; i < nkpt_base; i++) { 745 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 746 ((pd_entry_t *)KPDbase)[i] |= 747 pmap_bits_default[PG_RW_IDX] | 748 pmap_bits_default[PG_V_IDX]; 749 } 750 for (i = 0; i < nkpt_phys; i++) { 751 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 752 ((pd_entry_t *)KPDphys)[i] |= 753 pmap_bits_default[PG_RW_IDX] | 754 pmap_bits_default[PG_V_IDX]; 755 } 756 757 /* 758 * Map from zero to end of allocations using 2M pages as an 759 * optimization. This will bypass some of the KPTBase pages 760 * above in the KERNBASE area. 761 */ 762 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 763 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 764 ((pd_entry_t *)KPDbase)[i] |= 765 pmap_bits_default[PG_RW_IDX] | 766 pmap_bits_default[PG_V_IDX] | 767 pmap_bits_default[PG_PS_IDX] | 768 pmap_bits_default[PG_G_IDX]; 769 } 770 771 /* 772 * And connect up the PD to the PDP. The kernel pmap is expected 773 * to pre-populate all of its PDs. See NKPDPE in vmparam.h. 
774 */ 775 for (i = 0; i < NKPDPE; i++) { 776 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] = 777 KPDphys + (i << PAGE_SHIFT); 778 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |= 779 pmap_bits_default[PG_RW_IDX] | 780 pmap_bits_default[PG_V_IDX] | 781 pmap_bits_default[PG_U_IDX]; 782 } 783 784 /* 785 * Now set up the direct map space using either 2MB or 1GB pages 786 * Preset PG_M and PG_A because demotion expects it. 787 * 788 * When filling in entries in the PD pages make sure any excess 789 * entries are set to zero as we allocated enough PD pages 790 */ 791 if ((amd_feature & AMDID_PAGE1GB) == 0) { 792 for (i = 0; i < NPDEPG * ndmpdp; i++) { 793 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 794 ((pd_entry_t *)DMPDphys)[i] |= 795 pmap_bits_default[PG_RW_IDX] | 796 pmap_bits_default[PG_V_IDX] | 797 pmap_bits_default[PG_PS_IDX] | 798 pmap_bits_default[PG_G_IDX] | 799 pmap_bits_default[PG_M_IDX] | 800 pmap_bits_default[PG_A_IDX]; 801 } 802 803 /* 804 * And the direct map space's PDP 805 */ 806 for (i = 0; i < ndmpdp; i++) { 807 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 808 (i << PAGE_SHIFT); 809 ((pdp_entry_t *)DMPDPphys)[i] |= 810 pmap_bits_default[PG_RW_IDX] | 811 pmap_bits_default[PG_V_IDX] | 812 pmap_bits_default[PG_U_IDX]; 813 } 814 } else { 815 for (i = 0; i < ndmpdp; i++) { 816 ((pdp_entry_t *)DMPDPphys)[i] = 817 (vm_paddr_t)i << PDPSHIFT; 818 ((pdp_entry_t *)DMPDPphys)[i] |= 819 pmap_bits_default[PG_RW_IDX] | 820 pmap_bits_default[PG_V_IDX] | 821 pmap_bits_default[PG_PS_IDX] | 822 pmap_bits_default[PG_G_IDX] | 823 pmap_bits_default[PG_M_IDX] | 824 pmap_bits_default[PG_A_IDX]; 825 } 826 } 827 828 /* And recursively map PML4 to itself in order to get PTmap */ 829 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 830 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= 831 pmap_bits_default[PG_RW_IDX] | 832 pmap_bits_default[PG_V_IDX] | 833 pmap_bits_default[PG_U_IDX]; 834 835 /* 836 * Connect the Direct Map slots up to the PML4 837 */ 838 for (j = 0; j < NDMPML4E; ++j) { 839 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 840 (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) | 841 pmap_bits_default[PG_RW_IDX] | 842 pmap_bits_default[PG_V_IDX] | 843 pmap_bits_default[PG_U_IDX]; 844 } 845 846 /* 847 * Connect the KVA slot up to the PML4 848 */ 849 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 850 ((pdp_entry_t *)KPML4phys)[KPML4I] |= 851 pmap_bits_default[PG_RW_IDX] | 852 pmap_bits_default[PG_V_IDX] | 853 pmap_bits_default[PG_U_IDX]; 854 } 855 856 /* 857 * Bootstrap the system enough to run with virtual memory. 858 * 859 * On the i386 this is called after mapping has already been enabled 860 * and just syncs the pmap module with what has already been done. 861 * [We can't call it easily with mapping off since the kernel is not 862 * mapped with PA == VA, hence we would have to relocate every address 863 * from the linked base (virtual) address "KERNBASE" to the actual 864 * (physical) address starting relative to 0] 865 */ 866 void 867 pmap_bootstrap(vm_paddr_t *firstaddr) 868 { 869 vm_offset_t va; 870 pt_entry_t *pte; 871 872 KvaStart = VM_MIN_KERNEL_ADDRESS; 873 KvaEnd = VM_MAX_KERNEL_ADDRESS; 874 KvaSize = KvaEnd - KvaStart; 875 876 avail_start = *firstaddr; 877 878 /* 879 * Create an initial set of page tables to run the kernel in. 
880 */ 881 create_pagetables(firstaddr); 882 883 virtual2_start = KvaStart; 884 virtual2_end = PTOV_OFFSET; 885 886 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 887 virtual_start = pmap_kmem_choose(virtual_start); 888 889 virtual_end = VM_MAX_KERNEL_ADDRESS; 890 891 /* XXX do %cr0 as well */ 892 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 893 load_cr3(KPML4phys); 894 895 /* 896 * Initialize protection array. 897 */ 898 i386_protection_init(); 899 900 /* 901 * The kernel's pmap is statically allocated so we don't have to use 902 * pmap_create, which is unlikely to work correctly at this part of 903 * the boot sequence (XXX and which no longer exists). 904 */ 905 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 906 kernel_pmap.pm_count = 1; 907 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 908 RB_INIT(&kernel_pmap.pm_pvroot); 909 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 910 lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok"); 911 912 /* 913 * Reserve some special page table entries/VA space for temporary 914 * mapping of pages. 915 */ 916 #define SYSMAP(c, p, v, n) \ 917 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 918 919 va = virtual_start; 920 pte = vtopte(va); 921 922 /* 923 * CMAP1/CMAP2 are used for zeroing and copying pages. 924 */ 925 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 926 927 /* 928 * Crashdump maps. 929 */ 930 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 931 932 /* 933 * ptvmmap is used for reading arbitrary physical pages via 934 * /dev/mem. 935 */ 936 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 937 938 /* 939 * msgbufp is used to map the system message buffer. 940 * XXX msgbufmap is not used. 941 */ 942 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 943 atop(round_page(MSGBUF_SIZE))) 944 945 virtual_start = va; 946 virtual_start = pmap_kmem_choose(virtual_start); 947 948 *CMAP1 = 0; 949 950 /* 951 * PG_G is terribly broken on SMP because we IPI invltlb's in some 952 * cases rather then invl1pg. Actually, I don't even know why it 953 * works under UP because self-referential page table mappings 954 */ 955 // pgeflag = 0; 956 957 /* 958 * Initialize the 4MB page size flag 959 */ 960 // pseflag = 0; 961 /* 962 * The 4MB page version of the initial 963 * kernel page mapping. 964 */ 965 pdir4mb = 0; 966 967 #if !defined(DISABLE_PSE) 968 if (cpu_feature & CPUID_PSE) { 969 pt_entry_t ptditmp; 970 /* 971 * Note that we have enabled PSE mode 972 */ 973 // pseflag = kernel_pmap.pmap_bits[PG_PS_IDX]; 974 ptditmp = *(PTmap + x86_64_btop(KERNBASE)); 975 ptditmp &= ~(NBPDR - 1); 976 ptditmp |= pmap_bits_default[PG_V_IDX] | 977 pmap_bits_default[PG_RW_IDX] | 978 pmap_bits_default[PG_PS_IDX] | 979 pmap_bits_default[PG_U_IDX]; 980 // pgeflag; 981 pdir4mb = ptditmp; 982 } 983 #endif 984 cpu_invltlb(); 985 986 /* Initialize the PAT MSR */ 987 pmap_init_pat(); 988 pmap_pinit_defaults(&kernel_pmap); 989 990 TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync", 991 &pmap_fast_kernel_cpusync); 992 993 } 994 995 /* 996 * Setup the PAT MSR. 997 */ 998 void 999 pmap_init_pat(void) 1000 { 1001 uint64_t pat_msr; 1002 u_long cr0, cr4; 1003 1004 /* 1005 * Default values mapping PATi,PCD,PWT bits at system reset. 1006 * The default values effectively ignore the PATi bit by 1007 * repeating the encodings for 0-3 in 4-7, and map the PCD 1008 * and PWT bit combinations to the expected PAT types. 
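 *
 * The pat_pte_index[] table built below is copied into each pmap as
 * pmap->pmap_cache_bits[] by pmap_pinit_defaults() and is simply OR'd
 * into a PTE when a mapping needs a specific memory type.  A sketch,
 * modeled on pmap_qenter() later in this file:
 *
 *	*pte = VM_PAGE_TO_PHYS(m) |
 *	       kernel_pmap.pmap_bits[PG_RW_IDX] |
 *	       kernel_pmap.pmap_bits[PG_V_IDX] |
 *	       kernel_pmap.pmap_cache_bits[(*m)->pat_mode];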
1009 */ 1010 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 1011 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 1012 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 1013 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 1014 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 1015 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 1016 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 1017 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 1018 pat_pte_index[PAT_WRITE_BACK] = 0; 1019 pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; 1020 pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; 1021 pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; 1022 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 1023 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 1024 1025 if (cpu_feature & CPUID_PAT) { 1026 /* 1027 * If we support the PAT then set-up entries for 1028 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 1029 * 4 and 5. 1030 */ 1031 pat_msr = (pat_msr & ~PAT_MASK(4)) | 1032 PAT_VALUE(4, PAT_WRITE_PROTECTED); 1033 pat_msr = (pat_msr & ~PAT_MASK(5)) | 1034 PAT_VALUE(5, PAT_WRITE_COMBINING); 1035 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0; 1036 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PWT; 1037 1038 /* 1039 * Then enable the PAT 1040 */ 1041 1042 /* Disable PGE. */ 1043 cr4 = rcr4(); 1044 load_cr4(cr4 & ~CR4_PGE); 1045 1046 /* Disable caches (CD = 1, NW = 0). */ 1047 cr0 = rcr0(); 1048 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1049 1050 /* Flushes caches and TLBs. */ 1051 wbinvd(); 1052 cpu_invltlb(); 1053 1054 /* Update PAT and index table. */ 1055 wrmsr(MSR_PAT, pat_msr); 1056 1057 /* Flush caches and TLBs again. */ 1058 wbinvd(); 1059 cpu_invltlb(); 1060 1061 /* Restore caches and PGE. */ 1062 load_cr0(cr0); 1063 load_cr4(cr4); 1064 PatMsr = pat_msr; 1065 } 1066 } 1067 1068 /* 1069 * Set 4mb pdir for mp startup 1070 */ 1071 void 1072 pmap_set_opt(void) 1073 { 1074 if (cpu_feature & CPUID_PSE) { 1075 load_cr4(rcr4() | CR4_PSE); 1076 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */ 1077 cpu_invltlb(); 1078 } 1079 } 1080 } 1081 1082 /* 1083 * Initialize the pmap module. 1084 * Called by vm_init, to initialize any structures that the pmap 1085 * system needs to map virtual memory. 1086 * pmap_init has been enhanced to support in a fairly consistant 1087 * way, discontiguous physical memory. 1088 */ 1089 void 1090 pmap_init(void) 1091 { 1092 int i; 1093 int initial_pvs; 1094 1095 /* 1096 * Allocate memory for random pmap data structures. Includes the 1097 * pv_head_table. 1098 */ 1099 1100 for (i = 0; i < vm_page_array_size; i++) { 1101 vm_page_t m; 1102 1103 m = &vm_page_array[i]; 1104 TAILQ_INIT(&m->md.pv_list); 1105 } 1106 1107 /* 1108 * init the pv free list 1109 */ 1110 initial_pvs = vm_page_array_size; 1111 if (initial_pvs < MINPV) 1112 initial_pvs = MINPV; 1113 pvzone = &pvzone_store; 1114 pvinit = (void *)kmem_alloc(&kernel_map, 1115 initial_pvs * sizeof (struct pv_entry), 1116 VM_SUBSYS_PVENTRY); 1117 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 1118 pvinit, initial_pvs); 1119 1120 /* 1121 * Now it is safe to enable pv_table recording. 1122 */ 1123 pmap_initialized = TRUE; 1124 } 1125 1126 /* 1127 * Initialize the address space (zone) for the pv_entries. Set a 1128 * high water mark so that the system can recover from excessive 1129 * numbers of pv entries. 
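 *
 * A worked example with illustrative numbers: with the default
 * PMAP_SHPGPERPROC of 2000, a hypothetical maxproc of 4000 and a
 * 4M-entry vm_page_array, pv_entry_max comes to roughly
 * 2000 * 4000 + 4M ~= 12M entries, and pv_entry_high_water is set to
 * 90% of that.  Both can be overridden with the vm.pmap.shpgperproc
 * and vm.pmap.pv_entries tunables.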
1130 */ 1131 void 1132 pmap_init2(void) 1133 { 1134 int shpgperproc = PMAP_SHPGPERPROC; 1135 int entry_max; 1136 1137 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1138 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 1139 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1140 pv_entry_high_water = 9 * (pv_entry_max / 10); 1141 1142 /* 1143 * Subtract out pages already installed in the zone (hack) 1144 */ 1145 entry_max = pv_entry_max - vm_page_array_size; 1146 if (entry_max <= 0) 1147 entry_max = 1; 1148 1149 zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT); 1150 1151 /* 1152 * Enable dynamic deletion of empty higher-level page table pages 1153 * by default only if system memory is < 8GB (use 7GB for slop). 1154 * This can save a little memory, but imposes significant 1155 * performance overhead for things like bulk builds, and for programs 1156 * which do a lot of memory mapping and memory unmapping. 1157 */ 1158 if (pmap_dynamic_delete < 0) { 1159 if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE) 1160 pmap_dynamic_delete = 1; 1161 else 1162 pmap_dynamic_delete = 0; 1163 } 1164 } 1165 1166 /* 1167 * Typically used to initialize a fictitious page by vm/device_pager.c 1168 */ 1169 void 1170 pmap_page_init(struct vm_page *m) 1171 { 1172 vm_page_init(m); 1173 TAILQ_INIT(&m->md.pv_list); 1174 } 1175 1176 /*************************************************** 1177 * Low level helper routines..... 1178 ***************************************************/ 1179 1180 /* 1181 * this routine defines the region(s) of memory that should 1182 * not be tested for the modified bit. 1183 */ 1184 static __inline 1185 int 1186 pmap_track_modified(vm_pindex_t pindex) 1187 { 1188 vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT; 1189 if ((va < clean_sva) || (va >= clean_eva)) 1190 return 1; 1191 else 1192 return 0; 1193 } 1194 1195 /* 1196 * Extract the physical page address associated with the map/VA pair. 1197 * The page must be wired for this to work reliably. 1198 * 1199 * XXX for the moment we're using pv_find() instead of pv_get(), as 1200 * callers might be expecting non-blocking operation. 1201 */ 1202 vm_paddr_t 1203 pmap_extract(pmap_t pmap, vm_offset_t va) 1204 { 1205 vm_paddr_t rtval; 1206 pv_entry_t pt_pv; 1207 pt_entry_t *ptep; 1208 1209 rtval = 0; 1210 if (va >= VM_MAX_USER_ADDRESS) { 1211 /* 1212 * Kernel page directories might be direct-mapped and 1213 * there is typically no PV tracking of pte's 1214 */ 1215 pd_entry_t *pt; 1216 1217 pt = pmap_pt(pmap, va); 1218 if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) { 1219 if (*pt & pmap->pmap_bits[PG_PS_IDX]) { 1220 rtval = *pt & PG_PS_FRAME; 1221 rtval |= va & PDRMASK; 1222 } else { 1223 ptep = pmap_pt_to_pte(*pt, va); 1224 if (*pt & pmap->pmap_bits[PG_V_IDX]) { 1225 rtval = *ptep & PG_FRAME; 1226 rtval |= va & PAGE_MASK; 1227 } 1228 } 1229 } 1230 } else { 1231 /* 1232 * User pages currently do not direct-map the page directory 1233 * and some pages might not used managed PVs. But all PT's 1234 * will have a PV. 1235 */ 1236 pt_pv = pv_find(pmap, pmap_pt_pindex(va)); 1237 if (pt_pv) { 1238 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1239 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 1240 rtval = *ptep & PG_FRAME; 1241 rtval |= va & PAGE_MASK; 1242 } 1243 pv_drop(pt_pv); 1244 } 1245 } 1246 return rtval; 1247 } 1248 1249 /* 1250 * Similar to extract but checks protections, SMP-friendly short-cut for 1251 * vm_fault_page[_quick](). 
Can return NULL to cause the caller to 1252 * fall-through to the real fault code. 1253 * 1254 * The returned page, if not NULL, is held (and not busied). 1255 */ 1256 vm_page_t 1257 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1258 { 1259 if (pmap && va < VM_MAX_USER_ADDRESS) { 1260 pv_entry_t pt_pv; 1261 pv_entry_t pte_pv; 1262 pt_entry_t *ptep; 1263 pt_entry_t req; 1264 vm_page_t m; 1265 int error; 1266 1267 req = pmap->pmap_bits[PG_V_IDX] | 1268 pmap->pmap_bits[PG_U_IDX]; 1269 if (prot & VM_PROT_WRITE) 1270 req |= pmap->pmap_bits[PG_RW_IDX]; 1271 1272 pt_pv = pv_find(pmap, pmap_pt_pindex(va)); 1273 if (pt_pv == NULL) 1274 return (NULL); 1275 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1276 if ((*ptep & req) != req) { 1277 pv_drop(pt_pv); 1278 return (NULL); 1279 } 1280 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), &error); 1281 if (pte_pv && error == 0) { 1282 m = pte_pv->pv_m; 1283 vm_page_hold(m); 1284 if (prot & VM_PROT_WRITE) 1285 vm_page_dirty(m); 1286 pv_put(pte_pv); 1287 } else if (pte_pv) { 1288 pv_drop(pte_pv); 1289 m = NULL; 1290 } else { 1291 m = NULL; 1292 } 1293 pv_drop(pt_pv); 1294 return(m); 1295 } else { 1296 return(NULL); 1297 } 1298 } 1299 1300 /* 1301 * Extract the physical page address associated kernel virtual address. 1302 */ 1303 vm_paddr_t 1304 pmap_kextract(vm_offset_t va) 1305 { 1306 pd_entry_t pt; /* pt entry in pd */ 1307 vm_paddr_t pa; 1308 1309 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1310 pa = DMAP_TO_PHYS(va); 1311 } else { 1312 pt = *vtopt(va); 1313 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) { 1314 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1315 } else { 1316 /* 1317 * Beware of a concurrent promotion that changes the 1318 * PDE at this point! For example, vtopte() must not 1319 * be used to access the PTE because it would use the 1320 * new PDE. It is, however, safe to use the old PDE 1321 * because the page table page is preserved by the 1322 * promotion. 1323 */ 1324 pa = *pmap_pt_to_pte(pt, va); 1325 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1326 } 1327 } 1328 return pa; 1329 } 1330 1331 /*************************************************** 1332 * Low level mapping routines..... 1333 ***************************************************/ 1334 1335 /* 1336 * Routine: pmap_kenter 1337 * Function: 1338 * Add a wired page to the KVA 1339 * NOTE! note that in order for the mapping to take effect -- you 1340 * should do an invltlb after doing the pmap_kenter(). 1341 */ 1342 void 1343 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1344 { 1345 pt_entry_t *ptep; 1346 pt_entry_t npte; 1347 1348 npte = pa | 1349 kernel_pmap.pmap_bits[PG_RW_IDX] | 1350 kernel_pmap.pmap_bits[PG_V_IDX]; 1351 // pgeflag; 1352 ptep = vtopte(va); 1353 #if 1 1354 pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte); 1355 #else 1356 /* FUTURE */ 1357 if (*ptep) 1358 pmap_inval_smp(&kernel_pmap, va, ptep, npte); 1359 else 1360 *ptep = npte; 1361 #endif 1362 } 1363 1364 /* 1365 * Similar to pmap_kenter(), except we only invalidate the mapping on the 1366 * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't 1367 * (caller can conditionalize calling smp_invltlb()). 
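 *
 * A sketch of the intended usage pattern (illustrative loop, not taken
 * from a real caller):
 *
 *	int needs_inval = 0;
 *
 *	for (i = 0; i < count; ++i)
 *		needs_inval |= pmap_kenter_quick(va + i * PAGE_SIZE, pa[i]);
 *	if (needs_inval)
 *		smp_invltlb();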
1368 */ 1369 int 1370 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1371 { 1372 pt_entry_t *ptep; 1373 pt_entry_t npte; 1374 int res; 1375 1376 npte = pa | 1377 kernel_pmap.pmap_bits[PG_RW_IDX] | 1378 kernel_pmap.pmap_bits[PG_V_IDX]; 1379 // pgeflag; 1380 ptep = vtopte(va); 1381 #if 1 1382 res = 1; 1383 #else 1384 /* FUTURE */ 1385 res = (*ptep != 0); 1386 #endif 1387 *ptep = npte; 1388 cpu_invlpg((void *)va); 1389 1390 return res; 1391 } 1392 1393 /* 1394 * Enter addresses into the kernel pmap but don't bother 1395 * doing any tlb invalidations. Caller will do a rollup 1396 * invalidation via pmap_rollup_inval(). 1397 */ 1398 int 1399 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 1400 { 1401 pt_entry_t *ptep; 1402 pt_entry_t npte; 1403 int res; 1404 1405 npte = pa | 1406 kernel_pmap.pmap_bits[PG_RW_IDX] | 1407 kernel_pmap.pmap_bits[PG_V_IDX]; 1408 // pgeflag; 1409 ptep = vtopte(va); 1410 #if 1 1411 res = 1; 1412 #else 1413 /* FUTURE */ 1414 res = (*ptep != 0); 1415 #endif 1416 *ptep = npte; 1417 cpu_invlpg((void *)va); 1418 1419 return res; 1420 } 1421 1422 /* 1423 * remove a page from the kernel pagetables 1424 */ 1425 void 1426 pmap_kremove(vm_offset_t va) 1427 { 1428 pt_entry_t *ptep; 1429 1430 ptep = vtopte(va); 1431 pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0); 1432 } 1433 1434 void 1435 pmap_kremove_quick(vm_offset_t va) 1436 { 1437 pt_entry_t *ptep; 1438 1439 ptep = vtopte(va); 1440 (void)pte_load_clear(ptep); 1441 cpu_invlpg((void *)va); 1442 } 1443 1444 /* 1445 * Remove addresses from the kernel pmap but don't bother 1446 * doing any tlb invalidations. Caller will do a rollup 1447 * invalidation via pmap_rollup_inval(). 1448 */ 1449 void 1450 pmap_kremove_noinval(vm_offset_t va) 1451 { 1452 pt_entry_t *ptep; 1453 1454 ptep = vtopte(va); 1455 (void)pte_load_clear(ptep); 1456 } 1457 1458 /* 1459 * XXX these need to be recoded. They are not used in any critical path. 1460 */ 1461 void 1462 pmap_kmodify_rw(vm_offset_t va) 1463 { 1464 atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]); 1465 cpu_invlpg((void *)va); 1466 } 1467 1468 /* NOT USED 1469 void 1470 pmap_kmodify_nc(vm_offset_t va) 1471 { 1472 atomic_set_long(vtopte(va), PG_N); 1473 cpu_invlpg((void *)va); 1474 } 1475 */ 1476 1477 /* 1478 * Used to map a range of physical addresses into kernel virtual 1479 * address space during the low level boot, typically to map the 1480 * dump bitmap, message buffer, and vm_page_array. 1481 * 1482 * These mappings are typically made at some pointer after the end of the 1483 * kernel text+data. 1484 * 1485 * We could return PHYS_TO_DMAP(start) here and not allocate any 1486 * via (*virtp), but then kmem from userland and kernel dumps won't 1487 * have access to the related pointers. 1488 */ 1489 vm_offset_t 1490 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1491 { 1492 vm_offset_t va; 1493 vm_offset_t va_start; 1494 1495 /*return PHYS_TO_DMAP(start);*/ 1496 1497 va_start = *virtp; 1498 va = va_start; 1499 1500 while (start < end) { 1501 pmap_kenter_quick(va, start); 1502 va += PAGE_SIZE; 1503 start += PAGE_SIZE; 1504 } 1505 *virtp = va; 1506 return va_start; 1507 } 1508 1509 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1510 1511 /* 1512 * Remove the specified set of pages from the data and instruction caches. 1513 * 1514 * In contrast to pmap_invalidate_cache_range(), this function does not 1515 * rely on the CPU's self-snoop feature, because it is intended for use 1516 * when moving pages into a different cache domain. 
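 *
 * With the PMAP_CLFLUSH_THRESHOLD of 2MB defined just above and 4KB
 * pages, the cutoff works out to 2MB / PAGE_SIZE = 512 pages: for
 * larger sets (or when CLFLUSH is unsupported) a single wbinvd() is
 * cheaper than walking every cache line of every page.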
1517 */ 1518 void 1519 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1520 { 1521 vm_offset_t daddr, eva; 1522 int i; 1523 1524 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1525 (cpu_feature & CPUID_CLFSH) == 0) 1526 wbinvd(); 1527 else { 1528 cpu_mfence(); 1529 for (i = 0; i < count; i++) { 1530 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1531 eva = daddr + PAGE_SIZE; 1532 for (; daddr < eva; daddr += cpu_clflush_line_size) 1533 clflush(daddr); 1534 } 1535 cpu_mfence(); 1536 } 1537 } 1538 1539 void 1540 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1541 { 1542 KASSERT((sva & PAGE_MASK) == 0, 1543 ("pmap_invalidate_cache_range: sva not page-aligned")); 1544 KASSERT((eva & PAGE_MASK) == 0, 1545 ("pmap_invalidate_cache_range: eva not page-aligned")); 1546 1547 if (cpu_feature & CPUID_SS) { 1548 ; /* If "Self Snoop" is supported, do nothing. */ 1549 } else { 1550 /* Globally invalidate caches */ 1551 cpu_wbinvd_on_all_cpus(); 1552 } 1553 } 1554 1555 /* 1556 * Invalidate the specified range of virtual memory on all cpus associated 1557 * with the pmap. 1558 */ 1559 void 1560 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1561 { 1562 pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); 1563 } 1564 1565 /* 1566 * Add a list of wired pages to the kva. This routine is used for temporary 1567 * kernel mappings such as those found in buffer cache buffer. Page 1568 * modifications and accesses are not tracked or recorded. 1569 * 1570 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed 1571 * semantics as previous mappings may have been zerod without any 1572 * invalidation. 1573 * 1574 * The page *must* be wired. 1575 */ 1576 void 1577 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 1578 { 1579 vm_offset_t end_va; 1580 vm_offset_t va; 1581 1582 end_va = beg_va + count * PAGE_SIZE; 1583 1584 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1585 pt_entry_t *pte; 1586 1587 pte = vtopte(va); 1588 *pte = VM_PAGE_TO_PHYS(*m) | 1589 kernel_pmap.pmap_bits[PG_RW_IDX] | 1590 kernel_pmap.pmap_bits[PG_V_IDX] | 1591 kernel_pmap.pmap_cache_bits[(*m)->pat_mode]; 1592 // pgeflag; 1593 m++; 1594 } 1595 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1596 } 1597 1598 /* 1599 * This routine jerks page mappings from the kernel -- it is meant only 1600 * for temporary mappings such as those found in buffer cache buffers. 1601 * No recording modified or access status occurs. 1602 * 1603 * MPSAFE, INTERRUPT SAFE (cluster callback) 1604 */ 1605 void 1606 pmap_qremove(vm_offset_t beg_va, int count) 1607 { 1608 vm_offset_t end_va; 1609 vm_offset_t va; 1610 1611 end_va = beg_va + count * PAGE_SIZE; 1612 1613 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1614 pt_entry_t *pte; 1615 1616 pte = vtopte(va); 1617 (void)pte_load_clear(pte); 1618 cpu_invlpg((void *)va); 1619 } 1620 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1621 } 1622 1623 /* 1624 * This routine removes temporary kernel mappings, only invalidating them 1625 * on the current cpu. It should only be used under carefully controlled 1626 * conditions. 
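 *
 * A sketch of how the pmap_q*() variants pair up for a transient
 * buffer mapping (illustrative only):
 *
 *	pmap_qenter(kva, mlist, npages);	(map + invalidate all cpus)
 *	... use the mapping ...
 *	pmap_qremove(kva, npages);		(unmap + invalidate all cpus)
 *
 * pmap_qremove_quick() and pmap_qremove_noinval() trade invalidation
 * coverage for speed and may only be used when the caller controls TLB
 * invalidation itself.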
1627 */ 1628 void 1629 pmap_qremove_quick(vm_offset_t beg_va, int count) 1630 { 1631 vm_offset_t end_va; 1632 vm_offset_t va; 1633 1634 end_va = beg_va + count * PAGE_SIZE; 1635 1636 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1637 pt_entry_t *pte; 1638 1639 pte = vtopte(va); 1640 (void)pte_load_clear(pte); 1641 cpu_invlpg((void *)va); 1642 } 1643 } 1644 1645 /* 1646 * This routine removes temporary kernel mappings *without* invalidating 1647 * the TLB. It can only be used on permanent kva reservations such as those 1648 * found in buffer cache buffers, under carefully controlled circumstances. 1649 * 1650 * NOTE: Repopulating these KVAs requires unconditional invalidation. 1651 * (pmap_qenter() does unconditional invalidation). 1652 */ 1653 void 1654 pmap_qremove_noinval(vm_offset_t beg_va, int count) 1655 { 1656 vm_offset_t end_va; 1657 vm_offset_t va; 1658 1659 end_va = beg_va + count * PAGE_SIZE; 1660 1661 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1662 pt_entry_t *pte; 1663 1664 pte = vtopte(va); 1665 (void)pte_load_clear(pte); 1666 } 1667 } 1668 1669 /* 1670 * Create a new thread and optionally associate it with a (new) process. 1671 * NOTE! the new thread's cpu may not equal the current cpu. 1672 */ 1673 void 1674 pmap_init_thread(thread_t td) 1675 { 1676 /* enforce pcb placement & alignment */ 1677 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1678 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 1679 td->td_savefpu = &td->td_pcb->pcb_save; 1680 td->td_sp = (char *)td->td_pcb; /* no -16 */ 1681 } 1682 1683 /* 1684 * This routine directly affects the fork perf for a process. 1685 */ 1686 void 1687 pmap_init_proc(struct proc *p) 1688 { 1689 } 1690 1691 static void 1692 pmap_pinit_defaults(struct pmap *pmap) 1693 { 1694 bcopy(pmap_bits_default, pmap->pmap_bits, 1695 sizeof(pmap_bits_default)); 1696 bcopy(protection_codes, pmap->protection_codes, 1697 sizeof(protection_codes)); 1698 bcopy(pat_pte_index, pmap->pmap_cache_bits, 1699 sizeof(pat_pte_index)); 1700 pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; 1701 pmap->copyinstr = std_copyinstr; 1702 pmap->copyin = std_copyin; 1703 pmap->copyout = std_copyout; 1704 pmap->fubyte = std_fubyte; 1705 pmap->subyte = std_subyte; 1706 pmap->fuword = std_fuword; 1707 pmap->suword = std_suword; 1708 pmap->suword32 = std_suword32; 1709 } 1710 /* 1711 * Initialize pmap0/vmspace0. 1712 * 1713 * On architectures where the kernel pmap is not integrated into the user 1714 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1715 * kernel_pmap should be used to directly access the kernel_pmap. 1716 */ 1717 void 1718 pmap_pinit0(struct pmap *pmap) 1719 { 1720 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1721 pmap->pm_count = 1; 1722 CPUMASK_ASSZERO(pmap->pm_active); 1723 pmap->pm_pvhint = NULL; 1724 RB_INIT(&pmap->pm_pvroot); 1725 spin_init(&pmap->pm_spin, "pmapinit0"); 1726 lwkt_token_init(&pmap->pm_token, "pmap_tok"); 1727 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1728 pmap_pinit_defaults(pmap); 1729 } 1730 1731 /* 1732 * Initialize a preallocated and zeroed pmap structure, 1733 * such as one in a vmspace structure. 
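 *
 * A sketch of the expected lifecycle (hedged; the vmspace code is the
 * real consumer):
 *
 *	pmap_pinit(pmap);		(at vmspace creation)
 *	pmap_enter(...), pmap_remove(...), ...
 *	pmap_puninit(pmap);		(at vmspace destruction)
 *
 * pmap_pinit_simple() is the reduced form used for pmaps embedded in
 * VM objects (PMAP_FLAG_SIMPLE), which never allocate above the PD
 * level.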
1734 */ 1735 static void 1736 pmap_pinit_simple(struct pmap *pmap) 1737 { 1738 /* 1739 * Misc initialization 1740 */ 1741 pmap->pm_count = 1; 1742 CPUMASK_ASSZERO(pmap->pm_active); 1743 pmap->pm_pvhint = NULL; 1744 pmap->pm_flags = PMAP_FLAG_SIMPLE; 1745 1746 pmap_pinit_defaults(pmap); 1747 1748 /* 1749 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 1750 * for this). 1751 */ 1752 if (pmap->pm_pmlpv == NULL) { 1753 RB_INIT(&pmap->pm_pvroot); 1754 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1755 spin_init(&pmap->pm_spin, "pmapinitsimple"); 1756 lwkt_token_init(&pmap->pm_token, "pmap_tok"); 1757 } 1758 } 1759 1760 void 1761 pmap_pinit(struct pmap *pmap) 1762 { 1763 pv_entry_t pv; 1764 int j; 1765 1766 if (pmap->pm_pmlpv) { 1767 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { 1768 pmap_puninit(pmap); 1769 } 1770 } 1771 1772 pmap_pinit_simple(pmap); 1773 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 1774 1775 /* 1776 * No need to allocate page table space yet but we do need a valid 1777 * page directory table. 1778 */ 1779 if (pmap->pm_pml4 == NULL) { 1780 pmap->pm_pml4 = 1781 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, 1782 PAGE_SIZE, 1783 VM_SUBSYS_PML4); 1784 } 1785 1786 /* 1787 * Allocate the page directory page, which wires it even though 1788 * it isn't being entered into some higher level page table (it 1789 * being the highest level). If one is already cached we don't 1790 * have to do anything. 1791 */ 1792 if ((pv = pmap->pm_pmlpv) == NULL) { 1793 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 1794 pmap->pm_pmlpv = pv; 1795 pmap_kenter((vm_offset_t)pmap->pm_pml4, 1796 VM_PAGE_TO_PHYS(pv->pv_m)); 1797 pv_put(pv); 1798 1799 /* 1800 * Install DMAP and KMAP. 1801 */ 1802 for (j = 0; j < NDMPML4E; ++j) { 1803 pmap->pm_pml4[DMPML4I + j] = 1804 (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) | 1805 pmap->pmap_bits[PG_RW_IDX] | 1806 pmap->pmap_bits[PG_V_IDX] | 1807 pmap->pmap_bits[PG_U_IDX]; 1808 } 1809 pmap->pm_pml4[KPML4I] = KPDPphys | 1810 pmap->pmap_bits[PG_RW_IDX] | 1811 pmap->pmap_bits[PG_V_IDX] | 1812 pmap->pmap_bits[PG_U_IDX]; 1813 1814 /* 1815 * install self-referential address mapping entry 1816 */ 1817 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 1818 pmap->pmap_bits[PG_V_IDX] | 1819 pmap->pmap_bits[PG_RW_IDX] | 1820 pmap->pmap_bits[PG_A_IDX] | 1821 pmap->pmap_bits[PG_M_IDX]; 1822 } else { 1823 KKASSERT(pv->pv_m->flags & PG_MAPPED); 1824 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 1825 } 1826 KKASSERT(pmap->pm_pml4[255] == 0); 1827 KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv); 1828 KKASSERT(pv->pv_entry.rbe_left == NULL); 1829 KKASSERT(pv->pv_entry.rbe_right == NULL); 1830 } 1831 1832 /* 1833 * Clean up a pmap structure so it can be physically freed. This routine 1834 * is called by the vmspace dtor function. A great deal of pmap data is 1835 * left passively mapped to improve vmspace management so we have a bit 1836 * of cleanup work to do here. 
1837 */ 1838 void 1839 pmap_puninit(pmap_t pmap) 1840 { 1841 pv_entry_t pv; 1842 vm_page_t p; 1843 1844 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 1845 if ((pv = pmap->pm_pmlpv) != NULL) { 1846 if (pv_hold_try(pv) == 0) 1847 pv_lock(pv); 1848 KKASSERT(pv == pmap->pm_pmlpv); 1849 p = pmap_remove_pv_page(pv); 1850 pv_free(pv, NULL, 1); 1851 pv = NULL; /* safety */ 1852 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1853 vm_page_busy_wait(p, FALSE, "pgpun"); 1854 KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); 1855 vm_page_unwire(p, 0); 1856 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1857 1858 /* 1859 * XXX eventually clean out PML4 static entries and 1860 * use vm_page_free_zero() 1861 */ 1862 vm_page_free(p); 1863 pmap->pm_pmlpv = NULL; 1864 } 1865 if (pmap->pm_pml4) { 1866 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1867 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1868 pmap->pm_pml4 = NULL; 1869 } 1870 KKASSERT(pmap->pm_stats.resident_count == 0); 1871 KKASSERT(pmap->pm_stats.wired_count == 0); 1872 } 1873 1874 /* 1875 * This function is now unused (used to add the pmap to the pmap_list) 1876 */ 1877 void 1878 pmap_pinit2(struct pmap *pmap) 1879 { 1880 } 1881 1882 /* 1883 * This routine is called when various levels in the page table need to 1884 * be populated. This routine cannot fail. 1885 * 1886 * This function returns two locked pv_entry's, one representing the 1887 * requested pv and one representing the requested pv's parent pv. If 1888 * an intermediate page table does not exist it will be created, mapped, 1889 * wired, and the parent page table will be given an additional hold 1890 * count representing the presence of the child pv_entry. 1891 */ 1892 static 1893 pv_entry_t 1894 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 1895 { 1896 pt_entry_t *ptep; 1897 pv_entry_t pv; 1898 pv_entry_t pvp; 1899 vm_pindex_t pt_pindex; 1900 vm_page_t m; 1901 int isnew; 1902 int ispt; 1903 1904 /* 1905 * If the pv already exists and we aren't being asked for the 1906 * parent page table page we can just return it. A locked+held pv 1907 * is returned. The pv will also have a second hold related to the 1908 * pmap association that we don't have to worry about. 1909 */ 1910 ispt = 0; 1911 pv = pv_alloc(pmap, ptepindex, &isnew); 1912 if (isnew == 0 && pvpp == NULL) 1913 return(pv); 1914 1915 /* 1916 * Special case terminal PVs. These are not page table pages so 1917 * no vm_page is allocated (the caller supplied the vm_page). If 1918 * pvpp is non-NULL we are being asked to also removed the pt_pv 1919 * for this pv. 1920 * 1921 * Note that pt_pv's are only returned for user VAs. We assert that 1922 * a pt_pv is not being requested for kernel VAs. The kernel 1923 * pre-wires all higher-level page tables so don't overload managed 1924 * higher-level page tables on top of it! 1925 */ 1926 if (ptepindex < pmap_pt_pindex(0)) { 1927 if (ptepindex >= NUPTE_USER) { 1928 /* kernel manages this manually for KVM */ 1929 KKASSERT(pvpp == NULL); 1930 } else { 1931 KKASSERT(pvpp != NULL); 1932 pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); 1933 pvp = pmap_allocpte(pmap, pt_pindex, NULL); 1934 if (isnew) { 1935 vm_page_wire_quick(pvp->pv_m); 1936 if (pvpp) 1937 *pvpp = pvp; 1938 else 1939 pv_put(pvp); 1940 } else { 1941 *pvpp = pvp; 1942 } 1943 } 1944 return(pv); 1945 } 1946 1947 /* 1948 * The kernel never uses managed PT/PD/PDP pages. 
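 *
 * The parent page table page for a given pindex is found by translating
 * the pindex up one level; a sketch for a terminal pte pindex:
 *
 *	pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
 *
 * which is exactly what the terminal-PV case above computes before
 * recursing, and the PT/PD/PDP cases below do the analogous shift into
 * the next range up.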
1949 */ 1950 KKASSERT(pmap != &kernel_pmap); 1951 1952 /* 1953 * Non-terminal PVs allocate a VM page to represent the page table, 1954 * so we have to resolve pvp and calculate ptepindex for the pvp 1955 * and then for the page table entry index in the pvp for 1956 * fall-through. 1957 */ 1958 if (ptepindex < pmap_pd_pindex(0)) { 1959 /* 1960 * pv is PT, pvp is PD 1961 */ 1962 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 1963 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 1964 pvp = pmap_allocpte(pmap, ptepindex, NULL); 1965 if (!isnew) 1966 goto notnew; 1967 1968 /* 1969 * PT index in PD 1970 */ 1971 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 1972 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 1973 ispt = 1; 1974 } else if (ptepindex < pmap_pdp_pindex(0)) { 1975 /* 1976 * pv is PD, pvp is PDP 1977 * 1978 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 1979 * the PD. 1980 */ 1981 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 1982 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 1983 1984 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 1985 KKASSERT(pvpp == NULL); 1986 pvp = NULL; 1987 } else { 1988 pvp = pmap_allocpte(pmap, ptepindex, NULL); 1989 } 1990 if (!isnew) 1991 goto notnew; 1992 1993 /* 1994 * PD index in PDP 1995 */ 1996 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 1997 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 1998 } else if (ptepindex < pmap_pml4_pindex()) { 1999 /* 2000 * pv is PDP, pvp is the root pml4 table 2001 */ 2002 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2003 if (!isnew) 2004 goto notnew; 2005 2006 /* 2007 * PDP index in PML4 2008 */ 2009 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 2010 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 2011 } else { 2012 /* 2013 * pv represents the top-level PML4, there is no parent. 2014 */ 2015 pvp = NULL; 2016 if (!isnew) 2017 goto notnew; 2018 } 2019 2020 /* 2021 * (isnew) is TRUE, pv is not terminal. 2022 * 2023 * (1) Add a wire count to the parent page table (pvp). 2024 * (2) Allocate a VM page for the page table. 2025 * (3) Enter the VM page into the parent page table. 2026 * 2027 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 2028 */ 2029 if (pvp) 2030 vm_page_wire_quick(pvp->pv_m); 2031 2032 for (;;) { 2033 m = vm_page_alloc(NULL, pv->pv_pindex, 2034 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 2035 VM_ALLOC_INTERRUPT); 2036 if (m) 2037 break; 2038 vm_wait(0); 2039 } 2040 vm_page_spin_lock(m); 2041 pmap_page_stats_adding(m); 2042 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2043 pv->pv_m = m; 2044 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 2045 vm_page_spin_unlock(m); 2046 vm_page_unmanage(m); /* m must be spinunlocked */ 2047 2048 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 2049 m->valid = VM_PAGE_BITS_ALL; 2050 vm_page_wire(m); /* wire for mapping in parent */ 2051 2052 /* 2053 * Wire the page into pvp. Bump the resident_count for the pmap. 2054 * There is no pvp for the top level, address the pm_pml4[] array 2055 * directly. 2056 * 2057 * If the caller wants the parent we return it, otherwise 2058 * we just put it away. 2059 * 2060 * No interlock is needed for pte 0 -> non-zero. 2061 * 2062 * In the situation where *ptep is valid we might have an unmanaged 2063 * page table page shared from another page table which we need to 2064 * unshare before installing our private page table page. 
2065 */ 2066 if (pvp) { 2067 ptep = pv_pte_lookup(pvp, ptepindex); 2068 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2069 pt_entry_t pte; 2070 2071 if (ispt == 0) { 2072 panic("pmap_allocpte: unexpected pte %p/%d", 2073 pvp, (int)ptepindex); 2074 } 2075 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 2076 if (vm_page_unwire_quick( 2077 PHYS_TO_VM_PAGE(pte & PG_FRAME))) { 2078 panic("pmap_allocpte: shared pgtable " 2079 "pg bad wirecount"); 2080 } 2081 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2082 } 2083 *ptep = VM_PAGE_TO_PHYS(m) | 2084 (pmap->pmap_bits[PG_U_IDX] | 2085 pmap->pmap_bits[PG_RW_IDX] | 2086 pmap->pmap_bits[PG_V_IDX] | 2087 pmap->pmap_bits[PG_A_IDX] | 2088 pmap->pmap_bits[PG_M_IDX]); 2089 } 2090 vm_page_wakeup(m); 2091 notnew: 2092 if (pvpp) 2093 *pvpp = pvp; 2094 else if (pvp) 2095 pv_put(pvp); 2096 return (pv); 2097 } 2098 2099 /* 2100 * This version of pmap_allocpte() checks for possible segment optimizations 2101 * that would allow page-table sharing. It can be called for terminal 2102 * page or page table page ptepindex's. 2103 * 2104 * The function is called with page table page ptepindex's for fictitious 2105 * and unmanaged terminal pages. That is, we don't want to allocate a 2106 * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL 2107 * for this case. 2108 * 2109 * This function can return a pv and *pvpp associated with the passed in pmap 2110 * OR a pv and *pvpp associated with the shared pmap. In the latter case 2111 * an unmanaged page table page will be entered into the pass in pmap. 2112 */ 2113 static 2114 pv_entry_t 2115 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, 2116 vm_map_entry_t entry, vm_offset_t va) 2117 { 2118 vm_object_t object; 2119 pmap_t obpmap; 2120 pmap_t *obpmapp; 2121 vm_offset_t b; 2122 pv_entry_t pte_pv; /* in original or shared pmap */ 2123 pv_entry_t pt_pv; /* in original or shared pmap */ 2124 pv_entry_t proc_pd_pv; /* in original pmap */ 2125 pv_entry_t proc_pt_pv; /* in original pmap */ 2126 pv_entry_t xpv; /* PT in shared pmap */ 2127 pd_entry_t *pt; /* PT entry in PD of original pmap */ 2128 pd_entry_t opte; /* contents of *pt */ 2129 pd_entry_t npte; /* contents of *pt */ 2130 vm_page_t m; 2131 2132 retry: 2133 /* 2134 * Basic tests, require a non-NULL vm_map_entry, require proper 2135 * alignment and type for the vm_map_entry, require that the 2136 * underlying object already be allocated. 2137 * 2138 * We allow almost any type of object to use this optimization. 2139 * The object itself does NOT have to be sized to a multiple of the 2140 * segment size, but the memory mapping does. 2141 * 2142 * XXX don't handle devices currently, because VM_PAGE_TO_PHYS() 2143 * won't work as expected. 2144 */ 2145 if (entry == NULL || 2146 pmap_mmu_optimize == 0 || /* not enabled */ 2147 (pmap->pm_flags & PMAP_HVM) || /* special pmap */ 2148 ptepindex >= pmap_pd_pindex(0) || /* not terminal or pt */ 2149 entry->inheritance != VM_INHERIT_SHARE || /* not shared */ 2150 entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ 2151 entry->object.vm_object == NULL || /* needs VM object */ 2152 entry->object.vm_object->type == OBJT_DEVICE || /* ick */ 2153 entry->object.vm_object->type == OBJT_MGTDEVICE || /* ick */ 2154 (entry->offset & SEG_MASK) || /* must be aligned */ 2155 (entry->start & SEG_MASK)) { 2156 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2157 } 2158 2159 /* 2160 * Make sure the full segment can be represented. 
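 *
 * (Concretely: the SEG_SIZE-aligned base of va must lie entirely
 * inside [entry->start, entry->end); if not, fall back to the
 * normal pmap_allocpte() path.)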
2161 */ 2162 b = va & ~(vm_offset_t)SEG_MASK; 2163 if (b < entry->start || b + SEG_SIZE > entry->end) 2164 return(pmap_allocpte(pmap, ptepindex, pvpp)); 2165 2166 /* 2167 * If the full segment can be represented dive the VM object's 2168 * shared pmap, allocating as required. 2169 */ 2170 object = entry->object.vm_object; 2171 2172 if (entry->protection & VM_PROT_WRITE) 2173 obpmapp = &object->md.pmap_rw; 2174 else 2175 obpmapp = &object->md.pmap_ro; 2176 2177 #ifdef PMAP_DEBUG2 2178 if (pmap_enter_debug > 0) { 2179 --pmap_enter_debug; 2180 kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p " 2181 "obpmapp %p %p\n", 2182 va, entry->protection, object, 2183 obpmapp, *obpmapp); 2184 kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n", 2185 entry, entry->start, entry->end); 2186 } 2187 #endif 2188 2189 /* 2190 * We allocate what appears to be a normal pmap but because portions 2191 * of this pmap are shared with other unrelated pmaps we have to 2192 * set pm_active to point to all cpus. 2193 * 2194 * XXX Currently using pmap_spin to interlock the update, can't use 2195 * vm_object_hold/drop because the token might already be held 2196 * shared OR exclusive and we don't know. 2197 */ 2198 while ((obpmap = *obpmapp) == NULL) { 2199 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); 2200 pmap_pinit_simple(obpmap); 2201 pmap_pinit2(obpmap); 2202 spin_lock(&pmap_spin); 2203 if (*obpmapp != NULL) { 2204 /* 2205 * Handle race 2206 */ 2207 spin_unlock(&pmap_spin); 2208 pmap_release(obpmap); 2209 pmap_puninit(obpmap); 2210 kfree(obpmap, M_OBJPMAP); 2211 obpmap = *obpmapp; /* safety */ 2212 } else { 2213 obpmap->pm_active = smp_active_mask; 2214 obpmap->pm_flags |= PMAP_SEGSHARED; 2215 *obpmapp = obpmap; 2216 spin_unlock(&pmap_spin); 2217 } 2218 } 2219 2220 /* 2221 * Layering is: PTE, PT, PD, PDP, PML4. We have to return the 2222 * pte/pt using the shared pmap from the object but also adjust 2223 * the process pmap's page table page as a side effect. 2224 */ 2225 2226 /* 2227 * Resolve the terminal PTE and PT in the shared pmap. This is what 2228 * we will return. This is true if ptepindex represents a terminal 2229 * page, otherwise pte_pv is actually the PT and pt_pv is actually 2230 * the PD. 2231 */ 2232 pt_pv = NULL; 2233 pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); 2234 if (ptepindex >= pmap_pt_pindex(0)) 2235 xpv = pte_pv; 2236 else 2237 xpv = pt_pv; 2238 2239 /* 2240 * Resolve the PD in the process pmap so we can properly share the 2241 * page table page. Lock order is bottom-up (leaf first)! 2242 * 2243 * NOTE: proc_pt_pv can be NULL. 2244 */ 2245 proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b)); 2246 proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); 2247 #ifdef PMAP_DEBUG2 2248 if (pmap_enter_debug > 0) { 2249 --pmap_enter_debug; 2250 kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n", 2251 proc_pt_pv, 2252 (proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1), 2253 proc_pd_pv, 2254 va); 2255 } 2256 #endif 2257 2258 /* 2259 * xpv is the page table page pv from the shared object 2260 * (for convenience), from above. 2261 * 2262 * Calculate the pte value for the PT to load into the process PD. 2263 * If we have to change it we must properly dispose of the previous 2264 * entry. 
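 *
 * Roughly three cases follow: an existing private proc_pt_pv is
 * either released (single wire_count) or, if still populated, the
 * whole segment is pmap_remove()d and we retry; *pt == 0 simply
 * installs npte and adds the new wirings; *pt != npte replaces a
 * stale entry and unwires the page it referenced.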
2265 */ 2266 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2267 npte = VM_PAGE_TO_PHYS(xpv->pv_m) | 2268 (pmap->pmap_bits[PG_U_IDX] | 2269 pmap->pmap_bits[PG_RW_IDX] | 2270 pmap->pmap_bits[PG_V_IDX] | 2271 pmap->pmap_bits[PG_A_IDX] | 2272 pmap->pmap_bits[PG_M_IDX]); 2273 2274 /* 2275 * Dispose of previous page table page if it was local to the 2276 * process pmap. If the old pt is not empty we cannot dispose of it 2277 * until we clean it out. This case should not arise very often so 2278 * it is not optimized. 2279 */ 2280 if (proc_pt_pv) { 2281 pmap_inval_bulk_t bulk; 2282 2283 if (proc_pt_pv->pv_m->wire_count != 1) { 2284 pv_put(proc_pd_pv); 2285 pv_put(proc_pt_pv); 2286 pv_put(pt_pv); 2287 pv_put(pte_pv); 2288 pmap_remove(pmap, 2289 va & ~(vm_offset_t)SEG_MASK, 2290 (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK); 2291 goto retry; 2292 } 2293 2294 /* 2295 * The release call will indirectly clean out *pt 2296 */ 2297 pmap_inval_bulk_init(&bulk, proc_pt_pv->pv_pmap); 2298 pmap_release_pv(proc_pt_pv, proc_pd_pv, &bulk); 2299 pmap_inval_bulk_flush(&bulk); 2300 proc_pt_pv = NULL; 2301 /* relookup */ 2302 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); 2303 } 2304 2305 /* 2306 * Handle remaining cases. 2307 */ 2308 if (*pt == 0) { 2309 *pt = npte; 2310 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2311 vm_page_wire_quick(proc_pd_pv->pv_m); /* proc pd for sh pt */ 2312 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2313 } else if (*pt != npte) { 2314 opte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, pt, npte); 2315 2316 #if 0 2317 opte = pte_load_clear(pt); 2318 KKASSERT(opte && opte != npte); 2319 2320 *pt = npte; 2321 #endif 2322 vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ 2323 2324 /* 2325 * Clean up opte, bump the wire_count for the process 2326 * PD page representing the new entry if it was 2327 * previously empty. 2328 * 2329 * If the entry was not previously empty and we have 2330 * a PT in the proc pmap then opte must match that 2331 * pt. The proc pt must be retired (this is done 2332 * later on in this procedure). 2333 * 2334 * NOTE: replacing valid pte, wire_count on proc_pd_pv 2335 * stays the same. 2336 */ 2337 KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]); 2338 m = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2339 if (vm_page_unwire_quick(m)) { 2340 panic("pmap_allocpte_seg: " 2341 "bad wire count %p", 2342 m); 2343 } 2344 } 2345 2346 /* 2347 * The existing process page table was replaced and must be destroyed 2348 * here. 2349 */ 2350 if (proc_pd_pv) 2351 pv_put(proc_pd_pv); 2352 if (pvpp) 2353 *pvpp = pt_pv; 2354 else 2355 pv_put(pt_pv); 2356 2357 return (pte_pv); 2358 } 2359 2360 /* 2361 * Release any resources held by the given physical map. 2362 * 2363 * Called when a pmap initialized by pmap_pinit is being released. Should 2364 * only be called if the map contains no valid mappings. 2365 * 2366 * Caller must hold pmap->pm_token 2367 */ 2368 struct pmap_release_info { 2369 pmap_t pmap; 2370 int retry; 2371 pv_entry_t pvp; 2372 }; 2373 2374 static int pmap_release_callback(pv_entry_t pv, void *data); 2375 2376 void 2377 pmap_release(struct pmap *pmap) 2378 { 2379 struct pmap_release_info info; 2380 2381 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2382 ("pmap still active! %016jx", 2383 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2384 2385 /* 2386 * There is no longer a pmap_list, if there were we would remove the 2387 * pmap from it here. 2388 */ 2389 2390 /* 2391 * Pull pv's off the RB tree in order from low to high and release 2392 * each page. 
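 *
 * (The loop may need to retry: pmap_release_callback() sets
 * info.retry when a pv it blocked on no longer belongs to this
 * pmap, and info.pvp caches the locked parent page table pv from
 * call to call so its wire count can be adjusted without
 * re-looking it up each time.)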
2393 */ 2394 info.pmap = pmap; 2395 do { 2396 info.retry = 0; 2397 info.pvp = NULL; 2398 2399 spin_lock(&pmap->pm_spin); 2400 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2401 pmap_release_callback, &info); 2402 spin_unlock(&pmap->pm_spin); 2403 2404 if (info.pvp) 2405 pv_put(info.pvp); 2406 } while (info.retry); 2407 2408 2409 /* 2410 * One resident page (the pml4 page) should remain. 2411 * No wired pages should remain. 2412 */ 2413 KKASSERT(pmap->pm_stats.resident_count == 2414 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1)); 2415 2416 KKASSERT(pmap->pm_stats.wired_count == 0); 2417 } 2418 2419 /* 2420 * Called from low to high. We must cache the proper parent pv so we 2421 * can adjust its wired count. 2422 */ 2423 static int 2424 pmap_release_callback(pv_entry_t pv, void *data) 2425 { 2426 struct pmap_release_info *info = data; 2427 pmap_t pmap = info->pmap; 2428 vm_pindex_t pindex; 2429 int r; 2430 2431 if (info->pvp == pv) { 2432 spin_unlock(&pmap->pm_spin); 2433 info->pvp = NULL; 2434 } else if (pv_hold_try(pv)) { 2435 spin_unlock(&pmap->pm_spin); 2436 } else { 2437 spin_unlock(&pmap->pm_spin); 2438 pv_lock(pv); 2439 } 2440 if (pv->pv_pmap != pmap) { 2441 pv_put(pv); 2442 spin_lock(&pmap->pm_spin); 2443 info->retry = 1; 2444 return(-1); 2445 } 2446 2447 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2448 /* 2449 * parent is PT 2450 */ 2451 pindex = pv->pv_pindex >> NPTEPGSHIFT; 2452 pindex += NUPTE_TOTAL; 2453 } else if (pv->pv_pindex < pmap_pd_pindex(0)) { 2454 /* 2455 * parent is PD 2456 */ 2457 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; 2458 pindex += NUPTE_TOTAL + NUPT_TOTAL; 2459 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { 2460 /* 2461 * parent is PDP 2462 */ 2463 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> 2464 NPDPEPGSHIFT; 2465 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2466 } else if (pv->pv_pindex < pmap_pml4_pindex()) { 2467 /* 2468 * parent is PML4 (there's only one) 2469 */ 2470 #if 0 2471 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL - 2472 NUPD_TOTAL) >> NPML4EPGSHIFT; 2473 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL; 2474 #endif 2475 pindex = pmap_pml4_pindex(); 2476 } else { 2477 /* 2478 * parent is NULL 2479 */ 2480 if (info->pvp) { 2481 pv_put(info->pvp); 2482 info->pvp = NULL; 2483 } 2484 pindex = 0; 2485 } 2486 if (pindex) { 2487 if (info->pvp && info->pvp->pv_pindex != pindex) { 2488 pv_put(info->pvp); 2489 info->pvp = NULL; 2490 } 2491 if (info->pvp == NULL) 2492 info->pvp = pv_get(pmap, pindex); 2493 } else { 2494 if (info->pvp) { 2495 pv_put(info->pvp); 2496 info->pvp = NULL; 2497 } 2498 } 2499 r = pmap_release_pv(pv, info->pvp, NULL); 2500 spin_lock(&pmap->pm_spin); 2501 return(r); 2502 } 2503 2504 /* 2505 * Called with held (i.e. also locked) pv. This function will dispose of 2506 * the lock along with the pv. 2507 * 2508 * If the caller already holds the locked parent page table for pv it 2509 * must pass it as pvp, allowing us to avoid a deadlock, else it can 2510 * pass NULL for pvp. 2511 */ 2512 static int 2513 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2514 { 2515 vm_page_t p; 2516 2517 /* 2518 * The pmap is currently not spinlocked, pv is held+locked. 2519 * Remove the pv's page from its parent's page table. The 2520 * parent's page table page's wire_count will be decremented. 2521 * 2522 * This will clean out the pte at any level of the page table. 2523 * If smp != 0 all cpus are affected. 
2524 *
2525 * Do not tear-down recursively, it's faster to just let the
2526 * release run its course.
2527 */
2528 pmap_remove_pv_pte(pv, pvp, bulk, 0);
2529
2530 /*
2531 * Terminal pvs are unhooked from their vm_pages. Because
2532 * terminal pages aren't page table pages they aren't wired
2533 * by us, so we have to be sure not to unwire them either.
2534 */
2535 if (pv->pv_pindex < pmap_pt_pindex(0)) {
2536 pmap_remove_pv_page(pv);
2537 goto skip;
2538 }
2539
2540 /*
2541 * We leave the top-level page table page cached, wired, and
2542 * mapped in the pmap until the dtor function (pmap_puninit())
2543 * gets called.
2544 *
2545 * Since we are leaving the top-level pv intact we need
2546 * to break out of what would otherwise be an infinite loop.
2547 */
2548 if (pv->pv_pindex == pmap_pml4_pindex()) {
2549 pv_put(pv);
2550 return(-1);
2551 }
2552
2553 /*
2554 * For page table pages (other than the top-level page),
2555 * remove and free the vm_page. The representative mapping
2556 * removed above by pmap_remove_pv_pte() did not undo the
2557 * last wire_count so we have to do that as well.
2558 */
2559 p = pmap_remove_pv_page(pv);
2560 vm_page_busy_wait(p, FALSE, "pmaprl");
2561 if (p->wire_count != 1) {
2562 kprintf("p->wire_count was %016lx %d\n",
2563 pv->pv_pindex, p->wire_count);
2564 }
2565 KKASSERT(p->wire_count == 1);
2566 KKASSERT(p->flags & PG_UNMANAGED);
2567
2568 vm_page_unwire(p, 0);
2569 KKASSERT(p->wire_count == 0);
2570
2571 vm_page_free(p);
2572 skip:
2573 pv_free(pv, pvp, 1);
2574
2575 return 0;
2576 }
2577
2578 /*
2579 * This function will remove the pte associated with a pv from its parent.
2580 * Terminal pv's are supported. All cpus specified by (bulk) are properly
2581 * invalidated.
2582 *
2583 * The wire count will be dropped on the parent page table. The wire
2584 * count on the page being removed (pv->pv_m) from the parent page table
2585 * is NOT touched. Note that terminal pages will not have any additional
2586 * wire counts while page table pages will have at least one representing
2587 * the mapping, plus others representing sub-mappings.
2588 *
2589 * NOTE: Cannot be called on kernel page table pages, only KVM terminal
2590 * pages and user page table pages and terminal pages.
2591 *
2592 * The pv must be locked. The pvp, if supplied, must be locked. All
2593 * supplied pv's will remain locked on return.
2594 *
2595 * XXX must lock parent pv's if they exist to remove pte XXX
2596 */
2597 static
2598 void
2599 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
2600 int destroy)
2601 {
2602 vm_pindex_t ptepindex = pv->pv_pindex;
2603 pmap_t pmap = pv->pv_pmap;
2604 vm_page_t p;
2605 int gotpvp = 0;
2606
2607 KKASSERT(pmap);
2608
2609 if (ptepindex == pmap_pml4_pindex()) {
2610 /*
2611 * We are the top level pml4 table, there is no parent.
2612 */
2613 p = pmap->pm_pmlpv->pv_m;
2614 } else if (ptepindex >= pmap_pdp_pindex(0)) {
2615 /*
2616 * Remove a PDP page from the pml4e. This can only occur
2617 * with user page tables. We do not have to lock the
2618 * pml4 PV so just ignore pvp.
2619 */
2620 vm_pindex_t pml4_pindex;
2621 vm_pindex_t pdp_index;
2622 pml4_entry_t *pdp;
2623
2624 pdp_index = ptepindex - pmap_pdp_pindex(0);
2625 if (pvp == NULL) {
2626 pml4_pindex = pmap_pml4_pindex();
2627 pvp = pv_get(pv->pv_pmap, pml4_pindex);
2628 KKASSERT(pvp);
2629 gotpvp = 1;
2630 }
2631 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
2632 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
2633 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2634 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0);
2635 } else if (ptepindex >= pmap_pd_pindex(0)) {
2636 /*
2637 * Remove a PD page from the pdp
2638 *
2639 * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
2640 * of a simple pmap because it stops at
2641 * the PD page.
2642 */
2643 vm_pindex_t pdp_pindex;
2644 vm_pindex_t pd_index;
2645 pdp_entry_t *pd;
2646
2647 pd_index = ptepindex - pmap_pd_pindex(0);
2648
2649 if (pvp == NULL) {
2650 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
2651 (pd_index >> NPML4EPGSHIFT);
2652 pvp = pv_get(pv->pv_pmap, pdp_pindex);
2653 gotpvp = 1;
2654 }
2655 if (pvp) {
2656 pd = pv_pte_lookup(pvp, pd_index &
2657 ((1ul << NPDPEPGSHIFT) - 1));
2658 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0);
2659 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2660 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0);
2661 } else {
2662 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE);
2663 p = pv->pv_m; /* degenerate test later */
2664 }
2665 } else if (ptepindex >= pmap_pt_pindex(0)) {
2666 /*
2667 * Remove a PT page from the pd
2668 */
2669 vm_pindex_t pd_pindex;
2670 vm_pindex_t pt_index;
2671 pd_entry_t *pt;
2672
2673 pt_index = ptepindex - pmap_pt_pindex(0);
2674
2675 if (pvp == NULL) {
2676 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
2677 (pt_index >> NPDPEPGSHIFT);
2678 pvp = pv_get(pv->pv_pmap, pd_pindex);
2679 KKASSERT(pvp);
2680 gotpvp = 1;
2681 }
2682 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
2683 KKASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0);
2684 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
2685 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0);
2686 } else {
2687 /*
2688 * Remove a managed PTE from the PT page. Userland pmaps
2689 * manage PT/PD/PDP page table pages but the kernel_pmap
2690 * does not.
2691 *
2692 * NOTE: pv's must be locked bottom-up to avoid deadlocking.
2693 * pv is a pte_pv so we can safely lock pt_pv.
2694 *
2695 * NOTE: FICTITIOUS pages may have multiple physical mappings
2696 * so PHYS_TO_VM_PAGE() will not necessarily work for
2697 * terminal ptes.
2698 */ 2699 vm_pindex_t pt_pindex; 2700 pt_entry_t *ptep; 2701 pt_entry_t pte; 2702 vm_offset_t va; 2703 2704 pt_pindex = ptepindex >> NPTEPGSHIFT; 2705 va = (vm_offset_t)ptepindex << PAGE_SHIFT; 2706 2707 if (ptepindex >= NUPTE_USER) { 2708 ptep = vtopte(ptepindex << PAGE_SHIFT); 2709 KKASSERT(pvp == NULL); 2710 /* pvp remains NULL */ 2711 } else { 2712 if (pvp == NULL) { 2713 pt_pindex = NUPTE_TOTAL + 2714 (ptepindex >> NPDPEPGSHIFT); 2715 pvp = pv_get(pv->pv_pmap, pt_pindex); 2716 KKASSERT(pvp); 2717 gotpvp = 1; 2718 } 2719 ptep = pv_pte_lookup(pvp, ptepindex & 2720 ((1ul << NPDPEPGSHIFT) - 1)); 2721 } 2722 pte = pmap_inval_bulk(bulk, va, ptep, 0); 2723 if (bulk == NULL) /* XXX */ 2724 cpu_invlpg((void *)va); /* XXX */ 2725 2726 /* 2727 * Now update the vm_page_t 2728 */ 2729 if ((pte & (pmap->pmap_bits[PG_MANAGED_IDX] | 2730 pmap->pmap_bits[PG_V_IDX])) != 2731 (pmap->pmap_bits[PG_MANAGED_IDX] | 2732 pmap->pmap_bits[PG_V_IDX])) { 2733 kprintf("remove_pte badpte %016lx %016lx %d\n", 2734 pte, pv->pv_pindex, 2735 pv->pv_pindex < pmap_pt_pindex(0)); 2736 } 2737 2738 /* PHYS_TO_VM_PAGE() will not work for FICTITIOUS pages */ 2739 /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/ 2740 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) 2741 p = pv->pv_m; 2742 else 2743 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 2744 /* p = pv->pv_m; */ 2745 2746 if (pte & pmap->pmap_bits[PG_M_IDX]) { 2747 if (pmap_track_modified(ptepindex)) 2748 vm_page_dirty(p); 2749 } 2750 if (pte & pmap->pmap_bits[PG_A_IDX]) { 2751 vm_page_flag_set(p, PG_REFERENCED); 2752 } 2753 if (pte & pmap->pmap_bits[PG_W_IDX]) 2754 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2755 if (pte & pmap->pmap_bits[PG_G_IDX]) 2756 cpu_invlpg((void *)va); 2757 } 2758 KKASSERT(pv->pv_m == p); /* XXX remove me later */ 2759 2760 /* 2761 * If requested, scrap the underlying pv->pv_m and the underlying 2762 * pv. If this is a page-table-page we must also free the page. 2763 * 2764 * pvp must be returned locked. 2765 */ 2766 if (destroy == 1) { 2767 /* 2768 * page table page (PT, PD, PDP, PML4), caller was responsible 2769 * for testing wired_count. 2770 */ 2771 vm_page_t p; 2772 2773 KKASSERT(pv->pv_m->wire_count == 1); 2774 p = pmap_remove_pv_page(pv); 2775 pv_free(pv, pvp, 1); 2776 pv = NULL; 2777 2778 KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); 2779 vm_page_busy_wait(p, FALSE, "pgpun"); 2780 vm_page_unwire(p, 0); 2781 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2782 vm_page_free(p); 2783 } else if (destroy == 2) { 2784 /* 2785 * Normal page, remove from pmap and leave the underlying 2786 * page untouched. 2787 */ 2788 pmap_remove_pv_page(pv); 2789 pv_free(pv, pvp, 1); 2790 pv = NULL; /* safety */ 2791 } 2792 2793 /* 2794 * If we acquired pvp ourselves then we are responsible for 2795 * recursively deleting it. 2796 */ 2797 if (pvp && gotpvp) { 2798 /* 2799 * Recursively destroy higher-level page tables. 2800 * 2801 * This is optional. If we do not, they will still 2802 * be destroyed when the process exits. 2803 * 2804 * NOTE: Do not destroy pv_entry's with extra hold refs, 2805 * a caller may have unlocked it and intends to 2806 * continue to use it. 
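 *
 * The guard below is deliberately conservative: pvp is only torn
 * down when pmap_dynamic_delete is enabled, its page has a single
 * remaining wire_count, its hold count is exactly 2 (our lock plus
 * the pmap ref), and it is not the top-level PML4 pv.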
2807 */ 2808 if (pmap_dynamic_delete && 2809 pvp->pv_m && 2810 pvp->pv_m->wire_count == 1 && 2811 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 2812 pvp->pv_pindex != pmap_pml4_pindex()) { 2813 if (pmap_dynamic_delete == 2) 2814 kprintf("A %jd %08x\n", pvp->pv_pindex, pvp->pv_hold); 2815 if (pmap != &kernel_pmap) { 2816 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 2817 pvp = NULL; /* safety */ 2818 } else { 2819 kprintf("Attempt to remove kernel_pmap pindex " 2820 "%jd\n", pvp->pv_pindex); 2821 pv_put(pvp); 2822 } 2823 } else { 2824 pv_put(pvp); 2825 } 2826 } 2827 } 2828 2829 /* 2830 * Remove the vm_page association to a pv. The pv must be locked. 2831 */ 2832 static 2833 vm_page_t 2834 pmap_remove_pv_page(pv_entry_t pv) 2835 { 2836 vm_page_t m; 2837 2838 m = pv->pv_m; 2839 KKASSERT(m); 2840 vm_page_spin_lock(m); 2841 pv->pv_m = NULL; 2842 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2843 pmap_page_stats_deleting(m); 2844 /* 2845 if (m->object) 2846 atomic_add_int(&m->object->agg_pv_list_count, -1); 2847 */ 2848 if (TAILQ_EMPTY(&m->md.pv_list)) 2849 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2850 vm_page_spin_unlock(m); 2851 2852 return(m); 2853 } 2854 2855 /* 2856 * Grow the number of kernel page table entries, if needed. 2857 * 2858 * This routine is always called to validate any address space 2859 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 2860 * space below KERNBASE. 2861 */ 2862 void 2863 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 2864 { 2865 vm_paddr_t paddr; 2866 vm_offset_t ptppaddr; 2867 vm_page_t nkpg; 2868 pd_entry_t *pt, newpt; 2869 pdp_entry_t newpd; 2870 int update_kernel_vm_end; 2871 2872 /* 2873 * bootstrap kernel_vm_end on first real VM use 2874 */ 2875 if (kernel_vm_end == 0) { 2876 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 2877 nkpt = 0; 2878 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 2879 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 2880 ~(PAGE_SIZE * NPTEPG - 1); 2881 nkpt++; 2882 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 2883 kernel_vm_end = kernel_map.max_offset; 2884 break; 2885 } 2886 } 2887 } 2888 2889 /* 2890 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 2891 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 2892 * do not want to force-fill 128G worth of page tables. 
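 *
 * (For kstart >= KERNBASE, i.e. kldload space, only the requested
 * range is filled in and kernel_vm_end is deliberately left
 * untouched; update_kernel_vm_end stays 0 in that case.)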
2893 */ 2894 if (kstart < KERNBASE) { 2895 if (kstart > kernel_vm_end) 2896 kstart = kernel_vm_end; 2897 KKASSERT(kend <= KERNBASE); 2898 update_kernel_vm_end = 1; 2899 } else { 2900 update_kernel_vm_end = 0; 2901 } 2902 2903 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 2904 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 2905 2906 if (kend - 1 >= kernel_map.max_offset) 2907 kend = kernel_map.max_offset; 2908 2909 while (kstart < kend) { 2910 pt = pmap_pt(&kernel_pmap, kstart); 2911 if (pt == NULL) { 2912 /* We need a new PD entry */ 2913 nkpg = vm_page_alloc(NULL, nkpt, 2914 VM_ALLOC_NORMAL | 2915 VM_ALLOC_SYSTEM | 2916 VM_ALLOC_INTERRUPT); 2917 if (nkpg == NULL) { 2918 panic("pmap_growkernel: no memory to grow " 2919 "kernel"); 2920 } 2921 paddr = VM_PAGE_TO_PHYS(nkpg); 2922 pmap_zero_page(paddr); 2923 newpd = (pdp_entry_t) 2924 (paddr | 2925 kernel_pmap.pmap_bits[PG_V_IDX] | 2926 kernel_pmap.pmap_bits[PG_RW_IDX] | 2927 kernel_pmap.pmap_bits[PG_A_IDX] | 2928 kernel_pmap.pmap_bits[PG_M_IDX]); 2929 *pmap_pd(&kernel_pmap, kstart) = newpd; 2930 nkpt++; 2931 continue; /* try again */ 2932 } 2933 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 2934 kstart = (kstart + PAGE_SIZE * NPTEPG) & 2935 ~(PAGE_SIZE * NPTEPG - 1); 2936 if (kstart - 1 >= kernel_map.max_offset) { 2937 kstart = kernel_map.max_offset; 2938 break; 2939 } 2940 continue; 2941 } 2942 2943 /* 2944 * We need a new PT 2945 * 2946 * This index is bogus, but out of the way 2947 */ 2948 nkpg = vm_page_alloc(NULL, nkpt, 2949 VM_ALLOC_NORMAL | 2950 VM_ALLOC_SYSTEM | 2951 VM_ALLOC_INTERRUPT); 2952 if (nkpg == NULL) 2953 panic("pmap_growkernel: no memory to grow kernel"); 2954 2955 vm_page_wire(nkpg); 2956 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2957 pmap_zero_page(ptppaddr); 2958 newpt = (pd_entry_t) (ptppaddr | 2959 kernel_pmap.pmap_bits[PG_V_IDX] | 2960 kernel_pmap.pmap_bits[PG_RW_IDX] | 2961 kernel_pmap.pmap_bits[PG_A_IDX] | 2962 kernel_pmap.pmap_bits[PG_M_IDX]); 2963 *pmap_pt(&kernel_pmap, kstart) = newpt; 2964 nkpt++; 2965 2966 kstart = (kstart + PAGE_SIZE * NPTEPG) & 2967 ~(PAGE_SIZE * NPTEPG - 1); 2968 2969 if (kstart - 1 >= kernel_map.max_offset) { 2970 kstart = kernel_map.max_offset; 2971 break; 2972 } 2973 } 2974 2975 /* 2976 * Only update kernel_vm_end for areas below KERNBASE. 2977 */ 2978 if (update_kernel_vm_end && kernel_vm_end < kstart) 2979 kernel_vm_end = kstart; 2980 } 2981 2982 /* 2983 * Add a reference to the specified pmap. 2984 */ 2985 void 2986 pmap_reference(pmap_t pmap) 2987 { 2988 if (pmap != NULL) { 2989 lwkt_gettoken(&pmap->pm_token); 2990 ++pmap->pm_count; 2991 lwkt_reltoken(&pmap->pm_token); 2992 } 2993 } 2994 2995 /*************************************************** 2996 * page management routines. 2997 ***************************************************/ 2998 2999 /* 3000 * Hold a pv without locking it 3001 */ 3002 static void 3003 pv_hold(pv_entry_t pv) 3004 { 3005 atomic_add_int(&pv->pv_hold, 1); 3006 } 3007 3008 /* 3009 * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv 3010 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3011 * the pv properly. 3012 * 3013 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3014 * pv list via its page) must be held by the caller. 3015 */ 3016 static int 3017 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3018 { 3019 u_int count; 3020 3021 /* 3022 * Critical path shortcut expects pv to already have one ref 3023 * (for the pv->pv_pmap). 
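 *
 * (pv_hold encoding, as used by the pv_*() routines here: the low
 * PV_HOLD_MASK bits are the hold count, PV_HOLD_LOCKED marks the
 * pv locked, and PV_HOLD_WAITING marks a sleeper. The cmpset from
 * 1 to (PV_HOLD_LOCKED | 2) therefore takes a hold and the lock in
 * a single atomic op.)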
3024 */ 3025 if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) { 3026 #ifdef PMAP_DEBUG 3027 pv->pv_func = func; 3028 pv->pv_line = lineno; 3029 #endif 3030 return TRUE; 3031 } 3032 3033 for (;;) { 3034 count = pv->pv_hold; 3035 cpu_ccfence(); 3036 if ((count & PV_HOLD_LOCKED) == 0) { 3037 if (atomic_cmpset_int(&pv->pv_hold, count, 3038 (count + 1) | PV_HOLD_LOCKED)) { 3039 #ifdef PMAP_DEBUG 3040 pv->pv_func = func; 3041 pv->pv_line = lineno; 3042 #endif 3043 return TRUE; 3044 } 3045 } else { 3046 if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) 3047 return FALSE; 3048 } 3049 /* retry */ 3050 } 3051 } 3052 3053 /* 3054 * Drop a previously held pv_entry which could not be locked, allowing its 3055 * destruction. 3056 * 3057 * Must not be called with a spinlock held as we might zfree() the pv if it 3058 * is no longer associated with a pmap and this was the last hold count. 3059 */ 3060 static void 3061 pv_drop(pv_entry_t pv) 3062 { 3063 u_int count; 3064 3065 for (;;) { 3066 count = pv->pv_hold; 3067 cpu_ccfence(); 3068 KKASSERT((count & PV_HOLD_MASK) > 0); 3069 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 3070 (PV_HOLD_LOCKED | 1)); 3071 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 3072 if ((count & PV_HOLD_MASK) == 1) { 3073 #ifdef PMAP_DEBUG2 3074 if (pmap_enter_debug > 0) { 3075 --pmap_enter_debug; 3076 kprintf("pv_drop: free pv %p\n", pv); 3077 } 3078 #endif 3079 KKASSERT(count == 1); 3080 KKASSERT(pv->pv_pmap == NULL); 3081 zfree(pvzone, pv); 3082 } 3083 return; 3084 } 3085 /* retry */ 3086 } 3087 } 3088 3089 /* 3090 * Find or allocate the requested PV entry, returning a locked, held pv. 3091 * 3092 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 3093 * for the caller and one representing the pmap and vm_page association. 3094 * 3095 * If (*isnew) is zero, the returned pv will have only one hold count. 3096 * 3097 * Since both associations can only be adjusted while the pv is locked, 3098 * together they represent just one additional hold. 
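 *
 * (Allocation pattern note: the zone allocation is performed with
 * pm_spin dropped and the lookup retried afterwards; if another
 * thread won the race to insert, the speculative pnew is simply
 * zfree()d.)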
3099 */ 3100 static 3101 pv_entry_t 3102 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 3103 { 3104 pv_entry_t pv; 3105 pv_entry_t pnew = NULL; 3106 3107 spin_lock(&pmap->pm_spin); 3108 for (;;) { 3109 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { 3110 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 3111 pindex); 3112 } 3113 if (pv == NULL) { 3114 if (pnew == NULL) { 3115 spin_unlock(&pmap->pm_spin); 3116 pnew = zalloc(pvzone); 3117 spin_lock(&pmap->pm_spin); 3118 continue; 3119 } 3120 pnew->pv_pmap = pmap; 3121 pnew->pv_pindex = pindex; 3122 pnew->pv_hold = PV_HOLD_LOCKED | 2; 3123 #ifdef PMAP_DEBUG 3124 pnew->pv_func = func; 3125 pnew->pv_line = lineno; 3126 #endif 3127 pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 3128 ++pmap->pm_generation; 3129 atomic_add_long(&pmap->pm_stats.resident_count, 1); 3130 spin_unlock(&pmap->pm_spin); 3131 *isnew = 1; 3132 return(pnew); 3133 } 3134 if (pnew) { 3135 spin_unlock(&pmap->pm_spin); 3136 zfree(pvzone, pnew); 3137 pnew = NULL; 3138 spin_lock(&pmap->pm_spin); 3139 continue; 3140 } 3141 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3142 spin_unlock(&pmap->pm_spin); 3143 } else { 3144 spin_unlock(&pmap->pm_spin); 3145 _pv_lock(pv PMAP_DEBUG_COPY); 3146 } 3147 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) { 3148 *isnew = 0; 3149 return(pv); 3150 } 3151 pv_put(pv); 3152 spin_lock(&pmap->pm_spin); 3153 } 3154 } 3155 3156 /* 3157 * Find the requested PV entry, returning a locked+held pv or NULL 3158 */ 3159 static 3160 pv_entry_t 3161 _pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL) 3162 { 3163 pv_entry_t pv; 3164 3165 spin_lock(&pmap->pm_spin); 3166 for (;;) { 3167 /* 3168 * Shortcut cache 3169 */ 3170 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { 3171 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, 3172 pindex); 3173 } 3174 if (pv == NULL) { 3175 spin_unlock(&pmap->pm_spin); 3176 return NULL; 3177 } 3178 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3179 spin_unlock(&pmap->pm_spin); 3180 } else { 3181 spin_unlock(&pmap->pm_spin); 3182 _pv_lock(pv PMAP_DEBUG_COPY); 3183 } 3184 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) { 3185 pv_cache(pv, pindex); 3186 return(pv); 3187 } 3188 pv_put(pv); 3189 spin_lock(&pmap->pm_spin); 3190 } 3191 } 3192 3193 /* 3194 * Lookup, hold, and attempt to lock (pmap,pindex). 3195 * 3196 * If the entry does not exist NULL is returned and *errorp is set to 0 3197 * 3198 * If the entry exists and could be successfully locked it is returned and 3199 * errorp is set to 0. 3200 * 3201 * If the entry exists but could NOT be successfully locked it is returned 3202 * held and *errorp is set to 1. 
3203 */ 3204 static 3205 pv_entry_t 3206 pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp) 3207 { 3208 pv_entry_t pv; 3209 3210 spin_lock_shared(&pmap->pm_spin); 3211 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) 3212 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 3213 if (pv == NULL) { 3214 spin_unlock_shared(&pmap->pm_spin); 3215 *errorp = 0; 3216 return NULL; 3217 } 3218 if (pv_hold_try(pv)) { 3219 pv_cache(pv, pindex); 3220 spin_unlock_shared(&pmap->pm_spin); 3221 *errorp = 0; 3222 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3223 return(pv); /* lock succeeded */ 3224 } 3225 spin_unlock_shared(&pmap->pm_spin); 3226 *errorp = 1; 3227 return (pv); /* lock failed */ 3228 } 3229 3230 /* 3231 * Find the requested PV entry, returning a held pv or NULL 3232 */ 3233 static 3234 pv_entry_t 3235 pv_find(pmap_t pmap, vm_pindex_t pindex) 3236 { 3237 pv_entry_t pv; 3238 3239 spin_lock_shared(&pmap->pm_spin); 3240 3241 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) 3242 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 3243 if (pv == NULL) { 3244 spin_unlock_shared(&pmap->pm_spin); 3245 return NULL; 3246 } 3247 pv_hold(pv); 3248 pv_cache(pv, pindex); 3249 spin_unlock_shared(&pmap->pm_spin); 3250 return(pv); 3251 } 3252 3253 /* 3254 * Lock a held pv, keeping the hold count 3255 */ 3256 static 3257 void 3258 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3259 { 3260 u_int count; 3261 3262 for (;;) { 3263 count = pv->pv_hold; 3264 cpu_ccfence(); 3265 if ((count & PV_HOLD_LOCKED) == 0) { 3266 if (atomic_cmpset_int(&pv->pv_hold, count, 3267 count | PV_HOLD_LOCKED)) { 3268 #ifdef PMAP_DEBUG 3269 pv->pv_func = func; 3270 pv->pv_line = lineno; 3271 #endif 3272 return; 3273 } 3274 continue; 3275 } 3276 tsleep_interlock(pv, 0); 3277 if (atomic_cmpset_int(&pv->pv_hold, count, 3278 count | PV_HOLD_WAITING)) { 3279 #ifdef PMAP_DEBUG 3280 kprintf("pv waiting on %s:%d\n", 3281 pv->pv_func, pv->pv_line); 3282 #endif 3283 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3284 } 3285 /* retry */ 3286 } 3287 } 3288 3289 /* 3290 * Unlock a held and locked pv, keeping the hold count. 3291 */ 3292 static 3293 void 3294 pv_unlock(pv_entry_t pv) 3295 { 3296 u_int count; 3297 3298 for (;;) { 3299 count = pv->pv_hold; 3300 cpu_ccfence(); 3301 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3302 (PV_HOLD_LOCKED | 1)); 3303 if (atomic_cmpset_int(&pv->pv_hold, count, 3304 count & 3305 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3306 if (count & PV_HOLD_WAITING) 3307 wakeup(pv); 3308 break; 3309 } 3310 } 3311 } 3312 3313 /* 3314 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3315 * and the hold count drops to zero we will free it. 3316 * 3317 * Caller should not hold any spin locks. We are protected from hold races 3318 * by virtue of holds only occuring only with a pmap_spin or vm_page_spin 3319 * lock held. A pv cannot be located otherwise. 3320 */ 3321 static 3322 void 3323 pv_put(pv_entry_t pv) 3324 { 3325 #ifdef PMAP_DEBUG2 3326 if (pmap_enter_debug > 0) { 3327 --pmap_enter_debug; 3328 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3329 } 3330 #endif 3331 3332 /* 3333 * Fast - shortcut most common condition 3334 */ 3335 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3336 return; 3337 3338 /* 3339 * Slow 3340 */ 3341 pv_unlock(pv); 3342 pv_drop(pv); 3343 } 3344 3345 /* 3346 * Remove the pmap association from a pv, require that pv_m already be removed, 3347 * then unlock and drop the pv. 
Any pte operations must have already been
3348 * completed. This call may result in a last-drop which will physically free
3349 * the pv.
3350 *
3351 * Removing the pmap association entails an additional drop.
3352 *
3353 * pv must be exclusively locked on call and will be disposed of on return.
3354 */
3355 static
3356 void
3357 pv_free(pv_entry_t pv, pv_entry_t pvp, int putaway)
3358 {
3359 pmap_t pmap;
3360
3361 KKASSERT(pv->pv_m == NULL);
3362 KKASSERT((pv->pv_hold & PV_HOLD_MASK) >= 2);
3363 if ((pmap = pv->pv_pmap) != NULL) {
3364 spin_lock(&pmap->pm_spin);
3365 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3366 ++pmap->pm_generation;
3367 if (pmap->pm_pvhint == pv)
3368 pmap->pm_pvhint = NULL;
3369 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3370 pv->pv_pmap = NULL;
3371 pv->pv_pindex = 0;
3372 spin_unlock(&pmap->pm_spin);
3373
3374 /*
3375 * Try to shortcut three atomic ops, otherwise fall through
3376 * and do it normally. Drop two refs and the lock all in
3377 * one go.
3378 */
3379 if (putaway &&
3380 atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3381 #ifdef PMAP_DEBUG2
3382 if (pmap_enter_debug > 0) {
3383 --pmap_enter_debug;
3384 kprintf("pv_free: free pv %p\n", pv);
3385 }
3386 #endif
3387 zfree(pvzone, pv);
3388 if (pvp)
3389 vm_page_unwire_quick(pvp->pv_m);
3390 return;
3391 }
3392 pv_drop(pv); /* ref for pv_pmap */
3393 }
3394 if (putaway)
3395 pv_put(pv);
3396 if (pvp)
3397 vm_page_unwire_quick(pvp->pv_m);
3398 }
3399
3400 /*
3401 * This routine is very drastic, but can save the system
3402 * in a pinch.
3403 */
3404 void
3405 pmap_collect(void)
3406 {
3407 int i;
3408 vm_page_t m;
3409 static int warningdone=0;
3410
3411 if (pmap_pagedaemon_waken == 0)
3412 return;
3413 pmap_pagedaemon_waken = 0;
3414 if (warningdone < 5) {
3415 kprintf("pmap_collect: collecting pv entries -- "
3416 "suggest increasing PMAP_SHPGPERPROC\n");
3417 warningdone++;
3418 }
3419
3420 for (i = 0; i < vm_page_array_size; i++) {
3421 m = &vm_page_array[i];
3422 if (m->wire_count || m->hold_count)
3423 continue;
3424 if (vm_page_busy_try(m, TRUE) == 0) {
3425 if (m->wire_count == 0 && m->hold_count == 0) {
3426 pmap_remove_all(m);
3427 }
3428 vm_page_wakeup(m);
3429 }
3430 }
3431 }
3432
3433 /*
3434 * Scan the pmap for active page table entries and issue a callback.
3435 * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
3436 * its parent page table.
3437 *
3438 * pte_pv will be NULL if the page or page table is unmanaged.
3439 * pt_pv will point to the page table page containing the pte for the page.
3440 *
3441 * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
3442 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3443 * process pmap's PD and page to the callback function. This can be
3444 * confusing because the pt_pv is really a pd_pv, and the target page
3445 * table page is simply aliased by the pmap and not owned by it.
3446 *
3447 * It is assumed that the start and end are properly rounded to the page size.
3448 *
3449 * It is assumed that PD pages and above are managed and thus in the RB tree,
3450 * allowing us to use RB_SCAN from the PD pages down for ranged scans.
3451 */ 3452 struct pmap_scan_info { 3453 struct pmap *pmap; 3454 vm_offset_t sva; 3455 vm_offset_t eva; 3456 vm_pindex_t sva_pd_pindex; 3457 vm_pindex_t eva_pd_pindex; 3458 void (*func)(pmap_t, struct pmap_scan_info *, 3459 pv_entry_t, pv_entry_t, int, vm_offset_t, 3460 pt_entry_t *, void *); 3461 void *arg; 3462 pmap_inval_bulk_t bulk_core; 3463 pmap_inval_bulk_t *bulk; 3464 int count; 3465 int stop; 3466 }; 3467 3468 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3469 static int pmap_scan_callback(pv_entry_t pv, void *data); 3470 3471 static void 3472 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3473 { 3474 struct pmap *pmap = info->pmap; 3475 pv_entry_t pd_pv; /* A page directory PV */ 3476 pv_entry_t pt_pv; /* A page table PV */ 3477 pv_entry_t pte_pv; /* A page table entry PV */ 3478 pt_entry_t *ptep; 3479 pt_entry_t oldpte; 3480 struct pv_entry dummy_pv; 3481 int generation; 3482 3483 info->stop = 0; 3484 if (pmap == NULL) 3485 return; 3486 if (smp_inval) { 3487 info->bulk = &info->bulk_core; 3488 pmap_inval_bulk_init(&info->bulk_core, pmap); 3489 } else { 3490 info->bulk = NULL; 3491 } 3492 3493 /* 3494 * Hold the token for stability; if the pmap is empty we have nothing 3495 * to do. 3496 */ 3497 lwkt_gettoken(&pmap->pm_token); 3498 #if 0 3499 if (pmap->pm_stats.resident_count == 0) { 3500 lwkt_reltoken(&pmap->pm_token); 3501 return; 3502 } 3503 #endif 3504 3505 info->count = 0; 3506 3507 again: 3508 /* 3509 * Special handling for scanning one page, which is a very common 3510 * operation (it is?). 3511 * 3512 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3513 */ 3514 if (info->sva + PAGE_SIZE == info->eva) { 3515 generation = pmap->pm_generation; 3516 if (info->sva >= VM_MAX_USER_ADDRESS) { 3517 /* 3518 * Kernel mappings do not track wire counts on 3519 * page table pages and only maintain pd_pv and 3520 * pte_pv levels so pmap_scan() works. 3521 */ 3522 pt_pv = NULL; 3523 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva)); 3524 ptep = vtopte(info->sva); 3525 } else { 3526 /* 3527 * User pages which are unmanaged will not have a 3528 * pte_pv. User page table pages which are unmanaged 3529 * (shared from elsewhere) will also not have a pt_pv. 3530 * The func() callback will pass both pte_pv and pt_pv 3531 * as NULL in that case. 3532 */ 3533 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva)); 3534 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva)); 3535 if (pt_pv == NULL) { 3536 KKASSERT(pte_pv == NULL); 3537 pd_pv = pv_get(pmap, pmap_pd_pindex(info->sva)); 3538 if (pd_pv) { 3539 ptep = pv_pte_lookup(pd_pv, 3540 pmap_pt_index(info->sva)); 3541 if (*ptep) { 3542 info->func(pmap, info, 3543 NULL, pd_pv, 1, 3544 info->sva, ptep, 3545 info->arg); 3546 } 3547 pv_put(pd_pv); 3548 } 3549 goto fast_skip; 3550 } 3551 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 3552 } 3553 3554 /* 3555 * NOTE: *ptep can't be ripped out from under us if we hold 3556 * pte_pv locked, but bits can change. However, there is 3557 * a race where another thread may be inserting pte_pv 3558 * and setting *ptep just after our pte_pv lookup fails. 3559 * 3560 * In this situation we can end up with a NULL pte_pv 3561 * but find that we have a managed *ptep. We explicitly 3562 * check for this race. 3563 */ 3564 oldpte = *ptep; 3565 cpu_ccfence(); 3566 if (oldpte == 0) { 3567 /* 3568 * Unlike the pv_find() case below we actually 3569 * acquired a locked pv in this case so any 3570 * race should have been resolved. It is expected 3571 * to not exist. 
3572 */ 3573 KKASSERT(pte_pv == NULL); 3574 } else if (pte_pv) { 3575 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3576 pmap->pmap_bits[PG_V_IDX])) == 3577 (pmap->pmap_bits[PG_MANAGED_IDX] | 3578 pmap->pmap_bits[PG_V_IDX]), 3579 ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p" 3580 "generation %d/%d", 3581 *ptep, oldpte, info->sva, pte_pv, 3582 generation, pmap->pm_generation)); 3583 info->func(pmap, info, pte_pv, pt_pv, 0, 3584 info->sva, ptep, info->arg); 3585 } else { 3586 /* 3587 * Check for insertion race 3588 */ 3589 if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) && 3590 pt_pv) { 3591 pte_pv = pv_find(pmap, 3592 pmap_pte_pindex(info->sva)); 3593 if (pte_pv) { 3594 pv_drop(pte_pv); 3595 pv_put(pt_pv); 3596 kprintf("pmap_scan: RACE1 " 3597 "%016jx, %016lx\n", 3598 info->sva, oldpte); 3599 goto again; 3600 } 3601 } 3602 3603 /* 3604 * Didn't race 3605 */ 3606 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | 3607 pmap->pmap_bits[PG_V_IDX])) == 3608 pmap->pmap_bits[PG_V_IDX], 3609 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL" 3610 "generation %d/%d", 3611 *ptep, oldpte, info->sva, 3612 generation, pmap->pm_generation)); 3613 info->func(pmap, info, NULL, pt_pv, 0, 3614 info->sva, ptep, info->arg); 3615 } 3616 if (pt_pv) 3617 pv_put(pt_pv); 3618 fast_skip: 3619 pmap_inval_bulk_flush(info->bulk); 3620 lwkt_reltoken(&pmap->pm_token); 3621 return; 3622 } 3623 3624 /* 3625 * Nominal scan case, RB_SCAN() for PD pages and iterate from 3626 * there. 3627 */ 3628 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 3629 info->eva_pd_pindex = pmap_pd_pindex(info->eva + NBPDP - 1); 3630 3631 if (info->sva >= VM_MAX_USER_ADDRESS) { 3632 /* 3633 * The kernel does not currently maintain any pv_entry's for 3634 * higher-level page tables. 3635 */ 3636 bzero(&dummy_pv, sizeof(dummy_pv)); 3637 dummy_pv.pv_pindex = info->sva_pd_pindex; 3638 spin_lock(&pmap->pm_spin); 3639 while (dummy_pv.pv_pindex < info->eva_pd_pindex) { 3640 pmap_scan_callback(&dummy_pv, info); 3641 ++dummy_pv.pv_pindex; 3642 } 3643 spin_unlock(&pmap->pm_spin); 3644 } else { 3645 /* 3646 * User page tables maintain local PML4, PDP, and PD 3647 * pv_entry's at the very least. PT pv's might be 3648 * unmanaged and thus not exist. PTE pv's might be 3649 * unmanaged and thus not exist. 3650 */ 3651 spin_lock(&pmap->pm_spin); 3652 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, 3653 pmap_scan_cmp, pmap_scan_callback, info); 3654 spin_unlock(&pmap->pm_spin); 3655 } 3656 pmap_inval_bulk_flush(info->bulk); 3657 lwkt_reltoken(&pmap->pm_token); 3658 } 3659 3660 /* 3661 * WARNING! pmap->pm_spin held 3662 */ 3663 static int 3664 pmap_scan_cmp(pv_entry_t pv, void *data) 3665 { 3666 struct pmap_scan_info *info = data; 3667 if (pv->pv_pindex < info->sva_pd_pindex) 3668 return(-1); 3669 if (pv->pv_pindex >= info->eva_pd_pindex) 3670 return(1); 3671 return(0); 3672 } 3673 3674 /* 3675 * WARNING! 
pmap->pm_spin held 3676 */ 3677 static int 3678 pmap_scan_callback(pv_entry_t pv, void *data) 3679 { 3680 struct pmap_scan_info *info = data; 3681 struct pmap *pmap = info->pmap; 3682 pv_entry_t pd_pv; /* A page directory PV */ 3683 pv_entry_t pt_pv; /* A page table PV */ 3684 pv_entry_t pte_pv; /* A page table entry PV */ 3685 pt_entry_t *ptep; 3686 pt_entry_t oldpte; 3687 vm_offset_t sva; 3688 vm_offset_t eva; 3689 vm_offset_t va_next; 3690 vm_pindex_t pd_pindex; 3691 int error; 3692 int generation; 3693 3694 /* 3695 * Stop if requested 3696 */ 3697 if (info->stop) 3698 return -1; 3699 3700 /* 3701 * Pull the PD pindex from the pv before releasing the spinlock. 3702 * 3703 * WARNING: pv is faked for kernel pmap scans. 3704 */ 3705 pd_pindex = pv->pv_pindex; 3706 spin_unlock(&pmap->pm_spin); 3707 pv = NULL; /* invalid after spinlock unlocked */ 3708 3709 /* 3710 * Calculate the page range within the PD. SIMPLE pmaps are 3711 * direct-mapped for the entire 2^64 address space. Normal pmaps 3712 * reflect the user and kernel address space which requires 3713 * cannonicalization w/regards to converting pd_pindex's back 3714 * into addresses. 3715 */ 3716 sva = (pd_pindex - NUPTE_TOTAL - NUPT_TOTAL) << PDPSHIFT; 3717 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 3718 (sva & PML4_SIGNMASK)) { 3719 sva |= PML4_SIGNMASK; 3720 } 3721 eva = sva + NBPDP; /* can overflow */ 3722 if (sva < info->sva) 3723 sva = info->sva; 3724 if (eva < info->sva || eva > info->eva) 3725 eva = info->eva; 3726 3727 /* 3728 * NOTE: kernel mappings do not track page table pages, only 3729 * terminal pages. 3730 * 3731 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 3732 * However, for the scan to be efficient we try to 3733 * cache items top-down. 3734 */ 3735 pd_pv = NULL; 3736 pt_pv = NULL; 3737 3738 for (; sva < eva; sva = va_next) { 3739 if (info->stop) 3740 break; 3741 if (sva >= VM_MAX_USER_ADDRESS) { 3742 if (pt_pv) { 3743 pv_put(pt_pv); 3744 pt_pv = NULL; 3745 } 3746 goto kernel_skip; 3747 } 3748 3749 /* 3750 * PD cache (degenerate case if we skip). It is possible 3751 * for the PD to not exist due to races. This is ok. 3752 */ 3753 if (pd_pv == NULL) { 3754 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3755 } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 3756 pv_put(pd_pv); 3757 pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); 3758 } 3759 if (pd_pv == NULL) { 3760 va_next = (sva + NBPDP) & ~PDPMASK; 3761 if (va_next < sva) 3762 va_next = eva; 3763 continue; 3764 } 3765 3766 /* 3767 * PT cache 3768 */ 3769 if (pt_pv == NULL) { 3770 vm_page_wire_quick(pd_pv->pv_m); 3771 pv_unlock(pd_pv); 3772 pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); 3773 pv_lock(pd_pv); 3774 vm_page_unwire_quick(pd_pv->pv_m); 3775 } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) { 3776 vm_page_wire_quick(pd_pv->pv_m); 3777 pv_unlock(pd_pv); 3778 pv_put(pt_pv); 3779 pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); 3780 pv_lock(pd_pv); 3781 vm_page_unwire_quick(pd_pv->pv_m); 3782 } 3783 3784 /* 3785 * If pt_pv is NULL we either have an shared page table 3786 * page and must issue a callback specific to that case, 3787 * or there is no page table page. 3788 * 3789 * Either way we can skip the page table page. 3790 */ 3791 if (pt_pv == NULL) { 3792 /* 3793 * Possible unmanaged (shared from another pmap) 3794 * page table page. 
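 *
 * In this case the callback is invoked with pte_pv == NULL and
 * sharept == 1, with pd_pv standing in for pt_pv; see the shared
 * page table branch of pmap_remove_callback().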
3795 */ 3796 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 3797 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 3798 info->func(pmap, info, NULL, pd_pv, 1, 3799 sva, ptep, info->arg); 3800 } 3801 3802 /* 3803 * Done, move to next page table page. 3804 */ 3805 va_next = (sva + NBPDR) & ~PDRMASK; 3806 if (va_next < sva) 3807 va_next = eva; 3808 continue; 3809 } 3810 3811 /* 3812 * From this point in the loop testing pt_pv for non-NULL 3813 * means we are in UVM, else if it is NULL we are in KVM. 3814 * 3815 * Limit our scan to either the end of the va represented 3816 * by the current page table page, or to the end of the 3817 * range being removed. 3818 */ 3819 kernel_skip: 3820 va_next = (sva + NBPDR) & ~PDRMASK; 3821 if (va_next < sva) 3822 va_next = eva; 3823 if (va_next > eva) 3824 va_next = eva; 3825 3826 /* 3827 * Scan the page table for pages. Some pages may not be 3828 * managed (might not have a pv_entry). 3829 * 3830 * There is no page table management for kernel pages so 3831 * pt_pv will be NULL in that case, but otherwise pt_pv 3832 * is non-NULL, locked, and referenced. 3833 */ 3834 3835 /* 3836 * At this point a non-NULL pt_pv means a UVA, and a NULL 3837 * pt_pv means a KVA. 3838 */ 3839 if (pt_pv) 3840 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 3841 else 3842 ptep = vtopte(sva); 3843 3844 while (sva < va_next) { 3845 /* 3846 * Yield every 64 pages, stop if requested. 3847 */ 3848 if ((++info->count & 63) == 0) 3849 lwkt_user_yield(); 3850 if (info->stop) 3851 break; 3852 3853 /* 3854 * Check if pt_pv has been lost (probably due to 3855 * a remove of the underlying pages). 3856 */ 3857 if (pt_pv && pt_pv->pv_pmap == NULL) 3858 break; 3859 3860 /* 3861 * Acquire the related pte_pv, if any. If *ptep == 0 3862 * the related pte_pv should not exist, but if *ptep 3863 * is not zero the pte_pv may or may not exist (e.g. 3864 * will not exist for an unmanaged page). 3865 * 3866 * However a multitude of races are possible here. 3867 * 3868 * In addition, the (pt_pv, pte_pv) lock order is 3869 * backwards, so we have to be careful in aquiring 3870 * a properly locked pte_pv. 3871 */ 3872 generation = pmap->pm_generation; 3873 if (pt_pv) { 3874 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 3875 &error); 3876 if (error) { 3877 if (pd_pv) { 3878 vm_page_wire_quick(pd_pv->pv_m); 3879 pv_unlock(pd_pv); 3880 } 3881 vm_page_wire_quick(pt_pv->pv_m); 3882 pv_unlock(pt_pv);/* must be non-NULL */ 3883 pv_lock(pte_pv); /* safe to block now */ 3884 pv_put(pte_pv); 3885 pte_pv = NULL; 3886 pv_lock(pt_pv); 3887 vm_page_unwire_quick(pt_pv->pv_m); 3888 3889 /* 3890 * pt_pv reloaded, need new ptep 3891 */ 3892 KKASSERT(pt_pv != NULL); 3893 ptep = pv_pte_lookup(pt_pv, 3894 pmap_pte_index(sva)); 3895 if (pd_pv) { 3896 pv_lock(pd_pv); 3897 vm_page_unwire_quick(pd_pv->pv_m); 3898 } 3899 continue; 3900 } 3901 } else { 3902 pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); 3903 } 3904 3905 /* 3906 * Ok, if *ptep == 0 we had better NOT have a pte_pv. 3907 */ 3908 oldpte = *ptep; 3909 if (oldpte == 0) { 3910 if (pte_pv) { 3911 kprintf("Unexpected non-NULL pte_pv " 3912 "%p pt_pv %p " 3913 "*ptep = %016lx/%016lx\n", 3914 pte_pv, pt_pv, *ptep, oldpte); 3915 panic("Unexpected non-NULL pte_pv"); 3916 } 3917 sva += PAGE_SIZE; 3918 ++ptep; 3919 continue; 3920 } 3921 3922 /* 3923 * Ready for the callback. The locked pte_pv (if any) 3924 * is consumed by the callback. pte_pv will exist if 3925 * the page is managed, and will not exist if it 3926 * isn't. 
3927 */ 3928 if (pte_pv) { 3929 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) == 3930 (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX]), 3931 ("badC *ptep %016lx/%016lx sva %016lx " 3932 "pte_pv %p pm_generation %d/%d", 3933 *ptep, oldpte, sva, pte_pv, 3934 generation, pmap->pm_generation)); 3935 /* 3936 * We must unlock pd_pv across the callback 3937 * to avoid deadlocks on any recursive 3938 * disposal. Re-check that it still exists 3939 * after re-locking. 3940 */ 3941 if (pd_pv) 3942 pv_unlock(pd_pv); 3943 info->func(pmap, info, pte_pv, pt_pv, 0, 3944 sva, ptep, info->arg); 3945 if (pd_pv) { 3946 pv_lock(pd_pv); 3947 if (pd_pv->pv_pmap == NULL) { 3948 pv_put(pd_pv); 3949 pd_pv = NULL; 3950 } 3951 } 3952 } else { 3953 /* 3954 * Check for insertion race. Since there is no 3955 * pte_pv to guard us it is possible for us 3956 * to race another thread doing an insertion. 3957 * Our lookup misses the pte_pv but our *ptep 3958 * check sees the inserted pte. 3959 * 3960 * XXX panic case seems to occur within a 3961 * vm_fork() of /bin/sh, which frankly 3962 * shouldn't happen since no other threads 3963 * should be inserting to our pmap in that 3964 * situation. Removing, possibly. Inserting, 3965 * shouldn't happen. 3966 */ 3967 if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) && 3968 pt_pv) { 3969 pte_pv = pv_find(pmap, 3970 pmap_pte_pindex(sva)); 3971 if (pte_pv) { 3972 pv_drop(pte_pv); 3973 kprintf("pmap_scan: RACE2 " 3974 "%016jx, %016lx\n", 3975 sva, oldpte); 3976 continue; 3977 } 3978 } 3979 3980 /* 3981 * Didn't race 3982 * 3983 * We must unlock pd_pv across the callback 3984 * to avoid deadlocks on any recursive 3985 * disposal. Re-check that it still exists 3986 * after re-locking. 3987 */ 3988 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) == 3989 pmap->pmap_bits[PG_V_IDX], 3990 ("badD *ptep %016lx/%016lx sva %016lx " 3991 "pte_pv NULL pm_generation %d/%d", 3992 *ptep, oldpte, sva, 3993 generation, pmap->pm_generation)); 3994 if (pd_pv) 3995 pv_unlock(pd_pv); 3996 info->func(pmap, info, NULL, pt_pv, 0, 3997 sva, ptep, info->arg); 3998 if (pd_pv) { 3999 pv_lock(pd_pv); 4000 if (pd_pv->pv_pmap == NULL) { 4001 pv_put(pd_pv); 4002 pd_pv = NULL; 4003 } 4004 } 4005 } 4006 pte_pv = NULL; 4007 sva += PAGE_SIZE; 4008 ++ptep; 4009 } 4010 } 4011 if (pd_pv) { 4012 pv_put(pd_pv); 4013 pd_pv = NULL; 4014 } 4015 if (pt_pv) { 4016 pv_put(pt_pv); 4017 pt_pv = NULL; 4018 } 4019 if ((++info->count & 7) == 0) 4020 lwkt_user_yield(); 4021 4022 /* 4023 * Relock before returning. 
4024 */ 4025 spin_lock(&pmap->pm_spin); 4026 return (0); 4027 } 4028 4029 void 4030 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4031 { 4032 struct pmap_scan_info info; 4033 4034 info.pmap = pmap; 4035 info.sva = sva; 4036 info.eva = eva; 4037 info.func = pmap_remove_callback; 4038 info.arg = NULL; 4039 pmap_scan(&info, 1); 4040 } 4041 4042 static void 4043 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4044 { 4045 struct pmap_scan_info info; 4046 4047 info.pmap = pmap; 4048 info.sva = sva; 4049 info.eva = eva; 4050 info.func = pmap_remove_callback; 4051 info.arg = NULL; 4052 pmap_scan(&info, 0); 4053 } 4054 4055 static void 4056 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4057 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 4058 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4059 { 4060 pt_entry_t pte; 4061 4062 if (pte_pv) { 4063 /* 4064 * This will also drop pt_pv's wire_count. Note that 4065 * terminal pages are not wired based on mmu presence. 4066 * 4067 * NOTE: If this is the kernel_pmap, pt_pv can be NULL. 4068 */ 4069 pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk, 2); 4070 pte_pv = NULL; /* safety */ 4071 4072 /* 4073 * Recursively destroy higher-level page tables. 4074 * 4075 * This is optional. If we do not, they will still 4076 * be destroyed when the process exits. 4077 * 4078 * NOTE: Do not destroy pv_entry's with extra hold refs, 4079 * a caller may have unlocked it and intends to 4080 * continue to use it. 4081 */ 4082 if (pmap_dynamic_delete && 4083 pt_pv && 4084 pt_pv->pv_m && 4085 pt_pv->pv_m->wire_count == 1 && 4086 (pt_pv->pv_hold & PV_HOLD_MASK) == 2 && 4087 pt_pv->pv_pindex != pmap_pml4_pindex()) { 4088 if (pmap_dynamic_delete == 2) 4089 kprintf("B %jd %08x\n", pt_pv->pv_pindex, pt_pv->pv_hold); 4090 pv_hold(pt_pv); 4091 pmap_remove_pv_pte(pt_pv, NULL, info->bulk, 1); 4092 pv_lock(pt_pv); 4093 } 4094 } else if (sharept == 0) { 4095 /* 4096 * Unmanaged page table (pt, pd, or pdp. Not pte). 4097 * 4098 * pt_pv's wire_count is still bumped by unmanaged pages 4099 * so we must decrement it manually. 4100 * 4101 * We have to unwire the target page table page. 4102 * 4103 * It is unclear how we can invalidate a segment so we 4104 * invalidate -1 which invalidates the tlb. 4105 */ 4106 pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0); 4107 if (pte & pmap->pmap_bits[PG_W_IDX]) 4108 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4109 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4110 if (vm_page_unwire_quick(pt_pv->pv_m)) 4111 panic("pmap_remove: insufficient wirecount"); 4112 } else { 4113 /* 4114 * Unmanaged page table (pt, pd, or pdp. Not pte) for 4115 * a shared page table. 4116 * 4117 * pt_pv is actually the pd_pv for our pmap (not the shared 4118 * object pmap). 4119 * 4120 * We have to unwire the target page table page and we 4121 * have to unwire our page directory page. 4122 * 4123 * It is unclear how we can invalidate a segment so we 4124 * invalidate -1 which invalidates the tlb.
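 *
 * (Passing (vm_offset_t)-1 to pmap_inval_bulk() below is the
 * convention used in this file to request a full TLB invalidation
 * rather than a single-page invalidation.)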
4125 */ 4126 pte = pmap_inval_bulk(info->bulk, (vm_offset_t)-1, ptep, 0); 4127 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4128 KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0); 4129 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4130 panic("pmap_remove: shared pgtable1 bad wirecount"); 4131 if (vm_page_unwire_quick(pt_pv->pv_m)) 4132 panic("pmap_remove: shared pgtable2 bad wirecount"); 4133 } 4134 } 4135 4136 /* 4137 * Removes this physical page from all physical maps in which it resides. 4138 * Reflects back modify bits to the pager. 4139 * 4140 * This routine may not be called from an interrupt. 4141 */ 4142 static 4143 void 4144 pmap_remove_all(vm_page_t m) 4145 { 4146 pv_entry_t pv; 4147 pmap_inval_bulk_t bulk; 4148 4149 if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/) 4150 return; 4151 4152 vm_page_spin_lock(m); 4153 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4154 KKASSERT(pv->pv_m == m); 4155 if (pv_hold_try(pv)) { 4156 vm_page_spin_unlock(m); 4157 } else { 4158 vm_page_spin_unlock(m); 4159 pv_lock(pv); 4160 } 4161 if (pv->pv_m != m) { 4162 pv_put(pv); 4163 vm_page_spin_lock(m); 4164 continue; 4165 } 4166 4167 /* 4168 * Holding no spinlocks, pv is locked. 4169 */ 4170 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 4171 pmap_remove_pv_pte(pv, NULL, &bulk, 2); 4172 pv = NULL; /* safety */ 4173 pmap_inval_bulk_flush(&bulk); 4174 #if 0 4175 pmap_remove_pv_page(pv); 4176 pv_free(pv, 1); 4177 #endif 4178 vm_page_spin_lock(m); 4179 } 4180 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 4181 vm_page_spin_unlock(m); 4182 } 4183 4184 /* 4185 * Removes the page from a particular pmap 4186 */ 4187 void 4188 pmap_remove_specific(pmap_t pmap, vm_page_t m) 4189 { 4190 pv_entry_t pv; 4191 pmap_inval_bulk_t bulk; 4192 4193 if (!pmap_initialized) 4194 return; 4195 4196 again: 4197 vm_page_spin_lock(m); 4198 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4199 if (pv->pv_pmap != pmap) 4200 continue; 4201 KKASSERT(pv->pv_m == m); 4202 if (pv_hold_try(pv)) { 4203 vm_page_spin_unlock(m); 4204 } else { 4205 vm_page_spin_unlock(m); 4206 pv_lock(pv); 4207 } 4208 if (pv->pv_m != m) { 4209 pv_put(pv); 4210 goto again; 4211 } 4212 4213 /* 4214 * Holding no spinlocks, pv is locked. 4215 */ 4216 pmap_inval_bulk_init(&bulk, pv->pv_pmap); 4217 pmap_remove_pv_pte(pv, NULL, &bulk, 2); 4218 pv = NULL; /* safety */ 4219 pmap_inval_bulk_flush(&bulk); 4220 #if 0 4221 pmap_remove_pv_page(pv); 4222 pv_free(pv, 1); 4223 #endif 4224 goto again; 4225 } 4226 vm_page_spin_unlock(m); 4227 } 4228 4229 /* 4230 * Set the physical protection on the specified range of this map 4231 * as requested. This function is typically only used for debug watchpoints 4232 * and COW pages. 4233 * 4234 * This function may not be called from an interrupt if the map is 4235 * not the kernel_pmap. 4236 * 4237 * NOTE! For shared page table pages we just unmap the page. 
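 *
 * Illustrative example only (hypothetical caller): downgrading a
 * copy-on-write range to read-only might look like
 *
 *	pmap_protect(vmspace_pmap(vm), start, end, VM_PROT_READ);
 *
 * Note that a protection request without VM_PROT_READ is treated as
 * a removal of the range.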
4238 */ 4239 void 4240 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4241 { 4242 struct pmap_scan_info info; 4243 /* JG review for NX */ 4244 4245 if (pmap == NULL) 4246 return; 4247 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 4248 pmap_remove(pmap, sva, eva); 4249 return; 4250 } 4251 if (prot & VM_PROT_WRITE) 4252 return; 4253 info.pmap = pmap; 4254 info.sva = sva; 4255 info.eva = eva; 4256 info.func = pmap_protect_callback; 4257 info.arg = &prot; 4258 pmap_scan(&info, 1); 4259 } 4260 4261 static 4262 void 4263 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4264 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 4265 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4266 { 4267 pt_entry_t pbits; 4268 pt_entry_t cbits; 4269 pt_entry_t pte; 4270 vm_page_t m; 4271 4272 again: 4273 pbits = *ptep; 4274 cbits = pbits; 4275 if (pte_pv) { 4276 m = NULL; 4277 if (pbits & pmap->pmap_bits[PG_A_IDX]) { 4278 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4279 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4280 KKASSERT(m == pte_pv->pv_m); 4281 vm_page_flag_set(m, PG_REFERENCED); 4282 } 4283 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4284 } 4285 if (pbits & pmap->pmap_bits[PG_M_IDX]) { 4286 if (pmap_track_modified(pte_pv->pv_pindex)) { 4287 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) { 4288 if (m == NULL) { 4289 m = PHYS_TO_VM_PAGE(pbits & 4290 PG_FRAME); 4291 } 4292 vm_page_dirty(m); 4293 } 4294 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4295 } 4296 } 4297 } else if (sharept) { 4298 /* 4299 * Unmanaged page table, pt_pv is actually the pd_pv 4300 * for our pmap (not the object's shared pmap). 4301 * 4302 * When asked to protect something in a shared page table 4303 * page we just unmap the page table page. We have to 4304 * invalidate the tlb in this situation. 4305 * 4306 * XXX Warning, shared page tables will not be used for 4307 * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings 4308 * so PHYS_TO_VM_PAGE() should be safe here. 4309 */ 4310 pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, 0); 4311 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) 4312 panic("pmap_protect: pgtable1 pg bad wirecount"); 4313 if (vm_page_unwire_quick(pt_pv->pv_m)) 4314 panic("pmap_protect: pgtable2 pg bad wirecount"); 4315 ptep = NULL; 4316 } 4317 /* else unmanaged page, adjust bits, no wire changes */ 4318 4319 if (ptep) { 4320 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4321 #ifdef PMAP_DEBUG2 4322 if (pmap_enter_debug > 0) { 4323 --pmap_enter_debug; 4324 kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p " 4325 "pt_pv=%p cbits=%08lx\n", 4326 va, ptep, pte_pv, 4327 pt_pv, cbits 4328 ); 4329 } 4330 #endif 4331 if (pbits != cbits) { 4332 if (!pmap_inval_smp_cmpset(pmap, (vm_offset_t)-1, 4333 ptep, pbits, cbits)) { 4334 goto again; 4335 } 4336 } 4337 } 4338 if (pte_pv) 4339 pv_put(pte_pv); 4340 } 4341 4342 /* 4343 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4344 * mapping at that address. Set protection and wiring as requested. 4345 * 4346 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4347 * possible. If it is we enter the page into the appropriate shared pmap 4348 * hanging off the related VM object instead of the passed pmap, then we 4349 * share the page table page from the VM object's pmap into the current pmap. 4350 * 4351 * NOTE: This routine MUST insert the page into the pmap now, it cannot 4352 * lazy-evaluate. 
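 *
 * A minimal illustrative call (hypothetical values), e.g. from a
 * fault path, might look like
 *
 *	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE, entry);
 *
 * where (m) is the busied vm_page being mapped and (entry) may be
 * NULL when the shared page table optimization is not wanted.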
4353 */ 4354 void 4355 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4356 boolean_t wired, vm_map_entry_t entry) 4357 { 4358 pv_entry_t pt_pv; /* page table */ 4359 pv_entry_t pte_pv; /* page table entry */ 4360 pt_entry_t *ptep; 4361 vm_paddr_t opa; 4362 pt_entry_t origpte, newpte; 4363 vm_paddr_t pa; 4364 4365 if (pmap == NULL) 4366 return; 4367 va = trunc_page(va); 4368 #ifdef PMAP_DIAGNOSTIC 4369 if (va >= KvaEnd) 4370 panic("pmap_enter: toobig"); 4371 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4372 panic("pmap_enter: invalid to pmap_enter page table " 4373 "pages (va: 0x%lx)", va); 4374 #endif 4375 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4376 kprintf("Warning: pmap_enter called on UVA with " 4377 "kernel_pmap\n"); 4378 #ifdef DDB 4379 db_print_backtrace(); 4380 #endif 4381 } 4382 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4383 kprintf("Warning: pmap_enter called on KVA without " 4384 "kernel_pmap\n"); 4385 #ifdef DDB 4386 db_print_backtrace(); 4387 #endif 4388 } 4389 4390 /* 4391 * Get locked PV entries for our new page table entry (pte_pv) 4392 * and for its parent page table (pt_pv). We need the parent 4393 * so we can resolve the location of the ptep. 4394 * 4395 * Only hardware MMU actions can modify the ptep out from 4396 * under us. 4397 * 4398 * if (m) is fictitious or unmanaged we do not create a managing 4399 * pte_pv for it. Any pre-existing page's management state must 4400 * match (avoiding code complexity). 4401 * 4402 * If the pmap is still being initialized we assume existing 4403 * page tables. 4404 * 4405 * Kernel mappings do not track page table pages (i.e. pt_pv). 4406 */ 4407 if (pmap_initialized == FALSE) { 4408 pte_pv = NULL; 4409 pt_pv = NULL; 4410 ptep = vtopte(va); 4411 origpte = *ptep; 4412 } else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */ 4413 pte_pv = NULL; 4414 if (va >= VM_MAX_USER_ADDRESS) { 4415 pt_pv = NULL; 4416 ptep = vtopte(va); 4417 } else { 4418 pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), 4419 NULL, entry, va); 4420 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4421 } 4422 origpte = *ptep; 4423 cpu_ccfence(); 4424 KASSERT(origpte == 0 || 4425 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0, 4426 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4427 } else { 4428 if (va >= VM_MAX_USER_ADDRESS) { 4429 /* 4430 * Kernel map, pv_entry-tracked.
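 * The pte is located directly via vtopte() and pt_pv stays NULL
 * because kernel page table pages are not pv-tracked.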
4431 */ 4432 pt_pv = NULL; 4433 pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); 4434 ptep = vtopte(va); 4435 } else { 4436 /* 4437 * User map 4438 */ 4439 pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va), 4440 &pt_pv, entry, va); 4441 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4442 } 4443 origpte = *ptep; 4444 cpu_ccfence(); 4445 KASSERT(origpte == 0 || 4446 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]), 4447 ("Invalid PTE 0x%016jx @ 0x%016jx\n", origpte, va)); 4448 } 4449 4450 pa = VM_PAGE_TO_PHYS(m); 4451 opa = origpte & PG_FRAME; 4452 4453 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4454 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4455 if (wired) 4456 newpte |= pmap->pmap_bits[PG_W_IDX]; 4457 if (va < VM_MAX_USER_ADDRESS) 4458 newpte |= pmap->pmap_bits[PG_U_IDX]; 4459 if (pte_pv) 4460 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4461 // if (pmap == &kernel_pmap) 4462 // newpte |= pgeflag; 4463 newpte |= pmap->pmap_cache_bits[m->pat_mode]; 4464 if (m->flags & PG_FICTITIOUS) 4465 newpte |= pmap->pmap_bits[PG_DEVICE_IDX]; 4466 4467 /* 4468 * It is possible for multiple faults to occur in threaded 4469 * environments, the existing pte might be correct. 4470 */ 4471 if (((origpte ^ newpte) & ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4472 pmap->pmap_bits[PG_A_IDX])) == 0) 4473 goto done; 4474 4475 /* 4476 * Ok, either the address changed or the protection or wiring 4477 * changed. 4478 * 4479 * Clear the current entry, interlocking the removal. For managed 4480 * pte's this will also flush the modified state to the vm_page. 4481 * Atomic ops are mandatory in order to ensure that PG_M events are 4482 * not lost during any transition. 4483 * 4484 * WARNING: The caller has busied the new page but not the original 4485 * vm_page which we are trying to replace. Because we hold 4486 * the pte_pv lock, but have not busied the page, PG bits 4487 * can be cleared out from under us. 4488 */ 4489 if (opa) { 4490 if (pte_pv) { 4491 /* 4492 * pt_pv won't exist for a kernel page (managed or 4493 * otherwise). 4494 */ 4495 if (prot & VM_PROT_NOSYNC) { 4496 pmap_remove_pv_pte(pte_pv, pt_pv, NULL, 0); 4497 } else { 4498 pmap_inval_bulk_t bulk; 4499 4500 pmap_inval_bulk_init(&bulk, pmap); 4501 pmap_remove_pv_pte(pte_pv, pt_pv, &bulk, 0); 4502 pmap_inval_bulk_flush(&bulk); 4503 } 4504 if (pte_pv->pv_m) 4505 pmap_remove_pv_page(pte_pv); 4506 } else if (prot & VM_PROT_NOSYNC) { 4507 /* 4508 * Unmanaged page, NOSYNC (no mmu sync) requested. 4509 * 4510 * Leave wire count on PT page intact. 4511 */ 4512 (void)pte_load_clear(ptep); 4513 cpu_invlpg((void *)va); 4514 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4515 } else { 4516 /* 4517 * Unmanaged page, normal enter. 4518 * 4519 * Leave wire count on PT page intact. 4520 */ 4521 pmap_inval_smp(pmap, va, 1, ptep, 0); 4522 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4523 } 4524 KKASSERT(*ptep == 0); 4525 } 4526 4527 #ifdef PMAP_DEBUG2 4528 if (pmap_enter_debug > 0) { 4529 --pmap_enter_debug; 4530 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 4531 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 4532 va, m, 4533 origpte, newpte, ptep, 4534 pte_pv, pt_pv, opa, prot); 4535 } 4536 #endif 4537 4538 if (pte_pv) { 4539 /* 4540 * Enter on the PV list if part of our managed memory. 4541 * Wiring of the PT page is already handled. 
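 * The vm_page spinlock interlocks the pv list insertion and the
 * setting of PG_MAPPED.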
4542 */ 4543 KKASSERT(pte_pv->pv_m == NULL); 4544 vm_page_spin_lock(m); 4545 pte_pv->pv_m = m; 4546 pmap_page_stats_adding(m); 4547 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); 4548 vm_page_flag_set(m, PG_MAPPED); 4549 vm_page_spin_unlock(m); 4550 } else if (pt_pv && opa == 0) { 4551 /* 4552 * We have to adjust the wire count on the PT page ourselves 4553 * for unmanaged entries. If opa was non-zero we retained 4554 * the existing wire count from the removal. 4555 */ 4556 vm_page_wire_quick(pt_pv->pv_m); 4557 } 4558 4559 /* 4560 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks. 4561 * 4562 * User VMAs do not because those will be zero->non-zero, so no 4563 * stale entries to worry about at this point. 4564 * 4565 * For KVM there appear to still be issues. Theoretically we 4566 * should be able to scrap the interlocks entirely but we 4567 * get crashes. 4568 */ 4569 if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) { 4570 pmap_inval_smp(pmap, va, 1, ptep, newpte); 4571 } else { 4572 *(volatile pt_entry_t *)ptep = newpte; 4573 if (pt_pv == NULL) 4574 cpu_invlpg((void *)va); 4575 } 4576 4577 if (wired) { 4578 if (pte_pv) { 4579 atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count, 4580 1); 4581 } else { 4582 atomic_add_long(&pmap->pm_stats.wired_count, 1); 4583 } 4584 } 4585 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4586 vm_page_flag_set(m, PG_WRITEABLE); 4587 4588 /* 4589 * Unmanaged pages need manual resident_count tracking. 4590 */ 4591 if (pte_pv == NULL && pt_pv) 4592 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); 4593 4594 /* 4595 * Cleanup 4596 */ 4597 done: 4598 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 4599 (m->flags & PG_MAPPED)); 4600 4601 /* 4602 * Cleanup the pv entry, allowing other accessors. 4603 */ 4604 if (pte_pv) 4605 pv_put(pte_pv); 4606 if (pt_pv) 4607 pv_put(pt_pv); 4608 } 4609 4610 /* 4611 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 4612 * This code also assumes that the pmap has no pre-existing entry for this 4613 * VA. 4614 * 4615 * This code currently may only be used on user pmaps, not kernel_pmap. 4616 */ 4617 void 4618 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 4619 { 4620 pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL); 4621 } 4622 4623 /* 4624 * Make a temporary mapping for a physical address. This is only intended 4625 * to be used for panic dumps. 4626 * 4627 * The caller is responsible for calling smp_invltlb(). 4628 */ 4629 void * 4630 pmap_kenter_temporary(vm_paddr_t pa, long i) 4631 { 4632 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 4633 return ((void *)crashdumpmap); 4634 } 4635 4636 #define MAX_INIT_PT (96) 4637 4638 /* 4639 * This routine preloads the ptes for a given object into the specified pmap. 4640 * This eliminates the blast of soft faults on process startup and 4641 * immediately after an mmap. 4642 */ 4643 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 4644 4645 void 4646 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 4647 vm_object_t object, vm_pindex_t pindex, 4648 vm_size_t size, int limit) 4649 { 4650 struct rb_vm_page_scan_info info; 4651 struct lwp *lp; 4652 vm_size_t psize; 4653 4654 /* 4655 * We can't preinit if read access isn't set or there is no pmap 4656 * or object. 
4657 */ 4658 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 4659 return; 4660 4661 /* 4662 * We can't preinit if the pmap is not the current pmap 4663 */ 4664 lp = curthread->td_lwp; 4665 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 4666 return; 4667 4668 /* 4669 * Misc additional checks 4670 */ 4671 psize = x86_64_btop(size); 4672 4673 if ((object->type != OBJT_VNODE) || 4674 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 4675 (object->resident_page_count > MAX_INIT_PT))) { 4676 return; 4677 } 4678 4679 if (pindex + psize > object->size) { 4680 if (object->size < pindex) 4681 return; 4682 psize = object->size - pindex; 4683 } 4684 4685 if (psize == 0) 4686 return; 4687 4688 /* 4689 * If everything is segment-aligned do not pre-init here. Instead 4690 * allow the normal vm_fault path to pass a segment hint to 4691 * pmap_enter() which will then use an object-referenced shared 4692 * page table page. 4693 */ 4694 if ((addr & SEG_MASK) == 0 && 4695 (ctob(psize) & SEG_MASK) == 0 && 4696 (ctob(pindex) & SEG_MASK) == 0) { 4697 return; 4698 } 4699 4700 /* 4701 * Use a red-black scan to traverse the requested range and load 4702 * any valid pages found into the pmap. 4703 * 4704 * We cannot safely scan the object's memq without holding the 4705 * object token. 4706 */ 4707 info.start_pindex = pindex; 4708 info.end_pindex = pindex + psize - 1; 4709 info.limit = limit; 4710 info.mpte = NULL; 4711 info.addr = addr; 4712 info.pmap = pmap; 4713 4714 vm_object_hold_shared(object); 4715 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 4716 pmap_object_init_pt_callback, &info); 4717 vm_object_drop(object); 4718 } 4719 4720 static 4721 int 4722 pmap_object_init_pt_callback(vm_page_t p, void *data) 4723 { 4724 struct rb_vm_page_scan_info *info = data; 4725 vm_pindex_t rel_index; 4726 4727 /* 4728 * don't allow an madvise to blow away our really 4729 * free pages allocating pv entries. 4730 */ 4731 if ((info->limit & MAP_PREFAULT_MADVISE) && 4732 vmstats.v_free_count < vmstats.v_free_reserved) { 4733 return(-1); 4734 } 4735 4736 /* 4737 * Ignore list markers and ignore pages we cannot instantly 4738 * busy (while holding the object token). 4739 */ 4740 if (p->flags & PG_MARKER) 4741 return 0; 4742 if (vm_page_busy_try(p, TRUE)) 4743 return 0; 4744 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 4745 (p->flags & PG_FICTITIOUS) == 0) { 4746 if ((p->queue - p->pc) == PQ_CACHE) 4747 vm_page_deactivate(p); 4748 rel_index = p->pindex - info->start_pindex; 4749 pmap_enter_quick(info->pmap, 4750 info->addr + x86_64_ptob(rel_index), p); 4751 } 4752 vm_page_wakeup(p); 4753 lwkt_yield(); 4754 return(0); 4755 } 4756 4757 /* 4758 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 4759 * address. 4760 * 4761 * Returns FALSE if it would be non-trivial or if a pte is already loaded 4762 * into the slot. 4763 * 4764 * XXX This is safe only because page table pages are not freed. 4765 */ 4766 int 4767 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 4768 { 4769 pt_entry_t *pte; 4770 4771 /*spin_lock(&pmap->pm_spin);*/ 4772 if ((pte = pmap_pte(pmap, addr)) != NULL) { 4773 if (*pte & pmap->pmap_bits[PG_V_IDX]) { 4774 /*spin_unlock(&pmap->pm_spin);*/ 4775 return FALSE; 4776 } 4777 } 4778 /*spin_unlock(&pmap->pm_spin);*/ 4779 return TRUE; 4780 } 4781 4782 /* 4783 * Change the wiring attribute for a pmap/va pair. The mapping must already 4784 * exist in the pmap. The mapping may or may not be managed. 
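 *
 * Illustrative example only (hypothetical caller): wiring down a
 * single user page might look like
 *
 *	pmap_change_wiring(pmap, va, TRUE, entry);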
4785 * 4786 * Wiring is not a hardware characteristic so there is no need to invalidate 4787 * TLB. However, in an SMP environment we must use a locked bus cycle to 4788 * update the pte (if we are not using the pmap_inval_*() API that is)... 4789 * it's ok to do this for simple wiring changes. 4790 */ 4791 void 4792 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired, 4793 vm_map_entry_t entry) 4794 { 4795 pt_entry_t *ptep; 4796 pv_entry_t pv; 4797 4798 if (pmap == NULL) 4799 return; 4800 4801 lwkt_gettoken(&pmap->pm_token); 4802 if (pmap == &kernel_pmap) { 4803 /* 4804 * The kernel may have managed pages, but not managed 4805 * page tables. 4806 */ 4807 ptep = pmap_pte_quick(pmap, va); 4808 4809 if (wired && !pmap_pte_w(pmap, ptep)) 4810 atomic_add_long(&pmap->pm_stats.wired_count, 1); 4811 else if (!wired && pmap_pte_w(pmap, ptep)) 4812 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4813 4814 if (wired) 4815 atomic_set_long(ptep, pmap->pmap_bits[PG_W_IDX]); 4816 else 4817 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 4818 } else { 4819 /* 4820 * Userland, the pmap of the possibly shared segment might 4821 * not be (pmap). 4822 */ 4823 pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, 4824 entry, va); 4825 ptep = pv_pte_lookup(pv, pmap_pte_index(va)); 4826 4827 if (wired && !pmap_pte_w(pmap, ptep)) 4828 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1); 4829 else if (!wired && pmap_pte_w(pmap, ptep)) 4830 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1); 4831 4832 if (wired) 4833 atomic_set_long(ptep, pmap->pmap_bits[PG_W_IDX]); 4834 else 4835 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 4836 pv_put(pv); 4837 } 4838 lwkt_reltoken(&pmap->pm_token); 4839 } 4840 4841 4842 4843 /* 4844 * Copy the range specified by src_addr/len from the source map to 4845 * the range dst_addr/len in the destination map. 4846 * 4847 * This routine is only advisory and need not do anything. 4848 */ 4849 void 4850 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 4851 vm_size_t len, vm_offset_t src_addr) 4852 { 4853 } 4854 4855 /* 4856 * pmap_zero_page: 4857 * 4858 * Zero the specified physical page. 4859 * 4860 * This function may be called from an interrupt and no locking is 4861 * required. 4862 */ 4863 void 4864 pmap_zero_page(vm_paddr_t phys) 4865 { 4866 vm_offset_t va = PHYS_TO_DMAP(phys); 4867 4868 pagezero((void *)va); 4869 } 4870 4871 /* 4872 * pmap_zero_page_area: 4873 * 4874 * Zero part of a physical page by mapping it into memory and clearing 4875 * its contents with bzero. 4876 * 4877 * off and size may not cover an area beyond a single hardware page. 4878 */ 4879 void 4880 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 4881 { 4882 vm_offset_t virt = PHYS_TO_DMAP(phys); 4883 4884 bzero((char *)virt + off, size); 4885 } 4886 4887 /* 4888 * pmap_copy_page: 4889 * 4890 * Copy the physical page from the source PA to the target PA. 4891 * This function may be called from an interrupt. No locking 4892 * is required. 4893 */ 4894 void 4895 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 4896 { 4897 vm_offset_t src_virt, dst_virt; 4898 4899 src_virt = PHYS_TO_DMAP(src); 4900 dst_virt = PHYS_TO_DMAP(dst); 4901 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 4902 } 4903 4904 /* 4905 * pmap_copy_page_frag: 4906 * 4907 * Copy the physical page from the source PA to the target PA. 4908 * This function may be called from an interrupt. No locking 4909 * is required.
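 * Only (bytes) bytes are copied; the starting offsets within the
 * source and destination pages are taken from the low bits of src
 * and dst respectively.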
4910 */ 4911 void 4912 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 4913 { 4914 vm_offset_t src_virt, dst_virt; 4915 4916 src_virt = PHYS_TO_DMAP(src); 4917 dst_virt = PHYS_TO_DMAP(dst); 4918 4919 bcopy((char *)src_virt + (src & PAGE_MASK), 4920 (char *)dst_virt + (dst & PAGE_MASK), 4921 bytes); 4922 } 4923 4924 /* 4925 * Returns true if the pmap's pv is one of the first 16 pvs linked to from 4926 * this page. This count may be changed upwards or downwards in the future; 4927 * it is only necessary that true be returned for a small subset of pmaps 4928 * for proper page aging. 4929 */ 4930 boolean_t 4931 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4932 { 4933 pv_entry_t pv; 4934 int loops = 0; 4935 4936 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 4937 return FALSE; 4938 4939 vm_page_spin_lock(m); 4940 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4941 if (pv->pv_pmap == pmap) { 4942 vm_page_spin_unlock(m); 4943 return TRUE; 4944 } 4945 loops++; 4946 if (loops >= 16) 4947 break; 4948 } 4949 vm_page_spin_unlock(m); 4950 return (FALSE); 4951 } 4952 4953 /* 4954 * Remove all pages from the specified address space; this aids process 4955 * exit speeds. Also, this code may be special cased for the current 4956 * process only. 4957 */ 4958 void 4959 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4960 { 4961 pmap_remove_noinval(pmap, sva, eva); 4962 cpu_invltlb(); 4963 } 4964 4965 /* 4966 * pmap_testbit tests bits in ptes. Note that the testbit/clearbit 4967 * routines are inline, and a lot of things compile-time evaluate. 4968 */ 4969 static 4970 boolean_t 4971 pmap_testbit(vm_page_t m, int bit) 4972 { 4973 pv_entry_t pv; 4974 pt_entry_t *pte; 4975 pmap_t pmap; 4976 4977 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 4978 return FALSE; 4979 4980 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 4981 return FALSE; 4982 vm_page_spin_lock(m); 4983 if (TAILQ_FIRST(&m->md.pv_list) == NULL) { 4984 vm_page_spin_unlock(m); 4985 return FALSE; 4986 } 4987 4988 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4989 4990 #if defined(PMAP_DIAGNOSTIC) 4991 if (pv->pv_pmap == NULL) { 4992 kprintf("Null pmap (tb) at pindex: %"PRIu64"\n", 4993 pv->pv_pindex); 4994 continue; 4995 } 4996 #endif 4997 pmap = pv->pv_pmap; 4998 4999 /* 5000 * If the bit being tested is the modified bit, then 5001 * mark clean_map and ptes as never 5002 * modified. 5003 * 5004 * WARNING! Because we do not lock the pv, *pte can be in a 5005 * state of flux. Despite this the value of *pte 5006 * will still be related to the vm_page in some way 5007 * because the pv cannot be destroyed as long as we 5008 * hold the vm_page spin lock. 5009 */ 5010 if (bit == PG_A_IDX || bit == PG_M_IDX) { 5011 //& (pmap->pmap_bits[PG_A_IDX] | pmap->pmap_bits[PG_M_IDX])) { 5012 if (!pmap_track_modified(pv->pv_pindex)) 5013 continue; 5014 } 5015 5016 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5017 if (*pte & pmap->pmap_bits[bit]) { 5018 vm_page_spin_unlock(m); 5019 return TRUE; 5020 } 5021 } 5022 vm_page_spin_unlock(m); 5023 return (FALSE); 5024 } 5025 5026 /* 5027 * This routine is used to modify bits in ptes. Only one bit should be 5028 * specified. PG_RW requires special handling.
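 *
 * For example, pmap_clear_modify() below is simply
 * pmap_clearbit(m, PG_M_IDX), while clearing PG_RW additionally
 * reflects any pending PG_M state back to the vm_page (dirtying it).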
5029 * 5030 * Caller must NOT hold any spin locks 5031 */ 5032 static __inline 5033 void 5034 pmap_clearbit(vm_page_t m, int bit_index) 5035 { 5036 pv_entry_t pv; 5037 pt_entry_t *pte; 5038 pt_entry_t pbits; 5039 pmap_t pmap; 5040 5041 if (bit_index == PG_RW_IDX) 5042 vm_page_flag_clear(m, PG_WRITEABLE); 5043 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 5044 return; 5045 } 5046 5047 /* 5048 * PG_M or PG_A case 5049 * 5050 * Loop over all current mappings, setting/clearing as appropriate. 5051 * If setting RO do we need to clear the VAC? 5052 * 5053 * NOTE: When clearing PG_M we could also (not implemented) drop 5054 * through to the PG_RW code and clear PG_RW too, forcing 5055 * a fault on write to redetect PG_M for virtual kernels, but 5056 * it isn't necessary since virtual kernels invalidate the 5057 * pte when they clear the VPTE_M bit in their virtual page 5058 * tables. 5059 * 5060 * NOTE: Does not re-dirty the page when clearing only PG_M. 5061 * 5062 * NOTE: Because we do not lock the pv, *pte can be in a state of 5063 * flux. Despite this the value of *pte is still somewhat 5064 * related while we hold the vm_page spin lock. 5065 * 5066 * *pte can be zero due to this race. Since we are clearing 5067 * bits we basically do no harm when this race occurs. 5068 */ 5069 if (bit_index != PG_RW_IDX) { 5070 vm_page_spin_lock(m); 5071 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5072 #if defined(PMAP_DIAGNOSTIC) 5073 if (pv->pv_pmap == NULL) { 5074 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n", 5075 pv->pv_pindex); 5076 continue; 5077 } 5078 #endif 5079 pmap = pv->pv_pmap; 5080 pte = pmap_pte_quick(pv->pv_pmap, 5081 pv->pv_pindex << PAGE_SHIFT); 5082 pbits = *pte; 5083 if (pbits & pmap->pmap_bits[bit_index]) 5084 atomic_clear_long(pte, pmap->pmap_bits[bit_index]); 5085 } 5086 vm_page_spin_unlock(m); 5087 return; 5088 } 5089 5090 /* 5091 * Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M 5092 * was set. 5093 */ 5094 restart: 5095 vm_page_spin_lock(m); 5096 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5097 /* 5098 * don't write protect pager mappings 5099 */ 5100 if (!pmap_track_modified(pv->pv_pindex)) 5101 continue; 5102 5103 #if defined(PMAP_DIAGNOSTIC) 5104 if (pv->pv_pmap == NULL) { 5105 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n", 5106 pv->pv_pindex); 5107 continue; 5108 } 5109 #endif 5110 pmap = pv->pv_pmap; 5111 /* 5112 * Skip pages which do not have PG_RW set. 5113 */ 5114 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5115 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0) 5116 continue; 5117 5118 /* 5119 * Lock the PV 5120 */ 5121 if (pv_hold_try(pv)) { 5122 vm_page_spin_unlock(m); 5123 } else { 5124 vm_page_spin_unlock(m); 5125 pv_lock(pv); /* held, now do a blocking lock */ 5126 } 5127 if (pv->pv_pmap != pmap || pv->pv_m != m) { 5128 pv_put(pv); /* and release */ 5129 goto restart; /* anything could have happened */ 5130 } 5131 KKASSERT(pv->pv_pmap == pmap); 5132 for (;;) { 5133 pt_entry_t nbits; 5134 5135 pbits = *pte; 5136 cpu_ccfence(); 5137 nbits = pbits & ~(pmap->pmap_bits[PG_RW_IDX] | 5138 pmap->pmap_bits[PG_M_IDX]); 5139 if (pmap_inval_smp_cmpset(pmap, 5140 ((vm_offset_t)pv->pv_pindex << PAGE_SHIFT), 5141 pte, pbits, nbits)) { 5142 break; 5143 } 5144 cpu_pause(); 5145 } 5146 vm_page_spin_lock(m); 5147 5148 /* 5149 * If PG_M was found to be set while we were clearing PG_RW 5150 * we also clear PG_M (done above) and mark the page dirty. 5151 * Callers expect this behavior.
5152 */ 5153 if (pbits & pmap->pmap_bits[PG_M_IDX]) 5154 vm_page_dirty(m); 5155 pv_put(pv); 5156 } 5157 vm_page_spin_unlock(m); 5158 } 5159 5160 /* 5161 * Lower the permission for all mappings to a given page. 5162 * 5163 * Page must be busied by caller. Because page is busied by caller this 5164 * should not be able to race a pmap_enter(). 5165 */ 5166 void 5167 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5168 { 5169 /* JG NX support? */ 5170 if ((prot & VM_PROT_WRITE) == 0) { 5171 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5172 /* 5173 * NOTE: pmap_clearbit(.. PG_RW) also clears 5174 * the PG_WRITEABLE flag in (m). 5175 */ 5176 pmap_clearbit(m, PG_RW_IDX); 5177 } else { 5178 pmap_remove_all(m); 5179 } 5180 } 5181 } 5182 5183 vm_paddr_t 5184 pmap_phys_address(vm_pindex_t ppn) 5185 { 5186 return (x86_64_ptob(ppn)); 5187 } 5188 5189 /* 5190 * Return a count of reference bits for a page, clearing those bits. 5191 * It is not necessary for every reference bit to be cleared, but it 5192 * is necessary that 0 only be returned when there are truly no 5193 * reference bits set. 5194 * 5195 * XXX: The exact number of bits to check and clear is a matter that 5196 * should be tested and standardized at some point in the future for 5197 * optimal aging of shared pages. 5198 * 5199 * This routine may not block. 5200 */ 5201 int 5202 pmap_ts_referenced(vm_page_t m) 5203 { 5204 pv_entry_t pv; 5205 pt_entry_t *pte; 5206 pmap_t pmap; 5207 int rtval = 0; 5208 5209 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5210 return (rtval); 5211 5212 vm_page_spin_lock(m); 5213 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5214 if (!pmap_track_modified(pv->pv_pindex)) 5215 continue; 5216 pmap = pv->pv_pmap; 5217 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); 5218 if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) { 5219 atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]); 5220 rtval++; 5221 if (rtval > 4) 5222 break; 5223 } 5224 } 5225 vm_page_spin_unlock(m); 5226 return (rtval); 5227 } 5228 5229 /* 5230 * pmap_is_modified: 5231 * 5232 * Return whether or not the specified physical page was modified 5233 * in any physical maps. 5234 */ 5235 boolean_t 5236 pmap_is_modified(vm_page_t m) 5237 { 5238 boolean_t res; 5239 5240 res = pmap_testbit(m, PG_M_IDX); 5241 return (res); 5242 } 5243 5244 /* 5245 * Clear the modify bits on the specified physical page. 5246 */ 5247 void 5248 pmap_clear_modify(vm_page_t m) 5249 { 5250 pmap_clearbit(m, PG_M_IDX); 5251 } 5252 5253 /* 5254 * pmap_clear_reference: 5255 * 5256 * Clear the reference bit on the specified physical page. 5257 */ 5258 void 5259 pmap_clear_reference(vm_page_t m) 5260 { 5261 pmap_clearbit(m, PG_A_IDX); 5262 } 5263 5264 /* 5265 * Miscellaneous support routines follow 5266 */ 5267 5268 static 5269 void 5270 i386_protection_init(void) 5271 { 5272 int *kp, prot; 5273 5274 /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */ 5275 kp = protection_codes; 5276 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5277 switch (prot) { 5278 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5279 /* 5280 * Read access is also 0. There isn't any execute bit, 5281 * so just make it readable. 
5282 */ 5283 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5284 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5285 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5286 *kp++ = 0; 5287 break; 5288 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5289 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5290 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5291 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5292 *kp++ = pmap_bits_default[PG_RW_IDX]; 5293 break; 5294 } 5295 } 5296 } 5297 5298 /* 5299 * Map a set of physical memory pages into the kernel virtual 5300 * address space. Return a pointer to where it is mapped. This 5301 * routine is intended to be used for mapping device memory, 5302 * NOT real memory. 5303 * 5304 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5305 * a time. 5306 * 5307 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5308 * work whether the cpu supports PAT or not. The remaining PAT 5309 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5310 * supports PAT. 5311 */ 5312 void * 5313 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5314 { 5315 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5316 } 5317 5318 void * 5319 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5320 { 5321 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5322 } 5323 5324 void * 5325 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5326 { 5327 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5328 } 5329 5330 /* 5331 * Map a set of physical memory pages into the kernel virtual 5332 * address space. Return a pointer to where it is mapped. This 5333 * routine is intended to be used for mapping device memory, 5334 * NOT real memory. 5335 */ 5336 void * 5337 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5338 { 5339 vm_offset_t va, tmpva, offset; 5340 pt_entry_t *pte; 5341 vm_size_t tmpsize; 5342 5343 offset = pa & PAGE_MASK; 5344 size = roundup(offset + size, PAGE_SIZE); 5345 5346 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 5347 if (va == 0) 5348 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5349 5350 pa = pa & ~PAGE_MASK; 5351 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 5352 pte = vtopte(tmpva); 5353 *pte = pa | 5354 kernel_pmap.pmap_bits[PG_RW_IDX] | 5355 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 5356 kernel_pmap.pmap_cache_bits[mode]; 5357 tmpsize -= PAGE_SIZE; 5358 tmpva += PAGE_SIZE; 5359 pa += PAGE_SIZE; 5360 } 5361 pmap_invalidate_range(&kernel_pmap, va, va + size); 5362 pmap_invalidate_cache_range(va, va + size); 5363 5364 return ((void *)(va + offset)); 5365 } 5366 5367 void 5368 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5369 { 5370 vm_offset_t base, offset; 5371 5372 base = va & ~PAGE_MASK; 5373 offset = va & PAGE_MASK; 5374 size = roundup(offset + size, PAGE_SIZE); 5375 pmap_qremove(va, size >> PAGE_SHIFT); 5376 kmem_free(&kernel_map, base, size); 5377 } 5378 5379 /* 5380 * Sets the memory attribute for the specified page. 5381 */ 5382 void 5383 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5384 { 5385 5386 m->pat_mode = ma; 5387 5388 /* 5389 * If "m" is a normal page, update its direct mapping. This update 5390 * can be relied upon to perform any cache operations that are 5391 * required for data coherence. 5392 */ 5393 if ((m->flags & PG_FICTITIOUS) == 0) 5394 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 5395 } 5396 5397 /* 5398 * Change the PAT attribute on an existing kernel memory map. 
Caller 5399 * must ensure that the virtual memory in question is not accessed 5400 * during the adjustment. 5401 */ 5402 void 5403 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 5404 { 5405 pt_entry_t *pte; 5406 vm_offset_t base; 5407 int changed = 0; 5408 5409 if (va == 0) 5410 panic("pmap_change_attr: va is NULL"); 5411 base = trunc_page(va); 5412 5413 while (count) { 5414 pte = vtopte(va); 5415 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) | 5416 kernel_pmap.pmap_cache_bits[mode]; 5417 --count; 5418 va += PAGE_SIZE; 5419 } 5420 5421 changed = 1; /* XXX: not optimal */ 5422 5423 /* 5424 * Flush CPU caches if required to make sure any data isn't cached that 5425 * shouldn't be, etc. 5426 */ 5427 if (changed) { 5428 pmap_invalidate_range(&kernel_pmap, base, va); 5429 pmap_invalidate_cache_range(base, va); 5430 } 5431 } 5432 5433 /* 5434 * perform the pmap work for mincore 5435 */ 5436 int 5437 pmap_mincore(pmap_t pmap, vm_offset_t addr) 5438 { 5439 pt_entry_t *ptep, pte; 5440 vm_page_t m; 5441 int val = 0; 5442 5443 lwkt_gettoken(&pmap->pm_token); 5444 ptep = pmap_pte(pmap, addr); 5445 5446 if (ptep && (pte = *ptep) != 0) { 5447 vm_offset_t pa; 5448 5449 val = MINCORE_INCORE; 5450 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0) 5451 goto done; 5452 5453 pa = pte & PG_FRAME; 5454 5455 if (pte & pmap->pmap_bits[PG_DEVICE_IDX]) 5456 m = NULL; 5457 else 5458 m = PHYS_TO_VM_PAGE(pa); 5459 5460 /* 5461 * Modified by us 5462 */ 5463 if (pte & pmap->pmap_bits[PG_M_IDX]) 5464 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 5465 /* 5466 * Modified by someone 5467 */ 5468 else if (m && (m->dirty || pmap_is_modified(m))) 5469 val |= MINCORE_MODIFIED_OTHER; 5470 /* 5471 * Referenced by us 5472 */ 5473 if (pte & pmap->pmap_bits[PG_A_IDX]) 5474 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 5475 5476 /* 5477 * Referenced by someone 5478 */ 5479 else if (m && ((m->flags & PG_REFERENCED) || 5480 pmap_ts_referenced(m))) { 5481 val |= MINCORE_REFERENCED_OTHER; 5482 vm_page_flag_set(m, PG_REFERENCED); 5483 } 5484 } 5485 done: 5486 lwkt_reltoken(&pmap->pm_token); 5487 5488 return val; 5489 } 5490 5491 /* 5492 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 5493 * vmspace will be ref'd and the old one will be deref'd. 5494 * 5495 * The vmspace for all lwps associated with the process will be adjusted 5496 * and cr3 will be reloaded if any lwp is the current lwp. 5497 * 5498 * The process must hold the vmspace->vm_map.token for oldvm and newvm 5499 */ 5500 void 5501 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 5502 { 5503 struct vmspace *oldvm; 5504 struct lwp *lp; 5505 5506 oldvm = p->p_vmspace; 5507 if (oldvm != newvm) { 5508 if (adjrefs) 5509 vmspace_ref(newvm); 5510 p->p_vmspace = newvm; 5511 KKASSERT(p->p_nthreads == 1); 5512 lp = RB_ROOT(&p->p_lwp_tree); 5513 pmap_setlwpvm(lp, newvm); 5514 if (adjrefs) 5515 vmspace_rel(oldvm); 5516 } 5517 } 5518 5519 /* 5520 * Set the vmspace for a LWP. The vmspace is almost universally set the 5521 * same as the process vmspace, but virtual kernels need to swap out contexts 5522 * on a per-lwp basis. 5523 * 5524 * Caller does not necessarily hold any vmspace tokens. Caller must control 5525 * the lwp (typically be in the context of the lwp). We use a critical 5526 * section to protect against statclock and hardclock (statistics collection). 
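 *
 * pmap_replacevm() above, for example, routes its vmspace switch
 * through this function for the process's lwp.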
5527 */ 5528 void 5529 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 5530 { 5531 struct vmspace *oldvm; 5532 struct pmap *pmap; 5533 5534 oldvm = lp->lwp_vmspace; 5535 5536 if (oldvm != newvm) { 5537 crit_enter(); 5538 lp->lwp_vmspace = newvm; 5539 if (curthread->td_lwp == lp) { 5540 pmap = vmspace_pmap(newvm); 5541 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 5542 if (pmap->pm_active_lock & CPULOCK_EXCL) 5543 pmap_interlock_wait(newvm); 5544 #if defined(SWTCH_OPTIM_STATS) 5545 tlb_flush_count++; 5546 #endif 5547 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 5548 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 5549 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 5550 curthread->td_pcb->pcb_cr3 = KPML4phys; 5551 } else { 5552 panic("pmap_setlwpvm: unknown pmap type\n"); 5553 } 5554 load_cr3(curthread->td_pcb->pcb_cr3); 5555 pmap = vmspace_pmap(oldvm); 5556 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 5557 mycpu->gd_cpuid); 5558 } 5559 crit_exit(); 5560 } 5561 } 5562 5563 /* 5564 * Called when switching to a locked pmap, used to interlock against pmaps 5565 * undergoing modifications to prevent us from activating the MMU for the 5566 * target pmap until all such modifications have completed. We have to do 5567 * this because the thread making the modifications has already set up its 5568 * SMP synchronization mask. 5569 * 5570 * This function cannot sleep! 5571 * 5572 * No requirements. 5573 */ 5574 void 5575 pmap_interlock_wait(struct vmspace *vm) 5576 { 5577 struct pmap *pmap = &vm->vm_pmap; 5578 5579 if (pmap->pm_active_lock & CPULOCK_EXCL) { 5580 crit_enter(); 5581 KKASSERT(curthread->td_critcount >= 2); 5582 DEBUG_PUSH_INFO("pmap_interlock_wait"); 5583 while (pmap->pm_active_lock & CPULOCK_EXCL) { 5584 cpu_ccfence(); 5585 lwkt_process_ipiq(); 5586 } 5587 DEBUG_POP_INFO(); 5588 crit_exit(); 5589 } 5590 } 5591 5592 vm_offset_t 5593 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 5594 { 5595 5596 if ((obj == NULL) || (size < NBPDR) || 5597 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { 5598 return addr; 5599 } 5600 5601 addr = roundup2(addr, NBPDR); 5602 return addr; 5603 } 5604 5605 /* 5606 * Used by kmalloc/kfree, page already exists at va 5607 */ 5608 vm_page_t 5609 pmap_kvtom(vm_offset_t va) 5610 { 5611 pt_entry_t *ptep = vtopte(va); 5612 5613 KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0); 5614 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 5615 } 5616 5617 /* 5618 * Initialize machine-specific shared page directory support. This 5619 * is executed when a VM object is created. 5620 */ 5621 void 5622 pmap_object_init(vm_object_t object) 5623 { 5624 object->md.pmap_rw = NULL; 5625 object->md.pmap_ro = NULL; 5626 } 5627 5628 /* 5629 * Clean up machine-specific shared page directory support. This 5630 * is executed when a VM object is destroyed. 
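 * Any pmap_rw/pmap_ro pmaps hanging off the object are torn down,
 * released, and freed here.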
5631 */ 5632 void 5633 pmap_object_free(vm_object_t object) 5634 { 5635 pmap_t pmap; 5636 5637 if ((pmap = object->md.pmap_rw) != NULL) { 5638 object->md.pmap_rw = NULL; 5639 pmap_remove_noinval(pmap, 5640 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); 5641 CPUMASK_ASSZERO(pmap->pm_active); 5642 pmap_release(pmap); 5643 pmap_puninit(pmap); 5644 kfree(pmap, M_OBJPMAP); 5645 } 5646 if ((pmap = object->md.pmap_ro) != NULL) { 5647 object->md.pmap_ro = NULL; 5648 pmap_remove_noinval(pmap, 5649 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); 5650 CPUMASK_ASSZERO(pmap->pm_active); 5651 pmap_release(pmap); 5652 pmap_puninit(pmap); 5653 kfree(pmap, M_OBJPMAP); 5654 } 5655 } 5656 5657 /* 5658 * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related 5659 * VM page and issue a pginfo->callback. 5660 * 5661 * We are expected to dispose of any non-NULL pte_pv. 5662 */ 5663 static 5664 void 5665 pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info, 5666 pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, 5667 vm_offset_t va, pt_entry_t *ptep, void *arg) 5668 { 5669 struct pmap_pgscan_info *pginfo = arg; 5670 vm_page_t m; 5671 5672 if (pte_pv) { 5673 /* 5674 * Try to busy the page while we hold the pte_pv locked. 5675 */ 5676 m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME); 5677 if (vm_page_busy_try(m, TRUE) == 0) { 5678 if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) { 5679 /* 5680 * The callback is issued with the pte_pv 5681 * unlocked and put away, and the pt_pv 5682 * unlocked. 5683 */ 5684 pv_put(pte_pv); 5685 if (pt_pv) 5686 pv_unlock(pt_pv); 5687 if (pginfo->callback(pginfo, va, m) < 0) 5688 info->stop = 1; 5689 if (pt_pv) 5690 pv_lock(pt_pv); 5691 } else { 5692 vm_page_wakeup(m); 5693 pv_put(pte_pv); 5694 } 5695 } else { 5696 ++pginfo->busycount; 5697 pv_put(pte_pv); 5698 } 5699 } else if (sharept) { 5700 /* shared page table */ 5701 } else { 5702 /* else unmanaged page */ 5703 } 5704 } 5705 5706 void 5707 pmap_pgscan(struct pmap_pgscan_info *pginfo) 5708 { 5709 struct pmap_scan_info info; 5710 5711 pginfo->offset = pginfo->beg_addr; 5712 info.pmap = pginfo->pmap; 5713 info.sva = pginfo->beg_addr; 5714 info.eva = pginfo->end_addr; 5715 info.func = pmap_pgscan_callback; 5716 info.arg = pginfo; 5717 pmap_scan(&info, 0); 5718 if (info.stop == 0) 5719 pginfo->offset = pginfo->end_addr; 5720 } 5721